Python 3.8.10 (default, Jun 2 2021, 10:49:15)
Type 'copyright', 'credits' or 'license' for more information
IPython 7.26.0 -- An enhanced Interactive Python. Type '?' for help.

In [1]: import nltk

In [2]: text = "The Natural Language Toolkit, or more commonly NLTK, it's a suite of libraries and programs for
   ...: symbolic and statistical natural language processing (NLP) for English written in the Python programming
   ...: language. It was developed by Steven Bird and Edward Loper in the Department of Computer and Informatio
   ...: n Science at the University of Pennsylvania. NLTK includes graphical demonstrations and sample data. It
   ...: is accompanied by a book that explains the underlying concepts behind the language processing tasks supp
   ...: orted by the toolkit, plus a cookbook."

# 测试英文 word tokenize 功能
In [3]: tokens = nltk.word_tokenize(text)

In [4]: tokens[0:20]
Out[4]: ['The', 'Natural', 'Language', 'Toolkit', ','
, 'or', 'more', 'commonly', 'NLTK', ',', 'it', "'s", 'a', 'suite', 'of', 'libraries', 'and', 'programs', 'for', 'symbolic']

# 测试英文词性标注功能
In [5]: tagged_tokens = nltk.pos_tag(tokens)

In [6]: tagged_tokens[0:20]
Out[6]: [('The', 'DT'), ('Natural', 'NNP'), ('Language', 'NNP'), ('Toolkit', 'NNP'), (',', ','), ('or', 'CC'), ('more', 'JJR'), ('commonly', 'RB'), ('NLTK', 'NNP'), (',', ','), ('it', 'PRP'), ("'s", 'VBZ'), ('a', 'DT'), ('suite', 'NN'), ('of', 'IN'), ('libraries'
, 'NNS'), ('and', 'CC'), ('programs', 'NNS'), ('for', 'IN'), ('symbolic', 'JJ')]

# 测试英文断句功能
In [8]: sents = nltk.sent_tokenize(text)

In [9]: sents
Out[9]: ["The Natural Language Toolkit, or more commonly NLTK, it's a suite of libraries and programs for symbolic and statistical natural language processing (NLP) for English written in the Python programming language.", 'It was developed by Steven Bird and Edward Loper in the Department of Computer and Information Science at the University of Pennsylvania.', 'NLTK includes graphical demonstrations and sample data.', 'It is accompanied by a book that explains the underlying concepts behind the language processing tasks supported by the toolkit, plus a cookbook.']

# 测试英文命名实体标注功能
In [10]: entities = nltk.chunk.ne_chunk(tagged_tokens)
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
<ipython-input-10-a85268718ce4> in <module>
----> 1 entities = nltk.chunk.ne_chunk(tagged_tokens)

~/textminer/nlp_tools/venv/lib/python3.8/site-packages/nltk/chunk/__init__.py in ne_chunk(tagged_tokens, binary)
    183     else:
    184         chunker_pickle = _MULTICLASS_NE_CHUNKER
--> 185     chunker = load(chunker_pickle)
    186     return chunker.parse(tagged_tokens)
    187

~/textminer/nlp_tools/venv/lib/python3.8/site-packages/nltk/data.py in load(resource_url, format, cache, verbose, logic_parser, fstruct_reader, encoding)
    753         resource_val = opened_resource.read()
    754     elif format == "pickle":
--> 755         resource_val = pickle.load(
opened_resource)
    756     elif format == "json":
    757         import json

ModuleNotFoundError: No module named 'numpy'

# 发现缺少 numpy 模块,直接 pip install 安装
In [11]: pip install numpy
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting numpy
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/aa/69/260a4a1cc89cc00b51f432db048c396952f5c05dfa1345a1b3dbd9ea3544/numpy-1.21.2-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.8 MB)
     |████████████████████████████████| 15.8 MB 11.2 MB/s
Installing collected packages: numpy
Successfully installed numpy-1.21.2
Note: you may need to restart the kernel to use updated packages.

# 继续测试英文命名实体识别
In [12]: entities = nltk.chunk.ne_chunk(tagged_tokens)

In [13]: entities
Out[13]: Tree('S', [('The', 'DT'), Tree('ORGANIZATION', [('Natural', 'NNP'), ('Language', 'NNP'), ('Toolkit', 'NNP')]), (',', ','), ('or', 'CC'), ('more', 'JJR'), ('commonly', 'RB'), Tree('ORGANIZATION', [('NLTK', 'NNP')]), (',', ','), ('it', 'PRP'), ("'s", 'VBZ'), ('a', 'DT'
), ('suite', 'NN'), ('of', 'IN'), ('libraries', 'NNS'), ('and', 'CC'), ('programs', 'NNS'), ('for', 'IN'), ('symbolic', 'JJ'), ('and', 'CC'), ('statistical', 'JJ'), ('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('(', '('), Tree('ORGANIZATION', [('NLP', 'NNP')]), (')', ')'), ('for', 'IN'), Tree('GPE', [('English', 'NNP')]), ('written', 'VBN'), ('in', 'IN'), ('the', 'DT'), Tree('GPE', [('Python', 'NNP')]),
('programming', 'NN'), ('language', 'NN'), ('.', '.'), ('It', 'PRP'), ('was', 'VBD'), ('developed', 'VBN'), ('by', 'IN'), Tree('PERSON', [('Steven', 'NNP'), ('Bird', 'NNP')]), ('and', 'CC'), Tree('PERSON', [('Edward', 'NNP'), ('Loper', 'NNP')]), ('in', 'IN'), ('the', 'DT'), Tree('ORGANIZATION', [('Department', 'NNP')]), ('of', 'IN'), Tree('ORGANIZATION', [('Computer', 'NNP')]), ('and', 'CC'), Tree('ORGANIZATION', [('Information', 'NNP'), ('Science', 'NNP'
)]), ('at', 'IN'), ('the', 'DT'), Tree('ORGANIZATION', [('University', 'NNP')]), ('of', 'IN'), Tree('GPE', [('Pennsylvania', 'NNP')]), ('.', '.'), Tree('ORGANIZATION', [('NLTK', 'NNP')]), ('includes', 'VBZ'), ('graphical', 'JJ'), ('demonstrations', 'NNS'), ('and', 'CC'), ('sample', 'JJ'), ('data', 'NNS'), ('.', '.'), ('It', 'PRP'), ('is', 'VBZ'), ('accompanied', 'VBN'), ('by', 'IN'), ('a', 'DT'), ('book', 'NN'), ('that', 'WDT')
, ('explains', 'VBZ'), ('the', 'DT'), ('underlying', 'JJ'), ('concepts', 'NNS'), ('behind', 'IN'), ('the', 'DT'), ('language', 'NN'), ('processing', 'NN'), ('tasks', 'NNS'), ('supported', 'VBN'), ('by', 'IN'), ('the', 'DT'), ('toolkit', 'NN'), (',', ','), ('plus', 'CC'), ('a', 'DT'), ('cookbook', 'NN'), ('.', '.')])