mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 13:11:03 +03:00 
			
		
		
		
	Merge branch 'develop' into spacy.io
This commit is contained in:
		
						commit
						17038fe768
					
				|  | @ -89,7 +89,7 @@ cdef class Lexeme: | ||||||
|             return lex.lang |             return lex.lang | ||||||
|         else: |         else: | ||||||
|             return 0 |             return 0 | ||||||
|      | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef inline bint c_check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: |     cdef inline bint c_check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: | ||||||
|         cdef flags_t one = 1 |         cdef flags_t one = 1 | ||||||
|  |  | ||||||
|  | @ -214,9 +214,6 @@ def test_doc_retokenize_spans_entity_merge_iob(): | ||||||
|         retokenizer.merge(doc[2:4]) |         retokenizer.merge(doc[2:4]) | ||||||
|         retokenizer.merge(doc[4:6]) |         retokenizer.merge(doc[4:6]) | ||||||
|         retokenizer.merge(doc[7:9]) |         retokenizer.merge(doc[7:9]) | ||||||
|     for token in doc: |  | ||||||
|         print(token) |  | ||||||
|         print(token.ent_iob) |  | ||||||
|     assert len(doc) == 6 |     assert len(doc) == 6 | ||||||
|     assert doc[3].ent_iob_ == "B" |     assert doc[3].ent_iob_ == "B" | ||||||
|     assert doc[4].ent_iob_ == "I" |     assert doc[4].ent_iob_ == "I" | ||||||
|  | @ -270,16 +267,16 @@ def test_doc_retokenize_merge_extension_attrs(en_vocab): | ||||||
|         attrs = {"lemma": "hello world", "_": {"a": True, "b": "1"}} |         attrs = {"lemma": "hello world", "_": {"a": True, "b": "1"}} | ||||||
|         retokenizer.merge(doc[0:2], attrs=attrs) |         retokenizer.merge(doc[0:2], attrs=attrs) | ||||||
|     assert doc[0].lemma_ == "hello world" |     assert doc[0].lemma_ == "hello world" | ||||||
|     assert doc[0]._.a == True |     assert doc[0]._.a is True | ||||||
|     assert doc[0]._.b == "1" |     assert doc[0]._.b == "1" | ||||||
|     # Test bulk merging |     # Test bulk merging | ||||||
|     doc = Doc(en_vocab, words=["hello", "world", "!", "!"]) |     doc = Doc(en_vocab, words=["hello", "world", "!", "!"]) | ||||||
|     with doc.retokenize() as retokenizer: |     with doc.retokenize() as retokenizer: | ||||||
|         retokenizer.merge(doc[0:2], attrs={"_": {"a": True, "b": "1"}}) |         retokenizer.merge(doc[0:2], attrs={"_": {"a": True, "b": "1"}}) | ||||||
|         retokenizer.merge(doc[2:4], attrs={"_": {"a": None, "b": "2"}}) |         retokenizer.merge(doc[2:4], attrs={"_": {"a": None, "b": "2"}}) | ||||||
|     assert doc[0]._.a == True |     assert doc[0]._.a is True | ||||||
|     assert doc[0]._.b == "1" |     assert doc[0]._.b == "1" | ||||||
|     assert doc[1]._.a == None |     assert doc[1]._.a is None | ||||||
|     assert doc[1]._.b == "2" |     assert doc[1]._.b == "2" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -292,3 +289,29 @@ def test_doc_retokenize_merge_extension_attrs_invalid(en_vocab, underscore_attrs | ||||||
|     with pytest.raises(ValueError): |     with pytest.raises(ValueError): | ||||||
|         with doc.retokenize() as retokenizer: |         with doc.retokenize() as retokenizer: | ||||||
|             retokenizer.merge(doc[0:2], attrs=attrs) |             retokenizer.merge(doc[0:2], attrs=attrs) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_doc_retokenizer_merge_lex_attrs(en_vocab): | ||||||
|  |     """Test that retokenization also sets attributes on the lexeme if they're | ||||||
|  |     lexical attributes. For example, if a user sets IS_STOP, it should mean that | ||||||
|  |     "all tokens with that lexeme" are marked as a stop word, so the ambiguity | ||||||
|  |     here is acceptable. Also see #2390. | ||||||
|  |     """ | ||||||
|  |     # Test regular merging | ||||||
|  |     doc = Doc(en_vocab, words=["hello", "world", "!"]) | ||||||
|  |     assert not any(t.is_stop for t in doc) | ||||||
|  |     with doc.retokenize() as retokenizer: | ||||||
|  |         retokenizer.merge(doc[0:2], attrs={"lemma": "hello world", "is_stop": True}) | ||||||
|  |     assert doc[0].lemma_ == "hello world" | ||||||
|  |     assert doc[0].is_stop | ||||||
|  |     # Test bulk merging | ||||||
|  |     doc = Doc(en_vocab, words=["eins", "zwei", "!", "!"]) | ||||||
|  |     assert not any(t.like_num for t in doc) | ||||||
|  |     assert not any(t.is_stop for t in doc) | ||||||
|  |     with doc.retokenize() as retokenizer: | ||||||
|  |         retokenizer.merge(doc[0:2], attrs={"like_num": True}) | ||||||
|  |         retokenizer.merge(doc[2:4], attrs={"is_stop": True}) | ||||||
|  |     assert doc[0].like_num | ||||||
|  |     assert doc[1].is_stop | ||||||
|  |     assert not doc[0].is_stop | ||||||
|  |     assert not doc[1].like_num | ||||||
|  |  | ||||||
|  | @ -137,10 +137,10 @@ def test_doc_retokenize_split_extension_attrs(en_vocab): | ||||||
|         attrs = {"lemma": ["los", "angeles"], "_": underscore} |         attrs = {"lemma": ["los", "angeles"], "_": underscore} | ||||||
|         retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs) |         retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs) | ||||||
|     assert doc[0].lemma_ == "los" |     assert doc[0].lemma_ == "los" | ||||||
|     assert doc[0]._.a == True |     assert doc[0]._.a is True | ||||||
|     assert doc[0]._.b == "1" |     assert doc[0]._.b == "1" | ||||||
|     assert doc[1].lemma_ == "angeles" |     assert doc[1].lemma_ == "angeles" | ||||||
|     assert doc[1]._.a == False |     assert doc[1]._.a is False | ||||||
|     assert doc[1]._.b == "2" |     assert doc[1]._.b == "2" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -165,3 +165,21 @@ def test_doc_retokenize_split_extension_attrs_invalid(en_vocab, underscore_attrs | ||||||
|         with doc.retokenize() as retokenizer: |         with doc.retokenize() as retokenizer: | ||||||
|             heads = [(doc[0], 1), doc[1]] |             heads = [(doc[0], 1), doc[1]] | ||||||
|             retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs) |             retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_doc_retokenizer_split_lex_attrs(en_vocab): | ||||||
|  |     """Test that retokenization also sets attributes on the lexeme if they're | ||||||
|  |     lexical attributes. For example, if a user sets IS_STOP, it should mean that | ||||||
|  |     "all tokens with that lexeme" are marked as a stop word, so the ambiguity | ||||||
|  |     here is acceptable. Also see #2390. | ||||||
|  |     """ | ||||||
|  |     assert not Doc(en_vocab, words=["Los"])[0].is_stop | ||||||
|  |     assert not Doc(en_vocab, words=["Angeles"])[0].is_stop | ||||||
|  |     doc = Doc(en_vocab, words=["LosAngeles", "start"]) | ||||||
|  |     assert not doc[0].is_stop | ||||||
|  |     with doc.retokenize() as retokenizer: | ||||||
|  |         attrs = {"is_stop": [True, False]} | ||||||
|  |         heads = [(doc[0], 1), doc[1]] | ||||||
|  |         retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs) | ||||||
|  |     assert doc[0].is_stop | ||||||
|  |     assert not doc[1].is_stop | ||||||
|  |  | ||||||
|  | @ -1,89 +0,0 @@ | ||||||
| # coding: utf-8 |  | ||||||
| from __future__ import unicode_literals |  | ||||||
| 
 |  | ||||||
| import json |  | ||||||
| from tempfile import NamedTemporaryFile |  | ||||||
| 
 |  | ||||||
| from spacy.cli.train import train |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def test_cli_trained_model_can_be_saved(tmpdir): |  | ||||||
|     lang = "nl" |  | ||||||
|     output_dir = str(tmpdir) |  | ||||||
|     train_file = NamedTemporaryFile("wb", dir=output_dir, delete=False) |  | ||||||
|     train_corpus = [ |  | ||||||
|         { |  | ||||||
|             "id": "identifier_0", |  | ||||||
|             "paragraphs": [ |  | ||||||
|                 { |  | ||||||
|                     "raw": "Jan houdt van Marie.\n", |  | ||||||
|                     "sentences": [ |  | ||||||
|                         { |  | ||||||
|                             "tokens": [ |  | ||||||
|                                 { |  | ||||||
|                                     "id": 0, |  | ||||||
|                                     "dep": "nsubj", |  | ||||||
|                                     "head": 1, |  | ||||||
|                                     "tag": "NOUN", |  | ||||||
|                                     "orth": "Jan", |  | ||||||
|                                     "ner": "B-PER", |  | ||||||
|                                 }, |  | ||||||
|                                 { |  | ||||||
|                                     "id": 1, |  | ||||||
|                                     "dep": "ROOT", |  | ||||||
|                                     "head": 0, |  | ||||||
|                                     "tag": "VERB", |  | ||||||
|                                     "orth": "houdt", |  | ||||||
|                                     "ner": "O", |  | ||||||
|                                 }, |  | ||||||
|                                 { |  | ||||||
|                                     "id": 2, |  | ||||||
|                                     "dep": "case", |  | ||||||
|                                     "head": 1, |  | ||||||
|                                     "tag": "ADP", |  | ||||||
|                                     "orth": "van", |  | ||||||
|                                     "ner": "O", |  | ||||||
|                                 }, |  | ||||||
|                                 { |  | ||||||
|                                     "id": 3, |  | ||||||
|                                     "dep": "obj", |  | ||||||
|                                     "head": -2, |  | ||||||
|                                     "tag": "NOUN", |  | ||||||
|                                     "orth": "Marie", |  | ||||||
|                                     "ner": "B-PER", |  | ||||||
|                                 }, |  | ||||||
|                                 { |  | ||||||
|                                     "id": 4, |  | ||||||
|                                     "dep": "punct", |  | ||||||
|                                     "head": -3, |  | ||||||
|                                     "tag": "PUNCT", |  | ||||||
|                                     "orth": ".", |  | ||||||
|                                     "ner": "O", |  | ||||||
|                                 }, |  | ||||||
|                                 { |  | ||||||
|                                     "id": 5, |  | ||||||
|                                     "dep": "", |  | ||||||
|                                     "head": -1, |  | ||||||
|                                     "tag": "SPACE", |  | ||||||
|                                     "orth": "\n", |  | ||||||
|                                     "ner": "O", |  | ||||||
|                                 }, |  | ||||||
|                             ], |  | ||||||
|                             "brackets": [], |  | ||||||
|                         } |  | ||||||
|                     ], |  | ||||||
|                 } |  | ||||||
|             ], |  | ||||||
|         } |  | ||||||
|     ] |  | ||||||
| 
 |  | ||||||
|     train_file.write(json.dumps(train_corpus).encode("utf-8")) |  | ||||||
|     train_file.close() |  | ||||||
|     train_data = train_file.name |  | ||||||
|     dev_data = train_data |  | ||||||
| 
 |  | ||||||
|     # spacy train -n 1 -g -1 nl output_nl training_corpus.json training \ |  | ||||||
|     # corpus.json |  | ||||||
|     train(lang, output_dir, train_data, dev_data, n_iter=1) |  | ||||||
| 
 |  | ||||||
|     assert True |  | ||||||
|  | @ -1,25 +0,0 @@ | ||||||
| # coding: utf-8 |  | ||||||
| from __future__ import unicode_literals |  | ||||||
| 
 |  | ||||||
| import random |  | ||||||
| from spacy.lang.en import English |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def test_train_with_many_entity_types(): |  | ||||||
|     """Test issue that arises when too many labels are added to NER model. |  | ||||||
|     NB: currently causes segfault! |  | ||||||
|     """ |  | ||||||
|     train_data = [] |  | ||||||
|     train_data.extend([("One sentence", {"entities": []})]) |  | ||||||
|     entity_types = [str(i) for i in range(1000)] |  | ||||||
|     nlp = English(pipeline=[]) |  | ||||||
|     ner = nlp.create_pipe("ner") |  | ||||||
|     nlp.add_pipe(ner) |  | ||||||
|     for entity_type in list(entity_types): |  | ||||||
|         ner.add_label(entity_type) |  | ||||||
|     optimizer = nlp.begin_training() |  | ||||||
|     for i in range(20): |  | ||||||
|         losses = {} |  | ||||||
|         random.shuffle(train_data) |  | ||||||
|         for statement, entities in train_data: |  | ||||||
|             nlp.update([statement], [entities], sgd=optimizer, losses=losses, drop=0.5) |  | ||||||
|  | @ -11,7 +11,7 @@ from spacy.lang.lex_attrs import is_stop | ||||||
| from spacy.vectors import Vectors | from spacy.vectors import Vectors | ||||||
| from spacy.vocab import Vocab | from spacy.vocab import Vocab | ||||||
| from spacy.language import Language | from spacy.language import Language | ||||||
| from spacy.tokens import Doc, Span | from spacy.tokens import Doc, Span, Token | ||||||
| from spacy.pipeline import Tagger, EntityRecognizer | from spacy.pipeline import Tagger, EntityRecognizer | ||||||
| from spacy.attrs import HEAD, DEP | from spacy.attrs import HEAD, DEP | ||||||
| from spacy.matcher import Matcher | from spacy.matcher import Matcher | ||||||
|  | @ -272,3 +272,60 @@ def test_issue1967(label): | ||||||
|     entry = ([0], ["word"], ["tag"], [0], ["dep"], [label]) |     entry = ([0], ["word"], ["tag"], [0], ["dep"], [label]) | ||||||
|     gold_parses = [(None, [(entry, None)])] |     gold_parses = [(None, [(entry, None)])] | ||||||
|     ner.moves.get_actions(gold_parses=gold_parses) |     ner.moves.get_actions(gold_parses=gold_parses) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_issue1971(en_vocab): | ||||||
|  |     # Possibly related to #2675 and #2671? | ||||||
|  |     matcher = Matcher(en_vocab) | ||||||
|  |     pattern = [ | ||||||
|  |         {"ORTH": "Doe"}, | ||||||
|  |         {"ORTH": "!", "OP": "?"}, | ||||||
|  |         {"_": {"optional": True}, "OP": "?"}, | ||||||
|  |         {"ORTH": "!", "OP": "?"}, | ||||||
|  |     ] | ||||||
|  |     Token.set_extension("optional", default=False) | ||||||
|  |     matcher.add("TEST", None, pattern) | ||||||
|  |     doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"]) | ||||||
|  |     # We could also assert length 1 here, but this is more conclusive, because | ||||||
|  |     # the real problem here is that it returns a duplicate match for a match_id | ||||||
|  |     # that's not actually in the vocab! | ||||||
|  |     matches = matcher(doc) | ||||||
|  |     assert all([match_id in en_vocab.strings for match_id, start, end in matches]) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_issue_1971_2(en_vocab): | ||||||
|  |     matcher = Matcher(en_vocab) | ||||||
|  |     pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}] | ||||||
|  |     pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}]  # {"IN": ["EUR"]}}] | ||||||
|  |     doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"]) | ||||||
|  |     matcher.add("TEST1", None, pattern1, pattern2) | ||||||
|  |     matches = matcher(doc) | ||||||
|  |     assert len(matches) == 2 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_issue_1971_3(en_vocab): | ||||||
|  |     """Test that pattern matches correctly for multiple extension attributes.""" | ||||||
|  |     Token.set_extension("a", default=1, force=True) | ||||||
|  |     Token.set_extension("b", default=2, force=True) | ||||||
|  |     doc = Doc(en_vocab, words=["hello", "world"]) | ||||||
|  |     matcher = Matcher(en_vocab) | ||||||
|  |     matcher.add("A", None, [{"_": {"a": 1}}]) | ||||||
|  |     matcher.add("B", None, [{"_": {"b": 2}}]) | ||||||
|  |     matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc)) | ||||||
|  |     assert len(matches) == 4 | ||||||
|  |     assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)]) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_issue_1971_4(en_vocab): | ||||||
|  |     """Test that pattern matches correctly with multiple extension attribute | ||||||
|  |     values on a single token. | ||||||
|  |     """ | ||||||
|  |     Token.set_extension("ext_a", default="str_a", force=True) | ||||||
|  |     Token.set_extension("ext_b", default="str_b", force=True) | ||||||
|  |     matcher = Matcher(en_vocab) | ||||||
|  |     doc = Doc(en_vocab, words=["this", "is", "text"]) | ||||||
|  |     pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3 | ||||||
|  |     matcher.add("TEST", None, pattern) | ||||||
|  |     matches = matcher(doc) | ||||||
|  |     # Uncommenting this caused a segmentation fault | ||||||
|  |     assert len(matches) == 1 | ||||||
|  |  | ||||||
|  | @ -1,62 +0,0 @@ | ||||||
| # coding: utf8 |  | ||||||
| from __future__ import unicode_literals |  | ||||||
| 
 |  | ||||||
| from spacy.matcher import Matcher |  | ||||||
| from spacy.tokens import Token, Doc |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def test_issue1971(en_vocab): |  | ||||||
|     # Possibly related to #2675 and #2671? |  | ||||||
|     matcher = Matcher(en_vocab) |  | ||||||
|     pattern = [ |  | ||||||
|         {"ORTH": "Doe"}, |  | ||||||
|         {"ORTH": "!", "OP": "?"}, |  | ||||||
|         {"_": {"optional": True}, "OP": "?"}, |  | ||||||
|         {"ORTH": "!", "OP": "?"}, |  | ||||||
|     ] |  | ||||||
|     Token.set_extension("optional", default=False) |  | ||||||
|     matcher.add("TEST", None, pattern) |  | ||||||
|     doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"]) |  | ||||||
|     # We could also assert length 1 here, but this is more conclusive, because |  | ||||||
|     # the real problem here is that it returns a duplicate match for a match_id |  | ||||||
|     # that's not actually in the vocab! |  | ||||||
|     matches = matcher(doc) |  | ||||||
|     assert all([match_id in en_vocab.strings for match_id, start, end in matches]) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def test_issue_1971_2(en_vocab): |  | ||||||
|     matcher = Matcher(en_vocab) |  | ||||||
|     pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}] |  | ||||||
|     pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}]  # {"IN": ["EUR"]}}] |  | ||||||
|     doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"]) |  | ||||||
|     matcher.add("TEST1", None, pattern1, pattern2) |  | ||||||
|     matches = matcher(doc) |  | ||||||
|     assert len(matches) == 2 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def test_issue_1971_3(en_vocab): |  | ||||||
|     """Test that pattern matches correctly for multiple extension attributes.""" |  | ||||||
|     Token.set_extension("a", default=1, force=True) |  | ||||||
|     Token.set_extension("b", default=2, force=True) |  | ||||||
|     doc = Doc(en_vocab, words=["hello", "world"]) |  | ||||||
|     matcher = Matcher(en_vocab) |  | ||||||
|     matcher.add("A", None, [{"_": {"a": 1}}]) |  | ||||||
|     matcher.add("B", None, [{"_": {"b": 2}}]) |  | ||||||
|     matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc)) |  | ||||||
|     assert len(matches) == 4 |  | ||||||
|     assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)]) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def test_issue_1971_4(en_vocab): |  | ||||||
|     """Test that pattern matches correctly with multiple extension attribute |  | ||||||
|     values on a single token. |  | ||||||
|     """ |  | ||||||
|     Token.set_extension("ext_a", default="str_a", force=True) |  | ||||||
|     Token.set_extension("ext_b", default="str_b", force=True) |  | ||||||
|     matcher = Matcher(en_vocab) |  | ||||||
|     doc = Doc(en_vocab, words=["this", "is", "text"]) |  | ||||||
|     pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3 |  | ||||||
|     matcher.add("TEST", None, pattern) |  | ||||||
|     matches = matcher(doc) |  | ||||||
|     # Uncommenting this caused a segmentation fault |  | ||||||
|     assert len(matches) == 1 |  | ||||||
|  | @ -2,15 +2,18 @@ | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| import pytest | import pytest | ||||||
|  | from spacy import displacy | ||||||
| from spacy.lang.en import English | from spacy.lang.en import English | ||||||
| from spacy.lang.ja import Japanese | from spacy.lang.ja import Japanese | ||||||
| from spacy.lang.xx import MultiLanguage | from spacy.lang.xx import MultiLanguage | ||||||
| from spacy.language import Language | from spacy.language import Language | ||||||
| from spacy.matcher import Matcher | from spacy.matcher import Matcher | ||||||
| from spacy.tokens import Span | from spacy.tokens import Doc, Span | ||||||
| from spacy.vocab import Vocab | from spacy.vocab import Vocab | ||||||
|  | from spacy.compat import pickle | ||||||
| from spacy._ml import link_vectors_to_models | from spacy._ml import link_vectors_to_models | ||||||
| import numpy | import numpy | ||||||
|  | import random | ||||||
| 
 | 
 | ||||||
| from ..util import get_doc | from ..util import get_doc | ||||||
| 
 | 
 | ||||||
|  | @ -54,6 +57,25 @@ def test_issue2626_2835(en_tokenizer, text): | ||||||
|     assert doc |     assert doc | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def test_issue2656(en_tokenizer): | ||||||
|  |     """Test that tokenizer correctly splits of punctuation after numbers with | ||||||
|  |     decimal points. | ||||||
|  |     """ | ||||||
|  |     doc = en_tokenizer("I went for 40.3, and got home by 10.0.") | ||||||
|  |     assert len(doc) == 11 | ||||||
|  |     assert doc[0].text == "I" | ||||||
|  |     assert doc[1].text == "went" | ||||||
|  |     assert doc[2].text == "for" | ||||||
|  |     assert doc[3].text == "40.3" | ||||||
|  |     assert doc[4].text == "," | ||||||
|  |     assert doc[5].text == "and" | ||||||
|  |     assert doc[6].text == "got" | ||||||
|  |     assert doc[7].text == "home" | ||||||
|  |     assert doc[8].text == "by" | ||||||
|  |     assert doc[9].text == "10.0" | ||||||
|  |     assert doc[10].text == "." | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def test_issue2671(): | def test_issue2671(): | ||||||
|     """Ensure the correct entity ID is returned for matches with quantifiers. |     """Ensure the correct entity ID is returned for matches with quantifiers. | ||||||
|     See also #2675 |     See also #2675 | ||||||
|  | @ -77,6 +99,17 @@ def test_issue2671(): | ||||||
|         assert nlp.vocab.strings[match_id] == pattern_id |         assert nlp.vocab.strings[match_id] == pattern_id | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def test_issue2728(en_vocab): | ||||||
|  |     """Test that displaCy ENT visualizer escapes HTML correctly.""" | ||||||
|  |     doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"]) | ||||||
|  |     doc.ents = [Span(doc, 0, 1, label="TEST")] | ||||||
|  |     html = displacy.render(doc, style="ent") | ||||||
|  |     assert "<RELEASE>" in html | ||||||
|  |     doc.ents = [Span(doc, 1, 2, label="TEST")] | ||||||
|  |     html = displacy.render(doc, style="ent") | ||||||
|  |     assert "<RELEASE>" in html | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def test_issue2754(en_tokenizer): | def test_issue2754(en_tokenizer): | ||||||
|     """Test that words like 'a' and 'a.m.' don't get exceptional norm values.""" |     """Test that words like 'a' and 'a.m.' don't get exceptional norm values.""" | ||||||
|     a = en_tokenizer("a") |     a = en_tokenizer("a") | ||||||
|  | @ -106,6 +139,48 @@ def test_issue2782(text, lang_cls): | ||||||
|     assert doc[0].like_num |     assert doc[0].like_num | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def test_issue2800(): | ||||||
|  |     """Test issue that arises when too many labels are added to NER model. | ||||||
|  |     Used to cause segfault. | ||||||
|  |     """ | ||||||
|  |     train_data = [] | ||||||
|  |     train_data.extend([("One sentence", {"entities": []})]) | ||||||
|  |     entity_types = [str(i) for i in range(1000)] | ||||||
|  |     nlp = English() | ||||||
|  |     ner = nlp.create_pipe("ner") | ||||||
|  |     nlp.add_pipe(ner) | ||||||
|  |     for entity_type in list(entity_types): | ||||||
|  |         ner.add_label(entity_type) | ||||||
|  |     optimizer = nlp.begin_training() | ||||||
|  |     for i in range(20): | ||||||
|  |         losses = {} | ||||||
|  |         random.shuffle(train_data) | ||||||
|  |         for statement, entities in train_data: | ||||||
|  |             nlp.update([statement], [entities], sgd=optimizer, losses=losses, drop=0.5) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_issue2822(it_tokenizer): | ||||||
|  |     """Test that the abbreviation of poco is kept as one word.""" | ||||||
|  |     doc = it_tokenizer("Vuoi un po' di zucchero?") | ||||||
|  |     assert len(doc) == 6 | ||||||
|  |     assert doc[0].text == "Vuoi" | ||||||
|  |     assert doc[1].text == "un" | ||||||
|  |     assert doc[2].text == "po'" | ||||||
|  |     assert doc[2].lemma_ == "poco" | ||||||
|  |     assert doc[3].text == "di" | ||||||
|  |     assert doc[4].text == "zucchero" | ||||||
|  |     assert doc[5].text == "?" | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_issue2833(en_vocab): | ||||||
|  |     """Test that a custom error is raised if a token or span is pickled.""" | ||||||
|  |     doc = Doc(en_vocab, words=["Hello", "world"]) | ||||||
|  |     with pytest.raises(NotImplementedError): | ||||||
|  |         pickle.dumps(doc[0]) | ||||||
|  |     with pytest.raises(NotImplementedError): | ||||||
|  |         pickle.dumps(doc[0:2]) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def test_issue2871(): | def test_issue2871(): | ||||||
|     """Test that vectors recover the correct key for spaCy reserved words.""" |     """Test that vectors recover the correct key for spaCy reserved words.""" | ||||||
|     words = ["dog", "cat", "SUFFIX"] |     words = ["dog", "cat", "SUFFIX"] | ||||||
|  | @ -134,3 +209,19 @@ def test_issue2901(): | ||||||
| 
 | 
 | ||||||
|     doc = nlp("pythonが大好きです") |     doc = nlp("pythonが大好きです") | ||||||
|     assert doc |     assert doc | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_issue2926(fr_tokenizer): | ||||||
|  |     """Test that the tokenizer correctly splits tokens separated by a slash (/) | ||||||
|  |     ending in a digit. | ||||||
|  |     """ | ||||||
|  |     doc = fr_tokenizer("Learn html5/css3/javascript/jquery") | ||||||
|  |     assert len(doc) == 8 | ||||||
|  |     assert doc[0].text == "Learn" | ||||||
|  |     assert doc[1].text == "html5" | ||||||
|  |     assert doc[2].text == "/" | ||||||
|  |     assert doc[3].text == "css3" | ||||||
|  |     assert doc[4].text == "/" | ||||||
|  |     assert doc[5].text == "javascript" | ||||||
|  |     assert doc[6].text == "/" | ||||||
|  |     assert doc[7].text == "jquery" | ||||||
|  |  | ||||||
|  | @ -1,24 +0,0 @@ | ||||||
| # coding: utf8 |  | ||||||
| from __future__ import unicode_literals |  | ||||||
| from spacy.lang.en import English |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def test_issue2656(): |  | ||||||
|     """ Test that tokenizer correctly splits of punctuation after numbers with decimal points """ |  | ||||||
|     text = "I went for 40.3, and got home by 10.0." |  | ||||||
|     nlp = English() |  | ||||||
|     doc = nlp(text) |  | ||||||
| 
 |  | ||||||
|     assert len(doc) == 11 |  | ||||||
| 
 |  | ||||||
|     assert doc[0].text == "I" |  | ||||||
|     assert doc[1].text == "went" |  | ||||||
|     assert doc[2].text == "for" |  | ||||||
|     assert doc[3].text == "40.3" |  | ||||||
|     assert doc[4].text == "," |  | ||||||
|     assert doc[5].text == "and" |  | ||||||
|     assert doc[6].text == "got" |  | ||||||
|     assert doc[7].text == "home" |  | ||||||
|     assert doc[8].text == "by" |  | ||||||
|     assert doc[9].text == "10.0" |  | ||||||
|     assert doc[10].text == "." |  | ||||||
|  | @ -1,16 +0,0 @@ | ||||||
| # coding: utf8 |  | ||||||
| from __future__ import unicode_literals |  | ||||||
| 
 |  | ||||||
| from spacy import displacy |  | ||||||
| from spacy.tokens import Doc, Span |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def test_issue2728(en_vocab): |  | ||||||
|     """Test that displaCy ENT visualizer escapes HTML correctly.""" |  | ||||||
|     doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"]) |  | ||||||
|     doc.ents = [Span(doc, 0, 1, label="TEST")] |  | ||||||
|     html = displacy.render(doc, style="ent") |  | ||||||
|     assert "<RELEASE>" in html |  | ||||||
|     doc.ents = [Span(doc, 1, 2, label="TEST")] |  | ||||||
|     html = displacy.render(doc, style="ent") |  | ||||||
|     assert "<RELEASE>" in html |  | ||||||
|  | @ -1,21 +0,0 @@ | ||||||
| # coding: utf8 |  | ||||||
| from __future__ import unicode_literals |  | ||||||
| from spacy.lang.it import Italian |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def test_issue2822(): |  | ||||||
|     """ Test that the abbreviation of poco is kept as one word """ |  | ||||||
|     nlp = Italian() |  | ||||||
|     text = "Vuoi un po' di zucchero?" |  | ||||||
| 
 |  | ||||||
|     doc = nlp(text) |  | ||||||
| 
 |  | ||||||
|     assert len(doc) == 6 |  | ||||||
| 
 |  | ||||||
|     assert doc[0].text == "Vuoi" |  | ||||||
|     assert doc[1].text == "un" |  | ||||||
|     assert doc[2].text == "po'" |  | ||||||
|     assert doc[2].lemma_ == "poco" |  | ||||||
|     assert doc[3].text == "di" |  | ||||||
|     assert doc[4].text == "zucchero" |  | ||||||
|     assert doc[5].text == "?" |  | ||||||
|  | @ -1,15 +0,0 @@ | ||||||
| # coding: utf-8 |  | ||||||
| from __future__ import unicode_literals |  | ||||||
| 
 |  | ||||||
| import pytest |  | ||||||
| from spacy.tokens import Doc |  | ||||||
| from spacy.compat import pickle |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def test_issue2833(en_vocab): |  | ||||||
|     """Test that a custom error is raised if a token or span is pickled.""" |  | ||||||
|     doc = Doc(en_vocab, words=["Hello", "world"]) |  | ||||||
|     with pytest.raises(NotImplementedError): |  | ||||||
|         pickle.dumps(doc[0]) |  | ||||||
|     with pytest.raises(NotImplementedError): |  | ||||||
|         pickle.dumps(doc[0:2]) |  | ||||||
|  | @ -1,21 +0,0 @@ | ||||||
| # coding: utf8 |  | ||||||
| from __future__ import unicode_literals |  | ||||||
| from spacy.lang.fr import French |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def test_issue2926(): |  | ||||||
|     """ Test that the tokenizer correctly splits tokens separated by a slash (/) ending in a digit """ |  | ||||||
|     nlp = French() |  | ||||||
|     text = "Learn html5/css3/javascript/jquery" |  | ||||||
|     doc = nlp(text) |  | ||||||
| 
 |  | ||||||
|     assert len(doc) == 8 |  | ||||||
| 
 |  | ||||||
|     assert doc[0].text == "Learn" |  | ||||||
|     assert doc[1].text == "html5" |  | ||||||
|     assert doc[2].text == "/" |  | ||||||
|     assert doc[3].text == "css3" |  | ||||||
|     assert doc[4].text == "/" |  | ||||||
|     assert doc[5].text == "javascript" |  | ||||||
|     assert doc[6].text == "/" |  | ||||||
|     assert doc[7].text == "jquery" |  | ||||||
|  | @ -2,10 +2,8 @@ | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from spacy.lang.en import English | from spacy.lang.en import English | ||||||
| import pytest |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.xfail |  | ||||||
| def test_issue3209(): | def test_issue3209(): | ||||||
|     """Test issue that occurred in spaCy nightly where NER labels were being |     """Test issue that occurred in spaCy nightly where NER labels were being | ||||||
|     mapped to classes incorrectly after loading the model, when the labels |     mapped to classes incorrectly after loading the model, when the labels | ||||||
|  |  | ||||||
|  | @ -66,9 +66,13 @@ cdef class Retokenizer: | ||||||
|             for extension in extensions: |             for extension in extensions: | ||||||
|                 _validate_extensions(extension) |                 _validate_extensions(extension) | ||||||
|             attrs = {key: value for key, value in attrs.items() if key != "_"} |             attrs = {key: value for key, value in attrs.items() if key != "_"} | ||||||
|  |             # NB: Since we support {"KEY": [value, value]} syntax here, this | ||||||
|  |             # will only "intify" the keys, not the values | ||||||
|             attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) |             attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) | ||||||
|             attrs["_"] = extensions |             attrs["_"] = extensions | ||||||
|         else: |         else: | ||||||
|  |             # NB: Since we support {"KEY": [value, value]} syntax here, this | ||||||
|  |             # will only "intify" the keys, not the values | ||||||
|             attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) |             attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) | ||||||
|         head_offsets = [] |         head_offsets = [] | ||||||
|         for head in heads: |         for head in heads: | ||||||
|  | @ -153,7 +157,11 @@ def _merge(Doc doc, int start, int end, attributes): | ||||||
|         elif attr_name == TAG: |         elif attr_name == TAG: | ||||||
|             doc.vocab.morphology.assign_tag(token, attr_value) |             doc.vocab.morphology.assign_tag(token, attr_value) | ||||||
|         else: |         else: | ||||||
|  |             # Set attributes on both token and lexeme to take care of token | ||||||
|  |             # attribute vs. lexical attribute without having to enumerate them. | ||||||
|  |             # If an attribute name is not valid, set_struct_attr will ignore it. | ||||||
|             Token.set_struct_attr(token, attr_name, attr_value) |             Token.set_struct_attr(token, attr_name, attr_value) | ||||||
|  |             Lexeme.set_struct_attr(<LexemeC*>lex, attr_name, attr_value) | ||||||
|     # Make sure ent_iob remains consistent |     # Make sure ent_iob remains consistent | ||||||
|     if doc.c[end].ent_iob == 1 and token.ent_iob in (0, 2): |     if doc.c[end].ent_iob == 1 and token.ent_iob in (0, 2): | ||||||
|         if token.ent_type == doc.c[end].ent_type: |         if token.ent_type == doc.c[end].ent_type: | ||||||
|  | @ -216,6 +224,7 @@ def _bulk_merge(Doc doc, merges): | ||||||
|     """ |     """ | ||||||
|     cdef Span span |     cdef Span span | ||||||
|     cdef const LexemeC* lex |     cdef const LexemeC* lex | ||||||
|  |     cdef TokenC* token | ||||||
|     cdef Pool mem = Pool() |     cdef Pool mem = Pool() | ||||||
|     tokens = <TokenC**>mem.alloc(len(merges), sizeof(TokenC)) |     tokens = <TokenC**>mem.alloc(len(merges), sizeof(TokenC)) | ||||||
|     spans = [] |     spans = [] | ||||||
|  | @ -231,15 +240,6 @@ def _bulk_merge(Doc doc, merges): | ||||||
|         # House the new merged token where it starts |         # House the new merged token where it starts | ||||||
|         token = &doc.c[start] |         token = &doc.c[start] | ||||||
|         tokens[merge_index] = token |         tokens[merge_index] = token | ||||||
|         # Assign attributes |  | ||||||
|         for attr_name, attr_value in attributes.items(): |  | ||||||
|             if attr_name == "_":  # Set extension attributes |  | ||||||
|                 for ext_attr_key, ext_attr_value in attr_value.items(): |  | ||||||
|                     doc[start]._.set(ext_attr_key, ext_attr_value) |  | ||||||
|             elif attr_name == TAG: |  | ||||||
|                 doc.vocab.morphology.assign_tag(token, attr_value) |  | ||||||
|             else: |  | ||||||
|                 Token.set_struct_attr(token, attr_name, attr_value) |  | ||||||
|     # Resize the doc.tensor, if it's set. Let the last row for each token stand |     # Resize the doc.tensor, if it's set. Let the last row for each token stand | ||||||
|     # for the merged region. To do this, we create a boolean array indicating |     # for the merged region. To do this, we create a boolean array indicating | ||||||
|     # whether the row is to be deleted, then use numpy.delete |     # whether the row is to be deleted, then use numpy.delete | ||||||
|  | @ -255,14 +255,30 @@ def _bulk_merge(Doc doc, merges): | ||||||
|     # We update token.lex after keeping span root and dep, since |     # We update token.lex after keeping span root and dep, since | ||||||
|     # setting token.lex will change span.start and span.end properties |     # setting token.lex will change span.start and span.end properties | ||||||
|     # as it modifies the character offsets in the doc |     # as it modifies the character offsets in the doc | ||||||
|     for token_index in range(len(merges)): |     for token_index, (span, attributes) in enumerate(merges): | ||||||
|         new_orth = ''.join([t.text_with_ws for t in spans[token_index]]) |         new_orth = ''.join([t.text_with_ws for t in spans[token_index]]) | ||||||
|         if spans[token_index][-1].whitespace_: |         if spans[token_index][-1].whitespace_: | ||||||
|             new_orth = new_orth[:-len(spans[token_index][-1].whitespace_)] |             new_orth = new_orth[:-len(spans[token_index][-1].whitespace_)] | ||||||
|  |         token = tokens[token_index] | ||||||
|         lex = doc.vocab.get(doc.mem, new_orth) |         lex = doc.vocab.get(doc.mem, new_orth) | ||||||
|         tokens[token_index].lex = lex |         token.lex = lex | ||||||
|         # We set trailing space here too |         # We set trailing space here too | ||||||
|         tokens[token_index].spacy = doc.c[spans[token_index].end-1].spacy |         token.spacy = doc.c[spans[token_index].end-1].spacy | ||||||
|  |         py_token = span[0] | ||||||
|  |         # Assign attributes | ||||||
|  |         for attr_name, attr_value in attributes.items(): | ||||||
|  |             if attr_name == "_":  # Set extension attributes | ||||||
|  |                 for ext_attr_key, ext_attr_value in attr_value.items(): | ||||||
|  |                     py_token._.set(ext_attr_key, ext_attr_value) | ||||||
|  |             elif attr_name == TAG: | ||||||
|  |                 doc.vocab.morphology.assign_tag(token, attr_value) | ||||||
|  |             else: | ||||||
|  |                 # Set attributes on both token and lexeme to take care of token | ||||||
|  |                 # attribute vs. lexical attribute without having to enumerate | ||||||
|  |                 # them. If an attribute name is not valid, set_struct_attr will | ||||||
|  |                 # ignore it. | ||||||
|  |                 Token.set_struct_attr(token, attr_name, attr_value) | ||||||
|  |                 Lexeme.set_struct_attr(<LexemeC*>lex, attr_name, attr_value) | ||||||
|     # Begin by setting all the head indices to absolute token positions |     # Begin by setting all the head indices to absolute token positions | ||||||
|     # This is easier to work with for now than the offsets |     # This is easier to work with for now than the offsets | ||||||
|     # Before thinking of something simpler, beware the case where a |     # Before thinking of something simpler, beware the case where a | ||||||
|  | @ -281,7 +297,7 @@ def _bulk_merge(Doc doc, merges): | ||||||
|     current_offset = 0 |     current_offset = 0 | ||||||
|     for i in range(doc.length): |     for i in range(doc.length): | ||||||
|         if current_span_index < len(spans) and i == spans[current_span_index].end: |         if current_span_index < len(spans) and i == spans[current_span_index].end: | ||||||
|             #last token was the last of the span |             # Last token was the last of the span | ||||||
|             current_offset += (spans[current_span_index].end - spans[current_span_index].start) -1 |             current_offset += (spans[current_span_index].end - spans[current_span_index].start) -1 | ||||||
|             current_span_index += 1 |             current_span_index += 1 | ||||||
|         if current_span_index < len(spans) and \ |         if current_span_index < len(spans) and \ | ||||||
|  | @ -405,10 +421,17 @@ def _split(Doc doc, int token_index, orths, heads, attrs): | ||||||
|             if attr_name == "_": |             if attr_name == "_": | ||||||
|                 for ext_attr_key, ext_attr_value in attr_value.items(): |                 for ext_attr_key, ext_attr_value in attr_value.items(): | ||||||
|                     doc[token_index + i]._.set(ext_attr_key, ext_attr_value) |                     doc[token_index + i]._.set(ext_attr_key, ext_attr_value) | ||||||
|  |             # NB: We need to call get_string_id here because only the keys are | ||||||
|  |             # "intified" (since we support "KEY": [value, value] syntax here). | ||||||
|             elif attr_name == TAG: |             elif attr_name == TAG: | ||||||
|                 doc.vocab.morphology.assign_tag(token, get_string_id(attr_value)) |                 doc.vocab.morphology.assign_tag(token, get_string_id(attr_value)) | ||||||
|             else: |             else: | ||||||
|  |                 # Set attributes on both token and lexeme to take care of token | ||||||
|  |                 # attribute vs. lexical attribute without having to enumerate | ||||||
|  |                 # them. If an attribute name is not valid, set_struct_attr will | ||||||
|  |                 # ignore it. | ||||||
|                 Token.set_struct_attr(token, attr_name, get_string_id(attr_value)) |                 Token.set_struct_attr(token, attr_name, get_string_id(attr_value)) | ||||||
|  |                 Lexeme.set_struct_attr(<LexemeC*>token.lex, attr_name, get_string_id(attr_value)) | ||||||
|     # Assign correct dependencies to the inner token |     # Assign correct dependencies to the inner token | ||||||
|     for i, head in enumerate(heads): |     for i, head in enumerate(heads): | ||||||
|         doc.c[token_index + i].head = head |         doc.c[token_index + i].head = head | ||||||
|  |  | ||||||
|  | @ -402,9 +402,11 @@ invalidated, although they may accidentally continue to work. | ||||||
| 
 | 
 | ||||||
| ### Retokenizer.merge {#retokenizer.merge tag="method"} | ### Retokenizer.merge {#retokenizer.merge tag="method"} | ||||||
| 
 | 
 | ||||||
| Mark a span for merging. The `attrs` will be applied to the resulting token. | Mark a span for merging. The `attrs` will be applied to the resulting token (if | ||||||
| Writable custom extension attributes can be provided as a dictionary mapping | they're context-dependent token attributes like `LEMMA` or `DEP`) or to the | ||||||
| attribute names to values as the `"_"` key. | underlying lexeme (if they're context-independent lexical attributes like | ||||||
|  | `LOWER` or `IS_STOP`). Writable custom extension attributes can be provided as a | ||||||
|  | dictionary mapping attribute names to values as the `"_"` key. | ||||||
| 
 | 
 | ||||||
| > #### Example | > #### Example | ||||||
| > | > | ||||||
|  | @ -431,7 +433,10 @@ second subtoken of `doc[3]`. This mechanism allows attaching subtokens to other | ||||||
| newly created subtokens, without having to keep track of the changing token | newly created subtokens, without having to keep track of the changing token | ||||||
| indices. If the specified head token will be split within the retokenizer block | indices. If the specified head token will be split within the retokenizer block | ||||||
| and no subtoken index is specified, it will default to `0`. Attributes to set on | and no subtoken index is specified, it will default to `0`. Attributes to set on | ||||||
| subtokens can be provided as a list of values. | subtokens can be provided as a list of values. They'll be applied to the | ||||||
|  | resulting token (if they're context-dependent token attributes like `LEMMA` or | ||||||
|  | `DEP`) or to the underlying lexeme (if they're context-independent lexical | ||||||
|  | attributes like `LOWER` or `IS_STOP`). | ||||||
| 
 | 
 | ||||||
| > #### Example | > #### Example | ||||||
| > | > | ||||||
|  |  | ||||||
|  | @ -995,6 +995,14 @@ with doc.retokenize() as retokenizer: | ||||||
| print("After:", [token.text for token in doc]) | print("After:", [token.text for token in doc]) | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
|  | If an attribute in `attrs` is a context-dependent token attribute, it will | ||||||
|  | be applied to the underlying [`Token`](/api/token). For example, `LEMMA`, `POS` | ||||||
|  | or `DEP` only apply to a word in context, so they're token attributes. If an | ||||||
|  | attribute is a context-independent lexical attribute, it will be applied to the | ||||||
|  | underlying [`Lexeme`](/api/lexeme), the entry in the vocabulary. For example, | ||||||
|  | `LOWER` or `IS_STOP` apply to all words of the same spelling, regardless of the | ||||||
|  | context. | ||||||
|  | 
 | ||||||
| <Infobox title="Tip: merging entities and noun phrases"> | <Infobox title="Tip: merging entities and noun phrases"> | ||||||
| 
 | 
 | ||||||
| If you need to merge named entities or noun chunks, check out the built-in | If you need to merge named entities or noun chunks, check out the built-in | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user