mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 05:01:02 +03:00 
			
		
		
		
	* Migrate regressions 1-1000 * Move serialize test to correct file * Remove tests that won't work in v3 * Migrate regressions 1000-1500 Removed regression test 1250 because v3 doesn't support the old LEX scheme anymore. * Add missing imports in serializer tests * Migrate tests 1500-2000 * Migrate regressions from 2000-2500 * Migrate regressions from 2501-3000 * Migrate regressions from 3000-3501 * Migrate regressions from 3501-4000 * Migrate regressions from 4001-4500 * Migrate regressions from 4501-5000 * Migrate regressions from 5001-5501 * Migrate regressions from 5501 to 7000 * Migrate regressions from 7001 to 8000 * Migrate remaining regression tests * Fixing missing imports * Update docs with new system [ci skip] * Update CONTRIBUTING.md - Fix formatting - Update wording * Remove lemmatizer tests in el lang * Move a few tests into the general tokenizer * Separate Doc and DocBin tests
		
			
				
	
	
		
			297 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			297 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import numpy
 | |
| import pytest
 | |
| 
 | |
| from spacy.vocab import Vocab
 | |
| from spacy.tokens import Doc, Token
 | |
| 
 | |
| 
 | |
@pytest.mark.issue(3540)
def test_issue3540(en_vocab):
    """Splitting a token must keep texts, lemmas and per-token vectors aligned.

    A doc with a hand-built tensor has its "NewYork" token split in two; the
    vectors of all untouched tokens are then compared before and after.
    """
    unsplit_words = ["I", "live", "in", "NewYork", "right", "now"]
    doc = Doc(en_vocab, words=unsplit_words)
    doc.tensor = numpy.asarray(
        [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
        dtype="f",
    )
    assert [tok.text for tok in doc] == unsplit_words

    # Use each word as its own lemma so lemmas can be checked after the split.
    for tok, lemma in zip(doc, unsplit_words):
        tok.lemma_ = lemma
    assert [tok.lemma_ for tok in doc] == unsplit_words

    vectors_before = [tok.vector for tok in doc]
    assert len(vectors_before) == len(doc)

    with doc.retokenize() as retokenizer:
        retokenizer.split(
            doc[3],
            ["New", "York"],
            heads=[(doc[3], 1), doc[2]],
            attrs={
                "POS": ["PROPN", "PROPN"],
                "LEMMA": ["New", "York"],
                "DEP": ["pobj", "compound"],
            },
        )

    split_words = ["I", "live", "in", "New", "York", "right", "now"]
    assert [tok.text for tok in doc] == split_words
    assert [tok.lemma_ for tok in doc] == split_words

    vectors_after = [tok.vector for tok in doc]
    assert len(vectors_after) == len(doc)
    # (index before split, index after split) for every token left untouched;
    # tokens following the split position shift right by one.
    for old_i, new_i in [(0, 0), (1, 1), (2, 2), (4, 5), (5, 6)]:
        assert vectors_before[old_i].tolist() == vectors_after[new_i].tolist()
 | |
| 
 | |
| 
 | |
def test_doc_retokenize_split(en_vocab):
    """Split "LosAngeles" in two and check texts, heads, offsets and morphology.

    The length of the doc's string form is asserted to be 19 both before and
    after the split.
    """
    head_indices = [1, 2, 2]
    doc = Doc(
        en_vocab,
        words=["LosAngeles", "start", "."],
        heads=head_indices,
        deps=["dep"] * len(head_indices),
    )
    assert len(doc) == 3
    assert len(str(doc)) == 19
    assert (doc[0].head.text, doc[1].head.text) == ("start", ".")

    per_token_attrs = {
        "tag": ["NNP", "NNP"],
        "lemma": ["Los", "Angeles"],
        "ent_type": ["GPE", "GPE"],
        "morph": ["Number=Sing", "Number=Sing"],
    }
    with doc.retokenize() as retokenizer:
        retokenizer.split(
            doc[0],
            ["Los", "Angeles"],
            [(doc[0], 1), doc[1]],
            attrs=per_token_attrs,
        )

    assert len(doc) == 4
    assert [tok.text for tok in doc] == ["Los", "Angeles", "start", "."]
    assert [tok.head.text for tok in doc] == ["Angeles", "start", ".", "."]
    assert (doc[0].idx, doc[1].idx) == (0, 3)
    assert str(doc[0].morph) == "Number=Sing"
    assert str(doc[1].morph) == "Number=Sing"
    assert len(str(doc)) == 19
 | |
| 
 | |
| 
 | |
def test_doc_retokenize_split_lemmas(en_vocab):
    """Check the lemmas given to subtokens when the split supplies none."""

    def build_doc():
        # Fresh three-token doc with a simple dependency chain.
        head_indices = [1, 2, 2]
        return Doc(
            en_vocab,
            words=["LosAngeles", "start", "."],
            heads=head_indices,
            deps=["dep"] * len(head_indices),
        )

    def split_first_token(target):
        # Split token 0 into "Los" + "Angeles" without passing any attrs.
        with target.retokenize() as retokenizer:
            retokenizer.split(
                target[0],
                ["Los", "Angeles"],
                [(target[0], 1), target[1]],
            )

    # Case 1: the doc has no lemmas, so the subtokens stay without one.
    unlemmatized = build_doc()
    split_first_token(unlemmatized)
    assert unlemmatized[0].lemma_ == ""
    assert unlemmatized[1].lemma_ == ""

    # Case 2: the doc has lemmas, so each subtoken defaults to its own orth.
    lemmatized = build_doc()
    for tok in lemmatized:
        tok.lemma_ = "a"
    split_first_token(lemmatized)
    assert lemmatized[0].lemma_ == "Los"
    assert lemmatized[1].lemma_ == "Angeles"
 | |
| 
 | |
| 
 | |
def test_doc_retokenize_split_dependencies(en_vocab):
    """Dependency labels passed as string-store IDs end up on the subtokens."""
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    string_store = doc.vocab.strings
    dep_ids = [string_store.add(label) for label in ("amod", "subject")]
    with doc.retokenize() as retokenizer:
        retokenizer.split(
            doc[0],
            ["Los", "Angeles"],
            [(doc[0], 1), doc[1]],
            attrs={"dep": dep_ids},
        )
    assert [doc[0].dep, doc[1].dep] == dep_ids
 | |
| 
 | |
| 
 | |
def test_doc_retokenize_split_heads_error(en_vocab):
    """A heads list whose length differs from the orths raises ValueError."""
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    # Two orths are supplied each time: first with too few heads (1),
    # then with too many (3).
    for head_count in (1, 3):
        with pytest.raises(ValueError):
            with doc.retokenize() as retokenizer:
                bad_heads = [doc[1] for _ in range(head_count)]
                retokenizer.split(doc[0], ["Los", "Angeles"], bad_heads)
 | |
| 
 | |
| 
 | |
def test_doc_retokenize_spans_entity_split_iob():
    """Entity IOB tags stay consistent after splitting the entity's first token."""
    doc = Doc(Vocab(), words=["abc", "d", "e"])
    ent_label = doc.vocab.strings.add("ent-abcd")
    # The entity covers the first two tokens ("abc" and "d").
    doc.ents = [(ent_label, 0, 2)]
    assert (doc[0].ent_iob_, doc[1].ent_iob_) == ("B", "I")

    with doc.retokenize() as retokenizer:
        subtoken_heads = [(doc[0], 1), (doc[0], 2), doc[1]]
        retokenizer.split(doc[0], ["a", "b", "c"], subtoken_heads)

    # "a" begins the entity; "b", "c" and the original "d" continue it.
    assert [doc[i].ent_iob_ for i in range(4)] == ["B", "I", "I", "I"]
 | |
| 
 | |
| 
 | |
def test_doc_retokenize_spans_sentence_update_after_split(en_vocab):
    """Each sentence grows by one token when one of its tokens is split in two."""
    # fmt: off
    words = [
        "StewartLee", "is", "a", "stand", "up", "comedian", ".",
        "He", "lives", "in", "England", "and", "loves", "JoePasquale", ".",
    ]
    heads = [1, 1, 3, 5, 3, 1, 1, 8, 8, 8, 9, 8, 8, 14, 12]
    deps = [
        "nsubj", "ROOT", "det", "amod", "prt", "attr", "punct",
        "nsubj", "ROOT", "prep", "pobj", "cc", "conj", "compound", "punct",
    ]
    # fmt: on
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)

    # Unpacking also checks that the doc has exactly two sentences.
    first_before, second_before = doc.sents
    first_len = len(first_before)
    second_len = len(second_before)

    # Queue one split in each sentence within a single retokenize block.
    with doc.retokenize() as retokenizer:
        retokenizer.split(
            doc[0],
            ["Stewart", "Lee"],
            [(doc[0], 1), doc[1]],
            attrs={"dep": ["compound", "nsubj"]},
        )
        retokenizer.split(
            doc[13],
            ["Joe", "Pasquale"],
            [(doc[13], 1), doc[12]],
            attrs={"dep": ["compound", "dobj"]},
        )

    first_after, second_after = doc.sents
    assert len(first_after) == first_len + 1
    assert len(second_after) == second_len + 1
 | |
| 
 | |
| 
 | |
def test_doc_retokenize_split_orths_mismatch(en_vocab):
    """The regular retokenizer.split must reject orths that do not add up to
    the original token text.

    Some other method might still permit this, but in the default use cases
    merging and splitting should always respect spaCy's non-destructive
    tokenization policy; anything else can produce very confusing and
    unexpected results.
    """
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    mismatched_orths = ["L", "A"]
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            self_heads = [(doc[0], 0), (doc[0], 0)]
            retokenizer.split(doc[0], mismatched_orths, self_heads)
 | |
| 
 | |
| 
 | |
def test_doc_retokenize_split_extension_attrs(en_vocab):
    """Custom extension values passed under "_" are applied per subtoken.

    An extension missing from a subtoken's dict keeps its registered default.
    """
    Token.set_extension("a", default=False, force=True)
    Token.set_extension("b", default="nothing", force=True)
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    with doc.retokenize() as retokenizer:
        split_attrs = {
            "lemma": ["los", "angeles"],
            # One dict per subtoken; the second one does not set "a".
            "_": [{"a": True, "b": "1"}, {"b": "2"}],
        }
        retokenizer.split(
            doc[0],
            ["Los", "Angeles"],
            [(doc[0], 1), doc[1]],
            attrs=split_attrs,
        )

    los, angeles = doc[0], doc[1]
    assert los.lemma_ == "los"
    assert los._.a is True
    assert los._.b == "1"
    assert angeles.lemma_ == "angeles"
    assert angeles._.a is False
    assert angeles._.b == "2"
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    "underscore_attrs",
    [
        [{"a": "x"}, {}],  # Overwriting getter without setter
        [{"b": "x"}, {}],  # Overwriting method
        [{"c": "x"}, {}],  # Overwriting nonexistent attribute
        [{"a": "x"}, {"x": "x"}],  # Combination
        [{"a": "x", "x": "x"}, {"x": "x"}],  # Combination
        {"x": "x"},  # Not a list of dicts
    ],
)
def test_doc_retokenize_split_extension_attrs_invalid(en_vocab, underscore_attrs):
    """Invalid "_" attribute payloads make the split raise ValueError."""
    # "x" is a plain writable attribute, "a" has only a getter and "b" is a
    # method; "c" is never registered.
    Token.set_extension("x", default=False, force=True)
    Token.set_extension("a", getter=lambda x: x, force=True)
    Token.set_extension("b", method=lambda x: x, force=True)
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    split_attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.split(
                doc[0],
                ["Los", "Angeles"],
                [(doc[0], 1), doc[1]],
                attrs=split_attrs,
            )
 | |
| 
 | |
| 
 | |
def test_doc_retokenizer_split_lex_attrs(en_vocab):
    """Lexical attributes set during a split are also written to the lexeme.

    If a user sets IS_STOP, for instance, that should mean "every token with
    this lexeme" is a stop word, so the ambiguity is acceptable here.
    See also #2390.
    """
    # Neither subtoken word starts out as a stop word.
    for word in ("Los", "Angeles"):
        assert not Doc(en_vocab, words=[word])[0].is_stop

    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    assert not doc[0].is_stop
    with doc.retokenize() as retokenizer:
        retokenizer.split(
            doc[0],
            ["Los", "Angeles"],
            [(doc[0], 1), doc[1]],
            attrs={"is_stop": [True, False]},
        )
    assert doc[0].is_stop
    assert not doc[1].is_stop
 | |
| 
 | |
| 
 | |
def test_doc_retokenizer_realloc(en_vocab):
    """#4604: reallocation must work when the new tokens outnumber the original ones.

    The first word is split into single characters, once for a doc without
    the final word and once for the full doc.
    """
    text = "Hyperglycemic adverse events following antipsychotic drug administration in the"
    all_words = text.split()
    for doc_words in (all_words[:-1], all_words):
        doc = Doc(en_vocab, words=doc_words)
        with doc.retokenize() as retokenizer:
            first = doc[0]
            characters = list(first.text)
            # Every character subtoken is attached to the first subtoken.
            char_heads = [(first, 0) for _ in characters]
            retokenizer.split(first, characters, heads=char_heads)
 | |
| 
 | |
| 
 | |
def test_doc_retokenizer_split_norm(en_vocab):
    """#6060: the norm is reset for tokens created by a split."""
    sentence = "The quick brownfoxjumpsoverthe lazy dog w/ white spots"
    doc = Doc(en_vocab, words=sentence.split())

    # Give the "w/" token a custom norm.
    doc[5].norm_ = "with"

    # Break the run-together token at doc[2] into its five words, each
    # subtoken being its own head.
    fused = doc[2]
    pieces = ["brown", "fox", "jumps", "over", "the"]
    with doc.retokenize() as retokenizer:
        retokenizer.split(
            fused,
            pieces,
            heads=[(fused, offset) for offset in range(len(pieces))],
        )

    # "w/" moved from index 5 to 9 and keeps its custom norm.
    shifted = doc[9]
    assert shifted.text == "w/"
    assert shifted.norm_ == "with"
    # The subtoken now at index 5 has a norm matching its own text.
    created = doc[5]
    assert created.text == "over"
    assert created.norm_ == "over"
 |