mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 13:41:21 +03:00 
			
		
		
		
	* Implement new API for {Phrase}Matcher.add (backwards-compatible)
* Update docs
* Also update DependencyMatcher.add
* Update internals
* Rewrite tests to use new API
* Add basic check for common mistake
Raise error with suggestion if user likely passed in a pattern instead of a list of patterns
* Fix typo [ci skip]
		
	
			
		
			
				
	
	
		
			334 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			334 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # coding: utf8
 | |
| from __future__ import unicode_literals
 | |
| 
 | |
| import pytest
 | |
| import gc
 | |
| import numpy
 | |
| import copy
 | |
| from spacy.lang.en import English
 | |
| from spacy.lang.en.stop_words import STOP_WORDS
 | |
| from spacy.lang.lex_attrs import is_stop
 | |
| from spacy.vectors import Vectors
 | |
| from spacy.vocab import Vocab
 | |
| from spacy.language import Language
 | |
| from spacy.tokens import Doc, Span, Token
 | |
| from spacy.pipeline import Tagger, EntityRecognizer
 | |
| from spacy.attrs import HEAD, DEP
 | |
| from spacy.matcher import Matcher
 | |
| 
 | |
| from ..util import make_tempdir
 | |
| 
 | |
| 
 | |
| def test_issue1506():
 | |
|     def string_generator():
 | |
|         for _ in range(10001):
 | |
|             yield "It's sentence produced by that bug."
 | |
|         for _ in range(10001):
 | |
|             yield "I erase some hbdsaj lemmas."
 | |
|         for _ in range(10001):
 | |
|             yield "I erase lemmas."
 | |
|         for _ in range(10001):
 | |
|             yield "It's sentence produced by that bug."
 | |
|         for _ in range(10001):
 | |
|             yield "It's sentence produced by that bug."
 | |
| 
 | |
|     nlp = English()
 | |
|     for i, d in enumerate(nlp.pipe(string_generator())):
 | |
|         # We should run cleanup more than one time to actually cleanup data.
 | |
|         # In first run — clean up only mark strings as «not hitted».
 | |
|         if i == 10000 or i == 20000 or i == 30000:
 | |
|             gc.collect()
 | |
|         for t in d:
 | |
|             str(t.lemma_)
 | |
| 
 | |
| 
 | |
| def test_issue1518():
 | |
|     """Test vectors.resize() works."""
 | |
|     vectors = Vectors(shape=(10, 10))
 | |
|     vectors.add("hello", row=2)
 | |
|     vectors.resize((5, 9))
 | |
| 
 | |
| 
 | |
| def test_issue1537():
 | |
|     """Test that Span.as_doc() doesn't segfault."""
 | |
|     string = "The sky is blue . The man is pink . The dog is purple ."
 | |
|     doc = Doc(Vocab(), words=string.split())
 | |
|     doc[0].sent_start = True
 | |
|     for word in doc[1:]:
 | |
|         if word.nbor(-1).text == ".":
 | |
|             word.sent_start = True
 | |
|         else:
 | |
|             word.sent_start = False
 | |
|     sents = list(doc.sents)
 | |
|     sent0 = sents[0].as_doc()
 | |
|     sent1 = sents[1].as_doc()
 | |
|     assert isinstance(sent0, Doc)
 | |
|     assert isinstance(sent1, Doc)
 | |
| 
 | |
| 
 | |
| # TODO: Currently segfaulting, due to l_edge and r_edge misalignment
 | |
| # def test_issue1537_model():
 | |
| #    nlp = load_spacy('en')
 | |
| #    doc = nlp('The sky is blue. The man is pink. The dog is purple.')
 | |
| #    sents = [s.as_doc() for s in doc.sents]
 | |
| #    print(list(sents[0].noun_chunks))
 | |
| #    print(list(sents[1].noun_chunks))
 | |
| 
 | |
| 
 | |
| def test_issue1539():
 | |
|     """Ensure vectors.resize() doesn't try to modify dictionary during iteration."""
 | |
|     v = Vectors(shape=(10, 10), keys=[5, 3, 98, 100])
 | |
|     v.resize((100, 100))
 | |
| 
 | |
| 
 | |
| def test_issue1547():
 | |
|     """Test that entity labels still match after merging tokens."""
 | |
|     words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"]
 | |
|     doc = Doc(Vocab(), words=words)
 | |
|     doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])]
 | |
|     with doc.retokenize() as retokenizer:
 | |
|         retokenizer.merge(doc[5:7])
 | |
|     assert [ent.text for ent in doc.ents]
 | |
| 
 | |
| 
 | |
| def test_issue1612(en_tokenizer):
 | |
|     doc = en_tokenizer("The black cat purrs.")
 | |
|     span = doc[1:3]
 | |
|     assert span.orth_ == span.text
 | |
| 
 | |
| 
 | |
| def test_issue1654():
 | |
|     nlp = Language(Vocab())
 | |
|     assert not nlp.pipeline
 | |
|     nlp.add_pipe(lambda doc: doc, name="1")
 | |
|     nlp.add_pipe(lambda doc: doc, name="2", after="1")
 | |
|     nlp.add_pipe(lambda doc: doc, name="3", after="2")
 | |
|     assert nlp.pipe_names == ["1", "2", "3"]
 | |
|     nlp2 = Language(Vocab())
 | |
|     assert not nlp2.pipeline
 | |
|     nlp2.add_pipe(lambda doc: doc, name="3")
 | |
|     nlp2.add_pipe(lambda doc: doc, name="2", before="3")
 | |
|     nlp2.add_pipe(lambda doc: doc, name="1", before="2")
 | |
|     assert nlp2.pipe_names == ["1", "2", "3"]
 | |
| 
 | |
| 
 | |
| @pytest.mark.parametrize("text", ["test@example.com", "john.doe@example.co.uk"])
 | |
| def test_issue1698(en_tokenizer, text):
 | |
|     doc = en_tokenizer(text)
 | |
|     assert len(doc) == 1
 | |
|     assert not doc[0].like_url
 | |
| 
 | |
| 
 | |
| def test_issue1727():
 | |
|     """Test that models with no pretrained vectors can be deserialized
 | |
|     correctly after vectors are added."""
 | |
|     data = numpy.ones((3, 300), dtype="f")
 | |
|     vectors = Vectors(data=data, keys=["I", "am", "Matt"])
 | |
|     tagger = Tagger(Vocab())
 | |
|     tagger.add_label("PRP")
 | |
|     with pytest.warns(UserWarning):
 | |
|         tagger.begin_training()
 | |
|     assert tagger.cfg.get("pretrained_dims", 0) == 0
 | |
|     tagger.vocab.vectors = vectors
 | |
|     with make_tempdir() as path:
 | |
|         tagger.to_disk(path)
 | |
|         tagger = Tagger(Vocab()).from_disk(path)
 | |
|         assert tagger.cfg.get("pretrained_dims", 0) == 0
 | |
| 
 | |
| 
 | |
| def test_issue1757():
 | |
|     """Test comparison against None doesn't cause segfault."""
 | |
|     doc = Doc(Vocab(), words=["a", "b", "c"])
 | |
|     assert not doc[0] < None
 | |
|     assert not doc[0] is None
 | |
|     assert doc[0] >= None
 | |
|     assert not doc[:2] < None
 | |
|     assert not doc[:2] is None
 | |
|     assert doc[:2] >= None
 | |
|     assert not doc.vocab["a"] is None
 | |
|     assert not doc.vocab["a"] < None
 | |
| 
 | |
| 
 | |
| def test_issue1758(en_tokenizer):
 | |
|     """Test that "would've" is handled by the English tokenizer exceptions."""
 | |
|     tokens = en_tokenizer("would've")
 | |
|     assert len(tokens) == 2
 | |
|     assert tokens[0].tag_ == "MD"
 | |
|     assert tokens[1].lemma_ == "have"
 | |
| 
 | |
| 
 | |
| def test_issue1773(en_tokenizer):
 | |
|     """Test that spaces don't receive a POS but no TAG. This is the root cause
 | |
|     of the serialization issue reported in #1773."""
 | |
|     doc = en_tokenizer("\n")
 | |
|     if doc[0].pos_ == "SPACE":
 | |
|         assert doc[0].tag_ != ""
 | |
| 
 | |
| 
 | |
| def test_issue1799():
 | |
|     """Test sentence boundaries are deserialized correctly, even for
 | |
|     non-projective sentences."""
 | |
|     heads_deps = numpy.asarray(
 | |
|         [
 | |
|             [1, 397],
 | |
|             [4, 436],
 | |
|             [2, 426],
 | |
|             [1, 402],
 | |
|             [0, 8206900633647566924],
 | |
|             [18446744073709551615, 440],
 | |
|             [18446744073709551614, 442],
 | |
|         ],
 | |
|         dtype="uint64",
 | |
|     )
 | |
|     doc = Doc(Vocab(), words="Just what I was looking for .".split())
 | |
|     doc.vocab.strings.add("ROOT")
 | |
|     doc = doc.from_array([HEAD, DEP], heads_deps)
 | |
|     assert len(list(doc.sents)) == 1
 | |
| 
 | |
| 
 | |
| def test_issue1807():
 | |
|     """Test vocab.set_vector also adds the word to the vocab."""
 | |
|     vocab = Vocab(vectors_name="test_issue1807")
 | |
|     assert "hello" not in vocab
 | |
|     vocab.set_vector("hello", numpy.ones((50,), dtype="f"))
 | |
|     assert "hello" in vocab
 | |
| 
 | |
| 
 | |
| def test_issue1834():
 | |
|     """Test that sentence boundaries & parse/tag flags are not lost
 | |
|     during serialization."""
 | |
|     string = "This is a first sentence . And another one"
 | |
|     doc = Doc(Vocab(), words=string.split())
 | |
|     doc[6].sent_start = True
 | |
|     new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
 | |
|     assert new_doc[6].sent_start
 | |
|     assert not new_doc.is_parsed
 | |
|     assert not new_doc.is_tagged
 | |
|     doc.is_parsed = True
 | |
|     doc.is_tagged = True
 | |
|     new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
 | |
|     assert new_doc.is_parsed
 | |
|     assert new_doc.is_tagged
 | |
| 
 | |
| 
 | |
| def test_issue1868():
 | |
|     """Test Vocab.__contains__ works with int keys."""
 | |
|     vocab = Vocab()
 | |
|     lex = vocab["hello"]
 | |
|     assert lex.orth in vocab
 | |
|     assert lex.orth_ in vocab
 | |
|     assert "some string" not in vocab
 | |
|     int_id = vocab.strings.add("some string")
 | |
|     assert int_id not in vocab
 | |
| 
 | |
| 
 | |
| def test_issue1883():
 | |
|     matcher = Matcher(Vocab())
 | |
|     matcher.add("pat1", [[{"orth": "hello"}]])
 | |
|     doc = Doc(matcher.vocab, words=["hello"])
 | |
|     assert len(matcher(doc)) == 1
 | |
|     new_matcher = copy.deepcopy(matcher)
 | |
|     new_doc = Doc(new_matcher.vocab, words=["hello"])
 | |
|     assert len(new_matcher(new_doc)) == 1
 | |
| 
 | |
| 
 | |
| @pytest.mark.parametrize("word", ["the"])
 | |
| def test_issue1889(word):
 | |
|     assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)
 | |
| 
 | |
| 
 | |
| def test_issue1915():
 | |
|     cfg = {"hidden_depth": 2}  # should error out
 | |
|     nlp = Language()
 | |
|     nlp.add_pipe(nlp.create_pipe("ner"))
 | |
|     nlp.get_pipe("ner").add_label("answer")
 | |
|     with pytest.raises(ValueError):
 | |
|         nlp.begin_training(**cfg)
 | |
| 
 | |
| 
 | |
| def test_issue1945():
 | |
|     """Test regression in Matcher introduced in v2.0.6."""
 | |
|     matcher = Matcher(Vocab())
 | |
|     matcher.add("MWE", [[{"orth": "a"}, {"orth": "a"}]])
 | |
|     doc = Doc(matcher.vocab, words=["a", "a", "a"])
 | |
|     matches = matcher(doc)  # we should see two overlapping matches here
 | |
|     assert len(matches) == 2
 | |
|     assert matches[0][1:] == (0, 2)
 | |
|     assert matches[1][1:] == (1, 3)
 | |
| 
 | |
| 
 | |
| def test_issue1963(en_tokenizer):
 | |
|     """Test that doc.merge() resizes doc.tensor"""
 | |
|     doc = en_tokenizer("a b c d")
 | |
|     doc.tensor = numpy.ones((len(doc), 128), dtype="f")
 | |
|     with doc.retokenize() as retokenizer:
 | |
|         retokenizer.merge(doc[0:2])
 | |
|     assert len(doc) == 3
 | |
|     assert doc.tensor.shape == (3, 128)
 | |
| 
 | |
| 
 | |
| @pytest.mark.parametrize("label", ["U-JOB-NAME"])
 | |
| def test_issue1967(label):
 | |
|     ner = EntityRecognizer(Vocab())
 | |
|     entry = ([0], ["word"], ["tag"], [0], ["dep"], [label])
 | |
|     gold_parses = [(None, [(entry, None)])]
 | |
|     ner.moves.get_actions(gold_parses=gold_parses)
 | |
| 
 | |
| 
 | |
| def test_issue1971(en_vocab):
 | |
|     # Possibly related to #2675 and #2671?
 | |
|     matcher = Matcher(en_vocab)
 | |
|     pattern = [
 | |
|         {"ORTH": "Doe"},
 | |
|         {"ORTH": "!", "OP": "?"},
 | |
|         {"_": {"optional": True}, "OP": "?"},
 | |
|         {"ORTH": "!", "OP": "?"},
 | |
|     ]
 | |
|     Token.set_extension("optional", default=False)
 | |
|     matcher.add("TEST", [pattern])
 | |
|     doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"])
 | |
|     # We could also assert length 1 here, but this is more conclusive, because
 | |
|     # the real problem here is that it returns a duplicate match for a match_id
 | |
|     # that's not actually in the vocab!
 | |
|     matches = matcher(doc)
 | |
|     assert all([match_id in en_vocab.strings for match_id, start, end in matches])
 | |
| 
 | |
| 
 | |
| def test_issue_1971_2(en_vocab):
 | |
|     matcher = Matcher(en_vocab)
 | |
|     pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
 | |
|     pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}]  # {"IN": ["EUR"]}}]
 | |
|     doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
 | |
|     matcher.add("TEST1", [pattern1, pattern2])
 | |
|     matches = matcher(doc)
 | |
|     assert len(matches) == 2
 | |
| 
 | |
| 
 | |
| def test_issue_1971_3(en_vocab):
 | |
|     """Test that pattern matches correctly for multiple extension attributes."""
 | |
|     Token.set_extension("a", default=1, force=True)
 | |
|     Token.set_extension("b", default=2, force=True)
 | |
|     doc = Doc(en_vocab, words=["hello", "world"])
 | |
|     matcher = Matcher(en_vocab)
 | |
|     matcher.add("A", [[{"_": {"a": 1}}]])
 | |
|     matcher.add("B", [[{"_": {"b": 2}}]])
 | |
|     matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc))
 | |
|     assert len(matches) == 4
 | |
|     assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])
 | |
| 
 | |
| 
 | |
| def test_issue_1971_4(en_vocab):
 | |
|     """Test that pattern matches correctly with multiple extension attribute
 | |
|     values on a single token.
 | |
|     """
 | |
|     Token.set_extension("ext_a", default="str_a", force=True)
 | |
|     Token.set_extension("ext_b", default="str_b", force=True)
 | |
|     matcher = Matcher(en_vocab)
 | |
|     doc = Doc(en_vocab, words=["this", "is", "text"])
 | |
|     pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
 | |
|     matcher.add("TEST", [pattern])
 | |
|     matches = matcher(doc)
 | |
|     # Uncommenting this caused a segmentation fault
 | |
|     assert len(matches) == 1
 | |
|     assert matches[0] == (en_vocab.strings["TEST"], 0, 3)
 |