import pytest
import warnings

import srsly
from mock import Mock

from spacy.lang.en import English
from spacy.matcher import PhraseMatcher, Matcher
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab

from ..util import make_tempdir


@pytest.mark.issue(3248)
def test_issue3248_1():
    """Test that the PhraseMatcher correctly reports its number of rules, not
    the total number of patterns."""
    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")])
    matcher.add("TEST2", [nlp("d")])
    assert len(matcher) == 2


@pytest.mark.issue(3331)
def test_issue3331(en_vocab):
    """Test that duplicate patterns for different rules result in multiple
    matches, one per rule.
    """
    matcher = PhraseMatcher(en_vocab)
    matcher.add("A", [Doc(en_vocab, words=["Barack", "Obama"])])
    matcher.add("B", [Doc(en_vocab, words=["Barack", "Obama"])])
    doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"])
    matches = matcher(doc)
    assert len(matches) == 2
    match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]]
    assert sorted(match_ids) == ["A", "B"]


@pytest.mark.issue(3972)
def test_issue3972(en_vocab):
    """Test that the PhraseMatcher returns separate matches for the same
    pattern added under different match IDs."""
    matcher = PhraseMatcher(en_vocab)
    matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
    matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
    doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
    matches = matcher(doc)

    assert len(matches) == 2

    # We should have a match for each of the two rules
    found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
    assert "A" in found_ids
    assert "B" in found_ids


@pytest.mark.issue(4002)
def test_issue4002(en_vocab):
    """Test that the PhraseMatcher can match on overwritten NORM attributes."""
    matcher = PhraseMatcher(en_vocab, attr="NORM")
    pattern1 = Doc(en_vocab, words=["c", "d"])
    assert [t.norm_ for t in pattern1] == ["c", "d"]
    matcher.add("TEST", [pattern1])
    doc = Doc(en_vocab, words=["a", "b", "c", "d"])
    assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
    matches = matcher(doc)
    assert len(matches) == 1
    matcher = PhraseMatcher(en_vocab, attr="NORM")
    pattern2 = Doc(en_vocab, words=["1", "2"])
    pattern2[0].norm_ = "c"
    pattern2[1].norm_ = "d"
    assert [t.norm_ for t in pattern2] == ["c", "d"]
    matcher.add("TEST", [pattern2])
    matches = matcher(doc)
    assert len(matches) == 1
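
# Note: a token's NORM falls back to its ORTH when no norm exception applies,
# which is why pattern1 above already has norms ["c", "d"] without any
# explicit assignment, while pattern2 relies on overwritten norms to match.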


@pytest.mark.issue(4373)
def test_issue4373():
    """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
    matcher = Matcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)
    matcher = PhraseMatcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)


@pytest.mark.issue(4651)
def test_issue4651_with_phrase_matcher_attr():
    """Test that the EntityRuler PhraseMatcher is deserialized correctly using
    the method from_disk when the EntityRuler argument phrase_matcher_attr is
    specified.
    """
    text = "Spacy is a python library for nlp"
    nlp = English()
    patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
    ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"})
    ruler.add_patterns(patterns)
    doc = nlp(text)
    res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
    nlp_reloaded = English()
    with make_tempdir() as d:
        file_path = d / "entityruler"
        ruler.to_disk(file_path)
        nlp_reloaded.add_pipe("entity_ruler").from_disk(file_path)
    doc_reloaded = nlp_reloaded(text)
    res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
    assert res == res_reloaded
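
# The component-level round trip above specifically exercises restoring
# phrase_matcher_attr from the serialized config: the reloaded ruler has to
# rebuild its PhraseMatcher with attr="LOWER" for the two results to agree.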


@pytest.mark.issue(6839)
def test_issue6839(en_vocab):
    """Ensure that PhraseMatcher accepts Span as input"""
    # fmt: off
    words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."]
    # fmt: on
    doc = Doc(en_vocab, words=words)
    span = doc[:8]
    pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
    matcher = PhraseMatcher(en_vocab)
    matcher.add("SPACY", [pattern])
    matches = matcher(span)
    assert matches


@pytest.mark.issue(10643)
def test_issue10643(en_vocab):
    """Ensure overlapping terms can be removed from PhraseMatcher"""
    # fmt: off
    words = ["Only", "save", "out", "the", "binary", "data", "for", "the", "individual", "components", "."]
    # fmt: on
    doc = Doc(en_vocab, words=words)
    terms = {
        "0": Doc(en_vocab, words=["binary"]),
        "1": Doc(en_vocab, words=["binary", "data"]),
    }
    matcher = PhraseMatcher(en_vocab)
    for match_id, term in terms.items():
        matcher.add(match_id, [term])

    matches = matcher(doc)
    assert matches == [(en_vocab.strings["0"], 4, 5), (en_vocab.strings["1"], 4, 6)]

    matcher.remove("0")
    assert len(matcher) == 1
    new_matches = matcher(doc)
    assert new_matches == [(en_vocab.strings["1"], 4, 6)]

    matcher.remove("1")
    assert len(matcher) == 0
    no_matches = matcher(doc)
    assert not no_matches


def test_matcher_phrase_matcher(en_vocab):
    doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
    # intermediate phrase
    pattern = Doc(en_vocab, words=["Google", "Now"])
    matcher = PhraseMatcher(en_vocab)
    matcher.add("COMPANY", [pattern])
    assert len(matcher(doc)) == 1
    # initial token
    pattern = Doc(en_vocab, words=["I"])
    matcher = PhraseMatcher(en_vocab)
    matcher.add("I", [pattern])
    assert len(matcher(doc)) == 1
    # initial phrase
    pattern = Doc(en_vocab, words=["I", "like"])
    matcher = PhraseMatcher(en_vocab)
    matcher.add("ILIKE", [pattern])
    assert len(matcher(doc)) == 1
    # final token
    pattern = Doc(en_vocab, words=["best"])
    matcher = PhraseMatcher(en_vocab)
    matcher.add("BEST", [pattern])
    assert len(matcher(doc)) == 1
    # final phrase
    pattern = Doc(en_vocab, words=["Now", "best"])
    matcher = PhraseMatcher(en_vocab)
    matcher.add("NOWBEST", [pattern])
    assert len(matcher(doc)) == 1


def test_phrase_matcher_length(en_vocab):
    matcher = PhraseMatcher(en_vocab)
    assert len(matcher) == 0
    matcher.add("TEST", [Doc(en_vocab, words=["test"])])
    assert len(matcher) == 1
    matcher.add("TEST2", [Doc(en_vocab, words=["test2"])])
    assert len(matcher) == 2


def test_phrase_matcher_contains(en_vocab):
    matcher = PhraseMatcher(en_vocab)
    matcher.add("TEST", [Doc(en_vocab, words=["test"])])
    assert "TEST" in matcher
    assert "TEST2" not in matcher


def test_phrase_matcher_repeated_add(en_vocab):
    matcher = PhraseMatcher(en_vocab)
    # match ID only gets added once
    matcher.add("TEST", [Doc(en_vocab, words=["like"])])
    matcher.add("TEST", [Doc(en_vocab, words=["like"])])
    matcher.add("TEST", [Doc(en_vocab, words=["like"])])
    matcher.add("TEST", [Doc(en_vocab, words=["like"])])
    doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
    assert "TEST" in matcher
    assert "TEST2" not in matcher
    assert len(matcher(doc)) == 1


def test_phrase_matcher_remove(en_vocab):
    matcher = PhraseMatcher(en_vocab)
    matcher.add("TEST1", [Doc(en_vocab, words=["like"])])
    matcher.add("TEST2", [Doc(en_vocab, words=["best"])])
    doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
    assert "TEST1" in matcher
    assert "TEST2" in matcher
    assert "TEST3" not in matcher
    assert len(matcher(doc)) == 2
    matcher.remove("TEST1")
    assert "TEST1" not in matcher
    assert "TEST2" in matcher
    assert "TEST3" not in matcher
    assert len(matcher(doc)) == 1
    matcher.remove("TEST2")
    assert "TEST1" not in matcher
    assert "TEST2" not in matcher
    assert "TEST3" not in matcher
    assert len(matcher(doc)) == 0
    with pytest.raises(KeyError):
        matcher.remove("TEST3")
    assert "TEST1" not in matcher
    assert "TEST2" not in matcher
    assert "TEST3" not in matcher
    assert len(matcher(doc)) == 0


def test_phrase_matcher_overlapping_with_remove(en_vocab):
    matcher = PhraseMatcher(en_vocab)
    matcher.add("TEST", [Doc(en_vocab, words=["like"])])
    # TEST2 is added alongside TEST
    matcher.add("TEST2", [Doc(en_vocab, words=["like"])])
    doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
    assert "TEST" in matcher
    assert len(matcher) == 2
    assert len(matcher(doc)) == 2
    # removing TEST does not remove the entry for TEST2
    matcher.remove("TEST")
    assert "TEST" not in matcher
    assert len(matcher) == 1
    assert len(matcher(doc)) == 1
    assert matcher(doc)[0][0] == en_vocab.strings["TEST2"]
    # removing TEST2 removes all
    matcher.remove("TEST2")
    assert "TEST2" not in matcher
    assert len(matcher) == 0
    assert len(matcher(doc)) == 0


def test_phrase_matcher_string_attrs(en_vocab):
    words1 = ["I", "like", "cats"]
    pos1 = ["PRON", "VERB", "NOUN"]
    words2 = ["Yes", ",", "you", "hate", "dogs", "very", "much"]
    pos2 = ["INTJ", "PUNCT", "PRON", "VERB", "NOUN", "ADV", "ADV"]
    pattern = Doc(en_vocab, words=words1, pos=pos1)
    matcher = PhraseMatcher(en_vocab, attr="POS")
    matcher.add("TEST", [pattern])
    doc = Doc(en_vocab, words=words2, pos=pos2)
    matches = matcher(doc)
    assert len(matches) == 1
    match_id, start, end = matches[0]
    assert match_id == en_vocab.strings["TEST"]
    assert start == 2
    assert end == 5
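

# A small supplementary illustration (not from the original suite) of string
# attributes with attr="LOWER": matching compares each token's lowercase
# form, so a lowercase pattern matches differently-cased text.
def test_phrase_matcher_lower_attr_example(en_vocab):
    matcher = PhraseMatcher(en_vocab, attr="LOWER")
    matcher.add("GREETING", [Doc(en_vocab, words=["hello", "world"])])
    doc = Doc(en_vocab, words=["Hello", "World", "!"])
    assert len(matcher(doc)) == 1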


def test_phrase_matcher_string_attrs_negative(en_vocab):
    """Test that tokens with the control codes as ORTH are *not* matched."""
    words1 = ["I", "like", "cats"]
    pos1 = ["PRON", "VERB", "NOUN"]
    words2 = ["matcher:POS-PRON", "matcher:POS-VERB", "matcher:POS-NOUN"]
    pos2 = ["X", "X", "X"]
    pattern = Doc(en_vocab, words=words1, pos=pos1)
    matcher = PhraseMatcher(en_vocab, attr="POS")
    matcher.add("TEST", [pattern])
    doc = Doc(en_vocab, words=words2, pos=pos2)
    matches = matcher(doc)
    assert len(matches) == 0


def test_phrase_matcher_bool_attrs(en_vocab):
    words1 = ["Hello", "world", "!"]
    words2 = ["No", "problem", ",", "he", "said", "."]
    pattern = Doc(en_vocab, words=words1)
    matcher = PhraseMatcher(en_vocab, attr="IS_PUNCT")
    matcher.add("TEST", [pattern])
    doc = Doc(en_vocab, words=words2)
    matches = matcher(doc)
    assert len(matches) == 2
    match_id1, start1, end1 = matches[0]
    match_id2, start2, end2 = matches[1]
    assert match_id1 == en_vocab.strings["TEST"]
    assert match_id2 == en_vocab.strings["TEST"]
    assert start1 == 0
    assert end1 == 3
    assert start2 == 3
    assert end2 == 6
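
# With attr="IS_PUNCT", the pattern above is effectively the boolean sequence
# [False, False, True], which both "No problem ," and "he said ." match.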


def test_phrase_matcher_validation(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
    doc1[0].dep_ = "ROOT"
    doc2 = Doc(en_vocab, words=["Test"])
    doc2[0].tag_ = "TAG"
    doc2[0].pos_ = "X"
    doc2[0].set_morph("Feat=Val")
    doc3 = Doc(en_vocab, words=["Test"])
    matcher = PhraseMatcher(en_vocab, validate=True)
    with pytest.warns(UserWarning):
        matcher.add("TEST1", [doc1])
    with pytest.warns(UserWarning):
        matcher.add("TEST2", [doc2])
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        matcher.add("TEST3", [doc3])
    matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        matcher.add("TEST4", [doc2])


def test_attr_validation(en_vocab):
    with pytest.raises(ValueError):
        PhraseMatcher(en_vocab, attr="UNSUPPORTED")


def test_attr_pipeline_checks(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
    doc1[0].dep_ = "ROOT"
    doc2 = Doc(en_vocab, words=["Test"])
    doc2[0].tag_ = "TAG"
    doc2[0].pos_ = "X"
    doc2[0].set_morph("Feat=Val")
    doc2[0].lemma_ = "LEMMA"
    doc3 = Doc(en_vocab, words=["Test"])
    # DEP requires DEP
    matcher = PhraseMatcher(en_vocab, attr="DEP")
    matcher.add("TEST1", [doc1])
    with pytest.raises(ValueError):
        matcher.add("TEST2", [doc2])
    with pytest.raises(ValueError):
        matcher.add("TEST3", [doc3])
    # TAG, POS, LEMMA require those values
    for attr in ("TAG", "POS", "LEMMA"):
        matcher = PhraseMatcher(en_vocab, attr=attr)
        matcher.add("TEST2", [doc2])
        with pytest.raises(ValueError):
            matcher.add("TEST1", [doc1])
        with pytest.raises(ValueError):
            matcher.add("TEST3", [doc3])
    # TEXT/ORTH only require tokens
    matcher = PhraseMatcher(en_vocab, attr="ORTH")
    matcher.add("TEST3", [doc3])
    matcher = PhraseMatcher(en_vocab, attr="TEXT")
    matcher.add("TEST3", [doc3])


def test_phrase_matcher_callback(en_vocab):
    mock = Mock()
    doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
    pattern = Doc(en_vocab, words=["Google", "Now"])
    matcher = PhraseMatcher(en_vocab)
    matcher.add("COMPANY", [pattern], on_match=mock)
    matches = matcher(doc)
    mock.assert_called_once_with(matcher, doc, 0, matches)
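
# For reference, a hand-written callback takes the same arguments the mock
# receives above; a minimal sketch (illustrative, not part of the original
# suite) might look like:
#
#     def on_match(matcher, doc, i, matches):
#         match_id, start, end = matches[i]
#         print(doc[start:end].text)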


def test_phrase_matcher_remove_overlapping_patterns(en_vocab):
    matcher = PhraseMatcher(en_vocab)
    pattern1 = Doc(en_vocab, words=["this"])
    pattern2 = Doc(en_vocab, words=["this", "is"])
    pattern3 = Doc(en_vocab, words=["this", "is", "a"])
    pattern4 = Doc(en_vocab, words=["this", "is", "a", "word"])
    matcher.add("THIS", [pattern1, pattern2, pattern3, pattern4])
    matcher.remove("THIS")


def test_phrase_matcher_basic_check(en_vocab):
    matcher = PhraseMatcher(en_vocab)
    # Potential mistake: pass in pattern instead of list of patterns
    pattern = Doc(en_vocab, words=["hello", "world"])
    with pytest.raises(ValueError):
        matcher.add("TEST", pattern)


def test_phrase_matcher_pickle(en_vocab):
    matcher = PhraseMatcher(en_vocab)
    mock = Mock()
    matcher.add("TEST", [Doc(en_vocab, words=["test"])])
    matcher.add("TEST2", [Doc(en_vocab, words=["test2"])], on_match=mock)
    doc = Doc(en_vocab, words=["these", "are", "tests", ":", "test", "test2"])
    assert len(matcher) == 2

    b = srsly.pickle_dumps(matcher)
    matcher_unpickled = srsly.pickle_loads(b)

    # call after pickling to avoid recursion error related to mock
    matches = matcher(doc)
    matches_unpickled = matcher_unpickled(doc)

    assert len(matcher) == len(matcher_unpickled)
    assert matches == matches_unpickled

    # clunky way to vaguely check that callback is unpickled
    (vocab, docs, callbacks, attr) = matcher_unpickled.__reduce__()[1]
    assert isinstance(callbacks.get("TEST2"), Mock)


def test_phrase_matcher_as_spans(en_vocab):
    """Test the as_spans=True API."""
    matcher = PhraseMatcher(en_vocab)
    matcher.add("A", [Doc(en_vocab, words=["hello", "world"])])
    matcher.add("B", [Doc(en_vocab, words=["test"])])
    doc = Doc(en_vocab, words=["...", "hello", "world", "this", "is", "a", "test"])
    matches = matcher(doc, as_spans=True)
    assert len(matches) == 2
    assert isinstance(matches[0], Span)
    assert matches[0].text == "hello world"
    assert matches[0].label_ == "A"
    assert isinstance(matches[1], Span)
    assert matches[1].text == "test"
    assert matches[1].label_ == "B"
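
# The returned Spans carry the rule key as their label, so they can be used
# directly as entity candidates (e.g. filtered and assigned to doc.ents).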


def test_phrase_matcher_deprecated(en_vocab):
    matcher = PhraseMatcher(en_vocab)
    matcher.add("TEST", [Doc(en_vocab, words=["helllo"])])
    doc = Doc(en_vocab, words=["hello", "world"])
    with pytest.warns(DeprecationWarning) as record:
        for _ in matcher.pipe([doc]):
            pass
        assert record.list
        assert "spaCy v3.0" in str(record.list[0].message)


def test_phrase_matcher_non_doc(en_vocab):
    matcher = PhraseMatcher(en_vocab)
    doc = Doc(en_vocab, words=["hello", "world"])
    with pytest.raises(ValueError):
        matcher.add("TEST", [doc, "junk"])


@pytest.mark.parametrize("attr", ["SENT_START", "IS_SENT_START"])
def test_phrase_matcher_sent_start(en_vocab, attr):
    # Constructing a matcher on sentence-start attributes should not raise.
    _ = PhraseMatcher(en_vocab, attr=attr)  # noqa: F841


def test_span_in_phrasematcher(en_vocab):
    """Ensure that PhraseMatcher accepts Span and Doc as input"""
    # fmt: off
    words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."]
    # fmt: on
    doc = Doc(en_vocab, words=words)
    span = doc[:8]
    pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
    matcher = PhraseMatcher(en_vocab)
    matcher.add("SPACY", [pattern])
    matches_doc = matcher(doc)
    matches_span = matcher(span)
    assert len(matches_doc) == 1
    assert len(matches_span) == 1


def test_span_v_doc_in_phrasematcher(en_vocab):
    """Ensure that PhraseMatcher only returns matches in the input Span and
    not in the entire Doc."""
    # fmt: off
    words = [
        "I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "Spans",
        "and", "Docs", "in", "my", "matchers", ",", "and", "Spans", "and", "Docs",
        "everywhere", "."
    ]
    # fmt: on
    doc = Doc(en_vocab, words=words)
    span = doc[9:15]  # second clause
    pattern = Doc(en_vocab, words=["Spans", "and", "Docs"])
    matcher = PhraseMatcher(en_vocab)
    matcher.add("SPACY", [pattern])
    matches_doc = matcher(doc)
    matches_span = matcher(span)
    assert len(matches_doc) == 3
    assert len(matches_span) == 1