From cfffdba7b18d2f55df21c0d9b21a1ee0a7665cc8 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Fri, 25 Oct 2019 22:21:08 +0200
Subject: [PATCH] Implement new API for {Phrase}Matcher.add
 (backwards-compatible) (#4522)

* Implement new API for {Phrase}Matcher.add (backwards-compatible)

* Update docs

* Also update DependencyMatcher.add

* Update internals

* Rewrite tests to use new API

* Add basic check for common mistake

Raise error with suggestion if user likely passed in a pattern instead of a
list of patterns

* Fix typo [ci skip]
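In short, the signature changes like this (a minimal sketch; the "GoogleNow"
key and the patterns are only illustrative):

```python
from spacy.matcher import Matcher
from spacy.vocab import Vocab

matcher = Matcher(Vocab())
patterns = [[{"ORTH": "Google"}, {"ORTH": "Now"}], [{"ORTH": "GoogleNow"}]]
# Old API, still supported: callback (or None) second, patterns as *args
matcher.add("GoogleNow", None, *patterns)
# New API: patterns as a single list, callback as an optional keyword argument
matcher.add("GoogleNow", patterns, on_match=None)
```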
---
 spacy/errors.py                               |  7 ++
 spacy/matcher/dependencymatcher.pyx           |  5 +-
 spacy/matcher/matcher.pyx                     | 19 +++-
 spacy/matcher/phrasematcher.pyx               | 17 +++-
 spacy/pipeline/entityruler.py                 |  4 +-
 spacy/tests/matcher/test_matcher_api.py       | 95 +++++++++++-------
 spacy/tests/matcher/test_matcher_logic.py     | 12 +--
 .../tests/matcher/test_pattern_validation.py  |  6 +-
 spacy/tests/matcher/test_phrase_matcher.py    | 96 ++++++++++++-------
 spacy/tests/regression/test_issue1-1000.py    | 30 +++---
 spacy/tests/regression/test_issue1001-1500.py |  4 +-
 spacy/tests/regression/test_issue1501-2000.py | 14 +--
 spacy/tests/regression/test_issue2001-2500.py |  2 +-
 spacy/tests/regression/test_issue2501-3000.py |  4 +-
 spacy/tests/regression/test_issue3001-3500.py | 16 ++--
 spacy/tests/regression/test_issue3549.py      |  4 +-
 spacy/tests/regression/test_issue3555.py      |  2 +-
 spacy/tests/regression/test_issue3839.py      |  4 +-
 spacy/tests/regression/test_issue3879.py      |  2 +-
 spacy/tests/regression/test_issue3951.py      |  2 +-
 spacy/tests/regression/test_issue3972.py      |  4 +-
 spacy/tests/regression/test_issue4002.py      |  4 +-
 spacy/tests/regression/test_issue4120.py      |  6 +-
 website/docs/api/matcher.md                   | 17 ++--
 website/docs/api/phrasematcher.md             | 17 ++++
 25 files changed, 250 insertions(+), 143 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 23203d98a..ddf14585b 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -504,6 +504,13 @@ class Errors(object):
     E175 = ("Can't remove rule for unknown match pattern ID: {key}")
     E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
     E177 = ("Ill-formed IOB input detected: {tag}")
+    E178 = ("Invalid pattern. Expected list of dicts but got: {pat}. Maybe you "
+            "accidentally passed a single pattern to Matcher.add instead of a "
+            "list of patterns? If you only want to add one pattern, make sure "
+            "to wrap it in a list. For example: matcher.add('{key}', [pattern])")
+    E179 = ("Invalid pattern. Expected a list of Doc objects but got a single "
+            "Doc. If you only want to add one pattern, make sure to wrap it "
+            "in a list. For example: matcher.add('{key}', [doc])")
 
 
 @add_codes
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index b58d36d62..ae2ad3ca6 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -102,7 +102,10 @@ cdef class DependencyMatcher:
                 visitedNodes[relation["SPEC"]["NBOR_NAME"]] = True
             idx = idx + 1
 
-    def add(self, key, on_match, *patterns):
+    def add(self, key, patterns, *_patterns, on_match=None):
+        if patterns is None or hasattr(patterns, "__call__"):  # old API
+            on_match = patterns
+            patterns = _patterns
         for pattern in patterns:
             if len(pattern) == 0:
                 raise ValueError(Errors.E012.format(key=key))
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index af0450592..6f6848102 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -74,7 +74,7 @@ cdef class Matcher:
         """
         return self._normalize_key(key) in self._patterns
 
-    def add(self, key, on_match, *patterns):
+    def add(self, key, patterns, *_patterns, on_match=None):
         """Add a match-rule to the matcher. A match-rule consists of: an ID
         key, an on_match callback, and one or more patterns.
 
@@ -98,16 +98,29 @@ cdef class Matcher:
         operator will behave non-greedily. This quirk in the semantics makes
         the matcher more efficient, by avoiding the need for back-tracking.
 
+        As of spaCy v2.2.2, Matcher.add supports the future API, which makes
+        the patterns the second argument: a single list of patterns, instead
+        of a variable number of arguments. The on_match callback becomes an
+        optional keyword argument.
+
         key (unicode): The match ID.
-        on_match (callable): Callback executed on match.
-        *patterns (list): List of token descriptions.
+        patterns (list): The patterns to add for the given key.
+        on_match (callable): Optional callback executed on match.
+        *_patterns (list): For backwards compatibility: list of patterns to add
+            as variable arguments. Will be ignored if a list of patterns is
+            provided as the second argument.
         """
         errors = {}
         if on_match is not None and not hasattr(on_match, "__call__"):
             raise ValueError(Errors.E171.format(arg_type=type(on_match)))
+        if patterns is None or hasattr(patterns, "__call__"):  # old API
+            on_match = patterns
+            patterns = _patterns
         for i, pattern in enumerate(patterns):
             if len(pattern) == 0:
                 raise ValueError(Errors.E012.format(key=key))
+            if not isinstance(pattern, list):
+                raise ValueError(Errors.E178.format(pat=pattern, key=key))
             if self.validator:
                 errors[i] = validate_json(pattern, self.validator)
         if any(err for err in errors.values()):
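The practical effect of the new E178 check: a single pattern passed where a
list of patterns is expected now fails fast with a pointed message. A small
illustration (the key and pattern are hypothetical):

```python
from spacy.matcher import Matcher
from spacy.vocab import Vocab

matcher = Matcher(Vocab())
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]  # one pattern: a list of dicts
matcher.add("HelloWorld", [pattern])  # correct: a list containing one pattern
# matcher.add("HelloWorld", pattern)  # would raise ValueError (E178), because
#                                     # the items inside a pattern are dicts,
#                                     # not lists
```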
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 135e81efe..4de5782f9 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -152,16 +152,27 @@ cdef class PhraseMatcher:
         del self._callbacks[key]
         del self._docs[key]
 
-    def add(self, key, on_match, *docs):
+    def add(self, key, docs, *_docs, on_match=None):
         """Add a match-rule to the phrase-matcher. A match-rule consists of:
         an ID key, an on_match callback, and one or more patterns.
 
+        As of spaCy v2.2.2, PhraseMatcher.add supports the future API, which
+        makes the patterns the second argument: a single list of Doc objects,
+        instead of a variable number of arguments. The on_match callback
+        becomes an optional keyword argument.
+
         key (unicode): The match ID.
+        docs (list): List of `Doc` objects representing match patterns.
         on_match (callable): Callback executed on match.
-        *docs (Doc): `Doc` objects representing match patterns.
+        *_docs (Doc): For backwards compatibility: list of patterns to add
+            as variable arguments. Will be ignored if a list of patterns is
+            provided as the second argument.
 
         DOCS: https://spacy.io/api/phrasematcher#add
         """
+        if docs is None or hasattr(docs, "__call__"):  # old API
+            on_match = docs
+            docs = _docs
         _ = self.vocab[key]
         self._callbacks[key] = on_match
@@ -171,6 +182,8 @@ cdef class PhraseMatcher:
         cdef MapStruct* internal_node
         cdef void* result
 
+        if isinstance(docs, Doc):
+            raise ValueError(Errors.E179.format(key=key))
         for doc in docs:
             if len(doc) == 0:
                 continue
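The same wrap-it-in-a-list rule applies to the PhraseMatcher, where patterns
are Doc objects and the new E179 check catches a bare Doc. A sketch (the key
and phrases are illustrative):

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
patterns = [nlp("Barack Obama"), nlp("Angela Merkel")]
matcher.add("PERSON", patterns)  # new API: one list of Doc patterns
# matcher.add("PERSON", nlp("Barack Obama"))  # would raise ValueError (E179)
doc = nlp("Angela Merkel met Barack Obama")
assert len(matcher(doc)) == 2
```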
words=["I", "like", "cheese", "."]) assert matcher(doc) == [] @@ -100,12 +122,12 @@ def test_matcher_empty_dict(en_vocab): """Test matcher allows empty token specs, meaning match on any token.""" matcher = Matcher(en_vocab) doc = Doc(matcher.vocab, words=["a", "b", "c"]) - matcher.add("A.C", None, [{"ORTH": "a"}, {}, {"ORTH": "c"}]) + matcher.add("A.C", [[{"ORTH": "a"}, {}, {"ORTH": "c"}]]) matches = matcher(doc) assert len(matches) == 1 assert matches[0][1:] == (0, 3) matcher = Matcher(en_vocab) - matcher.add("A.", None, [{"ORTH": "a"}, {}]) + matcher.add("A.", [[{"ORTH": "a"}, {}]]) matches = matcher(doc) assert matches[0][1:] == (0, 2) @@ -114,7 +136,7 @@ def test_matcher_operator_shadow(en_vocab): matcher = Matcher(en_vocab) doc = Doc(matcher.vocab, words=["a", "b", "c"]) pattern = [{"ORTH": "a"}, {"IS_ALPHA": True, "OP": "+"}, {"ORTH": "c"}] - matcher.add("A.C", None, pattern) + matcher.add("A.C", [pattern]) matches = matcher(doc) assert len(matches) == 1 assert matches[0][1:] == (0, 3) @@ -136,12 +158,12 @@ def test_matcher_match_zero(matcher): {"IS_PUNCT": True}, {"ORTH": '"'}, ] - matcher.add("Quote", None, pattern1) + matcher.add("Quote", [pattern1]) doc = Doc(matcher.vocab, words=words1) assert len(matcher(doc)) == 1 doc = Doc(matcher.vocab, words=words2) assert len(matcher(doc)) == 0 - matcher.add("Quote", None, pattern2) + matcher.add("Quote", [pattern2]) assert len(matcher(doc)) == 0 @@ -149,7 +171,7 @@ def test_matcher_match_zero_plus(matcher): words = 'He said , " some words " ...'.split() pattern = [{"ORTH": '"'}, {"OP": "*", "IS_PUNCT": False}, {"ORTH": '"'}] matcher = Matcher(matcher.vocab) - matcher.add("Quote", None, pattern) + matcher.add("Quote", [pattern]) doc = Doc(matcher.vocab, words=words) assert len(matcher(doc)) == 1 @@ -160,11 +182,8 @@ def test_matcher_match_one_plus(matcher): doc = Doc(control.vocab, words=["Philippe", "Philippe"]) m = control(doc) assert len(m) == 2 - matcher.add( - "KleenePhilippe", - None, - [{"ORTH": "Philippe", "OP": "1"}, {"ORTH": "Philippe", "OP": "+"}], - ) + pattern = [{"ORTH": "Philippe", "OP": "1"}, {"ORTH": "Philippe", "OP": "+"}] + matcher.add("KleenePhilippe", [pattern]) m = matcher(doc) assert len(m) == 1 @@ -172,7 +191,7 @@ def test_matcher_match_one_plus(matcher): def test_matcher_any_token_operator(en_vocab): """Test that patterns with "any token" {} work with operators.""" matcher = Matcher(en_vocab) - matcher.add("TEST", None, [{"ORTH": "test"}, {"OP": "*"}]) + matcher.add("TEST", [[{"ORTH": "test"}, {"OP": "*"}]]) doc = Doc(en_vocab, words=["test", "hello", "world"]) matches = [doc[start:end].text for _, start, end in matcher(doc)] assert len(matches) == 3 @@ -186,7 +205,7 @@ def test_matcher_extension_attribute(en_vocab): get_is_fruit = lambda token: token.text in ("apple", "banana") Token.set_extension("is_fruit", getter=get_is_fruit, force=True) pattern = [{"ORTH": "an"}, {"_": {"is_fruit": True}}] - matcher.add("HAVING_FRUIT", None, pattern) + matcher.add("HAVING_FRUIT", [pattern]) doc = Doc(en_vocab, words=["an", "apple"]) matches = matcher(doc) assert len(matches) == 1 @@ -198,7 +217,7 @@ def test_matcher_extension_attribute(en_vocab): def test_matcher_set_value(en_vocab): matcher = Matcher(en_vocab) pattern = [{"ORTH": {"IN": ["an", "a"]}}] - matcher.add("A_OR_AN", None, pattern) + matcher.add("A_OR_AN", [pattern]) doc = Doc(en_vocab, words=["an", "a", "apple"]) matches = matcher(doc) assert len(matches) == 2 @@ -210,7 +229,7 @@ def test_matcher_set_value(en_vocab): def test_matcher_set_value_operator(en_vocab): 
matcher = Matcher(en_vocab) pattern = [{"ORTH": {"IN": ["a", "the"]}, "OP": "?"}, {"ORTH": "house"}] - matcher.add("DET_HOUSE", None, pattern) + matcher.add("DET_HOUSE", [pattern]) doc = Doc(en_vocab, words=["In", "a", "house"]) matches = matcher(doc) assert len(matches) == 2 @@ -222,7 +241,7 @@ def test_matcher_set_value_operator(en_vocab): def test_matcher_regex(en_vocab): matcher = Matcher(en_vocab) pattern = [{"ORTH": {"REGEX": r"(?:a|an)"}}] - matcher.add("A_OR_AN", None, pattern) + matcher.add("A_OR_AN", [pattern]) doc = Doc(en_vocab, words=["an", "a", "hi"]) matches = matcher(doc) assert len(matches) == 2 @@ -234,7 +253,7 @@ def test_matcher_regex(en_vocab): def test_matcher_regex_shape(en_vocab): matcher = Matcher(en_vocab) pattern = [{"SHAPE": {"REGEX": r"^[^x]+$"}}] - matcher.add("NON_ALPHA", None, pattern) + matcher.add("NON_ALPHA", [pattern]) doc = Doc(en_vocab, words=["99", "problems", "!"]) matches = matcher(doc) assert len(matches) == 2 @@ -246,7 +265,7 @@ def test_matcher_regex_shape(en_vocab): def test_matcher_compare_length(en_vocab): matcher = Matcher(en_vocab) pattern = [{"LENGTH": {">=": 2}}] - matcher.add("LENGTH_COMPARE", None, pattern) + matcher.add("LENGTH_COMPARE", [pattern]) doc = Doc(en_vocab, words=["a", "aa", "aaa"]) matches = matcher(doc) assert len(matches) == 2 @@ -260,7 +279,7 @@ def test_matcher_extension_set_membership(en_vocab): get_reversed = lambda token: "".join(reversed(token.text)) Token.set_extension("reversed", getter=get_reversed, force=True) pattern = [{"_": {"reversed": {"IN": ["eyb", "ih"]}}}] - matcher.add("REVERSED", None, pattern) + matcher.add("REVERSED", [pattern]) doc = Doc(en_vocab, words=["hi", "bye", "hello"]) matches = matcher(doc) assert len(matches) == 2 @@ -328,9 +347,9 @@ def dependency_matcher(en_vocab): ] matcher = DependencyMatcher(en_vocab) - matcher.add("pattern1", None, pattern1) - matcher.add("pattern2", None, pattern2) - matcher.add("pattern3", None, pattern3) + matcher.add("pattern1", [pattern1]) + matcher.add("pattern2", [pattern2]) + matcher.add("pattern3", [pattern3]) return matcher @@ -347,6 +366,14 @@ def test_dependency_matcher_compile(dependency_matcher): # assert matches[2][1] == [[4, 3, 2]] +def test_matcher_basic_check(en_vocab): + matcher = Matcher(en_vocab) + # Potential mistake: pass in pattern instead of list of patterns + pattern = [{"TEXT": "hello"}, {"TEXT": "world"}] + with pytest.raises(ValueError): + matcher.add("TEST", pattern) + + def test_attr_pipeline_checks(en_vocab): doc1 = Doc(en_vocab, words=["Test"]) doc1.is_parsed = True @@ -355,7 +382,7 @@ def test_attr_pipeline_checks(en_vocab): doc3 = Doc(en_vocab, words=["Test"]) # DEP requires is_parsed matcher = Matcher(en_vocab) - matcher.add("TEST", None, [{"DEP": "a"}]) + matcher.add("TEST", [[{"DEP": "a"}]]) matcher(doc1) with pytest.raises(ValueError): matcher(doc2) @@ -364,7 +391,7 @@ def test_attr_pipeline_checks(en_vocab): # TAG, POS, LEMMA require is_tagged for attr in ("TAG", "POS", "LEMMA"): matcher = Matcher(en_vocab) - matcher.add("TEST", None, [{attr: "a"}]) + matcher.add("TEST", [[{attr: "a"}]]) matcher(doc2) with pytest.raises(ValueError): matcher(doc1) @@ -372,12 +399,12 @@ def test_attr_pipeline_checks(en_vocab): matcher(doc3) # TEXT/ORTH only require tokens matcher = Matcher(en_vocab) - matcher.add("TEST", None, [{"ORTH": "a"}]) + matcher.add("TEST", [[{"ORTH": "a"}]]) matcher(doc1) matcher(doc2) matcher(doc3) matcher = Matcher(en_vocab) - matcher.add("TEST", None, [{"TEXT": "a"}]) + matcher.add("TEST", [[{"TEXT": "a"}]]) 
matcher(doc1) matcher(doc2) matcher(doc3) @@ -407,7 +434,7 @@ def test_attr_pipeline_checks(en_vocab): def test_matcher_schema_token_attributes(en_vocab, pattern, text): matcher = Matcher(en_vocab) doc = Doc(en_vocab, words=text.split(" ")) - matcher.add("Rule", None, pattern) + matcher.add("Rule", [pattern]) assert len(matcher) == 1 matches = matcher(doc) assert len(matches) == 1 @@ -417,7 +444,7 @@ def test_matcher_valid_callback(en_vocab): """Test that on_match can only be None or callable.""" matcher = Matcher(en_vocab) with pytest.raises(ValueError): - matcher.add("TEST", [], [{"TEXT": "test"}]) + matcher.add("TEST", [[{"TEXT": "test"}]], on_match=[]) matcher(Doc(en_vocab, words=["test"])) @@ -425,7 +452,7 @@ def test_matcher_callback(en_vocab): mock = Mock() matcher = Matcher(en_vocab) pattern = [{"ORTH": "test"}] - matcher.add("Rule", mock, pattern) + matcher.add("Rule", [pattern], on_match=mock) doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) matches = matcher(doc) mock.assert_called_once_with(matcher, doc, 0, matches) diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py index b9c435c17..240ace537 100644 --- a/spacy/tests/matcher/test_matcher_logic.py +++ b/spacy/tests/matcher/test_matcher_logic.py @@ -55,7 +55,7 @@ def test_greedy_matching(doc, text, pattern, re_pattern): """Test that the greedy matching behavior of the * op is consistant with other re implementations.""" matcher = Matcher(doc.vocab) - matcher.add(re_pattern, None, pattern) + matcher.add(re_pattern, [pattern]) matches = matcher(doc) re_matches = [m.span() for m in re.finditer(re_pattern, text)] for match, re_match in zip(matches, re_matches): @@ -77,7 +77,7 @@ def test_match_consuming(doc, text, pattern, re_pattern): """Test that matcher.__call__ consumes tokens on a match similar to re.findall.""" matcher = Matcher(doc.vocab) - matcher.add(re_pattern, None, pattern) + matcher.add(re_pattern, [pattern]) matches = matcher(doc) re_matches = [m.span() for m in re.finditer(re_pattern, text)] assert len(matches) == len(re_matches) @@ -111,7 +111,7 @@ def test_operator_combos(en_vocab): pattern.append({"ORTH": part[0], "OP": "+"}) else: pattern.append({"ORTH": part}) - matcher.add("PATTERN", None, pattern) + matcher.add("PATTERN", [pattern]) matches = matcher(doc) if result: assert matches, (string, pattern_str) @@ -123,7 +123,7 @@ def test_matcher_end_zero_plus(en_vocab): """Test matcher works when patterns end with * operator. 
(issue 1450)""" matcher = Matcher(en_vocab) pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}] - matcher.add("TSTEND", None, pattern) + matcher.add("TSTEND", [pattern]) nlp = lambda string: Doc(matcher.vocab, words=string.split()) assert len(matcher(nlp("a"))) == 1 assert len(matcher(nlp("a b"))) == 2 @@ -140,7 +140,7 @@ def test_matcher_sets_return_correct_tokens(en_vocab): [{"LOWER": {"IN": ["one"]}}], [{"LOWER": {"IN": ["two"]}}], ] - matcher.add("TEST", None, *patterns) + matcher.add("TEST", patterns) doc = Doc(en_vocab, words="zero one two three".split()) matches = matcher(doc) texts = [Span(doc, s, e, label=L).text for L, s, e in matches] @@ -154,7 +154,7 @@ def test_matcher_remove(): pattern = [{"ORTH": "test"}, {"OP": "?"}] assert len(matcher) == 0 - matcher.add("Rule", None, pattern) + matcher.add("Rule", [pattern]) assert "Rule" in matcher # should give two matches diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py index 80f08e40c..2db2f9eb3 100644 --- a/spacy/tests/matcher/test_pattern_validation.py +++ b/spacy/tests/matcher/test_pattern_validation.py @@ -50,7 +50,7 @@ def validator(): def test_matcher_pattern_validation(en_vocab, pattern): matcher = Matcher(en_vocab, validate=True) with pytest.raises(MatchPatternError): - matcher.add("TEST", None, pattern) + matcher.add("TEST", [pattern]) @pytest.mark.parametrize("pattern,n_errors,_", TEST_PATTERNS) @@ -71,6 +71,6 @@ def test_minimal_pattern_validation(en_vocab, pattern, n_errors, n_min_errors): matcher = Matcher(en_vocab) if n_min_errors > 0: with pytest.raises(ValueError): - matcher.add("TEST", None, pattern) + matcher.add("TEST", [pattern]) elif n_errors == 0: - matcher.add("TEST", None, pattern) + matcher.add("TEST", [pattern]) diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 2a7532e85..7a6585e06 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -13,53 +13,75 @@ def test_matcher_phrase_matcher(en_vocab): # intermediate phrase pattern = Doc(en_vocab, words=["Google", "Now"]) matcher = PhraseMatcher(en_vocab) - matcher.add("COMPANY", None, pattern) + matcher.add("COMPANY", [pattern]) assert len(matcher(doc)) == 1 # initial token pattern = Doc(en_vocab, words=["I"]) matcher = PhraseMatcher(en_vocab) - matcher.add("I", None, pattern) + matcher.add("I", [pattern]) assert len(matcher(doc)) == 1 # initial phrase pattern = Doc(en_vocab, words=["I", "like"]) matcher = PhraseMatcher(en_vocab) - matcher.add("ILIKE", None, pattern) + matcher.add("ILIKE", [pattern]) assert len(matcher(doc)) == 1 # final token pattern = Doc(en_vocab, words=["best"]) matcher = PhraseMatcher(en_vocab) - matcher.add("BEST", None, pattern) + matcher.add("BEST", [pattern]) assert len(matcher(doc)) == 1 # final phrase pattern = Doc(en_vocab, words=["Now", "best"]) matcher = PhraseMatcher(en_vocab) - matcher.add("NOWBEST", None, pattern) + matcher.add("NOWBEST", [pattern]) assert len(matcher(doc)) == 1 def test_phrase_matcher_length(en_vocab): matcher = PhraseMatcher(en_vocab) assert len(matcher) == 0 - matcher.add("TEST", None, Doc(en_vocab, words=["test"])) + matcher.add("TEST", [Doc(en_vocab, words=["test"])]) assert len(matcher) == 1 - matcher.add("TEST2", None, Doc(en_vocab, words=["test2"])) + matcher.add("TEST2", [Doc(en_vocab, words=["test2"])]) assert len(matcher) == 2 def test_phrase_matcher_contains(en_vocab): matcher = PhraseMatcher(en_vocab) - matcher.add("TEST", None, 
Doc(en_vocab, words=["test"])) + matcher.add("TEST", [Doc(en_vocab, words=["test"])]) assert "TEST" in matcher assert "TEST2" not in matcher +def test_phrase_matcher_add_new_api(en_vocab): + doc = Doc(en_vocab, words=["a", "b"]) + patterns = [Doc(en_vocab, words=["a"]), Doc(en_vocab, words=["a", "b"])] + matcher = PhraseMatcher(en_vocab) + matcher.add("OLD_API", None, *patterns) + assert len(matcher(doc)) == 2 + matcher = PhraseMatcher(en_vocab) + on_match = Mock() + matcher.add("OLD_API_CALLBACK", on_match, *patterns) + assert len(matcher(doc)) == 2 + assert on_match.call_count == 2 + # New API: add(key: str, patterns: List[List[dict]], on_match: Callable) + matcher = PhraseMatcher(en_vocab) + matcher.add("NEW_API", patterns) + assert len(matcher(doc)) == 2 + matcher = PhraseMatcher(en_vocab) + on_match = Mock() + matcher.add("NEW_API_CALLBACK", patterns, on_match=on_match) + assert len(matcher(doc)) == 2 + assert on_match.call_count == 2 + + def test_phrase_matcher_repeated_add(en_vocab): matcher = PhraseMatcher(en_vocab) # match ID only gets added once - matcher.add("TEST", None, Doc(en_vocab, words=["like"])) - matcher.add("TEST", None, Doc(en_vocab, words=["like"])) - matcher.add("TEST", None, Doc(en_vocab, words=["like"])) - matcher.add("TEST", None, Doc(en_vocab, words=["like"])) + matcher.add("TEST", [Doc(en_vocab, words=["like"])]) + matcher.add("TEST", [Doc(en_vocab, words=["like"])]) + matcher.add("TEST", [Doc(en_vocab, words=["like"])]) + matcher.add("TEST", [Doc(en_vocab, words=["like"])]) doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"]) assert "TEST" in matcher assert "TEST2" not in matcher @@ -68,8 +90,8 @@ def test_phrase_matcher_repeated_add(en_vocab): def test_phrase_matcher_remove(en_vocab): matcher = PhraseMatcher(en_vocab) - matcher.add("TEST1", None, Doc(en_vocab, words=["like"])) - matcher.add("TEST2", None, Doc(en_vocab, words=["best"])) + matcher.add("TEST1", [Doc(en_vocab, words=["like"])]) + matcher.add("TEST2", [Doc(en_vocab, words=["best"])]) doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"]) assert "TEST1" in matcher assert "TEST2" in matcher @@ -95,9 +117,9 @@ def test_phrase_matcher_remove(en_vocab): def test_phrase_matcher_overlapping_with_remove(en_vocab): matcher = PhraseMatcher(en_vocab) - matcher.add("TEST", None, Doc(en_vocab, words=["like"])) + matcher.add("TEST", [Doc(en_vocab, words=["like"])]) # TEST2 is added alongside TEST - matcher.add("TEST2", None, Doc(en_vocab, words=["like"])) + matcher.add("TEST2", [Doc(en_vocab, words=["like"])]) doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"]) assert "TEST" in matcher assert len(matcher) == 2 @@ -122,7 +144,7 @@ def test_phrase_matcher_string_attrs(en_vocab): pos2 = ["INTJ", "PUNCT", "PRON", "VERB", "NOUN", "ADV", "ADV"] pattern = get_doc(en_vocab, words=words1, pos=pos1) matcher = PhraseMatcher(en_vocab, attr="POS") - matcher.add("TEST", None, pattern) + matcher.add("TEST", [pattern]) doc = get_doc(en_vocab, words=words2, pos=pos2) matches = matcher(doc) assert len(matches) == 1 @@ -140,7 +162,7 @@ def test_phrase_matcher_string_attrs_negative(en_vocab): pos2 = ["X", "X", "X"] pattern = get_doc(en_vocab, words=words1, pos=pos1) matcher = PhraseMatcher(en_vocab, attr="POS") - matcher.add("TEST", None, pattern) + matcher.add("TEST", [pattern]) doc = get_doc(en_vocab, words=words2, pos=pos2) matches = matcher(doc) assert len(matches) == 0 @@ -151,7 +173,7 @@ def test_phrase_matcher_bool_attrs(en_vocab): words2 = ["No", "problem", ",", "he", "said", "."] 
pattern = Doc(en_vocab, words=words1) matcher = PhraseMatcher(en_vocab, attr="IS_PUNCT") - matcher.add("TEST", None, pattern) + matcher.add("TEST", [pattern]) doc = Doc(en_vocab, words=words2) matches = matcher(doc) assert len(matches) == 2 @@ -173,15 +195,15 @@ def test_phrase_matcher_validation(en_vocab): doc3 = Doc(en_vocab, words=["Test"]) matcher = PhraseMatcher(en_vocab, validate=True) with pytest.warns(UserWarning): - matcher.add("TEST1", None, doc1) + matcher.add("TEST1", [doc1]) with pytest.warns(UserWarning): - matcher.add("TEST2", None, doc2) + matcher.add("TEST2", [doc2]) with pytest.warns(None) as record: - matcher.add("TEST3", None, doc3) + matcher.add("TEST3", [doc3]) assert not record.list matcher = PhraseMatcher(en_vocab, attr="POS", validate=True) with pytest.warns(None) as record: - matcher.add("TEST4", None, doc2) + matcher.add("TEST4", [doc2]) assert not record.list @@ -198,24 +220,24 @@ def test_attr_pipeline_checks(en_vocab): doc3 = Doc(en_vocab, words=["Test"]) # DEP requires is_parsed matcher = PhraseMatcher(en_vocab, attr="DEP") - matcher.add("TEST1", None, doc1) + matcher.add("TEST1", [doc1]) with pytest.raises(ValueError): - matcher.add("TEST2", None, doc2) + matcher.add("TEST2", [doc2]) with pytest.raises(ValueError): - matcher.add("TEST3", None, doc3) + matcher.add("TEST3", [doc3]) # TAG, POS, LEMMA require is_tagged for attr in ("TAG", "POS", "LEMMA"): matcher = PhraseMatcher(en_vocab, attr=attr) - matcher.add("TEST2", None, doc2) + matcher.add("TEST2", [doc2]) with pytest.raises(ValueError): - matcher.add("TEST1", None, doc1) + matcher.add("TEST1", [doc1]) with pytest.raises(ValueError): - matcher.add("TEST3", None, doc3) + matcher.add("TEST3", [doc3]) # TEXT/ORTH only require tokens matcher = PhraseMatcher(en_vocab, attr="ORTH") - matcher.add("TEST3", None, doc3) + matcher.add("TEST3", [doc3]) matcher = PhraseMatcher(en_vocab, attr="TEXT") - matcher.add("TEST3", None, doc3) + matcher.add("TEST3", [doc3]) def test_phrase_matcher_callback(en_vocab): @@ -223,7 +245,7 @@ def test_phrase_matcher_callback(en_vocab): doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"]) pattern = Doc(en_vocab, words=["Google", "Now"]) matcher = PhraseMatcher(en_vocab) - matcher.add("COMPANY", mock, pattern) + matcher.add("COMPANY", [pattern], on_match=mock) matches = matcher(doc) mock.assert_called_once_with(matcher, doc, 0, matches) @@ -234,5 +256,13 @@ def test_phrase_matcher_remove_overlapping_patterns(en_vocab): pattern2 = Doc(en_vocab, words=["this", "is"]) pattern3 = Doc(en_vocab, words=["this", "is", "a"]) pattern4 = Doc(en_vocab, words=["this", "is", "a", "word"]) - matcher.add("THIS", None, pattern1, pattern2, pattern3, pattern4) + matcher.add("THIS", [pattern1, pattern2, pattern3, pattern4]) matcher.remove("THIS") + + +def test_phrase_matcher_basic_check(en_vocab): + matcher = PhraseMatcher(en_vocab) + # Potential mistake: pass in pattern instead of list of patterns + pattern = Doc(en_vocab, words=["hello", "world"]) + with pytest.raises(ValueError): + matcher.add("TEST", pattern) diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index 989eba805..6d88d68c2 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -30,7 +30,7 @@ def test_issue118(en_tokenizer, patterns): doc = en_tokenizer(text) ORG = doc.vocab.strings["ORG"] matcher = Matcher(doc.vocab) - matcher.add("BostonCeltics", None, *patterns) + matcher.add("BostonCeltics", patterns) assert 
len(list(doc.ents)) == 0 matches = [(ORG, start, end) for _, start, end in matcher(doc)] assert matches == [(ORG, 9, 11), (ORG, 10, 11)] @@ -57,7 +57,7 @@ def test_issue118_prefix_reorder(en_tokenizer, patterns): doc = en_tokenizer(text) ORG = doc.vocab.strings["ORG"] matcher = Matcher(doc.vocab) - matcher.add("BostonCeltics", None, *patterns) + matcher.add("BostonCeltics", patterns) assert len(list(doc.ents)) == 0 matches = [(ORG, start, end) for _, start, end in matcher(doc)] doc.ents += tuple(matches)[1:] @@ -78,7 +78,7 @@ def test_issue242(en_tokenizer): ] doc = en_tokenizer(text) matcher = Matcher(doc.vocab) - matcher.add("FOOD", None, *patterns) + matcher.add("FOOD", patterns) matches = [(ent_type, start, end) for ent_type, start, end in matcher(doc)] match1, match2 = matches assert match1[1] == 3 @@ -127,17 +127,13 @@ def test_issue587(en_tokenizer): """Test that Matcher doesn't segfault on particular input""" doc = en_tokenizer("a b; c") matcher = Matcher(doc.vocab) - matcher.add("TEST1", None, [{ORTH: "a"}, {ORTH: "b"}]) + matcher.add("TEST1", [[{ORTH: "a"}, {ORTH: "b"}]]) matches = matcher(doc) assert len(matches) == 1 - matcher.add( - "TEST2", None, [{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "c"}] - ) + matcher.add("TEST2", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "c"}]]) matches = matcher(doc) assert len(matches) == 2 - matcher.add( - "TEST3", None, [{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "d"}] - ) + matcher.add("TEST3", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "d"}]]) matches = matcher(doc) assert len(matches) == 2 @@ -145,7 +141,7 @@ def test_issue587(en_tokenizer): def test_issue588(en_vocab): matcher = Matcher(en_vocab) with pytest.raises(ValueError): - matcher.add("TEST", None, []) + matcher.add("TEST", [[]]) @pytest.mark.xfail @@ -161,11 +157,9 @@ def test_issue590(en_vocab): doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"]) matcher = Matcher(en_vocab) matcher.add( - "ab", - None, - [{"IS_ALPHA": True}, {"ORTH": ":"}, {"LIKE_NUM": True}, {"ORTH": "%"}], + "ab", [[{"IS_ALPHA": True}, {"ORTH": ":"}, {"LIKE_NUM": True}, {"ORTH": "%"}]] ) - matcher.add("ab", None, [{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}]) + matcher.add("ab", [[{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}]]) matches = matcher(doc) assert len(matches) == 2 @@ -221,7 +215,7 @@ def test_issue615(en_tokenizer): label = "Sport_Equipment" doc = en_tokenizer(text) matcher = Matcher(doc.vocab) - matcher.add(label, merge_phrases, pattern) + matcher.add(label, [pattern], on_match=merge_phrases) matcher(doc) entities = list(doc.ents) assert entities != [] @@ -339,7 +333,7 @@ def test_issue850(): vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()}) matcher = Matcher(vocab) pattern = [{"LOWER": "bob"}, {"OP": "*"}, {"LOWER": "frank"}] - matcher.add("FarAway", None, pattern) + matcher.add("FarAway", [pattern]) doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"]) match = matcher(doc) assert len(match) == 1 @@ -353,7 +347,7 @@ def test_issue850_basic(): vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()}) matcher = Matcher(vocab) pattern = [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}] - matcher.add("FarAway", None, pattern) + matcher.add("FarAway", [pattern]) doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"]) match = matcher(doc) assert len(match) == 1 diff --git a/spacy/tests/regression/test_issue1001-1500.py b/spacy/tests/regression/test_issue1001-1500.py 
index 889a5dc71..924c5aa3e 100644 --- a/spacy/tests/regression/test_issue1001-1500.py +++ b/spacy/tests/regression/test_issue1001-1500.py @@ -111,7 +111,7 @@ def test_issue1434(): hello_world = Doc(vocab, words=["Hello", "World"]) hello = Doc(vocab, words=["Hello"]) matcher = Matcher(vocab) - matcher.add("MyMatcher", None, pattern) + matcher.add("MyMatcher", [pattern]) matches = matcher(hello_world) assert matches matches = matcher(hello) @@ -133,7 +133,7 @@ def test_issue1450(string, start, end): """Test matcher works when patterns end with * operator.""" pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}] matcher = Matcher(Vocab()) - matcher.add("TSTEND", None, pattern) + matcher.add("TSTEND", [pattern]) doc = Doc(Vocab(), words=string.split()) matches = matcher(doc) if start is None or end is None: diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index a9cf070cd..e498417d1 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -224,7 +224,7 @@ def test_issue1868(): def test_issue1883(): matcher = Matcher(Vocab()) - matcher.add("pat1", None, [{"orth": "hello"}]) + matcher.add("pat1", [[{"orth": "hello"}]]) doc = Doc(matcher.vocab, words=["hello"]) assert len(matcher(doc)) == 1 new_matcher = copy.deepcopy(matcher) @@ -249,7 +249,7 @@ def test_issue1915(): def test_issue1945(): """Test regression in Matcher introduced in v2.0.6.""" matcher = Matcher(Vocab()) - matcher.add("MWE", None, [{"orth": "a"}, {"orth": "a"}]) + matcher.add("MWE", [[{"orth": "a"}, {"orth": "a"}]]) doc = Doc(matcher.vocab, words=["a", "a", "a"]) matches = matcher(doc) # we should see two overlapping matches here assert len(matches) == 2 @@ -285,7 +285,7 @@ def test_issue1971(en_vocab): {"ORTH": "!", "OP": "?"}, ] Token.set_extension("optional", default=False) - matcher.add("TEST", None, pattern) + matcher.add("TEST", [pattern]) doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"]) # We could also assert length 1 here, but this is more conclusive, because # the real problem here is that it returns a duplicate match for a match_id @@ -299,7 +299,7 @@ def test_issue_1971_2(en_vocab): pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}] pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}] # {"IN": ["EUR"]}}] doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"]) - matcher.add("TEST1", None, pattern1, pattern2) + matcher.add("TEST1", [pattern1, pattern2]) matches = matcher(doc) assert len(matches) == 2 @@ -310,8 +310,8 @@ def test_issue_1971_3(en_vocab): Token.set_extension("b", default=2, force=True) doc = Doc(en_vocab, words=["hello", "world"]) matcher = Matcher(en_vocab) - matcher.add("A", None, [{"_": {"a": 1}}]) - matcher.add("B", None, [{"_": {"b": 2}}]) + matcher.add("A", [[{"_": {"a": 1}}]]) + matcher.add("B", [[{"_": {"b": 2}}]]) matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc)) assert len(matches) == 4 assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)]) @@ -326,7 +326,7 @@ def test_issue_1971_4(en_vocab): matcher = Matcher(en_vocab) doc = Doc(en_vocab, words=["this", "is", "text"]) pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3 - matcher.add("TEST", None, pattern) + matcher.add("TEST", [pattern]) matches = matcher(doc) # Uncommenting this caused a segmentation fault assert len(matches) == 1 diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index 
4292c8d23..e95c1a9b9 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -128,7 +128,7 @@ def test_issue2464(en_vocab): """Test problem with successive ?. This is the same bug, so putting it here.""" matcher = Matcher(en_vocab) doc = Doc(en_vocab, words=["a", "b"]) - matcher.add("4", None, [{"OP": "?"}, {"OP": "?"}]) + matcher.add("4", [[{"OP": "?"}, {"OP": "?"}]]) matches = matcher(doc) assert len(matches) == 3 diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index e26ccbf4b..73ff7376a 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -37,7 +37,7 @@ def test_issue2569(en_tokenizer): doc = en_tokenizer("It is May 15, 1993.") doc.ents = [Span(doc, 2, 6, label=doc.vocab.strings["DATE"])] matcher = Matcher(doc.vocab) - matcher.add("RULE", None, [{"ENT_TYPE": "DATE", "OP": "+"}]) + matcher.add("RULE", [[{"ENT_TYPE": "DATE", "OP": "+"}]]) matched = [doc[start:end] for _, start, end in matcher(doc)] matched = sorted(matched, key=len, reverse=True) assert len(matched) == 10 @@ -89,7 +89,7 @@ def test_issue2671(): {"IS_PUNCT": True, "OP": "?"}, {"LOWER": "adrenaline"}, ] - matcher.add(pattern_id, None, pattern) + matcher.add(pattern_id, [pattern]) doc1 = nlp("This is a high-adrenaline situation.") doc2 = nlp("This is a high adrenaline situation.") matches1 = matcher(doc1) diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index 63836e7bd..b883ae67a 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -52,7 +52,7 @@ def test_issue3009(en_vocab): doc = get_doc(en_vocab, words=words, tags=tags) matcher = Matcher(en_vocab) for i, pattern in enumerate(patterns): - matcher.add(str(i), None, pattern) + matcher.add(str(i), [pattern]) matches = matcher(doc) assert matches @@ -116,8 +116,8 @@ def test_issue3248_1(): total number of patterns.""" nlp = English() matcher = PhraseMatcher(nlp.vocab) - matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c")) - matcher.add("TEST2", None, nlp("d")) + matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")]) + matcher.add("TEST2", [nlp("d")]) assert len(matcher) == 2 @@ -125,8 +125,8 @@ def test_issue3248_2(): """Test that the PhraseMatcher can be pickled correctly.""" nlp = English() matcher = PhraseMatcher(nlp.vocab) - matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c")) - matcher.add("TEST2", None, nlp("d")) + matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")]) + matcher.add("TEST2", [nlp("d")]) data = pickle.dumps(matcher) new_matcher = pickle.loads(data) assert len(new_matcher) == len(matcher) @@ -170,7 +170,7 @@ def test_issue3328(en_vocab): [{"LOWER": {"IN": ["hello", "how"]}}], [{"LOWER": {"IN": ["you", "doing"]}}], ] - matcher.add("TEST", None, *patterns) + matcher.add("TEST", patterns) matches = matcher(doc) assert len(matches) == 4 matched_texts = [doc[start:end].text for _, start, end in matches] @@ -183,8 +183,8 @@ def test_issue3331(en_vocab): matches, one per rule. 
""" matcher = PhraseMatcher(en_vocab) - matcher.add("A", None, Doc(en_vocab, words=["Barack", "Obama"])) - matcher.add("B", None, Doc(en_vocab, words=["Barack", "Obama"])) + matcher.add("A", [Doc(en_vocab, words=["Barack", "Obama"])]) + matcher.add("B", [Doc(en_vocab, words=["Barack", "Obama"])]) doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"]) matches = matcher(doc) assert len(matches) == 2 diff --git a/spacy/tests/regression/test_issue3549.py b/spacy/tests/regression/test_issue3549.py index 3932bf19c..587b3a857 100644 --- a/spacy/tests/regression/test_issue3549.py +++ b/spacy/tests/regression/test_issue3549.py @@ -10,6 +10,6 @@ def test_issue3549(en_vocab): """Test that match pattern validation doesn't raise on empty errors.""" matcher = Matcher(en_vocab, validate=True) pattern = [{"LOWER": "hello"}, {"LOWER": "world"}] - matcher.add("GOOD", None, pattern) + matcher.add("GOOD", [pattern]) with pytest.raises(MatchPatternError): - matcher.add("BAD", None, [{"X": "Y"}]) + matcher.add("BAD", [[{"X": "Y"}]]) diff --git a/spacy/tests/regression/test_issue3555.py b/spacy/tests/regression/test_issue3555.py index 096b33367..8444f11f2 100644 --- a/spacy/tests/regression/test_issue3555.py +++ b/spacy/tests/regression/test_issue3555.py @@ -12,6 +12,6 @@ def test_issue3555(en_vocab): Token.set_extension("issue3555", default=None) matcher = Matcher(en_vocab) pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}] - matcher.add("TEST", None, pattern) + matcher.add("TEST", [pattern]) doc = Doc(en_vocab, words=["have", "apple"]) matcher(doc) diff --git a/spacy/tests/regression/test_issue3839.py b/spacy/tests/regression/test_issue3839.py index c24c60b6d..fe722a681 100644 --- a/spacy/tests/regression/test_issue3839.py +++ b/spacy/tests/regression/test_issue3839.py @@ -12,10 +12,10 @@ def test_issue3839(en_vocab): match_id = "PATTERN" pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}] pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}] - matcher.add(match_id, None, pattern1) + matcher.add(match_id, [pattern1]) matches = matcher(doc) assert matches[0][0] == en_vocab.strings[match_id] matcher = Matcher(en_vocab) - matcher.add(match_id, None, pattern2) + matcher.add(match_id, [pattern2]) matches = matcher(doc) assert matches[0][0] == en_vocab.strings[match_id] diff --git a/spacy/tests/regression/test_issue3879.py b/spacy/tests/regression/test_issue3879.py index 123e9fce3..5cd245231 100644 --- a/spacy/tests/regression/test_issue3879.py +++ b/spacy/tests/regression/test_issue3879.py @@ -10,5 +10,5 @@ def test_issue3879(en_vocab): assert len(doc) == 5 pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}] matcher = Matcher(en_vocab) - matcher.add("TEST", None, pattern) + matcher.add("TEST", [pattern]) assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test' diff --git a/spacy/tests/regression/test_issue3951.py b/spacy/tests/regression/test_issue3951.py index e07ffd36e..33230112f 100644 --- a/spacy/tests/regression/test_issue3951.py +++ b/spacy/tests/regression/test_issue3951.py @@ -14,7 +14,7 @@ def test_issue3951(en_vocab): {"OP": "?"}, {"LOWER": "world"}, ] - matcher.add("TEST", None, pattern) + matcher.add("TEST", [pattern]) doc = Doc(en_vocab, words=["Hello", "my", "new", "world"]) matches = matcher(doc) assert len(matches) == 0 diff --git a/spacy/tests/regression/test_issue3972.py b/spacy/tests/regression/test_issue3972.py index a7f76e4d7..22b8d486e 100644 --- a/spacy/tests/regression/test_issue3972.py +++ 
b/spacy/tests/regression/test_issue3972.py @@ -9,8 +9,8 @@ def test_issue3972(en_vocab): """Test that the PhraseMatcher returns duplicates for duplicate match IDs. """ matcher = PhraseMatcher(en_vocab) - matcher.add("A", None, Doc(en_vocab, words=["New", "York"])) - matcher.add("B", None, Doc(en_vocab, words=["New", "York"])) + matcher.add("A", [Doc(en_vocab, words=["New", "York"])]) + matcher.add("B", [Doc(en_vocab, words=["New", "York"])]) doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"]) matches = matcher(doc) diff --git a/spacy/tests/regression/test_issue4002.py b/spacy/tests/regression/test_issue4002.py index 37e054b3e..d075128aa 100644 --- a/spacy/tests/regression/test_issue4002.py +++ b/spacy/tests/regression/test_issue4002.py @@ -11,7 +11,7 @@ def test_issue4002(en_vocab): matcher = PhraseMatcher(en_vocab, attr="NORM") pattern1 = Doc(en_vocab, words=["c", "d"]) assert [t.norm_ for t in pattern1] == ["c", "d"] - matcher.add("TEST", None, pattern1) + matcher.add("TEST", [pattern1]) doc = Doc(en_vocab, words=["a", "b", "c", "d"]) assert [t.norm_ for t in doc] == ["a", "b", "c", "d"] matches = matcher(doc) @@ -21,6 +21,6 @@ def test_issue4002(en_vocab): pattern2[0].norm_ = "c" pattern2[1].norm_ = "d" assert [t.norm_ for t in pattern2] == ["c", "d"] - matcher.add("TEST", None, pattern2) + matcher.add("TEST", [pattern2]) matches = matcher(doc) assert len(matches) == 1 diff --git a/spacy/tests/regression/test_issue4120.py b/spacy/tests/regression/test_issue4120.py index 2ce5aec6a..d288f46c4 100644 --- a/spacy/tests/regression/test_issue4120.py +++ b/spacy/tests/regression/test_issue4120.py @@ -8,7 +8,7 @@ from spacy.tokens import Doc def test_issue4120(en_vocab): """Test that matches without a final {OP: ?} token are returned.""" matcher = Matcher(en_vocab) - matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}]) + matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]]) doc1 = Doc(en_vocab, words=["a"]) assert len(matcher(doc1)) == 1 # works @@ -16,11 +16,11 @@ def test_issue4120(en_vocab): assert len(matcher(doc2)) == 2 # fixed matcher = Matcher(en_vocab) - matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]) + matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]]) doc3 = Doc(en_vocab, words=["a", "b", "b", "c"]) assert len(matcher(doc3)) == 2 # works matcher = Matcher(en_vocab) - matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]) + matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]]) doc4 = Doc(en_vocab, words=["a", "b", "b", "c"]) assert len(matcher(doc4)) == 3 # fixed diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index 84d9ed888..bfd4fb0ec 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -157,16 +157,19 @@ overwritten. | `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | | `*patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. | - + -As of spaCy 2.0, `Matcher.add_pattern` and `Matcher.add_entity` are deprecated -and have been replaced with a simpler [`Matcher.add`](/api/matcher#add) that -lets you add a list of patterns and a callback for a given match ID. +As of spaCy 2.2.2, `Matcher.add` also supports the new API, which will become +the default in the future. The patterns are now the second argument and a list +(instead of a variable number of arguments). 
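To make the documented new-style call concrete, here is a minimal runnable
sketch (the key, patterns, and callback are illustrative, not part of the
patch):

```python
from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()
matcher = Matcher(nlp.vocab)

def on_match(matcher, doc, i, matches):
    # Standard callback signature: receives the matcher, the doc, the index
    # of the current match and the list of (match_id, start, end) tuples
    match_id, start, end = matches[i]
    print("Matched:", doc[start:end].text)

patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
matcher.add("GoogleNow", patterns, on_match=on_match)
matcher(nlp("I use Google Now and GoogleNow"))
```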
diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md
index 9d95522ac..c7311a401 100644
--- a/website/docs/api/phrasematcher.md
+++ b/website/docs/api/phrasematcher.md
@@ -153,6 +153,23 @@ overwritten.
 | `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
 | `*docs`    | `Doc`              | `Doc` objects of the phrases to match. |
 
+<Infobox title="Changed in v2.2.2" variant="warning">
+
+As of spaCy 2.2.2, `PhraseMatcher.add` also supports the new API, which will
+become the default in the future. The `Doc` patterns are now supplied as a
+single list, passed as the second argument (instead of a variable number of
+arguments). The `on_match` callback becomes an optional keyword argument.
+
+```diff
+patterns = [nlp("health care reform"), nlp("healthcare reform")]
+- matcher.add("HEALTH", None, *patterns)
++ matcher.add("HEALTH", patterns)
+- matcher.add("HEALTH", on_match, *patterns)
++ matcher.add("HEALTH", patterns, on_match=on_match)
+```
+
+</Infobox>
+
 ## PhraseMatcher.remove {#remove tag="method" new="2.2"}
 
 Remove a rule from the matcher by match ID. A `KeyError` is raised if the key