Implement new API for {Phrase}Matcher.add (backwards-compatible) (#4522)

* Implement new API for {Phrase}Matcher.add (backwards-compatible)

* Update docs

* Also update DependencyMatcher.add

* Update internals

* Rewrite tests to use new API

* Add basic check for common mistake

Raise error with suggestion if user likely passed in a pattern instead of a list of patterns

* Fix typo [ci skip]
Ines Montani 2019-10-25 22:21:08 +02:00 committed by Matthew Honnibal
parent d2da117114
commit cfffdba7b1
25 changed files with 250 additions and 143 deletions
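The change keeps the old variadic signature working while introducing the list-based signature that will become the default. An illustrative sketch of both call styles (the `GREETING` key and patterns here are made up for the example):

```python
from spacy.matcher import Matcher
from spacy.vocab import Vocab

vocab = Vocab()
patterns = [[{"LOWER": "hello"}], [{"LOWER": "hello"}, {"LOWER": "world"}]]

# Old API (still supported): on_match second, patterns as variable arguments
matcher = Matcher(vocab)
matcher.add("GREETING", None, *patterns)

# New API: patterns as a single list, on_match as an optional keyword argument
matcher = Matcher(vocab)
matcher.add("GREETING", patterns)
```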

View File

@@ -504,6 +504,13 @@ class Errors(object):
     E175 = ("Can't remove rule for unknown match pattern ID: {key}")
     E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
     E177 = ("Ill-formed IOB input detected: {tag}")
+    E178 = ("Invalid pattern. Expected list of dicts but got: {pat}. Maybe you "
+            "accidentally passed a single pattern to Matcher.add instead of a "
+            "list of patterns? If you only want to add one pattern, make sure "
+            "to wrap it in a list. For example: matcher.add('{key}', [pattern])")
+    E179 = ("Invalid pattern. Expected a list of Doc objects but got a single "
+            "Doc. If you only want to add one pattern, make sure to wrap it "
+            "in a list. For example: matcher.add('{key}', [doc])")

 @add_codes
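E178 fires when a single token pattern is passed where a list of patterns is expected; E179 is the `PhraseMatcher` counterpart for a single `Doc`. A sketch of the mistake the message targets (hypothetical key and pattern, mirroring the new tests below):

```python
from spacy.matcher import Matcher
from spacy.vocab import Vocab

matcher = Matcher(Vocab())
pattern = [{"TEXT": "hello"}, {"TEXT": "world"}]
try:
    matcher.add("TEST", pattern)  # a single pattern, not a list of patterns
except ValueError as err:
    print(err)  # E178, suggesting: matcher.add('TEST', [pattern])
matcher.add("TEST", [pattern])  # correct: wrap the single pattern in a list
```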

View File

@ -102,7 +102,10 @@ cdef class DependencyMatcher:
visitedNodes[relation["SPEC"]["NBOR_NAME"]] = True visitedNodes[relation["SPEC"]["NBOR_NAME"]] = True
idx = idx + 1 idx = idx + 1
def add(self, key, on_match, *patterns): def add(self, key, patterns, *_patterns, on_match=None):
if patterns is None or hasattr(patterns, "__call__"): # old API
on_match = patterns
patterns = _patterns
for pattern in patterns: for pattern in patterns:
if len(pattern) == 0: if len(pattern) == 0:
raise ValueError(Errors.E012.format(key=key)) raise ValueError(Errors.E012.format(key=key))
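The two signatures can be told apart from the second argument alone: under the old API it is always `None` or a callable, never a list of patterns. A standalone sketch of this dispatch (hypothetical helper, not part of the commit):

```python
def split_add_args(second_arg, variadic, on_match=None):
    # Mimics the old-API detection used in the add() methods in this commit
    if second_arg is None or hasattr(second_arg, "__call__"):
        # Old API: second argument was the callback, patterns came as *args
        return list(variadic), second_arg
    # New API: second argument is already the list of patterns
    return second_arg, on_match

# Old-style call add("KEY", None, pattern) -> ([pattern], None)
assert split_add_args(None, ([{"TEXT": "a"}],)) == ([[{"TEXT": "a"}]], None)
# New-style call add("KEY", [pattern]) -> ([pattern], None)
assert split_add_args([[{"TEXT": "a"}]], ()) == ([[{"TEXT": "a"}]], None)
```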

View File

@@ -74,7 +74,7 @@ cdef class Matcher:
         """
         return self._normalize_key(key) in self._patterns

-    def add(self, key, on_match, *patterns):
+    def add(self, key, patterns, *_patterns, on_match=None):
         """Add a match-rule to the matcher. A match-rule consists of: an ID
         key, an on_match callback, and one or more patterns.

@@ -98,16 +98,29 @@ cdef class Matcher:
         operator will behave non-greedily. This quirk in the semantics makes
         the matcher more efficient, by avoiding the need for back-tracking.

+        As of spaCy v2.2.2, Matcher.add supports the future API, which makes
+        the patterns the second argument and a list (instead of a variable
+        number of arguments). The on_match callback becomes an optional keyword
+        argument.
+
         key (unicode): The match ID.
-        on_match (callable): Callback executed on match.
-        *patterns (list): List of token descriptions.
+        patterns (list): The patterns to add for the given key.
+        on_match (callable): Optional callback executed on match.
+        *_patterns (list): For backwards compatibility: list of patterns to add
+            as variable arguments. Will be ignored if a list of patterns is
+            provided as the second argument.
         """
         errors = {}
         if on_match is not None and not hasattr(on_match, "__call__"):
             raise ValueError(Errors.E171.format(arg_type=type(on_match)))
+        if patterns is None or hasattr(patterns, "__call__"):  # old API
+            on_match = patterns
+            patterns = _patterns
         for i, pattern in enumerate(patterns):
             if len(pattern) == 0:
                 raise ValueError(Errors.E012.format(key=key))
+            if not isinstance(pattern, list):
+                raise ValueError(Errors.E178.format(pat=pattern, key=key))
             if self.validator:
                 errors[i] = validate_json(pattern, self.validator)
         if any(err for err in errors.values()):
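Under the new signature, a complete rule with a callback looks like this (a sketch mirroring the updated tests below):

```python
from spacy.matcher import Matcher
from spacy.tokens import Doc
from spacy.vocab import Vocab

def on_match(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    print("Matched:", doc[start:end].text)

vocab = Vocab()
matcher = Matcher(vocab)
matcher.add("AB", [[{"TEXT": "a"}], [{"TEXT": "a"}, {"TEXT": "b"}]], on_match=on_match)
doc = Doc(vocab, words=["a", "b"])
matches = matcher(doc)  # two matches; the callback fires once per match
```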

View File

@@ -152,16 +152,27 @@ cdef class PhraseMatcher:
         del self._callbacks[key]
         del self._docs[key]

-    def add(self, key, on_match, *docs):
+    def add(self, key, docs, *_docs, on_match=None):
         """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
         key, an on_match callback, and one or more patterns.

+        As of spaCy v2.2.2, PhraseMatcher.add supports the future API, which
+        makes the patterns the second argument and a list (instead of a variable
+        number of arguments). The on_match callback becomes an optional keyword
+        argument.
+
         key (unicode): The match ID.
+        docs (list): List of `Doc` objects representing match patterns.
         on_match (callable): Callback executed on match.
-        *docs (Doc): `Doc` objects representing match patterns.
+        *_docs (Doc): For backwards compatibility: list of patterns to add
+            as variable arguments. Will be ignored if a list of patterns is
+            provided as the second argument.

         DOCS: https://spacy.io/api/phrasematcher#add
         """
+        if docs is None or hasattr(docs, "__call__"):  # old API
+            on_match = docs
+            docs = _docs
         _ = self.vocab[key]
         self._callbacks[key] = on_match

@@ -171,6 +182,8 @@ cdef class PhraseMatcher:
         cdef MapStruct* internal_node
         cdef void* result

+        if isinstance(docs, Doc):
+            raise ValueError(Errors.E179.format(key=key))
         for doc in docs:
             if len(doc) == 0:
                 continue
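The same applies to phrase rules, except the patterns are `Doc` objects (sketch):

```python
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
matcher = PhraseMatcher(vocab)
patterns = [Doc(vocab, words=["Google", "Now"]), Doc(vocab, words=["Google"])]
matcher.add("COMPANY", patterns)  # new API: one list of Doc patterns

doc = Doc(vocab, words=["I", "like", "Google", "Now"])
assert len(matcher(doc)) == 2  # "Google" and "Google Now" both match
```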

View File

@@ -204,9 +204,9 @@ class EntityRuler(object):
         else:
             raise ValueError(Errors.E097.format(pattern=pattern))
         for label, patterns in self.token_patterns.items():
-            self.matcher.add(label, None, *patterns)
+            self.matcher.add(label, patterns)
         for label, patterns in self.phrase_patterns.items():
-            self.phrase_matcher.add(label, None, *patterns)
+            self.phrase_matcher.add(label, patterns)

     def _split_label(self, label):
         """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep

View File

@@ -17,7 +17,7 @@ def matcher(en_vocab):
     }
     matcher = Matcher(en_vocab)
     for key, patterns in rules.items():
-        matcher.add(key, None, *patterns)
+        matcher.add(key, patterns)
     return matcher

@@ -25,11 +25,11 @@ def test_matcher_from_api_docs(en_vocab):
     matcher = Matcher(en_vocab)
     pattern = [{"ORTH": "test"}]
     assert len(matcher) == 0
-    matcher.add("Rule", None, pattern)
+    matcher.add("Rule", [pattern])
     assert len(matcher) == 1
     matcher.remove("Rule")
     assert "Rule" not in matcher
-    matcher.add("Rule", None, pattern)
+    matcher.add("Rule", [pattern])
     assert "Rule" in matcher
     on_match, patterns = matcher.get("Rule")
     assert len(patterns[0])

@@ -52,7 +52,7 @@ def test_matcher_from_usage_docs(en_vocab):
             token.vocab[token.text].norm_ = "happy emoji"

     matcher = Matcher(en_vocab)
-    matcher.add("HAPPY", label_sentiment, *pos_patterns)
+    matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)
     matcher(doc)
     assert doc.sentiment != 0
     assert doc[1].norm_ == "happy emoji"

@@ -60,11 +60,33 @@ def test_matcher_from_usage_docs(en_vocab):

 def test_matcher_len_contains(matcher):
     assert len(matcher) == 3
-    matcher.add("TEST", None, [{"ORTH": "test"}])
+    matcher.add("TEST", [[{"ORTH": "test"}]])
     assert "TEST" in matcher
     assert "TEST2" not in matcher

+def test_matcher_add_new_old_api(en_vocab):
+    doc = Doc(en_vocab, words=["a", "b"])
+    patterns = [[{"TEXT": "a"}], [{"TEXT": "a"}, {"TEXT": "b"}]]
+    matcher = Matcher(en_vocab)
+    matcher.add("OLD_API", None, *patterns)
+    assert len(matcher(doc)) == 2
+    matcher = Matcher(en_vocab)
+    on_match = Mock()
+    matcher.add("OLD_API_CALLBACK", on_match, *patterns)
+    assert len(matcher(doc)) == 2
+    assert on_match.call_count == 2
+    # New API: add(key: str, patterns: List[List[dict]], on_match: Callable)
+    matcher = Matcher(en_vocab)
+    matcher.add("NEW_API", patterns)
+    assert len(matcher(doc)) == 2
+    matcher = Matcher(en_vocab)
+    on_match = Mock()
+    matcher.add("NEW_API_CALLBACK", patterns, on_match=on_match)
+    assert len(matcher(doc)) == 2
+    assert on_match.call_count == 2
+
 def test_matcher_no_match(matcher):
     doc = Doc(matcher.vocab, words=["I", "like", "cheese", "."])
     assert matcher(doc) == []

@@ -100,12 +122,12 @@ def test_matcher_empty_dict(en_vocab):
     """Test matcher allows empty token specs, meaning match on any token."""
     matcher = Matcher(en_vocab)
     doc = Doc(matcher.vocab, words=["a", "b", "c"])
-    matcher.add("A.C", None, [{"ORTH": "a"}, {}, {"ORTH": "c"}])
+    matcher.add("A.C", [[{"ORTH": "a"}, {}, {"ORTH": "c"}]])
     matches = matcher(doc)
     assert len(matches) == 1
     assert matches[0][1:] == (0, 3)
     matcher = Matcher(en_vocab)
-    matcher.add("A.", None, [{"ORTH": "a"}, {}])
+    matcher.add("A.", [[{"ORTH": "a"}, {}]])
     matches = matcher(doc)
     assert matches[0][1:] == (0, 2)

@@ -114,7 +136,7 @@ def test_matcher_operator_shadow(en_vocab):
     matcher = Matcher(en_vocab)
     doc = Doc(matcher.vocab, words=["a", "b", "c"])
     pattern = [{"ORTH": "a"}, {"IS_ALPHA": True, "OP": "+"}, {"ORTH": "c"}]
-    matcher.add("A.C", None, pattern)
+    matcher.add("A.C", [pattern])
     matches = matcher(doc)
     assert len(matches) == 1
     assert matches[0][1:] == (0, 3)

@@ -136,12 +158,12 @@ def test_matcher_match_zero(matcher):
         {"IS_PUNCT": True},
         {"ORTH": '"'},
     ]
-    matcher.add("Quote", None, pattern1)
+    matcher.add("Quote", [pattern1])
     doc = Doc(matcher.vocab, words=words1)
     assert len(matcher(doc)) == 1
     doc = Doc(matcher.vocab, words=words2)
     assert len(matcher(doc)) == 0
-    matcher.add("Quote", None, pattern2)
+    matcher.add("Quote", [pattern2])
     assert len(matcher(doc)) == 0

@@ -149,7 +171,7 @@ def test_matcher_match_zero_plus(matcher):
     words = 'He said , " some words " ...'.split()
     pattern = [{"ORTH": '"'}, {"OP": "*", "IS_PUNCT": False}, {"ORTH": '"'}]
     matcher = Matcher(matcher.vocab)
-    matcher.add("Quote", None, pattern)
+    matcher.add("Quote", [pattern])
     doc = Doc(matcher.vocab, words=words)
     assert len(matcher(doc)) == 1

@@ -160,11 +182,8 @@ def test_matcher_match_one_plus(matcher):
     doc = Doc(control.vocab, words=["Philippe", "Philippe"])
     m = control(doc)
     assert len(m) == 2
-    matcher.add(
-        "KleenePhilippe",
-        None,
-        [{"ORTH": "Philippe", "OP": "1"}, {"ORTH": "Philippe", "OP": "+"}],
-    )
+    pattern = [{"ORTH": "Philippe", "OP": "1"}, {"ORTH": "Philippe", "OP": "+"}]
+    matcher.add("KleenePhilippe", [pattern])
     m = matcher(doc)
     assert len(m) == 1

@@ -172,7 +191,7 @@ def test_matcher_match_one_plus(matcher):

 def test_matcher_any_token_operator(en_vocab):
     """Test that patterns with "any token" {} work with operators."""
     matcher = Matcher(en_vocab)
-    matcher.add("TEST", None, [{"ORTH": "test"}, {"OP": "*"}])
+    matcher.add("TEST", [[{"ORTH": "test"}, {"OP": "*"}]])
     doc = Doc(en_vocab, words=["test", "hello", "world"])
     matches = [doc[start:end].text for _, start, end in matcher(doc)]
     assert len(matches) == 3

@@ -186,7 +205,7 @@ def test_matcher_extension_attribute(en_vocab):
     get_is_fruit = lambda token: token.text in ("apple", "banana")
     Token.set_extension("is_fruit", getter=get_is_fruit, force=True)
     pattern = [{"ORTH": "an"}, {"_": {"is_fruit": True}}]
-    matcher.add("HAVING_FRUIT", None, pattern)
+    matcher.add("HAVING_FRUIT", [pattern])
     doc = Doc(en_vocab, words=["an", "apple"])
     matches = matcher(doc)
     assert len(matches) == 1

@@ -198,7 +217,7 @@ def test_matcher_extension_attribute(en_vocab):

 def test_matcher_set_value(en_vocab):
     matcher = Matcher(en_vocab)
     pattern = [{"ORTH": {"IN": ["an", "a"]}}]
-    matcher.add("A_OR_AN", None, pattern)
+    matcher.add("A_OR_AN", [pattern])
     doc = Doc(en_vocab, words=["an", "a", "apple"])
     matches = matcher(doc)
     assert len(matches) == 2

@@ -210,7 +229,7 @@ def test_matcher_set_value(en_vocab):

 def test_matcher_set_value_operator(en_vocab):
     matcher = Matcher(en_vocab)
     pattern = [{"ORTH": {"IN": ["a", "the"]}, "OP": "?"}, {"ORTH": "house"}]
-    matcher.add("DET_HOUSE", None, pattern)
+    matcher.add("DET_HOUSE", [pattern])
     doc = Doc(en_vocab, words=["In", "a", "house"])
     matches = matcher(doc)
     assert len(matches) == 2

@@ -222,7 +241,7 @@ def test_matcher_set_value_operator(en_vocab):

 def test_matcher_regex(en_vocab):
     matcher = Matcher(en_vocab)
     pattern = [{"ORTH": {"REGEX": r"(?:a|an)"}}]
-    matcher.add("A_OR_AN", None, pattern)
+    matcher.add("A_OR_AN", [pattern])
     doc = Doc(en_vocab, words=["an", "a", "hi"])
     matches = matcher(doc)
     assert len(matches) == 2

@@ -234,7 +253,7 @@ def test_matcher_regex(en_vocab):

 def test_matcher_regex_shape(en_vocab):
     matcher = Matcher(en_vocab)
     pattern = [{"SHAPE": {"REGEX": r"^[^x]+$"}}]
-    matcher.add("NON_ALPHA", None, pattern)
+    matcher.add("NON_ALPHA", [pattern])
     doc = Doc(en_vocab, words=["99", "problems", "!"])
     matches = matcher(doc)
     assert len(matches) == 2

@@ -246,7 +265,7 @@ def test_matcher_regex_shape(en_vocab):

 def test_matcher_compare_length(en_vocab):
     matcher = Matcher(en_vocab)
     pattern = [{"LENGTH": {">=": 2}}]
-    matcher.add("LENGTH_COMPARE", None, pattern)
+    matcher.add("LENGTH_COMPARE", [pattern])
     doc = Doc(en_vocab, words=["a", "aa", "aaa"])
     matches = matcher(doc)
     assert len(matches) == 2

@@ -260,7 +279,7 @@ def test_matcher_extension_set_membership(en_vocab):
     get_reversed = lambda token: "".join(reversed(token.text))
     Token.set_extension("reversed", getter=get_reversed, force=True)
     pattern = [{"_": {"reversed": {"IN": ["eyb", "ih"]}}}]
-    matcher.add("REVERSED", None, pattern)
+    matcher.add("REVERSED", [pattern])
     doc = Doc(en_vocab, words=["hi", "bye", "hello"])
     matches = matcher(doc)
     assert len(matches) == 2

@@ -328,9 +347,9 @@ def dependency_matcher(en_vocab):
     ]

     matcher = DependencyMatcher(en_vocab)
-    matcher.add("pattern1", None, pattern1)
-    matcher.add("pattern2", None, pattern2)
-    matcher.add("pattern3", None, pattern3)
+    matcher.add("pattern1", [pattern1])
+    matcher.add("pattern2", [pattern2])
+    matcher.add("pattern3", [pattern3])

     return matcher

@@ -347,6 +366,14 @@ def test_dependency_matcher_compile(dependency_matcher):
     # assert matches[2][1] == [[4, 3, 2]]

+def test_matcher_basic_check(en_vocab):
+    matcher = Matcher(en_vocab)
+    # Potential mistake: pass in pattern instead of list of patterns
+    pattern = [{"TEXT": "hello"}, {"TEXT": "world"}]
+    with pytest.raises(ValueError):
+        matcher.add("TEST", pattern)
+
 def test_attr_pipeline_checks(en_vocab):
     doc1 = Doc(en_vocab, words=["Test"])
     doc1.is_parsed = True

@@ -355,7 +382,7 @@ def test_attr_pipeline_checks(en_vocab):
     doc3 = Doc(en_vocab, words=["Test"])
     # DEP requires is_parsed
     matcher = Matcher(en_vocab)
-    matcher.add("TEST", None, [{"DEP": "a"}])
+    matcher.add("TEST", [[{"DEP": "a"}]])
     matcher(doc1)
     with pytest.raises(ValueError):
         matcher(doc2)

@@ -364,7 +391,7 @@ def test_attr_pipeline_checks(en_vocab):
     # TAG, POS, LEMMA require is_tagged
     for attr in ("TAG", "POS", "LEMMA"):
         matcher = Matcher(en_vocab)
-        matcher.add("TEST", None, [{attr: "a"}])
+        matcher.add("TEST", [[{attr: "a"}]])
         matcher(doc2)
         with pytest.raises(ValueError):
             matcher(doc1)

@@ -372,12 +399,12 @@ def test_attr_pipeline_checks(en_vocab):
         matcher(doc3)
     # TEXT/ORTH only require tokens
     matcher = Matcher(en_vocab)
-    matcher.add("TEST", None, [{"ORTH": "a"}])
+    matcher.add("TEST", [[{"ORTH": "a"}]])
     matcher(doc1)
     matcher(doc2)
     matcher(doc3)
     matcher = Matcher(en_vocab)
-    matcher.add("TEST", None, [{"TEXT": "a"}])
+    matcher.add("TEST", [[{"TEXT": "a"}]])
     matcher(doc1)
     matcher(doc2)
     matcher(doc3)

@@ -407,7 +434,7 @@ def test_attr_pipeline_checks(en_vocab):
 def test_matcher_schema_token_attributes(en_vocab, pattern, text):
     matcher = Matcher(en_vocab)
     doc = Doc(en_vocab, words=text.split(" "))
-    matcher.add("Rule", None, pattern)
+    matcher.add("Rule", [pattern])
     assert len(matcher) == 1
     matches = matcher(doc)
     assert len(matches) == 1

@@ -417,7 +444,7 @@ def test_matcher_valid_callback(en_vocab):
     """Test that on_match can only be None or callable."""
     matcher = Matcher(en_vocab)
     with pytest.raises(ValueError):
-        matcher.add("TEST", [], [{"TEXT": "test"}])
+        matcher.add("TEST", [[{"TEXT": "test"}]], on_match=[])
     matcher(Doc(en_vocab, words=["test"]))

@@ -425,7 +452,7 @@ def test_matcher_callback(en_vocab):
     mock = Mock()
     matcher = Matcher(en_vocab)
     pattern = [{"ORTH": "test"}]
-    matcher.add("Rule", mock, pattern)
+    matcher.add("Rule", [pattern], on_match=mock)
     doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
     matches = matcher(doc)
     mock.assert_called_once_with(matcher, doc, 0, matches)

View File

@@ -55,7 +55,7 @@ def test_greedy_matching(doc, text, pattern, re_pattern):
     """Test that the greedy matching behavior of the * op is consistant with
     other re implementations."""
     matcher = Matcher(doc.vocab)
-    matcher.add(re_pattern, None, pattern)
+    matcher.add(re_pattern, [pattern])
     matches = matcher(doc)
     re_matches = [m.span() for m in re.finditer(re_pattern, text)]
     for match, re_match in zip(matches, re_matches):

@@ -77,7 +77,7 @@ def test_match_consuming(doc, text, pattern, re_pattern):
     """Test that matcher.__call__ consumes tokens on a match similar to
     re.findall."""
     matcher = Matcher(doc.vocab)
-    matcher.add(re_pattern, None, pattern)
+    matcher.add(re_pattern, [pattern])
     matches = matcher(doc)
     re_matches = [m.span() for m in re.finditer(re_pattern, text)]
     assert len(matches) == len(re_matches)

@@ -111,7 +111,7 @@ def test_operator_combos(en_vocab):
                 pattern.append({"ORTH": part[0], "OP": "+"})
             else:
                 pattern.append({"ORTH": part})
-        matcher.add("PATTERN", None, pattern)
+        matcher.add("PATTERN", [pattern])
         matches = matcher(doc)
         if result:
             assert matches, (string, pattern_str)

@@ -123,7 +123,7 @@ def test_matcher_end_zero_plus(en_vocab):
     """Test matcher works when patterns end with * operator. (issue 1450)"""
     matcher = Matcher(en_vocab)
     pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
-    matcher.add("TSTEND", None, pattern)
+    matcher.add("TSTEND", [pattern])
     nlp = lambda string: Doc(matcher.vocab, words=string.split())
     assert len(matcher(nlp("a"))) == 1
     assert len(matcher(nlp("a b"))) == 2

@@ -140,7 +140,7 @@ def test_matcher_sets_return_correct_tokens(en_vocab):
         [{"LOWER": {"IN": ["one"]}}],
         [{"LOWER": {"IN": ["two"]}}],
     ]
-    matcher.add("TEST", None, *patterns)
+    matcher.add("TEST", patterns)
     doc = Doc(en_vocab, words="zero one two three".split())
     matches = matcher(doc)
     texts = [Span(doc, s, e, label=L).text for L, s, e in matches]

@@ -154,7 +154,7 @@ def test_matcher_remove():
     pattern = [{"ORTH": "test"}, {"OP": "?"}]
     assert len(matcher) == 0
-    matcher.add("Rule", None, pattern)
+    matcher.add("Rule", [pattern])
     assert "Rule" in matcher
     # should give two matches

View File

@@ -50,7 +50,7 @@ def validator():
 def test_matcher_pattern_validation(en_vocab, pattern):
     matcher = Matcher(en_vocab, validate=True)
     with pytest.raises(MatchPatternError):
-        matcher.add("TEST", None, pattern)
+        matcher.add("TEST", [pattern])

 @pytest.mark.parametrize("pattern,n_errors,_", TEST_PATTERNS)
@@ -71,6 +71,6 @@ def test_minimal_pattern_validation(en_vocab, pattern, n_errors, n_min_errors):
     matcher = Matcher(en_vocab)
     if n_min_errors > 0:
         with pytest.raises(ValueError):
-            matcher.add("TEST", None, pattern)
+            matcher.add("TEST", [pattern])
     elif n_errors == 0:
-        matcher.add("TEST", None, pattern)
+        matcher.add("TEST", [pattern])

View File

@@ -13,53 +13,75 @@ def test_matcher_phrase_matcher(en_vocab):
     # intermediate phrase
     pattern = Doc(en_vocab, words=["Google", "Now"])
     matcher = PhraseMatcher(en_vocab)
-    matcher.add("COMPANY", None, pattern)
+    matcher.add("COMPANY", [pattern])
     assert len(matcher(doc)) == 1
     # initial token
     pattern = Doc(en_vocab, words=["I"])
     matcher = PhraseMatcher(en_vocab)
-    matcher.add("I", None, pattern)
+    matcher.add("I", [pattern])
     assert len(matcher(doc)) == 1
     # initial phrase
     pattern = Doc(en_vocab, words=["I", "like"])
     matcher = PhraseMatcher(en_vocab)
-    matcher.add("ILIKE", None, pattern)
+    matcher.add("ILIKE", [pattern])
     assert len(matcher(doc)) == 1
     # final token
     pattern = Doc(en_vocab, words=["best"])
     matcher = PhraseMatcher(en_vocab)
-    matcher.add("BEST", None, pattern)
+    matcher.add("BEST", [pattern])
     assert len(matcher(doc)) == 1
     # final phrase
     pattern = Doc(en_vocab, words=["Now", "best"])
     matcher = PhraseMatcher(en_vocab)
-    matcher.add("NOWBEST", None, pattern)
+    matcher.add("NOWBEST", [pattern])
     assert len(matcher(doc)) == 1

 def test_phrase_matcher_length(en_vocab):
     matcher = PhraseMatcher(en_vocab)
     assert len(matcher) == 0
-    matcher.add("TEST", None, Doc(en_vocab, words=["test"]))
+    matcher.add("TEST", [Doc(en_vocab, words=["test"])])
     assert len(matcher) == 1
-    matcher.add("TEST2", None, Doc(en_vocab, words=["test2"]))
+    matcher.add("TEST2", [Doc(en_vocab, words=["test2"])])
     assert len(matcher) == 2

 def test_phrase_matcher_contains(en_vocab):
     matcher = PhraseMatcher(en_vocab)
-    matcher.add("TEST", None, Doc(en_vocab, words=["test"]))
+    matcher.add("TEST", [Doc(en_vocab, words=["test"])])
     assert "TEST" in matcher
     assert "TEST2" not in matcher

+def test_phrase_matcher_add_new_api(en_vocab):
+    doc = Doc(en_vocab, words=["a", "b"])
+    patterns = [Doc(en_vocab, words=["a"]), Doc(en_vocab, words=["a", "b"])]
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("OLD_API", None, *patterns)
+    assert len(matcher(doc)) == 2
+    matcher = PhraseMatcher(en_vocab)
+    on_match = Mock()
+    matcher.add("OLD_API_CALLBACK", on_match, *patterns)
+    assert len(matcher(doc)) == 2
+    assert on_match.call_count == 2
+    # New API: add(key: str, patterns: List[List[dict]], on_match: Callable)
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("NEW_API", patterns)
+    assert len(matcher(doc)) == 2
+    matcher = PhraseMatcher(en_vocab)
+    on_match = Mock()
+    matcher.add("NEW_API_CALLBACK", patterns, on_match=on_match)
+    assert len(matcher(doc)) == 2
+    assert on_match.call_count == 2
+
 def test_phrase_matcher_repeated_add(en_vocab):
     matcher = PhraseMatcher(en_vocab)
     # match ID only gets added once
-    matcher.add("TEST", None, Doc(en_vocab, words=["like"]))
-    matcher.add("TEST", None, Doc(en_vocab, words=["like"]))
-    matcher.add("TEST", None, Doc(en_vocab, words=["like"]))
-    matcher.add("TEST", None, Doc(en_vocab, words=["like"]))
+    matcher.add("TEST", [Doc(en_vocab, words=["like"])])
+    matcher.add("TEST", [Doc(en_vocab, words=["like"])])
+    matcher.add("TEST", [Doc(en_vocab, words=["like"])])
+    matcher.add("TEST", [Doc(en_vocab, words=["like"])])
     doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
     assert "TEST" in matcher
     assert "TEST2" not in matcher

@@ -68,8 +90,8 @@ def test_phrase_matcher_repeated_add(en_vocab):

 def test_phrase_matcher_remove(en_vocab):
     matcher = PhraseMatcher(en_vocab)
-    matcher.add("TEST1", None, Doc(en_vocab, words=["like"]))
-    matcher.add("TEST2", None, Doc(en_vocab, words=["best"]))
+    matcher.add("TEST1", [Doc(en_vocab, words=["like"])])
+    matcher.add("TEST2", [Doc(en_vocab, words=["best"])])
     doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
     assert "TEST1" in matcher
     assert "TEST2" in matcher

@@ -95,9 +117,9 @@ def test_phrase_matcher_remove(en_vocab):

 def test_phrase_matcher_overlapping_with_remove(en_vocab):
     matcher = PhraseMatcher(en_vocab)
-    matcher.add("TEST", None, Doc(en_vocab, words=["like"]))
+    matcher.add("TEST", [Doc(en_vocab, words=["like"])])
     # TEST2 is added alongside TEST
-    matcher.add("TEST2", None, Doc(en_vocab, words=["like"]))
+    matcher.add("TEST2", [Doc(en_vocab, words=["like"])])
     doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
     assert "TEST" in matcher
     assert len(matcher) == 2

@@ -122,7 +144,7 @@ def test_phrase_matcher_string_attrs(en_vocab):
     pos2 = ["INTJ", "PUNCT", "PRON", "VERB", "NOUN", "ADV", "ADV"]
     pattern = get_doc(en_vocab, words=words1, pos=pos1)
     matcher = PhraseMatcher(en_vocab, attr="POS")
-    matcher.add("TEST", None, pattern)
+    matcher.add("TEST", [pattern])
     doc = get_doc(en_vocab, words=words2, pos=pos2)
     matches = matcher(doc)
     assert len(matches) == 1

@@ -140,7 +162,7 @@ def test_phrase_matcher_string_attrs_negative(en_vocab):
     pos2 = ["X", "X", "X"]
     pattern = get_doc(en_vocab, words=words1, pos=pos1)
     matcher = PhraseMatcher(en_vocab, attr="POS")
-    matcher.add("TEST", None, pattern)
+    matcher.add("TEST", [pattern])
     doc = get_doc(en_vocab, words=words2, pos=pos2)
     matches = matcher(doc)
     assert len(matches) == 0

@@ -151,7 +173,7 @@ def test_phrase_matcher_bool_attrs(en_vocab):
     words2 = ["No", "problem", ",", "he", "said", "."]
     pattern = Doc(en_vocab, words=words1)
     matcher = PhraseMatcher(en_vocab, attr="IS_PUNCT")
-    matcher.add("TEST", None, pattern)
+    matcher.add("TEST", [pattern])
     doc = Doc(en_vocab, words=words2)
     matches = matcher(doc)
     assert len(matches) == 2

@@ -173,15 +195,15 @@ def test_phrase_matcher_validation(en_vocab):
     doc3 = Doc(en_vocab, words=["Test"])
     matcher = PhraseMatcher(en_vocab, validate=True)
     with pytest.warns(UserWarning):
-        matcher.add("TEST1", None, doc1)
+        matcher.add("TEST1", [doc1])
     with pytest.warns(UserWarning):
-        matcher.add("TEST2", None, doc2)
+        matcher.add("TEST2", [doc2])
     with pytest.warns(None) as record:
-        matcher.add("TEST3", None, doc3)
+        matcher.add("TEST3", [doc3])
         assert not record.list
     matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
     with pytest.warns(None) as record:
-        matcher.add("TEST4", None, doc2)
+        matcher.add("TEST4", [doc2])
         assert not record.list

@@ -198,24 +220,24 @@ def test_attr_pipeline_checks(en_vocab):
     doc3 = Doc(en_vocab, words=["Test"])
     # DEP requires is_parsed
     matcher = PhraseMatcher(en_vocab, attr="DEP")
-    matcher.add("TEST1", None, doc1)
+    matcher.add("TEST1", [doc1])
     with pytest.raises(ValueError):
-        matcher.add("TEST2", None, doc2)
+        matcher.add("TEST2", [doc2])
     with pytest.raises(ValueError):
-        matcher.add("TEST3", None, doc3)
+        matcher.add("TEST3", [doc3])
     # TAG, POS, LEMMA require is_tagged
     for attr in ("TAG", "POS", "LEMMA"):
         matcher = PhraseMatcher(en_vocab, attr=attr)
-        matcher.add("TEST2", None, doc2)
+        matcher.add("TEST2", [doc2])
         with pytest.raises(ValueError):
-            matcher.add("TEST1", None, doc1)
+            matcher.add("TEST1", [doc1])
         with pytest.raises(ValueError):
-            matcher.add("TEST3", None, doc3)
+            matcher.add("TEST3", [doc3])
     # TEXT/ORTH only require tokens
     matcher = PhraseMatcher(en_vocab, attr="ORTH")
-    matcher.add("TEST3", None, doc3)
+    matcher.add("TEST3", [doc3])
     matcher = PhraseMatcher(en_vocab, attr="TEXT")
-    matcher.add("TEST3", None, doc3)
+    matcher.add("TEST3", [doc3])

@@ -223,7 +245,7 @@ def test_phrase_matcher_callback(en_vocab):
     doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
     pattern = Doc(en_vocab, words=["Google", "Now"])
     matcher = PhraseMatcher(en_vocab)
-    matcher.add("COMPANY", mock, pattern)
+    matcher.add("COMPANY", [pattern], on_match=mock)
     matches = matcher(doc)
     mock.assert_called_once_with(matcher, doc, 0, matches)

@@ -234,5 +256,13 @@ def test_phrase_matcher_remove_overlapping_patterns(en_vocab):
     pattern2 = Doc(en_vocab, words=["this", "is"])
     pattern3 = Doc(en_vocab, words=["this", "is", "a"])
     pattern4 = Doc(en_vocab, words=["this", "is", "a", "word"])
-    matcher.add("THIS", None, pattern1, pattern2, pattern3, pattern4)
+    matcher.add("THIS", [pattern1, pattern2, pattern3, pattern4])
     matcher.remove("THIS")
+
+def test_phrase_matcher_basic_check(en_vocab):
+    matcher = PhraseMatcher(en_vocab)
+    # Potential mistake: pass in pattern instead of list of patterns
+    pattern = Doc(en_vocab, words=["hello", "world"])
+    with pytest.raises(ValueError):
+        matcher.add("TEST", pattern)

View File

@@ -30,7 +30,7 @@ def test_issue118(en_tokenizer, patterns):
     doc = en_tokenizer(text)
     ORG = doc.vocab.strings["ORG"]
     matcher = Matcher(doc.vocab)
-    matcher.add("BostonCeltics", None, *patterns)
+    matcher.add("BostonCeltics", patterns)
     assert len(list(doc.ents)) == 0
     matches = [(ORG, start, end) for _, start, end in matcher(doc)]
     assert matches == [(ORG, 9, 11), (ORG, 10, 11)]

@@ -57,7 +57,7 @@ def test_issue118_prefix_reorder(en_tokenizer, patterns):
     doc = en_tokenizer(text)
     ORG = doc.vocab.strings["ORG"]
     matcher = Matcher(doc.vocab)
-    matcher.add("BostonCeltics", None, *patterns)
+    matcher.add("BostonCeltics", patterns)
     assert len(list(doc.ents)) == 0
     matches = [(ORG, start, end) for _, start, end in matcher(doc)]
     doc.ents += tuple(matches)[1:]

@@ -78,7 +78,7 @@ def test_issue242(en_tokenizer):
     ]
     doc = en_tokenizer(text)
     matcher = Matcher(doc.vocab)
-    matcher.add("FOOD", None, *patterns)
+    matcher.add("FOOD", patterns)
     matches = [(ent_type, start, end) for ent_type, start, end in matcher(doc)]
     match1, match2 = matches
     assert match1[1] == 3

@@ -127,17 +127,13 @@ def test_issue587(en_tokenizer):
     """Test that Matcher doesn't segfault on particular input"""
     doc = en_tokenizer("a b; c")
     matcher = Matcher(doc.vocab)
-    matcher.add("TEST1", None, [{ORTH: "a"}, {ORTH: "b"}])
+    matcher.add("TEST1", [[{ORTH: "a"}, {ORTH: "b"}]])
     matches = matcher(doc)
     assert len(matches) == 1
-    matcher.add(
-        "TEST2", None, [{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "c"}]
-    )
+    matcher.add("TEST2", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "c"}]])
     matches = matcher(doc)
     assert len(matches) == 2
-    matcher.add(
-        "TEST3", None, [{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "d"}]
-    )
+    matcher.add("TEST3", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "d"}]])
     matches = matcher(doc)
     assert len(matches) == 2

@@ -145,7 +141,7 @@ def test_issue587(en_tokenizer):
 def test_issue588(en_vocab):
     matcher = Matcher(en_vocab)
     with pytest.raises(ValueError):
-        matcher.add("TEST", None, [])
+        matcher.add("TEST", [[]])

 @pytest.mark.xfail
@@ -161,11 +157,9 @@ def test_issue590(en_vocab):
     doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"])
     matcher = Matcher(en_vocab)
     matcher.add(
-        "ab",
-        None,
-        [{"IS_ALPHA": True}, {"ORTH": ":"}, {"LIKE_NUM": True}, {"ORTH": "%"}],
+        "ab", [[{"IS_ALPHA": True}, {"ORTH": ":"}, {"LIKE_NUM": True}, {"ORTH": "%"}]]
     )
-    matcher.add("ab", None, [{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}])
+    matcher.add("ab", [[{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}]])
     matches = matcher(doc)
     assert len(matches) == 2

@@ -221,7 +215,7 @@ def test_issue615(en_tokenizer):
     label = "Sport_Equipment"
     doc = en_tokenizer(text)
     matcher = Matcher(doc.vocab)
-    matcher.add(label, merge_phrases, pattern)
+    matcher.add(label, [pattern], on_match=merge_phrases)
     matcher(doc)
     entities = list(doc.ents)
     assert entities != []

@@ -339,7 +333,7 @@ def test_issue850():
     vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
     matcher = Matcher(vocab)
     pattern = [{"LOWER": "bob"}, {"OP": "*"}, {"LOWER": "frank"}]
-    matcher.add("FarAway", None, pattern)
+    matcher.add("FarAway", [pattern])
     doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
     match = matcher(doc)
     assert len(match) == 1

@@ -353,7 +347,7 @@ def test_issue850_basic():
     vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
     matcher = Matcher(vocab)
     pattern = [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}]
-    matcher.add("FarAway", None, pattern)
+    matcher.add("FarAway", [pattern])
     doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
     match = matcher(doc)
     assert len(match) == 1

View File

@@ -111,7 +111,7 @@ def test_issue1434():
     hello_world = Doc(vocab, words=["Hello", "World"])
     hello = Doc(vocab, words=["Hello"])
     matcher = Matcher(vocab)
-    matcher.add("MyMatcher", None, pattern)
+    matcher.add("MyMatcher", [pattern])
     matches = matcher(hello_world)
     assert matches
     matches = matcher(hello)

@@ -133,7 +133,7 @@ def test_issue1450(string, start, end):
     """Test matcher works when patterns end with * operator."""
     pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
     matcher = Matcher(Vocab())
-    matcher.add("TSTEND", None, pattern)
+    matcher.add("TSTEND", [pattern])
     doc = Doc(Vocab(), words=string.split())
     matches = matcher(doc)
     if start is None or end is None:

View File

@@ -224,7 +224,7 @@ def test_issue1868():

 def test_issue1883():
     matcher = Matcher(Vocab())
-    matcher.add("pat1", None, [{"orth": "hello"}])
+    matcher.add("pat1", [[{"orth": "hello"}]])
     doc = Doc(matcher.vocab, words=["hello"])
     assert len(matcher(doc)) == 1
     new_matcher = copy.deepcopy(matcher)

@@ -249,7 +249,7 @@ def test_issue1915():

 def test_issue1945():
     """Test regression in Matcher introduced in v2.0.6."""
     matcher = Matcher(Vocab())
-    matcher.add("MWE", None, [{"orth": "a"}, {"orth": "a"}])
+    matcher.add("MWE", [[{"orth": "a"}, {"orth": "a"}]])
     doc = Doc(matcher.vocab, words=["a", "a", "a"])
     matches = matcher(doc)  # we should see two overlapping matches here
     assert len(matches) == 2

@@ -285,7 +285,7 @@ def test_issue1971(en_vocab):
         {"ORTH": "!", "OP": "?"},
     ]
     Token.set_extension("optional", default=False)
-    matcher.add("TEST", None, pattern)
+    matcher.add("TEST", [pattern])
     doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"])
     # We could also assert length 1 here, but this is more conclusive, because
     # the real problem here is that it returns a duplicate match for a match_id

@@ -299,7 +299,7 @@ def test_issue_1971_2(en_vocab):
     pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
     pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}]  # {"IN": ["EUR"]}}]
     doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
-    matcher.add("TEST1", None, pattern1, pattern2)
+    matcher.add("TEST1", [pattern1, pattern2])
     matches = matcher(doc)
     assert len(matches) == 2

@@ -310,8 +310,8 @@ def test_issue_1971_3(en_vocab):
     Token.set_extension("b", default=2, force=True)
     doc = Doc(en_vocab, words=["hello", "world"])
     matcher = Matcher(en_vocab)
-    matcher.add("A", None, [{"_": {"a": 1}}])
-    matcher.add("B", None, [{"_": {"b": 2}}])
+    matcher.add("A", [[{"_": {"a": 1}}]])
+    matcher.add("B", [[{"_": {"b": 2}}]])
     matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc))
     assert len(matches) == 4
     assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])

@@ -326,7 +326,7 @@ def test_issue_1971_4(en_vocab):
     matcher = Matcher(en_vocab)
     doc = Doc(en_vocab, words=["this", "is", "text"])
     pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
-    matcher.add("TEST", None, pattern)
+    matcher.add("TEST", [pattern])
     matches = matcher(doc)
     # Uncommenting this caused a segmentation fault
     assert len(matches) == 1

View File

@@ -128,7 +128,7 @@ def test_issue2464(en_vocab):
     """Test problem with successive ?. This is the same bug, so putting it here."""
     matcher = Matcher(en_vocab)
     doc = Doc(en_vocab, words=["a", "b"])
-    matcher.add("4", None, [{"OP": "?"}, {"OP": "?"}])
+    matcher.add("4", [[{"OP": "?"}, {"OP": "?"}]])
     matches = matcher(doc)
     assert len(matches) == 3

View File

@@ -37,7 +37,7 @@ def test_issue2569(en_tokenizer):
     doc = en_tokenizer("It is May 15, 1993.")
     doc.ents = [Span(doc, 2, 6, label=doc.vocab.strings["DATE"])]
     matcher = Matcher(doc.vocab)
-    matcher.add("RULE", None, [{"ENT_TYPE": "DATE", "OP": "+"}])
+    matcher.add("RULE", [[{"ENT_TYPE": "DATE", "OP": "+"}]])
     matched = [doc[start:end] for _, start, end in matcher(doc)]
     matched = sorted(matched, key=len, reverse=True)
     assert len(matched) == 10

@@ -89,7 +89,7 @@ def test_issue2671():
         {"IS_PUNCT": True, "OP": "?"},
         {"LOWER": "adrenaline"},
     ]
-    matcher.add(pattern_id, None, pattern)
+    matcher.add(pattern_id, [pattern])
     doc1 = nlp("This is a high-adrenaline situation.")
     doc2 = nlp("This is a high adrenaline situation.")
     matches1 = matcher(doc1)

View File

@@ -52,7 +52,7 @@ def test_issue3009(en_vocab):
     doc = get_doc(en_vocab, words=words, tags=tags)
     matcher = Matcher(en_vocab)
     for i, pattern in enumerate(patterns):
-        matcher.add(str(i), None, pattern)
+        matcher.add(str(i), [pattern])
     matches = matcher(doc)
     assert matches

@@ -116,8 +116,8 @@ def test_issue3248_1():
     total number of patterns."""
     nlp = English()
     matcher = PhraseMatcher(nlp.vocab)
-    matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c"))
-    matcher.add("TEST2", None, nlp("d"))
+    matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")])
+    matcher.add("TEST2", [nlp("d")])
     assert len(matcher) == 2

@@ -125,8 +125,8 @@ def test_issue3248_2():
     """Test that the PhraseMatcher can be pickled correctly."""
     nlp = English()
     matcher = PhraseMatcher(nlp.vocab)
-    matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c"))
-    matcher.add("TEST2", None, nlp("d"))
+    matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")])
+    matcher.add("TEST2", [nlp("d")])
     data = pickle.dumps(matcher)
     new_matcher = pickle.loads(data)
     assert len(new_matcher) == len(matcher)

@@ -170,7 +170,7 @@ def test_issue3328(en_vocab):
         [{"LOWER": {"IN": ["hello", "how"]}}],
         [{"LOWER": {"IN": ["you", "doing"]}}],
     ]
-    matcher.add("TEST", None, *patterns)
+    matcher.add("TEST", patterns)
     matches = matcher(doc)
     assert len(matches) == 4
     matched_texts = [doc[start:end].text for _, start, end in matches]

@@ -183,8 +183,8 @@ def test_issue3331(en_vocab):
     matches, one per rule.
     """
     matcher = PhraseMatcher(en_vocab)
-    matcher.add("A", None, Doc(en_vocab, words=["Barack", "Obama"]))
-    matcher.add("B", None, Doc(en_vocab, words=["Barack", "Obama"]))
+    matcher.add("A", [Doc(en_vocab, words=["Barack", "Obama"])])
+    matcher.add("B", [Doc(en_vocab, words=["Barack", "Obama"])])
     doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"])
     matches = matcher(doc)
     assert len(matches) == 2

View File

@@ -10,6 +10,6 @@ def test_issue3549(en_vocab):
     """Test that match pattern validation doesn't raise on empty errors."""
     matcher = Matcher(en_vocab, validate=True)
     pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
-    matcher.add("GOOD", None, pattern)
+    matcher.add("GOOD", [pattern])
     with pytest.raises(MatchPatternError):
-        matcher.add("BAD", None, [{"X": "Y"}])
+        matcher.add("BAD", [[{"X": "Y"}]])

View File

@@ -12,6 +12,6 @@ def test_issue3555(en_vocab):
     Token.set_extension("issue3555", default=None)
     matcher = Matcher(en_vocab)
     pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
-    matcher.add("TEST", None, pattern)
+    matcher.add("TEST", [pattern])
     doc = Doc(en_vocab, words=["have", "apple"])
     matcher(doc)

View File

@@ -12,10 +12,10 @@ def test_issue3839(en_vocab):
     match_id = "PATTERN"
     pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
     pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
-    matcher.add(match_id, None, pattern1)
+    matcher.add(match_id, [pattern1])
     matches = matcher(doc)
     assert matches[0][0] == en_vocab.strings[match_id]
     matcher = Matcher(en_vocab)
-    matcher.add(match_id, None, pattern2)
+    matcher.add(match_id, [pattern2])
     matches = matcher(doc)
     assert matches[0][0] == en_vocab.strings[match_id]

View File

@@ -10,5 +10,5 @@ def test_issue3879(en_vocab):
     assert len(doc) == 5
     pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
     matcher = Matcher(en_vocab)
-    matcher.add("TEST", None, pattern)
+    matcher.add("TEST", [pattern])
     assert len(matcher(doc)) == 2  # fails because of a FP match 'is a test'

View File

@@ -14,7 +14,7 @@ def test_issue3951(en_vocab):
         {"OP": "?"},
         {"LOWER": "world"},
     ]
-    matcher.add("TEST", None, pattern)
+    matcher.add("TEST", [pattern])
     doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
     matches = matcher(doc)
     assert len(matches) == 0

View File

@@ -9,8 +9,8 @@ def test_issue3972(en_vocab):
     """Test that the PhraseMatcher returns duplicates for duplicate match IDs.
     """
     matcher = PhraseMatcher(en_vocab)
-    matcher.add("A", None, Doc(en_vocab, words=["New", "York"]))
-    matcher.add("B", None, Doc(en_vocab, words=["New", "York"]))
+    matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
+    matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
     doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
     matches = matcher(doc)

View File

@@ -11,7 +11,7 @@ def test_issue4002(en_vocab):
     matcher = PhraseMatcher(en_vocab, attr="NORM")
     pattern1 = Doc(en_vocab, words=["c", "d"])
     assert [t.norm_ for t in pattern1] == ["c", "d"]
-    matcher.add("TEST", None, pattern1)
+    matcher.add("TEST", [pattern1])
     doc = Doc(en_vocab, words=["a", "b", "c", "d"])
     assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
     matches = matcher(doc)

@@ -21,6 +21,6 @@ def test_issue4002(en_vocab):
     pattern2[0].norm_ = "c"
     pattern2[1].norm_ = "d"
     assert [t.norm_ for t in pattern2] == ["c", "d"]
-    matcher.add("TEST", None, pattern2)
+    matcher.add("TEST", [pattern2])
     matches = matcher(doc)
     assert len(matches) == 1

View File

@@ -8,7 +8,7 @@ from spacy.tokens import Doc
 def test_issue4120(en_vocab):
     """Test that matches without a final {OP: ?} token are returned."""
     matcher = Matcher(en_vocab)
-    matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}])
+    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
     doc1 = Doc(en_vocab, words=["a"])
     assert len(matcher(doc1)) == 1  # works

@@ -16,11 +16,11 @@ def test_issue4120(en_vocab):
     assert len(matcher(doc2)) == 2  # fixed
     matcher = Matcher(en_vocab)
-    matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}])
+    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
     doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
     assert len(matcher(doc3)) == 2  # works
     matcher = Matcher(en_vocab)
-    matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}])
+    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
     doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
     assert len(matcher(doc4)) == 3  # fixed

View File

@@ -157,16 +157,19 @@ overwritten.
 | `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
 | `*patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |

-<Infobox title="Changed in v2.0" variant="warning">
+<Infobox title="Changed in v2.2.2" variant="warning">

-As of spaCy 2.0, `Matcher.add_pattern` and `Matcher.add_entity` are deprecated
-and have been replaced with a simpler [`Matcher.add`](/api/matcher#add) that
-lets you add a list of patterns and a callback for a given match ID.
+As of spaCy 2.2.2, `Matcher.add` also supports the new API, which will become
+the default in the future. The patterns are now the second argument and a list
+(instead of a variable number of arguments). The `on_match` callback becomes an
+optional keyword argument.

 ```diff
-- matcher.add_entity("GoogleNow", on_match=merge_phrases)
-- matcher.add_pattern("GoogleNow", [{ORTH: "Google"}, {ORTH: "Now"}])
-+ matcher.add('GoogleNow', merge_phrases, [{"ORTH": "Google"}, {"ORTH": "Now"}])
++ patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
+- matcher.add("GoogleNow", None, *patterns)
++ matcher.add("GoogleNow", patterns)
+- matcher.add("GoogleNow", on_match, *patterns)
++ matcher.add("GoogleNow", patterns, on_match=on_match)
 ```

 </Infobox>

View File

@@ -153,6 +153,23 @@ overwritten.
 | `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
 | `*docs` | `Doc` | `Doc` objects of the phrases to match. |

+<Infobox title="Changed in v2.2.2" variant="warning">
+
+As of spaCy 2.2.2, `PhraseMatcher.add` also supports the new API, which will
+become the default in the future. The `Doc` patterns are now the second argument
+and a list (instead of a variable number of arguments). The `on_match` callback
+becomes an optional keyword argument.
+
+```diff
+patterns = [nlp("health care reform"), nlp("healthcare reform")]
+- matcher.add("HEALTH", None, *patterns)
++ matcher.add("HEALTH", patterns)
+- matcher.add("HEALTH", on_match, *patterns)
++ matcher.add("HEALTH", patterns, on_match=on_match)
+```
+
+</Infobox>
+
 ## PhraseMatcher.remove {#remove tag="method" new="2.2"}

 Remove a rule from the matcher by match ID. A `KeyError` is raised if the key