mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Implement new API for {Phrase}Matcher.add (backwards-compatible) (#4522)
* Implement new API for {Phrase}Matcher.add (backwards-compatible)
* Update docs
* Also update DependencyMatcher.add
* Update internals
* Rewrite tests to use new API
* Add basic check for common mistake: raise an error with a suggestion if the user likely passed in a pattern instead of a list of patterns
* Fix typo [ci skip]
This commit is contained in:
parent
d2da117114
commit
cfffdba7b1
|
@ -504,6 +504,13 @@ class Errors(object):
|
||||||
E175 = ("Can't remove rule for unknown match pattern ID: {key}")
|
E175 = ("Can't remove rule for unknown match pattern ID: {key}")
|
||||||
E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
|
E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
|
||||||
E177 = ("Ill-formed IOB input detected: {tag}")
|
E177 = ("Ill-formed IOB input detected: {tag}")
|
||||||
|
E178 = ("Invalid pattern. Expected list of dicts but got: {pat}. Maybe you "
|
||||||
|
"accidentally passed a single pattern to Matcher.add instead of a "
|
||||||
|
"list of patterns? If you only want to add one pattern, make sure "
|
||||||
|
"to wrap it in a list. For example: matcher.add('{key}', [pattern])")
|
||||||
|
E179 = ("Invalid pattern. Expected a list of Doc objects but got a single "
|
||||||
|
"Doc. If you only want to add one pattern, make sure to wrap it "
|
||||||
|
"in a list. For example: matcher.add('{key}', [doc])")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
|
|
@ -102,7 +102,10 @@ cdef class DependencyMatcher:
|
||||||
visitedNodes[relation["SPEC"]["NBOR_NAME"]] = True
|
visitedNodes[relation["SPEC"]["NBOR_NAME"]] = True
|
||||||
idx = idx + 1
|
idx = idx + 1
|
||||||
|
|
||||||
def add(self, key, on_match, *patterns):
|
def add(self, key, patterns, *_patterns, on_match=None):
|
||||||
|
if patterns is None or hasattr(patterns, "__call__"): # old API
|
||||||
|
on_match = patterns
|
||||||
|
patterns = _patterns
|
||||||
for pattern in patterns:
|
for pattern in patterns:
|
||||||
if len(pattern) == 0:
|
if len(pattern) == 0:
|
||||||
raise ValueError(Errors.E012.format(key=key))
|
raise ValueError(Errors.E012.format(key=key))
|
||||||
|
|
|
@ -74,7 +74,7 @@ cdef class Matcher:
|
||||||
"""
|
"""
|
||||||
return self._normalize_key(key) in self._patterns
|
return self._normalize_key(key) in self._patterns
|
||||||
|
|
||||||
def add(self, key, on_match, *patterns):
|
def add(self, key, patterns, *_patterns, on_match=None):
|
||||||
"""Add a match-rule to the matcher. A match-rule consists of: an ID
|
"""Add a match-rule to the matcher. A match-rule consists of: an ID
|
||||||
key, an on_match callback, and one or more patterns.
|
key, an on_match callback, and one or more patterns.
|
||||||
|
|
||||||
|
@ -98,16 +98,29 @@ cdef class Matcher:
|
||||||
operator will behave non-greedily. This quirk in the semantics makes
|
operator will behave non-greedily. This quirk in the semantics makes
|
||||||
the matcher more efficient, by avoiding the need for back-tracking.
|
the matcher more efficient, by avoiding the need for back-tracking.
|
||||||
|
|
||||||
|
As of spaCy v2.2.2, Matcher.add supports the future API, which makes
|
||||||
|
the patterns the second argument and a list (instead of a variable
|
||||||
|
number of arguments). The on_match callback becomes an optional keyword
|
||||||
|
argument.
|
||||||
|
|
||||||
key (unicode): The match ID.
|
key (unicode): The match ID.
|
||||||
on_match (callable): Callback executed on match.
|
patterns (list): The patterns to add for the given key.
|
||||||
*patterns (list): List of token descriptions.
|
on_match (callable): Optional callback executed on match.
|
||||||
|
*_patterns (list): For backwards compatibility: list of patterns to add
|
||||||
|
as variable arguments. Will be ignored if a list of patterns is
|
||||||
|
provided as the second argument.
|
||||||
"""
|
"""
|
||||||
errors = {}
|
errors = {}
|
||||||
if on_match is not None and not hasattr(on_match, "__call__"):
|
if on_match is not None and not hasattr(on_match, "__call__"):
|
||||||
raise ValueError(Errors.E171.format(arg_type=type(on_match)))
|
raise ValueError(Errors.E171.format(arg_type=type(on_match)))
|
||||||
|
if patterns is None or hasattr(patterns, "__call__"): # old API
|
||||||
|
on_match = patterns
|
||||||
|
patterns = _patterns
|
||||||
for i, pattern in enumerate(patterns):
|
for i, pattern in enumerate(patterns):
|
||||||
if len(pattern) == 0:
|
if len(pattern) == 0:
|
||||||
raise ValueError(Errors.E012.format(key=key))
|
raise ValueError(Errors.E012.format(key=key))
|
||||||
|
if not isinstance(pattern, list):
|
||||||
|
raise ValueError(Errors.E178.format(pat=pattern, key=key))
|
||||||
if self.validator:
|
if self.validator:
|
||||||
errors[i] = validate_json(pattern, self.validator)
|
errors[i] = validate_json(pattern, self.validator)
|
||||||
if any(err for err in errors.values()):
|
if any(err for err in errors.values()):
|
||||||
|
|
|
@ -152,16 +152,27 @@ cdef class PhraseMatcher:
|
||||||
del self._callbacks[key]
|
del self._callbacks[key]
|
||||||
del self._docs[key]
|
del self._docs[key]
|
||||||
|
|
||||||
def add(self, key, on_match, *docs):
|
def add(self, key, docs, *_docs, on_match=None):
|
||||||
"""Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
|
"""Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
|
||||||
key, an on_match callback, and one or more patterns.
|
key, an on_match callback, and one or more patterns.
|
||||||
|
|
||||||
|
As of spaCy v2.2.2, PhraseMatcher.add supports the future API, which
|
||||||
|
makes the patterns the second argument and a list (instead of a variable
|
||||||
|
number of arguments). The on_match callback becomes an optional keyword
|
||||||
|
argument.
|
||||||
|
|
||||||
key (unicode): The match ID.
|
key (unicode): The match ID.
|
||||||
|
docs (list): List of `Doc` objects representing match patterns.
|
||||||
on_match (callable): Callback executed on match.
|
on_match (callable): Callback executed on match.
|
||||||
*docs (Doc): `Doc` objects representing match patterns.
|
*_docs (Doc): For backwards compatibility: list of patterns to add
|
||||||
|
as variable arguments. Will be ignored if a list of patterns is
|
||||||
|
provided as the second argument.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/phrasematcher#add
|
DOCS: https://spacy.io/api/phrasematcher#add
|
||||||
"""
|
"""
|
||||||
|
if docs is None or hasattr(docs, "__call__"): # old API
|
||||||
|
on_match = docs
|
||||||
|
docs = _docs
|
||||||
|
|
||||||
_ = self.vocab[key]
|
_ = self.vocab[key]
|
||||||
self._callbacks[key] = on_match
|
self._callbacks[key] = on_match
|
||||||
|
@ -171,6 +182,8 @@ cdef class PhraseMatcher:
|
||||||
cdef MapStruct* internal_node
|
cdef MapStruct* internal_node
|
||||||
cdef void* result
|
cdef void* result
|
||||||
|
|
||||||
|
if isinstance(docs, Doc):
|
||||||
|
raise ValueError(Errors.E179.format(key=key))
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
if len(doc) == 0:
|
if len(doc) == 0:
|
||||||
continue
|
continue
|
||||||
|
|
|
@ -204,9 +204,9 @@ class EntityRuler(object):
|
||||||
else:
|
else:
|
||||||
raise ValueError(Errors.E097.format(pattern=pattern))
|
raise ValueError(Errors.E097.format(pattern=pattern))
|
||||||
for label, patterns in self.token_patterns.items():
|
for label, patterns in self.token_patterns.items():
|
||||||
self.matcher.add(label, None, *patterns)
|
self.matcher.add(label, patterns)
|
||||||
for label, patterns in self.phrase_patterns.items():
|
for label, patterns in self.phrase_patterns.items():
|
||||||
self.phrase_matcher.add(label, None, *patterns)
|
self.phrase_matcher.add(label, patterns)
|
||||||
|
|
||||||
def _split_label(self, label):
|
def _split_label(self, label):
|
||||||
"""Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
|
"""Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
|
||||||
|
|
|
@ -17,7 +17,7 @@ def matcher(en_vocab):
|
||||||
}
|
}
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
for key, patterns in rules.items():
|
for key, patterns in rules.items():
|
||||||
matcher.add(key, None, *patterns)
|
matcher.add(key, patterns)
|
||||||
return matcher
|
return matcher
|
||||||
|
|
||||||
|
|
||||||
|
@ -25,11 +25,11 @@ def test_matcher_from_api_docs(en_vocab):
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
pattern = [{"ORTH": "test"}]
|
pattern = [{"ORTH": "test"}]
|
||||||
assert len(matcher) == 0
|
assert len(matcher) == 0
|
||||||
matcher.add("Rule", None, pattern)
|
matcher.add("Rule", [pattern])
|
||||||
assert len(matcher) == 1
|
assert len(matcher) == 1
|
||||||
matcher.remove("Rule")
|
matcher.remove("Rule")
|
||||||
assert "Rule" not in matcher
|
assert "Rule" not in matcher
|
||||||
matcher.add("Rule", None, pattern)
|
matcher.add("Rule", [pattern])
|
||||||
assert "Rule" in matcher
|
assert "Rule" in matcher
|
||||||
on_match, patterns = matcher.get("Rule")
|
on_match, patterns = matcher.get("Rule")
|
||||||
assert len(patterns[0])
|
assert len(patterns[0])
|
||||||
|
@ -52,7 +52,7 @@ def test_matcher_from_usage_docs(en_vocab):
|
||||||
token.vocab[token.text].norm_ = "happy emoji"
|
token.vocab[token.text].norm_ = "happy emoji"
|
||||||
|
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
matcher.add("HAPPY", label_sentiment, *pos_patterns)
|
matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)
|
||||||
matcher(doc)
|
matcher(doc)
|
||||||
assert doc.sentiment != 0
|
assert doc.sentiment != 0
|
||||||
assert doc[1].norm_ == "happy emoji"
|
assert doc[1].norm_ == "happy emoji"
|
||||||
|
@ -60,11 +60,33 @@ def test_matcher_from_usage_docs(en_vocab):
|
||||||
|
|
||||||
def test_matcher_len_contains(matcher):
|
def test_matcher_len_contains(matcher):
|
||||||
assert len(matcher) == 3
|
assert len(matcher) == 3
|
||||||
matcher.add("TEST", None, [{"ORTH": "test"}])
|
matcher.add("TEST", [[{"ORTH": "test"}]])
|
||||||
assert "TEST" in matcher
|
assert "TEST" in matcher
|
||||||
assert "TEST2" not in matcher
|
assert "TEST2" not in matcher
|
||||||
|
|
||||||
|
|
||||||
|
def test_matcher_add_new_old_api(en_vocab):
|
||||||
|
doc = Doc(en_vocab, words=["a", "b"])
|
||||||
|
patterns = [[{"TEXT": "a"}], [{"TEXT": "a"}, {"TEXT": "b"}]]
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
matcher.add("OLD_API", None, *patterns)
|
||||||
|
assert len(matcher(doc)) == 2
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
on_match = Mock()
|
||||||
|
matcher.add("OLD_API_CALLBACK", on_match, *patterns)
|
||||||
|
assert len(matcher(doc)) == 2
|
||||||
|
assert on_match.call_count == 2
|
||||||
|
# New API: add(key: str, patterns: List[List[dict]], on_match: Callable)
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
matcher.add("NEW_API", patterns)
|
||||||
|
assert len(matcher(doc)) == 2
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
on_match = Mock()
|
||||||
|
matcher.add("NEW_API_CALLBACK", patterns, on_match=on_match)
|
||||||
|
assert len(matcher(doc)) == 2
|
||||||
|
assert on_match.call_count == 2
|
||||||
|
|
||||||
|
|
||||||
def test_matcher_no_match(matcher):
|
def test_matcher_no_match(matcher):
|
||||||
doc = Doc(matcher.vocab, words=["I", "like", "cheese", "."])
|
doc = Doc(matcher.vocab, words=["I", "like", "cheese", "."])
|
||||||
assert matcher(doc) == []
|
assert matcher(doc) == []
|
||||||
|
@ -100,12 +122,12 @@ def test_matcher_empty_dict(en_vocab):
|
||||||
"""Test matcher allows empty token specs, meaning match on any token."""
|
"""Test matcher allows empty token specs, meaning match on any token."""
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
doc = Doc(matcher.vocab, words=["a", "b", "c"])
|
doc = Doc(matcher.vocab, words=["a", "b", "c"])
|
||||||
matcher.add("A.C", None, [{"ORTH": "a"}, {}, {"ORTH": "c"}])
|
matcher.add("A.C", [[{"ORTH": "a"}, {}, {"ORTH": "c"}]])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 1
|
assert len(matches) == 1
|
||||||
assert matches[0][1:] == (0, 3)
|
assert matches[0][1:] == (0, 3)
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
matcher.add("A.", None, [{"ORTH": "a"}, {}])
|
matcher.add("A.", [[{"ORTH": "a"}, {}]])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert matches[0][1:] == (0, 2)
|
assert matches[0][1:] == (0, 2)
|
||||||
|
|
||||||
|
@ -114,7 +136,7 @@ def test_matcher_operator_shadow(en_vocab):
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
doc = Doc(matcher.vocab, words=["a", "b", "c"])
|
doc = Doc(matcher.vocab, words=["a", "b", "c"])
|
||||||
pattern = [{"ORTH": "a"}, {"IS_ALPHA": True, "OP": "+"}, {"ORTH": "c"}]
|
pattern = [{"ORTH": "a"}, {"IS_ALPHA": True, "OP": "+"}, {"ORTH": "c"}]
|
||||||
matcher.add("A.C", None, pattern)
|
matcher.add("A.C", [pattern])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 1
|
assert len(matches) == 1
|
||||||
assert matches[0][1:] == (0, 3)
|
assert matches[0][1:] == (0, 3)
|
||||||
|
@ -136,12 +158,12 @@ def test_matcher_match_zero(matcher):
|
||||||
{"IS_PUNCT": True},
|
{"IS_PUNCT": True},
|
||||||
{"ORTH": '"'},
|
{"ORTH": '"'},
|
||||||
]
|
]
|
||||||
matcher.add("Quote", None, pattern1)
|
matcher.add("Quote", [pattern1])
|
||||||
doc = Doc(matcher.vocab, words=words1)
|
doc = Doc(matcher.vocab, words=words1)
|
||||||
assert len(matcher(doc)) == 1
|
assert len(matcher(doc)) == 1
|
||||||
doc = Doc(matcher.vocab, words=words2)
|
doc = Doc(matcher.vocab, words=words2)
|
||||||
assert len(matcher(doc)) == 0
|
assert len(matcher(doc)) == 0
|
||||||
matcher.add("Quote", None, pattern2)
|
matcher.add("Quote", [pattern2])
|
||||||
assert len(matcher(doc)) == 0
|
assert len(matcher(doc)) == 0
|
||||||
|
|
||||||
|
|
||||||
|
@ -149,7 +171,7 @@ def test_matcher_match_zero_plus(matcher):
|
||||||
words = 'He said , " some words " ...'.split()
|
words = 'He said , " some words " ...'.split()
|
||||||
pattern = [{"ORTH": '"'}, {"OP": "*", "IS_PUNCT": False}, {"ORTH": '"'}]
|
pattern = [{"ORTH": '"'}, {"OP": "*", "IS_PUNCT": False}, {"ORTH": '"'}]
|
||||||
matcher = Matcher(matcher.vocab)
|
matcher = Matcher(matcher.vocab)
|
||||||
matcher.add("Quote", None, pattern)
|
matcher.add("Quote", [pattern])
|
||||||
doc = Doc(matcher.vocab, words=words)
|
doc = Doc(matcher.vocab, words=words)
|
||||||
assert len(matcher(doc)) == 1
|
assert len(matcher(doc)) == 1
|
||||||
|
|
||||||
|
@ -160,11 +182,8 @@ def test_matcher_match_one_plus(matcher):
|
||||||
doc = Doc(control.vocab, words=["Philippe", "Philippe"])
|
doc = Doc(control.vocab, words=["Philippe", "Philippe"])
|
||||||
m = control(doc)
|
m = control(doc)
|
||||||
assert len(m) == 2
|
assert len(m) == 2
|
||||||
matcher.add(
|
pattern = [{"ORTH": "Philippe", "OP": "1"}, {"ORTH": "Philippe", "OP": "+"}]
|
||||||
"KleenePhilippe",
|
matcher.add("KleenePhilippe", [pattern])
|
||||||
None,
|
|
||||||
[{"ORTH": "Philippe", "OP": "1"}, {"ORTH": "Philippe", "OP": "+"}],
|
|
||||||
)
|
|
||||||
m = matcher(doc)
|
m = matcher(doc)
|
||||||
assert len(m) == 1
|
assert len(m) == 1
|
||||||
|
|
||||||
|
@ -172,7 +191,7 @@ def test_matcher_match_one_plus(matcher):
|
||||||
def test_matcher_any_token_operator(en_vocab):
|
def test_matcher_any_token_operator(en_vocab):
|
||||||
"""Test that patterns with "any token" {} work with operators."""
|
"""Test that patterns with "any token" {} work with operators."""
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
matcher.add("TEST", None, [{"ORTH": "test"}, {"OP": "*"}])
|
matcher.add("TEST", [[{"ORTH": "test"}, {"OP": "*"}]])
|
||||||
doc = Doc(en_vocab, words=["test", "hello", "world"])
|
doc = Doc(en_vocab, words=["test", "hello", "world"])
|
||||||
matches = [doc[start:end].text for _, start, end in matcher(doc)]
|
matches = [doc[start:end].text for _, start, end in matcher(doc)]
|
||||||
assert len(matches) == 3
|
assert len(matches) == 3
|
||||||
|
@ -186,7 +205,7 @@ def test_matcher_extension_attribute(en_vocab):
|
||||||
get_is_fruit = lambda token: token.text in ("apple", "banana")
|
get_is_fruit = lambda token: token.text in ("apple", "banana")
|
||||||
Token.set_extension("is_fruit", getter=get_is_fruit, force=True)
|
Token.set_extension("is_fruit", getter=get_is_fruit, force=True)
|
||||||
pattern = [{"ORTH": "an"}, {"_": {"is_fruit": True}}]
|
pattern = [{"ORTH": "an"}, {"_": {"is_fruit": True}}]
|
||||||
matcher.add("HAVING_FRUIT", None, pattern)
|
matcher.add("HAVING_FRUIT", [pattern])
|
||||||
doc = Doc(en_vocab, words=["an", "apple"])
|
doc = Doc(en_vocab, words=["an", "apple"])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 1
|
assert len(matches) == 1
|
||||||
|
@ -198,7 +217,7 @@ def test_matcher_extension_attribute(en_vocab):
|
||||||
def test_matcher_set_value(en_vocab):
|
def test_matcher_set_value(en_vocab):
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
pattern = [{"ORTH": {"IN": ["an", "a"]}}]
|
pattern = [{"ORTH": {"IN": ["an", "a"]}}]
|
||||||
matcher.add("A_OR_AN", None, pattern)
|
matcher.add("A_OR_AN", [pattern])
|
||||||
doc = Doc(en_vocab, words=["an", "a", "apple"])
|
doc = Doc(en_vocab, words=["an", "a", "apple"])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 2
|
assert len(matches) == 2
|
||||||
|
@ -210,7 +229,7 @@ def test_matcher_set_value(en_vocab):
|
||||||
def test_matcher_set_value_operator(en_vocab):
|
def test_matcher_set_value_operator(en_vocab):
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
pattern = [{"ORTH": {"IN": ["a", "the"]}, "OP": "?"}, {"ORTH": "house"}]
|
pattern = [{"ORTH": {"IN": ["a", "the"]}, "OP": "?"}, {"ORTH": "house"}]
|
||||||
matcher.add("DET_HOUSE", None, pattern)
|
matcher.add("DET_HOUSE", [pattern])
|
||||||
doc = Doc(en_vocab, words=["In", "a", "house"])
|
doc = Doc(en_vocab, words=["In", "a", "house"])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 2
|
assert len(matches) == 2
|
||||||
|
@ -222,7 +241,7 @@ def test_matcher_set_value_operator(en_vocab):
|
||||||
def test_matcher_regex(en_vocab):
|
def test_matcher_regex(en_vocab):
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
pattern = [{"ORTH": {"REGEX": r"(?:a|an)"}}]
|
pattern = [{"ORTH": {"REGEX": r"(?:a|an)"}}]
|
||||||
matcher.add("A_OR_AN", None, pattern)
|
matcher.add("A_OR_AN", [pattern])
|
||||||
doc = Doc(en_vocab, words=["an", "a", "hi"])
|
doc = Doc(en_vocab, words=["an", "a", "hi"])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 2
|
assert len(matches) == 2
|
||||||
|
@ -234,7 +253,7 @@ def test_matcher_regex(en_vocab):
|
||||||
def test_matcher_regex_shape(en_vocab):
|
def test_matcher_regex_shape(en_vocab):
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
pattern = [{"SHAPE": {"REGEX": r"^[^x]+$"}}]
|
pattern = [{"SHAPE": {"REGEX": r"^[^x]+$"}}]
|
||||||
matcher.add("NON_ALPHA", None, pattern)
|
matcher.add("NON_ALPHA", [pattern])
|
||||||
doc = Doc(en_vocab, words=["99", "problems", "!"])
|
doc = Doc(en_vocab, words=["99", "problems", "!"])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 2
|
assert len(matches) == 2
|
||||||
|
@ -246,7 +265,7 @@ def test_matcher_regex_shape(en_vocab):
|
||||||
def test_matcher_compare_length(en_vocab):
|
def test_matcher_compare_length(en_vocab):
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
pattern = [{"LENGTH": {">=": 2}}]
|
pattern = [{"LENGTH": {">=": 2}}]
|
||||||
matcher.add("LENGTH_COMPARE", None, pattern)
|
matcher.add("LENGTH_COMPARE", [pattern])
|
||||||
doc = Doc(en_vocab, words=["a", "aa", "aaa"])
|
doc = Doc(en_vocab, words=["a", "aa", "aaa"])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 2
|
assert len(matches) == 2
|
||||||
|
@ -260,7 +279,7 @@ def test_matcher_extension_set_membership(en_vocab):
|
||||||
get_reversed = lambda token: "".join(reversed(token.text))
|
get_reversed = lambda token: "".join(reversed(token.text))
|
||||||
Token.set_extension("reversed", getter=get_reversed, force=True)
|
Token.set_extension("reversed", getter=get_reversed, force=True)
|
||||||
pattern = [{"_": {"reversed": {"IN": ["eyb", "ih"]}}}]
|
pattern = [{"_": {"reversed": {"IN": ["eyb", "ih"]}}}]
|
||||||
matcher.add("REVERSED", None, pattern)
|
matcher.add("REVERSED", [pattern])
|
||||||
doc = Doc(en_vocab, words=["hi", "bye", "hello"])
|
doc = Doc(en_vocab, words=["hi", "bye", "hello"])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 2
|
assert len(matches) == 2
|
||||||
|
@ -328,9 +347,9 @@ def dependency_matcher(en_vocab):
|
||||||
]
|
]
|
||||||
|
|
||||||
matcher = DependencyMatcher(en_vocab)
|
matcher = DependencyMatcher(en_vocab)
|
||||||
matcher.add("pattern1", None, pattern1)
|
matcher.add("pattern1", [pattern1])
|
||||||
matcher.add("pattern2", None, pattern2)
|
matcher.add("pattern2", [pattern2])
|
||||||
matcher.add("pattern3", None, pattern3)
|
matcher.add("pattern3", [pattern3])
|
||||||
|
|
||||||
return matcher
|
return matcher
|
||||||
|
|
||||||
|
@ -347,6 +366,14 @@ def test_dependency_matcher_compile(dependency_matcher):
|
||||||
# assert matches[2][1] == [[4, 3, 2]]
|
# assert matches[2][1] == [[4, 3, 2]]
|
||||||
|
|
||||||
|
|
||||||
|
def test_matcher_basic_check(en_vocab):
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
# Potential mistake: pass in pattern instead of list of patterns
|
||||||
|
pattern = [{"TEXT": "hello"}, {"TEXT": "world"}]
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
matcher.add("TEST", pattern)
|
||||||
|
|
||||||
|
|
||||||
def test_attr_pipeline_checks(en_vocab):
|
def test_attr_pipeline_checks(en_vocab):
|
||||||
doc1 = Doc(en_vocab, words=["Test"])
|
doc1 = Doc(en_vocab, words=["Test"])
|
||||||
doc1.is_parsed = True
|
doc1.is_parsed = True
|
||||||
|
@ -355,7 +382,7 @@ def test_attr_pipeline_checks(en_vocab):
|
||||||
doc3 = Doc(en_vocab, words=["Test"])
|
doc3 = Doc(en_vocab, words=["Test"])
|
||||||
# DEP requires is_parsed
|
# DEP requires is_parsed
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
matcher.add("TEST", None, [{"DEP": "a"}])
|
matcher.add("TEST", [[{"DEP": "a"}]])
|
||||||
matcher(doc1)
|
matcher(doc1)
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
matcher(doc2)
|
matcher(doc2)
|
||||||
|
@ -364,7 +391,7 @@ def test_attr_pipeline_checks(en_vocab):
|
||||||
# TAG, POS, LEMMA require is_tagged
|
# TAG, POS, LEMMA require is_tagged
|
||||||
for attr in ("TAG", "POS", "LEMMA"):
|
for attr in ("TAG", "POS", "LEMMA"):
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
matcher.add("TEST", None, [{attr: "a"}])
|
matcher.add("TEST", [[{attr: "a"}]])
|
||||||
matcher(doc2)
|
matcher(doc2)
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
matcher(doc1)
|
matcher(doc1)
|
||||||
|
@ -372,12 +399,12 @@ def test_attr_pipeline_checks(en_vocab):
|
||||||
matcher(doc3)
|
matcher(doc3)
|
||||||
# TEXT/ORTH only require tokens
|
# TEXT/ORTH only require tokens
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
matcher.add("TEST", None, [{"ORTH": "a"}])
|
matcher.add("TEST", [[{"ORTH": "a"}]])
|
||||||
matcher(doc1)
|
matcher(doc1)
|
||||||
matcher(doc2)
|
matcher(doc2)
|
||||||
matcher(doc3)
|
matcher(doc3)
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
matcher.add("TEST", None, [{"TEXT": "a"}])
|
matcher.add("TEST", [[{"TEXT": "a"}]])
|
||||||
matcher(doc1)
|
matcher(doc1)
|
||||||
matcher(doc2)
|
matcher(doc2)
|
||||||
matcher(doc3)
|
matcher(doc3)
|
||||||
|
@ -407,7 +434,7 @@ def test_attr_pipeline_checks(en_vocab):
|
||||||
def test_matcher_schema_token_attributes(en_vocab, pattern, text):
|
def test_matcher_schema_token_attributes(en_vocab, pattern, text):
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
doc = Doc(en_vocab, words=text.split(" "))
|
doc = Doc(en_vocab, words=text.split(" "))
|
||||||
matcher.add("Rule", None, pattern)
|
matcher.add("Rule", [pattern])
|
||||||
assert len(matcher) == 1
|
assert len(matcher) == 1
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 1
|
assert len(matches) == 1
|
||||||
|
@ -417,7 +444,7 @@ def test_matcher_valid_callback(en_vocab):
|
||||||
"""Test that on_match can only be None or callable."""
|
"""Test that on_match can only be None or callable."""
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
matcher.add("TEST", [], [{"TEXT": "test"}])
|
matcher.add("TEST", [[{"TEXT": "test"}]], on_match=[])
|
||||||
matcher(Doc(en_vocab, words=["test"]))
|
matcher(Doc(en_vocab, words=["test"]))
|
||||||
|
|
||||||
|
|
||||||
|
@ -425,7 +452,7 @@ def test_matcher_callback(en_vocab):
|
||||||
mock = Mock()
|
mock = Mock()
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
pattern = [{"ORTH": "test"}]
|
pattern = [{"ORTH": "test"}]
|
||||||
matcher.add("Rule", mock, pattern)
|
matcher.add("Rule", [pattern], on_match=mock)
|
||||||
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
|
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
mock.assert_called_once_with(matcher, doc, 0, matches)
|
mock.assert_called_once_with(matcher, doc, 0, matches)
|
||||||
|
|
|
@ -55,7 +55,7 @@ def test_greedy_matching(doc, text, pattern, re_pattern):
|
||||||
"""Test that the greedy matching behavior of the * op is consistent with
|
"""Test that the greedy matching behavior of the * op is consistent with
|
||||||
other re implementations."""
|
other re implementations."""
|
||||||
matcher = Matcher(doc.vocab)
|
matcher = Matcher(doc.vocab)
|
||||||
matcher.add(re_pattern, None, pattern)
|
matcher.add(re_pattern, [pattern])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
re_matches = [m.span() for m in re.finditer(re_pattern, text)]
|
re_matches = [m.span() for m in re.finditer(re_pattern, text)]
|
||||||
for match, re_match in zip(matches, re_matches):
|
for match, re_match in zip(matches, re_matches):
|
||||||
|
@ -77,7 +77,7 @@ def test_match_consuming(doc, text, pattern, re_pattern):
|
||||||
"""Test that matcher.__call__ consumes tokens on a match similar to
|
"""Test that matcher.__call__ consumes tokens on a match similar to
|
||||||
re.findall."""
|
re.findall."""
|
||||||
matcher = Matcher(doc.vocab)
|
matcher = Matcher(doc.vocab)
|
||||||
matcher.add(re_pattern, None, pattern)
|
matcher.add(re_pattern, [pattern])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
re_matches = [m.span() for m in re.finditer(re_pattern, text)]
|
re_matches = [m.span() for m in re.finditer(re_pattern, text)]
|
||||||
assert len(matches) == len(re_matches)
|
assert len(matches) == len(re_matches)
|
||||||
|
@ -111,7 +111,7 @@ def test_operator_combos(en_vocab):
|
||||||
pattern.append({"ORTH": part[0], "OP": "+"})
|
pattern.append({"ORTH": part[0], "OP": "+"})
|
||||||
else:
|
else:
|
||||||
pattern.append({"ORTH": part})
|
pattern.append({"ORTH": part})
|
||||||
matcher.add("PATTERN", None, pattern)
|
matcher.add("PATTERN", [pattern])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
if result:
|
if result:
|
||||||
assert matches, (string, pattern_str)
|
assert matches, (string, pattern_str)
|
||||||
|
@ -123,7 +123,7 @@ def test_matcher_end_zero_plus(en_vocab):
|
||||||
"""Test matcher works when patterns end with * operator. (issue 1450)"""
|
"""Test matcher works when patterns end with * operator. (issue 1450)"""
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
|
pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
|
||||||
matcher.add("TSTEND", None, pattern)
|
matcher.add("TSTEND", [pattern])
|
||||||
nlp = lambda string: Doc(matcher.vocab, words=string.split())
|
nlp = lambda string: Doc(matcher.vocab, words=string.split())
|
||||||
assert len(matcher(nlp("a"))) == 1
|
assert len(matcher(nlp("a"))) == 1
|
||||||
assert len(matcher(nlp("a b"))) == 2
|
assert len(matcher(nlp("a b"))) == 2
|
||||||
|
@ -140,7 +140,7 @@ def test_matcher_sets_return_correct_tokens(en_vocab):
|
||||||
[{"LOWER": {"IN": ["one"]}}],
|
[{"LOWER": {"IN": ["one"]}}],
|
||||||
[{"LOWER": {"IN": ["two"]}}],
|
[{"LOWER": {"IN": ["two"]}}],
|
||||||
]
|
]
|
||||||
matcher.add("TEST", None, *patterns)
|
matcher.add("TEST", patterns)
|
||||||
doc = Doc(en_vocab, words="zero one two three".split())
|
doc = Doc(en_vocab, words="zero one two three".split())
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
texts = [Span(doc, s, e, label=L).text for L, s, e in matches]
|
texts = [Span(doc, s, e, label=L).text for L, s, e in matches]
|
||||||
|
@ -154,7 +154,7 @@ def test_matcher_remove():
|
||||||
|
|
||||||
pattern = [{"ORTH": "test"}, {"OP": "?"}]
|
pattern = [{"ORTH": "test"}, {"OP": "?"}]
|
||||||
assert len(matcher) == 0
|
assert len(matcher) == 0
|
||||||
matcher.add("Rule", None, pattern)
|
matcher.add("Rule", [pattern])
|
||||||
assert "Rule" in matcher
|
assert "Rule" in matcher
|
||||||
|
|
||||||
# should give two matches
|
# should give two matches
|
||||||
|
|
|
@ -50,7 +50,7 @@ def validator():
|
||||||
def test_matcher_pattern_validation(en_vocab, pattern):
|
def test_matcher_pattern_validation(en_vocab, pattern):
|
||||||
matcher = Matcher(en_vocab, validate=True)
|
matcher = Matcher(en_vocab, validate=True)
|
||||||
with pytest.raises(MatchPatternError):
|
with pytest.raises(MatchPatternError):
|
||||||
matcher.add("TEST", None, pattern)
|
matcher.add("TEST", [pattern])
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("pattern,n_errors,_", TEST_PATTERNS)
|
@pytest.mark.parametrize("pattern,n_errors,_", TEST_PATTERNS)
|
||||||
|
@ -71,6 +71,6 @@ def test_minimal_pattern_validation(en_vocab, pattern, n_errors, n_min_errors):
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
if n_min_errors > 0:
|
if n_min_errors > 0:
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
matcher.add("TEST", None, pattern)
|
matcher.add("TEST", [pattern])
|
||||||
elif n_errors == 0:
|
elif n_errors == 0:
|
||||||
matcher.add("TEST", None, pattern)
|
matcher.add("TEST", [pattern])
|
||||||
|
|
|
@ -13,53 +13,75 @@ def test_matcher_phrase_matcher(en_vocab):
|
||||||
# intermediate phrase
|
# intermediate phrase
|
||||||
pattern = Doc(en_vocab, words=["Google", "Now"])
|
pattern = Doc(en_vocab, words=["Google", "Now"])
|
||||||
matcher = PhraseMatcher(en_vocab)
|
matcher = PhraseMatcher(en_vocab)
|
||||||
matcher.add("COMPANY", None, pattern)
|
matcher.add("COMPANY", [pattern])
|
||||||
assert len(matcher(doc)) == 1
|
assert len(matcher(doc)) == 1
|
||||||
# initial token
|
# initial token
|
||||||
pattern = Doc(en_vocab, words=["I"])
|
pattern = Doc(en_vocab, words=["I"])
|
||||||
matcher = PhraseMatcher(en_vocab)
|
matcher = PhraseMatcher(en_vocab)
|
||||||
matcher.add("I", None, pattern)
|
matcher.add("I", [pattern])
|
||||||
assert len(matcher(doc)) == 1
|
assert len(matcher(doc)) == 1
|
||||||
# initial phrase
|
# initial phrase
|
||||||
pattern = Doc(en_vocab, words=["I", "like"])
|
pattern = Doc(en_vocab, words=["I", "like"])
|
||||||
matcher = PhraseMatcher(en_vocab)
|
matcher = PhraseMatcher(en_vocab)
|
||||||
matcher.add("ILIKE", None, pattern)
|
matcher.add("ILIKE", [pattern])
|
||||||
assert len(matcher(doc)) == 1
|
assert len(matcher(doc)) == 1
|
||||||
# final token
|
# final token
|
||||||
pattern = Doc(en_vocab, words=["best"])
|
pattern = Doc(en_vocab, words=["best"])
|
||||||
matcher = PhraseMatcher(en_vocab)
|
matcher = PhraseMatcher(en_vocab)
|
||||||
matcher.add("BEST", None, pattern)
|
matcher.add("BEST", [pattern])
|
||||||
assert len(matcher(doc)) == 1
|
assert len(matcher(doc)) == 1
|
||||||
# final phrase
|
# final phrase
|
||||||
pattern = Doc(en_vocab, words=["Now", "best"])
|
pattern = Doc(en_vocab, words=["Now", "best"])
|
||||||
matcher = PhraseMatcher(en_vocab)
|
matcher = PhraseMatcher(en_vocab)
|
||||||
matcher.add("NOWBEST", None, pattern)
|
matcher.add("NOWBEST", [pattern])
|
||||||
assert len(matcher(doc)) == 1
|
assert len(matcher(doc)) == 1
|
||||||
|
|
||||||
|
|
||||||
def test_phrase_matcher_length(en_vocab):
|
def test_phrase_matcher_length(en_vocab):
|
||||||
matcher = PhraseMatcher(en_vocab)
|
matcher = PhraseMatcher(en_vocab)
|
||||||
assert len(matcher) == 0
|
assert len(matcher) == 0
|
||||||
matcher.add("TEST", None, Doc(en_vocab, words=["test"]))
|
matcher.add("TEST", [Doc(en_vocab, words=["test"])])
|
||||||
assert len(matcher) == 1
|
assert len(matcher) == 1
|
||||||
matcher.add("TEST2", None, Doc(en_vocab, words=["test2"]))
|
matcher.add("TEST2", [Doc(en_vocab, words=["test2"])])
|
||||||
assert len(matcher) == 2
|
assert len(matcher) == 2
|
||||||
|
|
||||||
|
|
||||||
def test_phrase_matcher_contains(en_vocab):
|
def test_phrase_matcher_contains(en_vocab):
|
||||||
matcher = PhraseMatcher(en_vocab)
|
matcher = PhraseMatcher(en_vocab)
|
||||||
matcher.add("TEST", None, Doc(en_vocab, words=["test"]))
|
matcher.add("TEST", [Doc(en_vocab, words=["test"])])
|
||||||
assert "TEST" in matcher
|
assert "TEST" in matcher
|
||||||
assert "TEST2" not in matcher
|
assert "TEST2" not in matcher
|
||||||
|
|
||||||
|
|
||||||
|
def test_phrase_matcher_add_new_api(en_vocab):
|
||||||
|
doc = Doc(en_vocab, words=["a", "b"])
|
||||||
|
patterns = [Doc(en_vocab, words=["a"]), Doc(en_vocab, words=["a", "b"])]
|
||||||
|
matcher = PhraseMatcher(en_vocab)
|
||||||
|
matcher.add("OLD_API", None, *patterns)
|
||||||
|
assert len(matcher(doc)) == 2
|
||||||
|
matcher = PhraseMatcher(en_vocab)
|
||||||
|
on_match = Mock()
|
||||||
|
matcher.add("OLD_API_CALLBACK", on_match, *patterns)
|
||||||
|
assert len(matcher(doc)) == 2
|
||||||
|
assert on_match.call_count == 2
|
||||||
|
# New API: add(key: str, patterns: List[Doc], on_match: Callable)
|
||||||
|
matcher = PhraseMatcher(en_vocab)
|
||||||
|
matcher.add("NEW_API", patterns)
|
||||||
|
assert len(matcher(doc)) == 2
|
||||||
|
matcher = PhraseMatcher(en_vocab)
|
||||||
|
on_match = Mock()
|
||||||
|
matcher.add("NEW_API_CALLBACK", patterns, on_match=on_match)
|
||||||
|
assert len(matcher(doc)) == 2
|
||||||
|
assert on_match.call_count == 2
|
||||||
|
|
||||||
|
|
||||||
def test_phrase_matcher_repeated_add(en_vocab):
|
def test_phrase_matcher_repeated_add(en_vocab):
|
||||||
matcher = PhraseMatcher(en_vocab)
|
matcher = PhraseMatcher(en_vocab)
|
||||||
# match ID only gets added once
|
# match ID only gets added once
|
||||||
matcher.add("TEST", None, Doc(en_vocab, words=["like"]))
|
matcher.add("TEST", [Doc(en_vocab, words=["like"])])
|
||||||
matcher.add("TEST", None, Doc(en_vocab, words=["like"]))
|
matcher.add("TEST", [Doc(en_vocab, words=["like"])])
|
||||||
matcher.add("TEST", None, Doc(en_vocab, words=["like"]))
|
matcher.add("TEST", [Doc(en_vocab, words=["like"])])
|
||||||
matcher.add("TEST", None, Doc(en_vocab, words=["like"]))
|
matcher.add("TEST", [Doc(en_vocab, words=["like"])])
|
||||||
doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
|
doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
|
||||||
assert "TEST" in matcher
|
assert "TEST" in matcher
|
||||||
assert "TEST2" not in matcher
|
assert "TEST2" not in matcher
|
||||||
|
@ -68,8 +90,8 @@ def test_phrase_matcher_repeated_add(en_vocab):
|
||||||
|
|
||||||
def test_phrase_matcher_remove(en_vocab):
|
def test_phrase_matcher_remove(en_vocab):
|
||||||
matcher = PhraseMatcher(en_vocab)
|
matcher = PhraseMatcher(en_vocab)
|
||||||
matcher.add("TEST1", None, Doc(en_vocab, words=["like"]))
|
matcher.add("TEST1", [Doc(en_vocab, words=["like"])])
|
||||||
matcher.add("TEST2", None, Doc(en_vocab, words=["best"]))
|
matcher.add("TEST2", [Doc(en_vocab, words=["best"])])
|
||||||
doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
|
doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
|
||||||
assert "TEST1" in matcher
|
assert "TEST1" in matcher
|
||||||
assert "TEST2" in matcher
|
assert "TEST2" in matcher
|
||||||
|
@ -95,9 +117,9 @@ def test_phrase_matcher_remove(en_vocab):
|
||||||
|
|
||||||
def test_phrase_matcher_overlapping_with_remove(en_vocab):
|
def test_phrase_matcher_overlapping_with_remove(en_vocab):
|
||||||
matcher = PhraseMatcher(en_vocab)
|
matcher = PhraseMatcher(en_vocab)
|
||||||
matcher.add("TEST", None, Doc(en_vocab, words=["like"]))
|
matcher.add("TEST", [Doc(en_vocab, words=["like"])])
|
||||||
# TEST2 is added alongside TEST
|
# TEST2 is added alongside TEST
|
||||||
matcher.add("TEST2", None, Doc(en_vocab, words=["like"]))
|
matcher.add("TEST2", [Doc(en_vocab, words=["like"])])
|
||||||
doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
|
doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
|
||||||
assert "TEST" in matcher
|
assert "TEST" in matcher
|
||||||
assert len(matcher) == 2
|
assert len(matcher) == 2
|
||||||
|
@ -122,7 +144,7 @@ def test_phrase_matcher_string_attrs(en_vocab):
|
||||||
pos2 = ["INTJ", "PUNCT", "PRON", "VERB", "NOUN", "ADV", "ADV"]
|
pos2 = ["INTJ", "PUNCT", "PRON", "VERB", "NOUN", "ADV", "ADV"]
|
||||||
pattern = get_doc(en_vocab, words=words1, pos=pos1)
|
pattern = get_doc(en_vocab, words=words1, pos=pos1)
|
||||||
matcher = PhraseMatcher(en_vocab, attr="POS")
|
matcher = PhraseMatcher(en_vocab, attr="POS")
|
||||||
matcher.add("TEST", None, pattern)
|
matcher.add("TEST", [pattern])
|
||||||
doc = get_doc(en_vocab, words=words2, pos=pos2)
|
doc = get_doc(en_vocab, words=words2, pos=pos2)
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 1
|
assert len(matches) == 1
|
||||||
|
@ -140,7 +162,7 @@ def test_phrase_matcher_string_attrs_negative(en_vocab):
|
||||||
pos2 = ["X", "X", "X"]
|
pos2 = ["X", "X", "X"]
|
||||||
pattern = get_doc(en_vocab, words=words1, pos=pos1)
|
pattern = get_doc(en_vocab, words=words1, pos=pos1)
|
||||||
matcher = PhraseMatcher(en_vocab, attr="POS")
|
matcher = PhraseMatcher(en_vocab, attr="POS")
|
||||||
matcher.add("TEST", None, pattern)
|
matcher.add("TEST", [pattern])
|
||||||
doc = get_doc(en_vocab, words=words2, pos=pos2)
|
doc = get_doc(en_vocab, words=words2, pos=pos2)
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 0
|
assert len(matches) == 0
|
||||||
|
@ -151,7 +173,7 @@ def test_phrase_matcher_bool_attrs(en_vocab):
|
||||||
words2 = ["No", "problem", ",", "he", "said", "."]
|
words2 = ["No", "problem", ",", "he", "said", "."]
|
||||||
pattern = Doc(en_vocab, words=words1)
|
pattern = Doc(en_vocab, words=words1)
|
||||||
matcher = PhraseMatcher(en_vocab, attr="IS_PUNCT")
|
matcher = PhraseMatcher(en_vocab, attr="IS_PUNCT")
|
||||||
matcher.add("TEST", None, pattern)
|
matcher.add("TEST", [pattern])
|
||||||
doc = Doc(en_vocab, words=words2)
|
doc = Doc(en_vocab, words=words2)
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 2
|
assert len(matches) == 2
|
||||||
|
@ -173,15 +195,15 @@ def test_phrase_matcher_validation(en_vocab):
|
||||||
doc3 = Doc(en_vocab, words=["Test"])
|
doc3 = Doc(en_vocab, words=["Test"])
|
||||||
matcher = PhraseMatcher(en_vocab, validate=True)
|
matcher = PhraseMatcher(en_vocab, validate=True)
|
||||||
with pytest.warns(UserWarning):
|
with pytest.warns(UserWarning):
|
||||||
matcher.add("TEST1", None, doc1)
|
matcher.add("TEST1", [doc1])
|
||||||
with pytest.warns(UserWarning):
|
with pytest.warns(UserWarning):
|
||||||
matcher.add("TEST2", None, doc2)
|
matcher.add("TEST2", [doc2])
|
||||||
with pytest.warns(None) as record:
|
with pytest.warns(None) as record:
|
||||||
matcher.add("TEST3", None, doc3)
|
matcher.add("TEST3", [doc3])
|
||||||
assert not record.list
|
assert not record.list
|
||||||
matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
|
matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
|
||||||
with pytest.warns(None) as record:
|
with pytest.warns(None) as record:
|
||||||
matcher.add("TEST4", None, doc2)
|
matcher.add("TEST4", [doc2])
|
||||||
assert not record.list
|
assert not record.list
|
||||||
|
|
||||||
|
|
||||||
|
@ -198,24 +220,24 @@ def test_attr_pipeline_checks(en_vocab):
|
||||||
doc3 = Doc(en_vocab, words=["Test"])
|
doc3 = Doc(en_vocab, words=["Test"])
|
||||||
# DEP requires is_parsed
|
# DEP requires is_parsed
|
||||||
matcher = PhraseMatcher(en_vocab, attr="DEP")
|
matcher = PhraseMatcher(en_vocab, attr="DEP")
|
||||||
matcher.add("TEST1", None, doc1)
|
matcher.add("TEST1", [doc1])
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
matcher.add("TEST2", None, doc2)
|
matcher.add("TEST2", [doc2])
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
matcher.add("TEST3", None, doc3)
|
matcher.add("TEST3", [doc3])
|
||||||
# TAG, POS, LEMMA require is_tagged
|
# TAG, POS, LEMMA require is_tagged
|
||||||
for attr in ("TAG", "POS", "LEMMA"):
|
for attr in ("TAG", "POS", "LEMMA"):
|
||||||
matcher = PhraseMatcher(en_vocab, attr=attr)
|
matcher = PhraseMatcher(en_vocab, attr=attr)
|
||||||
matcher.add("TEST2", None, doc2)
|
matcher.add("TEST2", [doc2])
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
matcher.add("TEST1", None, doc1)
|
matcher.add("TEST1", [doc1])
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
matcher.add("TEST3", None, doc3)
|
matcher.add("TEST3", [doc3])
|
||||||
# TEXT/ORTH only require tokens
|
# TEXT/ORTH only require tokens
|
||||||
matcher = PhraseMatcher(en_vocab, attr="ORTH")
|
matcher = PhraseMatcher(en_vocab, attr="ORTH")
|
||||||
matcher.add("TEST3", None, doc3)
|
matcher.add("TEST3", [doc3])
|
||||||
matcher = PhraseMatcher(en_vocab, attr="TEXT")
|
matcher = PhraseMatcher(en_vocab, attr="TEXT")
|
||||||
matcher.add("TEST3", None, doc3)
|
matcher.add("TEST3", [doc3])
|
||||||
|
|
||||||
|
|
||||||
def test_phrase_matcher_callback(en_vocab):
|
def test_phrase_matcher_callback(en_vocab):
|
||||||
|
@ -223,7 +245,7 @@ def test_phrase_matcher_callback(en_vocab):
|
||||||
doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
|
doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
|
||||||
pattern = Doc(en_vocab, words=["Google", "Now"])
|
pattern = Doc(en_vocab, words=["Google", "Now"])
|
||||||
matcher = PhraseMatcher(en_vocab)
|
matcher = PhraseMatcher(en_vocab)
|
||||||
matcher.add("COMPANY", mock, pattern)
|
matcher.add("COMPANY", [pattern], on_match=mock)
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
mock.assert_called_once_with(matcher, doc, 0, matches)
|
mock.assert_called_once_with(matcher, doc, 0, matches)
|
||||||
|
|
||||||
|
@ -234,5 +256,13 @@ def test_phrase_matcher_remove_overlapping_patterns(en_vocab):
|
||||||
pattern2 = Doc(en_vocab, words=["this", "is"])
|
pattern2 = Doc(en_vocab, words=["this", "is"])
|
||||||
pattern3 = Doc(en_vocab, words=["this", "is", "a"])
|
pattern3 = Doc(en_vocab, words=["this", "is", "a"])
|
||||||
pattern4 = Doc(en_vocab, words=["this", "is", "a", "word"])
|
pattern4 = Doc(en_vocab, words=["this", "is", "a", "word"])
|
||||||
matcher.add("THIS", None, pattern1, pattern2, pattern3, pattern4)
|
matcher.add("THIS", [pattern1, pattern2, pattern3, pattern4])
|
||||||
matcher.remove("THIS")
|
matcher.remove("THIS")
|
||||||
|
|
||||||
|
|
||||||
|
def test_phrase_matcher_basic_check(en_vocab):
|
||||||
|
matcher = PhraseMatcher(en_vocab)
|
||||||
|
# Potential mistake: pass in pattern instead of list of patterns
|
||||||
|
pattern = Doc(en_vocab, words=["hello", "world"])
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
matcher.add("TEST", pattern)
|
||||||
|
|
|
@ -30,7 +30,7 @@ def test_issue118(en_tokenizer, patterns):
|
||||||
doc = en_tokenizer(text)
|
doc = en_tokenizer(text)
|
||||||
ORG = doc.vocab.strings["ORG"]
|
ORG = doc.vocab.strings["ORG"]
|
||||||
matcher = Matcher(doc.vocab)
|
matcher = Matcher(doc.vocab)
|
||||||
matcher.add("BostonCeltics", None, *patterns)
|
matcher.add("BostonCeltics", patterns)
|
||||||
assert len(list(doc.ents)) == 0
|
assert len(list(doc.ents)) == 0
|
||||||
matches = [(ORG, start, end) for _, start, end in matcher(doc)]
|
matches = [(ORG, start, end) for _, start, end in matcher(doc)]
|
||||||
assert matches == [(ORG, 9, 11), (ORG, 10, 11)]
|
assert matches == [(ORG, 9, 11), (ORG, 10, 11)]
|
||||||
|
@ -57,7 +57,7 @@ def test_issue118_prefix_reorder(en_tokenizer, patterns):
|
||||||
doc = en_tokenizer(text)
|
doc = en_tokenizer(text)
|
||||||
ORG = doc.vocab.strings["ORG"]
|
ORG = doc.vocab.strings["ORG"]
|
||||||
matcher = Matcher(doc.vocab)
|
matcher = Matcher(doc.vocab)
|
||||||
matcher.add("BostonCeltics", None, *patterns)
|
matcher.add("BostonCeltics", patterns)
|
||||||
assert len(list(doc.ents)) == 0
|
assert len(list(doc.ents)) == 0
|
||||||
matches = [(ORG, start, end) for _, start, end in matcher(doc)]
|
matches = [(ORG, start, end) for _, start, end in matcher(doc)]
|
||||||
doc.ents += tuple(matches)[1:]
|
doc.ents += tuple(matches)[1:]
|
||||||
|
@ -78,7 +78,7 @@ def test_issue242(en_tokenizer):
|
||||||
]
|
]
|
||||||
doc = en_tokenizer(text)
|
doc = en_tokenizer(text)
|
||||||
matcher = Matcher(doc.vocab)
|
matcher = Matcher(doc.vocab)
|
||||||
matcher.add("FOOD", None, *patterns)
|
matcher.add("FOOD", patterns)
|
||||||
matches = [(ent_type, start, end) for ent_type, start, end in matcher(doc)]
|
matches = [(ent_type, start, end) for ent_type, start, end in matcher(doc)]
|
||||||
match1, match2 = matches
|
match1, match2 = matches
|
||||||
assert match1[1] == 3
|
assert match1[1] == 3
|
||||||
|
@ -127,17 +127,13 @@ def test_issue587(en_tokenizer):
|
||||||
"""Test that Matcher doesn't segfault on particular input"""
|
"""Test that Matcher doesn't segfault on particular input"""
|
||||||
doc = en_tokenizer("a b; c")
|
doc = en_tokenizer("a b; c")
|
||||||
matcher = Matcher(doc.vocab)
|
matcher = Matcher(doc.vocab)
|
||||||
matcher.add("TEST1", None, [{ORTH: "a"}, {ORTH: "b"}])
|
matcher.add("TEST1", [[{ORTH: "a"}, {ORTH: "b"}]])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 1
|
assert len(matches) == 1
|
||||||
matcher.add(
|
matcher.add("TEST2", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "c"}]])
|
||||||
"TEST2", None, [{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "c"}]
|
|
||||||
)
|
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 2
|
assert len(matches) == 2
|
||||||
matcher.add(
|
matcher.add("TEST3", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "d"}]])
|
||||||
"TEST3", None, [{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "d"}]
|
|
||||||
)
|
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 2
|
assert len(matches) == 2
|
||||||
|
|
||||||
|
@ -145,7 +141,7 @@ def test_issue587(en_tokenizer):
|
||||||
def test_issue588(en_vocab):
|
def test_issue588(en_vocab):
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
matcher.add("TEST", None, [])
|
matcher.add("TEST", [[]])
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
@pytest.mark.xfail
|
||||||
|
@ -161,11 +157,9 @@ def test_issue590(en_vocab):
|
||||||
doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"])
|
doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"])
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
matcher.add(
|
matcher.add(
|
||||||
"ab",
|
"ab", [[{"IS_ALPHA": True}, {"ORTH": ":"}, {"LIKE_NUM": True}, {"ORTH": "%"}]]
|
||||||
None,
|
|
||||||
[{"IS_ALPHA": True}, {"ORTH": ":"}, {"LIKE_NUM": True}, {"ORTH": "%"}],
|
|
||||||
)
|
)
|
||||||
matcher.add("ab", None, [{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}])
|
matcher.add("ab", [[{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}]])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 2
|
assert len(matches) == 2
|
||||||
|
|
||||||
|
@ -221,7 +215,7 @@ def test_issue615(en_tokenizer):
|
||||||
label = "Sport_Equipment"
|
label = "Sport_Equipment"
|
||||||
doc = en_tokenizer(text)
|
doc = en_tokenizer(text)
|
||||||
matcher = Matcher(doc.vocab)
|
matcher = Matcher(doc.vocab)
|
||||||
matcher.add(label, merge_phrases, pattern)
|
matcher.add(label, [pattern], on_match=merge_phrases)
|
||||||
matcher(doc)
|
matcher(doc)
|
||||||
entities = list(doc.ents)
|
entities = list(doc.ents)
|
||||||
assert entities != []
|
assert entities != []
|
||||||
|
@ -339,7 +333,7 @@ def test_issue850():
|
||||||
vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
|
vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
|
||||||
matcher = Matcher(vocab)
|
matcher = Matcher(vocab)
|
||||||
pattern = [{"LOWER": "bob"}, {"OP": "*"}, {"LOWER": "frank"}]
|
pattern = [{"LOWER": "bob"}, {"OP": "*"}, {"LOWER": "frank"}]
|
||||||
matcher.add("FarAway", None, pattern)
|
matcher.add("FarAway", [pattern])
|
||||||
doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
|
doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
|
||||||
match = matcher(doc)
|
match = matcher(doc)
|
||||||
assert len(match) == 1
|
assert len(match) == 1
|
||||||
|
@ -353,7 +347,7 @@ def test_issue850_basic():
|
||||||
vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
|
vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
|
||||||
matcher = Matcher(vocab)
|
matcher = Matcher(vocab)
|
||||||
pattern = [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}]
|
pattern = [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}]
|
||||||
matcher.add("FarAway", None, pattern)
|
matcher.add("FarAway", [pattern])
|
||||||
doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
|
doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
|
||||||
match = matcher(doc)
|
match = matcher(doc)
|
||||||
assert len(match) == 1
|
assert len(match) == 1
|
||||||
|
|
|
@ -111,7 +111,7 @@ def test_issue1434():
|
||||||
hello_world = Doc(vocab, words=["Hello", "World"])
|
hello_world = Doc(vocab, words=["Hello", "World"])
|
||||||
hello = Doc(vocab, words=["Hello"])
|
hello = Doc(vocab, words=["Hello"])
|
||||||
matcher = Matcher(vocab)
|
matcher = Matcher(vocab)
|
||||||
matcher.add("MyMatcher", None, pattern)
|
matcher.add("MyMatcher", [pattern])
|
||||||
matches = matcher(hello_world)
|
matches = matcher(hello_world)
|
||||||
assert matches
|
assert matches
|
||||||
matches = matcher(hello)
|
matches = matcher(hello)
|
||||||
|
@ -133,7 +133,7 @@ def test_issue1450(string, start, end):
|
||||||
"""Test matcher works when patterns end with * operator."""
|
"""Test matcher works when patterns end with * operator."""
|
||||||
pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
|
pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
|
||||||
matcher = Matcher(Vocab())
|
matcher = Matcher(Vocab())
|
||||||
matcher.add("TSTEND", None, pattern)
|
matcher.add("TSTEND", [pattern])
|
||||||
doc = Doc(Vocab(), words=string.split())
|
doc = Doc(Vocab(), words=string.split())
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
if start is None or end is None:
|
if start is None or end is None:
|
||||||
|
|
|
@ -224,7 +224,7 @@ def test_issue1868():
|
||||||
|
|
||||||
def test_issue1883():
|
def test_issue1883():
|
||||||
matcher = Matcher(Vocab())
|
matcher = Matcher(Vocab())
|
||||||
matcher.add("pat1", None, [{"orth": "hello"}])
|
matcher.add("pat1", [[{"orth": "hello"}]])
|
||||||
doc = Doc(matcher.vocab, words=["hello"])
|
doc = Doc(matcher.vocab, words=["hello"])
|
||||||
assert len(matcher(doc)) == 1
|
assert len(matcher(doc)) == 1
|
||||||
new_matcher = copy.deepcopy(matcher)
|
new_matcher = copy.deepcopy(matcher)
|
||||||
|
@ -249,7 +249,7 @@ def test_issue1915():
|
||||||
def test_issue1945():
|
def test_issue1945():
|
||||||
"""Test regression in Matcher introduced in v2.0.6."""
|
"""Test regression in Matcher introduced in v2.0.6."""
|
||||||
matcher = Matcher(Vocab())
|
matcher = Matcher(Vocab())
|
||||||
matcher.add("MWE", None, [{"orth": "a"}, {"orth": "a"}])
|
matcher.add("MWE", [[{"orth": "a"}, {"orth": "a"}]])
|
||||||
doc = Doc(matcher.vocab, words=["a", "a", "a"])
|
doc = Doc(matcher.vocab, words=["a", "a", "a"])
|
||||||
matches = matcher(doc) # we should see two overlapping matches here
|
matches = matcher(doc) # we should see two overlapping matches here
|
||||||
assert len(matches) == 2
|
assert len(matches) == 2
|
||||||
|
@ -285,7 +285,7 @@ def test_issue1971(en_vocab):
|
||||||
{"ORTH": "!", "OP": "?"},
|
{"ORTH": "!", "OP": "?"},
|
||||||
]
|
]
|
||||||
Token.set_extension("optional", default=False)
|
Token.set_extension("optional", default=False)
|
||||||
matcher.add("TEST", None, pattern)
|
matcher.add("TEST", [pattern])
|
||||||
doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"])
|
doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"])
|
||||||
# We could also assert length 1 here, but this is more conclusive, because
|
# We could also assert length 1 here, but this is more conclusive, because
|
||||||
# the real problem here is that it returns a duplicate match for a match_id
|
# the real problem here is that it returns a duplicate match for a match_id
|
||||||
|
@ -299,7 +299,7 @@ def test_issue_1971_2(en_vocab):
|
||||||
pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
|
pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
|
||||||
pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}] # {"IN": ["EUR"]}}]
|
pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}] # {"IN": ["EUR"]}}]
|
||||||
doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
|
doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
|
||||||
matcher.add("TEST1", None, pattern1, pattern2)
|
matcher.add("TEST1", [pattern1, pattern2])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 2
|
assert len(matches) == 2
|
||||||
|
|
||||||
|
@ -310,8 +310,8 @@ def test_issue_1971_3(en_vocab):
|
||||||
Token.set_extension("b", default=2, force=True)
|
Token.set_extension("b", default=2, force=True)
|
||||||
doc = Doc(en_vocab, words=["hello", "world"])
|
doc = Doc(en_vocab, words=["hello", "world"])
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
matcher.add("A", None, [{"_": {"a": 1}}])
|
matcher.add("A", [[{"_": {"a": 1}}]])
|
||||||
matcher.add("B", None, [{"_": {"b": 2}}])
|
matcher.add("B", [[{"_": {"b": 2}}]])
|
||||||
matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc))
|
matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc))
|
||||||
assert len(matches) == 4
|
assert len(matches) == 4
|
||||||
assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])
|
assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])
|
||||||
|
@ -326,7 +326,7 @@ def test_issue_1971_4(en_vocab):
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
doc = Doc(en_vocab, words=["this", "is", "text"])
|
doc = Doc(en_vocab, words=["this", "is", "text"])
|
||||||
pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
|
pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
|
||||||
matcher.add("TEST", None, pattern)
|
matcher.add("TEST", [pattern])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
# Uncommenting this caused a segmentation fault
|
# Uncommenting this caused a segmentation fault
|
||||||
assert len(matches) == 1
|
assert len(matches) == 1
|
||||||
|
|
|
@ -128,7 +128,7 @@ def test_issue2464(en_vocab):
|
||||||
"""Test problem with successive ?. This is the same bug, so putting it here."""
|
"""Test problem with successive ?. This is the same bug, so putting it here."""
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
doc = Doc(en_vocab, words=["a", "b"])
|
doc = Doc(en_vocab, words=["a", "b"])
|
||||||
matcher.add("4", None, [{"OP": "?"}, {"OP": "?"}])
|
matcher.add("4", [[{"OP": "?"}, {"OP": "?"}]])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 3
|
assert len(matches) == 3
|
||||||
|
|
||||||
|
|
|
@ -37,7 +37,7 @@ def test_issue2569(en_tokenizer):
|
||||||
doc = en_tokenizer("It is May 15, 1993.")
|
doc = en_tokenizer("It is May 15, 1993.")
|
||||||
doc.ents = [Span(doc, 2, 6, label=doc.vocab.strings["DATE"])]
|
doc.ents = [Span(doc, 2, 6, label=doc.vocab.strings["DATE"])]
|
||||||
matcher = Matcher(doc.vocab)
|
matcher = Matcher(doc.vocab)
|
||||||
matcher.add("RULE", None, [{"ENT_TYPE": "DATE", "OP": "+"}])
|
matcher.add("RULE", [[{"ENT_TYPE": "DATE", "OP": "+"}]])
|
||||||
matched = [doc[start:end] for _, start, end in matcher(doc)]
|
matched = [doc[start:end] for _, start, end in matcher(doc)]
|
||||||
matched = sorted(matched, key=len, reverse=True)
|
matched = sorted(matched, key=len, reverse=True)
|
||||||
assert len(matched) == 10
|
assert len(matched) == 10
|
||||||
|
@ -89,7 +89,7 @@ def test_issue2671():
|
||||||
{"IS_PUNCT": True, "OP": "?"},
|
{"IS_PUNCT": True, "OP": "?"},
|
||||||
{"LOWER": "adrenaline"},
|
{"LOWER": "adrenaline"},
|
||||||
]
|
]
|
||||||
matcher.add(pattern_id, None, pattern)
|
matcher.add(pattern_id, [pattern])
|
||||||
doc1 = nlp("This is a high-adrenaline situation.")
|
doc1 = nlp("This is a high-adrenaline situation.")
|
||||||
doc2 = nlp("This is a high adrenaline situation.")
|
doc2 = nlp("This is a high adrenaline situation.")
|
||||||
matches1 = matcher(doc1)
|
matches1 = matcher(doc1)
|
||||||
|
|
|
@ -52,7 +52,7 @@ def test_issue3009(en_vocab):
|
||||||
doc = get_doc(en_vocab, words=words, tags=tags)
|
doc = get_doc(en_vocab, words=words, tags=tags)
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
for i, pattern in enumerate(patterns):
|
for i, pattern in enumerate(patterns):
|
||||||
matcher.add(str(i), None, pattern)
|
matcher.add(str(i), [pattern])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert matches
|
assert matches
|
||||||
|
|
||||||
|
@ -116,8 +116,8 @@ def test_issue3248_1():
|
||||||
total number of patterns."""
|
total number of patterns."""
|
||||||
nlp = English()
|
nlp = English()
|
||||||
matcher = PhraseMatcher(nlp.vocab)
|
matcher = PhraseMatcher(nlp.vocab)
|
||||||
matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c"))
|
matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")])
|
||||||
matcher.add("TEST2", None, nlp("d"))
|
matcher.add("TEST2", [nlp("d")])
|
||||||
assert len(matcher) == 2
|
assert len(matcher) == 2
|
||||||
|
|
||||||
|
|
||||||
|
@ -125,8 +125,8 @@ def test_issue3248_2():
|
||||||
"""Test that the PhraseMatcher can be pickled correctly."""
|
"""Test that the PhraseMatcher can be pickled correctly."""
|
||||||
nlp = English()
|
nlp = English()
|
||||||
matcher = PhraseMatcher(nlp.vocab)
|
matcher = PhraseMatcher(nlp.vocab)
|
||||||
matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c"))
|
matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")])
|
||||||
matcher.add("TEST2", None, nlp("d"))
|
matcher.add("TEST2", [nlp("d")])
|
||||||
data = pickle.dumps(matcher)
|
data = pickle.dumps(matcher)
|
||||||
new_matcher = pickle.loads(data)
|
new_matcher = pickle.loads(data)
|
||||||
assert len(new_matcher) == len(matcher)
|
assert len(new_matcher) == len(matcher)
|
||||||
|
@ -170,7 +170,7 @@ def test_issue3328(en_vocab):
|
||||||
[{"LOWER": {"IN": ["hello", "how"]}}],
|
[{"LOWER": {"IN": ["hello", "how"]}}],
|
||||||
[{"LOWER": {"IN": ["you", "doing"]}}],
|
[{"LOWER": {"IN": ["you", "doing"]}}],
|
||||||
]
|
]
|
||||||
matcher.add("TEST", None, *patterns)
|
matcher.add("TEST", patterns)
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 4
|
assert len(matches) == 4
|
||||||
matched_texts = [doc[start:end].text for _, start, end in matches]
|
matched_texts = [doc[start:end].text for _, start, end in matches]
|
||||||
|
@ -183,8 +183,8 @@ def test_issue3331(en_vocab):
|
||||||
matches, one per rule.
|
matches, one per rule.
|
||||||
"""
|
"""
|
||||||
matcher = PhraseMatcher(en_vocab)
|
matcher = PhraseMatcher(en_vocab)
|
||||||
matcher.add("A", None, Doc(en_vocab, words=["Barack", "Obama"]))
|
matcher.add("A", [Doc(en_vocab, words=["Barack", "Obama"])])
|
||||||
matcher.add("B", None, Doc(en_vocab, words=["Barack", "Obama"]))
|
matcher.add("B", [Doc(en_vocab, words=["Barack", "Obama"])])
|
||||||
doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"])
|
doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 2
|
assert len(matches) == 2
|
||||||
|
|
|
@ -10,6 +10,6 @@ def test_issue3549(en_vocab):
|
||||||
"""Test that match pattern validation doesn't raise on empty errors."""
|
"""Test that match pattern validation doesn't raise on empty errors."""
|
||||||
matcher = Matcher(en_vocab, validate=True)
|
matcher = Matcher(en_vocab, validate=True)
|
||||||
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
|
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
|
||||||
matcher.add("GOOD", None, pattern)
|
matcher.add("GOOD", [pattern])
|
||||||
with pytest.raises(MatchPatternError):
|
with pytest.raises(MatchPatternError):
|
||||||
matcher.add("BAD", None, [{"X": "Y"}])
|
matcher.add("BAD", [[{"X": "Y"}]])
|
||||||
|
|
|
@ -12,6 +12,6 @@ def test_issue3555(en_vocab):
|
||||||
Token.set_extension("issue3555", default=None)
|
Token.set_extension("issue3555", default=None)
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
|
pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
|
||||||
matcher.add("TEST", None, pattern)
|
matcher.add("TEST", [pattern])
|
||||||
doc = Doc(en_vocab, words=["have", "apple"])
|
doc = Doc(en_vocab, words=["have", "apple"])
|
||||||
matcher(doc)
|
matcher(doc)
|
||||||
|
|
|
@ -12,10 +12,10 @@ def test_issue3839(en_vocab):
|
||||||
match_id = "PATTERN"
|
match_id = "PATTERN"
|
||||||
pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
|
pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
|
||||||
pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
|
pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
|
||||||
matcher.add(match_id, None, pattern1)
|
matcher.add(match_id, [pattern1])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert matches[0][0] == en_vocab.strings[match_id]
|
assert matches[0][0] == en_vocab.strings[match_id]
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
matcher.add(match_id, None, pattern2)
|
matcher.add(match_id, [pattern2])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert matches[0][0] == en_vocab.strings[match_id]
|
assert matches[0][0] == en_vocab.strings[match_id]
|
||||||
|
|
|
@ -10,5 +10,5 @@ def test_issue3879(en_vocab):
|
||||||
assert len(doc) == 5
|
assert len(doc) == 5
|
||||||
pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
|
pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
matcher.add("TEST", None, pattern)
|
matcher.add("TEST", [pattern])
|
||||||
assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test'
|
assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test'
|
||||||
|
|
|
@ -14,7 +14,7 @@ def test_issue3951(en_vocab):
|
||||||
{"OP": "?"},
|
{"OP": "?"},
|
||||||
{"LOWER": "world"},
|
{"LOWER": "world"},
|
||||||
]
|
]
|
||||||
matcher.add("TEST", None, pattern)
|
matcher.add("TEST", [pattern])
|
||||||
doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
|
doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 0
|
assert len(matches) == 0
|
||||||
|
|
|
@ -9,8 +9,8 @@ def test_issue3972(en_vocab):
|
||||||
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
|
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
|
||||||
"""
|
"""
|
||||||
matcher = PhraseMatcher(en_vocab)
|
matcher = PhraseMatcher(en_vocab)
|
||||||
matcher.add("A", None, Doc(en_vocab, words=["New", "York"]))
|
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
|
||||||
matcher.add("B", None, Doc(en_vocab, words=["New", "York"]))
|
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
|
||||||
doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
|
doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
|
|
||||||
|
|
|
@ -11,7 +11,7 @@ def test_issue4002(en_vocab):
|
||||||
matcher = PhraseMatcher(en_vocab, attr="NORM")
|
matcher = PhraseMatcher(en_vocab, attr="NORM")
|
||||||
pattern1 = Doc(en_vocab, words=["c", "d"])
|
pattern1 = Doc(en_vocab, words=["c", "d"])
|
||||||
assert [t.norm_ for t in pattern1] == ["c", "d"]
|
assert [t.norm_ for t in pattern1] == ["c", "d"]
|
||||||
matcher.add("TEST", None, pattern1)
|
matcher.add("TEST", [pattern1])
|
||||||
doc = Doc(en_vocab, words=["a", "b", "c", "d"])
|
doc = Doc(en_vocab, words=["a", "b", "c", "d"])
|
||||||
assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
|
assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
|
@ -21,6 +21,6 @@ def test_issue4002(en_vocab):
|
||||||
pattern2[0].norm_ = "c"
|
pattern2[0].norm_ = "c"
|
||||||
pattern2[1].norm_ = "d"
|
pattern2[1].norm_ = "d"
|
||||||
assert [t.norm_ for t in pattern2] == ["c", "d"]
|
assert [t.norm_ for t in pattern2] == ["c", "d"]
|
||||||
matcher.add("TEST", None, pattern2)
|
matcher.add("TEST", [pattern2])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 1
|
assert len(matches) == 1
|
||||||
|
|
|
@ -8,7 +8,7 @@ from spacy.tokens import Doc
|
||||||
def test_issue4120(en_vocab):
|
def test_issue4120(en_vocab):
|
||||||
"""Test that matches without a final {OP: ?} token are returned."""
|
"""Test that matches without a final {OP: ?} token are returned."""
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}])
|
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
|
||||||
doc1 = Doc(en_vocab, words=["a"])
|
doc1 = Doc(en_vocab, words=["a"])
|
||||||
assert len(matcher(doc1)) == 1 # works
|
assert len(matcher(doc1)) == 1 # works
|
||||||
|
|
||||||
|
@ -16,11 +16,11 @@ def test_issue4120(en_vocab):
|
||||||
assert len(matcher(doc2)) == 2 # fixed
|
assert len(matcher(doc2)) == 2 # fixed
|
||||||
|
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}])
|
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
|
||||||
doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
|
doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
|
||||||
assert len(matcher(doc3)) == 2 # works
|
assert len(matcher(doc3)) == 2 # works
|
||||||
|
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}])
|
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
|
||||||
doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
|
doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
|
||||||
assert len(matcher(doc4)) == 3 # fixed
|
assert len(matcher(doc4)) == 3 # fixed
|
||||||
|
|
|
@ -157,16 +157,19 @@ overwritten.
|
||||||
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
|
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
|
||||||
| `*patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
|
| `*patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
|
||||||
|
|
||||||
<Infobox title="Changed in v2.0" variant="warning">
|
<Infobox title="Changed in v2.2.2" variant="warning">
|
||||||
|
|
||||||
As of spaCy 2.0, `Matcher.add_pattern` and `Matcher.add_entity` are deprecated
|
As of spaCy 2.2.2, `Matcher.add` also supports the new API, which will become
|
||||||
and have been replaced with a simpler [`Matcher.add`](/api/matcher#add) that
|
the default in the future. The patterns are now the second argument, passed as a list
|
||||||
lets you add a list of patterns and a callback for a given match ID.
|
(instead of a variable number of arguments). The `on_match` callback becomes an
|
||||||
|
optional keyword argument.
|
||||||
|
|
||||||
```diff
|
```diff
|
||||||
- matcher.add_entity("GoogleNow", on_match=merge_phrases)
|
patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
|
||||||
- matcher.add_pattern("GoogleNow", [{ORTH: "Google"}, {ORTH: "Now"}])
|
- matcher.add("GoogleNow", None, *patterns)
|
||||||
+ matcher.add('GoogleNow', merge_phrases, [{"ORTH": "Google"}, {"ORTH": "Now"}])
|
+ matcher.add("GoogleNow", patterns)
|
||||||
|
- matcher.add("GoogleNow", on_match, *patterns)
|
||||||
|
+ matcher.add("GoogleNow", patterns, on_match=on_match)
|
||||||
```
|
```
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
|
@ -153,6 +153,23 @@ overwritten.
|
||||||
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
|
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
|
||||||
| `*docs` | `Doc` | `Doc` objects of the phrases to match. |
|
| `*docs` | `Doc` | `Doc` objects of the phrases to match. |
|
||||||
|
|
||||||
|
<Infobox title="Changed in v2.2.2" variant="warning">
|
||||||
|
|
||||||
|
As of spaCy 2.2.2, `PhraseMatcher.add` also supports the new API, which will
|
||||||
|
become the default in the future. The `Doc` patterns are now the second argument,
|
||||||
|
passed as a list (instead of a variable number of arguments). The `on_match` callback
|
||||||
|
becomes an optional keyword argument.
|
||||||
|
|
||||||
|
```diff
|
||||||
|
patterns = [nlp("health care reform"), nlp("healthcare reform")]
|
||||||
|
- matcher.add("HEALTH", None, *patterns)
|
||||||
|
+ matcher.add("HEALTH", patterns)
|
||||||
|
- matcher.add("HEALTH", on_match, *patterns)
|
||||||
|
+ matcher.add("HEALTH", patterns, on_match=on_match)
|
||||||
|
```
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
## PhraseMatcher.remove {#remove tag="method" new="2.2"}
|
## PhraseMatcher.remove {#remove tag="method" new="2.2"}
|
||||||
|
|
||||||
Remove a rule from the matcher by match ID. A `KeyError` is raised if the key
|
Remove a rule from the matcher by match ID. A `KeyError` is raised if the key
|
||||||
|
|
Loading…
Reference in New Issue
Block a user