Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-12 18:26:30 +03:00
Fix Dependency Matcher Ordering Issue (#9337)
* Fix inconsistency

  This makes the failing test pass, so that behavior is consistent whether patterns are added in one call or two. The issue is that the hash for each node pattern depended on the pattern's index within the current add() call rather than within the full list of stored patterns, so a second call would produce identical match ids.

* Add illustrative test case

* Add failing test for remove case

  Patterns are not removed from the internal matcher on calls to remove(), which causes spurious matches (or misses).

* Fix removal issue

  Remove patterns from the internal matcher.

* Check that the single add call also gets no matches
This commit is contained in:
parent 5dbe4e8392
commit 2a7e327310
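
The root cause is easiest to see in isolation. The sketch below is illustrative only, not spaCy's actual internals: the key scheme (check_0, check_1, ...) and the helper lists are hypothetical. It shows how keys derived from a pattern's index within the current add() call repeat across calls, and how offsetting by the number of previously stored patterns keeps them unique.

    # Hypothetical stand-in for the internal key scheme, for illustration only.
    existing = ["pat_a"]            # patterns already stored under the key
    incoming = ["pat_b", "pat_c"]   # patterns from a second add() call

    # Before the fix: enumeration restarts at 0, so "check_0" collides with
    # the key already minted for "pat_a" in the first call.
    old_keys = [f"check_{i}" for i, _ in enumerate(incoming)]

    # After the fix: offset by the number of previously stored patterns.
    offset = len(existing)
    new_keys = [f"check_{i}" for i, _ in enumerate(incoming, start=offset)]

    assert old_keys == ["check_0", "check_1"]  # "check_0" duplicates an existing key
    assert new_keys == ["check_1", "check_2"]  # unique across both calls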
@@ -177,13 +177,14 @@ cdef class DependencyMatcher:
         # Add 'RIGHT_ATTRS' to self._patterns[key]
         _patterns = [[[pat["RIGHT_ATTRS"]] for pat in pattern] for pattern in patterns]
+        pattern_offset = len(self._patterns[key])
         self._patterns[key].extend(_patterns)
 
         # Add each node pattern of all the input patterns individually to the
         # matcher. This enables only a single instance of Matcher to be used.
         # Multiple adds are required to track each node pattern.
         tokens_to_key_list = []
-        for i, current_patterns in enumerate(_patterns):
+        for i, current_patterns in enumerate(_patterns, start=pattern_offset):
 
             # Preallocate list space
             tokens_to_key = [None] * len(current_patterns)
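
For reference, a short sketch of the fixed behavior through the public API. It assumes a blank English pipeline; the key "CHECK" and the pattern are arbitrary examples.

    from spacy.lang.en import English
    from spacy.matcher import DependencyMatcher

    nlp = English()
    matcher = DependencyMatcher(nlp.vocab)

    # A minimal single-node pattern: just an anchor token.
    pattern = [{"RIGHT_ID": "anchor", "RIGHT_ATTRS": {"ORTH": "like"}}]

    # Two separate add() calls under the same key. With pattern_offset, the
    # second call's node patterns get fresh internal match ids instead of
    # reusing the first call's.
    matcher.add("CHECK", [pattern])
    matcher.add("CHECK", [pattern])

    on_match, patterns = matcher.get("CHECK")
    assert len(patterns) == 2  # both patterns are stored under the one key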
@@ -263,7 +264,9 @@ cdef class DependencyMatcher:
         self._raw_patterns.pop(key)
         self._tree.pop(key)
         self._root.pop(key)
-        self._tokens_to_key.pop(key)
+        for mklist in self._tokens_to_key.pop(key):
+            for mkey in mklist:
+                self._matcher.remove(mkey)
 
     def _get_keys_to_position_maps(self, doc):
         """
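
Continuing the sketch above, the removal fix means remove() now also clears each stored node-pattern key out of the internal token Matcher, so a key can be deleted and reused without stale patterns resurfacing.

    # Continues the previous sketch (same matcher and pattern objects).
    matcher.remove("CHECK")
    assert "CHECK" not in matcher  # the key is gone from the public view

    # Before this fix, the node patterns lingered in the internal Matcher
    # and could still produce spurious matches (or misses). Now the key
    # can be reused cleanly.
    matcher.add("CHECK", [pattern])
    on_match, patterns = matcher.get("CHECK")
    assert len(patterns) == 1  # only the newly added pattern remains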
@@ -368,3 +368,87 @@ def test_dependency_matcher_span_user_data(en_tokenizer):
     assert doc_match[0] == span_match[0]
     for doc_t_i, span_t_i in zip(doc_match[1], span_match[1]):
         assert doc_t_i == span_t_i + offset
+
+
+def test_dependency_matcher_order_issue(en_tokenizer):
+    # issue from #9263
+    doc = en_tokenizer("I like text")
+    doc[2].head = doc[1]
+
+    # this matches on attrs but not rel op
+    pattern1 = [
+        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}},
+        {
+            "LEFT_ID": "root",
+            "RIGHT_ID": "r",
+            "RIGHT_ATTRS": {"ORTH": "text"},
+            "REL_OP": "<",
+        },
+    ]
+
+    # this matches on rel op but not attrs
+    pattern2 = [
+        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}},
+        {
+            "LEFT_ID": "root",
+            "RIGHT_ID": "r",
+            "RIGHT_ATTRS": {"ORTH": "fish"},
+            "REL_OP": ">",
+        },
+    ]
+
+    matcher = DependencyMatcher(en_tokenizer.vocab)
+
+    # this should behave the same as the two separate add calls below
+    matcher.add("check", [pattern1, pattern2])
+    matches = matcher(doc)
+
+    assert matches == []
+
+    # use a new matcher
+    matcher = DependencyMatcher(en_tokenizer.vocab)
+    # adding one at a time under the same label should also get no matches
+    matcher.add("check", [pattern1])
+    matcher.add("check", [pattern2])
+    matches = matcher(doc)
+
+    assert matches == []
+
+
+def test_dependency_matcher_remove(en_tokenizer):
+    # issue from #9263
+    doc = en_tokenizer("The red book")
+    doc[1].head = doc[2]
+
+    # this matches
+    pattern1 = [
+        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "book"}},
+        {
+            "LEFT_ID": "root",
+            "RIGHT_ID": "r",
+            "RIGHT_ATTRS": {"ORTH": "red"},
+            "REL_OP": ">",
+        },
+    ]
+
+    # add and then remove it
+    matcher = DependencyMatcher(en_tokenizer.vocab)
+    matcher.add("check", [pattern1])
+    matcher.remove("check")
+
+    # this matches on rel op but not attrs
+    pattern2 = [
+        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "flag"}},
+        {
+            "LEFT_ID": "root",
+            "RIGHT_ID": "r",
+            "RIGHT_ATTRS": {"ORTH": "blue"},
+            "REL_OP": ">",
+        },
+    ]
+
+    # add the new pattern under the same label; it should not match
+    matcher.add("check", [pattern2])
+    matches = matcher(doc)
+
+    assert matches == []