Fix Dependency Matcher Ordering Issue (#9337)

* Fix inconsistency

This makes the failing test pass, so that behavior is consistent whether
patterns are added in one call or two.

The issue is that the hash for a pattern depended on its index within the
patterns passed to the current `add` call, not its index among all patterns
stored under the key, so a second call would reuse the match ids handed out
by the first.
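
A minimal sketch of the collision, using hypothetical names (`stored`,
`add`); the real fix is the `pattern_offset` change in the diff below:

    # Hypothetical sketch of the id collision. 'stored' stands in for the
    # patterns already registered under a key.
    stored = []

    def add(new_patterns):
        # The fix: start numbering at the count of already-stored patterns
        # instead of restarting at 0 on every call.
        offset = len(stored)
        ids = [("check", i) for i, _ in enumerate(new_patterns, start=offset)]
        stored.extend(new_patterns)
        return ids

    print(add(["pattern1"]))  # [('check', 0)]
    print(add(["pattern2"]))  # [('check', 1)]; without the offset this would
                              # again be ('check', 0), colliding with the first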

* Add illustrative test case

* Add failing test for remove case

Patterns are not removed from the internal matcher on calls to `remove`,
which can cause spurious matches (or misses).

* Fix removal issue

Remove patterns from the internal matcher.
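
Roughly what goes wrong, as a sketch with hypothetical names (`Wrapper`,
`_inner`): all node patterns live in one shared internal `Matcher`, so
dropping only the wrapper's own bookkeeping leaves live entries behind:

    # Hypothetical sketch: a wrapper that shares one inner matcher across
    # labels must purge the inner entries too, or removed patterns linger.
    class Wrapper:
        def __init__(self):
            self._inner = {}   # stands in for the shared internal Matcher
            self._keys = {}    # label -> internal keys, like _tokens_to_key

        def add(self, label, node_patterns):
            keys = [(label, i) for i in range(len(node_patterns))]
            self._inner.update(zip(keys, node_patterns))
            self._keys[label] = keys

        def remove(self, label):
            # The buggy version stopped after the pop below, so the inner
            # matcher still held the node patterns and kept matching them.
            for key in self._keys.pop(label):
                self._inner.pop(key)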

* Check that the single add call also gets no matches

spacy/matcher/dependencymatcher.pyx

@@ -177,13 +177,14 @@ cdef class DependencyMatcher:
         # Add 'RIGHT_ATTRS' to self._patterns[key]
         _patterns = [[[pat["RIGHT_ATTRS"]] for pat in pattern] for pattern in patterns]
+        pattern_offset = len(self._patterns[key])
         self._patterns[key].extend(_patterns)
         # Add each node pattern of all the input patterns individually to the
         # matcher. This enables only a single instance of Matcher to be used.
         # Multiple adds are required to track each node pattern.
         tokens_to_key_list = []
-        for i, current_patterns in enumerate(_patterns):
+        for i, current_patterns in enumerate(_patterns, start=pattern_offset):
             # Preallocate list space
             tokens_to_key = [None] * len(current_patterns)
@@ -263,7 +264,9 @@ cdef class DependencyMatcher:
         self._raw_patterns.pop(key)
         self._tree.pop(key)
         self._root.pop(key)
-        self._tokens_to_key.pop(key)
+        for mklist in self._tokens_to_key.pop(key):
+            for mkey in mklist:
+                self._matcher.remove(mkey)
 
     def _get_keys_to_position_maps(self, doc):
         """

spacy/tests/matcher/test_dependency_matcher.py

@@ -368,3 +368,87 @@ def test_dependency_matcher_span_user_data(en_tokenizer):
         assert doc_match[0] == span_match[0]
         for doc_t_i, span_t_i in zip(doc_match[1], span_match[1]):
             assert doc_t_i == span_t_i + offset
+
+
+def test_dependency_matcher_order_issue(en_tokenizer):
+    # issue from #9263
+    doc = en_tokenizer("I like text")
+    doc[2].head = doc[1]
+
+    # this matches on attrs but not rel op
+    pattern1 = [
+        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}},
+        {
+            "LEFT_ID": "root",
+            "RIGHT_ID": "r",
+            "RIGHT_ATTRS": {"ORTH": "text"},
+            "REL_OP": "<",
+        },
+    ]
+
+    # this matches on rel op but not attrs
+    pattern2 = [
+        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}},
+        {
+            "LEFT_ID": "root",
+            "RIGHT_ID": "r",
+            "RIGHT_ATTRS": {"ORTH": "fish"},
+            "REL_OP": ">",
+        },
+    ]
+
+    matcher = DependencyMatcher(en_tokenizer.vocab)
+    # This should behave the same as adding the patterns one at a time below
+    matcher.add("check", [pattern1, pattern2])
+    matches = matcher(doc)
+    assert matches == []
+
+    # use a new matcher
+    matcher = DependencyMatcher(en_tokenizer.vocab)
+    # before the fix, adding one at a time under the same label got a
+    # spurious match
+    matcher.add("check", [pattern1])
+    matcher.add("check", [pattern2])
+    matches = matcher(doc)
+    assert matches == []
+
+
+def test_dependency_matcher_remove(en_tokenizer):
+    # issue from #9263
+    doc = en_tokenizer("The red book")
+    doc[1].head = doc[2]
+
+    # this matches
+    pattern1 = [
+        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "book"}},
+        {
+            "LEFT_ID": "root",
+            "RIGHT_ID": "r",
+            "RIGHT_ATTRS": {"ORTH": "red"},
+            "REL_OP": ">",
+        },
+    ]
+
+    # add and then remove it
+    matcher = DependencyMatcher(en_tokenizer.vocab)
+    matcher.add("check", [pattern1])
+    matcher.remove("check")
+
+    # this matches on rel op but not attrs
+    pattern2 = [
+        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "flag"}},
+        {
+            "LEFT_ID": "root",
+            "RIGHT_ID": "r",
+            "RIGHT_ATTRS": {"ORTH": "blue"},
+            "REL_OP": ">",
+        },
+    ]
+
+    # Add this new pattern under the same label; it should not match
+    matcher.add("check", [pattern2])
+    matches = matcher(doc)
+    assert matches == []