diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index 0cda37012..b667e6b2f 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -177,13 +177,14 @@ cdef class DependencyMatcher:
         # Add 'RIGHT_ATTRS' to self._patterns[key]
         _patterns = [[[pat["RIGHT_ATTRS"]] for pat in pattern] for pattern in patterns]
+        pattern_offset = len(self._patterns[key])
         self._patterns[key].extend(_patterns)

         # Add each node pattern of all the input patterns individually to the
         # matcher. This enables only a single instance of Matcher to be used.
         # Multiple adds are required to track each node pattern.
         tokens_to_key_list = []
-        for i, current_patterns in enumerate(_patterns):
+        for i, current_patterns in enumerate(_patterns, start=pattern_offset):

             # Preallocate list space
             tokens_to_key = [None] * len(current_patterns)
@@ -263,7 +264,9 @@ cdef class DependencyMatcher:
         self._raw_patterns.pop(key)
         self._tree.pop(key)
         self._root.pop(key)
-        self._tokens_to_key.pop(key)
+        for mklist in self._tokens_to_key.pop(key):
+            for mkey in mklist:
+                self._matcher.remove(mkey)

     def _get_keys_to_position_maps(self, doc):
         """
diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py
index 0e1eae588..61ae43c52 100644
--- a/spacy/tests/matcher/test_dependency_matcher.py
+++ b/spacy/tests/matcher/test_dependency_matcher.py
@@ -368,3 +368,87 @@ def test_dependency_matcher_span_user_data(en_tokenizer):
     assert doc_match[0] == span_match[0]
     for doc_t_i, span_t_i in zip(doc_match[1], span_match[1]):
         assert doc_t_i == span_t_i + offset
+
+
+def test_dependency_matcher_order_issue(en_tokenizer):
+    # issue from #9263
+    doc = en_tokenizer("I like text")
+    doc[2].head = doc[1]
+
+    # this matches on attrs but not rel op
+    pattern1 = [
+        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}},
+        {
+            "LEFT_ID": "root",
+            "RIGHT_ID": "r",
+            "RIGHT_ATTRS": {"ORTH": "text"},
+            "REL_OP": "<",
+        },
+    ]
+
+    # this matches on rel op but not attrs
+    pattern2 = [
+        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}},
+        {
+            "LEFT_ID": "root",
+            "RIGHT_ID": "r",
+            "RIGHT_ATTRS": {"ORTH": "fish"},
+            "REL_OP": ">",
+        },
+    ]
+
+    matcher = DependencyMatcher(en_tokenizer.vocab)
+
+    # This should behave the same as the next pattern
+    matcher.add("check", [pattern1, pattern2])
+    matches = matcher(doc)
+
+    assert matches == []
+
+    # use a new matcher
+    matcher = DependencyMatcher(en_tokenizer.vocab)
+    # adding one at a time under same label gets a match
+    matcher.add("check", [pattern1])
+    matcher.add("check", [pattern2])
+    matches = matcher(doc)
+
+    assert matches == []
+
+
+def test_dependency_matcher_remove(en_tokenizer):
+    # issue from #9263
+    doc = en_tokenizer("The red book")
+    doc[1].head = doc[2]
+
+    # this matches
+    pattern1 = [
+        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "book"}},
+        {
+            "LEFT_ID": "root",
+            "RIGHT_ID": "r",
+            "RIGHT_ATTRS": {"ORTH": "red"},
+            "REL_OP": ">",
+        },
+    ]
+
+    # add and then remove it
+    matcher = DependencyMatcher(en_tokenizer.vocab)
+    matcher.add("check", [pattern1])
+    matcher.remove("check")
+
+    # this matches on rel op but not attrs
+    pattern2 = [
+        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "flag"}},
+        {
+            "LEFT_ID": "root",
+            "RIGHT_ID": "r",
+            "RIGHT_ATTRS": {"ORTH": "blue"},
+            "REL_OP": ">",
+        },
+    ]
+
+    # Adding this new pattern with the same label, which should not match
+    matcher.add("check", [pattern2])
+    matches = matcher(doc)
+
+    assert matches == []
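
For reference, below is a minimal, self-contained sketch (not part of the patch) of the behaviour the new tests lock in. It assumes spaCy v3's `Doc(vocab, words=..., heads=..., deps=...)` constructor in place of the `en_tokenizer` fixture; the `"check"` key and the example sentence come from the tests above, and the `deps` values are placeholders.

```python
# Sketch only: mirrors test_dependency_matcher_order_issue, building the Doc
# directly rather than via the test fixture.
import spacy
from spacy.matcher import DependencyMatcher
from spacy.tokens import Doc

vocab = spacy.blank("en").vocab
# "I like text", with "I" and "text" attached to "like" (heads are token indices;
# the "dep" labels are placeholders, only needed so the doc counts as parsed).
doc = Doc(vocab, words=["I", "like", "text"], heads=[1, 1, 1], deps=["dep"] * 3)

# Matches on attrs but not the rel op ("text" depends on "like", not vice versa).
pattern1 = [
    {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}},
    {"LEFT_ID": "root", "RIGHT_ID": "r", "RIGHT_ATTRS": {"ORTH": "text"}, "REL_OP": "<"},
]
# Matches on the rel op but not attrs ("fish" does not occur in the doc).
pattern2 = [
    {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}},
    {"LEFT_ID": "root", "RIGHT_ID": "r", "RIGHT_ATTRS": {"ORTH": "fish"}, "REL_OP": ">"},
]

matcher = DependencyMatcher(vocab)
# With pattern_offset in place, two add() calls under one key behave the same
# as a single add() with both patterns: no cross-pattern false positive.
matcher.add("check", [pattern1])
matcher.add("check", [pattern2])
assert matcher(doc) == []

# remove() now also deletes the per-node patterns from the internal token
# Matcher, so node patterns from a removed key can no longer surface later.
matcher.remove("check")
matcher.add("check", [pattern2])
assert matcher(doc) == []
```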