From 45c9a688285081cd69faa0627d9bcaf1f5e799a1 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sat, 31 Oct 2020 12:18:48 +0100 Subject: [PATCH] Identify final Matcher pattern node by quantifier (#6317) Modify the internal pattern representation in `Matcher` patterns to identify the final ID state using a unique quantifier rather than a combination of other attributes. It was insufficient to identify the final ID node based on an uninitialized `quantifier` (coincidentally being the same as the `ZERO`) with `nr_attr` as 0. (In addition, it was potentially bug-prone that `nr_attr` was set to 0 even though attrs were allocated.) In the case of `{"OP": "!"}` (a valid, if pointless, pattern), `nr_attr` is 0 and the quantifier is ZERO, so the previous methods for incrementing to the ID node at the end of the pattern weren't able to distinguish the final ID node from the `{"OP": "!"}` pattern. --- spacy/matcher/matcher.pxd | 1 + spacy/matcher/matcher.pyx | 30 ++++++------------------- spacy/tests/matcher/test_matcher_api.py | 12 ++++++++++ 3 files changed, 20 insertions(+), 23 deletions(-) diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd index dd04153bf..a668dc142 100644 --- a/spacy/matcher/matcher.pxd +++ b/spacy/matcher/matcher.pxd @@ -26,6 +26,7 @@ cdef enum quantifier_t: ZERO_PLUS ONE ONE_PLUS + FINAL_ID cdef struct AttrValueC: diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 8fbfe305a..bc615b07c 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -558,7 +558,7 @@ cdef char get_is_match(PatternStateC state, cdef char get_is_final(PatternStateC state) nogil: - if state.pattern[1].nr_attr == 0 and state.pattern[1].attrs != NULL: + if state.pattern[1].quantifier == FINAL_ID: id_attr = state.pattern[1].attrs[0] if id_attr.attr != ID: with gil: @@ -597,36 +597,20 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs) pattern[i].nr_py = len(predicates) pattern[i].key = hash64(pattern[i].attrs, pattern[i].nr_attr * sizeof(AttrValueC), 0) i = len(token_specs) - # Even though here, nr_attr == 0, we're storing the ID value in attrs[0] (bug-prone, thread carefully!) - pattern[i].attrs = mem.alloc(2, sizeof(AttrValueC)) + # Use quantifier to identify final ID pattern node (rather than previous + # uninitialized quantifier == 0/ZERO + nr_attr == 0 + non-zero-length attrs) + pattern[i].quantifier = FINAL_ID + pattern[i].attrs = mem.alloc(1, sizeof(AttrValueC)) pattern[i].attrs[0].attr = ID pattern[i].attrs[0].value = entity_id - pattern[i].nr_attr = 0 + pattern[i].nr_attr = 1 pattern[i].nr_extra_attr = 0 pattern[i].nr_py = 0 return pattern cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil: - # There have been a few bugs here. We used to have two functions, - # get_ent_id and get_pattern_key that tried to do the same thing. These - # are now unified to try to solve the "ghost match" problem. - # Below is the previous implementation of get_ent_id and the comment on it, - # preserved for reference while we figure out whether the heisenbug in the - # matcher is resolved. - # - # - # cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil: - # # The code was originally designed to always have pattern[1].attrs.value - # # be the ent_id when we get to the end of a pattern. However, Issue #2671 - # # showed this wasn't the case when we had a reject-and-continue before a - # # match. - # # The patch to #2671 was wrong though, which came up in #3839. - # while pattern.attrs.attr != ID: - # pattern += 1 - # return pattern.attrs.value - while pattern.nr_attr != 0 or pattern.nr_extra_attr != 0 or pattern.nr_py != 0 \ - or pattern.quantifier != ZERO: + while pattern.quantifier != FINAL_ID: pattern += 1 id_attr = pattern[0].attrs[0] if id_attr.attr != ID: diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 1112195da..a23eb392d 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -479,3 +479,15 @@ def test_matcher_span(matcher): assert len(matcher(doc)) == 2 assert len(matcher(span_js)) == 1 assert len(matcher(span_java)) == 1 + + +def test_matcher_remove_zero_operator(en_vocab): + matcher = Matcher(en_vocab) + pattern = [{"OP": "!"}] + matcher.add("Rule", [pattern]) + doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) + matches = matcher(doc) + assert len(matches) == 0 + assert "Rule" in matcher + matcher.remove("Rule") + assert "Rule" not in matcher