Identify final Matcher pattern node by quantifier (#6317)

Modify the internal pattern representation in `Matcher` patterns to
identify the final ID state using a unique quantifier rather than a
combination of other attributes.

It was insufficient to identify the final ID node based on an
uninitialized `quantifier` (coincidentally being the same as the `ZERO`)
with `nr_attr` as 0. (In addition, it was potentially bug-prone that
`nr_attr` was set to 0 even though attrs were allocated.)

In the case of `{"OP": "!"}` (a valid, if pointless, pattern), `nr_attr`
is 0 and the quantifier is ZERO, so the previous methods for
incrementing to the ID node at the end of the pattern weren't able to
distinguish the final ID node from the `{"OP": "!"}` pattern.
This commit is contained in:
Adriane Boyd 2020-10-31 12:18:48 +01:00 committed by GitHub
parent 0e55f806dd
commit 45c9a68828
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 20 additions and 23 deletions

View File

@ -26,6 +26,7 @@ cdef enum quantifier_t:
ZERO_PLUS ZERO_PLUS
ONE ONE
ONE_PLUS ONE_PLUS
FINAL_ID
cdef struct AttrValueC: cdef struct AttrValueC:

View File

@ -558,7 +558,7 @@ cdef char get_is_match(PatternStateC state,
cdef char get_is_final(PatternStateC state) nogil: cdef char get_is_final(PatternStateC state) nogil:
if state.pattern[1].nr_attr == 0 and state.pattern[1].attrs != NULL: if state.pattern[1].quantifier == FINAL_ID:
id_attr = state.pattern[1].attrs[0] id_attr = state.pattern[1].attrs[0]
if id_attr.attr != ID: if id_attr.attr != ID:
with gil: with gil:
@ -597,36 +597,20 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
pattern[i].nr_py = len(predicates) pattern[i].nr_py = len(predicates)
pattern[i].key = hash64(pattern[i].attrs, pattern[i].nr_attr * sizeof(AttrValueC), 0) pattern[i].key = hash64(pattern[i].attrs, pattern[i].nr_attr * sizeof(AttrValueC), 0)
i = len(token_specs) i = len(token_specs)
# Even though here, nr_attr == 0, we're storing the ID value in attrs[0] (bug-prone, thread carefully!) # Use quantifier to identify final ID pattern node (rather than previous
pattern[i].attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC)) # uninitialized quantifier == 0/ZERO + nr_attr == 0 + non-zero-length attrs)
pattern[i].quantifier = FINAL_ID
pattern[i].attrs = <AttrValueC*>mem.alloc(1, sizeof(AttrValueC))
pattern[i].attrs[0].attr = ID pattern[i].attrs[0].attr = ID
pattern[i].attrs[0].value = entity_id pattern[i].attrs[0].value = entity_id
pattern[i].nr_attr = 0 pattern[i].nr_attr = 1
pattern[i].nr_extra_attr = 0 pattern[i].nr_extra_attr = 0
pattern[i].nr_py = 0 pattern[i].nr_py = 0
return pattern return pattern
cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil: cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
# There have been a few bugs here. We used to have two functions, while pattern.quantifier != FINAL_ID:
# get_ent_id and get_pattern_key that tried to do the same thing. These
# are now unified to try to solve the "ghost match" problem.
# Below is the previous implementation of get_ent_id and the comment on it,
# preserved for reference while we figure out whether the heisenbug in the
# matcher is resolved.
#
#
# cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
# # The code was originally designed to always have pattern[1].attrs.value
# # be the ent_id when we get to the end of a pattern. However, Issue #2671
# # showed this wasn't the case when we had a reject-and-continue before a
# # match.
# # The patch to #2671 was wrong though, which came up in #3839.
# while pattern.attrs.attr != ID:
# pattern += 1
# return pattern.attrs.value
while pattern.nr_attr != 0 or pattern.nr_extra_attr != 0 or pattern.nr_py != 0 \
or pattern.quantifier != ZERO:
pattern += 1 pattern += 1
id_attr = pattern[0].attrs[0] id_attr = pattern[0].attrs[0]
if id_attr.attr != ID: if id_attr.attr != ID:

View File

@ -479,3 +479,15 @@ def test_matcher_span(matcher):
assert len(matcher(doc)) == 2 assert len(matcher(doc)) == 2
assert len(matcher(span_js)) == 1 assert len(matcher(span_js)) == 1
assert len(matcher(span_java)) == 1 assert len(matcher(span_java)) == 1
def test_matcher_remove_zero_operator(en_vocab):
matcher = Matcher(en_vocab)
pattern = [{"OP": "!"}]
matcher.add("Rule", [pattern])
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
matches = matcher(doc)
assert len(matches) == 0
assert "Rule" in matcher
matcher.remove("Rule")
assert "Rule" not in matcher