mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Identify final Matcher pattern node by quantifier (#6317)
Modify the internal pattern representation in `Matcher` patterns to identify the final ID state using a unique quantifier rather than a combination of other attributes. It was insufficient to identify the final ID node based on an uninitialized `quantifier` (coincidentally being the same as the `ZERO`) with `nr_attr` as 0. (In addition, it was potentially bug-prone that `nr_attr` was set to 0 even though attrs were allocated.) In the case of `{"OP": "!"}` (a valid, if pointless, pattern), `nr_attr` is 0 and the quantifier is ZERO, so the previous methods for incrementing to the ID node at the end of the pattern weren't able to distinguish the final ID node from the `{"OP": "!"}` pattern.
This commit is contained in:
parent
0e55f806dd
commit
45c9a68828
|
@ -26,6 +26,7 @@ cdef enum quantifier_t:
|
|||
ZERO_PLUS
|
||||
ONE
|
||||
ONE_PLUS
|
||||
FINAL_ID
|
||||
|
||||
|
||||
cdef struct AttrValueC:
|
||||
|
|
|
@ -558,7 +558,7 @@ cdef char get_is_match(PatternStateC state,
|
|||
|
||||
|
||||
cdef char get_is_final(PatternStateC state) nogil:
|
||||
if state.pattern[1].nr_attr == 0 and state.pattern[1].attrs != NULL:
|
||||
if state.pattern[1].quantifier == FINAL_ID:
|
||||
id_attr = state.pattern[1].attrs[0]
|
||||
if id_attr.attr != ID:
|
||||
with gil:
|
||||
|
@ -597,36 +597,20 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
|
|||
pattern[i].nr_py = len(predicates)
|
||||
pattern[i].key = hash64(pattern[i].attrs, pattern[i].nr_attr * sizeof(AttrValueC), 0)
|
||||
i = len(token_specs)
|
||||
# Even though here, nr_attr == 0, we're storing the ID value in attrs[0] (bug-prone, thread carefully!)
|
||||
pattern[i].attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC))
|
||||
# Use quantifier to identify final ID pattern node (rather than previous
|
||||
# uninitialized quantifier == 0/ZERO + nr_attr == 0 + non-zero-length attrs)
|
||||
pattern[i].quantifier = FINAL_ID
|
||||
pattern[i].attrs = <AttrValueC*>mem.alloc(1, sizeof(AttrValueC))
|
||||
pattern[i].attrs[0].attr = ID
|
||||
pattern[i].attrs[0].value = entity_id
|
||||
pattern[i].nr_attr = 0
|
||||
pattern[i].nr_attr = 1
|
||||
pattern[i].nr_extra_attr = 0
|
||||
pattern[i].nr_py = 0
|
||||
return pattern
|
||||
|
||||
|
||||
cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
|
||||
# There have been a few bugs here. We used to have two functions,
|
||||
# get_ent_id and get_pattern_key that tried to do the same thing. These
|
||||
# are now unified to try to solve the "ghost match" problem.
|
||||
# Below is the previous implementation of get_ent_id and the comment on it,
|
||||
# preserved for reference while we figure out whether the heisenbug in the
|
||||
# matcher is resolved.
|
||||
#
|
||||
#
|
||||
# cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
|
||||
# # The code was originally designed to always have pattern[1].attrs.value
|
||||
# # be the ent_id when we get to the end of a pattern. However, Issue #2671
|
||||
# # showed this wasn't the case when we had a reject-and-continue before a
|
||||
# # match.
|
||||
# # The patch to #2671 was wrong though, which came up in #3839.
|
||||
# while pattern.attrs.attr != ID:
|
||||
# pattern += 1
|
||||
# return pattern.attrs.value
|
||||
while pattern.nr_attr != 0 or pattern.nr_extra_attr != 0 or pattern.nr_py != 0 \
|
||||
or pattern.quantifier != ZERO:
|
||||
while pattern.quantifier != FINAL_ID:
|
||||
pattern += 1
|
||||
id_attr = pattern[0].attrs[0]
|
||||
if id_attr.attr != ID:
|
||||
|
|
|
@ -479,3 +479,15 @@ def test_matcher_span(matcher):
|
|||
assert len(matcher(doc)) == 2
|
||||
assert len(matcher(span_js)) == 1
|
||||
assert len(matcher(span_java)) == 1
|
||||
|
||||
|
||||
def test_matcher_remove_zero_operator(en_vocab):
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{"OP": "!"}]
|
||||
matcher.add("Rule", [pattern])
|
||||
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
|
||||
matches = matcher(doc)
|
||||
assert len(matches) == 0
|
||||
assert "Rule" in matcher
|
||||
matcher.remove("Rule")
|
||||
assert "Rule" not in matcher
|
||||
|
|
Loading…
Reference in New Issue
Block a user