mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Unify matcher get_ent_id and get_pattern_key (#4415)
This is basically stabbing blindly at the ghost match problem, but it at least seems like there was a bug previously here --- so this should hopefully be an improvement, even if it doesn't fix the ghost match problem.
This commit is contained in:
parent
77643de2ca
commit
fa95c030a5
|
@ -138,7 +138,7 @@ cdef class Matcher:
|
||||||
self._callbacks.pop(key)
|
self._callbacks.pop(key)
|
||||||
cdef int i = 0
|
cdef int i = 0
|
||||||
while i < self.patterns.size():
|
while i < self.patterns.size():
|
||||||
pattern_key = get_pattern_key(self.patterns.at(i))
|
pattern_key = get_ent_id(self.patterns.at(i))
|
||||||
if pattern_key == key:
|
if pattern_key == key:
|
||||||
self.patterns.erase(self.patterns.begin()+i)
|
self.patterns.erase(self.patterns.begin()+i)
|
||||||
else:
|
else:
|
||||||
|
@ -293,18 +293,6 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
|
|
||||||
# There have been a few bugs here.
|
|
||||||
# The code was originally designed to always have pattern[1].attrs.value
|
|
||||||
# be the ent_id when we get to the end of a pattern. However, Issue #2671
|
|
||||||
# showed this wasn't the case when we had a reject-and-continue before a
|
|
||||||
# match.
|
|
||||||
# The patch to #2671 was wrong though, which came up in #3839.
|
|
||||||
while pattern.attrs.attr != ID:
|
|
||||||
pattern += 1
|
|
||||||
return pattern.attrs.value
|
|
||||||
|
|
||||||
|
|
||||||
cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
|
cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
|
||||||
char* cached_py_predicates,
|
char* cached_py_predicates,
|
||||||
Token token, const attr_t* extra_attrs, py_predicates) except *:
|
Token token, const attr_t* extra_attrs, py_predicates) except *:
|
||||||
|
@ -583,8 +571,26 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
|
||||||
return pattern
|
return pattern
|
||||||
|
|
||||||
|
|
||||||
cdef attr_t get_pattern_key(const TokenPatternC* pattern) nogil:
|
cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
|
||||||
while pattern.nr_attr != 0 or pattern.nr_extra_attr != 0 or pattern.nr_py != 0:
|
# There have been a few bugs here. We used to have two functions,
|
||||||
|
# get_ent_id and get_pattern_key that tried to do the same thing. These
|
||||||
|
# are now unified to try to solve the "ghost match" problem.
|
||||||
|
# Below is the previous implementation of get_ent_id and the comment on it,
|
||||||
|
# preserved for reference while we figure out whether the heisenbug in the
|
||||||
|
# matcher is resolved.
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
|
||||||
|
# # The code was originally designed to always have pattern[1].attrs.value
|
||||||
|
# # be the ent_id when we get to the end of a pattern. However, Issue #2671
|
||||||
|
# # showed this wasn't the case when we had a reject-and-continue before a
|
||||||
|
# # match.
|
||||||
|
# # The patch to #2671 was wrong though, which came up in #3839.
|
||||||
|
# while pattern.attrs.attr != ID:
|
||||||
|
# pattern += 1
|
||||||
|
# return pattern.attrs.value
|
||||||
|
while pattern.nr_attr != 0 or pattern.nr_extra_attr != 0 or pattern.nr_py != 0 \
|
||||||
|
or pattern.quantifier != ZERO:
|
||||||
pattern += 1
|
pattern += 1
|
||||||
id_attr = pattern[0].attrs[0]
|
id_attr = pattern[0].attrs[0]
|
||||||
if id_attr.attr != ID:
|
if id_attr.attr != ID:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user