From fa95c030a511337935d1a2e930cb954c7a4cd376 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 9 Oct 2019 15:26:31 +0200 Subject: [PATCH] Unify matcher get_ent_id and get_pattern_key (#4415) This is basically stabbing blindly at the ghost match problem, but it at least seems like there was a bug previously here --- so this should hopefully be an improvement, even if it doesn't fix the ghost match problem. --- spacy/matcher/matcher.pyx | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 950a7b977..eb1eecda0 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -138,7 +138,7 @@ cdef class Matcher: self._callbacks.pop(key) cdef int i = 0 while i < self.patterns.size(): - pattern_key = get_pattern_key(self.patterns.at(i)) + pattern_key = get_ent_id(self.patterns.at(i)) if pattern_key == key: self.patterns.erase(self.patterns.begin()+i) else: @@ -293,18 +293,6 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None, return output -cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil: - # There have been a few bugs here. - # The code was originally designed to always have pattern[1].attrs.value - # be the ent_id when we get to the end of a pattern. However, Issue #2671 - # showed this wasn't the case when we had a reject-and-continue before a - # match. - # The patch to #2671 was wrong though, which came up in #3839. - while pattern.attrs.attr != ID: - pattern += 1 - return pattern.attrs.value - - cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches, char* cached_py_predicates, Token token, const attr_t* extra_attrs, py_predicates) except *: @@ -583,8 +571,26 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs) return pattern -cdef attr_t get_pattern_key(const TokenPatternC* pattern) nogil: - while pattern.nr_attr != 0 or pattern.nr_extra_attr != 0 or pattern.nr_py != 0: +cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil: + # There have been a few bugs here. We used to have two functions, + # get_ent_id and get_pattern_key that tried to do the same thing. These + # are now unified to try to solve the "ghost match" problem. + # Below is the previous implementation of get_ent_id and the comment on it, + # preserved for reference while we figure out whether the heisenbug in the + # matcher is resolved. + # + # + # cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil: + # # The code was originally designed to always have pattern[1].attrs.value + # # be the ent_id when we get to the end of a pattern. However, Issue #2671 + # # showed this wasn't the case when we had a reject-and-continue before a + # # match. + # # The patch to #2671 was wrong though, which came up in #3839. + # while pattern.attrs.attr != ID: + # pattern += 1 + # return pattern.attrs.value + while pattern.nr_attr != 0 or pattern.nr_extra_attr != 0 or pattern.nr_py != 0 \ + or pattern.quantifier != ZERO: pattern += 1 id_attr = pattern[0].attrs[0] if id_attr.attr != ID: