diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index fa67f32d6..a0c69f4bf 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -69,6 +69,7 @@ cdef enum action_t:
     REPEAT
     ACCEPT
     ADVANCE_ZERO
+    ACCEPT_PREV
     PANIC
 
 # A "match expression" conists of one or more token patterns
@@ -120,24 +121,27 @@ cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0:
 
 
 cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
+    lookahead = &pattern[1]
     for attr in pattern.attrs[:pattern.nr_attr]:
         if get_token_attr(token, attr.attr) != attr.value:
             if pattern.quantifier == ONE:
                 return REJECT
             elif pattern.quantifier == ZERO:
-                return ACCEPT if (pattern+1).nr_attr == 0 else ADVANCE
+                return ACCEPT if lookahead.nr_attr == 0 else ADVANCE
             elif pattern.quantifier in (ZERO_ONE, ZERO_PLUS):
-                return ACCEPT if (pattern+1).nr_attr == 0 else ADVANCE_ZERO
+                return ACCEPT_PREV if lookahead.nr_attr == 0 else ADVANCE_ZERO
             else:
                 return PANIC
     if pattern.quantifier == ZERO:
         return REJECT
+    elif lookahead.nr_attr == 0:
+        return ACCEPT
     elif pattern.quantifier in (ONE, ZERO_ONE):
-        return ACCEPT if (pattern+1).nr_attr == 0 else ADVANCE
+        return ADVANCE
     elif pattern.quantifier == ZERO_PLUS:
         # This is a bandaid over the 'shadowing' problem described here:
         # https://github.com/explosion/spaCy/issues/864
-        next_action = get_action(pattern+1, token)
+        next_action = get_action(lookahead, token)
         if next_action is REJECT:
             return REPEAT
         else:
@@ -345,6 +349,9 @@ cdef class Matcher:
                 while action == ADVANCE_ZERO:
                     state.second += 1
                     action = get_action(state.second, token)
+                if action == PANIC:
+                    raise Exception("Error selecting action in matcher")
+
                 if action == REPEAT:
                     # Leave the state in the queue, and advance to next slot
                     # (i.e. we don't overwrite -- we want to greedily match more
@@ -356,14 +363,15 @@ cdef class Matcher:
                     partials[q] = state
                     partials[q].second += 1
                     q += 1
-                elif action == ACCEPT:
+                elif action in (ACCEPT, ACCEPT_PREV):
                     # TODO: What to do about patterns starting with ZERO? Need to
                     # adjust the start position.
                     start = state.first
-                    end = token_i+1
+                    end = token_i+1 if action == ACCEPT else token_i
                     ent_id = state.second[1].attrs[0].value
                     label = state.second[1].attrs[1].value
                     matches.append((ent_id, start, end))
+
             partials.resize(q)
             # Check whether we open any new patterns on this token
             for pattern in self.patterns:
@@ -383,9 +391,9 @@ cdef class Matcher:
                     state.first = token_i
                     state.second = pattern + 1
                     partials.push_back(state)
-                elif action == ACCEPT:
+                elif action in (ACCEPT, ACCEPT_PREV):
                     start = token_i
-                    end = token_i+1
+                    end = token_i+1 if action == ACCEPT else token_i
                     ent_id = pattern[1].attrs[0].value
                     label = pattern[1].attrs[1].value
                     matches.append((ent_id, start, end))
diff --git a/spacy/tests/regression/test_issue1450.py b/spacy/tests/regression/test_issue1450.py
new file mode 100644
index 000000000..6f1d4f568
--- /dev/null
+++ b/spacy/tests/regression/test_issue1450.py
@@ -0,0 +1,58 @@
+from __future__ import unicode_literals
+import pytest
+
+from ...matcher import Matcher
+from ...tokens import Doc
+from ...vocab import Vocab
+
+
+@pytest.mark.parametrize(
+    'string,start,end',
+    [
+        ('a', 0, 1),
+        ('a b', 0, 2),
+        ('a c', 0, 1),
+        ('a b c', 0, 2),
+        ('a b b c', 0, 2),
+        ('a b b', 0, 2),
+    ]
+)
+def test_issue1450_matcher_end_zero_plus(string, start, end):
+    '''Test matcher works when patterns end with * operator.
+
+    Original example (rewritten to avoid model usage)
+
+    nlp = spacy.load('en_core_web_sm')
+    matcher = Matcher(nlp.vocab)
+    matcher.add(
+        "TSTEND",
+        on_match_1,
+        [
+            {TAG: "JJ", LOWER: "new"},
+            {TAG: "NN", 'OP': "*"}
+        ]
+    )
+    doc = nlp(u'Could you create a new ticket for me?')
+    print([(w.tag_, w.text, w.lower_) for w in doc])
+    matches = matcher(doc)
+    print(matches)
+    assert len(matches) == 1
+    assert matches[0][1] == 4
+    assert matches[0][2] == 5
+    '''
+    matcher = Matcher(Vocab())
+    matcher.add(
+        "TSTEND",
+        None,
+        [
+            {'ORTH': "a"},
+            {'ORTH': "b", 'OP': "*"}
+        ]
+    )
+    doc = Doc(Vocab(), words=string.split())
+    matches = matcher(doc)
+    if start is None or end is None:
+        assert matches == []
+    else:
+        assert matches[0][1] == start
+        assert matches[0][2] == end
diff --git a/spacy/tests/test_matcher.py b/spacy/tests/test_matcher.py
index 9fcb47305..5b08ede39 100644
--- a/spacy/tests/test_matcher.py
+++ b/spacy/tests/test_matcher.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 
 from ..matcher import Matcher, PhraseMatcher
 from .util import get_doc
+from ..tokens import Doc
 
 import pytest
 
@@ -212,3 +213,23 @@ def test_operator_combos(matcher):
             assert matches, (string, pattern_str)
         else:
             assert not matches, (string, pattern_str)
+
+
+def test_matcher_end_zero_plus(matcher):
+    '''Test matcher works when patterns end with * operator. (issue 1450)'''
+    matcher = Matcher(matcher.vocab)
+    matcher.add(
+        "TSTEND",
+        None,
+        [
+            {'ORTH': "a"},
+            {'ORTH': "b", 'OP': "*"}
+        ]
+    )
+    nlp = lambda string: Doc(matcher.vocab, words=string.split())
+    assert len(matcher(nlp(u'a'))) == 1
+    assert len(matcher(nlp(u'a b'))) == 1
+    assert len(matcher(nlp(u'a c'))) == 1
+    assert len(matcher(nlp(u'a b c'))) == 1
+    assert len(matcher(nlp(u'a b b c'))) == 1
+    assert len(matcher(nlp(u'a b b'))) == 1