From 2534cd57d7aa99f98cd4ea0f1aeab5404d0d493d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 9 Oct 2017 08:59:35 +0200 Subject: [PATCH] Add bandaid solution to the 'shadowing' problem in #864 --- spacy/matcher.pyx | 10 +++++++++- spacy/tests/test_matcher.py | 12 +++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 8893b2fed..41d7029f0 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -71,6 +71,11 @@ cdef enum action_t: ADVANCE_ZERO PANIC +# A "match expression" consists of one or more token patterns. +# Each token pattern consists of a quantifier and 0+ (attr, value) pairs. +# A state is an (int, pattern pointer) pair, where the int is the start +# position, and the pattern pointer shows where we're up to +# in the pattern. cdef struct AttrValueC: attr_id_t attr @@ -130,7 +135,10 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil: elif pattern.quantifier in (ONE, ZERO_ONE): return ACCEPT if (pattern+1).nr_attr == 0 else ADVANCE elif pattern.quantifier == ZERO_PLUS: - return REPEAT + # This is a bandaid over the 'shadowing' problem described here: + # https://github.com/explosion/spaCy/issues/864 + next_action = get_action(pattern+1, token) + return REPEAT if next_action is REJECT else next_action else: return PANIC diff --git a/spacy/tests/test_matcher.py b/spacy/tests/test_matcher.py index b36c67d8c..ce6f2d91e 100644 --- a/spacy/tests/test_matcher.py +++ b/spacy/tests/test_matcher.py @@ -111,7 +111,17 @@ def test_matcher_empty_dict(en_vocab): matches = matcher(doc) assert matches[0][1:] == (0, 2) - +def test_matcher_operator_shadow(en_vocab): + matcher = Matcher(en_vocab) + abc = ["a", "b", "c"] + doc = get_doc(matcher.vocab, abc) + matcher.add('A.C', None, [{'ORTH': 'a'}, + {"IS_ALPHA": True, "OP": "+"}, + {'ORTH': 'c'}]) + matches = matcher(doc) + assert len(matches) == 1 + assert matches[0][1:] == (0, 3) + def test_matcher_phrase_matcher(en_vocab): 
words = ["Google", "Now"] doc = get_doc(en_vocab, words)