Add bandaid solution to the 'shadowing' problem in #864

This commit is contained in:
Matthew Honnibal 2017-10-09 08:59:35 +02:00
parent d8a2506023
commit 2534cd57d7
2 changed files with 20 additions and 2 deletions

View File

@ -71,6 +71,11 @@ cdef enum action_t:
ADVANCE_ZERO
PANIC
# A "match expression" consists of one or more token patterns.
# Each token pattern consists of a quantifier and 0+ (attr, value) pairs.
# A state is an (int, pattern pointer) pair, where the int is the start
# position, and the pattern pointer shows where we're up to
# in the pattern.
cdef struct AttrValueC:
attr_id_t attr
@ -130,7 +135,10 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
elif pattern.quantifier in (ONE, ZERO_ONE):
return ACCEPT if (pattern+1).nr_attr == 0 else ADVANCE
elif pattern.quantifier == ZERO_PLUS:
return REPEAT
# This is a bandaid over the 'shadowing' problem described here:
# https://github.com/explosion/spaCy/issues/864
next_action = get_action(pattern+1, token)
return REPEAT if next_action is REJECT else next_action
else:
return PANIC

View File

@ -111,6 +111,16 @@ def test_matcher_empty_dict(en_vocab):
matches = matcher(doc)
assert matches[0][1:] == (0, 2)
def test_matcher_operator_shadow(en_vocab):
    """Regression test for the 'shadowing' problem (issue #864).

    Registers a three-part pattern -- ORTH 'a', an IS_ALPHA token pattern
    carrying the "+" operator, then ORTH 'c' -- and checks that running it
    over the tokens "a", "b", "c" produces exactly one match whose trailing
    elements are (0, 3).
    """
    matcher = Matcher(en_vocab)
    doc = get_doc(matcher.vocab, ["a", "b", "c"])
    pattern = [
        {"ORTH": "a"},
        {"IS_ALPHA": True, "OP": "+"},
        {"ORTH": "c"},
    ]
    matcher.add("A.C", None, pattern)
    found = matcher(doc)
    assert len(found) == 1
    # Everything after the first element of the match tuple must be (0, 3).
    assert found[0][1:] == (0, 3)
def test_matcher_phrase_matcher(en_vocab):
words = ["Google", "Now"]