Add a double match (with and without the last token) when a pattern ends in the optional operator "?" (#4166)

This commit is contained in:
Sofie Van Landeghem 2019-08-21 22:46:56 +02:00 committed by Matthew Honnibal
parent 01c5980187
commit de272f8b82
3 changed files with 19 additions and 5 deletions

View File

@ -17,6 +17,7 @@ cdef enum action_t:
RETRY_ADVANCE = 0110
MATCH_EXTEND = 1001
MATCH_REJECT = 2000
MATCH_DOUBLE = 3000
cdef enum quantifier_t:

View File

@ -332,6 +332,16 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
matches.push_back(
MatchC(pattern_id=ent_id, start=state.start,
length=state.length+1))
elif action == MATCH_DOUBLE:
# push match without last token if length > 0
if state.length > 0:
matches.push_back(
MatchC(pattern_id=ent_id, start=state.start,
length=state.length))
# push match with last token
matches.push_back(
MatchC(pattern_id=ent_id, start=state.start,
length=state.length+1))
elif action == MATCH_REJECT:
matches.push_back(
MatchC(pattern_id=ent_id, start=state.start,
@ -439,6 +449,7 @@ cdef action_t get_action(PatternStateC state,
RETRY_ADVANCE = 0110
RETRY_EXTEND = 0011
MATCH_REJECT = 2000 # Match, but don't include last token
MATCH_DOUBLE = 3000 # Match both with and without last token
Problem: If a quantifier is matching, we're adding a lot of open partials
"""
@ -476,8 +487,10 @@ cdef action_t get_action(PatternStateC state,
return RETRY
elif quantifier == ZERO_ONE:
if is_match and is_final:
# Yes, final: 1000
return MATCH
# Yes, final: 3000
# To cater for a pattern ending in "?", we need to add
# a match both with and without the last token
return MATCH_DOUBLE
elif is_match and not is_final:
# Yes, non-final: 0110
# We need both branches here, consider a pair like:

View File

@ -6,15 +6,15 @@ from spacy.matcher import Matcher
from spacy.tokens import Doc
@pytest.mark.xfail
def test_issue4120(en_vocab):
"""Test that matches without a final {OP: ?} token are returned."""
matcher = Matcher(en_vocab)
matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}])
doc1 = Doc(en_vocab, words=["a"])
assert len(matcher(doc1)) == 1 # works
doc2 = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc2)) == 2 # doesn't work
assert len(matcher(doc2)) == 2 # fixed
matcher = Matcher(en_vocab)
matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}])
@ -24,4 +24,4 @@ def test_issue4120(en_vocab):
matcher = Matcher(en_vocab)
matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}])
doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
assert len(matcher(doc4)) == 3 # doesn't work
assert len(matcher(doc4)) == 3 # fixed