From de272f8b82b4e2aa63d7f10830d2cd6c27a47517 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Wed, 21 Aug 2019 22:46:56 +0200
Subject: [PATCH] adding double match for optional operator at the end (#4166)

---
 spacy/matcher/matcher.pxd                |  1 +
 spacy/matcher/matcher.pyx                | 17 +++++++++++++++--
 spacy/tests/regression/test_issue4120.py |  6 +++---
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd
index a146cd107..dd04153bf 100644
--- a/spacy/matcher/matcher.pxd
+++ b/spacy/matcher/matcher.pxd
@@ -17,6 +17,7 @@ cdef enum action_t:
     RETRY_ADVANCE = 0110
     MATCH_EXTEND = 1001
     MATCH_REJECT = 2000
+    MATCH_DOUBLE = 3000
 
 
 cdef enum quantifier_t:
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index f22b23260..260e72e40 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -332,6 +332,16 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
                 matches.push_back(
                     MatchC(pattern_id=ent_id, start=state.start,
                            length=state.length+1))
+            elif action == MATCH_DOUBLE:
+                # push match without last token if length > 0
+                if state.length > 0:
+                    matches.push_back(
+                        MatchC(pattern_id=ent_id, start=state.start,
+                               length=state.length))
+                # push match with last token
+                matches.push_back(
+                    MatchC(pattern_id=ent_id, start=state.start,
+                           length=state.length+1))
             elif action == MATCH_REJECT:
                 matches.push_back(
                     MatchC(pattern_id=ent_id, start=state.start,
@@ -439,6 +449,7 @@ cdef action_t get_action(PatternStateC state,
     RETRY_ADVANCE = 0110
     RETRY_EXTEND = 0011
     MATCH_REJECT = 2000 # Match, but don't include last token
+    MATCH_DOUBLE = 3000 # Match both with and without last token
 
     Problem: If a quantifier is matching, we're adding a lot of open partials
     """
@@ -476,8 +487,10 @@ cdef action_t get_action(PatternStateC state,
             return RETRY
     elif quantifier == ZERO_ONE:
         if is_match and is_final:
-            # Yes, final: 1000
-            return MATCH
+            # Yes, final: 3000
+            # To cater for a pattern ending in "?", we need to add
+            # a match both with and without the last token
+            return MATCH_DOUBLE
         elif is_match and not is_final:
             # Yes, non-final: 0110
             # We need both branches here, consider a pair like:
diff --git a/spacy/tests/regression/test_issue4120.py b/spacy/tests/regression/test_issue4120.py
index 652825064..4806d1607 100644
--- a/spacy/tests/regression/test_issue4120.py
+++ b/spacy/tests/regression/test_issue4120.py
@@ -6,15 +6,15 @@ from spacy.matcher import Matcher
 from spacy.tokens import Doc
 
 
-@pytest.mark.xfail
 def test_issue4120(en_vocab):
     """Test that matches without a final {OP: ?} token are returned."""
     matcher = Matcher(en_vocab)
     matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}])
     doc1 = Doc(en_vocab, words=["a"])
     assert len(matcher(doc1)) == 1 # works
+
     doc2 = Doc(en_vocab, words=["a", "b", "c"])
-    assert len(matcher(doc2)) == 2 # doesn't work
+    assert len(matcher(doc2)) == 2 # fixed
 
     matcher = Matcher(en_vocab)
     matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}])
@@ -24,4 +24,4 @@ def test_issue4120(en_vocab):
     matcher = Matcher(en_vocab)
     matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}])
     doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
-    assert len(matcher(doc4)) == 3 # doesn't work
+    assert len(matcher(doc4)) == 3 # fixed