mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
adding double match for optional operator at the end (#4166)
This commit is contained in:
parent
01c5980187
commit
de272f8b82
|
@ -17,6 +17,7 @@ cdef enum action_t:
|
|||
RETRY_ADVANCE = 0110
|
||||
MATCH_EXTEND = 1001
|
||||
MATCH_REJECT = 2000
|
||||
MATCH_DOUBLE = 3000
|
||||
|
||||
|
||||
cdef enum quantifier_t:
|
||||
|
|
|
@ -332,6 +332,16 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
|
|||
matches.push_back(
|
||||
MatchC(pattern_id=ent_id, start=state.start,
|
||||
length=state.length+1))
|
||||
elif action == MATCH_DOUBLE:
|
||||
# push match without last token if length > 0
|
||||
if state.length > 0:
|
||||
matches.push_back(
|
||||
MatchC(pattern_id=ent_id, start=state.start,
|
||||
length=state.length))
|
||||
# push match with last token
|
||||
matches.push_back(
|
||||
MatchC(pattern_id=ent_id, start=state.start,
|
||||
length=state.length+1))
|
||||
elif action == MATCH_REJECT:
|
||||
matches.push_back(
|
||||
MatchC(pattern_id=ent_id, start=state.start,
|
||||
|
@ -439,6 +449,7 @@ cdef action_t get_action(PatternStateC state,
|
|||
RETRY_ADVANCE = 0110
|
||||
RETRY_EXTEND = 0011
|
||||
MATCH_REJECT = 2000 # Match, but don't include last token
|
||||
MATCH_DOUBLE = 3000 # Match both with and without last token
|
||||
|
||||
Problem: If a quantifier is matching, we're adding a lot of open partials
|
||||
"""
|
||||
|
@ -476,8 +487,10 @@ cdef action_t get_action(PatternStateC state,
|
|||
return RETRY
|
||||
elif quantifier == ZERO_ONE:
|
||||
if is_match and is_final:
|
||||
# Yes, final: 1000
|
||||
return MATCH
|
||||
# Yes, final: 3000
|
||||
# To cater for a pattern ending in "?", we need to add
|
||||
# a match both with and without the last token
|
||||
return MATCH_DOUBLE
|
||||
elif is_match and not is_final:
|
||||
# Yes, non-final: 0110
|
||||
# We need both branches here, consider a pair like:
|
||||
|
|
|
@ -6,15 +6,15 @@ from spacy.matcher import Matcher
|
|||
from spacy.tokens import Doc
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_issue4120(en_vocab):
|
||||
"""Test that matches without a final {OP: ?} token are returned."""
|
||||
matcher = Matcher(en_vocab)
|
||||
matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}])
|
||||
doc1 = Doc(en_vocab, words=["a"])
|
||||
assert len(matcher(doc1)) == 1 # works
|
||||
|
||||
doc2 = Doc(en_vocab, words=["a", "b", "c"])
|
||||
assert len(matcher(doc2)) == 2 # doesn't work
|
||||
assert len(matcher(doc2)) == 2 # fixed
|
||||
|
||||
matcher = Matcher(en_vocab)
|
||||
matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}])
|
||||
|
@ -24,4 +24,4 @@ def test_issue4120(en_vocab):
|
|||
matcher = Matcher(en_vocab)
|
||||
matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}])
|
||||
doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
|
||||
assert len(matcher(doc4)) == 3 # doesn't work
|
||||
assert len(matcher(doc4)) == 3 # fixed
|
||||
|
|
Loading…
Reference in New Issue
Block a user