mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
adding double match for optional operator at the end (#4166)
This commit is contained in:
parent
01c5980187
commit
de272f8b82
|
@ -17,6 +17,7 @@ cdef enum action_t:
|
||||||
RETRY_ADVANCE = 0110
|
RETRY_ADVANCE = 0110
|
||||||
MATCH_EXTEND = 1001
|
MATCH_EXTEND = 1001
|
||||||
MATCH_REJECT = 2000
|
MATCH_REJECT = 2000
|
||||||
|
MATCH_DOUBLE = 3000
|
||||||
|
|
||||||
|
|
||||||
cdef enum quantifier_t:
|
cdef enum quantifier_t:
|
||||||
|
|
|
@ -332,6 +332,16 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
|
||||||
matches.push_back(
|
matches.push_back(
|
||||||
MatchC(pattern_id=ent_id, start=state.start,
|
MatchC(pattern_id=ent_id, start=state.start,
|
||||||
length=state.length+1))
|
length=state.length+1))
|
||||||
|
elif action == MATCH_DOUBLE:
|
||||||
|
# push match without last token if length > 0
|
||||||
|
if state.length > 0:
|
||||||
|
matches.push_back(
|
||||||
|
MatchC(pattern_id=ent_id, start=state.start,
|
||||||
|
length=state.length))
|
||||||
|
# push match with last token
|
||||||
|
matches.push_back(
|
||||||
|
MatchC(pattern_id=ent_id, start=state.start,
|
||||||
|
length=state.length+1))
|
||||||
elif action == MATCH_REJECT:
|
elif action == MATCH_REJECT:
|
||||||
matches.push_back(
|
matches.push_back(
|
||||||
MatchC(pattern_id=ent_id, start=state.start,
|
MatchC(pattern_id=ent_id, start=state.start,
|
||||||
|
@ -439,6 +449,7 @@ cdef action_t get_action(PatternStateC state,
|
||||||
RETRY_ADVANCE = 0110
|
RETRY_ADVANCE = 0110
|
||||||
RETRY_EXTEND = 0011
|
RETRY_EXTEND = 0011
|
||||||
MATCH_REJECT = 2000 # Match, but don't include last token
|
MATCH_REJECT = 2000 # Match, but don't include last token
|
||||||
|
MATCH_DOUBLE = 3000 # Match both with and without last token
|
||||||
|
|
||||||
Problem: If a quantifier is matching, we're adding a lot of open partials
|
Problem: If a quantifier is matching, we're adding a lot of open partials
|
||||||
"""
|
"""
|
||||||
|
@ -476,8 +487,10 @@ cdef action_t get_action(PatternStateC state,
|
||||||
return RETRY
|
return RETRY
|
||||||
elif quantifier == ZERO_ONE:
|
elif quantifier == ZERO_ONE:
|
||||||
if is_match and is_final:
|
if is_match and is_final:
|
||||||
# Yes, final: 1000
|
# Yes, final: 3000
|
||||||
return MATCH
|
# To cater for a pattern ending in "?", we need to add
|
||||||
|
# a match both with and without the last token
|
||||||
|
return MATCH_DOUBLE
|
||||||
elif is_match and not is_final:
|
elif is_match and not is_final:
|
||||||
# Yes, non-final: 0110
|
# Yes, non-final: 0110
|
||||||
# We need both branches here, consider a pair like:
|
# We need both branches here, consider a pair like:
|
||||||
|
|
|
@ -6,15 +6,15 @@ from spacy.matcher import Matcher
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_issue4120(en_vocab):
|
def test_issue4120(en_vocab):
|
||||||
"""Test that matches without a final {OP: ?} token are returned."""
|
"""Test that matches without a final {OP: ?} token are returned."""
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}])
|
matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}])
|
||||||
doc1 = Doc(en_vocab, words=["a"])
|
doc1 = Doc(en_vocab, words=["a"])
|
||||||
assert len(matcher(doc1)) == 1 # works
|
assert len(matcher(doc1)) == 1 # works
|
||||||
|
|
||||||
doc2 = Doc(en_vocab, words=["a", "b", "c"])
|
doc2 = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
assert len(matcher(doc2)) == 2 # doesn't work
|
assert len(matcher(doc2)) == 2 # fixed
|
||||||
|
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}])
|
matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}])
|
||||||
|
@ -24,4 +24,4 @@ def test_issue4120(en_vocab):
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}])
|
matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}])
|
||||||
doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
|
doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
|
||||||
assert len(matcher(doc4)) == 3 # doesn't work
|
assert len(matcher(doc4)) == 3 # fixed
|
||||||
|
|
Loading…
Reference in New Issue
Block a user