mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 00:46:28 +03:00
Skip 0-length matches (#6759)
Add hack to prevent matcher from returning 0-length matches.
This commit is contained in:
parent
28256522c8
commit
bc7d83d4be
|
@@ -313,7 +313,8 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
|
|||
# We need to deduplicate, because we could otherwise arrive at the same
|
||||
# match through two paths, e.g. .?.? matching 'a'. Are we matching the
|
||||
# first .?, or the second .? -- it doesn't matter, it's just one match.
|
||||
if match not in seen:
|
||||
# Skip 0-length matches. (TODO: fix algorithm)
|
||||
if match not in seen and matches[i].length > 0:
|
||||
output.append(match)
|
||||
seen.add(match)
|
||||
return output
|
||||
|
|
|
@@ -493,3 +493,13 @@ def test_matcher_remove_zero_operator(en_vocab):
|
|||
assert "Rule" in matcher
|
||||
matcher.remove("Rule")
|
||||
assert "Rule" not in matcher
|
||||
|
||||
|
||||
def test_matcher_no_zero_length(en_vocab):
|
||||
doc = Doc(en_vocab, words=["a", "b"])
|
||||
doc[0].tag_ = "A"
|
||||
doc[1].tag_ = "B"
|
||||
doc.is_tagged = True
|
||||
matcher = Matcher(en_vocab)
|
||||
matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])
|
||||
assert len(matcher(doc)) == 0
|
||||
|
|
Loading…
Reference in New Issue
Block a user