mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Skip 0-length matches (#6759)
Add hack to prevent matcher from returning 0-length matches.
This commit is contained in:
parent
28256522c8
commit
bc7d83d4be
|
@@ -313,7 +313,8 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
|
||||||
# We need to deduplicate, because we could otherwise arrive at the same
|
# We need to deduplicate, because we could otherwise arrive at the same
|
||||||
# match through two paths, e.g. .?.? matching 'a'. Are we matching the
|
# match through two paths, e.g. .?.? matching 'a'. Are we matching the
|
||||||
# first .?, or the second .? -- it doesn't matter, it's just one match.
|
# first .?, or the second .? -- it doesn't matter, it's just one match.
|
||||||
if match not in seen:
|
# Skip 0-length matches. (TODO: fix algorithm)
|
||||||
|
if match not in seen and matches[i].length > 0:
|
||||||
output.append(match)
|
output.append(match)
|
||||||
seen.add(match)
|
seen.add(match)
|
||||||
return output
|
return output
|
||||||
|
|
|
@@ -493,3 +493,13 @@ def test_matcher_remove_zero_operator(en_vocab):
|
||||||
assert "Rule" in matcher
|
assert "Rule" in matcher
|
||||||
matcher.remove("Rule")
|
matcher.remove("Rule")
|
||||||
assert "Rule" not in matcher
|
assert "Rule" not in matcher
|
||||||
|
|
||||||
|
|
||||||
|
def test_matcher_no_zero_length(en_vocab):
|
||||||
|
doc = Doc(en_vocab, words=["a", "b"])
|
||||||
|
doc[0].tag_ = "A"
|
||||||
|
doc[1].tag_ = "B"
|
||||||
|
doc.is_tagged = True
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])
|
||||||
|
assert len(matcher(doc)) == 0
|
||||||
|
|
Loading…
Reference in New Issue
Block a user