Skip 0-length matches (#6759)

Add hack to prevent matcher from returning 0-length matches.
This commit is contained in:
Adriane Boyd 2021-01-19 00:38:11 +01:00 committed by GitHub
parent 28256522c8
commit bc7d83d4be
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 12 additions and 1 deletion

View File

@@ -313,7 +313,8 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
# We need to deduplicate, because we could otherwise arrive at the same
# match through two paths, e.g. .?.? matching 'a'. Are we matching the
# first .?, or the second .? -- it doesn't matter, it's just one match.
if match not in seen:
# Skip 0-length matches. (TODO: fix algorithm)
if match not in seen and matches[i].length > 0:
output.append(match)
seen.add(match)
return output

View File

@@ -493,3 +493,13 @@ def test_matcher_remove_zero_operator(en_vocab):
assert "Rule" in matcher
matcher.remove("Rule")
assert "Rule" not in matcher
def test_matcher_no_zero_length(en_vocab):
    """Check that the matcher reports no matches for an unmatched optional token.

    The doc's two tokens are tagged "A" and "B", while the only pattern token
    asks for tag "C" with the "?" operator. Nothing in the doc carries that
    tag, so the call to the matcher is expected to return an empty result
    rather than a 0-length match.
    """
    words = ["a", "b"]
    tags = ["A", "B"]
    doc = Doc(en_vocab, words=words)
    # Assign the tags by index, in order, then flag the doc as tagged.
    for index, tag in enumerate(tags):
        doc[index].tag_ = tag
    doc.is_tagged = True

    optional_unseen_tag = [{"TAG": "C", "OP": "?"}]
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [optional_unseen_tag])

    found = matcher(doc)
    assert len(found) == 0