diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 644f7704b..a367dcc3a 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -313,7 +313,8 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
         # We need to deduplicate, because we could otherwise arrive at the same
         # match through two paths, e.g. .?.? matching 'a'. Are we matching the
         # first .?, or the second .? -- it doesn't matter, it's just one match.
-        if match not in seen:
+        # Skip 0-length matches, e.g. an optional token that matched nothing. (TODO: fix algorithm)
+        if match not in seen and matches[i].length > 0:
             output.append(match)
             seen.add(match)
     return output
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index 236f25130..75ee255d4 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -493,3 +493,13 @@ def test_matcher_remove_zero_operator(en_vocab):
     assert "Rule" in matcher
     matcher.remove("Rule")
     assert "Rule" not in matcher
+
+
+def test_matcher_no_zero_length(en_vocab):
+    doc = Doc(en_vocab, words=["a", "b"])
+    doc[0].tag_ = "A"
+    doc[1].tag_ = "B"
+    doc.is_tagged = True
+    matcher = Matcher(en_vocab)
+    matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])
+    assert len(matcher(doc)) == 0  # no token is tagged "C": the optional token must not yield an empty match