From bc7d83d4be0742c01425529baa8aa356b7bc0c50 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Tue, 19 Jan 2021 00:38:11 +0100
Subject: [PATCH] Skip 0-length matches (#6759)

Add hack to prevent matcher from returning 0-length matches.
---
 spacy/matcher/matcher.pyx               |  3 ++-
 spacy/tests/matcher/test_matcher_api.py | 10 ++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 644f7704b..a367dcc3a 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -313,7 +313,8 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
         # We need to deduplicate, because we could otherwise arrive at the same
         # match through two paths, e.g. .?.? matching 'a'. Are we matching the
         # first .?, or the second .? -- it doesn't matter, it's just one match.
-        if match not in seen:
+        # Skip 0-length matches. (TODO: fix algorithm)
+        if match not in seen and matches[i].length > 0:
             output.append(match)
             seen.add(match)
     return output
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index 236f25130..75ee255d4 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -493,3 +493,13 @@ def test_matcher_remove_zero_operator(en_vocab):
     assert "Rule" in matcher
     matcher.remove("Rule")
     assert "Rule" not in matcher
+
+
+def test_matcher_no_zero_length(en_vocab):
+    doc = Doc(en_vocab, words=["a", "b"])
+    doc[0].tag_ = "A"
+    doc[1].tag_ = "B"
+    doc.is_tagged = True
+    matcher = Matcher(en_vocab)
+    matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])
+    assert len(matcher(doc)) == 0