Fix PhraseMatcher remove overlapping terms (#10734)

* Add regression test for issue 10643
* Improve overlapping terms testcase
* Fix removing overlapping terms in phrase matcher (#10643)
2025-11-01 16:37:45 +03:00 · 2022-05-12 12:23:52 +02:00 · 2022-05-12 12:23:52 +02:00 · cb06309ed8
commit cb06309ed8
parent 6f9e2ca81f
2 changed files with 32 additions and 0 deletions
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -118,6 +118,8 @@ cdef class PhraseMatcher:
                    # if token is not found, break out of the loop
                    current_node = NULL
                    break
            path_nodes.push_back(current_node)
            path_keys.push_back(self._terminal_hash)
            # remove the tokens from trie node if there are no other
            # keywords with them
            result = map_get(current_node, self._terminal_hash)
--- a/spacy/tests/matcher/test_phrase_matcher.py
+++ b/spacy/tests/matcher/test_phrase_matcher.py
@@ -122,6 +122,36 @@ def test_issue6839(en_vocab):
    assert matches
@pytest.mark.issue(10643)
def test_issue10643(en_vocab):
    """Regression test for issue 10643.

    Registers two phrase patterns where one ("binary") is a prefix of the
    other ("binary data"), then removes them one at a time and checks that
    the matcher's size and its matches on the sample doc are updated after
    each removal.
    """
    # fmt: off
    tokens = ["Only", "save", "out", "the", "binary", "data", "for", "the", "individual", "components", "."]
    # fmt: on
    sample = Doc(en_vocab, words=tokens)

    # Key "0" is the single-token pattern; key "1" extends it by one token.
    patterns = [
        ("0", Doc(en_vocab, words=["binary"])),
        ("1", Doc(en_vocab, words=["binary", "data"])),
    ]
    phrase_matcher = PhraseMatcher(en_vocab)
    for key, pattern in patterns:
        phrase_matcher.add(key, [pattern])

    short_id = en_vocab.strings["0"]
    long_id = en_vocab.strings["1"]

    # With both patterns registered, both spans starting at token 4 match.
    found = phrase_matcher(sample)
    assert found == [(short_id, 4, 5), (long_id, 4, 6)]

    # After removing the shorter pattern, only the longer one is reported.
    phrase_matcher.remove("0")
    assert len(phrase_matcher) == 1
    found_after_first_removal = phrase_matcher(sample)
    assert found_after_first_removal == [(long_id, 4, 6)]

    # After removing the remaining pattern, the matcher is empty.
    phrase_matcher.remove("1")
    assert len(phrase_matcher) == 0
    found_after_second_removal = phrase_matcher(sample)
    assert not found_after_second_removal
 def test_matcher_phrase_matcher(en_vocab):
    doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
    # intermediate phrase