Fix PhraseMatcher remove overlapping terms (#10734)

* Add regression test for issue 10643

* Improve overlapping terms testcase

* Fix removing overlapping terms in phrase matcher (#10643)
This commit is contained in:
Patrick Düggelin 2022-05-12 12:23:52 +02:00 committed by GitHub
parent 6f9e2ca81f
commit cb06309ed8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 32 additions and 0 deletions

View File

@ -118,6 +118,8 @@ cdef class PhraseMatcher:
# if token is not found, break out of the loop # if token is not found, break out of the loop
current_node = NULL current_node = NULL
break break
path_nodes.push_back(current_node)
path_keys.push_back(self._terminal_hash)
# remove the tokens from trie node if there are no other # remove the tokens from trie node if there are no other
# keywords with them # keywords with them
result = map_get(current_node, self._terminal_hash) result = map_get(current_node, self._terminal_hash)

View File

@ -122,6 +122,36 @@ def test_issue6839(en_vocab):
assert matches assert matches
@pytest.mark.issue(10643)
def test_issue10643(en_vocab):
    """Overlapping PhraseMatcher terms can be removed one at a time.

    Regression test for issue 10643. Two terms are registered where the
    shorter one ("binary") is a prefix of the longer one ("binary data").
    After each removal the matcher must report the reduced pattern count
    and return only the matches of the patterns that are still registered.
    """
    # fmt: off
    words = ["Only", "save", "out", "the", "binary", "data", "for", "the", "individual", "components", "."]
    # fmt: on
    doc = Doc(en_vocab, words=words)
    short_term = Doc(en_vocab, words=["binary"])
    long_term = Doc(en_vocab, words=["binary", "data"])

    matcher = PhraseMatcher(en_vocab)
    matcher.add("0", [short_term])
    matcher.add("1", [long_term])

    # While both terms are registered, both fire from the same start token.
    both_hits = matcher(doc)
    short_hit = (en_vocab.strings["0"], 4, 5)
    long_hit = (en_vocab.strings["1"], 4, 6)
    assert both_hits == [short_hit, long_hit]

    # Dropping the shorter term must leave the longer one intact.
    matcher.remove("0")
    assert len(matcher) == 1
    assert matcher(doc) == [long_hit]

    # Dropping the remaining term leaves an empty matcher with no matches.
    matcher.remove("1")
    assert len(matcher) == 0
    assert not matcher(doc)
def test_matcher_phrase_matcher(en_vocab): def test_matcher_phrase_matcher(en_vocab):
doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"]) doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
# intermediate phrase # intermediate phrase