mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
Fix PhraseMatcher remove overlapping terms (#10734)
* Add regression test for issue 10643
* Improve overlapping terms testcase
* Fix removing overlapping terms in phrase matcher (#10643)
This commit is contained in:
parent
6f9e2ca81f
commit
cb06309ed8
|
@ -118,6 +118,8 @@ cdef class PhraseMatcher:
|
|||
# if token is not found, break out of the loop
|
||||
current_node = NULL
|
||||
break
|
||||
path_nodes.push_back(current_node)
|
||||
path_keys.push_back(self._terminal_hash)
|
||||
# remove the tokens from trie node if there are no other
|
||||
# keywords with them
|
||||
result = map_get(current_node, self._terminal_hash)
|
||||
|
|
|
@ -122,6 +122,36 @@ def test_issue6839(en_vocab):
|
|||
assert matches
|
||||
|
||||
|
||||
@pytest.mark.issue(10643)
def test_issue10643(en_vocab):
    """Check that overlapping PhraseMatcher terms can be removed one by one.

    Two patterns are registered where the first ("binary") is a prefix of the
    second ("binary data"). Removing the shorter one must leave the longer one
    matchable, and removing both must leave the matcher empty.
    """
    # fmt: off
    tokens = ["Only", "save", "out", "the", "binary", "data", "for", "the", "individual", "components", "."]
    # fmt: on
    doc = Doc(en_vocab, words=tokens)

    # Build the pattern docs up front, keyed by the match id they are added as.
    patterns = [
        ("0", Doc(en_vocab, words=["binary"])),
        ("1", Doc(en_vocab, words=["binary", "data"])),
    ]
    matcher = PhraseMatcher(en_vocab)
    for key, pattern in patterns:
        matcher.add(key, [pattern])

    # Both patterns start at token 4; the shorter span is reported first.
    found = matcher(doc)
    short_match = (en_vocab.strings["0"], 4, 5)
    long_match = (en_vocab.strings["1"], 4, 6)
    assert found == [short_match, long_match]

    # Dropping the shorter term must not take the longer one with it.
    matcher.remove("0")
    assert len(matcher) == 1
    assert matcher(doc) == [long_match]

    # With the last term gone, nothing is left to match.
    matcher.remove("1")
    assert len(matcher) == 0
    assert not matcher(doc)
|
||||
|
||||
|
||||
def test_matcher_phrase_matcher(en_vocab):
|
||||
doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
|
||||
# intermediate phrase
|
||||
|
|
Loading…
Reference in New Issue
Block a user