mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Fix PhraseMatcher remove overlapping terms (#10734)
* Add regression test for issue 10643
* Improve overlapping terms testcase
* Fix removing overlapping terms in phrase matcher (#10643)
This commit is contained in:
parent
6f9e2ca81f
commit
cb06309ed8
|
@ -118,6 +118,8 @@ cdef class PhraseMatcher:
|
||||||
# if token is not found, break out of the loop
|
# if token is not found, break out of the loop
|
||||||
current_node = NULL
|
current_node = NULL
|
||||||
break
|
break
|
||||||
|
path_nodes.push_back(current_node)
|
||||||
|
path_keys.push_back(self._terminal_hash)
|
||||||
# remove the tokens from trie node if there are no other
|
# remove the tokens from trie node if there are no other
|
||||||
# keywords with them
|
# keywords with them
|
||||||
result = map_get(current_node, self._terminal_hash)
|
result = map_get(current_node, self._terminal_hash)
|
||||||
|
|
|
@ -122,6 +122,36 @@ def test_issue6839(en_vocab):
|
||||||
assert matches
|
assert matches
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.issue(10643)
def test_issue10643(en_vocab):
    """Check that overlapping phrases can be removed from a PhraseMatcher.

    Two phrases sharing their first token ("binary" and "binary data") are
    registered under the keys "0" and "1". They are then removed one at a
    time, and the matcher's size and output are checked after each step.
    """
    # fmt: off
    words = ["Only", "save", "out", "the", "binary", "data", "for", "the", "individual", "components", "."]
    # fmt: on
    doc = Doc(en_vocab, words=words)
    short_phrase = Doc(en_vocab, words=["binary"])
    long_phrase = Doc(en_vocab, words=["binary", "data"])

    matcher = PhraseMatcher(en_vocab)
    matcher.add("0", [short_phrase])
    matcher.add("1", [long_phrase])

    short_key = en_vocab.strings["0"]
    long_key = en_vocab.strings["1"]

    # With both phrases registered, both spans starting at token 4 are found.
    assert matcher(doc) == [(short_key, 4, 5), (long_key, 4, 6)]

    # Removing the shorter phrase must leave the longer one matchable.
    matcher.remove("0")
    assert len(matcher) == 1
    assert matcher(doc) == [(long_key, 4, 6)]

    # Removing the remaining phrase empties the matcher.
    matcher.remove("1")
    assert len(matcher) == 0
    assert not matcher(doc)
|
||||||
|
|
||||||
|
|
||||||
def test_matcher_phrase_matcher(en_vocab):
|
def test_matcher_phrase_matcher(en_vocab):
|
||||||
doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
|
doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
|
||||||
# intermediate phrase
|
# intermediate phrase
|
||||||
|
|
Loading…
Reference in New Issue
Block a user