Remove cruft in matching loop for partial matches
The matching loop still contained some unnecessary code carried over from FlashText for handling partial token matches, which don't occur with PhraseMatcher.
parent c38c330585
commit a7e9c0fd3e
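For context, the loop in question walks a dict-of-dicts trie of token sequences, with a terminal sentinel marking where a stored phrase ends. Below is a minimal pure-Python sketch of the simplified strategy this commit leaves behind; the names (_TERMINAL, add_phrase, find_matches) and the string-keyed trie are illustrative stand-ins, not spaCy internals, which key the trie on token hashes inside a Cython class:

# A minimal sketch of trie-based phrase matching with a plain dict trie.
_TERMINAL = "_ent_ids_"  # sentinel key marking the end of a stored phrase

def add_phrase(trie, tokens, ent_id):
    # Walk/extend the trie one token at a time, then record the phrase id
    # at the terminal node.
    node = trie
    for token in tokens:
        node = node.setdefault(token, {})
    node.setdefault(_TERMINAL, []).append(ent_id)

def find_matches(trie, doc_tokens):
    # Restart a fresh trie walk at every token position; no partial-match
    # state survives between walks, so nothing ever needs resetting.
    matches = []
    n = len(doc_tokens)
    for start in range(n):
        node = trie.get(doc_tokens[start])
        if node is None:
            continue
        idy = start + 1
        while idy < n:
            if _TERMINAL in node:  # a stored phrase ends just before idy
                matches.extend((eid, start, idy) for eid in node[_TERMINAL])
            nxt = node.get(doc_tokens[idy])
            if nxt is None:
                break
            node = nxt
            idy += 1
        else:  # walked off the end of the document without breaking
            if _TERMINAL in node:
                matches.extend((eid, start, idy) for eid in node[_TERMINAL])
    return matches

trie = {}
add_phrase(trie, ["Google", "Now"], "COMPANY")
print(find_matches(trie, ["I", "like", "Google", "Now", "best"]))
# expected: [('COMPANY', 2, 4)]

Because each walk restarts from every start position, there is no partially consumed match to carry between iterations of the outer loop, which is exactly the bookkeeping (reset_current_dict and friends) that the diff below deletes.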
@@ -184,56 +184,34 @@ cdef class PhraseMatcher:
             return matches
         current_dict = self.keyword_trie_dict
         start = 0
-        reset_current_dict = False
         idx = 0
         doc_array_len = len(doc_array)
         while idx < doc_array_len:
+            start = idx
             token = doc_array[idx]
-            # if end is present in current_dict
-            if self._terminal in current_dict or token in current_dict:
-                if self._terminal in current_dict:
-                    ent_ids = current_dict[self._terminal]
-                    for ent_id in ent_ids:
-                        matches.append((self.vocab.strings[ent_id], start, idx))
-
-                # look for longer sequences from this position
-                if token in current_dict:
-                    current_dict_continued = current_dict[token]
-
-                    idy = idx + 1
-                    while idy < doc_array_len:
-                        inner_token = doc_array[idy]
-                        if self._terminal in current_dict_continued:
-                            ent_ids = current_dict_continued[self._terminal]
-                            for ent_id in ent_ids:
-                                matches.append((self.vocab.strings[ent_id], start, idy))
-                        if inner_token in current_dict_continued:
-                            current_dict_continued = current_dict_continued[inner_token]
-                        else:
-                            break
-                        idy += 1
-                    else:
-                        # end of doc_array reached
-                        if self._terminal in current_dict_continued:
-                            ent_ids = current_dict_continued[self._terminal]
-                            for ent_id in ent_ids:
-                                matches.append((self.vocab.strings[ent_id], start, idy))
-                    current_dict = self.keyword_trie_dict
-                    reset_current_dict = True
-                else:
-                    # we reset current_dict
-                    current_dict = self.keyword_trie_dict
-                    reset_current_dict = True
-            # if we are end of doc_array and have a sequence discovered
-            if idx + 1 >= doc_array_len:
-                if self._terminal in current_dict:
-                    ent_ids = current_dict[self._terminal]
-                    for ent_id in ent_ids:
-                        matches.append((self.vocab.strings[ent_id], start, doc_array_len))
+            # look for sequences from this position
+            if token in current_dict:
+                current_dict_continued = current_dict[token]
+                idy = idx + 1
+                while idy < doc_array_len:
+                    if self._terminal in current_dict_continued:
+                        ent_ids = current_dict_continued[self._terminal]
+                        for ent_id in ent_ids:
+                            matches.append((self.vocab.strings[ent_id], start, idy))
+                    inner_token = doc_array[idy]
+                    if inner_token in current_dict_continued:
+                        current_dict_continued = current_dict_continued[inner_token]
+                        idy += 1
+                    else:
+                        break
+                else:
+                    # end of doc_array reached
+                    if self._terminal in current_dict_continued:
+                        ent_ids = current_dict_continued[self._terminal]
+                        for ent_id in ent_ids:
+                            matches.append((self.vocab.strings[ent_id], start, idy))
+            current_dict = self.keyword_trie_dict
             idx += 1
-            if reset_current_dict:
-                reset_current_dict = False
-                start = idx
         for i, (ent_id, start, end) in enumerate(matches):
             on_match = self._callbacks.get(ent_id)
             if on_match is not None:
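The simplified loop leans on Python's while/else, which is easy to misread: the else block runs only when the loop condition becomes false without a break firing, which is how the new code tells "the phrase walk ran off the end of the doc" apart from "the walk stopped at a non-matching token". A generic illustration (nothing here is spaCy-specific):

def scan(tokens, stop_at):
    i = 0
    while i < len(tokens):
        if tokens[i] == stop_at:
            break
        i += 1
    else:
        return "exhausted"  # loop condition went false; no break fired
    return "broke at %d" % i

print(scan(["a", "b", "c"], "b"))  # broke at 1
print(scan(["a", "b", "c"], "z"))  # exhausted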
@@ -8,10 +8,31 @@ from ..util import get_doc
 
 
 def test_matcher_phrase_matcher(en_vocab):
-    doc = Doc(en_vocab, words=["Google", "Now"])
-    matcher = PhraseMatcher(en_vocab)
-    matcher.add("COMPANY", None, doc)
     doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
+    # intermediate phrase
+    pattern = Doc(en_vocab, words=["Google", "Now"])
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("COMPANY", None, pattern)
+    assert len(matcher(doc)) == 1
+    # initial token
+    pattern = Doc(en_vocab, words=["I"])
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("I", None, pattern)
+    assert len(matcher(doc)) == 1
+    # initial phrase
+    pattern = Doc(en_vocab, words=["I", "like"])
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("ILIKE", None, pattern)
+    assert len(matcher(doc)) == 1
+    # final token
+    pattern = Doc(en_vocab, words=["best"])
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("BEST", None, pattern)
+    assert len(matcher(doc)) == 1
+    # final phrase
+    pattern = Doc(en_vocab, words=["Now", "best"])
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("NOWBEST", None, pattern)
     assert len(matcher(doc)) == 1
 
 
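As a usage sketch, the behavior these tests pin down looks like this against a released spaCy v2.x install; the add(key, on_match, *docs) signature and hash-keyed match tuples are the v2 API, and the printed output is what I'd expect rather than something verified against this exact branch:

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")  # tokenizer-only pipeline is enough here
matcher = PhraseMatcher(nlp.vocab)
# v2-style signature: add(key, on_match_callback, *pattern_docs)
matcher.add("COMPANY", None, nlp("Google Now"))
doc = nlp("I like Google Now best")
for match_id, start, end in matcher(doc):
    # in released spaCy v2 the match_id is a hash; resolve it via the
    # StringStore to get the key back
    print(nlp.vocab.strings[match_id], doc[start:end])
# expected: COMPANY Google Now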