Remove cruft in matching loop for partial matches

There was a bit of unnecessary code left over from FlashText in the
matching loop to handle partial token matches, which we don't have with
PhraseMatcher.
This commit is contained in:
Adriane Boyd 2019-09-23 09:11:13 +02:00
parent c38c330585
commit a7e9c0fd3e
2 changed files with 45 additions and 46 deletions

View File

@ -184,56 +184,34 @@ cdef class PhraseMatcher:
return matches
current_dict = self.keyword_trie_dict
start = 0
reset_current_dict = False
idx = 0
doc_array_len = len(doc_array)
while idx < doc_array_len:
start = idx
token = doc_array[idx]
# if end is present in current_dict
if self._terminal in current_dict or token in current_dict:
if self._terminal in current_dict:
ent_ids = current_dict[self._terminal]
for ent_id in ent_ids:
matches.append((self.vocab.strings[ent_id], start, idx))
# look for longer sequences from this position
if token in current_dict:
current_dict_continued = current_dict[token]
idy = idx + 1
while idy < doc_array_len:
inner_token = doc_array[idy]
if self._terminal in current_dict_continued:
ent_ids = current_dict_continued[self._terminal]
for ent_id in ent_ids:
matches.append((self.vocab.strings[ent_id], start, idy))
if inner_token in current_dict_continued:
current_dict_continued = current_dict_continued[inner_token]
else:
break
# look for sequences from this position
if token in current_dict:
current_dict_continued = current_dict[token]
idy = idx + 1
while idy < doc_array_len:
if self._terminal in current_dict_continued:
ent_ids = current_dict_continued[self._terminal]
for ent_id in ent_ids:
matches.append((self.vocab.strings[ent_id], start, idy))
inner_token = doc_array[idy]
if inner_token in current_dict_continued:
current_dict_continued = current_dict_continued[inner_token]
idy += 1
else:
# end of doc_array reached
if self._terminal in current_dict_continued:
ent_ids = current_dict_continued[self._terminal]
for ent_id in ent_ids:
matches.append((self.vocab.strings[ent_id], start, idy))
current_dict = self.keyword_trie_dict
reset_current_dict = True
else:
# we reset current_dict
current_dict = self.keyword_trie_dict
reset_current_dict = True
# if we are end of doc_array and have a sequence discovered
if idx + 1 >= doc_array_len:
if self._terminal in current_dict:
ent_ids = current_dict[self._terminal]
for ent_id in ent_ids:
matches.append((self.vocab.strings[ent_id], start, doc_array_len))
break
else:
# end of doc_array reached
if self._terminal in current_dict_continued:
ent_ids = current_dict_continued[self._terminal]
for ent_id in ent_ids:
matches.append((self.vocab.strings[ent_id], start, idy))
current_dict = self.keyword_trie_dict
idx += 1
if reset_current_dict:
reset_current_dict = False
start = idx
for i, (ent_id, start, end) in enumerate(matches):
on_match = self._callbacks.get(ent_id)
if on_match is not None:

View File

@ -8,10 +8,31 @@ from ..util import get_doc
def test_matcher_phrase_matcher(en_vocab):
doc = Doc(en_vocab, words=["Google", "Now"])
matcher = PhraseMatcher(en_vocab)
matcher.add("COMPANY", None, doc)
doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
# intermediate phrase
pattern = Doc(en_vocab, words=["Google", "Now"])
matcher = PhraseMatcher(en_vocab)
matcher.add("COMPANY", None, pattern)
assert len(matcher(doc)) == 1
# initial token
pattern = Doc(en_vocab, words=["I"])
matcher = PhraseMatcher(en_vocab)
matcher.add("I", None, pattern)
assert len(matcher(doc)) == 1
# initial phrase
pattern = Doc(en_vocab, words=["I", "like"])
matcher = PhraseMatcher(en_vocab)
matcher.add("ILIKE", None, pattern)
assert len(matcher(doc)) == 1
# final token
pattern = Doc(en_vocab, words=["best"])
matcher = PhraseMatcher(en_vocab)
matcher.add("BEST", None, pattern)
assert len(matcher(doc)) == 1
# final phrase
pattern = Doc(en_vocab, words=["Now", "best"])
matcher = PhraseMatcher(en_vocab)
matcher.add("NOWBEST", None, pattern)
assert len(matcher(doc)) == 1