Remove cruft in matching loop for partial matches
The matching loop still contained some unnecessary code carried over from FlashText for handling partial token matches, which don't occur with PhraseMatcher.
parent c38c330585
commit a7e9c0fd3e
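For context, the loop in question walks a dict-of-dicts trie of token sequences, with a terminal sentinel marking where a stored phrase ends. Below is a minimal pure-Python sketch of the simplified strategy this commit leaves behind; the names (_TERMINAL, add_phrase, find_matches) and the string-keyed trie are illustrative stand-ins, not spaCy internals, which key the trie on token hashes inside a Cython class:

# A minimal sketch of trie-based phrase matching with a plain dict trie.
_TERMINAL = "_ent_ids_"  # sentinel key marking the end of a stored phrase

def add_phrase(trie, tokens, ent_id):
    # Walk/extend the trie one token at a time, then record the phrase id
    # at the terminal node.
    node = trie
    for token in tokens:
        node = node.setdefault(token, {})
    node.setdefault(_TERMINAL, []).append(ent_id)

def find_matches(trie, doc_tokens):
    # Restart a fresh trie walk at every token position; no partial-match
    # state survives between walks, so nothing ever needs resetting.
    matches = []
    n = len(doc_tokens)
    for start in range(n):
        node = trie.get(doc_tokens[start])
        if node is None:
            continue
        idy = start + 1
        while idy < n:
            if _TERMINAL in node:  # a stored phrase ends just before idy
                matches.extend((eid, start, idy) for eid in node[_TERMINAL])
            nxt = node.get(doc_tokens[idy])
            if nxt is None:
                break
            node = nxt
            idy += 1
        else:  # walked off the end of the document without breaking
            if _TERMINAL in node:
                matches.extend((eid, start, idy) for eid in node[_TERMINAL])
    return matches

trie = {}
add_phrase(trie, ["Google", "Now"], "COMPANY")
print(find_matches(trie, ["I", "like", "Google", "Now", "best"]))
# expected: [('COMPANY', 2, 4)]

Because each walk restarts from every start position, there is no partially consumed match to carry between iterations of the outer loop, which is exactly the bookkeeping (reset_current_dict and friends) that the diff below deletes.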
@@ -184,56 +184,34 @@ cdef class PhraseMatcher:
             return matches
         current_dict = self.keyword_trie_dict
         start = 0
-        reset_current_dict = False
         idx = 0
         doc_array_len = len(doc_array)
         while idx < doc_array_len:
+            start = idx
             token = doc_array[idx]
-            # if end is present in current_dict
-            if self._terminal in current_dict or token in current_dict:
-                if self._terminal in current_dict:
-                    ent_ids = current_dict[self._terminal]
-                    for ent_id in ent_ids:
-                        matches.append((self.vocab.strings[ent_id], start, idx))
-
-                # look for longer sequences from this position
-                if token in current_dict:
-                    current_dict_continued = current_dict[token]
-
-                    idy = idx + 1
-                    while idy < doc_array_len:
-                        inner_token = doc_array[idy]
-                        if self._terminal in current_dict_continued:
-                            ent_ids = current_dict_continued[self._terminal]
-                            for ent_id in ent_ids:
-                                matches.append((self.vocab.strings[ent_id], start, idy))
-                        if inner_token in current_dict_continued:
-                            current_dict_continued = current_dict_continued[inner_token]
-                        else:
-                            break
-                        idy += 1
-                    else:
-                        # end of doc_array reached
-                        if self._terminal in current_dict_continued:
-                            ent_ids = current_dict_continued[self._terminal]
-                            for ent_id in ent_ids:
-                                matches.append((self.vocab.strings[ent_id], start, idy))
-                    current_dict = self.keyword_trie_dict
-                    reset_current_dict = True
-                else:
-                    # we reset current_dict
-                    current_dict = self.keyword_trie_dict
-                    reset_current_dict = True
-            # if we are end of doc_array and have a sequence discovered
-            if idx + 1 >= doc_array_len:
-                if self._terminal in current_dict:
-                    ent_ids = current_dict[self._terminal]
-                    for ent_id in ent_ids:
-                        matches.append((self.vocab.strings[ent_id], start, doc_array_len))
+            # look for sequences from this position
+            if token in current_dict:
+                current_dict_continued = current_dict[token]
+                idy = idx + 1
+                while idy < doc_array_len:
+                    if self._terminal in current_dict_continued:
+                        ent_ids = current_dict_continued[self._terminal]
+                        for ent_id in ent_ids:
+                            matches.append((self.vocab.strings[ent_id], start, idy))
+                    inner_token = doc_array[idy]
+                    if inner_token in current_dict_continued:
+                        current_dict_continued = current_dict_continued[inner_token]
+                        idy += 1
+                    else:
+                        break
+                else:
+                    # end of doc_array reached
+                    if self._terminal in current_dict_continued:
+                        ent_ids = current_dict_continued[self._terminal]
+                        for ent_id in ent_ids:
+                            matches.append((self.vocab.strings[ent_id], start, idy))
+            current_dict = self.keyword_trie_dict
             idx += 1
-            if reset_current_dict:
-                reset_current_dict = False
-                start = idx
         for i, (ent_id, start, end) in enumerate(matches):
             on_match = self._callbacks.get(ent_id)
             if on_match is not None:
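The simplified loop leans on Python's while/else, which is easy to misread: the else block runs only when the loop condition becomes false without a break firing, which is how the new code tells "the phrase walk ran off the end of the doc" apart from "the walk stopped at a non-matching token". A generic illustration (nothing here is spaCy-specific):

def scan(tokens, stop_at):
    i = 0
    while i < len(tokens):
        if tokens[i] == stop_at:
            break
        i += 1
    else:
        return "exhausted"  # loop condition went false; no break fired
    return "broke at %d" % i

print(scan(["a", "b", "c"], "b"))  # broke at 1
print(scan(["a", "b", "c"], "z"))  # exhausted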
@@ -8,10 +8,31 @@ from ..util import get_doc
 
 
 def test_matcher_phrase_matcher(en_vocab):
-    doc = Doc(en_vocab, words=["Google", "Now"])
-    matcher = PhraseMatcher(en_vocab)
-    matcher.add("COMPANY", None, doc)
     doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
+    # intermediate phrase
+    pattern = Doc(en_vocab, words=["Google", "Now"])
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("COMPANY", None, pattern)
+    assert len(matcher(doc)) == 1
+    # initial token
+    pattern = Doc(en_vocab, words=["I"])
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("I", None, pattern)
+    assert len(matcher(doc)) == 1
+    # initial phrase
+    pattern = Doc(en_vocab, words=["I", "like"])
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("ILIKE", None, pattern)
+    assert len(matcher(doc)) == 1
+    # final token
+    pattern = Doc(en_vocab, words=["best"])
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("BEST", None, pattern)
+    assert len(matcher(doc)) == 1
+    # final phrase
+    pattern = Doc(en_vocab, words=["Now", "best"])
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("NOWBEST", None, pattern)
     assert len(matcher(doc)) == 1
 
 
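As a usage sketch, the behavior these tests pin down looks like this against a released spaCy v2.x install; the add(key, on_match, *docs) signature and hash-keyed match tuples are the v2 API, and the printed output is what I'd expect rather than something verified against this exact branch:

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")  # tokenizer-only pipeline is enough here
matcher = PhraseMatcher(nlp.vocab)
# v2-style signature: add(key, on_match_callback, *pattern_docs)
matcher.add("COMPANY", None, nlp("Google Now"))
doc = nlp("I like Google Now best")
for match_id, start, end in matcher(doc):
    # in released spaCy v2 the match_id is a hash; resolve it via the
    # StringStore to get the key back
    print(nlp.vocab.strings[match_id], doc[start:end])
# expected: COMPANY Google Now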