Switch to PhraseMatcher.find_matches

Adriane Boyd 2019-09-26 14:43:22 +02:00
parent 63b014d09f
commit ae348bee43
2 changed files with 6 additions and 4 deletions

spacy/tokenizer.pxd

@@ -8,6 +8,7 @@ from .structs cimport LexemeC, TokenC
 from .strings cimport StringStore
 from .tokens.doc cimport Doc
 from .vocab cimport Vocab, LexemesOrTokens, _Cached
+from .matcher.phrasematcher cimport PhraseMatcher, MatchStruct
 cdef class Tokenizer:
@@ -21,7 +22,7 @@ cdef class Tokenizer:
     cdef object _suffix_search
     cdef object _infix_finditer
     cdef object _rules
-    cdef object _special_matcher
+    cdef PhraseMatcher _special_matcher
     cdef int _property_init_count
     cdef int _property_init_max

spacy/tokenizer.pyx

@@ -22,7 +22,6 @@ from . import util
 from .attrs import intify_attrs
 from .lexeme cimport EMPTY_LEXEME
-from .matcher import PhraseMatcher
 from .symbols import ORTH
 cdef class Tokenizer:
@@ -242,10 +241,12 @@ cdef class Tokenizer:
         cdef int orig_final_spacy
         cdef int orig_idx
         cdef Pool mem = Pool()
-        spans = [doc[match[1]:match[2]] for match in self._special_matcher(doc)]
+        cdef vector[MatchStruct] c_matches
+        self._special_matcher.find_matches(doc, &c_matches)
         # Skip processing if no matches
-        if len(spans) == 0:
+        if c_matches.size() == 0:
             return True
+        spans = [doc[match.start:match.end] for match in c_matches]
         spans = util.filter_spans(spans)
         # Put span info in span.start-indexed dict and calculate maximum
         # intermediate document size
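
For context, here is a minimal Python-level sketch of the public PhraseMatcher API that the removed line relied on: calling the matcher returns (match_id, start, end) tuples, which is why the old code sliced doc[match[1]:match[2]]. The new code instead calls the C-level find_matches, which fills a vector[MatchStruct] whose entries expose start and end directly, skipping the Python tuple round-trip. The snippet assumes the v2-era add(key, on_match, *docs) signature and uses a made-up pattern purely for illustration; it is not the tokenizer's actual setup.

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
# v2-era signature: add(key, on_match, *docs); the pattern here is illustrative only.
matcher.add("CITY", None, nlp.make_doc("New York"))

doc = nlp.make_doc("She moved to New York last year")
# The Python-level call returns (match_id, start, end) tuples, hence the
# match[1]:match[2] slicing in the line removed above.
spans = [doc[start:end] for match_id, start, end in matcher(doc)]
print(spans)  # [New York]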