Switch to PhraseMatcher.find_matches

Adriane Boyd 2019-09-26 14:43:22 +02:00
parent 63b014d09f
commit ae348bee43
2 changed files with 6 additions and 4 deletions

spacy/tokenizer.pxd

@@ -8,6 +8,7 @@ from .structs cimport LexemeC, TokenC
 from .strings cimport StringStore
 from .tokens.doc cimport Doc
 from .vocab cimport Vocab, LexemesOrTokens, _Cached
+from .matcher.phrasematcher cimport PhraseMatcher, MatchStruct
 cdef class Tokenizer:
@@ -21,7 +22,7 @@ cdef class Tokenizer:
     cdef object _suffix_search
     cdef object _infix_finditer
     cdef object _rules
-    cdef object _special_matcher
+    cdef PhraseMatcher _special_matcher
     cdef int _property_init_count
     cdef int _property_init_max

spacy/tokenizer.pyx

@@ -22,7 +22,6 @@ from . import util
 from .attrs import intify_attrs
 from .lexeme cimport EMPTY_LEXEME
-from .matcher import PhraseMatcher
 from .symbols import ORTH
 cdef class Tokenizer:
@@ -242,10 +241,12 @@ cdef class Tokenizer:
         cdef int orig_final_spacy
         cdef int orig_idx
         cdef Pool mem = Pool()
-        spans = [doc[match[1]:match[2]] for match in self._special_matcher(doc)]
+        cdef vector[MatchStruct] c_matches
+        self._special_matcher.find_matches(doc, &c_matches)
         # Skip processing if no matches
-        if len(spans) == 0:
+        if c_matches.size() == 0:
             return True
+        spans = [doc[match.start:match.end] for match in c_matches]
         spans = util.filter_spans(spans)
         # Put span info in span.start-indexed dict and calculate maximum
         # intermediate document size
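
For context, here is a minimal Python-level sketch of the public PhraseMatcher API that the removed line relied on: calling the matcher returns (match_id, start, end) tuples, which is why the old code sliced doc[match[1]:match[2]]. The new code instead calls the C-level find_matches, which fills a vector[MatchStruct] whose entries expose start and end directly, skipping the Python tuple round-trip. The snippet assumes the v2-era add(key, on_match, *docs) signature and uses a made-up pattern purely for illustration; it is not the tokenizer's actual setup.

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
# v2-era signature: add(key, on_match, *docs); the pattern here is illustrative only.
matcher.add("CITY", None, nlp.make_doc("New York"))

doc = nlp.make_doc("She moved to New York last year")
# The Python-level call returns (match_id, start, end) tuples, hence the
# match[1]:match[2] slicing in the line removed above.
spans = [doc[start:end] for match_id, start, end in matcher(doc)]
print(spans)  # [New York]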