diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index 69e80250c..506076ac1 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -8,6 +8,7 @@ from .structs cimport LexemeC, TokenC from .strings cimport StringStore from .tokens.doc cimport Doc from .vocab cimport Vocab, LexemesOrTokens, _Cached +from .matcher.phrasematcher cimport PhraseMatcher, MatchStruct cdef class Tokenizer: @@ -21,7 +22,7 @@ cdef class Tokenizer: cdef object _suffix_search cdef object _infix_finditer cdef object _rules - cdef object _special_matcher + cdef PhraseMatcher _special_matcher cdef int _property_init_count cdef int _property_init_max diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index dc1ee98c1..8433f2ca5 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -22,7 +22,6 @@ from . import util from .attrs import intify_attrs from .lexeme cimport EMPTY_LEXEME -from .matcher import PhraseMatcher from .symbols import ORTH cdef class Tokenizer: @@ -242,10 +241,12 @@ cdef class Tokenizer: cdef int orig_final_spacy cdef int orig_idx cdef Pool mem = Pool() - spans = [doc[match[1]:match[2]] for match in self._special_matcher(doc)] + cdef vector[MatchStruct] c_matches + self._special_matcher.find_matches(doc, &c_matches) # Skip processing if no matches - if len(spans) == 0: + if c_matches.size() == 0: return True + spans = [doc[match.start:match.end] for match in c_matches] spans = util.filter_spans(spans) # Put span info in span.start-indexed dict and calculate maximum # intermediate document size