Fix token_match in tokenizer

This commit is contained in:
Adriane Boyd 2020-11-25 11:43:05 +01:00
parent 724831b066
commit cf693f0eae

View File

@@ -404,9 +404,7 @@ cdef class Tokenizer:
 cdef unicode minus_suf
 cdef size_t last_size = 0
 while string and len(string) != last_size:
-if self.token_match and self.token_match(string) \
-and not self.find_prefix(string) \
-and not self.find_suffix(string):
+if self.token_match and self.token_match(string):
 break
 if with_special_cases and self._specials.get(hash_string(string)) != NULL:
 break
@@ -679,6 +677,8 @@ cdef class Tokenizer:
 break
 suffixes.append(("SUFFIX", substring[split:]))
 substring = substring[:split]
+if len(substring) == 0:
+continue
 if token_match(substring):
 tokens.append(("TOKEN_MATCH", substring))
 substring = ''