Fix token_match in tokenizer

This commit is contained in:
Adriane Boyd 2020-11-25 11:43:05 +01:00
parent 724831b066
commit cf693f0eae

View File

@@ -404,9 +404,7 @@ cdef class Tokenizer:
cdef unicode minus_suf cdef unicode minus_suf
cdef size_t last_size = 0 cdef size_t last_size = 0
while string and len(string) != last_size: while string and len(string) != last_size:
if self.token_match and self.token_match(string) \ if self.token_match and self.token_match(string):
and not self.find_prefix(string) \
and not self.find_suffix(string):
break break
if with_special_cases and self._specials.get(hash_string(string)) != NULL: if with_special_cases and self._specials.get(hash_string(string)) != NULL:
break break
@@ -679,6 +677,8 @@ cdef class Tokenizer:
break break
suffixes.append(("SUFFIX", substring[split:])) suffixes.append(("SUFFIX", substring[split:]))
substring = substring[:split] substring = substring[:split]
if len(substring) == 0:
continue
if token_match(substring): if token_match(substring):
tokens.append(("TOKEN_MATCH", substring)) tokens.append(("TOKEN_MATCH", substring))
substring = '' substring = ''