mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
Fix token_match in tokenizer
This commit is contained in:
parent
724831b066
commit
cf693f0eae
|
@ -404,9 +404,7 @@ cdef class Tokenizer:
|
||||||
cdef unicode minus_suf
|
cdef unicode minus_suf
|
||||||
cdef size_t last_size = 0
|
cdef size_t last_size = 0
|
||||||
while string and len(string) != last_size:
|
while string and len(string) != last_size:
|
||||||
if self.token_match and self.token_match(string) \
|
if self.token_match and self.token_match(string):
|
||||||
and not self.find_prefix(string) \
|
|
||||||
and not self.find_suffix(string):
|
|
||||||
break
|
break
|
||||||
if with_special_cases and self._specials.get(hash_string(string)) != NULL:
|
if with_special_cases and self._specials.get(hash_string(string)) != NULL:
|
||||||
break
|
break
|
||||||
|
@ -679,6 +677,8 @@ cdef class Tokenizer:
|
||||||
break
|
break
|
||||||
suffixes.append(("SUFFIX", substring[split:]))
|
suffixes.append(("SUFFIX", substring[split:]))
|
||||||
substring = substring[:split]
|
substring = substring[:split]
|
||||||
|
if len(substring) == 0:
|
||||||
|
continue
|
||||||
if token_match(substring):
|
if token_match(substring):
|
||||||
tokens.append(("TOKEN_MATCH", substring))
|
tokens.append(("TOKEN_MATCH", substring))
|
||||||
substring = ''
|
substring = ''
|
||||||
|
|
Loading…
Reference in New Issue
Block a user