Fix token_match in tokenizer

2025-10-26 05:31:15 +03:00 · 2020-11-25 11:43:05 +01:00 · 2020-11-25 11:43:05 +01:00 · cf693f0eae
commit cf693f0eae
parent 724831b066
1 changed file with 3 additions and 3 deletions
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -404,9 +404,7 @@ cdef class Tokenizer:
        cdef unicode minus_suf
        cdef size_t last_size = 0
        while string and len(string) != last_size:
-            if self.token_match and self.token_match(string) \
-                    and not self.find_prefix(string) \
-                    and not self.find_suffix(string):
+            if self.token_match and self.token_match(string):
                break
            if with_special_cases and self._specials.get(hash_string(string)) != NULL:
                break
@@ -679,6 +677,8 @@ cdef class Tokenizer:
                            break
                        suffixes.append(("SUFFIX", substring[split:]))
                        substring = substring[:split]
+                if len(substring) == 0:
+                    continue
                if token_match(substring):
                    tokens.append(("TOKEN_MATCH", substring))
                    substring = ''