Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-25 17:36:30 +03:00
Allow zero-width 'infix' token
parent 01c2daf0c9
commit dce8f5515e
@@ -289,21 +289,18 @@ cdef class Tokenizer:
                 infix_end = match.end()
                 if infix_start == start:
                     continue
-                if infix_start == infix_end:
-                    msg = ("Tokenizer found a zero-width 'infix' token.\n"
-                           "If you're using a built-in tokenizer, please\n"
-                           "report this bug. If you're using a tokenizer\n"
-                           "you developed, check your TOKENIZER_INFIXES\n"
-                           "tuple.\n"
-                           "String being matched: {string}\n"
-                           "Language: {lang}")
-                    raise ValueError(msg.format(string=string, lang=self.vocab.lang))
+
                 span = string[start:infix_start]
                 tokens.push_back(self.vocab.get(tokens.mem, span), False)
 
-                infix_span = string[infix_start:infix_end]
-                tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
+                if infix_start != infix_end:
+                    # If infix_start != infix_end, it means the infix
+                    # token is non-empty. Empty infix tokens are useful
+                    # for tokenization in some languages (see
+                    # https://github.com/explosion/spaCy/issues/768)
+                    infix_span = string[infix_start:infix_end]
+                    tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
 
                 start = infix_end
             span = string[start:]
             tokens.push_back(self.vocab.get(tokens.mem, span), False)
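What the change buys: a zero-width infix match (for example, a pure lookahead/lookbehind pattern in TOKENIZER_INFIXES) no longer triggers the ValueError shown in the removed lines; it now simply marks a split point, and no empty token is pushed. Below is a minimal pure-Python sketch of the loop's new behaviour, not spaCy's Cython implementation; the split_on_infixes helper and the camelCase lookaround pattern are illustrative assumptions only.

import re

# Hypothetical zero-width infix pattern: a lookaround that matches between a
# lowercase and an uppercase letter, so match.start() == match.end().
infix_re = re.compile(r"(?<=[a-z])(?=[A-Z])")

def split_on_infixes(string):
    # Plain-Python mirror of the loop in the diff above; a list stands in
    # for the Doc's token vector.
    matches = list(infix_re.finditer(string))
    if not matches:
        return [string]
    tokens = []
    start = 0
    for match in matches:
        infix_start = match.start()
        infix_end = match.end()
        if infix_start == start:
            continue
        # Text before the infix match becomes its own token.
        tokens.append(string[start:infix_start])
        if infix_start != infix_end:
            # A non-empty infix (e.g. a hyphen) is kept as its own token;
            # a zero-width match only marks a split point.
            tokens.append(string[infix_start:infix_end])
        start = infix_end
    tokens.append(string[start:])
    return tokens

print(split_on_infixes("spaCyTokenizer"))  # ['spa', 'Cy', 'Tokenizer']

Before this commit, the same zero-width match would have hit the removed branch and raised the "Tokenizer found a zero-width 'infix' token" error instead of splitting the string.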