Changed tokenizer to add infix when infix_start is offset

Felix Sonntag 2017-11-19 15:14:40 +01:00
parent 8be3392302
commit 33b0f86de3


@@ -241,9 +241,8 @@ cdef class Tokenizer:
         for match in matches:
             infix_start = match.start()
             infix_end = match.end()
-            if infix_start == start:
-                continue
-            span = string[start:infix_start]
-            tokens.push_back(self.vocab.get(tokens.mem, span), False)
+            if infix_start != start:
+                span = string[start:infix_start]
+                tokens.push_back(self.vocab.get(tokens.mem, span), False)
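The control-flow change can be illustrated with a small standalone sketch. This is not spaCy's actual Cython `Tokenizer` (which pushes lexemes onto a C token array via `self.vocab.get`); it is a hypothetical plain-Python analogue using a made-up `split_infixes` helper and a toy infix pattern, showing that the preceding span is emitted only when the infix match is offset from the current start position:

```python
import re

def split_infixes(string, infix_pattern=r"[-~]"):
    """Toy analogue of the patched infix loop: collect the text before
    each infix match as a token, but only when the match does not sit
    exactly at the current start position (i.e. the span is non-empty)."""
    tokens = []
    start = 0
    for match in re.finditer(infix_pattern, string):
        infix_start = match.start()
        infix_end = match.end()
        if infix_start != start:  # patched condition; was `== start: continue`
            span = string[start:infix_start]
            tokens.append(span)
        tokens.append(string[infix_start:infix_end])  # the infix itself
        start = infix_end
    if start < len(string):
        tokens.append(string[start:])  # trailing text after the last infix
    return tokens

print(split_infixes("hello-world"))  # ['hello', '-', 'world']
print(split_infixes("-hello"))       # ['-', 'hello'] — no empty leading span
```

Restructuring the `continue` guard into a positive `if infix_start != start:` block keeps the span-emitting statements inside the condition, so an infix at the current start position no longer skips the rest of the loop body.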