* Fix Issue #360: Tokenizer failed when the infix regex matched the start of the string while trying to tokenize multi-infix tokens.

2025-07-15 10:42:34 +03:00 · 2016-05-09 13:23:47 +02:00 · 2016-05-09 13:23:47 +02:00 · cc8bf62208
commit cc8bf62208
parent eab2376547
2 changed files with 8 additions and 0 deletions
--- a/spacy/tests/tokenizer/test_infix.py
+++ b/spacy/tests/tokenizer/test_infix.py
@@ -24,6 +24,12 @@ def test_ellipsis(en_tokenizer):
    tokens = en_tokenizer('best...known')
    assert len(tokens) == 3

+def test_big_ellipsis(en_tokenizer):
+    '''Test regression identified in Issue #360'''
+    tokens = en_tokenizer(u'$45...............Asking')
+    assert len(tokens) > 2
+
+

 def test_email(en_tokenizer):
    tokens = en_tokenizer('hello@example.com')
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -227,6 +227,8 @@ cdef class Tokenizer:
                    for match in matches:
                        infix_start = match.start()
                        infix_end = match.end()
+                        if infix_start == start:
+                            continue
                        span = string[start:infix_start]
                        tokens.push_back(self.vocab.get(tokens.mem, span), False)