From cc8bf62208384eedd212f547b70fbf3f3d59eea4 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 9 May 2016 13:23:47 +0200
Subject: [PATCH] * Fix Issue #360: Tokenizer failed when the infix regex
 matched the start of the string while trying to tokenize multi-infix tokens.

---
 spacy/tests/tokenizer/test_infix.py | 6 ++++++
 spacy/tokenizer.pyx                 | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/spacy/tests/tokenizer/test_infix.py b/spacy/tests/tokenizer/test_infix.py
index 351394021..1b7cbaa7b 100644
--- a/spacy/tests/tokenizer/test_infix.py
+++ b/spacy/tests/tokenizer/test_infix.py
@@ -24,6 +24,12 @@ def test_ellipsis(en_tokenizer):
     tokens = en_tokenizer('best...known')
     assert len(tokens) == 3
 
+def test_big_ellipsis(en_tokenizer):
+    '''Test regression identified in Issue #360'''
+    tokens = en_tokenizer(u'$45...............Asking')
+    assert len(tokens) > 2
+
+
 def test_email(en_tokenizer):
     tokens = en_tokenizer('hello@example.com')
 
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 229e70793..0a2df1bcb 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -227,6 +227,8 @@ cdef class Tokenizer:
             for match in matches:
                 infix_start = match.start()
                 infix_end = match.end()
+                if infix_start == start:
+                    continue
                 span = string[start:infix_start]
                 tokens.push_back(self.vocab.get(tokens.mem, span), False)
 
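
For anyone reproducing Issue #360 outside the test suite, the snippet below
is a minimal sketch assuming the spacy.en.English entry point of this era.
The printed token boundaries are illustrative; the regression test above only
asserts that more than two tokens come back.

from __future__ import unicode_literals, print_function

from spacy.en import English

nlp = English()

# A long run of dots yields several adjacent infix matches. Before this
# patch, a match could begin exactly at `start`, so the tokenizer tried to
# push an empty span and failed; the patch skips such zero-width spans.
tokens = nlp.tokenizer(u'$45...............Asking')
print([t.orth_ for t in tokens])
assert len(tokens) > 2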