From 0894b8c0ef2ad8a44958931092b5fa3c6f0cd517 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 12 Jan 2017 22:58:26 +0100 Subject: [PATCH] Don't split tokens with digits and "/" infixes (resolves #740) --- spacy/language_data/punctuation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py index d8ed19ca1..8b92922f8 100644 --- a/spacy/language_data/punctuation.py +++ b/spacy/language_data/punctuation.py @@ -103,7 +103,7 @@ TOKENIZER_SUFFIXES = ( TOKENIZER_INFIXES = ( LIST_ELLIPSES + [ - r'(?<=[0-9])[+\-\*/^](?=[0-9])', + r'(?<=[0-9])[+\-\*^](?=[0-9-])', r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER), r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),