From 012f4820cb1cfb052e4d3bd2b6e1031ec9f30882 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 2 Feb 2017 16:22:40 +0100
Subject: [PATCH] Keep infixes of punctuation + hyphens as one token (see #801)

---
 spacy/language_data/punctuation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py
index 0eacb4324..02329ef92 100644
--- a/spacy/language_data/punctuation.py
+++ b/spacy/language_data/punctuation.py
@@ -106,7 +106,7 @@ TOKENIZER_INFIXES = (
         r'(?<=[0-9])[+\-\*^](?=[0-9-])',
         r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
-        r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
+        r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
         r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA)
     ]
 )