diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py
index 0eacb4324..02329ef92 100644
--- a/spacy/language_data/punctuation.py
+++ b/spacy/language_data/punctuation.py
@@ -106,7 +106,7 @@ TOKENIZER_INFIXES = (
         r'(?<=[0-9])[+\-\*^](?=[0-9-])',
         r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
-        r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
+        r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
         r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA)
     ]
 )