Keep infixes of punctuation + hyphens as one token (see #801)

This commit is contained in:
Ines Montani 2017-02-02 16:22:40 +01:00
parent 1219a5f513
commit 012f4820cb

View File

@@ -106,7 +106,7 @@ TOKENIZER_INFIXES = (
r'(?<=[0-9])[+\-\*^](?=[0-9-])',
r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
-r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
+r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA)
]
)