mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Keep infixes of punctuation + hyphens as one token (see #801)
This commit is contained in:
parent
1219a5f513
commit
012f4820cb
|
@ -106,7 +106,7 @@ TOKENIZER_INFIXES = (
|
|||
r'(?<=[0-9])[+\-\*^](?=[0-9-])',
|
||||
r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER),
|
||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
|
||||
r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
|
||||
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA)
|
||||
]
|
||||
)
|
||||
|
|
Loading…
Reference in New Issue
Block a user