mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-14 05:37:03 +03:00
53362b6b93
Use global prefixes and suffixes for non-language-specific rules, import list of alpha unicode characters and adjust regexes.
26 lines
647 B
Python
26 lines
647 B
Python
# encoding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
from ..language_data.punctuation import ALPHA, ALPHA_LOWER, ALPHA_UPPER, LIST_ELLIPSES
|
|
|
|
|
|
# Suffix patterns. The single rule matches a literal "-e" that is directly
# preceded by either a character from ALPHA_LOWER or a closing parenthesis.
TOKENIZER_SUFFIXES = [r'(?<=[{al})])-e'.format(al=ALPHA_LOWER)]
|
|
|
|
# Infix patterns. Every rule is a regular expression whose match is wrapped
# in a lookbehind/lookahead pair, so only the character(s) between the two
# assertions are consumed by the match.
TOKENIZER_INFIXES = [
    # A hyphen between two ASCII digits.
    r'(?<=[0-9])-(?=[0-9])',
    # One of + - * / ^ between two ASCII digits.
    r'(?<=[0-9])[+\-\*/^](?=[0-9])',
    # A double hyphen between two ALPHA characters. The template must be
    # formatted like its siblings; left unformatted, the character class
    # would be the literal characters "{", "a" and "}".
    r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
    # A comma between two ALPHA characters.
    r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
    # A period preceded by an ALPHA_LOWER character and followed by an
    # ALPHA_UPPER character.
    r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER),
    # A double quote preceded by a digit or ALPHA character and followed by
    # a hyphen or ALPHA character.
    r'(?<=[0-9{a}])"(?=[\-{a}])'.format(a=ALPHA),
    # One of : < > = preceded by an ALPHA character or a double quote and
    # followed by an ALPHA character.
    r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
]
|
|
|
|
|
|
# Append the imported ellipsis patterns to the infix rules, in place.
TOKENIZER_INFIXES.extend(LIST_ELLIPSES)
|
|
|
|
|
|
# Names exported by `from <this module> import *`.
__all__ = [
    "TOKENIZER_SUFFIXES",
    "TOKENIZER_INFIXES",
]
|