spaCy/spacy/lang/tt/punctuation.py
Aliia E 428bae66b5 Add Tatar Language Support (#2444)
* add Tatar lang support

* add Tatar letters

* add Tatar tests

* sign contributor agreement

* sign contributor agreement [x]

* remove comments from Language class

* remove all template comments
2018-06-19 10:17:53 +02:00

20 lines
914 B
Python

# coding: utf8
from __future__ import unicode_literals
from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, QUOTES, HYPHENS
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
# Hyphen alternatives with every ASCII '-' removed, re-joined as a regex
# alternation for use inside `(?:...)` below.
# Assumes HYPHENS is a '|'-separated alternation string (the previous code
# treated it that way by stripping and collapsing '|').
# Splitting and dropping the entries that become empty is used instead of
# `.replace('||', '')`: that replacement deletes an even run of pipes
# entirely, which would fuse two neighbouring alternatives into one.
_hyphens_no_dash = '|'.join(
    stripped
    for stripped in (alt.replace('-', '') for alt in HYPHENS.split('|'))
    if stripped
)

# Infix patterns: the shared ellipsis and icon patterns followed by the
# rules below. Each rule uses lookbehind/lookahead so that only the
# punctuation itself is matched, not the surrounding characters.
_infixes = (LIST_ELLIPSES + LIST_ICONS + [
    # '.' directly between an ALPHA_LOWER and an ALPHA_UPPER character.
    r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
    # Run of one or more of , ! ? / ( ) between two ALPHA characters.
    r'(?<=[{a}])[,!?/\(\)]+(?=[{a}])'.format(a=ALPHA),
    # Single : < > or = preceded by an ALPHA or QUOTES character and
    # followed by an ALPHA character.
    r'(?<=[{a}{q}])[:<>=](?=[{a}])'.format(a=ALPHA, q=QUOTES),
    # Double ASCII hyphen between two ALPHA characters.
    r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
    # Comma between two ALPHA characters. The run pattern above already
    # matches this case; the entry is kept so the list is unchanged.
    r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
    # A QUOTES character, parenthesis or square bracket that follows an
    # ALPHA character and precedes '-' or an ALPHA character.
    r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=QUOTES),
    # Optional run of ? " ; : = , . followed by one of the non-ASCII-dash
    # hyphen alternatives, between two ALPHA characters.
    r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(a=ALPHA,
                                                   h=_hyphens_no_dash),
    # ASCII hyphen between two digits.
    r'(?<=[0-9])-(?=[0-9])',
])

# Public name for the infix pattern list defined above.
TOKENIZER_INFIXES = _infixes