diff --git a/spacy/lang/bn/punctuation.py b/spacy/lang/bn/punctuation.py
index 8b37d6cb3..66b7d967c 100644
--- a/spacy/lang/bn/punctuation.py
+++ b/spacy/lang/bn/punctuation.py
@@ -1,45 +1,33 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-
-
-CURRENCY_SYMBOLS = r"\$ ¢ £ € ¥ ฿ ৳"
-
-_PUNCT = '। ॥'
-
-LIST_PUNCT.extend(_PUNCT.strip().split())
-
-TOKENIZER_PREFIXES = (
-    [r'\+'] +
-    LIST_PUNCT +
-    LIST_ELLIPSES +
-    LIST_QUOTES
-)
-
-TOKENIZER_SUFFIXES = (
-    LIST_PUNCT +
-    LIST_ELLIPSES +
-    LIST_QUOTES +
-    [
-        r'(?<=[0-9])\+',
-        r'(?<=°[FfCcKk])\.',
-        r'(?<=[0-9])(?:{c})'.format(c=CURRENCY),
-        r'(?<=[0-9])(?:{u})'.format(u=UNITS),
-        r'(?<=[{al}{p}{c}(?:{q})])\.'.format(al=ALPHA_LOWER, p=r'%²\-\)\]\+', q=QUOTES, c=CURRENCY_SYMBOLS),
-        r'(?<=[{al})])-e'.format(al=ALPHA_LOWER)
-    ]
-)
-
-TOKENIZER_INFIXES = (
-    LIST_ELLIPSES +
-    [
-        r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER),
-        r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
-        r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
-        r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
-        r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
-        r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_QUOTES.replace("'", "").strip().replace(" ", "")),
-    ]
-)
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS
 from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
+
+
+_currency = r"\$|¢|£|€|¥|฿|৳"
+_quotes = QUOTES.replace("'", '')
+_list_punct = LIST_PUNCT + '। ॥'.strip().split()
+
+
+_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES)
+
+_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES +
+             [r'(?<=[0-9])\+',
+              r'(?<=°[FfCcKk])\.',
+              r'(?<=[0-9])(?:{})'.format(_currency),
+              r'(?<=[0-9])(?:{})'.format(UNITS),
+              r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)])
+
+_infixes = (LIST_ELLIPSES +
+            [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
+             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
+             r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
+             r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
+             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
+             r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_quotes)])
+
+
+TOKENIZER_PREFIXES = _prefixes
+TOKENIZER_SUFFIXES = _suffixes
+TOKENIZER_INFIXES = _infixes