From 9f0fd5963f96aa4deb4f0dfd00cdc957ad2ef8b7 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 9 May 2017 00:01:59 +0200 Subject: [PATCH] Reorganise Hungarian punctuation rules --- spacy/lang/hu/punctuation.py | 53 ++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py index 474a6dc47..27a2912e2 100644 --- a/spacy/lang/hu/punctuation.py +++ b/spacy/lang/hu/punctuation.py @@ -1,40 +1,35 @@ # coding: utf8 from __future__ import unicode_literals +from ..punctuation import TOKENIZER_INFIXES +from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY +from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER -_currency_symbols = r"\$ ¢ £ € ¥ ฿" +_currency = r'\$|¢|£|€|¥|฿' +_quotes = QUOTES.replace("'", '') -TOKENIZER_PREFIXES = ( - [r'\+'] + - LIST_PUNCT + - LIST_ELLIPSES + - LIST_QUOTES) +_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES) + +_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + + [r'(?<=[0-9])\+', + r'(?<=°[FfCcKk])\.', + r'(?<=[0-9])(?:{})'.format(_currency), + r'(?<=[0-9])(?:{})'.format(UNITS), + r'(?<=[{}{}{}(?:{})])\.'.format(ALPHA_LOWER, r'%²\-\)\]\+', QUOTES, _currency), + r'(?<=[{})])-e'.format(ALPHA_LOWER)]) -TOKENIZER_SUFFIXES = ( - LIST_PUNCT + - LIST_ELLIPSES + - LIST_QUOTES + - [ - r'(?<=[0-9])\+', - r'(?<=°[FfCcKk])\.', - r'(?<=[0-9])(?:{c})'.format(c=CURRENCY), - r'(?<=[0-9])(?:{u})'.format(u=UNITS), - r'(?<=[{al}{p}{c}(?:{q})])\.'.format(al=ALPHA_LOWER, p=r'%²\-\)\]\+', q=QUOTES, c=_currency_symbols), - r'(?<=[{al})])-e'.format(al=ALPHA_LOWER)]) +_infixes = (LIST_ELLIPSES + + [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), + r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), + r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA), + r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA), + r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), + r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_quotes)]) -TOKENIZER_INFIXES = ( - LIST_ELLIPSES + - [ - r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER), - r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), - r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA), - r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA), - r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), - r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_QUOTES.replace("'", "").strip().replace(" ", ""))]) -from ..char_classes import TOKENIZER_INFIXES, LIST_PUNCT LIST_ELLIPSES -from ..char_classes import LIST_QUOTES, CURRENCY, QUOTES, UNITS -from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER +TOKENIZER_PREFIXES = _prefixes +TOKENIZER_SUFFIXES = _suffixes +TOKENIZER_INFIXES = _infixes