Reorganise Hungarian prefixes/suffixes/infixes

Use the global prefixes and suffixes for non-language-specific rules, import the list of alphabetic Unicode characters, and adjust the regexes accordingly (the resulting merge pattern is sketched below).
Ines Montani 2017-01-08 20:40:33 +01:00
parent 347c4a2d06
commit 53362b6b93
3 changed files with 34 additions and 95 deletions
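
In broad strokes, the change moves Hungarian onto the shared pattern: the language module keeps only language-specific rules and appends them to the global base lists. A minimal sketch of that merge pattern, with illustrative stand-in values rather than the real lists:

import re

# Stand-ins only; the real lists live in spacy.language_data and spacy/hu.
BASE_SUFFIXES = [r',', r'\)', r'\.\.\.']       # shared, non-language-specific rules
HU_SUFFIXES = [r'(?<=[a-z])-e']                # Hungarian-specific rule

# After this commit: prefixes come from the base alone, suffixes are
# base + Hungarian, and the merged list is compiled into one alternation.
TOKENIZER_SUFFIXES = BASE_SUFFIXES + HU_SUFFIXES
suffix_re = re.compile('|'.join('(?:{})'.format(s) for s in TOKENIZER_SUFFIXES))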

spacy/hu/__init__.py

@@ -4,22 +4,25 @@ from __future__ import unicode_literals
 import six
 from spacy.language_data import strings_to_exc, update_exc
-from .punctuations import *
+from .punctuation import *
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import ABBREVIATIONS
 from .tokenizer_exceptions import OTHER_EXC
 from .. import language_data as base
-STOP_WORDS = set(STOP_WORDS)
-TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES + TOKENIZER_PREFIXES
-TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
-TOKENIZER_INFIXES = TOKENIZER_INFIXES
-# HYPHENS = [six.unichr(cp) for cp in [173, 8211, 8212, 8213, 8722, 9472]]
+STOP_WORDS = set(STOP_WORDS)
 TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS))
+TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES
+TOKENIZER_SUFFIXES = base.TOKENIZER_SUFFIXES + TOKENIZER_SUFFIXES
+TOKENIZER_INFIXES = TOKENIZER_INFIXES
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]

spacy/hu/punctuation.py Normal file

@@ -0,0 +1,25 @@
+# encoding: utf8
+from __future__ import unicode_literals
+from ..language_data.punctuation import ALPHA, ALPHA_LOWER, ALPHA_UPPER, LIST_ELLIPSES
+TOKENIZER_SUFFIXES = [
+    r'(?<=[{al})])-e'.format(al=ALPHA_LOWER)
+]
+TOKENIZER_INFIXES = [
+    r'(?<=[0-9])-(?=[0-9])',
+    r'(?<=[0-9])[+\-\*/^](?=[0-9])',
+    r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
+    r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
+    r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER),
+    r'(?<=[0-9{a}])"(?=[\-{a}])'.format(a=ALPHA),
+    r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA)
+]
+TOKENIZER_INFIXES += LIST_ELLIPSES
+__all__ = ["TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]

spacy/hu/punctuations.py

@ -1,89 +0,0 @@
# encoding: utf8
from __future__ import unicode_literals
TOKENIZER_PREFIXES = r'''
+
'''.strip().split('\n')
TOKENIZER_SUFFIXES = r'''
,
\"
\)
\]
\}
\*
\!
\?
\$
>
:
;
'
«
_
''
\.\.
\.\.\.
\.\.\.\.
(?<=[a-züóőúéáűí)\]"'´«‘’%\)²“”+-])\.
(?<=[a-züóőúéáűí)])-e
\-\-
´
(?<=[0-9])\+
(?<=[a-z0-9üóőúéáűí][\)\]"'%\)§/])\.
(?<=[0-9])km²
(?<=[0-9])
(?<=[0-9])cm²
(?<=[0-9])mm²
(?<=[0-9])km³
(?<=[0-9])
(?<=[0-9])cm³
(?<=[0-9])mm³
(?<=[0-9])ha
(?<=[0-9])km
(?<=[0-9])m
(?<=[0-9])cm
(?<=[0-9])mm
(?<=[0-9])µm
(?<=[0-9])nm
(?<=[0-9])yd
(?<=[0-9])in
(?<=[0-9])ft
(?<=[0-9])kg
(?<=[0-9])g
(?<=[0-9])mg
(?<=[0-9])µg
(?<=[0-9])t
(?<=[0-9])lb
(?<=[0-9])oz
(?<=[0-9])m/s
(?<=[0-9])km/h
(?<=[0-9])mph
(?<=°[FCK])\.
(?<=[0-9])hPa
(?<=[0-9])Pa
(?<=[0-9])mbar
(?<=[0-9])mb
(?<=[0-9])T
(?<=[0-9])G
(?<=[0-9])M
(?<=[0-9])K
(?<=[0-9])kb
'''.strip().split('\n')
TOKENIZER_INFIXES = r'''
\.\.+
(?<=[a-züóőúéáűí])\.(?=[A-ZÜÓŐÚÉÁŰÍ])
(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ0-9])"(?=[\-a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])
(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])--(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])
(?<=[0-9])[+\-\*/^](?=[0-9])
(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]),(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])
'''.strip().split('\n')
__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]