Improved Romanian tokenization for UD RRT (#5036)

Modifications to Romanian tokenization to improve tokenization for
UD_Romanian-RRT.
This commit is contained in:
adrianeboyd 2020-02-19 16:16:00 +01:00 committed by GitHub
parent c7e4fe9c5c
commit 2164e71ea8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 215 additions and 1 deletions

View File

@@ -3,6 +3,8 @@ from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
@@ -24,6 +26,9 @@ class RomanianDefaults(Language.Defaults):
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
tag_map = TAG_MAP

View File

@@ -0,0 +1,164 @@
# coding: utf8
from __future__ import unicode_literals
import itertools
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from ..char_classes import LIST_ICONS, CURRENCY
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
# Icon patterns with the degree-sign entry dropped and its regex escape
# stripped from the remaining entries (° is needed for the Romanian
# temperature suffix rule below, so it must not be split off as an icon).
_list_icons = [x.replace("\\u00B0", "") for x in LIST_ICONS if x != "°"]
# Accepted spellings for each uppercase Romanian diacritic letter: the
# correct comma-below form, the legacy cedilla form where one exists, and
# the bare ASCII letter (all three occur in real-world Romanian text).
_ro_variants = {
    "Ă": ["Ă", "A"],
    "Â": ["Â", "A"],
    "Î": ["Î", "I"],
    "Ș": ["Ș", "Ş", "S"],
    "Ț": ["Ț", "Ţ", "T"],
}


def _make_ro_variants(tokens):
    """Return the sorted, de-duplicated spelling variants of *tokens*.

    Each diacritic letter in a token is expanded to every accepted spelling
    (see ``_ro_variants``), and every resulting string is emitted in upper,
    lower, and title case.
    """
    seen = set()
    for token in tokens:
        per_char = [_ro_variants.get(ch, [ch]) for ch in token.upper()]
        for combo in itertools.product(*per_char):
            upper = "".join(combo)
            seen.update((upper, upper.lower(), upper.title()))
    return sorted(seen)
# Hyphen-attached closed-class prefixes observed in UD_Romanian-RRT
# (POS: ADP|AUX|CCONJ|DET|NUM|PART|PRON|SCONJ), e.g. "s-a", "într-un".
_ud_rrt_prefixes = (
    "a- c- ce- cu- d- de- dintr- e- făr- i- l- le- m- mi- n- ne- p- pe- "
    "prim- printr- s- se- te- v- într- ș- și- ți-"
).split()
# Expand the prefixes to all diacritic/cedilla/ASCII spelling and case variants.
_ud_rrt_prefix_variants = _make_ro_variants(_ud_rrt_prefixes)
# Hyphen-attached closed-class suffixes observed in UD_Romanian-RRT,
# excluding NUM (POS: ADP|AUX|CCONJ|DET|PART|PRON|SCONJ), e.g. "s-a", "de-a".
_ud_rrt_suffixes = (
    "-a -aceasta -ai -al -ale -alta -am -ar -astea -atâta -au -aș -ați "
    "-i -ilor -l -le -lea -mea -meu -mi -mă -n -ndărătul -ne -o -oi -or "
    "-s -se -si -te -ul -ului -un -uri -urile -urilor -veți -vă -ăștia -și -ți"
).split()
# Expand the suffixes to all diacritic/cedilla/ASCII spelling and case variants.
_ud_rrt_suffix_variants = _make_ro_variants(_ud_rrt_suffixes)
# Prefix patterns: a few literal symbols, a plus sign not followed by a digit,
# the Romanian hyphenated prefix variants, then the shared punctuation,
# ellipsis, quote, currency, and icon character classes.
# NOTE(review): the em dash and en dash entries were garbled to empty strings
# in this copy; empty alternatives in the compiled prefix regex would match at
# every position, so the dashes from spaCy's default punctuation rules are
# restored here.
_prefixes = (
    ["§", "%", "=", "—", "–", r"\+(?![0-9])"]
    + _ud_rrt_prefix_variants
    + LIST_PUNCT
    + LIST_ELLIPSES
    + LIST_QUOTES
    + LIST_CURRENCY
    + LIST_ICONS
)
# Suffix patterns: the Romanian hyphenated suffix variants, shared punctuation
# classes, icons without the degree sign, em/en dashes, then number and
# abbreviation rules.
_suffixes = (
    _ud_rrt_suffix_variants
    + LIST_PUNCT
    + LIST_ELLIPSES
    + LIST_QUOTES
    + _list_icons
    # NOTE(review): restored — these two dash entries were garbled to empty
    # strings in this copy; empty regex alternatives match everywhere.
    + ["—", "–"]
    + [
        # plus sign directly after a digit (e.g. "10+")
        r"(?<=[0-9])\+",
        # period after a degree + temperature unit ("°C." etc.)
        r"(?<=°[FfCcKk])\.",
        # currency symbol directly after a digit
        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
        # period after digits, lowercase letters, %, ², +/-, punctuation, quotes
        r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
        ),
        # period after two consecutive uppercase letters (abbreviations)
        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
    ]
)
# Infix patterns: split inside tokens on ellipses, icons (minus the degree
# sign), and the operator/punctuation contexts below.
_infixes = (
    LIST_ELLIPSES
    + _list_icons
    + [
        # +, *, ^ between digits (lookahead also allows a following minus)
        r"(?<=[0-9])[+\*^](?=[0-9-])",
        # period between lowercase/quote and uppercase/quote (sentence join)
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        # comma directly between two letters ("word,word")
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        # :, <, >, = between alphanumeric characters
        r"(?<=[{a}0-9])[:<>=](?=[{a}])".format(a=ALPHA),
    ]
)
# Public rule sets consumed by RomanianDefaults (prefixes/suffixes/infixes).
TOKENIZER_PREFIXES = _prefixes
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes

View File

@@ -2,6 +2,7 @@
from __future__ import unicode_literals
from ...symbols import ORTH
from .punctuation import _make_ro_variants
# Tokenizer exceptions, keyed by the exact surface form of each exception.
_exc = {}
@@ -45,8 +46,52 @@
"dpdv",
"șamd.",
"ș.a.m.d.",
# below: from UD_Romanian-RRT:
"A.c.",
"A.f.",
"A.r.",
"Al.",
"Art.",
"Aug.",
"Bd.",
"Dem.",
"Dr.",
"Fig.",
"Fr.",
"Gh.",
"Gr.",
"Lt.",
"Nr.",
"Obs.",
"Prof.",
"Sf.",
"a.m.",
"a.r.",
"alin.",
"art.",
"d-l",
"d-lui",
"d-nei",
"ex.",
"fig.",
"ian.",
"lit.",
"lt.",
"p.a.",
"p.m.",
"pct.",
"prep.",
"sf.",
"tel.",
"univ.",
"îngr.",
"într-adevăr",
"Șt.",
"ș.a.",
]:
_exc[orth] = [{ORTH: orth}]
# note: does not distinguish capitalized-only exceptions from others
for variant in _make_ro_variants([orth]):
_exc[variant] = [{ORTH: variant}]
# Public mapping merged with BASE_EXCEPTIONS via update_exc() in the defaults.
TOKENIZER_EXCEPTIONS = _exc