spaCy/spacy/lang/ro/tokenizer_exceptions.py

98 lines
1.4 KiB
Python
Raw Normal View History

# coding: utf8
from __future__ import unicode_literals
from ...symbols import ORTH
from .punctuation import _make_ro_variants
# Maps each exception string to a one-element list holding a single
# attribute dict whose ORTH value is that same string.
_exc = {}


# Source: https://en.wiktionary.org/wiki/Category:Romanian_abbreviations
for orth in (
    "1-a",
    "2-a",
    "3-a",
    "4-a",
    "5-a",
    "6-a",
    "7-a",
    "8-a",
    "9-a",
    "10-a",
    "11-a",
    "12-a",
    "1-ul",
    "2-lea",
    "3-lea",
    "4-lea",
    "5-lea",
    "6-lea",
    "7-lea",
    "8-lea",
    "9-lea",
    "10-lea",
    "11-lea",
    "12-lea",
    "d-voastră",
    "dvs.",
    "ing.",
    "dr.",
    "Rom.",
    "str.",
    "nr.",
    "etc.",
    "d.p.d.v.",
    "dpdv",
    "șamd.",
    "ș.a.m.d.",
    # The entries from here on were taken from UD_Romanian-RRT.
    "A.c.",
    "A.f.",
    "A.r.",
    "Al.",
    "Art.",
    "Aug.",
    "Bd.",
    "Dem.",
    "Dr.",
    "Fig.",
    "Fr.",
    "Gh.",
    "Gr.",
    "Lt.",
    "Nr.",
    "Obs.",
    "Prof.",
    "Sf.",
    "a.m.",
    "a.r.",
    "alin.",
    "art.",
    "d-l",
    "d-lui",
    "d-nei",
    "ex.",
    "fig.",
    "ian.",
    "lit.",
    "lt.",
    "p.a.",
    "p.m.",
    "pct.",
    "prep.",
    "sf.",
    "tel.",
    "univ.",
    "îngr.",
    "într-adevăr",
    "Șt.",
    "ș.a.",
):
    # Entries that should only ever appear capitalized are not treated
    # specially: each one goes through the same variant expansion as the
    # rest. The helper is called once per entry, with a one-element list.
    _exc.update(
        {variant: [{ORTH: variant}] for variant in _make_ro_variants([orth])}
    )


# Public name for the table built above (same dict object as _exc).
TOKENIZER_EXCEPTIONS = _exc