from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import ORTH from ...util import update_exc from .punctuation import _make_ro_variants _exc = {} # Source: https://en.wiktionary.org/wiki/Category:Romanian_abbreviations for orth in [ "1-a", "2-a", "3-a", "4-a", "5-a", "6-a", "7-a", "8-a", "9-a", "10-a", "11-a", "12-a", "1-ul", "2-lea", "3-lea", "4-lea", "5-lea", "6-lea", "7-lea", "8-lea", "9-lea", "10-lea", "11-lea", "12-lea", "d-voastră", "dvs.", "ing.", "dr.", "Rom.", "str.", "nr.", "etc.", "d.p.d.v.", "dpdv", "șamd.", "ș.a.m.d.", # below: from UD_Romanian-RRT: "A.c.", "A.f.", "A.r.", "Al.", "Art.", "Aug.", "Bd.", "Dem.", "Dr.", "Fig.", "Fr.", "Gh.", "Gr.", "Lt.", "Nr.", "Obs.", "Prof.", "Sf.", "a.m.", "a.r.", "alin.", "art.", "d-l", "d-lui", "d-nei", "ex.", "fig.", "ian.", "lit.", "lt.", "p.a.", "p.m.", "pct.", "prep.", "sf.", "tel.", "univ.", "îngr.", "într-adevăr", "Șt.", "ș.a.", ]: # note: does not distinguish capitalized-only exceptions from others for variant in _make_ro_variants([orth]): _exc[variant] = [{ORTH: variant}] TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)