spaCy/spacy/lang/lb/tokenizer_exceptions.py

53 lines
1.1 KiB
Python
Raw Normal View History

from ...symbols import NORM, ORTH
from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
# TODO
# treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions)
2019-11-20 15:15:24 +03:00
_exc = {}
# translate / delete what is not necessary
for exc_data in [
{ORTH: "t", NORM: "et"},
{ORTH: "T", NORM: "et"},
{ORTH: "'t", NORM: "et"},
{ORTH: "'T", NORM: "et"},
{ORTH: "wgl.", NORM: "wannechgelift"},
{ORTH: "M.", NORM: "Monsieur"},
{ORTH: "Mme.", NORM: "Madame"},
{ORTH: "Dr.", NORM: "Dokter"},
{ORTH: "Tel.", NORM: "Telefon"},
{ORTH: "asw.", NORM: "an sou weider"},
{ORTH: "etc.", NORM: "et cetera"},
{ORTH: "bzw.", NORM: "bezéiungsweis"},
{ORTH: "Jan.", NORM: "Januar"},
2019-10-18 12:27:38 +03:00
]:
_exc[exc_data[ORTH]] = [exc_data]
# to be extended
for orth in [
2019-10-18 12:27:38 +03:00
"z.B.",
"Dipl.",
"Dr.",
"etc.",
"i.e.",
"o.k.",
"O.K.",
"p.a.",
"p.s.",
"P.S.",
"phil.",
"q.e.d.",
"R.I.P.",
"rer.",
"sen.",
"ë.a.",
"U.S.",
"U.S.A.",
]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)