# coding: utf8
from __future__ import unicode_literals

from ...symbols import ORTH, LEMMA, NORM
from ..punctuation import TOKENIZER_PREFIXES


# TODO
# tokenize the cliticised definite article "d'" as a token of its own: d'Kanner > [d'] [Kanner]
# treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions)
# how to write the tokenisation exception for the articles d' / D'? This one is not working.
# (note: tokenizer exceptions in spaCy match exact strings only, so a general rule
# for d' + any word cannot be written as an exception; it needs a prefix rule or
# custom tokenizer logic instead)

# use the shared prefix list, minus the d'/D' variants (the cliticised article is
# meant to be handled separately, see the exceptions below)
_prefixes = [
    prefix for prefix in TOKENIZER_PREFIXES if prefix not in ["d'", "D'", "d’", "D’"]
]
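# Sanity-check sketch (illustrative only, not part of the language data): the list
# comprehension above only removes entries that appear verbatim in the shared
# prefix list, so it is worth confirming the result explicitly, e.g.:
#
#     assert "d'" not in _prefixes and "D'" not in _prefixes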
_exc = {
    "d'mannst": [
        {ORTH: "d'", LEMMA: "d'"},
        {ORTH: "mannst", LEMMA: "mann", NORM: "mann"},
    ],
    "d'éischt": [
        {ORTH: "d'", LEMMA: "d'"},
        {ORTH: "éischt", LEMMA: "éischt", NORM: "éischt"},
    ],
}
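# Illustrative note: each exception maps an exact string to the list of tokens it
# should be split into, and the ORTH values have to concatenate back to the key
# (e.g. "d'" + "mannst" == "d'mannst"). A minimal sketch of adding another form
# under the same assumption (hypothetical entry, commented out):
#
#     _exc["d'Stad"] = [{ORTH: "d'", LEMMA: "d'"}, {ORTH: "Stad", LEMMA: "Stad"}]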
# translate / delete what is not necessary
# what does PRON_LEMMA mean? (in other languages' tokenizer data it is spaCy's
# "-PRON-" placeholder lemma for pronouns; it is not imported or used here)
for exc_data in [
    {ORTH: "wgl.", LEMMA: "wann ech gelift", NORM: "wann ech gelift"},
    {ORTH: "M.", LEMMA: "Monsieur", NORM: "Monsieur"},
    {ORTH: "Mme.", LEMMA: "Madame", NORM: "Madame"},
    {ORTH: "Dr.", LEMMA: "Dokter", NORM: "Dokter"},
    {ORTH: "Tel.", LEMMA: "Telefon", NORM: "Telefon"},
    {ORTH: "asw.", LEMMA: "an sou weider", NORM: "an sou weider"},
    {ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"},
    {ORTH: "bzw.", LEMMA: "bezéiungsweis", NORM: "bezéiungsweis"},
    {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"},
]:
    _exc[exc_data[ORTH]] = [exc_data]
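# For example, after this loop _exc["Dr."] == [{ORTH: "Dr.", LEMMA: "Dokter", NORM: "Dokter"}],
# so the abbreviation stays a single token and the trailing period is not split off
# as a suffix. Note that the orth-only loop below registers "Dr." and "etc." again
# as plain entries, which overwrites the LEMMA/NORM information set here.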
# to be extended
for orth in [
    "z.B.",
    "Dipl.",
    "Dr.",
    "etc.",
    "i.e.",
    "o.k.",
    "O.K.",
    "p.a.",
    "p.s.",
    "P.S.",
    "phil.",
    "q.e.d.",
    "R.I.P.",
    "rer.",
    "sen.",
    "ë.a.",
    "U.S.",
    "U.S.A.",
]:
    _exc[orth] = [{ORTH: orth}]


TOKENIZER_PREFIXES = _prefixes
TOKENIZER_EXCEPTIONS = _exc
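# A minimal usage sketch (illustrative; assumes this module is wired into spaCy's
# Luxembourgish defaults, e.g. available as spacy.lang.lb.Luxembourgish):
#
#     from spacy.lang.lb import Luxembourgish
#     nlp = Luxembourgish()
#     doc = nlp.make_doc("fir d'éischt wgl.")
#     print([t.text for t in doc])  # expected: ["fir", "d'", "éischt", "wgl."]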