from ...symbols import NORM, ORTH
from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS

_exc = {}

# Source https://www.cs.tut.fi/~jkorpela/kielenopas/5.5.html
for exc_data in [
    {ORTH: "aik."},
    {ORTH: "alk."},
    {ORTH: "alv."},
    {ORTH: "ark."},
    {ORTH: "as."},
    {ORTH: "eaa."},
    {ORTH: "ed."},
    {ORTH: "esim."},
    {ORTH: "huom."},
    {ORTH: "jne."},
    {ORTH: "joht."},
    {ORTH: "k."},
    {ORTH: "ks."},
    {ORTH: "lk."},
    {ORTH: "lkm."},
    {ORTH: "lyh."},
    {ORTH: "läh."},
    {ORTH: "miel."},
    {ORTH: "milj."},
    {ORTH: "Mm."},
    {ORTH: "mm."},
    {ORTH: "myöh."},
    {ORTH: "n."},
    {ORTH: "nimim."},
    {ORTH: "n:o"},
    {ORTH: "N:o"},
    {ORTH: "nro"},
    {ORTH: "ns."},
    {ORTH: "nyk."},
    {ORTH: "oik."},
    {ORTH: "os."},
    {ORTH: "p."},
    {ORTH: "par."},
    {ORTH: "per."},
    {ORTH: "pj."},
    {ORTH: "puh.joht."},
    {ORTH: "prof."},
    {ORTH: "puh."},
    {ORTH: "pvm."},
    {ORTH: "rak."},
    {ORTH: "ry."},
    {ORTH: "s."},
    {ORTH: "siht."},
    {ORTH: "synt."},
    {ORTH: "t."},
    {ORTH: "tark."},
    {ORTH: "til."},
    {ORTH: "tms."},
    {ORTH: "toim."},
    {ORTH: "v."},
    {ORTH: "vas."},
    {ORTH: "vast."},
    {ORTH: "vrt."},
    {ORTH: "yht."},
    {ORTH: "yl."},
    {ORTH: "ym."},
    {ORTH: "yms."},
    {ORTH: "yo."},
    {ORTH: "yliopp."},
    {ORTH: "ao."},
    {ORTH: "em."},
    {ORTH: "ko."},
    {ORTH: "ml."},
    {ORTH: "po."},
    {ORTH: "so."},
    {ORTH: "ts."},
    {ORTH: "vm."},
    {ORTH: "srk."},
]:
    _exc[exc_data[ORTH]] = [exc_data]

# Source: https://kaino.kotus.fi/visk/sisallys.php?p=141
conj_contraction_bases = [
    ("ett", "että"),
    ("jott", "jotta"),
    ("kosk", "koska"),
    ("mutt", "mutta"),
    ("vaikk", "vaikka"),
    ("ehk", "ehkä"),
    ("miks", "miksi"),
    ("siks", "siksi"),
    ("joll", "jos"),
    ("ell", "jos"),
]
conj_contraction_negations = [
    ("en", "en"),
    ("et", "et"),
    ("ei", "ei"),
    ("emme", "emme"),
    ("ette", "ette"),
    ("eivat", "eivät"),
    ("eivät", "eivät"),
]

for base_lower, base_norm in conj_contraction_bases:
    for base in [base_lower, base_lower.title()]:
        for suffix, suffix_norm in conj_contraction_negations:
            _exc[base + suffix] = [
                {ORTH: base, NORM: base_norm},
                {ORTH: suffix, NORM: suffix_norm},
            ]

TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
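

# A minimal usage sketch, assuming spaCy is installed and this module is
# loaded as part of the Finnish language data (as in spaCy's own package
# layout). It illustrates what the exceptions above do at tokenization time:
# a conjunction-negation contraction such as "ettei" is split into
# "ett" + "ei", while an abbreviation such as "esim." stays a single token.
# The sentence and the expected token list below are illustrative only.
if __name__ == "__main__":
    import spacy

    nlp = spacy.blank("fi")
    doc = nlp("Hän sanoi, ettei esim. tule.")
    print([t.text for t in doc])
    # Expected output (roughly):
    # ['Hän', 'sanoi', ',', 'ett', 'ei', 'esim.', 'tule', '.']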