spaCy/spacy/lang/fi/tokenizer_exceptions.py

112 lines
2.4 KiB
Python
Raw Normal View History

2023-06-26 12:41:03 +03:00
from ...symbols import NORM, ORTH
from ...util import update_exc
2023-06-26 12:41:03 +03:00
from ..tokenizer_exceptions import BASE_EXCEPTIONS
2017-05-08 16:48:31 +03:00
_exc = {}
2017-02-01 01:27:29 +03:00
# Source https://www.cs.tut.fi/~jkorpela/kielenopas/5.5.html
2017-05-08 16:48:31 +03:00
for exc_data in [
{ORTH: "aik."},
{ORTH: "alk."},
{ORTH: "alv."},
{ORTH: "ark."},
{ORTH: "as."},
{ORTH: "eaa."},
{ORTH: "ed."},
{ORTH: "esim."},
{ORTH: "huom."},
{ORTH: "jne."},
{ORTH: "joht."},
{ORTH: "k."},
{ORTH: "ks."},
{ORTH: "lk."},
{ORTH: "lkm."},
{ORTH: "lyh."},
{ORTH: "läh."},
{ORTH: "miel."},
{ORTH: "milj."},
{ORTH: "Mm."},
{ORTH: "mm."},
{ORTH: "myöh."},
{ORTH: "n."},
{ORTH: "nimim."},
{ORTH: "n:o"},
{ORTH: "N:o"},
{ORTH: "nro"},
{ORTH: "ns."},
{ORTH: "nyk."},
{ORTH: "oik."},
{ORTH: "os."},
{ORTH: "p."},
{ORTH: "par."},
{ORTH: "per."},
{ORTH: "pj."},
{ORTH: "puh.joht."},
{ORTH: "prof."},
{ORTH: "puh."},
{ORTH: "pvm."},
{ORTH: "rak."},
{ORTH: "ry."},
{ORTH: "s."},
{ORTH: "siht."},
{ORTH: "synt."},
{ORTH: "t."},
{ORTH: "tark."},
{ORTH: "til."},
{ORTH: "tms."},
{ORTH: "toim."},
{ORTH: "v."},
{ORTH: "vas."},
{ORTH: "vast."},
{ORTH: "vrt."},
{ORTH: "yht."},
{ORTH: "yl."},
{ORTH: "ym."},
{ORTH: "yms."},
{ORTH: "yo."},
{ORTH: "yliopp."},
{ORTH: "ao."},
{ORTH: "em."},
{ORTH: "ko."},
{ORTH: "ml."},
{ORTH: "po."},
{ORTH: "so."},
{ORTH: "ts."},
{ORTH: "vm."},
{ORTH: "srk."},
]:
2017-11-02 01:02:45 +03:00
_exc[exc_data[ORTH]] = [exc_data]
2017-05-08 16:48:31 +03:00
# Source: https://kaino.kotus.fi/visk/sisallys.php?p=141
conj_contraction_bases = [
2021-06-28 12:48:00 +03:00
("ett", "että"),
("jott", "jotta"),
("kosk", "koska"),
("mutt", "mutta"),
("vaikk", "vaikka"),
("ehk", "ehkä"),
("miks", "miksi"),
("siks", "siksi"),
("joll", "jos"),
("ell", "jos"),
]
conj_contraction_negations = [
2021-06-28 12:48:00 +03:00
("en", "en"),
("et", "et"),
("ei", "ei"),
("emme", "emme"),
("ette", "ette"),
("eivat", "eivät"),
("eivät", "eivät"),
]
for (base_lower, base_norm) in conj_contraction_bases:
for base in [base_lower, base_lower.title()]:
for (suffix, suffix_norm) in conj_contraction_negations:
2021-06-28 12:48:00 +03:00
_exc[base + suffix] = [
{ORTH: base, NORM: base_norm},
{ORTH: suffix, NORM: suffix_norm},
]
2017-05-08 16:48:31 +03:00
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)