spaCy/spacy/lang/fi/tokenizer_exceptions.py

from ...symbols import NORM, ORTH
from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS

# Finnish abbreviations, written exactly as they appear in text. The table is
# whitespace-separated, so no entry may contain a space.
# Source: https://www.cs.tut.fi/~jkorpela/kielenopas/5.5.html
_abbreviations = """
aik. alk. alv. ark. as. eaa. ed. esim. huom. jne. joht. k. ks. lk. lkm. lyh. läh.
miel. milj. Mm. mm. myöh. n. nimim. n:o N:o nro ns. nyk. oik. os. p. par. per. pj.
puh.joht. prof. puh. pvm. rak. ry. s. siht. synt. t. tark. til. tms. toim. v. vas.
vast. vrt. yht. yl. ym. yms. yo. yliopp. ao. em. ko. ml. po. so. ts. vm. srk.
""".split()

# Each abbreviation maps to a one-element analysis whose ORTH is the
# abbreviation itself; insertion order follows the table above.
_exc = {abbreviation: [{ORTH: abbreviation}] for abbreviation in _abbreviations}

# Contractions of a conjunction/adverb stem with a negative verb form, e.g.
# "ett" + "ei" -> "ettei", analysed as two tokens normalised to "että" + "ei".
# Source: https://kaino.kotus.fi/visk/sisallys.php?p=141
#
# (contracted stem as written, NORM assigned to the stem token)
conj_contraction_bases = [
    ("ett", "että"),
    ("jott", "jotta"),
    ("kosk", "koska"),
    ("mutt", "mutta"),
    ("vaikk", "vaikka"),
    ("ehk", "ehkä"),
    ("miks", "miksi"),
    ("siks", "siksi"),
    ("joll", "jos"),
    ("ell", "jos"),
]
# (negative verb form as written, NORM assigned to it); the umlaut-less
# spelling "eivat" is normalised to "eivät".
conj_contraction_negations = [
    ("en", "en"),
    ("et", "et"),
    ("ei", "ei"),
    ("emme", "emme"),
    ("ette", "ette"),
    ("eivat", "eivät"),
    ("eivät", "eivät"),
]
# NOTE(review): some generated keys (e.g. "kosken", "Ellen") look like they
# coincide with ordinary words or names - confirm the split is intended there.
for stem, stem_norm in conj_contraction_bases:
    # Register the stem both as listed (lowercase) and title-cased.
    for cased_stem in (stem, stem.title()):
        _exc.update(
            {
                cased_stem + negation: [
                    {ORTH: cased_stem, NORM: stem_norm},
                    {ORTH: negation, NORM: negation_norm},
                ]
                for negation, negation_norm in conj_contraction_negations
            }
        )

# Exception table exported by this module: the shared BASE_EXCEPTIONS combined
# with the Finnish-specific entries collected in _exc above. The combining is
# done by update_exc (imported from ...util); its exact merge/validation rules
# are not visible in this file - presumably later arguments take precedence.
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
isort all the things 2023-06-26 12:41:03 +03:00			`from ...symbols import NORM, ORTH`
Tidy up and move noun_chunks, token_match, url_match 2020-07-22 23:18:46 +03:00			`from ...util import update_exc`
isort all the things 2023-06-26 12:41:03 +03:00			`from ..tokenizer_exceptions import BASE_EXCEPTIONS`
Reorganise Finnish language data 2017-05-08 16:48:31 +03:00
			`_exc = {}`

Add preliminary support for Finnish 2017-02-01 01:27:29 +03:00
			`# Source https://www.cs.tut.fi/~jkorpela/kielenopas/5.5.html`
Reorganise Finnish language data 2017-05-08 16:48:31 +03:00			`for exc_data in [`
Remove POS, TAG and LEMMA from tokenizer exceptions 2020-07-23 00:09:01 +03:00			`{ORTH: "aik."},`
			`{ORTH: "alk."},`
			`{ORTH: "alv."},`
			`{ORTH: "ark."},`
			`{ORTH: "as."},`
			`{ORTH: "eaa."},`
			`{ORTH: "ed."},`
			`{ORTH: "esim."},`
			`{ORTH: "huom."},`
			`{ORTH: "jne."},`
			`{ORTH: "joht."},`
			`{ORTH: "k."},`
			`{ORTH: "ks."},`
			`{ORTH: "lk."},`
			`{ORTH: "lkm."},`
			`{ORTH: "lyh."},`
			`{ORTH: "läh."},`
			`{ORTH: "miel."},`
			`{ORTH: "milj."},`
			`{ORTH: "Mm."},`
			`{ORTH: "mm."},`
			`{ORTH: "myöh."},`
			`{ORTH: "n."},`
			`{ORTH: "nimim."},`
			`{ORTH: "n:o"},`
			`{ORTH: "N:o"},`
			`{ORTH: "nro"},`
			`{ORTH: "ns."},`
			`{ORTH: "nyk."},`
			`{ORTH: "oik."},`
			`{ORTH: "os."},`
			`{ORTH: "p."},`
			`{ORTH: "par."},`
			`{ORTH: "per."},`
			`{ORTH: "pj."},`
			`{ORTH: "puh.joht."},`
			`{ORTH: "prof."},`
			`{ORTH: "puh."},`
			`{ORTH: "pvm."},`
			`{ORTH: "rak."},`
			`{ORTH: "ry."},`
			`{ORTH: "s."},`
			`{ORTH: "siht."},`
			`{ORTH: "synt."},`
			`{ORTH: "t."},`
			`{ORTH: "tark."},`
			`{ORTH: "til."},`
			`{ORTH: "tms."},`
			`{ORTH: "toim."},`
			`{ORTH: "v."},`
			`{ORTH: "vas."},`
			`{ORTH: "vast."},`
			`{ORTH: "vrt."},`
			`{ORTH: "yht."},`
			`{ORTH: "yl."},`
			`{ORTH: "ym."},`
			`{ORTH: "yms."},`
			`{ORTH: "yo."},`
			`{ORTH: "yliopp."},`
			`{ORTH: "ao."},`
			`{ORTH: "em."},`
			`{ORTH: "ko."},`
			`{ORTH: "ml."},`
			`{ORTH: "po."},`
			`{ORTH: "so."},`
			`{ORTH: "ts."},`
			`{ORTH: "vm."},`
			`{ORTH: "srk."},`
💫 Tidy up and auto-format .py files (#2983) <!--- Provide a general summary of your changes in the title. --> ## Description - [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files. - [x] Update flake8 config to exclude very large files (lemmatization tables etc.) - [x] Update code to be compatible with flake8 rules - [x] Fix various small bugs, inconsistencies and messy stuff in the language data - [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means) Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results. At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information. ### Types of change enhancement, code style ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. 2018-11-30 19:03:03 +03:00			`]:`
Tidy up tokenizer exceptions 2017-11-02 01:02:45 +03:00			`_exc[exc_data[ORTH]] = [exc_data]`
Reorganise Finnish language data 2017-05-08 16:48:31 +03:00
[Finnish tokenizer] Handle conjunction contractions (#8105) 2021-06-16 11:56:47 +03:00			`# Source: https://kaino.kotus.fi/visk/sisallys.php?p=141`
			`conj_contraction_bases = [`
Tidy up code 2021-06-28 12:48:00 +03:00			`("ett", "että"),`
			`("jott", "jotta"),`
			`("kosk", "koska"),`
			`("mutt", "mutta"),`
			`("vaikk", "vaikka"),`
			`("ehk", "ehkä"),`
			`("miks", "miksi"),`
			`("siks", "siksi"),`
			`("joll", "jos"),`
			`("ell", "jos"),`
[Finnish tokenizer] Handle conjunction contractions (#8105) 2021-06-16 11:56:47 +03:00			`]`
			`conj_contraction_negations = [`
Tidy up code 2021-06-28 12:48:00 +03:00			`("en", "en"),`
			`("et", "et"),`
			`("ei", "ei"),`
			`("emme", "emme"),`
			`("ette", "ette"),`
			`("eivat", "eivät"),`
			`("eivät", "eivät"),`
			`]`
[Finnish tokenizer] Handle conjunction contractions (#8105) 2021-06-16 11:56:47 +03:00			`for (base_lower, base_norm) in conj_contraction_bases:`
			`for base in [base_lower, base_lower.title()]:`
			`for (suffix, suffix_norm) in conj_contraction_negations:`
Tidy up code 2021-06-28 12:48:00 +03:00			`_exc[base + suffix] = [`
			`{ORTH: base, NORM: base_norm},`
			`{ORTH: suffix, NORM: suffix_norm},`
			`]`
Reorganise Finnish language data 2017-05-08 16:48:31 +03:00
Tidy up and move noun_chunks, token_match, url_match 2020-07-22 23:18:46 +03:00			`TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)`