mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-14 21:57:15 +03:00
eddeb36c96
<!--- Provide a general summary of your changes in the title. --> ## Description - [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files. - [x] Update flake8 config to exclude very large files (lemmatization tables etc.) - [x] Update code to be compatible with flake8 rules - [x] Fix various small bugs, inconsistencies and messy stuff in the language data - [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means) Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results. At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information. ### Types of change enhancement, code style ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
80 lines
2.8 KiB
Python
80 lines
2.8 KiB
Python
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
from ...symbols import ORTH, LEMMA
|
|
|
|
|
|
# (abbreviation, lemma) pairs; the abbreviation is the exact surface string,
# trailing period included.
# Source https://www.cs.tut.fi/~jkorpela/kielenopas/5.5.html
_ABBREVIATIONS = (
    ("aik.", "aikaisempi"),
    ("alk.", "alkaen"),
    ("alv.", "arvonlisävero"),
    ("ark.", "arkisin"),
    ("as.", "asunto"),
    ("ed.", "edellinen"),
    ("esim.", "esimerkki"),
    ("huom.", "huomautus"),
    ("jne.", "ja niin edelleen"),
    ("joht.", "johtaja"),
    ("k.", "kuollut"),
    ("ks.", "katso"),
    ("lk.", "luokka"),
    ("lkm.", "lukumäärä"),
    ("lyh.", "lyhenne"),
    ("läh.", "lähettäjä"),
    ("miel.", "mieluummin"),
    ("milj.", "miljoona"),
    ("mm.", "muun muassa"),
    ("myöh.", "myöhempi"),
    ("n.", "noin"),
    ("nimim.", "nimimerkki"),
    ("ns.", "niin sanottu"),
    ("nyk.", "nykyinen"),
    ("oik.", "oikealla"),
    ("os.", "osoite"),
    ("p.", "päivä"),
    ("par.", "paremmin"),
    ("per.", "perustettu"),
    ("pj.", "puheenjohtaja"),
    ("puh.joht.", "puheenjohtaja"),
    ("prof.", "professori"),
    ("puh.", "puhelin"),
    ("pvm.", "päivämäärä"),
    ("rak.", "rakennettu"),
    ("ry.", "rekisteröity yhdistys"),
    ("s.", "sivu"),
    ("siht.", "sihteeri"),
    ("synt.", "syntynyt"),
    ("t.", "toivoo"),
    ("tark.", "tarkastanut"),
    ("til.", "tilattu"),
    ("tms.", "tai muuta sellaista"),
    ("toim.", "toimittanut"),
    ("v.", "vuosi"),
    ("vas.", "vasen"),
    ("vast.", "vastaus"),
    ("vrt.", "vertaa"),
    ("yht.", "yhteensä"),
    ("yl.", "yleinen"),
    ("ym.", "ynnä muuta"),
    ("yms.", "ynnä muuta sellaista"),
    ("yo.", "ylioppilas"),
    ("yliopp.", "ylioppilas"),
    ("ao.", "asianomainen"),
    ("em.", "edellä mainittu"),
    ("ko.", "kyseessä oleva"),
    ("ml.", "mukaan luettuna"),
    ("po.", "puheena oleva"),
    ("so.", "se on"),
    ("ts.", "toisin sanoen"),
    ("vm.", "viimeksi mainittu"),
    ("srk.", "seurakunta"),
)

# Keyed by the abbreviation string; each value is a one-element list holding
# the attribute dict (ORTH and LEMMA) for that abbreviation.
_exc = {
    abbreviation: [{ORTH: abbreviation, LEMMA: lemma}]
    for abbreviation, lemma in _ABBREVIATIONS
}


# Public name for the table; refers to the same dict object as `_exc`.
TOKENIZER_EXCEPTIONS = _exc
|