mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 20:28:20 +03:00
eddeb36c96
<!--- Provide a general summary of your changes in the title. --> ## Description - [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files. - [x] Update flake8 config to exclude very large files (lemmatization tables etc.) - [x] Update code to be compatible with flake8 rules - [x] Fix various small bugs, inconsistencies and messy stuff in the language data - [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means) Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results. At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information. ### Types of change enhancement, code style ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
175 lines
2.2 KiB
Python
175 lines
2.2 KiB
Python
# encoding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
from ...symbols import ORTH, LEMMA
|
|
|
|
|
|
# Maps each exception string to the list of token-attribute dicts that the
# string is turned into.
_exc = {}


# Abbreviations that are registered together with an explicit LEMMA value.
for abbrev, lemma in (
    ("jan.", "januar"),
    ("feb.", "februar"),
    ("jul.", "juli"),
):
    _exc[abbrev] = [{ORTH: abbrev, LEMMA: lemma}]
|
|
|
|
|
|
# Abbreviations that need no extra attributes: every string is registered as
# a single-entry list whose only attribute is ORTH, set to the string itself.
_exc.update(
    (abbrev, [{ORTH: abbrev}])
    for abbrev in (
        "adm.dir.",
        "a.m.",
        "Aq.",
        "b.c.",
        "bl.a.",
        "bla.",
        "bm.",
        "bto.",
        "ca.",
        "cand.mag.",
        "c.c.",
        "co.",
        "d.d.",
        "dept.",
        "d.m.",
        "dr.philos.",
        "dvs.",
        "d.y.",
        "E. coli",
        "eg.",
        "ekskl.",
        "e.Kr.",
        "el.",
        "e.l.",
        "et.",
        "etg.",
        "ev.",
        "evt.",
        "f.",
        "f.eks.",
        "fhv.",
        "fk.",
        "f.Kr.",
        "f.o.m.",
        "foreg.",
        "fork.",
        "fv.",
        "fvt.",
        "g.",
        "gt.",
        "gl.",
        "gno.",
        "gnr.",
        "grl.",
        "hhv.",
        "hoh.",
        "hr.",
        "h.r.adv.",
        "ifb.",
        "ifm.",
        "iht.",
        "inkl.",
        "istf.",
        "jf.",
        "jr.",
        "jun.",
        "kfr.",
        "kgl.res.",
        "kl.",
        "komm.",
        "kst.",
        "lø.",
        "ma.",
        "mag.art.",
        "m.a.o.",
        "md.",
        "mfl.",
        "mill.",
        "min.",
        "m.m.",
        "mnd.",
        "moh.",
        "Mr.",
        "muh.",
        "mv.",
        "mva.",
        "ndf.",
        "no.",
        "nov.",
        "nr.",
        "nto.",
        "nyno.",
        "n.å.",
        "o.a.",
        "off.",
        "ofl.",
        "okt.",
        "o.l.",
        "on.",
        "op.",
        "osv.",
        "ovf.",
        "p.",
        "p.a.",
        "Pb.",
        "pga.",
        "ph.d.",
        "pkt.",
        "p.m.",
        "pr.",
        "pst.",
        "p.t.",
        "red.anm.",
        "ref.",
        "res.",
        "res.kap.",
        "resp.",
        "rv.",
        "s.",
        "s.d.",
        "sen.",
        "sep.",
        "siviling.",
        "sms.",
        "spm.",
        "sr.",
        "sst.",
        "st.",
        "stip.",
        "stk.",
        "st.meld.",
        "st.prp.",
        "stud.",
        "s.u.",
        "sv.",
        "sø.",
        "s.å.",
        "såk.",
        "temp.",
        "ti.",
        "tils.",
        "tilsv.",
        "tl;dr",
        "tlf.",
        "to.",
        "t.o.m.",
        "ult.",
        "utg.",
        "v.",
        "vedk.",
        "vedr.",
        "vg.",
        "vgs.",
        "vha.",
        "vit.ass.",
        "vn.",
        "vol.",
        "vs.",
        "vsa.",
        "årg.",
        "årh.",
    )
)


# Public name under which the assembled exception table is exposed.
TOKENIZER_EXCEPTIONS = _exc
|