mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	<!--- Provide a general summary of your changes in the title. --> ## Description - [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files. - [x] Update flake8 config to exclude very large files (lemmatization tables etc.) - [x] Update code to be compatible with flake8 rules - [x] Fix various small bugs, inconsistencies and messy stuff in the language data - [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means) Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results. At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information. ### Types of change enhancement, code style ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
		
			
				
	
	
		
			175 lines
		
	
	
		
			2.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			175 lines
		
	
	
		
			2.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
# encoding: utf8
 | 
						|
from __future__ import unicode_literals
 | 
						|
 | 
						|
from ...symbols import ORTH, LEMMA
 | 
						|
 | 
						|
 | 
						|
# Exception table keyed by surface string. Each value is a one-item list
# holding the attribute dict for that string. The entries created here set
# both ORTH (the string itself) and LEMMA (the spelled-out form).
_exc = {
    abbrev: [{ORTH: abbrev, LEMMA: lemma}]
    for abbrev, lemma in (
        ("jan.", "januar"),
        ("feb.", "februar"),
        ("jul.", "juli"),
    )
}
 | 
						|
 | 
						|
 | 
						|
# Each string below is added to the table as a one-item list whose only dict
# sets ORTH to the string itself (no LEMMA or other attributes).
_exc.update(
    (abbrev, [{ORTH: abbrev}])
    for abbrev in (
        "adm.dir.",
        "a.m.",
        "Aq.",
        "b.c.",
        "bl.a.",
        "bla.",
        "bm.",
        "bto.",
        "ca.",
        "cand.mag.",
        "c.c.",
        "co.",
        "d.d.",
        "dept.",
        "d.m.",
        "dr.philos.",
        "dvs.",
        "d.y.",
        "E. coli",
        "eg.",
        "ekskl.",
        "e.Kr.",
        "el.",
        "e.l.",
        "et.",
        "etg.",
        "ev.",
        "evt.",
        "f.",
        "f.eks.",
        "fhv.",
        "fk.",
        "f.Kr.",
        "f.o.m.",
        "foreg.",
        "fork.",
        "fv.",
        "fvt.",
        "g.",
        "gt.",
        "gl.",
        "gno.",
        "gnr.",
        "grl.",
        "hhv.",
        "hoh.",
        "hr.",
        "h.r.adv.",
        "ifb.",
        "ifm.",
        "iht.",
        "inkl.",
        "istf.",
        "jf.",
        "jr.",
        "jun.",
        "kfr.",
        "kgl.res.",
        "kl.",
        "komm.",
        "kst.",
        "lø.",
        "ma.",
        "mag.art.",
        "m.a.o.",
        "md.",
        "mfl.",
        "mill.",
        "min.",
        "m.m.",
        "mnd.",
        "moh.",
        "Mr.",
        "muh.",
        "mv.",
        "mva.",
        "ndf.",
        "no.",
        "nov.",
        "nr.",
        "nto.",
        "nyno.",
        "n.å.",
        "o.a.",
        "off.",
        "ofl.",
        "okt.",
        "o.l.",
        "on.",
        "op.",
        "osv.",
        "ovf.",
        "p.",
        "p.a.",
        "Pb.",
        "pga.",
        "ph.d.",
        "pkt.",
        "p.m.",
        "pr.",
        "pst.",
        "p.t.",
        "red.anm.",
        "ref.",
        "res.",
        "res.kap.",
        "resp.",
        "rv.",
        "s.",
        "s.d.",
        "sen.",
        "sep.",
        "siviling.",
        "sms.",
        "spm.",
        "sr.",
        "sst.",
        "st.",
        "stip.",
        "stk.",
        "st.meld.",
        "st.prp.",
        "stud.",
        "s.u.",
        "sv.",
        "sø.",
        "s.å.",
        "såk.",
        "temp.",
        "ti.",
        "tils.",
        "tilsv.",
        "tl;dr",
        "tlf.",
        "to.",
        "t.o.m.",
        "ult.",
        "utg.",
        "v.",
        "vedk.",
        "vedr.",
        "vg.",
        "vgs.",
        "vha.",
        "vit.ass.",
        "vn.",
        "vol.",
        "vs.",
        "vsa.",
        "årg.",
        "årh.",
    )
)


# Public name for the finished table; it is the same dict object as _exc.
TOKENIZER_EXCEPTIONS = _exc
 |