mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			157 lines
		
	
	
		
			3.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			157 lines
		
	
	
		
			3.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | |
| from ...symbols import NORM, ORTH
 | |
| from ...util import update_exc
 | |
| 
 | |
| _exc = {}
 | |
| 
 | |
| 
 | |
| # Verbs
 | |
| 
 | |
| for verb_data in [
 | |
|     {ORTH: "driver"},
 | |
|     {ORTH: "kör"},
 | |
|     {ORTH: "hörr"},
 | |
|     {ORTH: "fattar"},
 | |
|     {ORTH: "hajar"},
 | |
|     {ORTH: "lever"},
 | |
|     {ORTH: "serr"},
 | |
|     {ORTH: "fixar"},
 | |
| ]:
 | |
|     verb_data_tc = dict(verb_data)
 | |
|     verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
 | |
|     for data in [verb_data, verb_data_tc]:
 | |
|         _exc[data[ORTH] + "u"] = [data, {ORTH: "u", NORM: "du"}]
 | |
| 
 | |
| # Abbreviations for weekdays "sön." (for "söndag" / "söner")
 | |
| # are left out because they are ambiguous. The same is the case
 | |
| # for abbreviations "jul." and "Jul." ("juli" / "jul").
 | |
| for exc_data in [
 | |
|     {ORTH: "jan.", NORM: "januari"},
 | |
|     {ORTH: "febr.", NORM: "februari"},
 | |
|     {ORTH: "feb.", NORM: "februari"},
 | |
|     {ORTH: "apr.", NORM: "april"},
 | |
|     {ORTH: "jun.", NORM: "juni"},
 | |
|     {ORTH: "aug.", NORM: "augusti"},
 | |
|     {ORTH: "sept.", NORM: "september"},
 | |
|     {ORTH: "sep.", NORM: "september"},
 | |
|     {ORTH: "okt.", NORM: "oktober"},
 | |
|     {ORTH: "nov.", NORM: "november"},
 | |
|     {ORTH: "dec.", NORM: "december"},
 | |
|     {ORTH: "mån.", NORM: "måndag"},
 | |
|     {ORTH: "tis.", NORM: "tisdag"},
 | |
|     {ORTH: "ons.", NORM: "onsdag"},
 | |
|     {ORTH: "tors.", NORM: "torsdag"},
 | |
|     {ORTH: "fre.", NORM: "fredag"},
 | |
|     {ORTH: "lör.", NORM: "lördag"},
 | |
|     {ORTH: "Jan.", NORM: "Januari"},
 | |
|     {ORTH: "Febr.", NORM: "Februari"},
 | |
|     {ORTH: "Feb.", NORM: "Februari"},
 | |
|     {ORTH: "Apr.", NORM: "April"},
 | |
|     {ORTH: "Jun.", NORM: "Juni"},
 | |
|     {ORTH: "Aug.", NORM: "Augusti"},
 | |
|     {ORTH: "Sept.", NORM: "September"},
 | |
|     {ORTH: "Sep.", NORM: "September"},
 | |
|     {ORTH: "Okt.", NORM: "Oktober"},
 | |
|     {ORTH: "Nov.", NORM: "November"},
 | |
|     {ORTH: "Dec.", NORM: "December"},
 | |
|     {ORTH: "Mån.", NORM: "Måndag"},
 | |
|     {ORTH: "Tis.", NORM: "Tisdag"},
 | |
|     {ORTH: "Ons.", NORM: "Onsdag"},
 | |
|     {ORTH: "Tors.", NORM: "Torsdag"},
 | |
|     {ORTH: "Fre.", NORM: "Fredag"},
 | |
|     {ORTH: "Lör.", NORM: "Lördag"},
 | |
|     {ORTH: "sthlm", NORM: "Stockholm"},
 | |
|     {ORTH: "gbg", NORM: "Göteborg"},
 | |
| ]:
 | |
|     _exc[exc_data[ORTH]] = [exc_data]
 | |
| 
 | |
| 
 | |
| # Specific case abbreviations only
 | |
| for orth in ["AB", "Dr.", "H.M.", "H.K.H.", "m/s", "M/S", "Ph.d.", "S:t", "s:t"]:
 | |
|     _exc[orth] = [{ORTH: orth}]
 | |
| 
 | |
| 
 | |
| ABBREVIATIONS = [
 | |
|     "ang",
 | |
|     "anm",
 | |
|     "bl.a",
 | |
|     "d.v.s",
 | |
|     "doc",
 | |
|     "dvs",
 | |
|     "e.d",
 | |
|     "e.kr",
 | |
|     "el.",
 | |
|     "eng",
 | |
|     "etc",
 | |
|     "exkl",
 | |
|     "ev",
 | |
|     "f.",
 | |
|     "f.d",
 | |
|     "f.kr",
 | |
|     "f.n",
 | |
|     "f.ö",
 | |
|     "fid",
 | |
|     "fig",
 | |
|     "forts",
 | |
|     "fr.o.m",
 | |
|     "förf",
 | |
|     "inkl",
 | |
|     "iofs",
 | |
|     "jur.",
 | |
|     "kap",
 | |
|     "kl",
 | |
|     "kor.",
 | |
|     "kr",
 | |
|     "kungl",
 | |
|     "lat",
 | |
|     "m.a.o",
 | |
|     "m.fl",
 | |
|     "m.m",
 | |
|     "max",
 | |
|     "milj",
 | |
|     "min.",
 | |
|     "mos",
 | |
|     "mt",
 | |
|     "mvh",
 | |
|     "o.d",
 | |
|     "o.s.v",
 | |
|     "obs",
 | |
|     "osv",
 | |
|     "p.g.a",
 | |
|     "proc",
 | |
|     "prof",
 | |
|     "ref",
 | |
|     "resp",
 | |
|     "s.a.s",
 | |
|     "s.k",
 | |
|     "s.t",
 | |
|     "sid",
 | |
|     "t.ex",
 | |
|     "t.h",
 | |
|     "t.o.m",
 | |
|     "t.v",
 | |
|     "tel",
 | |
|     "ung.",
 | |
|     "vol",
 | |
|     "v.",
 | |
|     "äv",
 | |
|     "övers",
 | |
| ]
 | |
| 
 | |
| # Add abbreviation for trailing punctuation too. If the abbreviation already has a trailing punctuation - skip it.
 | |
| for abbr in ABBREVIATIONS:
 | |
|     if not abbr.endswith("."):
 | |
|         ABBREVIATIONS.append(abbr + ".")
 | |
| 
 | |
| for orth in ABBREVIATIONS:
 | |
|     _exc[orth] = [{ORTH: orth}]
 | |
|     capitalized = orth.capitalize()
 | |
|     _exc[capitalized] = [{ORTH: capitalized}]
 | |
| 
 | |
| # Sentences ending in "i." (as in "... peka i."), "m." (as in "...än 2000 m."),
 | |
| # should be tokenized as two separate tokens.
 | |
| for orth in ["i", "m"]:
 | |
|     _exc[orth + "."] = [{ORTH: orth, NORM: orth}, {ORTH: "."}]
 | |
| 
 | |
| TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
 |