spaCy/spacy/lang/lt/tokenizer_exceptions.py
Rokas Ramanauskas 61ce126d4c Lithuanian language support (#3895)
* initial LT lang support

* Added more stopwords. Started setting up some basic test environment (not complete)

* Initial morph rules for LT lang

* Closes #1 Adds tokenizer exceptions for Lithuanian

* Closes #5 Punctuation rules. Closes #6 Lexical Attributes

* test: add native examples to basic tests

* feat: add tag map for lt lang

* fix: remove undefined tag attribute 'Definite'

* feat: add lemmatizer for lt lang

* refactor: add new instances to lt lang morph rules; use tags from tag map

* refactor: add morph rules to lt lang defaults

* refactor: only keep nouns, verbs, adverbs and adjectives in lt lang lemmatizer lookup

* refactor: add capitalized words to lt lang lemmatizer

* refactor: add more num words to lt lang lex attrs

* refactor: update lt lang stop word set

* refactor: add new instances to lt lang tokenizer exceptions

* refactor: remove comments from lt lang init file

* refactor: use function instead of lambda in lt lex lang getter

* refactor: remove conversion to dict in lt init when dict is already provided

* chore: rename lt 'test_basic' to 'test_text'

* feat: add more lt text tests

* feat: add lemmatizer tests

* refactor: remove unused imports, add newline to end of file

* chore: add contributor agreement

* chore: change 'en' to 'lt' in lt example description

* fix: add missing encoding info

* style: add newline to end of file

* refactor: use python2 compatible syntax

* style: reformat code using black
2019-07-08 10:25:22 +02:00

269 lines
3.3 KiB
Python

# coding: utf8
from __future__ import unicode_literals
from ...symbols import ORTH
# Lithuanian tokenizer exceptions.
#
# Every string listed below becomes a key of ``_exc`` whose value is a
# one-element list ``[{ORTH: <string>}]``, i.e. the exception describes the
# string as a single piece with its own surface form. The resulting dict is
# exported as ``TOKENIZER_EXCEPTIONS``.
_exc = {}

for orth in [
    "G.",
    # NOTE(review): this and a few later entries contain spaces; confirm
    # that the tokenizer can actually match exception keys with whitespace.
    "J. E.",
    "J. Em.",
    "J.E.",
    "J.Em.",
    "K.",
    "N.",
    "V.",
    "Vt.",
    "a.",
    "a.k.",
    "a.s.",
    "adv.",
    "akad.",
    "aklg.",
    "akt.",
    "al.",
    "ang.",
    "angl.",
    "aps.",
    "apskr.",
    "apyg.",
    "arbat.",
    "asist.",
    "asm.",
    "asm.k.",
    "asmv.",
    "atk.",
    "atsak.",
    "atsisk.",
    "atsisk.sąsk.",
    "atv.",
    "aut.",
    "avd.",
    "b.k.",
    "baud.",
    "biol.",
    "bkl.",
    "bot.",
    "bt.",
    "buv.",
    "ch.",
    "chem.",
    "corp.",
    "d.",
    "dab.",
    "dail.",
    "dek.",
    "deš.",
    "dir.",
    "dirig.",
    "doc.",
    "dol.",
    "dr.",
    "drp.",
    "dvit.",
    "dėst.",
    "dš.",
    "dž.",
    "e.b.",
    "e.bankas",
    "e.p.",
    "e.parašas",
    "e.paštas",
    "e.v.",
    "e.valdžia",
    "egz.",
    "eil.",
    "ekon.",
    "el.",
    "el.bankas",
    "el.p.",
    "el.parašas",
    "el.paštas",
    "el.valdžia",
    "etc.",
    "ež.",
    "fak.",
    "faks.",
    "feat.",
    "filol.",
    "filos.",
    "g.",
    "gen.",
    "geol.",
    "gerb.",
    "gim.",
    "gr.",
    "gv.",
    "gyd.",
    "gyv.",
    "habil.",
    "inc.",
    "insp.",
    "inž.",
    "ir pan.",
    "ir t. t.",
    "isp.",
    "istor.",
    "it.",
    "just.",
    "k.",
    "k. a.",
    "k.a.",
    "kab.",
    "kand.",
    "kart.",
    "kat.",
    "ketv.",
    "kh.",
    "kl.",
    "kln.",
    "km.",
    "kn.",
    "koresp.",
    "kpt.",
    "kr.",
    "kt.",
    "kub.",
    "kun.",
    "kv.",
    "kyš.",
    "l. e. p.",
    "l.e.p.",
    "lenk.",
    "liet.",
    "lot.",
    "lt.",
    "ltd.",
    "ltn.",
    "m.",
    # Correctly punctuated form (compare "p.m.e." below); it was missing
    # because the only entry had a doubled trailing period.
    "m.e.",
    # Legacy misspelling, kept so existing lookups of this key still work.
    "m.e..",
    "m.m.",
    "mat.",
    "med.",
    "mgnt.",
    "mgr.",
    "min.",
    "mjr.",
    "ml.",
    "mln.",
    "mlrd.",
    "mob.",
    "mok.",
    "moksl.",
    "mokyt.",
    "mot.",
    "mr.",
    "mst.",
    "mstl.",
    "mėn.",
    "nkt.",
    "no.",
    "nr.",
    "ntk.",
    "nuotr.",
    "op.",
    "org.",
    "orig.",
    "p.",
    "p.d.",
    "p.m.e.",
    "p.s.",
    "pab.",
    "pan.",
    "past.",
    "pav.",
    "pavad.",
    "per.",
    "perd.",
    "pirm.",
    "pl.",
    "plg.",
    "plk.",
    "pr.",
    "pr.Kr.",
    "pranc.",
    "proc.",
    "prof.",
    "prom.",
    "prot.",
    "psl.",
    "pss.",
    "pvz.",
    "pšt.",
    "r.",
    "raj.",
    "red.",
    "rez.",
    "rež.",
    "rus.",
    "rš.",
    "s.",
    "sav.",
    "saviv.",
    "sek.",
    "sekr.",
    "sen.",
    "sh.",
    "sk.",
    "skg.",
    "skv.",
    "skyr.",
    "sp.",
    "spec.",
    "sr.",
    "st.",
    "str.",
    "stud.",
    "sąs.",
    "t.",
    "t. p.",
    "t. y.",
    "t.p.",
    "t.t.",
    "t.y.",
    "techn.",
    "tel.",
    "teol.",
    "th.",
    "tir.",
    "trit.",
    "trln.",
    "tšk.",
    "tūks.",
    "tūkst.",
    "up.",
    "upl.",
    "v.s.",
    "vad.",
    "val.",
    "valg.",
    "ved.",
    "vert.",
    "vet.",
    "vid.",
    "virš.",
    "vlsč.",
    "vnt.",
    "vok.",
    "vs.",
    "vtv.",
    "vv.",
    "vyr.",
    "vyresn.",
    "zool.",
    # NOTE(review): the only entry without a trailing period -- confirm
    # whether "Įn." was intended.
    "Įn",
    "įl.",
    "š.m.",
    "šnek.",
    "šv.",
    "švč.",
    "ž.ū.",
    "žin.",
    "žml.",
    "žr.",
]:
    # One exception per string: a single piece whose ORTH is the string itself.
    _exc[orth] = [{ORTH: orth}]

# Public name under which this module exposes the exception table.
TOKENIZER_EXCEPTIONS = _exc