mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-14 05:37:03 +03:00
61ce126d4c
* initial LT lang support * Added more stopwords. Started setting up some basic test environment (not complete) * Initial morph rules for LT lang * Closes #1 Adds tokenizer exceptions for Lithuanian * Closes #5 Punctuation rules. Closes #6 Lexical Attributes * test: add native examples to basic tests * feat: add tag map for lt lang * fix: remove undefined tag attribute 'Definite' * feat: add lemmatizer for lt lang * refactor: add new instances to lt lang morph rules; use tags from tag map * refactor: add morph rules to lt lang defaults * refactor: only keep nouns, verbs, adverbs and adjectives in lt lang lemmatizer lookup * refactor: add capitalized words to lt lang lemmatizer * refactor: add more num words to lt lang lex attrs * refactor: update lt lang stop word set * refactor: add new instances to lt lang tokenizer exceptions * refactor: remove comments from lt lang init file * refactor: use function instead of lambda in lt lex lang getter * refactor: remove conversion to dict in lt init when dict is already provided * chore: rename lt 'test_basic' to 'test_text' * feat: add more lt text tests * feat: add lemmatizer tests * refactor: remove unused imports, add newline to end of file * chore: add contributor agreement * chore: change 'en' to 'lt' in lt example description * fix: add missing encoding info * style: add newline to end of file * refactor: use python2 compatible syntax * style: reformat code using black
269 lines
3.3 KiB
Python
269 lines
3.3 KiB
Python
# coding: utf8
from __future__ import unicode_literals

from ...symbols import ORTH


# Tokenizer exceptions for Lithuanian.
#
# Every string in the list below is registered in ``_exc`` under its own
# surface form, with a one-element analysis ``[{ORTH: <same string>}]``.
# Presumably this makes the tokenizer keep the string as a single token
# instead of splitting off the periods -- see the spaCy tokenizer
# exception documentation to confirm the exact matching rules.
#
# NOTE(review): the data below is reproduced unchanged, but a few entries
# look suspicious and should be confirmed by a Lithuanian speaker:
#   * "m.e.." ends in a doubled period (compare "p.m.e.").
#   * "Įn" has no trailing period, unlike the other single-word entries.
#   * Several entries contain spaces ("J. E.", "ir pan.", "ir t. t.",
#     "k. a.", "l. e. p.", "t. p.", "t. y."); verify that the tokenizer
#     can actually match exceptions that span whitespace.
_exc = {}

for orth in [
    "G.",
    "J. E.",
    "J. Em.",
    "J.E.",
    "J.Em.",
    "K.",
    "N.",
    "V.",
    "Vt.",
    "a.",
    "a.k.",
    "a.s.",
    "adv.",
    "akad.",
    "aklg.",
    "akt.",
    "al.",
    "ang.",
    "angl.",
    "aps.",
    "apskr.",
    "apyg.",
    "arbat.",
    "asist.",
    "asm.",
    "asm.k.",
    "asmv.",
    "atk.",
    "atsak.",
    "atsisk.",
    "atsisk.sąsk.",
    "atv.",
    "aut.",
    "avd.",
    "b.k.",
    "baud.",
    "biol.",
    "bkl.",
    "bot.",
    "bt.",
    "buv.",
    "ch.",
    "chem.",
    "corp.",
    "d.",
    "dab.",
    "dail.",
    "dek.",
    "deš.",
    "dir.",
    "dirig.",
    "doc.",
    "dol.",
    "dr.",
    "drp.",
    "dvit.",
    "dėst.",
    "dš.",
    "dž.",
    "e.b.",
    "e.bankas",
    "e.p.",
    "e.parašas",
    "e.paštas",
    "e.v.",
    "e.valdžia",
    "egz.",
    "eil.",
    "ekon.",
    "el.",
    "el.bankas",
    "el.p.",
    "el.parašas",
    "el.paštas",
    "el.valdžia",
    "etc.",
    "ež.",
    "fak.",
    "faks.",
    "feat.",
    "filol.",
    "filos.",
    "g.",
    "gen.",
    "geol.",
    "gerb.",
    "gim.",
    "gr.",
    "gv.",
    "gyd.",
    "gyv.",
    "habil.",
    "inc.",
    "insp.",
    "inž.",
    "ir pan.",
    "ir t. t.",
    "isp.",
    "istor.",
    "it.",
    "just.",
    "k.",
    "k. a.",
    "k.a.",
    "kab.",
    "kand.",
    "kart.",
    "kat.",
    "ketv.",
    "kh.",
    "kl.",
    "kln.",
    "km.",
    "kn.",
    "koresp.",
    "kpt.",
    "kr.",
    "kt.",
    "kub.",
    "kun.",
    "kv.",
    "kyš.",
    "l. e. p.",
    "l.e.p.",
    "lenk.",
    "liet.",
    "lot.",
    "lt.",
    "ltd.",
    "ltn.",
    "m.",
    "m.e..",
    "m.m.",
    "mat.",
    "med.",
    "mgnt.",
    "mgr.",
    "min.",
    "mjr.",
    "ml.",
    "mln.",
    "mlrd.",
    "mob.",
    "mok.",
    "moksl.",
    "mokyt.",
    "mot.",
    "mr.",
    "mst.",
    "mstl.",
    "mėn.",
    "nkt.",
    "no.",
    "nr.",
    "ntk.",
    "nuotr.",
    "op.",
    "org.",
    "orig.",
    "p.",
    "p.d.",
    "p.m.e.",
    "p.s.",
    "pab.",
    "pan.",
    "past.",
    "pav.",
    "pavad.",
    "per.",
    "perd.",
    "pirm.",
    "pl.",
    "plg.",
    "plk.",
    "pr.",
    "pr.Kr.",
    "pranc.",
    "proc.",
    "prof.",
    "prom.",
    "prot.",
    "psl.",
    "pss.",
    "pvz.",
    "pšt.",
    "r.",
    "raj.",
    "red.",
    "rez.",
    "rež.",
    "rus.",
    "rš.",
    "s.",
    "sav.",
    "saviv.",
    "sek.",
    "sekr.",
    "sen.",
    "sh.",
    "sk.",
    "skg.",
    "skv.",
    "skyr.",
    "sp.",
    "spec.",
    "sr.",
    "st.",
    "str.",
    "stud.",
    "sąs.",
    "t.",
    "t. p.",
    "t. y.",
    "t.p.",
    "t.t.",
    "t.y.",
    "techn.",
    "tel.",
    "teol.",
    "th.",
    "tir.",
    "trit.",
    "trln.",
    "tšk.",
    "tūks.",
    "tūkst.",
    "up.",
    "upl.",
    "v.s.",
    "vad.",
    "val.",
    "valg.",
    "ved.",
    "vert.",
    "vet.",
    "vid.",
    "virš.",
    "vlsč.",
    "vnt.",
    "vok.",
    "vs.",
    "vtv.",
    "vv.",
    "vyr.",
    "vyresn.",
    "zool.",
    "Įn",
    "įl.",
    "š.m.",
    "šnek.",
    "šv.",
    "švč.",
    "ž.ū.",
    "žin.",
    "žml.",
    "žr.",
]:
    # The key and the ORTH value are the same string: the entry is
    # analysed as exactly one token with unchanged text.
    _exc[orth] = [{ORTH: orth}]


# Public name for the finished exception table (the same dict object as
# ``_exc``, not a copy).
TOKENIZER_EXCEPTIONS = _exc