Reorganise German language data

This commit is contained in:
ines 2017-05-08 15:44:26 +02:00
parent 7b3a983f96
commit 1bf9d5ec8b
4 changed files with 163 additions and 577 deletions

View File

@ -1,31 +1,32 @@
# coding: utf8
from __future__ import unicode_literals, print_function
from __future__ import unicode_literals
from os import path
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
from ..language_data import BASE_EXCEPTIONS
from ..language import Language
from ..attrs import LANG
from .language_data import *
from ..lemmatizerlookup import Lemmatizer
from .lemmatization import LOOK_UP
from ..attrs import LANG
from ..util import update_exc
class German(Language):
    """Language subclass for German (language code ``de``)."""
    lang = 'de'

    class Defaults(Language.Defaults):
        """German-specific defaults layered over ``Language.Defaults``."""
        # Copy the inherited getters before adding the LANG entry so the
        # parent's dict is not mutated.
        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
        lex_attr_getters[LANG] = lambda text: 'de'

        # Presumably update_exc layers the German exceptions over the shared
        # base exceptions -- verify against ..util.update_exc.
        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
        # Shallow copies: the class owns its own containers rather than
        # aliasing the module-level TAG_MAP / STOP_WORDS objects.
        tag_map = dict(TAG_MAP)
        stop_words = set(STOP_WORDS)

        # NOTE(review): the original indentation is not recoverable here;
        # create_lemmatizer is placed on Defaults -- confirm against the
        # base class before merging.
        @classmethod
        def create_lemmatizer(cls, nlp=None):
            """Return a ``Lemmatizer`` built from the German ``LOOKUP`` table.

            ``nlp`` is accepted but not used by this implementation.
            """
            return Lemmatizer(LOOKUP)
EXPORT = German
__all__ = ['German']

View File

@ -1,7 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
LOOK_UP = {
LOOKUP = {
"ABMs": "ABM",
"ADACs": "ADAC",
"AGs": "AG",

View File

@ -1,7 +1,8 @@
# coding: utf8
from __future__ import unicode_literals
from ..symbols import *
from ..symbols import POS, PUNCT, ADJ, CONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB
from ..symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX
TAG_MAP = {

View File

@ -1,602 +1,186 @@
# coding: utf8
from __future__ import unicode_literals
from ..symbols import *
from ..language_data import PRON_LEMMA, DET_LEMMA
from ..symbols import ORTH, LEMMA, TAG, NORM
from ..deprecated import PRON_LEMMA, DET_LEMMA
# Table of tokenizer exceptions: each key is a surface string and each value
# is a list of attribute dicts (ORTH plus optional LEMMA / TAG / NORM), one
# dict per token the string is analysed as. Every entry here is single-token.
#
# NOTE(review): TOKENIZER_EXCEPTIONS is rebound to dict(_exc) at the end of
# this module, so this literal is superseded at import time. The "\\n" and
# "\\t" entries are not among the entries added to _exc -- confirm they are
# provided elsewhere before dropping this table.
TOKENIZER_EXCEPTIONS = {
    "\\n": [{ORTH: "\\n", LEMMA: "<nl>", TAG: "SP"}],
    "\\t": [{ORTH: "\\t", LEMMA: "<tab>", TAG: "SP"}],
    "'S": [{ORTH: "'S", LEMMA: PRON_LEMMA, TAG: "PPER"}],
    "'n": [{ORTH: "'n", LEMMA: DET_LEMMA, NORM: "ein"}],
    "'ne": [{ORTH: "'ne", LEMMA: DET_LEMMA, NORM: "eine"}],
    "'nen": [{ORTH: "'nen", LEMMA: DET_LEMMA, NORM: "einen"}],
    "'nem": [{ORTH: "'nem", LEMMA: DET_LEMMA, NORM: "einem"}],
    "'s": [{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER"}],
    "Abb.": [{ORTH: "Abb.", LEMMA: "Abbildung"}],
    "Abk.": [{ORTH: "Abk.", LEMMA: "Abkürzung"}],
    "Abt.": [{ORTH: "Abt.", LEMMA: "Abteilung"}],
    "Apr.": [{ORTH: "Apr.", LEMMA: "April"}],
    "Aug.": [{ORTH: "Aug.", LEMMA: "August"}],
    "Bd.": [{ORTH: "Bd.", LEMMA: "Band"}],
    "Betr.": [{ORTH: "Betr.", LEMMA: "Betreff"}],
    "Bf.": [{ORTH: "Bf.", LEMMA: "Bahnhof"}],
    "Bhf.": [{ORTH: "Bhf.", LEMMA: "Bahnhof"}],
    "Bsp.": [{ORTH: "Bsp.", LEMMA: "Beispiel"}],
    "Dez.": [{ORTH: "Dez.", LEMMA: "Dezember"}],
    "Di.": [{ORTH: "Di.", LEMMA: "Dienstag"}],
    "Do.": [{ORTH: "Do.", LEMMA: "Donnerstag"}],
    "Fa.": [{ORTH: "Fa.", LEMMA: "Firma"}],
    "Fam.": [{ORTH: "Fam.", LEMMA: "Familie"}],
    "Feb.": [{ORTH: "Feb.", LEMMA: "Februar"}],
    "Fr.": [{ORTH: "Fr.", LEMMA: "Frau"}],
    "Frl.": [{ORTH: "Frl.", LEMMA: "Fräulein"}],
    "Hbf.": [{ORTH: "Hbf.", LEMMA: "Hauptbahnhof"}],
    "Hr.": [{ORTH: "Hr.", LEMMA: "Herr"}],
    "Hrn.": [{ORTH: "Hrn.", LEMMA: "Herr"}],
    "Jan.": [{ORTH: "Jan.", LEMMA: "Januar"}],
    "Jh.": [{ORTH: "Jh.", LEMMA: "Jahrhundert"}],
    "Jhd.": [{ORTH: "Jhd.", LEMMA: "Jahrhundert"}],
    "Jul.": [{ORTH: "Jul.", LEMMA: "Juli"}],
    "Jun.": [{ORTH: "Jun.", LEMMA: "Juni"}],
    "Mi.": [{ORTH: "Mi.", LEMMA: "Mittwoch"}],
    "Mio.": [{ORTH: "Mio.", LEMMA: "Million"}],
    "Mo.": [{ORTH: "Mo.", LEMMA: "Montag"}],
    "Mrd.": [{ORTH: "Mrd.", LEMMA: "Milliarde"}],
    "Mrz.": [{ORTH: "Mrz.", LEMMA: "März"}],
    "MwSt.": [{ORTH: "MwSt.", LEMMA: "Mehrwertsteuer"}],
    "Mär.": [{ORTH: "Mär.", LEMMA: "März"}],
    "Nov.": [{ORTH: "Nov.", LEMMA: "November"}],
    "Nr.": [{ORTH: "Nr.", LEMMA: "Nummer"}],
    "Okt.": [{ORTH: "Okt.", LEMMA: "Oktober"}],
    "Orig.": [{ORTH: "Orig.", LEMMA: "Original"}],
    "Pkt.": [{ORTH: "Pkt.", LEMMA: "Punkt"}],
    "Prof.": [{ORTH: "Prof.", LEMMA: "Professor"}],
    "Red.": [{ORTH: "Red.", LEMMA: "Redaktion"}],
    "S'": [{ORTH: "S'", LEMMA: PRON_LEMMA, TAG: "PPER"}],
    "Sa.": [{ORTH: "Sa.", LEMMA: "Samstag"}],
    "Sep.": [{ORTH: "Sep.", LEMMA: "September"}],
    "Sept.": [{ORTH: "Sept.", LEMMA: "September"}],
    "So.": [{ORTH: "So.", LEMMA: "Sonntag"}],
    "Std.": [{ORTH: "Std.", LEMMA: "Stunde"}],
    "Str.": [{ORTH: "Str.", LEMMA: "Straße"}],
    "Tel.": [{ORTH: "Tel.", LEMMA: "Telefon"}],
    "Tsd.": [{ORTH: "Tsd.", LEMMA: "Tausend"}],
    "Univ.": [{ORTH: "Univ.", LEMMA: "Universität"}],
    "abzgl.": [{ORTH: "abzgl.", LEMMA: "abzüglich"}],
    "allg.": [{ORTH: "allg.", LEMMA: "allgemein"}],
}
# Multi-token exceptions: contracted forms that are analysed as two tokens.
# Each key is the contracted surface string; the value lists one attribute
# dict per resulting token, and the ORTH values concatenate back to the key.
# NORM records the full form of the clitic ("'m" -> "dem", "'s" -> "es").
# Single-token exceptions are added to this dict by the loops that follow it.
_exc = {
    "auf'm": [
        {ORTH: "auf", LEMMA: "auf"},
        {ORTH: "'m", LEMMA: DET_LEMMA, NORM: "dem"}],

    "du's": [
        {ORTH: "du", LEMMA: PRON_LEMMA, TAG: "PPER"},
        {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}],

    "er's": [
        {ORTH: "er", LEMMA: PRON_LEMMA, TAG: "PPER"},
        {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}],

    "hinter'm": [
        {ORTH: "hinter", LEMMA: "hinter"},
        {ORTH: "'m", LEMMA: DET_LEMMA, NORM: "dem"}],

    "ich's": [
        {ORTH: "ich", LEMMA: PRON_LEMMA, TAG: "PPER"},
        {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}],

    "ihr's": [
        {ORTH: "ihr", LEMMA: PRON_LEMMA, TAG: "PPER"},
        {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}],

    "sie's": [
        {ORTH: "sie", LEMMA: PRON_LEMMA, TAG: "PPER"},
        {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}],

    "unter'm": [
        {ORTH: "unter", LEMMA: "unter"},
        {ORTH: "'m", LEMMA: DET_LEMMA, NORM: "dem"}],

    "vor'm": [
        {ORTH: "vor", LEMMA: "vor"},
        {ORTH: "'m", LEMMA: DET_LEMMA, NORM: "dem"}],

    "wir's": [
        {ORTH: "wir", LEMMA: PRON_LEMMA, TAG: "PPER"},
        {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}],

    "über'm": [
        {ORTH: "über", LEMMA: "über"},
        {ORTH: "'m", LEMMA: DET_LEMMA, NORM: "dem"}]
}
# Abbreviations listed by surface form only (no lemma or tag attached).
# Keep one entry per line with a trailing comma: adjacent string literals
# without a comma are silently concatenated by Python, which previously
# merged "ö." and "o.a." into the single bogus entry "ö.o.a.".
ORTH_ONLY = [
    "A.C.",
    "a.D.",
    "A.D.",
    "A.G.",
    "a.M.",
    "a.Z.",
    "Abs.",
    "adv.",
    "al.",
    "B.A.",
    "B.Sc.",
    "betr.",
    "biol.",
    "Biol.",
    "ca.",
    "Chr.",
    "Cie.",
    "co.",
    "Co.",
    "D.C.",
    "Dipl.-Ing.",
    "Dipl.",
    "Dr.",
    "e.g.",
    "e.V.",
    "ehem.",
    "entspr.",
    "erm.",
    "etc.",
    "ev.",
    "G.m.b.H.",
    "geb.",
    "Gebr.",
    "gem.",
    "h.c.",
    "Hg.",
    "hrsg.",
    "Hrsg.",
    "i.A.",
    "i.e.",
    "i.G.",
    "i.Tr.",
    "i.V.",
    "Ing.",
    "jr.",
    "Jr.",
    "jun.",
    "jur.",
    "K.O.",
    "L.A.",
    "lat.",
    "M.A.",
    "m.E.",
    "m.M.",
    "M.Sc.",
    "Mr.",
    "N.Y.",
    "N.Y.C.",
    "nat.",
    "ö.",
    "o.a.",
    "o.ä.",
    "o.g.",
    "o.k.",
    "O.K.",
    "p.a.",
    "p.s.",
    "P.S.",
    "pers.",
    "phil.",
    "q.e.d.",
    "R.I.P.",
    "rer.",
    "sen.",
    "St.",
    "std.",
    "u.a.",
    "U.S.",
    "U.S.A.",
    "U.S.S.",
    "Vol.",
    "vs.",
    "wiss.",
]
# Single-token exceptions. Each attribute dict is registered in _exc under
# its own ORTH value, wrapped in a one-element list; dict(exc_data) stores a
# copy so the table does not alias the literals below.
for exc_data in [
    {ORTH: "'S", LEMMA: PRON_LEMMA, TAG: "PPER"},
    {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER"},
    {ORTH: "S'", LEMMA: PRON_LEMMA, TAG: "PPER"},
    {ORTH: "s'", LEMMA: PRON_LEMMA, TAG: "PPER"},
    {ORTH: "'n", LEMMA: DET_LEMMA, NORM: "ein"},
    {ORTH: "'ne", LEMMA: DET_LEMMA, NORM: "eine"},
    {ORTH: "'nen", LEMMA: DET_LEMMA, NORM: "einen"},
    {ORTH: "'nem", LEMMA: DET_LEMMA, NORM: "einem"},
    {ORTH: "Abb.", LEMMA: "Abbildung"},
    {ORTH: "Abk.", LEMMA: "Abkürzung"},
    {ORTH: "Abt.", LEMMA: "Abteilung"},
    {ORTH: "Apr.", LEMMA: "April"},
    {ORTH: "Aug.", LEMMA: "August"},
    {ORTH: "Bd.", LEMMA: "Band"},
    {ORTH: "Betr.", LEMMA: "Betreff"},
    {ORTH: "Bf.", LEMMA: "Bahnhof"},
    {ORTH: "Bhf.", LEMMA: "Bahnhof"},
    {ORTH: "Bsp.", LEMMA: "Beispiel"},
    {ORTH: "Dez.", LEMMA: "Dezember"},
    {ORTH: "Di.", LEMMA: "Dienstag"},
    {ORTH: "Do.", LEMMA: "Donnerstag"},
    {ORTH: "Fa.", LEMMA: "Firma"},
    {ORTH: "Fam.", LEMMA: "Familie"},
    {ORTH: "Feb.", LEMMA: "Februar"},
    {ORTH: "Fr.", LEMMA: "Frau"},
    {ORTH: "Frl.", LEMMA: "Fräulein"},
    {ORTH: "Hbf.", LEMMA: "Hauptbahnhof"},
    {ORTH: "Hr.", LEMMA: "Herr"},
    {ORTH: "Hrn.", LEMMA: "Herr"},
    {ORTH: "Jan.", LEMMA: "Januar"},
    {ORTH: "Jh.", LEMMA: "Jahrhundert"},
    {ORTH: "Jhd.", LEMMA: "Jahrhundert"},
    {ORTH: "Jul.", LEMMA: "Juli"},
    {ORTH: "Jun.", LEMMA: "Juni"},
    {ORTH: "Mi.", LEMMA: "Mittwoch"},
    {ORTH: "Mio.", LEMMA: "Million"},
    {ORTH: "Mo.", LEMMA: "Montag"},
    {ORTH: "Mrd.", LEMMA: "Milliarde"},
    {ORTH: "Mrz.", LEMMA: "März"},
    {ORTH: "MwSt.", LEMMA: "Mehrwertsteuer"},
    {ORTH: "Mär.", LEMMA: "März"},
    {ORTH: "Nov.", LEMMA: "November"},
    {ORTH: "Nr.", LEMMA: "Nummer"},
    {ORTH: "Okt.", LEMMA: "Oktober"},
    {ORTH: "Orig.", LEMMA: "Original"},
    {ORTH: "Pkt.", LEMMA: "Punkt"},
    {ORTH: "Prof.", LEMMA: "Professor"},
    {ORTH: "Red.", LEMMA: "Redaktion"},
    {ORTH: "Sa.", LEMMA: "Samstag"},
    {ORTH: "Sep.", LEMMA: "September"},
    {ORTH: "Sept.", LEMMA: "September"},
    {ORTH: "So.", LEMMA: "Sonntag"},
    {ORTH: "Std.", LEMMA: "Stunde"},
    {ORTH: "Str.", LEMMA: "Straße"},
    {ORTH: "Tel.", LEMMA: "Telefon"},
    {ORTH: "Tsd.", LEMMA: "Tausend"},
    {ORTH: "Univ.", LEMMA: "Universität"},
    {ORTH: "abzgl.", LEMMA: "abzüglich"},
    {ORTH: "allg.", LEMMA: "allgemein"},
    {ORTH: "bspw.", LEMMA: "beispielsweise"},
    {ORTH: "bzgl.", LEMMA: "bezüglich"},
    {ORTH: "bzw.", LEMMA: "beziehungsweise"},
    {ORTH: "d.h.", LEMMA: "das heißt"},
    {ORTH: "dgl.", LEMMA: "dergleichen"},
    {ORTH: "ebd.", LEMMA: "ebenda"},
    {ORTH: "eigtl.", LEMMA: "eigentlich"},
    {ORTH: "engl.", LEMMA: "englisch"},
    {ORTH: "evtl.", LEMMA: "eventuell"},
    {ORTH: "frz.", LEMMA: "französisch"},
    {ORTH: "gegr.", LEMMA: "gegründet"},
    {ORTH: "ggf.", LEMMA: "gegebenenfalls"},
    {ORTH: "ggfs.", LEMMA: "gegebenenfalls"},
    {ORTH: "ggü.", LEMMA: "gegenüber"},
    {ORTH: "i.O.", LEMMA: "in Ordnung"},
    {ORTH: "i.d.R.", LEMMA: "in der Regel"},
    {ORTH: "incl.", LEMMA: "inklusive"},
    {ORTH: "inkl.", LEMMA: "inklusive"},
    {ORTH: "insb.", LEMMA: "insbesondere"},
    {ORTH: "kath.", LEMMA: "katholisch"},
    {ORTH: "lt.", LEMMA: "laut"},
    {ORTH: "max.", LEMMA: "maximal"},
    {ORTH: "min.", LEMMA: "minimal"},
    {ORTH: "mind.", LEMMA: "mindestens"},
    {ORTH: "mtl.", LEMMA: "monatlich"},
    {ORTH: "n.Chr.", LEMMA: "nach Christus"},
    {ORTH: "orig.", LEMMA: "original"},
    {ORTH: "röm.", LEMMA: "römisch"},
    {ORTH: "s.o.", LEMMA: "siehe oben"},
    {ORTH: "sog.", LEMMA: "so genannt"},
    {ORTH: "stellv.", LEMMA: "stellvertretend"},
    {ORTH: "tägl.", LEMMA: "täglich"},
    {ORTH: "u.U.", LEMMA: "unter Umständen"},
    {ORTH: "u.s.w.", LEMMA: "und so weiter"},
    {ORTH: "u.v.m.", LEMMA: "und vieles mehr"},
    {ORTH: "usf.", LEMMA: "und so fort"},
    {ORTH: "usw.", LEMMA: "und so weiter"},
    {ORTH: "uvm.", LEMMA: "und vieles mehr"},
    {ORTH: "v.Chr.", LEMMA: "vor Christus"},
    {ORTH: "v.a.", LEMMA: "vor allem"},
    {ORTH: "v.l.n.r.", LEMMA: "von links nach rechts"},
    {ORTH: "vgl.", LEMMA: "vergleiche"},
    {ORTH: "vllt.", LEMMA: "vielleicht"},
    {ORTH: "vlt.", LEMMA: "vielleicht"},
    {ORTH: "z.B.", LEMMA: "zum Beispiel"},
    {ORTH: "z.Bsp.", LEMMA: "zum Beispiel"},
    {ORTH: "z.T.", LEMMA: "zum Teil"},
    {ORTH: "z.Z.", LEMMA: "zur Zeit"},
    {ORTH: "z.Zt.", LEMMA: "zur Zeit"},
    {ORTH: "z.b.", LEMMA: "zum Beispiel"},
    {ORTH: "zzgl.", LEMMA: "zuzüglich"},
    {ORTH: "österr.", LEMMA: "österreichisch"}]:
    _exc[exc_data[ORTH]] = [dict(exc_data)]
# Abbreviations that carry no attributes beyond their surface form: each is
# registered in _exc as a single token whose only attribute is ORTH.
# Entries are grouped by initial letter; the order is significant only as
# insertion order into _exc.
for orth in [
        "A.C.", "a.D.", "A.D.", "A.G.", "a.M.", "a.Z.", "Abs.", "adv.", "al.",
        "B.A.", "B.Sc.", "betr.", "biol.", "Biol.",
        "ca.", "Chr.", "Cie.", "co.", "Co.",
        "D.C.", "Dipl.-Ing.", "Dipl.", "Dr.",
        "e.g.", "e.V.", "ehem.", "entspr.", "erm.", "etc.", "ev.",
        "G.m.b.H.", "geb.", "Gebr.", "gem.",
        "h.c.", "Hg.", "hrsg.", "Hrsg.",
        "i.A.", "i.e.", "i.G.", "i.Tr.", "i.V.", "Ing.",
        "jr.", "Jr.", "jun.", "jur.",
        "K.O.", "L.A.", "lat.",
        "M.A.", "m.E.", "m.M.", "M.Sc.", "Mr.",
        "N.Y.", "N.Y.C.", "nat.",
        "o.a.", "o.ä.", "o.g.", "o.k.", "O.K.",
        "p.a.", "p.s.", "P.S.", "pers.", "phil.",
        "q.e.d.", "R.I.P.", "rer.",
        "sen.", "St.", "std.",
        "u.a.", "U.S.", "U.S.A.", "U.S.S.",
        "Vol.", "vs.", "wiss.",
]:
    _exc[orth] = [{ORTH: orth}]


# Public table: a shallow copy of the private working dict built above.
TOKENIZER_EXCEPTIONS = dict(_exc)