# encoding: utf8 from __future__ import unicode_literals from ..symbols import * from ..language_data import PRON_LEMMA, DET_LEMMA TOKENIZER_EXCEPTIONS = { "\\n": [ {ORTH: "\\n", LEMMA: "", TAG: "SP"} ], "\\t": [ {ORTH: "\\t", LEMMA: "", TAG: "SP"} ], "'S": [ {ORTH: "'S", LEMMA: PRON_LEMMA, TAG: "PPER"} ], "'n": [ {ORTH: "'n", LEMMA: DET_LEMMA, NORM: "ein"} ], "'ne": [ {ORTH: "'ne", LEMMA: DET_LEMMA, NORM: "eine"} ], "'nen": [ {ORTH: "'nen", LEMMA: DET_LEMMA, NORM: "einen"} ], "'nem": [ {ORTH: "'nem", LEMMA: DET_LEMMA, NORM: "einem"} ], "'s": [ {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER"} ], "Abb.": [ {ORTH: "Abb.", LEMMA: "Abbildung"} ], "Abk.": [ {ORTH: "Abk.", LEMMA: "Abkürzung"} ], "Abt.": [ {ORTH: "Abt.", LEMMA: "Abteilung"} ], "Apr.": [ {ORTH: "Apr.", LEMMA: "April"} ], "Aug.": [ {ORTH: "Aug.", LEMMA: "August"} ], "Bd.": [ {ORTH: "Bd.", LEMMA: "Band"} ], "Betr.": [ {ORTH: "Betr.", LEMMA: "Betreff"} ], "Bf.": [ {ORTH: "Bf.", LEMMA: "Bahnhof"} ], "Bhf.": [ {ORTH: "Bhf.", LEMMA: "Bahnhof"} ], "Bsp.": [ {ORTH: "Bsp.", LEMMA: "Beispiel"} ], "Dez.": [ {ORTH: "Dez.", LEMMA: "Dezember"} ], "Di.": [ {ORTH: "Di.", LEMMA: "Dienstag"} ], "Do.": [ {ORTH: "Do.", LEMMA: "Donnerstag"} ], "Fa.": [ {ORTH: "Fa.", LEMMA: "Firma"} ], "Fam.": [ {ORTH: "Fam.", LEMMA: "Familie"} ], "Feb.": [ {ORTH: "Feb.", LEMMA: "Februar"} ], "Fr.": [ {ORTH: "Fr.", LEMMA: "Frau"} ], "Frl.": [ {ORTH: "Frl.", LEMMA: "Fräulein"} ], "Hbf.": [ {ORTH: "Hbf.", LEMMA: "Hauptbahnhof"} ], "Hr.": [ {ORTH: "Hr.", LEMMA: "Herr"} ], "Hrn.": [ {ORTH: "Hrn.", LEMMA: "Herr"} ], "Jan.": [ {ORTH: "Jan.", LEMMA: "Januar"} ], "Jh.": [ {ORTH: "Jh.", LEMMA: "Jahrhundert"} ], "Jhd.": [ {ORTH: "Jhd.", LEMMA: "Jahrhundert"} ], "Jul.": [ {ORTH: "Jul.", LEMMA: "Juli"} ], "Jun.": [ {ORTH: "Jun.", LEMMA: "Juni"} ], "Mi.": [ {ORTH: "Mi.", LEMMA: "Mittwoch"} ], "Mio.": [ {ORTH: "Mio.", LEMMA: "Million"} ], "Mo.": [ {ORTH: "Mo.", LEMMA: "Montag"} ], "Mrd.": [ {ORTH: "Mrd.", LEMMA: "Milliarde"} ], "Mrz.": [ {ORTH: "Mrz.", LEMMA: "März"} ], "MwSt.": [ {ORTH: "MwSt.", LEMMA: "Mehrwertsteuer"} ], "Mär.": [ {ORTH: "Mär.", LEMMA: "März"} ], "Nov.": [ {ORTH: "Nov.", LEMMA: "November"} ], "Nr.": [ {ORTH: "Nr.", LEMMA: "Nummer"} ], "Okt.": [ {ORTH: "Okt.", LEMMA: "Oktober"} ], "Orig.": [ {ORTH: "Orig.", LEMMA: "Original"} ], "Pkt.": [ {ORTH: "Pkt.", LEMMA: "Punkt"} ], "Prof.": [ {ORTH: "Prof.", LEMMA: "Professor"} ], "Red.": [ {ORTH: "Red.", LEMMA: "Redaktion"} ], "S'": [ {ORTH: "S'", LEMMA: PRON_LEMMA, TAG: "PPER"} ], "Sa.": [ {ORTH: "Sa.", LEMMA: "Samstag"} ], "Sep.": [ {ORTH: "Sep.", LEMMA: "September"} ], "Sept.": [ {ORTH: "Sept.", LEMMA: "September"} ], "So.": [ {ORTH: "So.", LEMMA: "Sonntag"} ], "Std.": [ {ORTH: "Std.", LEMMA: "Stunde"} ], "Str.": [ {ORTH: "Str.", LEMMA: "Straße"} ], "Tel.": [ {ORTH: "Tel.", LEMMA: "Telefon"} ], "Tsd.": [ {ORTH: "Tsd.", LEMMA: "Tausend"} ], "Univ.": [ {ORTH: "Univ.", LEMMA: "Universität"} ], "abzgl.": [ {ORTH: "abzgl.", LEMMA: "abzüglich"} ], "allg.": [ {ORTH: "allg.", LEMMA: "allgemein"} ], "auf'm": [ {ORTH: "auf", LEMMA: "auf"}, {ORTH: "'m", LEMMA: DET_LEMMA, NORM: "dem" } ], "bspw.": [ {ORTH: "bspw.", LEMMA: "beispielsweise"} ], "bzgl.": [ {ORTH: "bzgl.", LEMMA: "bezüglich"} ], "bzw.": [ {ORTH: "bzw.", LEMMA: "beziehungsweise"} ], "d.h.": [ {ORTH: "d.h.", LEMMA: "das heißt"} ], "dgl.": [ {ORTH: "dgl.", LEMMA: "dergleichen"} ], "du's": [ {ORTH: "du", LEMMA: PRON_LEMMA, TAG: "PPER"}, {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"} ], "ebd.": [ {ORTH: "ebd.", LEMMA: "ebenda"} ], "eigtl.": [ {ORTH: "eigtl.", LEMMA: "eigentlich"} ], "engl.": [ {ORTH: "engl.", LEMMA: "englisch"} ], "er's": [ {ORTH: "er", LEMMA: PRON_LEMMA, TAG: "PPER"}, {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"} ], "evtl.": [ {ORTH: "evtl.", LEMMA: "eventuell"} ], "frz.": [ {ORTH: "frz.", LEMMA: "französisch"} ], "gegr.": [ {ORTH: "gegr.", LEMMA: "gegründet"} ], "ggf.": [ {ORTH: "ggf.", LEMMA: "gegebenenfalls"} ], "ggfs.": [ {ORTH: "ggfs.", LEMMA: "gegebenenfalls"} ], "ggü.": [ {ORTH: "ggü.", LEMMA: "gegenüber"} ], "hinter'm": [ {ORTH: "hinter", LEMMA: "hinter"}, {ORTH: "'m", LEMMA: DET_LEMMA, NORM: "dem"} ], "i.O.": [ {ORTH: "i.O.", LEMMA: "in Ordnung"} ], "i.d.R.": [ {ORTH: "i.d.R.", LEMMA: "in der Regel"} ], "ich's": [ {ORTH: "ich", LEMMA: PRON_LEMMA, TAG: "PPER"}, {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"} ], "ihr's": [ {ORTH: "ihr", LEMMA: PRON_LEMMA, TAG: "PPER"}, {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"} ], "incl.": [ {ORTH: "incl.", LEMMA: "inklusive"} ], "inkl.": [ {ORTH: "inkl.", LEMMA: "inklusive"} ], "insb.": [ {ORTH: "insb.", LEMMA: "insbesondere"} ], "kath.": [ {ORTH: "kath.", LEMMA: "katholisch"} ], "lt.": [ {ORTH: "lt.", LEMMA: "laut"} ], "max.": [ {ORTH: "max.", LEMMA: "maximal"} ], "min.": [ {ORTH: "min.", LEMMA: "minimal"} ], "mind.": [ {ORTH: "mind.", LEMMA: "mindestens"} ], "mtl.": [ {ORTH: "mtl.", LEMMA: "monatlich"} ], "n.Chr.": [ {ORTH: "n.Chr.", LEMMA: "nach Christus"} ], "orig.": [ {ORTH: "orig.", LEMMA: "original"} ], "röm.": [ {ORTH: "röm.", LEMMA: "römisch"} ], "s'": [ {ORTH: "s'", LEMMA: PRON_LEMMA, TAG: "PPER"} ], "s.o.": [ {ORTH: "s.o.", LEMMA: "siehe oben"} ], "sie's": [ {ORTH: "sie", LEMMA: PRON_LEMMA, TAG: "PPER"}, {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"} ], "sog.": [ {ORTH: "sog.", LEMMA: "so genannt"} ], "stellv.": [ {ORTH: "stellv.", LEMMA: "stellvertretend"} ], "tägl.": [ {ORTH: "tägl.", LEMMA: "täglich"} ], "u.U.": [ {ORTH: "u.U.", LEMMA: "unter Umständen"} ], "u.s.w.": [ {ORTH: "u.s.w.", LEMMA: "und so weiter"} ], "u.v.m.": [ {ORTH: "u.v.m.", LEMMA: "und vieles mehr"} ], "unter'm": [ {ORTH: "unter", LEMMA: "unter"}, {ORTH: "'m", LEMMA: DET_LEMMA, NORM: "dem"} ], "usf.": [ {ORTH: "usf.", LEMMA: "und so fort"} ], "usw.": [ {ORTH: "usw.", LEMMA: "und so weiter"} ], "uvm.": [ {ORTH: "uvm.", LEMMA: "und vieles mehr"} ], "v.Chr.": [ {ORTH: "v.Chr.", LEMMA: "vor Christus"} ], "v.a.": [ {ORTH: "v.a.", LEMMA: "vor allem"} ], "v.l.n.r.": [ {ORTH: "v.l.n.r.", LEMMA: "von links nach rechts"} ], "vgl.": [ {ORTH: "vgl.", LEMMA: "vergleiche"} ], "vllt.": [ {ORTH: "vllt.", LEMMA: "vielleicht"} ], "vlt.": [ {ORTH: "vlt.", LEMMA: "vielleicht"} ], "vor'm": [ {ORTH: "vor", LEMMA: "vor"}, {ORTH: "'m", LEMMA: DET_LEMMA, NORM: "dem"} ], "wir's": [ {ORTH: "wir", LEMMA: PRON_LEMMA, TAG: "PPER"}, {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"} ], "z.B.": [ {ORTH: "z.B.", LEMMA: "zum Beispiel"} ], "z.Bsp.": [ {ORTH: "z.Bsp.", LEMMA: "zum Beispiel"} ], "z.T.": [ {ORTH: "z.T.", LEMMA: "zum Teil"} ], "z.Z.": [ {ORTH: "z.Z.", LEMMA: "zur Zeit"} ], "z.Zt.": [ {ORTH: "z.Zt.", LEMMA: "zur Zeit"} ], "z.b.": [ {ORTH: "z.b.", LEMMA: "zum Beispiel"} ], "zzgl.": [ {ORTH: "zzgl.", LEMMA: "zuzüglich"} ], "österr.": [ {ORTH: "österr.", LEMMA: "österreichisch"} ], "über'm": [ {ORTH: "über", LEMMA: "über"}, {ORTH: "'m", LEMMA: DET_LEMMA, NORM: "dem"} ] } ORTH_ONLY = [ "A.C.", "a.D.", "A.D.", "A.G.", "a.M.", "a.Z.", "Abs.", "adv.", "al.", "B.A.", "B.Sc.", "betr.", "biol.", "Biol.", "ca.", "Chr.", "Cie.", "co.", "Co.", "D.C.", "Dipl.-Ing.", "Dipl.", "Dr.", "e.g.", "e.V.", "ehem.", "entspr.", "erm.", "etc.", "ev.", "G.m.b.H.", "geb.", "Gebr.", "gem.", "h.c.", "Hg.", "hrsg.", "Hrsg.", "i.A.", "i.e.", "i.G.", "i.Tr.", "i.V.", "Ing.", "jr.", "Jr.", "jun.", "jur.", "K.O.", "L.A.", "lat.", "M.A.", "m.E.", "m.M.", "M.Sc.", "Mr.", "N.Y.", "N.Y.C.", "nat.", "ö." "o.a.", "o.ä.", "o.g.", "o.k.", "O.K.", "p.a.", "p.s.", "P.S.", "pers.", "phil.", "q.e.d.", "R.I.P.", "rer.", "sen.", "St.", "std.", "u.a.", "U.S.", "U.S.A.", "U.S.S.", "Vol.", "vs.", "wiss." ]