Update tokenizer exceptions for German

commit 702d1eed93
parent d60380418e
Author: Ines Montani
Date:   2016-12-21 18:06:27 +01:00


@@ -2,7 +2,7 @@
 from __future__ import unicode_literals
 from ..symbols import *
-from ..language_data import PRON_LEMMA
+from ..language_data import PRON_LEMMA, DET_LEMMA

 TOKENIZER_EXCEPTIONS = {
@@ -15,23 +15,27 @@ TOKENIZER_EXCEPTIONS = {
     ],
     "'S": [
-        {ORTH: "'S", LEMMA: PRON_LEMMA}
+        {ORTH: "'S", LEMMA: PRON_LEMMA, TAG: "PPER"}
     ],
     "'n": [
-        {ORTH: "'n", LEMMA: "ein"}
+        {ORTH: "'n", LEMMA: DET_LEMMA, NORM: "ein"}
     ],
     "'ne": [
-        {ORTH: "'ne", LEMMA: "eine"}
+        {ORTH: "'ne", LEMMA: DET_LEMMA, NORM: "eine"}
     ],
     "'nen": [
-        {ORTH: "'nen", LEMMA: "einen"}
+        {ORTH: "'nen", LEMMA: DET_LEMMA, NORM: "einen"}
+    ],
+    "'nem": [
+        {ORTH: "'nem", LEMMA: DET_LEMMA, NORM: "einem"}
     ],
     "'s": [
-        {ORTH: "'s", LEMMA: PRON_LEMMA}
+        {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER"}
     ],
     "Abb.": [
@@ -195,7 +199,7 @@ TOKENIZER_EXCEPTIONS = {
     ],
     "S'": [
-        {ORTH: "S'", LEMMA: PRON_LEMMA}
+        {ORTH: "S'", LEMMA: PRON_LEMMA, TAG: "PPER"}
     ],
     "Sa.": [
@@ -244,7 +248,7 @@ TOKENIZER_EXCEPTIONS = {
     "auf'm": [
         {ORTH: "auf", LEMMA: "auf"},
-        {ORTH: "'m", LEMMA: PRON_LEMMA}
+        {ORTH: "'m", LEMMA: DET_LEMMA, NORM: "dem"}
     ],
     "bspw.": [
@@ -268,8 +272,8 @@ TOKENIZER_EXCEPTIONS = {
     ],
     "du's": [
-        {ORTH: "du", LEMMA: PRON_LEMMA},
-        {ORTH: "'s", LEMMA: PRON_LEMMA}
+        {ORTH: "du", LEMMA: PRON_LEMMA, TAG: "PPER"},
+        {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}
     ],
     "ebd.": [
@@ -285,8 +289,8 @@ TOKENIZER_EXCEPTIONS = {
     ],
     "er's": [
-        {ORTH: "er", LEMMA: PRON_LEMMA},
-        {ORTH: "'s", LEMMA: PRON_LEMMA}
+        {ORTH: "er", LEMMA: PRON_LEMMA, TAG: "PPER"},
+        {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}
     ],
     "evtl.": [
@@ -315,7 +319,7 @@ TOKENIZER_EXCEPTIONS = {
     "hinter'm": [
         {ORTH: "hinter", LEMMA: "hinter"},
-        {ORTH: "'m", LEMMA: PRON_LEMMA}
+        {ORTH: "'m", LEMMA: DET_LEMMA, NORM: "dem"}
     ],
     "i.O.": [
@@ -327,13 +331,13 @@ TOKENIZER_EXCEPTIONS = {
     ],
     "ich's": [
-        {ORTH: "ich", LEMMA: PRON_LEMMA},
-        {ORTH: "'s", LEMMA: PRON_LEMMA}
+        {ORTH: "ich", LEMMA: PRON_LEMMA, TAG: "PPER"},
+        {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}
     ],
     "ihr's": [
-        {ORTH: "ihr", LEMMA: PRON_LEMMA},
-        {ORTH: "'s", LEMMA: PRON_LEMMA}
+        {ORTH: "ihr", LEMMA: PRON_LEMMA, TAG: "PPER"},
+        {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}
     ],
     "incl.": [
@@ -385,7 +389,7 @@ TOKENIZER_EXCEPTIONS = {
     ],
     "s'": [
-        {ORTH: "s'", LEMMA: PRON_LEMMA}
+        {ORTH: "s'", LEMMA: PRON_LEMMA, TAG: "PPER"}
     ],
     "s.o.": [
@@ -393,8 +397,8 @@ TOKENIZER_EXCEPTIONS = {
     ],
     "sie's": [
-        {ORTH: "sie", LEMMA: PRON_LEMMA},
-        {ORTH: "'s", LEMMA: PRON_LEMMA}
+        {ORTH: "sie", LEMMA: PRON_LEMMA, TAG: "PPER"},
+        {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}
     ],
     "sog.": [
@@ -423,7 +427,7 @@ TOKENIZER_EXCEPTIONS = {
     "unter'm": [
         {ORTH: "unter", LEMMA: "unter"},
-        {ORTH: "'m", LEMMA: PRON_LEMMA}
+        {ORTH: "'m", LEMMA: DET_LEMMA, NORM: "dem"}
     ],
     "usf.": [
@@ -464,12 +468,12 @@ TOKENIZER_EXCEPTIONS = {
     "vor'm": [
         {ORTH: "vor", LEMMA: "vor"},
-        {ORTH: "'m", LEMMA: PRON_LEMMA}
+        {ORTH: "'m", LEMMA: DET_LEMMA, NORM: "dem"}
     ],
     "wir's": [
-        {ORTH: "wir", LEMMA: PRON_LEMMA},
-        {ORTH: "'s", LEMMA: PRON_LEMMA}
+        {ORTH: "wir", LEMMA: PRON_LEMMA, TAG: "PPER"},
+        {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}
     ],
     "z.B.": [
@@ -506,7 +510,7 @@ TOKENIZER_EXCEPTIONS = {
     "über'm": [
         {ORTH: "über", LEMMA: "über"},
-        {ORTH: "'m", LEMMA: PRON_LEMMA}
+        {ORTH: "'m", LEMMA: DET_LEMMA, NORM: "dem"}
     ]
 }
@@ -625,5 +629,5 @@ ORTH_ONLY = [
     "wiss.",
     "x.",
     "y.",
-    "z.",
+    "z."
 ]
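
For context: the change tags the pronoun contractions ('S, 's, S', s') as "PPER" (personal pronoun), adds a NORM attribute carrying the expanded spelling, and relemmatizes the determiner contractions ('n, 'ne, 'nen, 'nem, 'm) from PRON_LEMMA to the new DET_LEMMA. Below is a minimal sketch of how such an exception table can be applied. It is not part of the commit: ORTH, LEMMA, TAG, and NORM are plain-string stand-ins for the spaCy symbol constants, the DET_LEMMA value is illustrative, and the tokenize helper is hypothetical rather than spaCy's API.

    # Minimal sketch, not part of the commit: plain-string stand-ins for
    # the spaCy symbol constants imported via `from ..symbols import *`.
    ORTH, LEMMA, TAG, NORM = "orth", "lemma", "tag", "norm"
    PRON_LEMMA = "-PRON-"   # spaCy's shared pronoun lemma
    DET_LEMMA = "der"       # assumption: shared determiner lemma, value illustrative

    # Two entries from the diff: each contraction maps to a list of
    # sub-token attribute dicts. ORTH keeps the surface form as written;
    # NORM carries the expanded spelling.
    TOKENIZER_EXCEPTIONS = {
        "auf'm": [
            {ORTH: "auf", LEMMA: "auf"},
            {ORTH: "'m", LEMMA: DET_LEMMA, NORM: "dem"},
        ],
        "er's": [
            {ORTH: "er", LEMMA: PRON_LEMMA, TAG: "PPER"},
            {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"},
        ],
    }

    def tokenize(text):
        """Hypothetical helper: split on whitespace, expanding exceptions
        into their sub-tokens. Yields (orth, norm) pairs; NORM falls back
        to ORTH when an entry does not define it."""
        for word in text.split():
            for attrs in TOKENIZER_EXCEPTIONS.get(word, [{ORTH: word}]):
                yield attrs[ORTH], attrs.get(NORM, attrs[ORTH])

    print(list(tokenize("er's auf'm Tisch")))
    # [('er', 'er'), ("'s", 'es'), ('auf', 'auf'), ("'m", 'dem'), ('Tisch', 'Tisch')]

Keeping ORTH as the surface form while NORM holds the expansion means downstream components can work with "dem" and "es" without the tokenizer altering the original text.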