Tidy up tokenizer exceptions

This commit is contained in:
ines 2017-11-01 23:02:45 +01:00
parent 3af281a334
commit 819e30a26e
11 changed files with 12 additions and 13 deletions

View File

@ -20,7 +20,7 @@ for exc_data in [
{ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"}, {ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"},
{ORTH: "সে.মি", LEMMA: "সেন্টিমিটার"}, {ORTH: "সে.মি", LEMMA: "সেন্টিমিটার"},
{ORTH: "মি.লি.", LEMMA: "মিলিলিটার"}]: {ORTH: "মি.লি.", LEMMA: "মিলিলিটার"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)] _exc[exc_data[ORTH]] = [exc_data]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = _exc

View File

@ -8,7 +8,6 @@ _exc = {}
for exc_data in [ for exc_data in [
{ORTH: "Kbh.", LEMMA: "København", NORM: "København"}, {ORTH: "Kbh.", LEMMA: "København", NORM: "København"},
{ORTH: "Jan.", LEMMA: "januar", NORM: "januar"}, {ORTH: "Jan.", LEMMA: "januar", NORM: "januar"},
{ORTH: "Feb.", LEMMA: "februar", NORM: "februar"}, {ORTH: "Feb.", LEMMA: "februar", NORM: "februar"},
{ORTH: "Mar.", LEMMA: "marts", NORM: "marts"}, {ORTH: "Mar.", LEMMA: "marts", NORM: "marts"},
@ -21,7 +20,7 @@ for exc_data in [
{ORTH: "Okt.", LEMMA: "oktober", NORM: "oktober"}, {ORTH: "Okt.", LEMMA: "oktober", NORM: "oktober"},
{ORTH: "Nov.", LEMMA: "november", NORM: "november"}, {ORTH: "Nov.", LEMMA: "november", NORM: "november"},
{ORTH: "Dec.", LEMMA: "december", NORM: "december"}]: {ORTH: "Dec.", LEMMA: "december", NORM: "december"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)] _exc[exc_data[ORTH]] = [exc_data]
for orth in [ for orth in [
"A/S", "beg.", "bl.a.", "ca.", "d.s.s.", "dvs.", "f.eks.", "fr.", "hhv.", "A/S", "beg.", "bl.a.", "ca.", "d.s.s.", "dvs.", "f.eks.", "fr.", "hhv.",

View File

@ -164,7 +164,7 @@ for exc_data in [
{ORTH: "z.b.", LEMMA: "zum Beispiel"}, {ORTH: "z.b.", LEMMA: "zum Beispiel"},
{ORTH: "zzgl.", LEMMA: "zuzüglich"}, {ORTH: "zzgl.", LEMMA: "zuzüglich"},
{ORTH: "österr.", LEMMA: "österreichisch", NORM: "österreichisch"}]: {ORTH: "österr.", LEMMA: "österreichisch", NORM: "österreichisch"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)] _exc[exc_data[ORTH]] = [exc_data]
for orth in [ for orth in [

View File

@ -276,7 +276,7 @@ for exc_data in [
exc_data_apos = dict(exc_data) exc_data_apos = dict(exc_data)
exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH] exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
for data in [exc_data, exc_data_apos]: for data in [exc_data, exc_data_apos]:
_exc[data[ORTH]] = [dict(data)] _exc[data[ORTH]] = [data]
# Times # Times
@ -440,7 +440,7 @@ for exc_data in [
{ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"}, {ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"},
{ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"}, {ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"},
{ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}]: {ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)] _exc[exc_data[ORTH]] = [exc_data]
for orth in [ for orth in [

View File

@ -26,7 +26,7 @@ for exc_data in [
{ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"}, {ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"},
{ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}, {ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"},
{ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}]: {ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)] _exc[exc_data[ORTH]] = [exc_data]
# Times # Times

View File

@ -73,7 +73,7 @@ for exc_data in [
{ORTH: "ts.", LEMMA: "toisin sanoen"}, {ORTH: "ts.", LEMMA: "toisin sanoen"},
{ORTH: "vm.", LEMMA: "viimeksi mainittu"}, {ORTH: "vm.", LEMMA: "viimeksi mainittu"},
{ORTH: "srk.", LEMMA: "seurakunta"}]: {ORTH: "srk.", LEMMA: "seurakunta"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)] _exc[exc_data[ORTH]] = [exc_data]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = _exc

View File

@ -54,7 +54,7 @@ for exc_data in [
{LEMMA: "degrés", ORTH: ""}, {LEMMA: "degrés", ORTH: ""},
{LEMMA: "saint", ORTH: "St."}, {LEMMA: "saint", ORTH: "St."},
{LEMMA: "sainte", ORTH: "Ste."}]: {LEMMA: "sainte", ORTH: "Ste."}]:
_exc[exc_data[ORTH]] = [dict(exc_data)] _exc[exc_data[ORTH]] = [exc_data]
for orth in FR_BASE_EXCEPTIONS + ["etc."]: for orth in FR_BASE_EXCEPTIONS + ["etc."]:

View File

@ -11,7 +11,7 @@ for exc_data in [
{ORTH: "jan.", LEMMA: "januar"}, {ORTH: "jan.", LEMMA: "januar"},
{ORTH: "feb.", LEMMA: "februar"}, {ORTH: "feb.", LEMMA: "februar"},
{ORTH: "jul.", LEMMA: "juli"}]: {ORTH: "jul.", LEMMA: "juli"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)] _exc[exc_data[ORTH]] = [exc_data]
for orth in [ for orth in [

View File

@ -13,7 +13,7 @@ for exc_data in [
{ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV}, {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
{ORTH: "tj.", LEMMA: "to jest", POS: ADV}, {ORTH: "tj.", LEMMA: "to jest", POS: ADV},
{ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]: {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
_exc[exc_data[ORTH]] = [dict(exc_data)], _exc[exc_data[ORTH]] = [exc_data]
for orth in [ for orth in [
"w.", "r."]: "w.", "r."]:

View File

@ -68,7 +68,7 @@ for exc_data in [
{ORTH: "Sön.", LEMMA: "Söndag"}, {ORTH: "Sön.", LEMMA: "Söndag"},
{ORTH: "sthlm", LEMMA: "Stockholm"}, {ORTH: "sthlm", LEMMA: "Stockholm"},
{ORTH: "gbg", LEMMA: "Göteborg"}]: {ORTH: "gbg", LEMMA: "Göteborg"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)] _exc[exc_data[ORTH]] = [exc_data]
for orth in [ for orth in [

View File

@ -68,7 +68,7 @@ for exc_data in [
{ORTH: "\\n", POS: SPACE}, {ORTH: "\\n", POS: SPACE},
{ORTH: "\u2014", POS: PUNCT, LEMMA: "--"}, {ORTH: "\u2014", POS: PUNCT, LEMMA: "--"},
{ORTH: "\u00a0", POS: SPACE, LEMMA: " "}]: {ORTH: "\u00a0", POS: SPACE, LEMMA: " "}]:
BASE_EXCEPTIONS[exc_data[ORTH]] = [dict(exc_data)] BASE_EXCEPTIONS[exc_data[ORTH]] = [exc_data]
for orth in [ for orth in [