From 819e30a26ef65c070676d903e6bcc52fdb04cba1 Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 1 Nov 2017 23:02:45 +0100
Subject: [PATCH] Tidy up tokenizer exceptions

---
 spacy/lang/bn/tokenizer_exceptions.py | 2 +-
 spacy/lang/da/tokenizer_exceptions.py | 3 +--
 spacy/lang/de/tokenizer_exceptions.py | 2 +-
 spacy/lang/en/tokenizer_exceptions.py | 4 ++--
 spacy/lang/es/tokenizer_exceptions.py | 2 +-
 spacy/lang/fi/tokenizer_exceptions.py | 2 +-
 spacy/lang/fr/tokenizer_exceptions.py | 2 +-
 spacy/lang/nb/tokenizer_exceptions.py | 2 +-
 spacy/lang/pl/tokenizer_exceptions.py | 2 +-
 spacy/lang/sv/tokenizer_exceptions.py | 2 +-
 spacy/lang/tokenizer_exceptions.py    | 2 +-
 11 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/spacy/lang/bn/tokenizer_exceptions.py b/spacy/lang/bn/tokenizer_exceptions.py
index 5c6de139b..dc1181335 100644
--- a/spacy/lang/bn/tokenizer_exceptions.py
+++ b/spacy/lang/bn/tokenizer_exceptions.py
@@ -20,7 +20,7 @@ for exc_data in [
     {ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"},
     {ORTH: "সে.মি", LEMMA: "সেন্টিমিটার"},
     {ORTH: "মি.লি.", LEMMA: "মিলিলিটার"}]:
-    _exc[exc_data[ORTH]] = [dict(exc_data)]
+    _exc[exc_data[ORTH]] = [exc_data]
 
 
 TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py
index e8edf36b8..c67c038bf 100644
--- a/spacy/lang/da/tokenizer_exceptions.py
+++ b/spacy/lang/da/tokenizer_exceptions.py
@@ -8,7 +8,6 @@ _exc = {}
 
 for exc_data in [
     {ORTH: "Kbh.", LEMMA: "København", NORM: "København"},
-
     {ORTH: "Jan.", LEMMA: "januar", NORM: "januar"},
     {ORTH: "Feb.", LEMMA: "februar", NORM: "februar"},
     {ORTH: "Mar.", LEMMA: "marts", NORM: "marts"},
@@ -21,7 +20,7 @@ for exc_data in [
     {ORTH: "Okt.", LEMMA: "oktober", NORM: "oktober"},
     {ORTH: "Nov.", LEMMA: "november", NORM: "november"},
     {ORTH: "Dec.", LEMMA: "december", NORM: "december"}]:
-    _exc[exc_data[ORTH]] = [dict(exc_data)]
+    _exc[exc_data[ORTH]] = [exc_data]
 
 for orth in [
     "A/S", "beg.", "bl.a.", "ca.", "d.s.s.", "dvs.", "f.eks.", "fr.", "hhv.",
diff --git a/spacy/lang/de/tokenizer_exceptions.py b/spacy/lang/de/tokenizer_exceptions.py
index 0b23a1001..cb16fb06c 100644
--- a/spacy/lang/de/tokenizer_exceptions.py
+++ b/spacy/lang/de/tokenizer_exceptions.py
@@ -164,7 +164,7 @@ for exc_data in [
     {ORTH: "z.b.", LEMMA: "zum Beispiel"},
     {ORTH: "zzgl.", LEMMA: "zuzüglich"},
     {ORTH: "österr.", LEMMA: "österreichisch", NORM: "österreichisch"}]:
-    _exc[exc_data[ORTH]] = [dict(exc_data)]
+    _exc[exc_data[ORTH]] = [exc_data]
 
 
 for orth in [
diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py
index 0e5bbc7f6..a76b5fb2b 100644
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@@ -276,7 +276,7 @@ for exc_data in [
     exc_data_apos = dict(exc_data)
     exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
     for data in [exc_data, exc_data_apos]:
-        _exc[data[ORTH]] = [dict(data)]
+        _exc[data[ORTH]] = [data]
 
 
 # Times
@@ -440,7 +440,7 @@ for exc_data in [
     {ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"},
     {ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"},
     {ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}]:
-    _exc[exc_data[ORTH]] = [dict(exc_data)]
+    _exc[exc_data[ORTH]] = [exc_data]
 
 
 for orth in [
diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py
index cb62f008f..d4131ddf6 100644
--- a/spacy/lang/es/tokenizer_exceptions.py
+++ b/spacy/lang/es/tokenizer_exceptions.py
@@ -26,7 +26,7 @@ for exc_data in [
     {ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"},
     {ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"},
     {ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}]:
-    _exc[exc_data[ORTH]] = [dict(exc_data)]
+    _exc[exc_data[ORTH]] = [exc_data]
 
 
 # Times
diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py
index 33e223575..88859fefb 100644
--- a/spacy/lang/fi/tokenizer_exceptions.py
+++ b/spacy/lang/fi/tokenizer_exceptions.py
@@ -73,7 +73,7 @@ for exc_data in [
     {ORTH: "ts.", LEMMA: "toisin sanoen"},
     {ORTH: "vm.", LEMMA: "viimeksi mainittu"},
     {ORTH: "srk.", LEMMA: "seurakunta"}]:
-    _exc[exc_data[ORTH]] = [dict(exc_data)]
+    _exc[exc_data[ORTH]] = [exc_data]
 
 
 TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py
index 442b367dd..9994686ac 100644
--- a/spacy/lang/fr/tokenizer_exceptions.py
+++ b/spacy/lang/fr/tokenizer_exceptions.py
@@ -54,7 +54,7 @@ for exc_data in [
     {LEMMA: "degrés", ORTH: "d°"},
     {LEMMA: "saint", ORTH: "St."},
     {LEMMA: "sainte", ORTH: "Ste."}]:
-    _exc[exc_data[ORTH]] = [dict(exc_data)]
+    _exc[exc_data[ORTH]] = [exc_data]
 
 
 for orth in FR_BASE_EXCEPTIONS + ["etc."]:
diff --git a/spacy/lang/nb/tokenizer_exceptions.py b/spacy/lang/nb/tokenizer_exceptions.py
index 1529315ca..764866732 100644
--- a/spacy/lang/nb/tokenizer_exceptions.py
+++ b/spacy/lang/nb/tokenizer_exceptions.py
@@ -11,7 +11,7 @@ for exc_data in [
     {ORTH: "jan.", LEMMA: "januar"},
     {ORTH: "feb.", LEMMA: "februar"},
     {ORTH: "jul.", LEMMA: "juli"}]:
-    _exc[exc_data[ORTH]] = [dict(exc_data)]
+    _exc[exc_data[ORTH]] = [exc_data]
 
 
 for orth in [
diff --git a/spacy/lang/pl/tokenizer_exceptions.py b/spacy/lang/pl/tokenizer_exceptions.py
index fb87ae8a6..6098c2bb6 100644
--- a/spacy/lang/pl/tokenizer_exceptions.py
+++ b/spacy/lang/pl/tokenizer_exceptions.py
@@ -13,7 +13,7 @@ for exc_data in [
     {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
     {ORTH: "tj.", LEMMA: "to jest", POS: ADV},
     {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
-    _exc[exc_data[ORTH]] = [dict(exc_data)],
+    _exc[exc_data[ORTH]] = [exc_data]
 
 for orth in [
     "w.", "r."]:
diff --git a/spacy/lang/sv/tokenizer_exceptions.py b/spacy/lang/sv/tokenizer_exceptions.py
index 0575c3892..64aedf8af 100644
--- a/spacy/lang/sv/tokenizer_exceptions.py
+++ b/spacy/lang/sv/tokenizer_exceptions.py
@@ -68,7 +68,7 @@ for exc_data in [
     {ORTH: "Sön.", LEMMA: "Söndag"},
     {ORTH: "sthlm", LEMMA: "Stockholm"},
     {ORTH: "gbg", LEMMA: "Göteborg"}]:
-    _exc[exc_data[ORTH]] = [dict(exc_data)]
+    _exc[exc_data[ORTH]] = [exc_data]
 
 
 for orth in [
diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py
index 73ad88d08..89e1b1476 100644
--- a/spacy/lang/tokenizer_exceptions.py
+++ b/spacy/lang/tokenizer_exceptions.py
@@ -68,7 +68,7 @@ for exc_data in [
     {ORTH: "\\n", POS: SPACE},
     {ORTH: "\u2014", POS: PUNCT, LEMMA: "--"},
     {ORTH: "\u00a0", POS: SPACE, LEMMA: " "}]:
-    BASE_EXCEPTIONS[exc_data[ORTH]] = [dict(exc_data)]
+    BASE_EXCEPTIONS[exc_data[ORTH]] = [exc_data]
 
 
 for orth in [
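
Why the change is safe, in brief: every hunk replaces `[dict(exc_data)]` with `[exc_data]`, dropping a shallow copy. Each dict literal in these loops is a fresh object stored exactly once as a value, so copying it first buys nothing. Below is a minimal runnable sketch of the pattern, not part of the patch; the plain-string ORTH/LEMMA stand-ins are assumptions for illustration, where the real code uses spaCy's attribute-ID symbols:

    # Stand-ins for spacy.symbols.ORTH / LEMMA, so the demo is self-contained.
    ORTH, LEMMA = "orth", "lemma"

    _exc = {}
    for exc_data in [
            {ORTH: "Jan.", LEMMA: "januar"},
            {ORTH: "Feb.", LEMMA: "februar"}]:
        # Before: _exc[exc_data[ORTH]] = [dict(exc_data)]  -- a redundant copy.
        # After: the literal dict is already fresh and single-use, so it can be
        # stored directly without risk of aliasing.
        _exc[exc_data[ORTH]] = [exc_data]

    assert _exc["Jan."] == [{"orth": "Jan.", "lemma": "januar"}]

Note the one copy that survives: the English block keeps `exc_data_apos = dict(exc_data)`, because that copy is mutated (an apostrophe is prepended to its ORTH) and must not alias the original. The Polish hunk additionally fixes a real bug, removing a stray trailing comma that made the stored value a one-element tuple rather than a list.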