diff --git a/spacy/lang/bn/tokenizer_exceptions.py b/spacy/lang/bn/tokenizer_exceptions.py
index f6e2c9ed9..5c6de139b 100644
--- a/spacy/lang/bn/tokenizer_exceptions.py
+++ b/spacy/lang/bn/tokenizer_exceptions.py
@@ -23,4 +23,4 @@ for exc_data in [
     _exc[exc_data[ORTH]] = [dict(exc_data)]
 
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py
index 6bf9ab669..e8edf36b8 100644
--- a/spacy/lang/da/tokenizer_exceptions.py
+++ b/spacy/lang/da/tokenizer_exceptions.py
@@ -30,4 +30,4 @@ for orth in [
     _exc[orth] = [{ORTH: orth}]
 
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/de/tokenizer_exceptions.py b/spacy/lang/de/tokenizer_exceptions.py
index 184d88104..0b23a1001 100644
--- a/spacy/lang/de/tokenizer_exceptions.py
+++ b/spacy/lang/de/tokenizer_exceptions.py
@@ -181,4 +181,4 @@ for orth in [
     _exc[orth] = [{ORTH: orth}]
 
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py
index b9fde7882..0e5bbc7f6 100644
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@@ -456,4 +456,4 @@ for string in _exclude:
     _exc.pop(string)
 
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py
index 77d9a2841..cb62f008f 100644
--- a/spacy/lang/es/tokenizer_exceptions.py
+++ b/spacy/lang/es/tokenizer_exceptions.py
@@ -54,4 +54,4 @@ for orth in [
     _exc[orth] = [{ORTH: orth}]
 
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py
index a5e18bcfa..33e223575 100644
--- a/spacy/lang/fi/tokenizer_exceptions.py
+++ b/spacy/lang/fi/tokenizer_exceptions.py
@@ -76,4 +76,4 @@ for exc_data in [
     _exc[exc_data[ORTH]] = [dict(exc_data)]
 
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py
index 5d8c37878..442b367dd 100644
--- a/spacy/lang/fr/tokenizer_exceptions.py
+++ b/spacy/lang/fr/tokenizer_exceptions.py
@@ -147,5 +147,5 @@ _regular_exp += ["^{prefix}[{elision}][{alpha}][{alpha}{elision}{hyphen}\-]*$".f
 _regular_exp.append(URL_PATTERN)
 
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKENIZER_EXCEPTIONS = _exc
 TOKEN_MATCH = re.compile('|'.join('(?:{})'.format(m) for m in _regular_exp), re.IGNORECASE).match
diff --git a/spacy/lang/hu/tokenizer_exceptions.py b/spacy/lang/hu/tokenizer_exceptions.py
index dd8fdab6c..834c35265 100644
--- a/spacy/lang/hu/tokenizer_exceptions.py
+++ b/spacy/lang/hu/tokenizer_exceptions.py
@@ -95,5 +95,5 @@ _nums = "(({ne})|({t})|({on})|({c}))({s})?".format(
     c=CURRENCY, s=_suffixes)
 
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKENIZER_EXCEPTIONS = _exc
 TOKEN_MATCH = re.compile("^({u})|({n})$".format(u=URL_PATTERN, n=_nums)).match
diff --git a/spacy/lang/id/tokenizer_exceptions.py b/spacy/lang/id/tokenizer_exceptions.py
index 9978606b0..3bba57e4c 100644
--- a/spacy/lang/id/tokenizer_exceptions.py
+++ b/spacy/lang/id/tokenizer_exceptions.py
@@ -46,5 +46,4 @@ for orth in [
 ]:
     _exc[orth] = [{ORTH: orth}]
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
-
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/nb/tokenizer_exceptions.py b/spacy/lang/nb/tokenizer_exceptions.py
index a01c1363c..1529315ca 100644
--- a/spacy/lang/nb/tokenizer_exceptions.py
+++ b/spacy/lang/nb/tokenizer_exceptions.py
@@ -35,4 +35,4 @@ for orth in [
     _exc[orth] = [{ORTH: orth}]
 
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/pl/tokenizer_exceptions.py b/spacy/lang/pl/tokenizer_exceptions.py
index 4dffb6209..fb87ae8a6 100644
--- a/spacy/lang/pl/tokenizer_exceptions.py
+++ b/spacy/lang/pl/tokenizer_exceptions.py
@@ -20,4 +20,4 @@ for orth in [
     _exc[orth] = [{ORTH: orth}]
 
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/pt/tokenizer_exceptions.py b/spacy/lang/pt/tokenizer_exceptions.py
index 72348fa64..6e8b8a24c 100644
--- a/spacy/lang/pt/tokenizer_exceptions.py
+++ b/spacy/lang/pt/tokenizer_exceptions.py
@@ -72,4 +72,4 @@ for orth in [
     _exc[orth] = [{ORTH: orth}]
 
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/sv/tokenizer_exceptions.py b/spacy/lang/sv/tokenizer_exceptions.py
index b7d9834fe..0575c3892 100644
--- a/spacy/lang/sv/tokenizer_exceptions.py
+++ b/spacy/lang/sv/tokenizer_exceptions.py
@@ -80,4 +80,4 @@ for orth in [
     _exc[orth] = [{ORTH: orth}]
 
 
-TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/th/tag_map.py b/spacy/lang/th/tag_map.py
index 570871820..374900bd9 100644
--- a/spacy/lang/th/tag_map.py
+++ b/spacy/lang/th/tag_map.py
@@ -2,10 +2,10 @@
 # data from Korakot Chaovavanich (https://www.facebook.com/photo.php?fbid=390564854695031&set=p.390564854695031&type=3&permPage=1&ifg=1)
 from __future__ import unicode_literals
 
-from ...symbols import *
-
+from ...symbols import POS, NOUN, PRON, ADJ, ADV, INTJ, PROPN, DET, NUM, AUX
+from ...symbols import ADP, CCONJ, PART, PUNCT, SPACE, SCONJ
 TAG_MAP = {
-    #NOUN
+    # NOUN
     "NOUN": {POS: NOUN},
     "NCMN": {POS: NOUN},
     "NTTL": {POS: NOUN},
@@ -14,7 +14,7 @@ TAG_MAP = {
     "CMTR": {POS: NOUN},
     "CFQC": {POS: NOUN},
     "CVBL": {POS: NOUN},
-    #PRON
+    # PRON
     "PRON": {POS: PRON},
     "NPRP": {POS: PRON},
     # ADJ
@@ -28,7 +28,7 @@ TAG_MAP = {
     "ADVI": {POS: ADV},
     "ADVP": {POS: ADV},
     "ADVS": {POS: ADV},
-    # INT
+    # INT
     "INT": {POS: INTJ},
     # PRON
     "PROPN": {POS: PROPN},
@@ -50,20 +50,20 @@ TAG_MAP = {
     "NCNM": {POS: NUM},
     "NLBL": {POS: NUM},
     "DCNM": {POS: NUM},
-    # AUX
+    # AUX
     "AUX": {POS: AUX},
     "XVBM": {POS: AUX},
     "XVAM": {POS: AUX},
     "XVMM": {POS: AUX},
     "XVBB": {POS: AUX},
     "XVAE": {POS: AUX},
-    # ADP
+    # ADP
     "ADP": {POS: ADP},
     "RPRE": {POS: ADP},
     # CCONJ
     "CCONJ": {POS: CCONJ},
     "JCRG": {POS: CCONJ},
-    # SCONJ
+    # SCONJ
     "SCONJ": {POS: SCONJ},
     "PREL": {POS: SCONJ},
     "JSBR": {POS: SCONJ},
diff --git a/spacy/lang/th/tokenizer_exceptions.py b/spacy/lang/th/tokenizer_exceptions.py
index c31595893..ee14acf40 100644
--- a/spacy/lang/th/tokenizer_exceptions.py
+++ b/spacy/lang/th/tokenizer_exceptions.py
@@ -1,43 +1,23 @@
 # encoding: utf8
 from __future__ import unicode_literals
 
-from ...symbols import *
+from ...symbols import ORTH, LEMMA
 
-TOKENIZER_EXCEPTIONS = {
-    "ม.ค.": [
-        {ORTH: "ม.ค.", LEMMA: "มกราคม"}
-    ],
-    "ก.พ.": [
-        {ORTH: "ก.พ.", LEMMA: "กุมภาพันธ์"}
-    ],
-    "มี.ค.": [
-        {ORTH: "มี.ค.", LEMMA: "มีนาคม"}
-    ],
-    "เม.ย.": [
-        {ORTH: "เม.ย.", LEMMA: "เมษายน"}
-    ],
-    "พ.ค.": [
-        {ORTH: "พ.ค.", LEMMA: "พฤษภาคม"}
-    ],
-    "มิ.ย.": [
-        {ORTH: "มิ.ย.", LEMMA: "มิถุนายน"}
-    ],
-    "ก.ค.": [
-        {ORTH: "ก.ค.", LEMMA: "กรกฎาคม"}
-    ],
-    "ส.ค.": [
-        {ORTH: "ส.ค.", LEMMA: "สิงหาคม"}
-    ],
-    "ก.ย.": [
-        {ORTH: "ก.ย.", LEMMA: "กันยายน"}
-    ],
-    "ต.ค.": [
-        {ORTH: "ต.ค.", LEMMA: "ตุลาคม"}
-    ],
-    "พ.ย.": [
-        {ORTH: "พ.ย.", LEMMA: "พฤศจิกายน"}
-    ],
-    "ธ.ค.": [
-        {ORTH: "ธ.ค.", LEMMA: "ธันวาคม"}
-    ]
+
+_exc = {
+    "ม.ค.": [{ORTH: "ม.ค.", LEMMA: "มกราคม"}],
+    "ก.พ.": [{ORTH: "ก.พ.", LEMMA: "กุมภาพันธ์"}],
+    "มี.ค.": [{ORTH: "มี.ค.", LEMMA: "มีนาคม"}],
+    "เม.ย.": [{ORTH: "เม.ย.", LEMMA: "เมษายน"}],
+    "พ.ค.": [{ORTH: "พ.ค.", LEMMA: "พฤษภาคม"}],
+    "มิ.ย.": [{ORTH: "มิ.ย.", LEMMA: "มิถุนายน"}],
+    "ก.ค.": [{ORTH: "ก.ค.", LEMMA: "กรกฎาคม"}],
+    "ส.ค.": [{ORTH: "ส.ค.", LEMMA: "สิงหาคม"}],
+    "ก.ย.": [{ORTH: "ก.ย.", LEMMA: "กันยายน"}],
+    "ต.ค.": [{ORTH: "ต.ค.", LEMMA: "ตุลาคม"}],
+    "พ.ย.": [{ORTH: "พ.ย.", LEMMA: "พฤศจิกายน"}],
+    "ธ.ค.": [{ORTH: "ธ.ค.", LEMMA: "ธันวาคม"}]
 }
+
+
+TOKENIZER_EXCEPTIONS = _exc