Don't copy exception dicts if not necessary and tidy up

This commit is contained in:
ines 2017-10-31 21:05:29 +01:00
parent 3c8db3e4da
commit 7e424a1804
15 changed files with 39 additions and 60 deletions

View File

@ -23,4 +23,4 @@ for exc_data in [
_exc[exc_data[ORTH]] = [dict(exc_data)] _exc[exc_data[ORTH]] = [dict(exc_data)]
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc

View File

@ -30,4 +30,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc

View File

@ -181,4 +181,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc

View File

@ -456,4 +456,4 @@ for string in _exclude:
_exc.pop(string) _exc.pop(string)
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc

View File

@ -54,4 +54,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc

View File

@ -76,4 +76,4 @@ for exc_data in [
_exc[exc_data[ORTH]] = [dict(exc_data)] _exc[exc_data[ORTH]] = [dict(exc_data)]
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc

View File

@ -147,5 +147,5 @@ _regular_exp += ["^{prefix}[{elision}][{alpha}][{alpha}{elision}{hyphen}\-]*$".f
_regular_exp.append(URL_PATTERN) _regular_exp.append(URL_PATTERN)
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc
TOKEN_MATCH = re.compile('|'.join('(?:{})'.format(m) for m in _regular_exp), re.IGNORECASE).match TOKEN_MATCH = re.compile('|'.join('(?:{})'.format(m) for m in _regular_exp), re.IGNORECASE).match

View File

@ -95,5 +95,5 @@ _nums = "(({ne})|({t})|({on})|({c}))({s})?".format(
c=CURRENCY, s=_suffixes) c=CURRENCY, s=_suffixes)
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc
TOKEN_MATCH = re.compile("^({u})|({n})$".format(u=URL_PATTERN, n=_nums)).match TOKEN_MATCH = re.compile("^({u})|({n})$".format(u=URL_PATTERN, n=_nums)).match

View File

@ -46,5 +46,4 @@ for orth in [
]: ]:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc

View File

@ -35,4 +35,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc

View File

@ -20,4 +20,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc

View File

@ -72,4 +72,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc

View File

@ -80,4 +80,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc

View File

@ -2,10 +2,10 @@
# data from Korakot Chaovavanich (https://www.facebook.com/photo.php?fbid=390564854695031&set=p.390564854695031&type=3&permPage=1&ifg=1) # data from Korakot Chaovavanich (https://www.facebook.com/photo.php?fbid=390564854695031&set=p.390564854695031&type=3&permPage=1&ifg=1)
from __future__ import unicode_literals from __future__ import unicode_literals
from ...symbols import * from ...symbols import POS, NOUN, PRON, ADJ, ADV, INTJ, PROPN, DET, NUM, AUX
from ...symbols import ADP, CCONJ, PART, PUNCT, SPACE, SCONJ
TAG_MAP = { TAG_MAP = {
#NOUN # NOUN
"NOUN": {POS: NOUN}, "NOUN": {POS: NOUN},
"NCMN": {POS: NOUN}, "NCMN": {POS: NOUN},
"NTTL": {POS: NOUN}, "NTTL": {POS: NOUN},
@ -14,7 +14,7 @@ TAG_MAP = {
"CMTR": {POS: NOUN}, "CMTR": {POS: NOUN},
"CFQC": {POS: NOUN}, "CFQC": {POS: NOUN},
"CVBL": {POS: NOUN}, "CVBL": {POS: NOUN},
#PRON # PRON
"PRON": {POS: PRON}, "PRON": {POS: PRON},
"NPRP": {POS: PRON}, "NPRP": {POS: PRON},
# ADJ # ADJ
@ -28,7 +28,7 @@ TAG_MAP = {
"ADVI": {POS: ADV}, "ADVI": {POS: ADV},
"ADVP": {POS: ADV}, "ADVP": {POS: ADV},
"ADVS": {POS: ADV}, "ADVS": {POS: ADV},
# INT # INT
"INT": {POS: INTJ}, "INT": {POS: INTJ},
# PROPN # PROPN
"PROPN": {POS: PROPN}, "PROPN": {POS: PROPN},
@ -50,20 +50,20 @@ TAG_MAP = {
"NCNM": {POS: NUM}, "NCNM": {POS: NUM},
"NLBL": {POS: NUM}, "NLBL": {POS: NUM},
"DCNM": {POS: NUM}, "DCNM": {POS: NUM},
# AUX # AUX
"AUX": {POS: AUX}, "AUX": {POS: AUX},
"XVBM": {POS: AUX}, "XVBM": {POS: AUX},
"XVAM": {POS: AUX}, "XVAM": {POS: AUX},
"XVMM": {POS: AUX}, "XVMM": {POS: AUX},
"XVBB": {POS: AUX}, "XVBB": {POS: AUX},
"XVAE": {POS: AUX}, "XVAE": {POS: AUX},
# ADP # ADP
"ADP": {POS: ADP}, "ADP": {POS: ADP},
"RPRE": {POS: ADP}, "RPRE": {POS: ADP},
# CCONJ # CCONJ
"CCONJ": {POS: CCONJ}, "CCONJ": {POS: CCONJ},
"JCRG": {POS: CCONJ}, "JCRG": {POS: CCONJ},
# SCONJ # SCONJ
"SCONJ": {POS: SCONJ}, "SCONJ": {POS: SCONJ},
"PREL": {POS: SCONJ}, "PREL": {POS: SCONJ},
"JSBR": {POS: SCONJ}, "JSBR": {POS: SCONJ},

View File

@ -1,43 +1,23 @@
# encoding: utf8 # encoding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from ...symbols import * from ...symbols import ORTH, LEMMA
TOKENIZER_EXCEPTIONS = {
"ม.ค.": [ _exc = {
{ORTH: "ม.ค.", LEMMA: "มกราคม"} "ม.ค.": [{ORTH: "ม.ค.", LEMMA: "มกราคม"}],
], "ก.พ.": [{ORTH: "ก.พ.", LEMMA: "กุมภาพันธ์"}],
"ก.พ.": [ "มี.ค.": [{ORTH: "มี.ค.", LEMMA: "มีนาคม"}],
{ORTH: "ก.พ.", LEMMA: "กุมภาพันธ์"} "เม.ย.": [{ORTH: "เม.ย.", LEMMA: "เมษายน"}],
], "พ.ค.": [{ORTH: "พ.ค.", LEMMA: "พฤษภาคม"}],
"มี.ค.": [ "มิ.ย.": [{ORTH: "มิ.ย.", LEMMA: "มิถุนายน"}],
{ORTH: "มี.ค.", LEMMA: "มีนาคม"} "ก.ค.": [{ORTH: "ก.ค.", LEMMA: "กรกฎาคม"}],
], "ส.ค.": [{ORTH: "ส.ค.", LEMMA: "สิงหาคม"}],
"เม.ย.": [ "ก.ย.": [{ORTH: "ก.ย.", LEMMA: "กันยายน"}],
{ORTH: "เม.ย.", LEMMA: "เมษายน"} "ต.ค.": [{ORTH: "ต.ค.", LEMMA: "ตุลาคม"}],
], "พ.ย.": [{ORTH: "พ.ย.", LEMMA: "พฤศจิกายน"}],
"พ.ค.": [ "ธ.ค.": [{ORTH: "ธ.ค.", LEMMA: "ธันวาคม"}]
{ORTH: "พ.ค.", LEMMA: "พฤษภาคม"}
],
"มิ.ย.": [
{ORTH: "มิ.ย.", LEMMA: "มิถุนายน"}
],
"ก.ค.": [
{ORTH: "ก.ค.", LEMMA: "กรกฎาคม"}
],
"ส.ค.": [
{ORTH: "ส.ค.", LEMMA: "สิงหาคม"}
],
"ก.ย.": [
{ORTH: "ก.ย.", LEMMA: "กันยายน"}
],
"ต.ค.": [
{ORTH: "ต.ค.", LEMMA: "ตุลาคม"}
],
"พ.ย.": [
{ORTH: "พ.ย.", LEMMA: "พฤศจิกายน"}
],
"ธ.ค.": [
{ORTH: "ธ.ค.", LEMMA: "ธันวาคม"}
]
} }
TOKENIZER_EXCEPTIONS = _exc