Don't copy exception dicts if not necessary and tidy up

This commit is contained in:
ines 2017-10-31 21:05:29 +01:00
parent 3c8db3e4da
commit 7e424a1804
15 changed files with 39 additions and 60 deletions

View File

@ -23,4 +23,4 @@ for exc_data in [
_exc[exc_data[ORTH]] = [dict(exc_data)]
TOKENIZER_EXCEPTIONS = dict(_exc)
TOKENIZER_EXCEPTIONS = _exc

View File

@ -30,4 +30,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc)
TOKENIZER_EXCEPTIONS = _exc

View File

@ -181,4 +181,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc)
TOKENIZER_EXCEPTIONS = _exc

View File

@ -456,4 +456,4 @@ for string in _exclude:
_exc.pop(string)
TOKENIZER_EXCEPTIONS = dict(_exc)
TOKENIZER_EXCEPTIONS = _exc

View File

@ -54,4 +54,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc)
TOKENIZER_EXCEPTIONS = _exc

View File

@ -76,4 +76,4 @@ for exc_data in [
_exc[exc_data[ORTH]] = [dict(exc_data)]
TOKENIZER_EXCEPTIONS = dict(_exc)
TOKENIZER_EXCEPTIONS = _exc

View File

@ -147,5 +147,5 @@ _regular_exp += ["^{prefix}[{elision}][{alpha}][{alpha}{elision}{hyphen}\-]*$".f
_regular_exp.append(URL_PATTERN)
TOKENIZER_EXCEPTIONS = dict(_exc)
TOKENIZER_EXCEPTIONS = _exc
TOKEN_MATCH = re.compile('|'.join('(?:{})'.format(m) for m in _regular_exp), re.IGNORECASE).match

View File

@ -95,5 +95,5 @@ _nums = "(({ne})|({t})|({on})|({c}))({s})?".format(
c=CURRENCY, s=_suffixes)
TOKENIZER_EXCEPTIONS = dict(_exc)
TOKENIZER_EXCEPTIONS = _exc
TOKEN_MATCH = re.compile("^({u})|({n})$".format(u=URL_PATTERN, n=_nums)).match

View File

@ -46,5 +46,4 @@ for orth in [
]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc)
TOKENIZER_EXCEPTIONS = _exc

View File

@ -35,4 +35,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc)
TOKENIZER_EXCEPTIONS = _exc

View File

@ -20,4 +20,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc)
TOKENIZER_EXCEPTIONS = _exc

View File

@ -72,4 +72,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc)
TOKENIZER_EXCEPTIONS = _exc

View File

@ -80,4 +80,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc)
TOKENIZER_EXCEPTIONS = _exc

View File

@ -2,10 +2,10 @@
# data from Korakot Chaovavanich (https://www.facebook.com/photo.php?fbid=390564854695031&set=p.390564854695031&type=3&permPage=1&ifg=1)
from __future__ import unicode_literals
from ...symbols import *
from ...symbols import POS, NOUN, PRON, ADJ, ADV, INTJ, PROPN, DET, NUM, AUX
from ...symbols import ADP, CCONJ, PART, PUNCT, SPACE, SCONJ
TAG_MAP = {
#NOUN
# NOUN
"NOUN": {POS: NOUN},
"NCMN": {POS: NOUN},
"NTTL": {POS: NOUN},
@ -14,7 +14,7 @@ TAG_MAP = {
"CMTR": {POS: NOUN},
"CFQC": {POS: NOUN},
"CVBL": {POS: NOUN},
#PRON
# PRON
"PRON": {POS: PRON},
"NPRP": {POS: PRON},
# ADJ
@ -28,7 +28,7 @@ TAG_MAP = {
"ADVI": {POS: ADV},
"ADVP": {POS: ADV},
"ADVS": {POS: ADV},
# INT
# INT
"INT": {POS: INTJ},
# PROPN
"PROPN": {POS: PROPN},
@ -50,20 +50,20 @@ TAG_MAP = {
"NCNM": {POS: NUM},
"NLBL": {POS: NUM},
"DCNM": {POS: NUM},
# AUX
# AUX
"AUX": {POS: AUX},
"XVBM": {POS: AUX},
"XVAM": {POS: AUX},
"XVMM": {POS: AUX},
"XVBB": {POS: AUX},
"XVAE": {POS: AUX},
# ADP
# ADP
"ADP": {POS: ADP},
"RPRE": {POS: ADP},
# CCONJ
"CCONJ": {POS: CCONJ},
"JCRG": {POS: CCONJ},
# SCONJ
# SCONJ
"SCONJ": {POS: SCONJ},
"PREL": {POS: SCONJ},
"JSBR": {POS: SCONJ},

View File

@ -1,43 +1,23 @@
# encoding: utf8
from __future__ import unicode_literals
from ...symbols import *
from ...symbols import ORTH, LEMMA
TOKENIZER_EXCEPTIONS = {
"ม.ค.": [
{ORTH: "ม.ค.", LEMMA: "มกราคม"}
],
"ก.พ.": [
{ORTH: "ก.พ.", LEMMA: "กุมภาพันธ์"}
],
"มี.ค.": [
{ORTH: "มี.ค.", LEMMA: "มีนาคม"}
],
"เม.ย.": [
{ORTH: "เม.ย.", LEMMA: "เมษายน"}
],
"พ.ค.": [
{ORTH: "พ.ค.", LEMMA: "พฤษภาคม"}
],
"มิ.ย.": [
{ORTH: "มิ.ย.", LEMMA: "มิถุนายน"}
],
"ก.ค.": [
{ORTH: "ก.ค.", LEMMA: "กรกฎาคม"}
],
"ส.ค.": [
{ORTH: "ส.ค.", LEMMA: "สิงหาคม"}
],
"ก.ย.": [
{ORTH: "ก.ย.", LEMMA: "กันยายน"}
],
"ต.ค.": [
{ORTH: "ต.ค.", LEMMA: "ตุลาคม"}
],
"พ.ย.": [
{ORTH: "พ.ย.", LEMMA: "พฤศจิกายน"}
],
"ธ.ค.": [
{ORTH: "ธ.ค.", LEMMA: "ธันวาคม"}
]
_exc = {
"ม.ค.": [{ORTH: "ม.ค.", LEMMA: "มกราคม"}],
"ก.พ.": [{ORTH: "ก.พ.", LEMMA: "กุมภาพันธ์"}],
"มี.ค.": [{ORTH: "มี.ค.", LEMMA: "มีนาคม"}],
"เม.ย.": [{ORTH: "เม.ย.", LEMMA: "เมษายน"}],
"พ.ค.": [{ORTH: "พ.ค.", LEMMA: "พฤษภาคม"}],
"มิ.ย.": [{ORTH: "มิ.ย.", LEMMA: "มิถุนายน"}],
"ก.ค.": [{ORTH: "ก.ค.", LEMMA: "กรกฎาคม"}],
"ส.ค.": [{ORTH: "ส.ค.", LEMMA: "สิงหาคม"}],
"ก.ย.": [{ORTH: "ก.ย.", LEMMA: "กันยายน"}],
"ต.ค.": [{ORTH: "ต.ค.", LEMMA: "ตุลาคม"}],
"พ.ย.": [{ORTH: "พ.ย.", LEMMA: "พฤศจิกายน"}],
"ธ.ค.": [{ORTH: "ธ.ค.", LEMMA: "ธันวาคม"}]
}
TOKENIZER_EXCEPTIONS = _exc