Don't copy exception dicts if not necessary and tidy up

This commit is contained in:
ines 2017-10-31 21:05:29 +01:00
parent 3c8db3e4da
commit 7e424a1804
15 changed files with 39 additions and 60 deletions

View File

@@ -23,4 +23,4 @@ for exc_data in [
_exc[exc_data[ORTH]] = [dict(exc_data)] _exc[exc_data[ORTH]] = [dict(exc_data)]
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc

View File

@@ -30,4 +30,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc

View File

@@ -181,4 +181,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc

View File

@@ -456,4 +456,4 @@ for string in _exclude:
_exc.pop(string) _exc.pop(string)
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc

View File

@@ -54,4 +54,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc

View File

@@ -76,4 +76,4 @@ for exc_data in [
_exc[exc_data[ORTH]] = [dict(exc_data)] _exc[exc_data[ORTH]] = [dict(exc_data)]
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc

View File

@@ -147,5 +147,5 @@ _regular_exp += ["^{prefix}[{elision}][{alpha}][{alpha}{elision}{hyphen}\-]*$".f
_regular_exp.append(URL_PATTERN) _regular_exp.append(URL_PATTERN)
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc
TOKEN_MATCH = re.compile('|'.join('(?:{})'.format(m) for m in _regular_exp), re.IGNORECASE).match TOKEN_MATCH = re.compile('|'.join('(?:{})'.format(m) for m in _regular_exp), re.IGNORECASE).match

View File

@@ -95,5 +95,5 @@ _nums = "(({ne})|({t})|({on})|({c}))({s})?".format(
c=CURRENCY, s=_suffixes) c=CURRENCY, s=_suffixes)
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc
TOKEN_MATCH = re.compile("^({u})|({n})$".format(u=URL_PATTERN, n=_nums)).match TOKEN_MATCH = re.compile("^({u})|({n})$".format(u=URL_PATTERN, n=_nums)).match

View File

@@ -46,5 +46,4 @@ for orth in [
]: ]:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc

View File

@@ -35,4 +35,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc

View File

@@ -20,4 +20,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc

View File

@@ -72,4 +72,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc

View File

@@ -80,4 +80,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc) TOKENIZER_EXCEPTIONS = _exc

View File

@@ -2,8 +2,8 @@
# data from Korakot Chaovavanich (https://www.facebook.com/photo.php?fbid=390564854695031&set=p.390564854695031&type=3&permPage=1&ifg=1) # data from Korakot Chaovavanich (https://www.facebook.com/photo.php?fbid=390564854695031&set=p.390564854695031&type=3&permPage=1&ifg=1)
from __future__ import unicode_literals from __future__ import unicode_literals
from ...symbols import * from ...symbols import POS, NOUN, PRON, ADJ, ADV, INTJ, PROPN, DET, NUM, AUX
from ...symbols import ADP, CCONJ, PART, PUNCT, SPACE, SCONJ
TAG_MAP = { TAG_MAP = {
# NOUN # NOUN
"NOUN": {POS: NOUN}, "NOUN": {POS: NOUN},

View File

@@ -1,43 +1,23 @@
# encoding: utf8 # encoding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from ...symbols import * from ...symbols import ORTH, LEMMA
TOKENIZER_EXCEPTIONS = {
"ม.ค.": [ _exc = {
{ORTH: "ม.ค.", LEMMA: "มกราคม"} "ม.ค.": [{ORTH: "ม.ค.", LEMMA: "มกราคม"}],
], "ก.พ.": [{ORTH: "ก.พ.", LEMMA: "กุมภาพันธ์"}],
"ก.พ.": [ "มี.ค.": [{ORTH: "มี.ค.", LEMMA: "มีนาคม"}],
{ORTH: "ก.พ.", LEMMA: "กุมภาพันธ์"} "เม.ย.": [{ORTH: "เม.ย.", LEMMA: "เมษายน"}],
], "พ.ค.": [{ORTH: "พ.ค.", LEMMA: "พฤษภาคม"}],
"มี.ค.": [ "มิ.ย.": [{ORTH: "มิ.ย.", LEMMA: "มิถุนายน"}],
{ORTH: "มี.ค.", LEMMA: "มีนาคม"} "ก.ค.": [{ORTH: "ก.ค.", LEMMA: "กรกฎาคม"}],
], "ส.ค.": [{ORTH: "ส.ค.", LEMMA: "สิงหาคม"}],
"เม.ย.": [ "ก.ย.": [{ORTH: "ก.ย.", LEMMA: "กันยายน"}],
{ORTH: "เม.ย.", LEMMA: "เมษายน"} "ต.ค.": [{ORTH: "ต.ค.", LEMMA: "ตุลาคม"}],
], "พ.ย.": [{ORTH: "พ.ย.", LEMMA: "พฤศจิกายน"}],
"พ.ค.": [ "ธ.ค.": [{ORTH: "ธ.ค.", LEMMA: "ธันวาคม"}]
{ORTH: "พ.ค.", LEMMA: "พฤษภาคม"}
],
"มิ.ย.": [
{ORTH: "มิ.ย.", LEMMA: "มิถุนายน"}
],
"ก.ค.": [
{ORTH: "ก.ค.", LEMMA: "กรกฎาคม"}
],
"ส.ค.": [
{ORTH: "ส.ค.", LEMMA: "สิงหาคม"}
],
"ก.ย.": [
{ORTH: "ก.ย.", LEMMA: "กันยายน"}
],
"ต.ค.": [
{ORTH: "ต.ค.", LEMMA: "ตุลาคม"}
],
"พ.ย.": [
{ORTH: "พ.ย.", LEMMA: "พฤศจิกายน"}
],
"ธ.ค.": [
{ORTH: "ธ.ค.", LEMMA: "ธันวาคม"}
]
} }
TOKENIZER_EXCEPTIONS = _exc