Reorganise Spanish language data

ines 2017-05-08 15:48:04 +02:00
parent c7c21b980f
commit 8e483ec950
4 changed files with 71 additions and 97 deletions

View File

@@ -1,14 +1,17 @@
 # coding: utf8
-from __future__ import unicode_literals, print_function
+from __future__ import unicode_literals
-from os import path
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .tag_map import TAG_MAP
+from .stop_words import STOP_WORDS
+from .lemmatizer import LOOKUP
+from ..language_data import BASE_EXCEPTIONS
 from ..language import Language
-from ..attrs import LANG
-from .language_data import *
 from ..lemmatizerlookup import Lemmatizer
-from .lemmatization import LOOK_UP
+from ..attrs import LANG
+from ..util import update_exc
 class Spanish(Language):
     lang = 'es'

@@ -17,10 +20,13 @@ class Spanish(Language):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'es'
-    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
-    tag_map = TAG_MAP
-    stop_words = STOP_WORDS
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tag_map = dict(TAG_MAP)
+    stop_words = set(STOP_WORDS)
+    @classmethod
+    def create_lemmatizer(cls, nlp=None):
+        return Lemmatizer(LOOKUP)
-EXPORT = Spanish
+__all__ = ['Spanish']
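
Note: the defaults above now build the exception table by merging the shared BASE_EXCEPTIONS with the Spanish-specific TOKENIZER_EXCEPTIONS via update_exc, and wire lemmatization to the lookup table through create_lemmatizer. Below is a minimal sketch of what an update_exc-style merge is expected to do; merge_exceptions is a hypothetical stand-in, and the real helper in ..util may add validation on top of the plain overlay shown here.

# Hypothetical stand-in for an update_exc-style merge: copy the shared
# base table, then let the language-specific entries overwrite or extend it.
def merge_exceptions(base, *additions):
    exc = dict(base)
    for addition in additions:
        for orth, substrings in addition.items():
            exc[orth] = substrings
    return exc

# Usage mirroring the attribute above (names as in the diff):
# tokenizer_exceptions = merge_exceptions(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)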

View File

@@ -1,7 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals
-LOOK_UP = {
+LOOKUP = {
     "aba": "abar",
     "ababa": "abar",
     "ababais": "abar",

View File

@@ -1,8 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
-from ..symbols import *
 TAG_MAP = {
     "ADJ___": {"morph": "_", "pos": "ADJ"},

View File

@@ -1,113 +1,82 @@
 # coding: utf8
 from __future__ import unicode_literals
-from ..symbols import *
-from ..language_data import PRON_LEMMA, DET_LEMMA
+from ..symbols import ORTH, LEMMA, TAG, NORM, ADP, DET
+from ..deprecated import PRON_LEMMA, DET_LEMMA
-TOKENIZER_EXCEPTIONS = {
+_exc = {
     "al": [
         {ORTH: "a", LEMMA: "a", TAG: ADP},
-        {ORTH: "el", LEMMA: "el", TAG: DET}
-    ],
+        {ORTH: "l", LEMMA: "el", TAG: DET}],
     "consigo": [
         {ORTH: "con", LEMMA: "con"},
-        {ORTH: "sigo", LEMMA: PRON_LEMMA, NORM: ""}
-    ],
+        {ORTH: "sigo", LEMMA: PRON_LEMMA, NORM: ""}],
     "conmigo": [
         {ORTH: "con", LEMMA: "con"},
-        {ORTH: "migo", LEMMA: PRON_LEMMA, NORM: ""}
-    ],
+        {ORTH: "migo", LEMMA: PRON_LEMMA, NORM: ""}],
     "contigo": [
         {ORTH: "con", LEMMA: "con"},
-        {ORTH: "tigo", LEMMA: PRON_LEMMA, NORM: "ti"}
-    ],
+        {ORTH: "tigo", LEMMA: PRON_LEMMA, NORM: "ti"}],
     "del": [
         {ORTH: "de", LEMMA: "de", TAG: ADP},
-        {ORTH: "l", LEMMA: "el", TAG: DET}
-    ],
+        {ORTH: "l", LEMMA: "el", TAG: DET}],
     "pel": [
         {ORTH: "pe", LEMMA: "per", TAG: ADP},
-        {ORTH: "l", LEMMA: "el", TAG: DET}
-    ],
+        {ORTH: "l", LEMMA: "el", TAG: DET}],
     "pal": [
         {ORTH: "pa", LEMMA: "para"},
-        {ORTH: "l", LEMMA: DET_LEMMA, NORM: "el"}
-    ],
+        {ORTH: "l", LEMMA: DET_LEMMA, NORM: "el"}],
     "pala": [
         {ORTH: "pa", LEMMA: "para"},
-        {ORTH: "la", LEMMA: DET_LEMMA}
-    ],
-    "aprox.": [
-        {ORTH: "aprox.", LEMMA: "aproximadamente"}
-    ],
-    "dna.": [
-        {ORTH: "dna.", LEMMA: "docena"}
-    ],
-    "esq.": [
-        {ORTH: "esq.", LEMMA: "esquina"}
-    ],
-    "pág.": [
-        {ORTH: "pág.", LEMMA: "página"}
-    ],
-    "p.ej.": [
-        {ORTH: "p.ej.", LEMMA: "por ejemplo"}
-    ],
-    "Ud.": [
-        {ORTH: "Ud.", LEMMA: PRON_LEMMA, NORM: "usted"}
-    ],
-    "Vd.": [
-        {ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"}
-    ],
-    "Uds.": [
-        {ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}
-    ],
-    "Vds.": [
-        {ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}
-    ]
+        {ORTH: "la", LEMMA: DET_LEMMA}]
 }
-ORTH_ONLY = [
-    "a.C.",
-    "a.J.C.",
-    "apdo.",
-    "Av.",
-    "Avda.",
-    "Cía.",
-    "etc.",
-    "Gob.",
-    "Gral.",
-    "Ing.",
-    "J.C.",
-    "Lic.",
-    "m.n.",
-    "no.",
-    "núm.",
-    "P.D.",
-    "Prof.",
-    "Profa.",
-    "q.e.p.d."
-    "S.A.",
-    "S.L.",
-    "s.s.s.",
-    "Sr.",
-    "Sra.",
-    "Srta."
-]
+for exc_data in [
+    {ORTH: "aprox.", LEMMA: "aproximadamente"},
+    {ORTH: "dna.", LEMMA: "docena"},
+    {ORTH: "esq.", LEMMA: "esquina"},
+    {ORTH: "pág.", LEMMA: "página"},
+    {ORTH: "p.ej.", LEMMA: "por ejemplo"},
+    {ORTH: "Ud.", LEMMA: PRON_LEMMA, NORM: "usted"},
+    {ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"},
+    {ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"},
+    {ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}]:
+    _exc[exc_data[ORTH]] = [dict(exc_data)]
+# Times
+_exc["12m."] = [
+    {ORTH: "12"},
+    {ORTH: "m.", LEMMA: "p.m."}]
+for h in range(1, 12 + 1):
+    hour = str(h)
+    for period in ["a.m.", "am"]:
+        _exc[hour+period] = [
+            {ORTH: hour},
+            {ORTH: period, LEMMA: "a.m."}]
+    for period in ["p.m.", "pm"]:
+        _exc[hour+period] = [
+            {ORTH: hour},
+            {ORTH: period, LEMMA: "p.m."}]
+for orth in [
+    "a.C.", "a.J.C.", "apdo.", "Av.", "Avda.", "Cía.", "etc.", "Gob.", "Gral.",
+    "Ing.", "J.C.", "Lic.", "m.n.", "no.", "núm.", "P.D.", "Prof.", "Profa.",
+    "q.e.p.d.", "S.A.", "S.L.", "s.s.s.", "Sr.", "Sra.", "Srta."]:
+    _exc[orth] = [{ORTH: orth}]
+TOKENIZER_EXCEPTIONS = dict(_exc)
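
Note: in the new layout, every exception maps a surface string to a list of sub-token dicts whose ORTH values concatenate back to the key ("del" -> "de" + "l"). The first loop turns single-token abbreviations with lemmas into such entries, the hour loop generates forms like "11a.m." and "3pm", and the final loop adds abbreviations that stay a single token. Below is a rough sketch of how a tokenizer might consume such a table; split_with_exceptions is illustrative only, not spaCy's tokenizer.

# Illustrative consumer of a tokenizer-exception table.
EXC = {
    "del": [{"ORTH": "de"}, {"ORTH": "l"}],
    "11a.m.": [{"ORTH": "11"}, {"ORTH": "a.m."}],
}

def split_with_exceptions(words, exc):
    out = []
    for word in words:
        # Replace a matched word by its listed sub-tokens, otherwise keep it.
        if word in exc:
            out.extend(sub["ORTH"] for sub in exc[word])
        else:
            out.append(word)
    return out

print(split_with_exceptions(["sale", "del", "cine", "a", "las", "11a.m."], EXC))
# ['sale', 'de', 'l', 'cine', 'a', 'las', '11', 'a.m.']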