mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 17:54:39 +03:00
Reorganise Spanish language data
This commit is contained in:
parent
c7c21b980f
commit
8e483ec950
|
@ -1,14 +1,17 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from os import path
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
from .tag_map import TAG_MAP
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
from .lemmatizer import LOOKUP
|
||||||
|
|
||||||
|
from ..language_data import BASE_EXCEPTIONS
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..attrs import LANG
|
|
||||||
|
|
||||||
from .language_data import *
|
|
||||||
from ..lemmatizerlookup import Lemmatizer
|
from ..lemmatizerlookup import Lemmatizer
|
||||||
from .lemmatization import LOOK_UP
|
from ..attrs import LANG
|
||||||
|
from ..util import update_exc
|
||||||
|
|
||||||
|
|
||||||
class Spanish(Language):
|
class Spanish(Language):
|
||||||
lang = 'es'
|
lang = 'es'
|
||||||
|
@ -17,10 +20,13 @@ class Spanish(Language):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters[LANG] = lambda text: 'es'
|
lex_attr_getters[LANG] = lambda text: 'es'
|
||||||
|
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
tag_map = TAG_MAP
|
tag_map = dict(TAG_MAP)
|
||||||
stop_words = STOP_WORDS
|
stop_words = set(STOP_WORDS)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def create_lemmatizer(cls, nlp=None):
|
||||||
|
return Lemmatizer(LOOKUP)
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ['Spanish']
|
||||||
EXPORT = Spanish
|
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
LOOK_UP = {
|
|
||||||
|
LOOKUP = {
|
||||||
"aba": "abar",
|
"aba": "abar",
|
||||||
"ababa": "abar",
|
"ababa": "abar",
|
||||||
"ababais": "abar",
|
"ababais": "abar",
|
|
@ -1,8 +1,6 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..symbols import *
|
|
||||||
|
|
||||||
|
|
||||||
TAG_MAP = {
|
TAG_MAP = {
|
||||||
"ADJ___": {"morph": "_", "pos": "ADJ"},
|
"ADJ___": {"morph": "_", "pos": "ADJ"},
|
||||||
|
|
|
@ -1,113 +1,82 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..symbols import *
|
from ..symbols import ORTH, LEMMA, TAG, NORM, ADP, DET
|
||||||
from ..language_data import PRON_LEMMA, DET_LEMMA
|
from ..deprecated import PRON_LEMMA, DET_LEMMA
|
||||||
|
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = {
|
_exc = {
|
||||||
"al": [
|
"al": [
|
||||||
{ORTH: "a", LEMMA: "a", TAG: ADP},
|
{ORTH: "a", LEMMA: "a", TAG: ADP},
|
||||||
{ORTH: "el", LEMMA: "el", TAG: DET}
|
{ORTH: "l", LEMMA: "el", TAG: DET}],
|
||||||
],
|
|
||||||
|
|
||||||
"consigo": [
|
"consigo": [
|
||||||
{ORTH: "con", LEMMA: "con"},
|
{ORTH: "con", LEMMA: "con"},
|
||||||
{ORTH: "sigo", LEMMA: PRON_LEMMA, NORM: "sí"}
|
{ORTH: "sigo", LEMMA: PRON_LEMMA, NORM: "sí"}],
|
||||||
],
|
|
||||||
|
|
||||||
"conmigo": [
|
"conmigo": [
|
||||||
{ORTH: "con", LEMMA: "con"},
|
{ORTH: "con", LEMMA: "con"},
|
||||||
{ORTH: "migo", LEMMA: PRON_LEMMA, NORM: "mí"}
|
{ORTH: "migo", LEMMA: PRON_LEMMA, NORM: "mí"}],
|
||||||
],
|
|
||||||
|
|
||||||
"contigo": [
|
"contigo": [
|
||||||
{ORTH: "con", LEMMA: "con"},
|
{ORTH: "con", LEMMA: "con"},
|
||||||
{ORTH: "tigo", LEMMA: PRON_LEMMA, NORM: "ti"}
|
{ORTH: "tigo", LEMMA: PRON_LEMMA, NORM: "ti"}],
|
||||||
],
|
|
||||||
|
|
||||||
"del": [
|
"del": [
|
||||||
{ORTH: "de", LEMMA: "de", TAG: ADP},
|
{ORTH: "de", LEMMA: "de", TAG: ADP},
|
||||||
{ORTH: "l", LEMMA: "el", TAG: DET}
|
{ORTH: "l", LEMMA: "el", TAG: DET}],
|
||||||
],
|
|
||||||
|
|
||||||
"pel": [
|
"pel": [
|
||||||
{ORTH: "pe", LEMMA: "per", TAG: ADP},
|
{ORTH: "pe", LEMMA: "per", TAG: ADP},
|
||||||
{ORTH: "l", LEMMA: "el", TAG: DET}
|
{ORTH: "l", LEMMA: "el", TAG: DET}],
|
||||||
],
|
|
||||||
|
|
||||||
"pal": [
|
"pal": [
|
||||||
{ORTH: "pa", LEMMA: "para"},
|
{ORTH: "pa", LEMMA: "para"},
|
||||||
{ORTH: "l", LEMMA: DET_LEMMA, NORM: "el"}
|
{ORTH: "l", LEMMA: DET_LEMMA, NORM: "el"}],
|
||||||
],
|
|
||||||
|
|
||||||
"pala": [
|
"pala": [
|
||||||
{ORTH: "pa", LEMMA: "para"},
|
{ORTH: "pa", LEMMA: "para"},
|
||||||
{ORTH: "la", LEMMA: DET_LEMMA}
|
{ORTH: "la", LEMMA: DET_LEMMA}]
|
||||||
],
|
|
||||||
|
|
||||||
"aprox.": [
|
|
||||||
{ORTH: "aprox.", LEMMA: "aproximadamente"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"dna.": [
|
|
||||||
{ORTH: "dna.", LEMMA: "docena"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"esq.": [
|
|
||||||
{ORTH: "esq.", LEMMA: "esquina"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"pág.": [
|
|
||||||
{ORTH: "pág.", LEMMA: "página"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"p.ej.": [
|
|
||||||
{ORTH: "p.ej.", LEMMA: "por ejemplo"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Ud.": [
|
|
||||||
{ORTH: "Ud.", LEMMA: PRON_LEMMA, NORM: "usted"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Vd.": [
|
|
||||||
{ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Uds.": [
|
|
||||||
{ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}
|
|
||||||
],
|
|
||||||
|
|
||||||
"Vds.": [
|
|
||||||
{ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}
|
|
||||||
]
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
ORTH_ONLY = [
|
for exc_data in [
|
||||||
"a.C.",
|
{ORTH: "aprox.", LEMMA: "aproximadamente"},
|
||||||
"a.J.C.",
|
{ORTH: "dna.", LEMMA: "docena"},
|
||||||
"apdo.",
|
{ORTH: "esq.", LEMMA: "esquina"},
|
||||||
"Av.",
|
{ORTH: "pág.", LEMMA: "página"},
|
||||||
"Avda.",
|
{ORTH: "p.ej.", LEMMA: "por ejemplo"},
|
||||||
"Cía.",
|
{ORTH: "Ud.", LEMMA: PRON_LEMMA, NORM: "usted"},
|
||||||
"etc.",
|
{ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"},
|
||||||
"Gob.",
|
{ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"},
|
||||||
"Gral.",
|
{ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}]:
|
||||||
"Ing.",
|
_exc[exc_data[ORTH]] = [dict(exc_data)]
|
||||||
"J.C.",
|
|
||||||
"Lic.",
|
|
||||||
"m.n.",
|
# Times
|
||||||
"no.",
|
|
||||||
"núm.",
|
_exc["12m."] = [
|
||||||
"P.D.",
|
{ORTH: "12"},
|
||||||
"Prof.",
|
{ORTH: "m.", LEMMA: "p.m."}]
|
||||||
"Profa.",
|
|
||||||
"q.e.p.d."
|
|
||||||
"S.A.",
|
for h in range(1, 12 + 1):
|
||||||
"S.L.",
|
hour = str(h)
|
||||||
"s.s.s.",
|
for period in ["a.m.", "am"]:
|
||||||
"Sr.",
|
_exc[hour+period] = [
|
||||||
"Sra.",
|
{ORTH: hour},
|
||||||
"Srta."
|
{ORTH: period, LEMMA: "a.m."}]
|
||||||
]
|
for period in ["p.m.", "pm"]:
|
||||||
|
_exc[hour+period] = [
|
||||||
|
{ORTH: hour},
|
||||||
|
{ORTH: period, LEMMA: "p.m."}]
|
||||||
|
|
||||||
|
|
||||||
|
for orth in [
|
||||||
|
"a.C.", "a.J.C.", "apdo.", "Av.", "Avda.", "Cía.", "etc.", "Gob.", "Gral.",
|
||||||
|
"Ing.", "J.C.", "Lic.", "m.n.", "no.", "núm.", "P.D.", "Prof.", "Profa.",
|
||||||
|
"q.e.p.d.", "S.A.", "S.L.", "s.s.s.", "Sr.", "Sra.", "Srta."]:
|
||||||
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
|
||||||
|
TOKENIZER_EXCEPTIONS = dict(_exc)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user