feature(model): Add support for creating the Spanish model, including rich tagset, configuration, and basic tests

This commit is contained in:
oeg 2017-04-06 18:48:45 +02:00
parent 010293fb2f
commit c693d40791
7 changed files with 376 additions and 742 deletions

View File

@ -17,4 +17,5 @@ class Spanish(Language):
lex_attr_getters[LANG] = lambda text: 'es' lex_attr_getters[LANG] = lambda text: 'es'
tokenizer_exceptions = TOKENIZER_EXCEPTIONS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
tag_map = TAG_MAP
stop_words = STOP_WORDS stop_words = STOP_WORDS

View File

@ -5,6 +5,7 @@ from .. import language_data as base
from ..language_data import update_exc, strings_to_exc from ..language_data import update_exc, strings_to_exc
from ..symbols import ORTH, LEMMA from ..symbols import ORTH, LEMMA
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
@ -39,7 +40,7 @@ def get_time_exc(hours):
] ]
return exc return exc
TAG_MAP = dict(TAG_MAP)
STOP_WORDS = set(STOP_WORDS) STOP_WORDS = set(STOP_WORDS)
@ -51,4 +52,4 @@ update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] __all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS"]

File diff suppressed because it is too large Load Diff

View File

@ -49,6 +49,10 @@ def en_vocab():
def en_parser(): def en_parser():
return English.Defaults.create_parser() return English.Defaults.create_parser()
@pytest.fixture
def es_tokenizer():
    """Provide a tokenizer created from the Spanish language defaults."""
    tokenizer = Spanish.Defaults.create_tokenizer()
    return tokenizer
@pytest.fixture @pytest.fixture
def de_tokenizer(): def de_tokenizer():

View File

View File

@ -0,0 +1,24 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text,lemma', [
    ("aprox.", "aproximadamente"),
    ("esq.", "esquina"),
    ("pág.", "página"),
    ("p.ej.", "por ejemplo"),
])
def test_tokenizer_handles_abbr(es_tokenizer, text, lemma):
    """Check that an abbreviation yields exactly one token with the expected lemma."""
    doc = es_tokenizer(text)
    assert len(doc) == 1
    only_token = doc[0]
    assert only_token.lemma_ == lemma
def test_tokenizer_handles_exc_in_text(es_tokenizer):
    """Check that an abbreviation inside a sentence stays one token with its lemma."""
    sentence = "Mariano Rajoy ha corrido aprox. medio kilómetro"
    doc = es_tokenizer(sentence)
    assert len(doc) == 7
    # The abbreviation is the fifth token of the sentence (index 4).
    abbreviation = doc[4]
    assert abbreviation.text == "aprox."
    assert abbreviation.lemma_ == "aproximadamente"

View File

@ -0,0 +1,35 @@
# coding: utf-8
"""Test that longer and mixed texts are tokenized correctly."""
from __future__ import unicode_literals
import pytest
def test_tokenizer_handles_long_text(es_tokenizer):
    """Check the total token count produced for a multi-line Spanish paragraph."""
    # The literal is kept exactly as written: its line breaks are part of the
    # input handed to the tokenizer.
    paragraph = """Cuando a José Mujica lo invitaron a dar una conferencia
en Oxford este verano, su cabeza hizo "crac". La "más antigua" universidad de habla
inglesa, esa que cobra decenas de miles de euros de matrícula a sus alumnos
y en cuyos salones han disertado desde Margaret Thatcher hasta Stephen Hawking,
reclamaba los servicios de este viejo de 81 años, formado en un colegio público
en Montevideo y que pregona las bondades de la vida austera."""
    doc = es_tokenizer(paragraph)
    assert len(doc) == 90
@pytest.mark.parametrize('text,length', [
    ("¿Por qué José Mujica?", 6),
    ("“¿Oh no?”", 6),
    ("""¡Sí! "Vámonos", contestó José Arcadio Buendía""", 11),
    ("Corrieron aprox. 10km.", 5),
    ("Y entonces por qué...", 5),
])
def test_tokenizer_handles_cnts(es_tokenizer, text, length):
    """Check that each sample text is split into the expected number of tokens."""
    doc = es_tokenizer(text)
    token_count = len(doc)
    assert token_count == length