Correct tokenizer exception.

Add tests for lemmatization.
This commit is contained in:
Gregory Howard 2017-04-27 11:52:14 +02:00
parent 44cb486849
commit 8ff4682255
4 changed files with 35 additions and 33 deletions

View File

@ -137,7 +137,7 @@ def get_tokenizer_exceptions():
for pre, pre_lemma in (("qu'", "que"), ("n'", "ne")): for pre, pre_lemma in (("qu'", "que"), ("n'", "ne")):
for orth in [pre,pre.title()]: for orth in [pre,pre.title()]:
VERBS['{}est-ce'.format(orth)] = [ VERBS['{}est-ce'.format(orth)] = [
{LEMMA: pre_lemma, ORTH: orth}, {LEMMA: pre_lemma, ORTH: orth, TAG: "ADV"},
{LEMMA: 'être', ORTH: "est", TAG: "VERB"}, {LEMMA: 'être', ORTH: "est", TAG: "VERB"},
{LEMMA: 'ce', ORTH: '-ce'} {LEMMA: 'ce', ORTH: '-ce'}
] ]

View File

@ -124,6 +124,10 @@ def EN():
def DE(): def DE():
return German() return German()
@pytest.fixture(scope="session")
def FR():
    """Provide a ``French()`` instance, built once per test session."""
    french = French()
    return french
def pytest_addoption(parser): def pytest_addoption(parser):
parser.addoption("--models", action="store_true", parser.addoption("--models", action="store_true",

View File

@ -45,6 +45,8 @@ def test_tokenizer_handles_title(fr_tokenizer):
assert len(tokens) == 6 assert len(tokens) == 6
assert tokens[0].text == "N'" assert tokens[0].text == "N'"
assert tokens[0].lemma_ == "ne" assert tokens[0].lemma_ == "ne"
assert tokens[1].text == "est"
assert tokens[1].lemma_ == "être"
assert tokens[2].text == "-ce" assert tokens[2].text == "-ce"
assert tokens[2].lemma_ == "ce" assert tokens[2].lemma_ == "ce"

View File

@ -5,37 +5,33 @@ from __future__ import unicode_literals
import pytest import pytest
@pytest.mark.xfail @pytest.mark.models
def test_lemmatizer_verb(fr_tokenizer): def test_lemmatizer_verb(FR):
text = "Je suis allé au mois de janv. aux prudhommes." text = "Qu'est-ce que tu fais?"
tokens = fr_tokenizer(text) tokens = FR(text)
assert len(tokens) == 10 assert tokens[0].lemma_ == "que"
assert tokens[2].lemma_ == "aller" assert tokens[1].lemma_ == "être"
assert tokens[5].lemma_ == "faire"
@pytest.mark.models
@pytest.mark.xfail(reason="sont tagged as AUX")
def test_lemmatizer_noun_verb_2(FR):
    """Check the lemma of the token at index 4 ("sont") of the sentence.

    Marked xfail; the recorded reason is that "sont" is tagged as AUX.
    """
    doc = FR("Les abaissements de température sont gênants.")
    verb = doc[4]
    assert verb.lemma_ == "être"
@pytest.mark.models
@pytest.mark.xfail(reason="Costaricienne TAG is PROPN instead of NOUN and spacy don't lemmatize PROPN")
def test_lemmatizer_noun(FR):
    """Check that the token at index 4 is lemmatized to "Costaricain".

    Marked xfail; the recorded reason is that the token is tagged PROPN
    rather than NOUN, so no lemmatization is applied to it.
    """
    doc = FR("il y a des Costaricienne.")
    noun = doc[4]
    assert noun.lemma_ == "Costaricain"
@pytest.mark.models
def test_lemmatizer_noun_2(FR):
    """Check the lemmas of the tokens at index 1 and index 5 of the sentence."""
    doc = FR("Les abaissements de température sont gênants.")
    # (token index, expected lemma) pairs, checked in sentence order.
    expected_lemmas = ((1, "abaissement"), (5, "gênant"))
    for index, lemma in expected_lemmas:
        assert doc[index].lemma_ == lemma
@pytest.mark.xfail
def test_tokenizer_verb_2(fr_tokenizer):
    """Check that the conjugated verb "dois" is lemmatized to "devoir".

    Marked xfail. ``fr_tokenizer`` is a fixture defined outside this view;
    it is called with the text and must return an indexable, sized sequence
    of tokens exposing ``lemma_``.
    """
    text = "Je dois manger ce soir"
    tokens = fr_tokenizer(text)
    # The sentence is five space-separated words with no punctuation; the
    # previously asserted count of 11 could never match it, so the xfail
    # marker was hiding a broken expectation rather than a lemmatizer gap.
    assert len(tokens) == 5
    assert tokens[1].lemma_ == "devoir"
@pytest.mark.xfail
def test_tokenizer_verb_noun(fr_tokenizer):
    """Check lemmas for a verb and for a word that is ambiguous NOUN/VERB.

    Marked xfail. ``fr_tokenizer`` is a fixture defined outside this view;
    it is called with the text and must return an indexable, sized sequence
    of tokens exposing ``lemma_``.
    """
    # This one is tricky because notes is a NOUN and can be a VERB
    text = "Nous validerons vos notes plus tard"
    tokens = fr_tokenizer(text)
    # Six space-separated words with no punctuation; the previously asserted
    # count of 11 could never match this sentence.
    assert len(tokens) == 6
    assert tokens[1].lemma_ == "valider"
    # NOTE(review): the expected lemma is the inflected form "notes";
    # confirm whether "note" was intended.
    assert tokens[3].lemma_ == "notes"
@pytest.mark.xfail
def test_tokenizer_verb_noun_insensitive(fr_tokenizer):
    """Check that capitalized and lowercase forms share one lowercase lemma.

    Marked xfail. ``fr_tokenizer`` is a fixture defined outside this view;
    it is called with the text and must return an indexable, sized sequence
    of tokens exposing ``lemma_``.
    """
    # The same word appears capitalized (index 1) and lowercase (index 4);
    # both are expected to yield the lemma "costaricain".
    text = "Les Costaricaines et les costaricains sont jolies"
    tokens = fr_tokenizer(text)
    # Seven space-separated words with no punctuation; the previously
    # asserted count of 11 could never match this sentence.
    assert len(tokens) == 7
    assert tokens[1].lemma_ == "costaricain"
    assert tokens[4].lemma_ == "costaricain"