spaCy/spacy/tests/fr/test_lemmatization.py

# coding: utf-8
from __future__ import unicode_literals
import pytest
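
# The `fr_tokenizer` fixture is supplied by the test suite's conftest. A
# minimal sketch of how such a fixture could be wired up (assumed names,
# not necessarily the exact spaCy conftest):
#
#     import pytest
#     from spacy.util import get_lang_class
#
#     @pytest.fixture
#     def fr_tokenizer():
#         return get_lang_class('fr').Defaults.create_tokenizer()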


@pytest.mark.xfail
def test_lemmatizer_verb(fr_tokenizer):
    text = "Je suis allé au mois de janv. aux prudhommes."
    tokens = fr_tokenizer(text)
    # 10 tokens: the final "." is split off, but the abbreviation "janv."
    # stays a single token.
    assert len(tokens) == 10
    assert tokens[2].lemma_ == "aller"


@pytest.mark.xfail
def test_lemmatizer_verb_2(fr_tokenizer):
    text = "Je dois manger ce soir"
    tokens = fr_tokenizer(text)
    # Five whitespace-separated tokens, no punctuation to split.
    assert len(tokens) == 5
    assert tokens[1].lemma_ == "devoir"


@pytest.mark.xfail
def test_lemmatizer_verb_noun(fr_tokenizer):
    # Tricky: "notes" is a NOUN here but could also be read as a VERB form.
    text = "Nous validerons vos notes plus tard"
    tokens = fr_tokenizer(text)
    # Six whitespace-separated tokens, no punctuation to split.
    assert len(tokens) == 6
    assert tokens[1].lemma_ == "valider"
    # Read as a noun, the plural "notes" should lemmatize to the singular.
    assert tokens[3].lemma_ == "note"
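
# The disambiguation above depends on POS: a lookup lemmatizer keyed by POS
# tag could pick the right lemma, roughly like this (illustrative data only,
# not spaCy's actual tables):
#
#     LEMMA_LOOKUP = {
#         "VERB": {"validerons": "valider", "notes": "noter"},
#         "NOUN": {"notes": "note"},
#     }
#
#     def lemmatize(form, pos):
#         return LEMMA_LOOKUP.get(pos, {}).get(form.lower(), form)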


@pytest.mark.xfail
def test_lemmatizer_verb_noun_insensitive(fr_tokenizer):
    # Tricky: the capitalised and lowercase forms should share a lemma.
    text = "Les Costaricaines et les costaricains sont jolies"
    tokens = fr_tokenizer(text)
    # Seven whitespace-separated tokens, no punctuation to split.
    assert len(tokens) == 7
    assert tokens[1].lemma_ == "costaricain"
    assert tokens[4].lemma_ == "costaricain"
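
# These xfail tests document intended lemmatizer behaviour that is not yet
# implemented; they can be run from the repository root with, e.g.:
#     python -m pytest spacy/tests/fr/test_lemmatization.py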