Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-27 01:34:30 +03:00)
Correct tokenizer exception; add tests for lemmatization.

This commit is contained in:
parent 44cb486849
commit 8ff4682255
@@ -137,7 +137,7 @@ def get_tokenizer_exceptions():
     for pre, pre_lemma in (("qu'", "que"), ("n'", "ne")):
         for orth in [pre,pre.title()]:
             VERBS['{}est-ce'.format(orth)] = [
-                {LEMMA: pre_lemma, ORTH: orth},
+                {LEMMA: pre_lemma, ORTH: orth, TAG: "ADV"},
                 {LEMMA: 'être', ORTH: "est", TAG: "VERB"},
                 {LEMMA: 'ce', ORTH: '-ce'}
             ]
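For orth == "qu'", the loop above now registers the entry below, so the first sub-token carries a tag alongside its lemma. This is an illustrative expansion, not literal code from the commit; LEMMA, ORTH and TAG are the attribute constants this module already uses:

    # Expansion for orth == "qu'"; "Qu'", "n'" and "N'" get analogous
    # entries via pre.title().
    VERBS["qu'est-ce"] = [
        {LEMMA: "que", ORTH: "qu'", TAG: "ADV"},   # TAG: "ADV" is the new part
        {LEMMA: "être", ORTH: "est", TAG: "VERB"},
        {LEMMA: "ce", ORTH: "-ce"}
    ]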
@@ -124,6 +124,10 @@ def EN():
 def DE():
     return German()
 
+@pytest.fixture(scope="session")
+def FR():
+    return French()
+
 
 def pytest_addoption(parser):
     parser.addoption("--models", action="store_true",
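The new FR fixture follows the pattern of the EN and DE fixtures: pytest builds one French() pipeline per test session, so the loading cost is paid once across all French model tests. A minimal sketch of a test consuming it (hypothetical test name, assuming the session-scoped fixture defined above):

    @pytest.mark.models
    def test_fr_lemmas(FR):
        # pytest injects the shared French() instance as the FR argument.
        doc = FR("Qu'est-ce que tu fais?")
        assert doc[0].lemma_ == "que"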
@@ -45,6 +45,8 @@ def test_tokenizer_handles_title(fr_tokenizer):
     assert len(tokens) == 6
     assert tokens[0].text == "N'"
     assert tokens[0].lemma_ == "ne"
+    assert tokens[1].text == "est"
+    assert tokens[1].lemma_ == "être"
     assert tokens[2].text == "-ce"
     assert tokens[2].lemma_ == "ce"
 
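These lemma assertions hold without loading any statistical model, because the exception entries from the first hunk attach LEMMA directly to each sub-token at tokenization time. A quick check along those lines (a sketch assuming the spaCy 1.x-era spacy.fr.French class used in conftest.py, whose .tokenizer is callable on a string):

    from spacy.fr import French

    # The bare tokenizer already yields the lemmas set by the exception table.
    tokens = French().tokenizer("N'est-ce pas ?")
    print([(t.text, t.lemma_) for t in tokens[:3]])
    # expected, per the test above: [("N'", 'ne'), ('est', 'être'), ('-ce', 'ce')]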
@@ -5,37 +5,33 @@ from __future__ import unicode_literals
 import pytest
 
 
-@pytest.mark.xfail
-def test_lemmatizer_verb(fr_tokenizer):
-    text = "Je suis allé au mois de janv. aux prud’hommes."
-    tokens = fr_tokenizer(text)
-    assert len(tokens) == 10
-    assert tokens[2].lemma_ == "aller"
+@pytest.mark.models
+def test_lemmatizer_verb(FR):
+    text = "Qu'est-ce que tu fais?"
+    tokens = FR(text)
+    assert tokens[0].lemma_ == "que"
+    assert tokens[1].lemma_ == "être"
+    assert tokens[5].lemma_ == "faire"
 
 
-@pytest.mark.xfail
-def test_tokenizer_verb_2(fr_tokenizer):
-    text = "Je dois manger ce soir"
-    tokens = fr_tokenizer(text)
-    assert len(tokens) == 11
-    assert tokens[1].lemma_ == "devoir"
+@pytest.mark.models
+@pytest.mark.xfail(reason="sont tagged as AUX")
+def test_lemmatizer_noun_verb_2(FR):
+    text = "Les abaissements de température sont gênants."
+    tokens = FR(text)
+    assert tokens[4].lemma_ == "être"
 
 
-@pytest.mark.xfail
-def test_tokenizer_verb_noun(fr_tokenizer):
-    # This one is tricky because notes is a NOUN and can be a VERB
-    text = "Nous validerons vos notes plus tard"
-    tokens = fr_tokenizer(text)
-    assert len(tokens) == 11
-    assert tokens[1].lemma_ == "valider"
-    assert tokens[3].lemma_ == "notes"
+@pytest.mark.models
+@pytest.mark.xfail(reason="Costaricienne TAG is PROPN instead of NOUN and spacy don't lemmatize PROPN")
+def test_lemmatizer_noun(FR):
+    text = "il y a des Costaricienne."
+    tokens = FR(text)
+    assert tokens[4].lemma_ == "Costaricain"
 
 
-@pytest.mark.xfail
-def test_tokenizer_verb_noun_insensitive(fr_tokenizer):
-    # This one is tricky because notes is a NOUN and can be a VERB
-    text = "Les Costaricaines et les costaricains sont jolies"
-    tokens = fr_tokenizer(text)
-    assert len(tokens) == 11
-    assert tokens[1].lemma_ == "costaricain"
-    assert tokens[4].lemma_ == "costaricain"
+@pytest.mark.models
+def test_lemmatizer_noun_2(FR):
+    text = "Les abaissements de température sont gênants."
+    tokens = FR(text)
+    assert tokens[1].lemma_ == "abaissement"
+    assert tokens[5].lemma_ == "gênant"
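The lemmatizer tests above are gated behind @pytest.mark.models, so they only run when pytest is invoked with the --models option registered in conftest.py. A common way to wire the marker to the flag is a pytest_runtest_setup hook in conftest.py (a sketch of the usual pytest pattern, not necessarily this repository's exact hook):

    import pytest

    def pytest_runtest_setup(item):
        # Skip any test marked 'models' unless pytest was invoked with --models.
        if "models" in item.keywords and not item.config.getoption("--models"):
            pytest.skip("need --models option to run")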