From 8ff468225563046b7199af7148651357ad737978 Mon Sep 17 00:00:00 2001
From: Gregory Howard
Date: Thu, 27 Apr 2017 11:52:14 +0200
Subject: [PATCH] correcting tokenizer exception. Adding tests for
 lemmatization

---
 spacy/fr/tokenizer_exceptions.py     |  2 +-
 spacy/tests/conftest.py              |  4 ++
 spacy/tests/fr/test_exceptions.py    |  2 +
 spacy/tests/fr/test_lemmatization.py | 60 +++++++++++++---------
 4 files changed, 35 insertions(+), 33 deletions(-)

diff --git a/spacy/fr/tokenizer_exceptions.py b/spacy/fr/tokenizer_exceptions.py
index 72b92cd09..fd05cff95 100644
--- a/spacy/fr/tokenizer_exceptions.py
+++ b/spacy/fr/tokenizer_exceptions.py
@@ -137,7 +137,7 @@ def get_tokenizer_exceptions():
     for pre, pre_lemma in (("qu'", "que"), ("n'", "ne")):
         for orth in [pre,pre.title()]:
             VERBS['{}est-ce'.format(orth)] = [
-                {LEMMA: pre_lemma, ORTH: orth},
+                {LEMMA: pre_lemma, ORTH: orth, TAG: "ADV"},
                 {LEMMA: 'être', ORTH: "est", TAG: "VERB"},
                 {LEMMA: 'ce', ORTH: '-ce'}
             ]
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 37d3180d0..392c5b59e 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -124,6 +124,10 @@ def EN():
 def DE():
     return German()
 
+@pytest.fixture(scope="session")
+def FR():
+    return French()
+
 
 def pytest_addoption(parser):
     parser.addoption("--models", action="store_true",
diff --git a/spacy/tests/fr/test_exceptions.py b/spacy/tests/fr/test_exceptions.py
index 13799a9ba..b3ae78e20 100644
--- a/spacy/tests/fr/test_exceptions.py
+++ b/spacy/tests/fr/test_exceptions.py
@@ -45,6 +45,8 @@ def test_tokenizer_handles_title(fr_tokenizer):
     assert len(tokens) == 6
     assert tokens[0].text == "N'"
     assert tokens[0].lemma_ == "ne"
+    assert tokens[1].text == "est"
+    assert tokens[1].lemma_ == "être"
     assert tokens[2].text == "-ce"
     assert tokens[2].lemma_ == "ce"
 
diff --git a/spacy/tests/fr/test_lemmatization.py b/spacy/tests/fr/test_lemmatization.py
index 7b5779e18..c009e72c0 100644
--- a/spacy/tests/fr/test_lemmatization.py
+++ b/spacy/tests/fr/test_lemmatization.py
@@ -5,37 +5,33 @@ from __future__ import unicode_literals
 
 import pytest
 
-@pytest.mark.xfail
-def test_lemmatizer_verb(fr_tokenizer):
-    text = "Je suis allé au mois de janv. aux prud’hommes."
-    tokens = fr_tokenizer(text)
-    assert len(tokens) == 10
-    assert tokens[2].lemma_ == "aller"
+@pytest.mark.models
+def test_lemmatizer_verb(FR):
+    text = "Qu'est-ce que tu fais?"
+    tokens = FR(text)
+    assert tokens[0].lemma_ == "que"
+    assert tokens[1].lemma_ == "être"
+    assert tokens[5].lemma_ == "faire"
+
+@pytest.mark.models
+@pytest.mark.xfail(reason="sont tagged as AUX")
+def test_lemmatizer_noun_verb_2(FR):
+    text = "Les abaissements de température sont gênants."
+    tokens = FR(text)
+    assert tokens[4].lemma_ == "être"
+
+@pytest.mark.models
+@pytest.mark.xfail(reason="Costaricienne TAG is PROPN instead of NOUN and spaCy doesn't lemmatize PROPN")
+def test_lemmatizer_noun(FR):
+    text = "il y a des Costaricienne."
+    tokens = FR(text)
+    assert tokens[4].lemma_ == "Costaricain"
+
+@pytest.mark.models
+def test_lemmatizer_noun_2(FR):
+    text = "Les abaissements de température sont gênants."
+    tokens = FR(text)
+    assert tokens[1].lemma_ == "abaissement"
+    assert tokens[5].lemma_ == "gênant"
 
 
-@pytest.mark.xfail
-def test_tokenizer_verb_2(fr_tokenizer):
-    text = "Je dois manger ce soir"
-    tokens = fr_tokenizer(text)
-    assert len(tokens) == 11
-    assert tokens[1].lemma_ == "devoir"
-
-
-@pytest.mark.xfail
-def test_tokenizer_verb_noun(fr_tokenizer):
-    # This one is tricky because notes is a NOUN and can be a VERB
-    text = "Nous validerons vos notes plus tard"
-    tokens = fr_tokenizer(text)
-    assert len(tokens) == 11
-    assert tokens[1].lemma_ == "valider"
-    assert tokens[3].lemma_ == "notes"
-
-
-@pytest.mark.xfail
-def test_tokenizer_verb_noun_insensitive(fr_tokenizer):
-    # This one is tricky because notes is a NOUN and can be a VERB
-    text = "Les Costaricaines et les costaricains sont jolies"
-    tokens = fr_tokenizer(text)
-    assert len(tokens) == 11
-    assert tokens[1].lemma_ == "costaricain"
-    assert tokens[4].lemma_ == "costaricain"
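
A quick spot-check of the corrected "Qu'est-ce" exception outside the test suite. This is a minimal sketch, not part of the patch; it assumes the 2017-era spacy.fr.French entry point that the new FR fixture also uses, and it only exercises the tokenizer, since the lemmas on these tokens come straight from the exception entries rather than from a statistical model:

    # Hypothetical spot-check, mirroring test_tokenizer_handles_title above.
    from spacy.fr import French

    nlp = French()
    tokens = nlp.tokenizer("Qu'est-ce que tu fais?")
    # The exception splits "Qu'est-ce" into three tokens with lemmas attached.
    assert [t.text for t in tokens][:3] == ["Qu'", "est", "-ce"]
    assert tokens[0].lemma_ == "que"
    assert tokens[1].lemma_ == "être"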