From 8ff468225563046b7199af7148651357ad737978 Mon Sep 17 00:00:00 2001
From: Gregory Howard
Date: Thu, 27 Apr 2017 11:52:14 +0200
Subject: [PATCH] correcting tokenizer exception. Adding tests for
 lemmatization

---
 spacy/fr/tokenizer_exceptions.py     |  2 +-
 spacy/tests/conftest.py              |  4 ++
 spacy/tests/fr/test_exceptions.py    |  2 +
 spacy/tests/fr/test_lemmatization.py | 60 +++++++++++++---------
 4 files changed, 35 insertions(+), 33 deletions(-)

diff --git a/spacy/fr/tokenizer_exceptions.py b/spacy/fr/tokenizer_exceptions.py
index 72b92cd09..fd05cff95 100644
--- a/spacy/fr/tokenizer_exceptions.py
+++ b/spacy/fr/tokenizer_exceptions.py
@@ -137,7 +137,7 @@ def get_tokenizer_exceptions():
     for pre, pre_lemma in (("qu'", "que"), ("n'", "ne")):
         for orth in [pre,pre.title()]:
             VERBS['{}est-ce'.format(orth)] = [
-                {LEMMA: pre_lemma, ORTH: orth},
+                {LEMMA: pre_lemma, ORTH: orth, TAG: "ADV"},
                 {LEMMA: 'être', ORTH: "est", TAG: "VERB"},
                 {LEMMA: 'ce', ORTH: '-ce'}
             ]
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 37d3180d0..392c5b59e 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -124,6 +124,10 @@ def EN():
 def DE():
     return German()
 
+@pytest.fixture(scope="session")
+def FR():
+    return French()
+
 
 def pytest_addoption(parser):
     parser.addoption("--models", action="store_true",
diff --git a/spacy/tests/fr/test_exceptions.py b/spacy/tests/fr/test_exceptions.py
index 13799a9ba..b3ae78e20 100644
--- a/spacy/tests/fr/test_exceptions.py
+++ b/spacy/tests/fr/test_exceptions.py
@@ -45,6 +45,8 @@ def test_tokenizer_handles_title(fr_tokenizer):
     assert len(tokens) == 6
     assert tokens[0].text == "N'"
     assert tokens[0].lemma_ == "ne"
+    assert tokens[1].text == "est"
+    assert tokens[1].lemma_ == "être"
     assert tokens[2].text == "-ce"
     assert tokens[2].lemma_ == "ce"
 
diff --git a/spacy/tests/fr/test_lemmatization.py b/spacy/tests/fr/test_lemmatization.py
index 7b5779e18..c009e72c0 100644
--- a/spacy/tests/fr/test_lemmatization.py
+++ b/spacy/tests/fr/test_lemmatization.py
@@ -5,37 +5,33 @@ from __future__ import unicode_literals
 
 import pytest
 
-@pytest.mark.xfail
-def test_lemmatizer_verb(fr_tokenizer):
-    text = "Je suis allé au mois de janv. aux prud’hommes."
-    tokens = fr_tokenizer(text)
-    assert len(tokens) == 10
-    assert tokens[2].lemma_ == "aller"
+@pytest.mark.models
+def test_lemmatizer_verb(FR):
+    text = "Qu'est-ce que tu fais?"
+    tokens = FR(text)
+    assert tokens[0].lemma_ == "que"
+    assert tokens[1].lemma_ == "être"
+    assert tokens[5].lemma_ == "faire"
+
+@pytest.mark.models
+@pytest.mark.xfail(reason="sont tagged as AUX")
+def test_lemmatizer_noun_verb_2(FR):
+    text = "Les abaissements de température sont gênants."
+    tokens = FR(text)
+    assert tokens[4].lemma_ == "être"
+
+@pytest.mark.models
+@pytest.mark.xfail(reason="Costaricienne TAG is PROPN instead of NOUN and spaCy doesn't lemmatize PROPN")
+def test_lemmatizer_noun(FR):
+    text = "il y a des Costaricienne."
+    tokens = FR(text)
+    assert tokens[4].lemma_ == "Costaricain"
+
+@pytest.mark.models
+def test_lemmatizer_noun_2(FR):
+    text = "Les abaissements de température sont gênants."
+    tokens = FR(text)
+    assert tokens[1].lemma_ == "abaissement"
+    assert tokens[5].lemma_ == "gênant"
 
 
-@pytest.mark.xfail
-def test_tokenizer_verb_2(fr_tokenizer):
-    text = "Je dois manger ce soir"
-    tokens = fr_tokenizer(text)
-    assert len(tokens) == 11
-    assert tokens[1].lemma_ == "devoir"
-
-
-@pytest.mark.xfail
-def test_tokenizer_verb_noun(fr_tokenizer):
-    # This one is tricky because notes is a NOUN and can be a VERB
-    text = "Nous validerons vos notes plus tard"
-    tokens = fr_tokenizer(text)
-    assert len(tokens) == 11
-    assert tokens[1].lemma_ == "valider"
-    assert tokens[3].lemma_ == "notes"
-
-
-@pytest.mark.xfail
-def test_tokenizer_verb_noun_insensitive(fr_tokenizer):
-    # This one is tricky because notes is a NOUN and can be a VERB
-    text = "Les Costaricaines et les costaricains sont jolies"
-    tokens = fr_tokenizer(text)
-    assert len(tokens) == 11
-    assert tokens[1].lemma_ == "costaricain"
-    assert tokens[4].lemma_ == "costaricain"
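
A quick spot-check of the corrected "Qu'est-ce" exception outside the test suite. This is a minimal sketch, not part of the patch; it assumes the 2017-era spacy.fr.French entry point that the new FR fixture also uses, and it only exercises the tokenizer, since the lemmas on these tokens come straight from the exception entries rather than from a statistical model:

    # Hypothetical spot-check, mirroring test_tokenizer_handles_title above.
    from spacy.fr import French

    nlp = French()
    tokens = nlp.tokenizer("Qu'est-ce que tu fais?")
    # The exception splits "Qu'est-ce" into three tokens with lemmas attached.
    assert [t.text for t in tokens][:3] == ["Qu'", "est", "-ce"]
    assert tokens[0].lemma_ == "que"
    assert tokens[1].lemma_ == "être"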