From fe442cac53eb999ad8caf6b2a19e4724c51defff Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 18 Mar 2017 16:16:10 +0100 Subject: [PATCH] Fix #717: Set correct lemma for contracted verbs --- spacy/en/morph_rules.py | 5 ++++- spacy/en/tokenizer_exceptions.py | 2 +- spacy/tests/regression/test_issue717.py | 6 ++++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/spacy/en/morph_rules.py b/spacy/en/morph_rules.py index 2c088f09b..129ae799f 100644 --- a/spacy/en/morph_rules.py +++ b/spacy/en/morph_rules.py @@ -54,10 +54,13 @@ MORPH_RULES = { "am": {LEMMA: "be", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind"}, "are": {LEMMA: "be", "VerbForm": "Fin", "Person": "Two", "Tense": "Pres", "Mood": "Ind"}, "is": {LEMMA: "be", "VerbForm": "Fin", "Person": "Three", "Tense": "Pres", "Mood": "Ind"}, + "'re": {LEMMA: "be", "VerbForm": "Fin", "Person": "Two", "Tense": "Pres", "Mood": "Ind"}, + "'s": {LEMMA: "be", "VerbForm": "Fin", "Person": "Three", "Tense": "Pres", "Mood": "Ind"}, }, "VBP": { - "are": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"} + "are": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}, + "'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"} }, "VBD": { diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index 8ef862411..07b01c4fb 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -107,7 +107,7 @@ for pron in ["you", "we", "they"]: EXC[orth + "re"] = [ {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "re", LEMMA: "be", NORM: "are"} + {ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"} ] diff --git a/spacy/tests/regression/test_issue717.py b/spacy/tests/regression/test_issue717.py index 55060c05b..1548c06aa 100644 --- a/spacy/tests/regression/test_issue717.py +++ b/spacy/tests/regression/test_issue717.py @@ -4,9 +4,11 @@ from __future__ import unicode_literals import pytest -@pytest.mark.xfail @pytest.mark.models -@pytest.mark.parametrize('text1,text2', [("You're happy", "You are happy")]) +@pytest.mark.parametrize('text1,text2', + [("You're happy", "You are happy"), + ("I'm happy", "I am happy"), + ("he's happy", "he's happy")]) def test_issue717(EN, text1, text2): """Test that contractions are assigned the correct lemma.""" doc1 = EN(text1)