From 4f400fa486ebf4fa7ef5aa90607cca68acb301a8 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 25 Mar 2017 21:51:12 +0100
Subject: [PATCH] Prevent lemmatization of base nouns

Update lemmatizer's base-form check, for change in morphology class.
Also treat the capitalised 'Number' and 'VerbForm' keys as expected keys
when collecting the remaining morphological features, so the noun and verb
checks are not vetoed by the very keys they test. Closes #903.
---
 spacy/lemmatizer.py                     | 11 +++++------
 spacy/tests/regression/test_issue903.py | 16 ++++++++++++++++
 2 files changed, 21 insertions(+), 6 deletions(-)
 create mode 100644 spacy/tests/regression/test_issue903.py

diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index 434c49e91..d10b40d7b 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -44,18 +44,17 @@ class Lemmatizer(object):
     def is_base_form(self, univ_pos, morphology=None):
         '''Check whether we're dealing with an uninflected paradigm, so we can
         avoid lemmatization entirely.'''
-        print("Is base form?", univ_pos, morphology)
         morphology = {} if morphology is None else morphology
-        others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
+        others = [key for key in morphology
+                  if key not in (POS, 'number', 'pos', 'verbform', 'Number', 'VerbForm')]
         true_morph_key = morphology.get('morph', 0)
-        if univ_pos == 'noun' and morphology.get('number') == 'sing' and not others:
+        if univ_pos == 'noun' and morphology.get('Number') == 'sing' and not others:
             return True
-        elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others:
+        elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf' and not others:
             return True
         elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
             return True
-        elif true_morph_key in \
-                (VerbForm_inf, VerbForm_none, Number_sing, Degree_pos):
+        elif true_morph_key in (VerbForm_inf, VerbForm_none, Number_sing, Degree_pos):
             return True
         else:
             return False
diff --git a/spacy/tests/regression/test_issue903.py b/spacy/tests/regression/test_issue903.py
new file mode 100644
index 000000000..36acd2dfc
--- /dev/null
+++ b/spacy/tests/regression/test_issue903.py
@@ -0,0 +1,16 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+from ...tokens import Doc
+
+
+@pytest.mark.parametrize('text,tag,lemma',
+    [("anus", "NN", "anus"),
+     ("princess", "NN", "princess")])
+def test_issue912(en_vocab, text, tag, lemma):
+    '''Test that base-form (singular, NN-tagged) nouns keep their surface form as lemma (issue #903).'''
+    doc = Doc(en_vocab, words=[text])
+    doc[0].tag_ = tag
+    assert doc[0].lemma_ == lemma
+