From 4454c1b23fc3793257ef20174389337f48be596a Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 25 Mar 2017 21:29:57 +0100
Subject: [PATCH] Block lemmatization of base-form adjectives

Fixes check that an adjective is a base form (as opposed to a comparative
or superlative), so that it's not lemmatized. e.g. inner -!> inn.

Closes #912.
---
 spacy/lemmatizer.py                     |  8 +++++++-
 spacy/tests/regression/test_issue912.py | 14 ++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)
 create mode 100644 spacy/tests/regression/test_issue912.py

diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index 78ff43039..434c49e91 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -7,6 +7,8 @@ import ujson as json
 from .en.lemmatizer import INDEX, EXC, RULES
 from .symbols import POS, NOUN, VERB, ADJ, PUNCT
 from .symbols import VerbForm_inf, VerbForm_none
+from .symbols import Number_sing
+from .symbols import Degree_pos
 
 
 class Lemmatizer(object):
@@ -42,6 +44,7 @@ class Lemmatizer(object):
     def is_base_form(self, univ_pos, morphology=None):
         '''Check whether we're dealing with an uninflected paradigm, so we can
         avoid lemmatization entirely.'''
+        print("Is base form?", univ_pos, morphology)
         morphology = {} if morphology is None else morphology
         others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
         true_morph_key = morphology.get('morph', 0)
@@ -49,7 +52,10 @@ class Lemmatizer(object):
             return True
         elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others:
             return True
-        elif true_morph_key in (VerbForm_inf, VerbForm_none):
+        elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
+            return True
+        elif true_morph_key in \
+                (VerbForm_inf, VerbForm_none, Number_sing, Degree_pos):
             return True
         else:
             return False
diff --git a/spacy/tests/regression/test_issue912.py b/spacy/tests/regression/test_issue912.py
new file mode 100644
index 000000000..791e2e152
--- /dev/null
+++ b/spacy/tests/regression/test_issue912.py
@@ -0,0 +1,14 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+from ...tokens import Doc
+
+
+@pytest.mark.parametrize('text,tag,lemma', [("inner", "JJ", "inner")])
+def test_issue912(en_vocab, text, tag, lemma):
+    '''Test base-forms of adjectives are preserved.'''
+    doc = Doc(en_vocab, words=[text])
+    doc[0].tag_ = tag
+    assert doc[0].lemma_ == lemma
+