Prevent lemmatization of base nouns

Update lemmatizer's base-form check, for change in morphology class.
Closes #903.
This commit is contained in:
Matthew Honnibal 2017-03-25 21:51:12 +01:00
parent 850d35dcb3
commit 4f400fa486
2 changed files with 19 additions and 5 deletions

View File

@ -44,18 +44,16 @@ class Lemmatizer(object):
def is_base_form(self, univ_pos, morphology=None):
    '''Check whether we're dealing with an uninflected paradigm, so we can
    avoid lemmatization entirely.

    univ_pos: Coarse part-of-speech label; compared against the strings
        'noun', 'verb' and 'adj'.
    morphology: Optional mapping of morphological feature names to values.
        None is treated as an empty mapping.
    Returns True if the features describe a base form (singular noun,
    infinitive verb, positive-degree adjective, or a 'morph' entry equal to
    one of the base-form morphology constants), otherwise False.
    '''
    morphology = {} if morphology is None else morphology
    # Keys that must not count as "other" inflectional features. The
    # capitalised names are the ones the noun/verb branches below actually
    # read; without them, the very key that satisfies a branch would also
    # populate `others` and make that branch unreachable. The lower-case
    # spellings are kept so mappings using the older key names still work.
    ignored = (POS, 'number', 'pos', 'verbform', 'Number', 'VerbForm')
    others = [key for key in morphology if key not in ignored]
    true_morph_key = morphology.get('morph', 0)
    if univ_pos == 'noun' and morphology.get('Number') == 'sing' and not others:
        return True
    elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf' and not others:
        return True
    elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
        return True
    elif true_morph_key in (VerbForm_inf, VerbForm_none, Number_sing, Degree_pos):
        return True
    else:
        return False

View File

@ -0,0 +1,16 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from ...tokens import Doc
@pytest.mark.parametrize('text,tag,lemma',
                         [("anus", "NN", "anus"),
                          ("princess", "NN", "princess")])
def test_issue912(en_vocab, text, tag, lemma):
    '''Test that base-form nouns (tagged NN) keep their text as the lemma.'''
    # Build a one-token Doc, assign the tag by hand, and check that the
    # lemma is the unmodified word.
    # NOTE(review): both words end in "s", presumably to catch the
    # lemmatizer wrongly stripping a plural-looking suffix -- confirm.
    doc = Doc(en_vocab, words=[text])
    doc[0].tag_ = tag
    assert doc[0].lemma_ == lemma