Fix lemmatizer

This commit is contained in:
Matthew Honnibal 2016-09-27 17:47:05 +02:00
parent 07776d8096
commit a2f3510d6d

View File

@ -47,18 +47,20 @@ class Lemmatizer(object):
elif univ_pos == PUNCT: elif univ_pos == PUNCT:
univ_pos = 'punct' univ_pos = 'punct'
# See Issue #435 for example of where this logic is required. # See Issue #435 for example of where this logic is required.
if self.is_base_form(pos, **morphology): if self.is_base_form(univ_pos, **morphology):
return set([string.lower()]) return set([string.lower()])
lemmas = lemmatize(string, self.index.get(pos, {}), self.exc.get(pos, {}), self.rules.get(pos, [])) lemmas = lemmatize(string, self.index.get(univ_pos, {}),
self.exc.get(univ_pos, {}),
self.rules.get(univ_pos, []))
return lemmas return lemmas
def is_base_form(self, univ_pos, **morphology):
    '''Check whether we're dealing with an uninflected paradigm, so we can
    avoid lemmatization entirely.

    univ_pos is the lower-case coarse part-of-speech string that is compared
    against 'noun' and 'verb' below. The keyword arguments are the
    morphological features of the token, keyed by lower-case feature name.

    Returns True only for a singular noun or an infinitive verb that carries
    no further features; returns False in every other case.
    '''
    # 'number', 'pos' and 'verbform' are the only features allowed alongside
    # a base form; any other key means we cannot skip lemmatization.
    others = [key for key in morphology
              if key not in ('number', 'pos', 'verbform')]
    if others:
        return False
    if univ_pos == 'noun' and morphology.get('number') == 'sing':
        return True
    if univ_pos == 'verb' and morphology.get('verbform') == 'inf':
        return True
    return False