Fix passing of morphological features to lemmatizer

This commit is contained in:
Matthew Honnibal 2019-03-07 13:11:38 +01:00
parent 74db1d9602
commit b69013e2d7
2 changed files with 10 additions and 13 deletions

View File

@@ -47,17 +47,6 @@ class Lemmatizer(object):
Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely.
"""
morphology = {} if morphology is None else morphology
morphology = dict(morphology)
for key, value in list(morphology.items()):
if value is True:
feat, value = key.split('_')
morphology[feat] = value
others = [
key
for key in morphology
if key not in (POS, "Number", "POS", "VerbForm", "Tense")
]
if univ_pos == "noun" and morphology.get("Number") == "sing":
return True
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
@@ -68,7 +57,6 @@ class Lemmatizer(object):
morphology.get("VerbForm") == "fin"
and morphology.get("Tense") == "pres"
and morphology.get("Number") is None
and not others
):
return True
elif univ_pos == "adj" and morphology.get("Degree") == "pos":

View File

@@ -141,7 +141,16 @@ cdef class Morphology:
return self.strings.add(py_string.lower())
cdef list lemma_strings
cdef unicode lemma_string
lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
# Normalize features into a dict keyed by the field, to make life easier
# for the lemmatizer. Handles conversion of keys and values to strings too.
string_feats = {}
for key, value in morphology.items():
if value is True:
name, value = self.strings.as_string(key).split('_', 1)
string_feats[name] = value
else:
string_feats[self.strings.as_string(key)] = self.strings.as_string(value)
lemma_strings = self.lemmatizer(py_string, univ_pos, string_feats)
lemma_string = lemma_strings[0]
lemma = self.strings.add(lemma_string)
return lemma