Fix passing of morphological features to lemmatizer

This commit is contained in:
Matthew Honnibal 2019-03-07 13:11:38 +01:00
parent 74db1d9602
commit b69013e2d7
2 changed files with 10 additions and 13 deletions

View File

@ -47,17 +47,6 @@ class Lemmatizer(object):
Check whether we're dealing with an uninflected paradigm, so we can Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely. avoid lemmatization entirely.
""" """
morphology = {} if morphology is None else morphology
morphology = dict(morphology)
for key, value in list(morphology.items()):
if value is True:
feat, value = key.split('_')
morphology[feat] = value
others = [
key
for key in morphology
if key not in (POS, "Number", "POS", "VerbForm", "Tense")
]
if univ_pos == "noun" and morphology.get("Number") == "sing": if univ_pos == "noun" and morphology.get("Number") == "sing":
return True return True
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf": elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
@ -68,7 +57,6 @@ class Lemmatizer(object):
morphology.get("VerbForm") == "fin" morphology.get("VerbForm") == "fin"
and morphology.get("Tense") == "pres" and morphology.get("Tense") == "pres"
and morphology.get("Number") is None and morphology.get("Number") is None
and not others
): ):
return True return True
elif univ_pos == "adj" and morphology.get("Degree") == "pos": elif univ_pos == "adj" and morphology.get("Degree") == "pos":

View File

@ -141,7 +141,16 @@ cdef class Morphology:
return self.strings.add(py_string.lower()) return self.strings.add(py_string.lower())
cdef list lemma_strings cdef list lemma_strings
cdef unicode lemma_string cdef unicode lemma_string
lemma_strings = self.lemmatizer(py_string, univ_pos, morphology) # Normalize features into a dict keyed by the field, to make life easier
# for the lemmatizer. Handles string-to-int conversion too.
string_feats = {}
for key, value in morphology.items():
if value is True:
name, value = self.strings.as_string(key).split('_', 1)
string_feats[name] = value
else:
string_feats[self.strings.as_string(key)] = self.strings.as_string(value)
lemma_strings = self.lemmatizer(py_string, univ_pos, string_feats)
lemma_string = lemma_strings[0] lemma_string = lemma_strings[0]
lemma = self.strings.add(lemma_string) lemma = self.strings.add(lemma_string)
return lemma return lemma