Fix passing of morphological features to lemmatizer

2025-09-16 00:52:38 +03:00 · 2019-03-07 13:11:38 +01:00 · 2019-03-07 13:11:38 +01:00 · b69013e2d7
commit b69013e2d7
parent 74db1d9602
2 changed files with 10 additions and 13 deletions
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -47,17 +47,6 @@ class Lemmatizer(object):
        Check whether we're dealing with an uninflected paradigm, so we can
        avoid lemmatization entirely.
        """
        morphology = {} if morphology is None else morphology
        morphology = dict(morphology)
        for key, value in list(morphology.items()):
            if value is True:
                feat, value = key.split('_')
                morphology[feat] = value
        others = [
            key
            for key in morphology
            if key not in (POS, "Number", "POS", "VerbForm", "Tense")
        ]
        if univ_pos == "noun" and morphology.get("Number") == "sing":
            return True
        elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
@@ -68,7 +57,6 @@ class Lemmatizer(object):
            morphology.get("VerbForm") == "fin"
            and morphology.get("Tense") == "pres"
            and morphology.get("Number") is None
            and not others
        ):
            return True
        elif univ_pos == "adj" and morphology.get("Degree") == "pos":
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -141,7 +141,16 @@ cdef class Morphology:
            return self.strings.add(py_string.lower())
        cdef list lemma_strings
        cdef unicode lemma_string
-        lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
+        # Normalize features into a dict keyed by the field, to make life easier
        # for the lemmatizer. Also converts keys and values to strings (as_string).
        string_feats = {}
        for key, value in morphology.items():
            if value is True:
                name, value = self.strings.as_string(key).split('_', 1)
                string_feats[name] = value
            else:
                string_feats[self.strings.as_string(key)] = self.strings.as_string(value)
        lemma_strings = self.lemmatizer(py_string, univ_pos, string_feats)
        lemma_string = lemma_strings[0]
        lemma = self.strings.add(lemma_string)
        return lemma