Fix non-clobbering lemmatization

2026-03-08 05:41:29 +03:00 · 2017-11-06 12:36:05 +01:00 · 2017-11-06 12:36:05 +01:00 · 31babe3c3f
commit 31babe3c3f
parent 63c6ae4191
2 changed files with 5 additions and 2 deletions
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -109,8 +109,7 @@ cdef class Morphology:
            analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
                                            self.tag_map.get(tag_str, {}))
            self._cache.set(tag_id, token.lex.orth, analysis)
-        if token.lemma == 0:
-            token.lemma = analysis.lemma
+        token.lemma = analysis.lemma
        token.pos = analysis.tag.pos
        token.tag = analysis.tag.name
        token.morph = analysis.tag.morph
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -412,7 +412,11 @@ class Tagger(Pipe):
            for j, tag_id in enumerate(doc_tag_ids):
                # Don't clobber preset POS tags
                if doc.c[j].tag == 0 and doc.c[j].pos == 0:
+                    # Don't clobber preset lemmas
+                    lemma = doc.c[j].lemma
                    vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
+                    if lemma != 0:
+                        doc.c[j].lemma = lemma
                idx += 1
            if tensors is not None:
                if isinstance(doc.tensor, numpy.ndarray) \