* Fix Issue #24: Lemmas are empty when the L field is missing for special-cased tokens

2025-07-18 20:22:25 +03:00 · 2015-02-08 18:30:30 -05:00 · 2015-02-08 18:30:30 -05:00 · 0492cee8b4
commit 0492cee8b4
parent 3e8c87af1a
1 changed file with 4 additions and 0 deletions
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -245,6 +245,8 @@ cdef class Tokenizer:
                tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, &string)
                if lemma:
                    tokens[i].lemma = self.vocab.strings[lemma]
+                else:
+                    tokens[i].lemma = 0
                if 'pos' in props:
                    # TODO: Clean up this mess...
                    tokens[i].tag = tag_names.index(props['pos'])
@@ -252,6 +254,8 @@ cdef class Tokenizer:
                    # These are defaults, which can be over-ridden by the
                    # token-specific props.
                    set_morph_from_dict(&tokens[i].morph, tag_map[props['pos']][1])
+                    if tokens[i].lemma == 0:
+                        tokens[i].lemma = tokens[i].lex.orth
                set_morph_from_dict(&tokens[i].morph, props)
            cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
            cached.length = len(substrings)