diff --git a/spacy/tests/serialize/test_io.py b/spacy/tests/serialize/test_io.py index a64d0cabc..4157ee309 100644 --- a/spacy/tests/serialize/test_io.py +++ b/spacy/tests/serialize/test_io.py @@ -38,3 +38,15 @@ def test_left_right(EN): for child in word.rights: assert child.head.i == word.i + +@pytest.mark.models +def test_lemmas(EN): + orig = EN(u'The geese are flying') + result = Doc(orig.vocab).from_bytes(orig.to_bytes()) + the, geese, are, flying = result + assert the.lemma_ == 'the' + assert geese.lemma_ == 'goose' + assert are.lemma_ == 'be' + assert flying.lemma_ == 'fly' + + diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 01ccb4fd9..7a8822b5f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -398,7 +398,7 @@ cdef class Doc: self.is_parsed = True elif attr_id == TAG: for i in range(length): - tokens[i].tag = values[i] + self.vocab.morphology.assign_tag(&tokens[i], values[i]) if not self.is_tagged and tokens[i].tag != 0: self.is_tagged = True elif attr_id == POS: @@ -413,6 +413,8 @@ cdef class Doc: elif attr_id == ENT_TYPE: for i in range(length): tokens[i].ent_type = values[i] + else: + raise ValueError("Unknown attribute ID: %d" % attr_id) set_children_from_heads(self.data, self.length) return self @@ -469,8 +471,7 @@ cdef class Doc: # Update fields token.lex = lex token.spacy = self.data[end-1].spacy - # What to do about morphology?? - # TODO: token.morph = ??? + self.vocab.morphology.assign_tag(token, self.vocab.strings[tag]) token.tag = self.vocab.strings[tag] token.lemma = self.vocab.strings[lemma] if ent_type == 'O':