From 5e040855a5c5e7725fd875e4b85e38d53e113796 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 17:56:50 +1100 Subject: [PATCH] * Ensure morphological features and lemmas are loaded in from_array, re Issue #152 --- spacy/tests/serialize/test_io.py | 12 ++++++++++++ spacy/tokens/doc.pyx | 4 +++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/spacy/tests/serialize/test_io.py b/spacy/tests/serialize/test_io.py index a64d0cabc..4157ee309 100644 --- a/spacy/tests/serialize/test_io.py +++ b/spacy/tests/serialize/test_io.py @@ -38,3 +38,15 @@ def test_left_right(EN): for child in word.rights: assert child.head.i == word.i + +@pytest.mark.models +def test_lemmas(EN): + orig = EN(u'The geese are flying') + result = Doc(orig.vocab).from_bytes(orig.to_bytes()) + the, geese, are, flying = result + assert the.lemma_ == 'the' + assert geese.lemma_ == 'goose' + assert are.lemma_ == 'be' + assert flying.lemma_ == 'fly' + + diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 01ccb4fd9..2ad1a1d4a 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -398,7 +398,7 @@ cdef class Doc: self.is_parsed = True elif attr_id == TAG: for i in range(length): - tokens[i].tag = values[i] + self.vocab.morphology.assign_tag(&tokens[i], values[i]) if not self.is_tagged and tokens[i].tag != 0: self.is_tagged = True elif attr_id == POS: @@ -413,6 +413,8 @@ cdef class Doc: elif attr_id == ENT_TYPE: for i in range(length): tokens[i].ent_type = values[i] + else: + raise ValueError("Unknown attribute ID: %d" % attr_id) set_children_from_heads(self.data, self.length) return self