From 05596159bfb8e50e4ecbf0f5841aa709a9a71f5d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 22 Sep 2017 15:33:27 -0500 Subject: [PATCH] Fix serialization when pre-trained vectors are used --- spacy/pipeline.pyx | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index a7ff90174..f5b2db55a 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -145,8 +145,8 @@ class BaseThincComponent(object): deserialize = OrderedDict(( ('cfg', lambda b: self.cfg.update(ujson.loads(b))), - ('model', load_model), ('vocab', lambda b: self.vocab.from_bytes(b)) + ('model', load_model), )) util.from_bytes(bytes_data, deserialize, exclude) return self @@ -154,8 +154,8 @@ class BaseThincComponent(object): def to_disk(self, path, **exclude): serialize = OrderedDict(( ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))), + ('vocab', lambda p: self.vocab.to_disk(p)), ('model', lambda p: p.open('wb').write(self.model.to_bytes())), - ('vocab', lambda p: self.vocab.to_disk(p)) )) util.to_disk(path, serialize, exclude) @@ -168,8 +168,8 @@ class BaseThincComponent(object): deserialize = OrderedDict(( ('cfg', lambda p: self.cfg.update(_load_cfg(p))), - ('model', load_model), ('vocab', lambda p: self.vocab.from_disk(p)), + ('model', load_model), )) util.from_disk(path, deserialize, exclude) return self @@ -289,6 +289,7 @@ class TokenVectorEncoder(BaseThincComponent): pipeline (list): The pipeline the model is part of. 
""" if self.model is True: + self.cfg['pretrained_dims'] = self.vocab.vectors_length self.model = self.Model(**self.cfg) link_vectors_to_models(self.vocab) @@ -398,6 +399,7 @@ class NeuralTagger(BaseThincComponent): vocab.morphology.lemmatizer, exc=vocab.morphology.exc) if self.model is True: + self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1] self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) link_vectors_to_models(self.vocab) @@ -486,6 +488,7 @@ class NeuralLabeller(NeuralTagger): self.model = model self.cfg = dict(cfg) self.cfg.setdefault('cnn_maxout_pieces', 2) + self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1]) @property def labels(self): @@ -508,8 +511,8 @@ class NeuralLabeller(NeuralTagger): self.labels[dep] = len(self.labels) token_vector_width = pipeline[0].model.nO if self.model is True: - self.model = self.Model(len(self.labels), token_vector_width=token_vector_width, - pretrained_dims=self.vocab.vectors_length) + self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1] + self.model = self.Model(len(self.labels), **self.cfg) link_vectors_to_models(self.vocab) @classmethod