From 6b019b054037b1ec85b71c47b88dee0df6b77378 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 29 May 2017 10:14:20 +0200 Subject: [PATCH] Update to/from bytes methods --- spacy/pipeline.pyx | 36 ++++++++++++++++++++++++++---------- spacy/syntax/nn_parser.pyx | 37 +++++++++++++++++++++++++++++++------ spacy/vocab.pyx | 22 ++++++++++------------ 3 files changed, 67 insertions(+), 28 deletions(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index ed4d5c1e6..236916c8b 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -9,7 +9,6 @@ import numpy cimport numpy as np import cytoolz import util -import ujson from thinc.api import add, layerize, chain, clone, concatenate, with_flatten from thinc.neural import Model, Maxout, Softmax, Affine @@ -160,18 +159,18 @@ class TokenVectorEncoder(object): yield def to_bytes(self, **exclude): - data = { - 'model': self.model, - 'vocab': self.vocab + serialize = { + 'model': lambda: util.model_to_bytes(self.model), + 'vocab': lambda: self.vocab.to_bytes() } - return util.to_bytes(data, exclude) + return util.to_bytes(serialize, exclude) def from_bytes(self, bytes_data, **exclude): - data = ujson.loads(bytes_data) - if 'model' not in exclude: - util.model_from_bytes(self.model, data['model']) - if 'vocab' not in exclude: - self.vocab.from_bytes(data['vocab']) + deserialize = { + 'model': lambda b: util.model_from_bytes(self.model, b), + 'vocab': lambda b: self.vocab.from_bytes(b) + } + util.from_bytes(bytes_data, deserialize, exclude) return self def to_disk(self, path, **exclude): @@ -290,6 +289,23 @@ class NeuralTagger(object): with self.model.use_params(params): yield + def to_bytes(self, **exclude): + serialize = { + 'model': lambda: util.model_to_bytes(self.model), + 'vocab': lambda: self.vocab.to_bytes() + } + return util.to_bytes(serialize, exclude) + + def from_bytes(self, bytes_data, **exclude): + deserialize = { + 'model': lambda b: util.model_from_bytes(self.model, b), + 'vocab': lambda b: self.vocab.from_bytes(b) + } + 
util.from_bytes(bytes_data, deserialize, exclude) + return self + + + class NeuralLabeller(NeuralTagger): name = 'nn_labeller' def __init__(self, vocab, model=True): diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 4e4dbe39e..99410a2c8 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -260,7 +260,14 @@ cdef class Parser: # Used to set input dimensions in network. lower.begin_training(lower.ops.allocate((500, token_vector_width))) upper.begin_training(upper.ops.allocate((500, hidden_width))) - return lower, upper + cfg = { + 'nr_class': nr_class, + 'depth': depth, + 'token_vector_width': token_vector_width, + 'hidden_width': hidden_width, + 'maxout_pieces': parser_maxout_pieces + } + return (lower, upper), cfg def __init__(self, Vocab vocab, moves=True, model=True, **cfg): """ @@ -611,7 +618,8 @@ cdef class Parser: for label in labels: self.moves.add_action(action, label) if self.model is True: - self.model = self.Model(self.moves.n_moves, **cfg) + self.model, cfg = self.Model(self.moves.n_moves, **cfg) + self.cfg.update(cfg) def preprocess_gold(self, docs_golds): for doc, gold in docs_golds: @@ -633,11 +641,28 @@ cdef class Parser: with (path / 'model.bin').open('wb') as file_: self.model = dill.load(file_) - def to_bytes(self): - dill.dumps(self.model) + def to_bytes(self, **exclude): + serialize = { + 'model': lambda: util.model_to_bytes(self.model), + 'vocab': lambda: self.vocab.to_bytes(), + 'moves': lambda: self.moves.to_bytes(), + 'cfg': lambda: ujson.dumps(self.cfg) + } + return util.to_bytes(serialize, exclude) - def from_bytes(self, data): - self.model = dill.loads(data) + def from_bytes(self, bytes_data, **exclude): + deserialize = { + 'vocab': lambda b: self.vocab.from_bytes(b), + 'moves': lambda b: self.moves.from_bytes(b), + 'cfg': lambda b: self.cfg.update(ujson.loads(b)), + 'model': lambda b: None + } + msg = util.from_bytes(bytes_data, deserialize, exclude) + if 'model' not in exclude: + if self.model is True: + 
self.model, cfg = self.Model(**msg['cfg']) + util.model_from_bytes(self.model, msg['model']) + return self class ParserStateError(ValueError): diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index d532cd445..bc6166e39 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -291,12 +291,11 @@ cdef class Vocab: **exclude: Named attributes to prevent from being serialized. RETURNS (bytes): The serialized form of the `Vocab` object. """ - data = {} - if 'strings' not in exclude: - data['strings'] = self.strings.to_bytes() - if 'lexemes' not in exclude: - data['lexemes'] = self.lexemes_to_bytes - return ujson.dumps(data) + getters = { + 'strings': lambda: self.strings.to_bytes(), + 'lexemes': lambda: self.lexemes_to_bytes() + } + return util.to_bytes(getters, exclude) def from_bytes(self, bytes_data, **exclude): """Load state from a binary string. @@ -305,12 +304,11 @@ cdef class Vocab: **exclude: Named attributes to prevent from being loaded. RETURNS (Vocab): The `Vocab` object. """ - data = ujson.loads(bytes_data) - if 'strings' not in exclude: - self.strings.from_bytes(data['strings']) - if 'lexemes' not in exclude: - self.lexemes_from_bytes(data['lexemes']) - return self + setters = { + 'strings': lambda b: self.strings.from_bytes(b), + 'lexemes': lambda b: self.lexemes_from_bytes(b) + } + return util.from_bytes(bytes_data, setters, exclude) def lexemes_to_bytes(self): cdef hash_t key