Work on serialization

Matthew Honnibal 2017-05-29 08:40:45 -05:00
parent deac7eb01c
commit aa4c33914b
4 changed files with 50 additions and 45 deletions

View File

@@ -9,6 +9,7 @@ from thinc.neural.ops import NumpyOps, CupyOps
 from thinc.neural.optimizers import Adam, SGD
 import random
 import ujson
+from collections import OrderedDict
 from .tokenizer import Tokenizer
 from .vocab import Vocab
@@ -154,7 +155,7 @@ class Language(object):
         if make_doc is True:
             factory = self.Defaults.create_tokenizer
             make_doc = factory(self, **meta.get('tokenizer', {}))
-        self.make_doc = make_doc
+        self.tokenizer = make_doc
         if pipeline is True:
             self.pipeline = self.Defaults.create_pipeline(self)
         elif pipeline:
@@ -196,6 +197,9 @@ class Language(object):
             doc = proc(doc)
         return doc
 
+    def make_doc(self, text):
+        return self.tokenizer(text)
+
     def update(self, docs, golds, drop=0., sgd=None, losses=None):
         """Update the models in the pipeline.
@@ -425,19 +429,17 @@ class Language(object):
             from being serialized.
         RETURNS (bytes): The serialized form of the `Language` object.
         """
-        serializers = {
-            'vocab': lambda: self.vocab.to_bytes(),
-            'tokenizer': lambda: self.tokenizer.to_bytes(vocab=False),
-            'meta': lambda: ujson.dumps(self.meta)
-        }
-        for proc in self.pipeline:
-            if not hasattr(proc, 'name'):
-                continue
-            if proc.name in disable:
+        serializers = OrderedDict((
+            ('vocab', lambda: self.vocab.to_bytes()),
+            ('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
+            ('meta', lambda: ujson.dumps(self.meta))
+        ))
+        for i, proc in enumerate(self.pipeline):
+            if getattr(proc, 'name', None) in disable:
                 continue
             if not hasattr(proc, 'to_bytes'):
                 continue
-            serializers[proc.name] = lambda: proc.to_bytes(vocab=False)
+            serializers[i] = lambda: proc.to_bytes(vocab=False)
         return util.to_bytes(serializers, {})
 
     def from_bytes(self, bytes_data, disable=[]):
@@ -447,20 +449,18 @@ class Language(object):
         disable (list): Names of the pipeline components to disable.
         RETURNS (Language): The `Language` object.
         """
-        deserializers = {
-            'vocab': lambda b: self.vocab.from_bytes(b),
-            'tokenizer': lambda b: self.tokenizer.from_bytes(b, vocab=False),
-            'meta': lambda b: self.meta.update(ujson.loads(b))
-        }
-        for proc in self.pipeline:
-            if not hasattr(proc, 'name'):
-                continue
-            if proc.name in disable:
-                continue
-            if not hasattr(proc, 'to_disk'):
-                continue
-            deserializers[proc.name] = lambda b: proc.from_bytes(b, vocab=False)
-        util.from_bytes(deserializers, bytes_data, {})
+        deserializers = OrderedDict((
+            ('vocab', lambda b: self.vocab.from_bytes(b)),
+            ('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)),
+            ('meta', lambda b: self.meta.update(ujson.loads(b)))
+        ))
+        for i, proc in enumerate(self.pipeline):
+            if getattr(proc, 'name', None) in disable:
+                continue
+            if not hasattr(proc, 'from_bytes'):
+                continue
+            deserializers[i] = lambda b: proc.from_bytes(b, vocab=False)
+        util.from_bytes(bytes_data, deserializers, {})
         return self
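
The serializer tables above map a field name to a zero-argument getter (to_bytes) or a one-argument setter (from_bytes), and util.to_bytes/util.from_bytes consume them. The helper bodies below are a minimal sketch of that contract as implied by the call sites, not spaCy's actual implementation; the use of msgpack and the raw=False flag are assumptions.

    import msgpack
    from collections import OrderedDict

    def to_bytes_sketch(getters, exclude):
        # Call each getter that isn't excluded; OrderedDict keeps the field
        # order stable, so the packed output is deterministic.
        msg = OrderedDict((key, getter()) for key, getter in getters.items()
                          if key not in exclude)
        return msgpack.dumps(msg)

    def from_bytes_sketch(bytes_data, setters, exclude):
        # Unpack, then feed each field back through its setter in table order.
        # raw=False (string keys) assumes a reasonably recent msgpack.
        msg = msgpack.loads(bytes_data, raw=False)
        for key, setter in setters.items():
            if key not in exclude and key in msg:
                setter(msg[key])
        return msg

Under that reading, switching the tables from plain dicts to OrderedDict makes the field order, and therefore both the packed byte stream and the order in which setters run, deterministic: the vocab is restored before the components that depend on it.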

View File

@@ -9,6 +9,7 @@ import numpy
 cimport numpy as np
 import cytoolz
 import util
+from collections import OrderedDict
 from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
 from thinc.neural import Model, Maxout, Softmax, Affine
@@ -158,18 +159,18 @@ class TokenVectorEncoder(object):
             yield
 
     def to_bytes(self, **exclude):
-        serialize = {
-            'model': lambda: util.model_to_bytes(self.model),
-            'vocab': lambda: self.vocab.to_bytes()
-        }
+        serialize = OrderedDict((
+            ('model', lambda: util.model_to_bytes(self.model)),
+            ('vocab', lambda: self.vocab.to_bytes())
+        ))
         return util.to_bytes(serialize, exclude)
 
     def from_bytes(self, bytes_data, **exclude):
-        deserialize = {
-            'model': lambda b: util.model_from_bytes(self.model, b),
-            'vocab': lambda b: self.vocab.from_bytes(b)
-        }
-        util.from_bytes(deserialize, exclude)
+        deserialize = OrderedDict((
+            ('model', lambda b: util.model_from_bytes(self.model, b)),
+            ('vocab', lambda b: self.vocab.from_bytes(b))
+        ))
+        util.from_bytes(bytes_data, deserialize, exclude)
        return self
 
     def to_disk(self, path, **exclude):
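
TokenVectorEncoder now follows the same OrderedDict pattern, and any component that exposes name, to_bytes and from_bytes this way would be picked up by Language.to_bytes/from_bytes above. The sketch below is illustrative only: the class, its attributes and the spacy.util import path are assumptions, not part of this commit.

    import ujson
    from collections import OrderedDict
    from spacy import util  # assumption: the same helpers used in this diff

    class CustomComponent(object):
        """Hypothetical pipeline component following the same pattern."""
        name = 'custom'

        def __init__(self, vocab):
            self.vocab = vocab
            self.cfg = {}

        def to_bytes(self, **exclude):
            serialize = OrderedDict((
                ('cfg', lambda: ujson.dumps(self.cfg)),
                ('vocab', lambda: self.vocab.to_bytes())
            ))
            return util.to_bytes(serialize, exclude)

        def from_bytes(self, bytes_data, **exclude):
            deserialize = OrderedDict((
                ('cfg', lambda b: self.cfg.update(ujson.loads(b))),
                ('vocab', lambda b: self.vocab.from_bytes(b))
            ))
            util.from_bytes(bytes_data, deserialize, exclude)
            return self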

View File

@@ -659,9 +659,10 @@ cdef class Parser:
     def to_bytes(self, **exclude):
         serializers = {
-            'model': lambda: util.model_to_bytes(self.model),
+            'lower_model': lambda: util.model_to_bytes(self.model[0]),
+            'upper_model': lambda: util.model_to_bytes(self.model[1]),
             'vocab': lambda: self.vocab.to_bytes(),
-            'moves': lambda: self.moves.to_bytes(vocab=False),
+            'moves': lambda: self.moves.to_bytes(strings=False),
             'cfg': lambda: ujson.dumps(self.cfg)
         }
         return util.to_bytes(serializers, exclude)
@@ -669,15 +670,19 @@ cdef class Parser:
     def from_bytes(self, bytes_data, **exclude):
         deserializers = {
             'vocab': lambda b: self.vocab.from_bytes(b),
-            'moves': lambda b: self.moves.from_bytes(b),
+            'moves': lambda b: self.moves.from_bytes(b, strings=False),
             'cfg': lambda b: self.cfg.update(ujson.loads(b)),
-            'model': lambda b: None
+            'lower_model': lambda b: None,
+            'upper_model': lambda b: None
         }
         msg = util.from_bytes(bytes_data, deserializers, exclude)
         if 'model' not in exclude:
             if self.model is True:
                 self.model, cfg = self.Model(self.moves.n_moves)
-            util.model_from_bytes(self.model, msg['model'])
+            else:
+                cfg = {}
+            util.model_from_bytes(self.model[0], msg['lower_model'])
+            util.model_from_bytes(self.model[1], msg['upper_model'])
             self.cfg.update(cfg)
         return self
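
The parser's model is now a pair of thinc models, serialized under separate 'lower_model' and 'upper_model' keys, and from_bytes relies on util.from_bytes returning the unpacked message so both blobs can be loaded once the model pair exists. The sketch below mirrors the calls above but is not runnable on its own: `lower` and `upper` are placeholders for self.model[0] and self.model[1], and `util` is assumed to be spaCy's util module.

    # Placeholders: `lower` and `upper` stand for the parser's two thinc models.
    serializers = {
        'lower_model': lambda: util.model_to_bytes(lower),
        'upper_model': lambda: util.model_to_bytes(upper),
    }
    data = util.to_bytes(serializers, {})

    # The setters for the model fields are no-ops; the weights are read out of
    # the returned message instead, after the models have been (re)built.
    msg = util.from_bytes(data, {'lower_model': lambda b: None,
                                 'upper_model': lambda b: None}, {})
    util.model_from_bytes(lower, msg['lower_model'])
    util.model_from_bytes(upper, msg['upper_model'])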

View File

@@ -11,6 +11,7 @@ import sys
 import textwrap
 import random
 import numpy
+import io
 import msgpack
 import msgpack_numpy
@@ -447,27 +448,25 @@ def model_to_bytes(model):
         i += 1
         if hasattr(layer, '_layers'):
             queue.extend(layer._layers)
-    data = {'metas': tuple(metas), 'weights': tuple(weights), 'dims':
-            tuple(dims)}
+    data = {'metas': metas, 'weights': weights, 'dims': dims}
     return msgpack.dumps(data)
 
 
 def model_from_bytes(model, bytes_data):
     data = msgpack.loads(bytes_data)
+    metas = data['metas']
     weights = data['weights']
-    metas = data['metas']
     dims = data['dims']
     queue = [model]
     i = 0
     for layer in queue:
         if hasattr(layer, '_mem'):
             params = weights[i]
-            flat_mem = layer._mem._mem.ravel()
-            flat_params = params.ravel()
-            flat_mem[:flat_params.size] = flat_params
-            layer._mem._offsets.update(metas[i])
+            blob = layer._mem._get_blob(params.size)
+            blob[:] = params
+            layer._mem._offsets = metas[i]
         if hasattr(layer, '_dims'):
-            layer._dims.update(dims[i])
+            layer._dims[i] = dims[i]
         i += 1
         if hasattr(layer, '_layers'):
             queue.extend(layer._layers)
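
model_to_bytes/model_from_bytes push numpy weight arrays through msgpack, which cannot encode ndarrays on its own; that is presumably what the msgpack_numpy import at the top of this file is for. Below is a minimal round trip using msgpack_numpy's documented hooks; whether spaCy wires them up via msgpack_numpy.patch() or via explicit arguments is not visible in this hunk.

    import numpy
    import msgpack
    import msgpack_numpy

    # Round-trip one weight array through msgpack using msgpack_numpy's hooks.
    weights = numpy.arange(12, dtype='f').reshape((3, 4))
    packed = msgpack.packb(weights, default=msgpack_numpy.encode)
    restored = msgpack.unpackb(packed, object_hook=msgpack_numpy.decode)
    assert numpy.array_equal(weights, restored)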