Mirror of https://github.com/explosion/spaCy.git
Commit aa4c33914b (parent deac7eb01c)

Work on serialization
@@ -9,6 +9,7 @@ from thinc.neural.ops import NumpyOps, CupyOps
 from thinc.neural.optimizers import Adam, SGD
 import random
 import ujson
+from collections import OrderedDict

 from .tokenizer import Tokenizer
 from .vocab import Vocab
@@ -154,7 +155,7 @@ class Language(object):
         if make_doc is True:
             factory = self.Defaults.create_tokenizer
             make_doc = factory(self, **meta.get('tokenizer', {}))
-        self.make_doc = make_doc
+        self.tokenizer = make_doc
         if pipeline is True:
             self.pipeline = self.Defaults.create_pipeline(self)
         elif pipeline:
@@ -196,6 +197,9 @@ class Language(object):
                 doc = proc(doc)
         return doc

+    def make_doc(self, text):
+        return self.tokenizer(text)
+
     def update(self, docs, golds, drop=0., sgd=None, losses=None):
         """Update the models in the pipeline.
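Note: with the two hunks above, `make_doc` stops being a stored callable and becomes a thin method that delegates to `self.tokenizer`. A usage sketch of the resulting behaviour (assumes an installed English model; illustrative only):

    import spacy

    nlp = spacy.load('en')
    # make_doc only tokenizes; pipeline components are not applied.
    doc = nlp.make_doc(u'This is a sentence.')
    assert [t.text for t in doc] == [t.text for t in nlp.tokenizer(u'This is a sentence.')]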
@@ -425,19 +429,17 @@ class Language(object):
             from being serialized.
         RETURNS (bytes): The serialized form of the `Language` object.
         """
-        serializers = {
-            'vocab': lambda: self.vocab.to_bytes(),
-            'tokenizer': lambda: self.tokenizer.to_bytes(vocab=False),
-            'meta': lambda: ujson.dumps(self.meta)
-        }
-        for proc in self.pipeline:
-            if not hasattr(proc, 'name'):
-                continue
-            if proc.name in disable:
+        serializers = OrderedDict((
+            ('vocab', lambda: self.vocab.to_bytes()),
+            ('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
+            ('meta', lambda: ujson.dumps(self.meta))
+        ))
+        for i, proc in enumerate(self.pipeline):
+            if getattr(proc, 'name', None) in disable:
                 continue
             if not hasattr(proc, 'to_bytes'):
                 continue
-            serializers[proc.name] = lambda: proc.to_bytes(vocab=False)
+            serializers[i] = lambda: proc.to_bytes(vocab=False)
         return util.to_bytes(serializers, {})

     def from_bytes(self, bytes_data, disable=[]):
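Note: swapping the dict literal for an OrderedDict makes the serialization order deterministic on interpreters where plain dicts are unordered (Python 2, CPython < 3.6), and keying extra pipeline components by position `i` rather than `proc.name` keeps `to_bytes` aligned with `from_bytes` even for unnamed components. The body of `util.to_bytes` is not part of this diff; a minimal sketch of what it presumably does, inferred from its call sites here:

    from collections import OrderedDict
    import msgpack

    def to_bytes(getters, exclude):
        # Assumed behaviour: call each getter in insertion order,
        # skip excluded keys, and msgpack the resulting mapping.
        serialized = OrderedDict()
        for key, getter in getters.items():
            if key not in exclude:
                serialized[key] = getter()
        return msgpack.dumps(serialized)

One caveat the diff leaves in place: the loop's `lambda: proc.to_bytes(vocab=False)` closes over `proc` by reference, so every entry ends up serializing the last component; binding it as a default argument (`lambda proc=proc: ...`) would be the usual fix.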
@@ -447,20 +449,18 @@ class Language(object):
         disable (list): Names of the pipeline components to disable.
         RETURNS (Language): The `Language` object.
         """
-        deserializers = {
-            'vocab': lambda b: self.vocab.from_bytes(b),
-            'tokenizer': lambda b: self.tokenizer.from_bytes(b, vocab=False),
-            'meta': lambda b: self.meta.update(ujson.loads(b))
-        }
-        for proc in self.pipeline:
-            if not hasattr(proc, 'name'):
+        deserializers = OrderedDict((
+            ('vocab', lambda b: self.vocab.from_bytes(b)),
+            ('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)),
+            ('meta', lambda b: self.meta.update(ujson.loads(b)))
+        ))
+        for i, proc in enumerate(self.pipeline):
+            if getattr(proc, 'name', None) in disable:
                 continue
-            if proc.name in disable:
+            if not hasattr(proc, 'from_bytes'):
                 continue
-            if not hasattr(proc, 'to_disk'):
-                continue
-            deserializers[proc.name] = lambda b: proc.from_bytes(b, vocab=False)
-        util.from_bytes(deserializers, bytes_data, {})
+            deserializers[i] = lambda b: proc.from_bytes(b, vocab=False)
+        util.from_bytes(bytes_data, deserializers, {})
         return self
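Note: besides mirroring the OrderedDict change, this hunk fixes two slips in `from_bytes`: the loop gated components on `to_disk` where it meant `from_bytes`, and the helper was called as `util.from_bytes(deserializers, bytes_data, {})` with the arguments swapped. The commit settles on payload-first. A sketch of the counterpart helper, again assumed from its call sites (returning the unpacked message matters for `Parser.from_bytes` below):

    import msgpack

    def from_bytes(bytes_data, setters, exclude):
        # Assumed behaviour: unpack the message, then apply each
        # setter to its payload in insertion order; return the raw
        # message so callers can pick out deferred payloads.
        msg = msgpack.loads(bytes_data, raw=False)
        for key, setter in setters.items():
            if key not in exclude and key in msg:
                setter(msg[key])
        return msg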
@@ -9,6 +9,7 @@ import numpy
 cimport numpy as np
 import cytoolz
 import util
+from collections import OrderedDict

 from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
 from thinc.neural import Model, Maxout, Softmax, Affine
@@ -158,18 +159,18 @@ class TokenVectorEncoder(object):
             yield

     def to_bytes(self, **exclude):
-        serialize = {
-            'model': lambda: util.model_to_bytes(self.model),
-            'vocab': lambda: self.vocab.to_bytes()
-        }
+        serialize = OrderedDict((
+            ('model', lambda: util.model_to_bytes(self.model)),
+            ('vocab', lambda: self.vocab.to_bytes())
+        ))
         return util.to_bytes(serialize, exclude)

     def from_bytes(self, bytes_data, **exclude):
-        deserialize = {
-            'model': lambda b: util.model_from_bytes(self.model, b),
-            'vocab': lambda b: self.vocab.from_bytes(b)
-        }
-        util.from_bytes(deserialize, exclude)
+        deserialize = OrderedDict((
+            ('model', lambda b: util.model_from_bytes(self.model, b)),
+            ('vocab', lambda b: self.vocab.from_bytes(b))
+        ))
+        util.from_bytes(bytes_data, deserialize, exclude)
         return self

     def to_disk(self, path, **exclude):
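Note: the same payload-first fix lands here: the old `util.from_bytes(deserialize, exclude)` dropped `bytes_data` entirely, so `TokenVectorEncoder.from_bytes` could never restore anything. A round trip through the two helper sketches above, using a stand-in component state (hypothetical names, not spaCy's API):

    from collections import OrderedDict

    state = {'weights': b'\x01\x02'}
    serializers = OrderedDict((('model', lambda: state['weights']),))
    deserializers = OrderedDict((('model', lambda b: state.update(weights=b)),))

    packed = to_bytes(serializers, exclude=())
    state['weights'] = b''
    from_bytes(packed, deserializers, exclude=())
    assert state['weights'] == b'\x01\x02'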
@@ -659,9 +659,10 @@ cdef class Parser:

     def to_bytes(self, **exclude):
         serializers = {
-            'model': lambda: util.model_to_bytes(self.model),
+            'lower_model': lambda: util.model_to_bytes(self.model[0]),
+            'upper_model': lambda: util.model_to_bytes(self.model[1]),
             'vocab': lambda: self.vocab.to_bytes(),
-            'moves': lambda: self.moves.to_bytes(vocab=False),
+            'moves': lambda: self.moves.to_bytes(strings=False),
             'cfg': lambda: ujson.dumps(self.cfg)
         }
         return util.to_bytes(serializers, exclude)
@@ -669,15 +670,19 @@ cdef class Parser:
     def from_bytes(self, bytes_data, **exclude):
         deserializers = {
             'vocab': lambda b: self.vocab.from_bytes(b),
-            'moves': lambda b: self.moves.from_bytes(b),
+            'moves': lambda b: self.moves.from_bytes(b, strings=False),
             'cfg': lambda b: self.cfg.update(ujson.loads(b)),
-            'model': lambda b: None
+            'lower_model': lambda b: None,
+            'upper_model': lambda b: None
         }
         msg = util.from_bytes(bytes_data, deserializers, exclude)
         if 'model' not in exclude:
             if self.model is True:
                 self.model, cfg = self.Model(self.moves.n_moves)
-            util.model_from_bytes(self.model, msg['model'])
+            else:
+                cfg = {}
+            util.model_from_bytes(self.model[0], msg['lower_model'])
+            util.model_from_bytes(self.model[1], msg['upper_model'])
             self.cfg.update(cfg)
         return self
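Note: the parser's model is now a (lower, upper) pair, so the two halves are packed under separate keys, and `moves` passes `strings=False` on both sides, presumably to keep the shared string store out of that payload. The `lambda b: None` entries are deliberate no-ops: they let `util.from_bytes` leave the raw weight bytes in `msg` until the model objects exist to load them into. The new `else: cfg = {}` branch also fixes a latent bug: deserializing into a parser whose model was already built reached `self.cfg.update(cfg)` with `cfg` never assigned. A minimal reproduction of that old control flow:

    def old_flow(model_already_built):
        if not model_already_built:
            cfg = {'maxout_pieces': 2}   # only bound on this branch
        return cfg                        # raises when already built

    try:
        old_flow(model_already_built=True)
    except UnboundLocalError as err:
        print('old code path fails:', err)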
@@ -11,6 +11,7 @@ import sys
 import textwrap
 import random
 import numpy
+import io

 import msgpack
 import msgpack_numpy
@@ -447,27 +448,25 @@ def model_to_bytes(model):
         i += 1
         if hasattr(layer, '_layers'):
             queue.extend(layer._layers)
-    data = {'metas': tuple(metas), 'weights': tuple(weights), 'dims':
-            tuple(dims)}
+    data = {'metas': metas, 'weights': weights, 'dims': dims}
     return msgpack.dumps(data)


 def model_from_bytes(model, bytes_data):
     data = msgpack.loads(bytes_data)
-    metas = data['metas']
     weights = data['weights']
+    metas = data['metas']
     dims = data['dims']
     queue = [model]
     i = 0
     for layer in queue:
         if hasattr(layer, '_mem'):
             params = weights[i]
-            flat_mem = layer._mem._mem.ravel()
-            flat_params = params.ravel()
-            flat_mem[:flat_params.size] = flat_params
-            layer._mem._offsets.update(metas[i])
+            blob = layer._mem._get_blob(params.size)
+            blob[:] = params
+            layer._mem._offsets = metas[i]
         if hasattr(layer, '_dims'):
-            layer._dims.update(dims[i])
+            layer._dims[i] = dims[i]
         i += 1
         if hasattr(layer, '_layers'):
             queue.extend(layer._layers)
|
|
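Note: `model_to_bytes` drops the `tuple(...)` wrapping (msgpack encodes tuples as lists anyway, so the distinction never survived a round trip), and `model_from_bytes` now copies weights through the allocator's `_get_blob` instead of slicing the raveled backing buffer directly. `msgpack_numpy` is what lets raw numpy arrays pass through msgpack. A round-trip sketch of the container format only, with stand-in data rather than thinc's actual layer objects:

    import msgpack
    import msgpack_numpy
    import numpy

    msgpack_numpy.patch()   # teach msgpack to pack/unpack numpy arrays

    weights = [numpy.arange(6, dtype='float32')]
    dims = [{'nI': 2, 'nO': 3}]
    packed = msgpack.dumps({'weights': weights, 'dims': dims})

    msg = msgpack.loads(packed, raw=False)
    assert numpy.array_equal(msg['weights'][0], weights[0])
    assert msg['dims'][0]['nO'] == 3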