Fix to/from disk methods

Matthew Honnibal 2017-05-31 13:42:39 +02:00
parent 5c30466c95
commit 33e5ec737f
4 changed files with 87 additions and 54 deletions

spacy/language.py

@@ -96,6 +96,13 @@ class BaseDefaults(object):
     factories = {
         'make_doc': create_tokenizer,
+        'tensorizer': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
+        'tagger': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
+        'parser': lambda nlp, **cfg: [
+            NeuralDependencyParser(nlp.vocab, **cfg),
+            nonproj.deprojectivize],
+        'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
+        # Temporary compatibility -- delete after pivot
         'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
         'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
         'dependencies': lambda nlp, **cfg: [
@@ -358,37 +365,35 @@ class Language(object):
         for doc in docs:
             yield doc
 
-    def to_disk(self, path, disable=[]):
+    def to_disk(self, path, disable=tuple()):
         """Save the current state to a directory. If a model is loaded, this
         will include the model.
 
         path (unicode or Path): A path to a directory, which will be created if
             it doesn't exist. Paths may be either strings or `Path`-like objects.
-        disable (list): Nameds of pipeline components to disable and prevent
+        disable (list): Names of pipeline components to disable and prevent
             from being saved.
 
         EXAMPLE:
            >>> nlp.to_disk('/path/to/models')
        """
        path = util.ensure_path(path)
-        with path.open('wb') as file_:
-            file_.write(self.to_bytes(disable))
-        #serializers = {
-        #    'vocab': lambda p: self.vocab.to_disk(p),
-        #    'tokenizer': lambda p: self.tokenizer.to_disk(p, vocab=False),
-        #    'meta.json': lambda p: ujson.dump(p.open('w'), self.meta)
-        #}
-        #for proc in self.pipeline:
-        #    if not hasattr(proc, 'name'):
-        #        continue
-        #    if proc.name in disable:
-        #        continue
-        #    if not hasattr(proc, 'to_disk'):
-        #        continue
-        #    serializers[proc.name] = lambda p: proc.to_disk(p, vocab=False)
-        #util.to_disk(serializers, path)
+        serializers = OrderedDict((
+            ('vocab', lambda p: self.vocab.to_disk(p)),
+            ('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)),
+            ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
+        ))
+        for proc in self.pipeline:
+            if not hasattr(proc, 'name'):
+                continue
+            if proc.name in disable:
+                continue
+            if not hasattr(proc, 'to_disk'):
+                continue
+            serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
+        util.to_disk(path, serializers, {p: False for p in disable})
 
-    def from_disk(self, path, disable=[]):
+    def from_disk(self, path, disable=tuple()):
         """Loads state from a directory. Modifies the object in place and
         returns it. If the saved `Language` object contains a model, the
         model will be loaded.
@@ -403,24 +408,21 @@ class Language(object):
            >>> nlp = Language().from_disk('/path/to/models')
        """
        path = util.ensure_path(path)
-        with path.open('rb') as file_:
-            bytes_data = file_.read()
-        return self.from_bytes(bytes_data, disable)
-        #deserializers = {
-        #    'vocab': lambda p: self.vocab.from_disk(p),
-        #    'tokenizer': lambda p: self.tokenizer.from_disk(p, vocab=False),
-        #    'meta.json': lambda p: ujson.dump(p.open('w'), self.meta)
-        #}
-        #for proc in self.pipeline:
-        #    if not hasattr(proc, 'name'):
-        #        continue
-        #    if proc.name in disable:
-        #        continue
-        #    if not hasattr(proc, 'to_disk'):
-        #        continue
-        #    deserializers[proc.name] = lambda p: proc.from_disk(p, vocab=False)
-        #util.from_disk(deserializers, path)
-        #return self
+        deserializers = OrderedDict((
+            ('vocab', lambda p: self.vocab.from_disk(p)),
+            ('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)),
+            ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
+        ))
+        for proc in self.pipeline:
+            if not hasattr(proc, 'name'):
+                continue
+            if proc.name in disable:
+                continue
+            if not hasattr(proc, 'to_disk'):
+                continue
+            deserializers[proc.name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
+        util.from_disk(path, deserializers, {p: False for p in disable})
+        return self
 
     def to_bytes(self, disable=[]):
         """Serialize the current state to a binary string.

spacy/pipeline.pyx

@@ -41,7 +41,7 @@ from .parts_of_speech import X
 
 class TokenVectorEncoder(object):
     """Assign position-sensitive vectors to tokens, using a CNN or RNN."""
-    name = 'tok2vec'
+    name = 'tensorizer'
 
     @classmethod
     def Model(cls, width=128, embed_size=7500, **cfg):
@@ -176,17 +176,19 @@ class TokenVectorEncoder(object):
         return self
 
     def to_disk(self, path, **exclude):
-        serialize = {
-            'model': lambda p: p.open('w').write(util.model_to_bytes(self.model)),
-            'vocab': lambda p: self.vocab.to_disk(p)
-        }
+        serialize = OrderedDict((
+            ('model', lambda p: p.open('wb').write(util.model_to_bytes(self.model))),
+            ('vocab', lambda p: self.vocab.to_disk(p))
+        ))
         util.to_disk(path, serialize, exclude)
 
     def from_disk(self, path, **exclude):
-        deserialize = {
-            'model': lambda p: util.model_from_bytes(self.model, p.open('rb').read()),
-            'vocab': lambda p: self.vocab.from_disk(p)
-        }
+        if self.model is True:
+            self.model = self.Model()
+        deserialize = OrderedDict((
+            ('model', lambda p: util.model_from_bytes(self.model, p.open('rb').read())),
+            ('vocab', lambda p: self.vocab.from_disk(p))
+        ))
         util.from_disk(path, deserialize, exclude)
         return self
@@ -315,7 +317,7 @@ class NeuralTagger(object):
     def to_disk(self, path, **exclude):
         serialize = {
-            'model': lambda p: p.open('w').write(util.model_to_bytes(self.model)),
+            'model': lambda p: p.open('wb').write(util.model_to_bytes(self.model)),
             'vocab': lambda p: self.vocab.to_disk(p)
         }
         util.to_disk(path, serialize, exclude)
@@ -420,7 +422,7 @@ cdef class NeuralDependencyParser(NeuralParser):
 
 cdef class NeuralEntityRecognizer(NeuralParser):
-    name = 'entity'
+    name = 'ner'
     TransitionSystem = BiluoPushDown
 
     nr_feature = 6
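
Note: two recurring fixes in this file are writing model bytes in binary mode ('wb' rather than 'w'), and lazily building a default model before deserializing into it — `self.model` uses `True` as a "not constructed yet" sentinel. A rough illustration of the sentinel pattern, with a dict standing in for a real thinc model (not the actual API):

class Tensorizer(object):
    def __init__(self, model=True):
        self.model = model          # True == "build the default model later"

    @classmethod
    def Model(cls, width=128):
        return {'width': width}     # stand-in for a real thinc model

    def from_disk(self, weights):
        if self.model is True:      # construct before loading weights into it
            self.model = self.Model()
        self.model.update(weights)  # stand-in for util.model_from_bytes
        return self

tensorizer = Tensorizer().from_disk({'weights': [0.1, 0.2]})
print(tensorizer.model)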

spacy/syntax/nn_parser.pyx

@@ -44,6 +44,7 @@ from .. import util
 from ..util import get_async, get_cuda_stream
 from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
 from .._ml import Tok2Vec, doc2feats, rebatch
+from ..compat import json_dumps
 
 from . import _parse_features
 from ._parse_features cimport CONTEXT_SIZE
@@ -633,11 +634,13 @@ cdef class Parser:
 
     def to_disk(self, path, **exclude):
         serializers = {
-            'model': lambda p: p.open('wb').write(
-                util.model_to_bytes(self.model)),
+            'lower_model': lambda p: p.open('wb').write(
+                util.model_to_bytes(self.model[0])),
+            'upper_model': lambda p: p.open('wb').write(
+                util.model_to_bytes(self.model[1])),
             'vocab': lambda p: self.vocab.to_disk(p),
             'moves': lambda p: self.moves.to_disk(p, strings=False),
-            'cfg': lambda p: ujson.dumps(p.open('w'), self.cfg)
+            'cfg': lambda p: p.open('w').write(json_dumps(self.cfg))
         }
         util.to_disk(path, serializers, exclude)
@@ -645,7 +648,7 @@ cdef class Parser:
         deserializers = {
             'vocab': lambda p: self.vocab.from_disk(p),
             'moves': lambda p: self.moves.from_disk(p, strings=False),
-            'cfg': lambda p: self.cfg.update(ujson.load((path/'cfg.json').open())),
+            'cfg': lambda p: self.cfg.update(ujson.load(p.open())),
             'model': lambda p: None
         }
         util.from_disk(path, deserializers, exclude)
@@ -653,7 +656,14 @@ cdef class Parser:
         path = util.ensure_path(path)
         if self.model is True:
             self.model, cfg = self.Model(**self.cfg)
-        util.model_from_disk(self.model, path / 'model')
+        else:
+            cfg = {}
+        with (path / 'lower_model').open('rb') as file_:
+            bytes_data = file_.read()
+        util.model_from_bytes(self.model[0], bytes_data)
+        with (path / 'upper_model').open('rb') as file_:
+            bytes_data = file_.read()
+        util.model_from_bytes(self.model[1], bytes_data)
         self.cfg.update(cfg)
         return self
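
Note: the parser's model is a (lower, upper) pair, so it is now written as two files, `lower_model` and `upper_model`, and read back piecewise. A hedged sketch of the resulting on-disk round trip, using plain bytes in place of util.model_to_bytes / model_from_bytes:

from pathlib import Path
import tempfile

model = (b'lower-weights', b'upper-weights')   # stand-in for self.model[0], self.model[1]
path = Path(tempfile.mkdtemp())

# to_disk: one file per half of the model pair
for name, data in (('lower_model', model[0]), ('upper_model', model[1])):
    with (path / name).open('wb') as file_:
        file_.write(data)

# from_disk: read each half back independently
with (path / 'lower_model').open('rb') as file_:
    lower = file_.read()
with (path / 'upper_model').open('rb') as file_:
    upper = file_.read()
assert (lower, upper) == model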

spacy/util.py

@@ -13,6 +13,7 @@ import random
 import numpy
 import io
 import dill
+from collections import OrderedDict
 
 import msgpack
 import msgpack_numpy
@@ -408,7 +409,7 @@ def get_raw_input(description, default=False):
 
 def to_bytes(getters, exclude):
-    serialized = {}
+    serialized = OrderedDict()
     for key, getter in getters.items():
         if key not in exclude:
             serialized[key] = getter()
@@ -423,6 +424,24 @@ def from_bytes(bytes_data, setters, exclude):
     return msg
 
 
+def to_disk(path, writers, exclude):
+    path = ensure_path(path)
+    if not path.exists():
+        path.mkdir()
+    for key, writer in writers.items():
+        if key not in exclude:
+            writer(path / key)
+    return path
+
+
+def from_disk(path, readers, exclude):
+    path = ensure_path(path)
+    for key, reader in readers.items():
+        if key not in exclude:
+            reader(path / key)
+    return path
+
+
 # This stuff really belongs in thinc -- but I expect
 # to refactor how all this works in thinc anyway.
 # What a mess!
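
Note: the new util.to_disk / from_disk helpers treat a directory as a dict of named artifacts: each key becomes a file (or subdirectory) under `path`, each value is a callback that writes or reads it, and `exclude` filters by key. A self-contained usage sketch — the helper bodies are copied from the hunk above, with a simplified stand-in for spaCy's ensure_path:

from pathlib import Path
import tempfile

def ensure_path(path):
    # simplified stand-in for spacy.util.ensure_path
    return Path(path) if isinstance(path, str) else path

def to_disk(path, writers, exclude):
    path = ensure_path(path)
    if not path.exists():
        path.mkdir()
    for key, writer in writers.items():
        if key not in exclude:
            writer(path / key)
    return path

def from_disk(path, readers, exclude):
    path = ensure_path(path)
    for key, reader in readers.items():
        if key not in exclude:
            reader(path / key)
    return path

tmp = tempfile.mkdtemp()
to_disk(tmp, {'meta.json': lambda p: p.open('w').write('{"lang": "en"}')},
        exclude=[])
loaded = {}
from_disk(tmp, {'meta.json': lambda p: loaded.update(meta=p.open().read())},
          exclude=[])
print(loaded['meta'])   # {"lang": "en"}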