Work on to/from bytes/disk serialization methods

Matthew Honnibal 2017-05-29 11:45:45 +02:00
parent 6b019b0540
commit ff26aa6c37
6 changed files with 205 additions and 65 deletions

View File

@@ -366,20 +366,22 @@ class Language(object):
             >>> nlp.to_disk('/path/to/models')
         """
         path = util.ensure_path(path)
-        if not path.exists():
-            path.mkdir()
-        if not path.is_dir():
-            raise IOError("Output path must be a directory")
-        props = {}
-        for name, value in self.__dict__.items():
-            if name in disable:
-                continue
-            if hasattr(value, 'to_disk'):
-                value.to_disk(path / name)
-            else:
-                props[name] = value
-        with (path / 'props.pickle').open('wb') as file_:
-            dill.dump(props, file_)
+        with path.open('wb') as file_:
+            file_.write(self.to_bytes(disable))
+        #serializers = {
+        #    'vocab': lambda p: self.vocab.to_disk(p),
+        #    'tokenizer': lambda p: self.tokenizer.to_disk(p, vocab=False),
+        #    'meta.json': lambda p: ujson.dump(p.open('w'), self.meta)
+        #}
+        #for proc in self.pipeline:
+        #    if not hasattr(proc, 'name'):
+        #        continue
+        #    if proc.name in disable:
+        #        continue
+        #    if not hasattr(proc, 'to_disk'):
+        #        continue
+        #    serializers[proc.name] = lambda p: proc.to_disk(p, vocab=False)
+        #util.to_disk(serializers, path)

     def from_disk(self, path, disable=[]):
         """Loads state from a directory. Modifies the object in place and
@@ -396,13 +398,24 @@ class Language(object):
             >>> nlp = Language().from_disk('/path/to/models')
         """
         path = util.ensure_path(path)
-        for name in path.iterdir():
-            if name not in disable and hasattr(self, str(name)):
-                getattr(self, name).from_disk(path / name)
-        with (path / 'props.pickle').open('rb') as file_:
+        with path.open('rb') as file_:
             bytes_data = file_.read()
-        self.from_bytes(bytes_data, disable)
-        return self
+        return self.from_bytes(bytes_data, disable)
+        #deserializers = {
+        #    'vocab': lambda p: self.vocab.from_disk(p),
+        #    'tokenizer': lambda p: self.tokenizer.from_disk(p, vocab=False),
+        #    'meta.json': lambda p: ujson.dump(p.open('w'), self.meta)
+        #}
+        #for proc in self.pipeline:
+        #    if not hasattr(proc, 'name'):
+        #        continue
+        #    if proc.name in disable:
+        #        continue
+        #    if not hasattr(proc, 'to_disk'):
+        #        continue
+        #    deserializers[proc.name] = lambda p: proc.from_disk(p, vocab=False)
+        #util.from_disk(deserializers, path)
+        #return self

     def to_bytes(self, disable=[]):
         """Serialize the current state to a binary string.
@@ -411,11 +424,20 @@ class Language(object):
             from being serialized.
         RETURNS (bytes): The serialized form of the `Language` object.
         """
-        props = dict(self.__dict__)
-        for key in disable:
-            if key in props:
-                props.pop(key)
-        return dill.dumps(props, -1)
+        serializers = {
+            'vocab': lambda: self.vocab.to_bytes(),
+            'tokenizer': lambda: self.tokenizer.to_bytes(vocab=False),
+            'meta': lambda: ujson.dumps(self.meta)
+        }
+        for proc in self.pipeline:
+            if not hasattr(proc, 'name'):
+                continue
+            if proc.name in disable:
+                continue
+            if not hasattr(proc, 'to_bytes'):
+                continue
+            serializers[proc.name] = lambda: proc.to_bytes(vocab=False)
+        return util.to_bytes(serializers, disable)

     def from_bytes(self, bytes_data, disable=[]):
         """Load state from a binary string.
@@ -424,12 +446,23 @@ class Language(object):
         disable (list): Names of the pipeline components to disable.
         RETURNS (Language): The `Language` object.
         """
-        props = dill.loads(bytes_data)
-        for key, value in props.items():
-            if key not in disable:
-                setattr(self, key, value)
+        deserializers = {
+            'vocab': lambda b: self.vocab.from_bytes(b),
+            'tokenizer': lambda b: self.tokenizer.from_bytes(b, vocab=False),
+            'meta': lambda b: self.meta.update(ujson.loads(b))
+        }
+        for proc in self.pipeline:
+            if not hasattr(proc, 'name'):
+                continue
+            if proc.name in disable:
+                continue
+            if not hasattr(proc, 'from_bytes'):
+                continue
+            deserializers[proc.name] = lambda b: proc.from_bytes(b, vocab=False)
+        util.from_bytes(bytes_data, deserializers, disable)
         return self

     def _pipe(func, docs):
         for doc in docs:
             func(doc)
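With this change, Language.to_disk becomes a thin wrapper over to_bytes: the whole pipeline is written to a single binary file rather than a directory tree, with the directory-based serializers left commented out for now. A hedged round-trip sketch of the new behaviour (the import path and a writable /tmp are assumptions, not part of this commit):

    from spacy.lang.en import English

    nlp = English()
    nlp.to_disk('/tmp/nlp.bin')                 # writes self.to_bytes(disable) to one file
    nlp2 = English().from_disk('/tmp/nlp.bin')  # reads the file, delegates to from_bytes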

View File

@@ -35,7 +35,6 @@ from .syntax import nonproj
 from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
 from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
-from ._ml import model_to_bytes, model_from_bytes
 from .parts_of_speech import X
@@ -160,36 +159,33 @@ class TokenVectorEncoder(object):
     def to_bytes(self, **exclude):
         serialize = {
-            'model': lambda: model_to_bytes(self.model),
+            'model': lambda: util.model_to_bytes(self.model),
             'vocab': lambda: self.vocab.to_bytes()
         }
         return util.to_bytes(serialize, exclude)

     def from_bytes(self, bytes_data, **exclude):
         deserialize = {
-            'model': lambda b: model_from_bytes(self.model, b),
+            'model': lambda b: util.model_from_bytes(self.model, b),
             'vocab': lambda b: self.vocab.from_bytes(b)
         }
         util.from_bytes(deserialize, exclude)
         return self

     def to_disk(self, path, **exclude):
-        path = util.ensure_path(path)
-        if not path.exists():
-            path.mkdir()
-        if 'vocab' not in exclude:
-            self.vocab.to_disk(path / 'vocab')
-        if 'model' not in exclude:
-            with (path / 'model.bin').open('wb') as file_:
-                file_.write(util.model_to_bytes(self.model))
+        serialize = {
+            'model': lambda p: p.open('wb').write(util.model_to_bytes(self.model)),
+            'vocab': lambda p: self.vocab.to_disk(p)
+        }
+        util.to_disk(path, serialize, exclude)

     def from_disk(self, path, **exclude):
-        path = util.ensure_path(path)
-        if 'vocab' not in exclude:
-            self.vocab.from_disk(path / 'vocab')
-        if 'model.bin' not in exclude:
-            with (path / 'model.bin').open('rb') as file_:
-                util.model_from_bytes(self.model, file_.read())
+        deserialize = {
+            'model': lambda p: util.model_from_bytes(self.model, p.open('rb').read()),
+            'vocab': lambda p: self.vocab.from_disk(p)
+        }
+        util.from_disk(path, deserialize, exclude)
+        return self


 class NeuralTagger(object):
@@ -291,19 +287,33 @@ class NeuralTagger(object):
     def to_bytes(self, **exclude):
         serialize = {
-            'model': lambda: model_to_bytes(self.model),
+            'model': lambda: util.model_to_bytes(self.model),
             'vocab': lambda: self.vocab.to_bytes()
         }
         return util.to_bytes(serialize, exclude)

     def from_bytes(self, bytes_data, **exclude):
         deserialize = {
-            'model': lambda b: model_from_bytes(self.model, b),
+            'model': lambda b: util.model_from_bytes(self.model, b),
             'vocab': lambda b: self.vocab.from_bytes(b)
         }
         util.from_bytes(deserialize, exclude)
         return self

+    def to_disk(self, path, **exclude):
+        serialize = {
+            'model': lambda p: p.open('wb').write(util.model_to_bytes(self.model)),
+            'vocab': lambda p: self.vocab.to_disk(p)
+        }
+        util.to_disk(path, serialize, exclude)
+
+    def from_disk(self, path, **exclude):
+        deserialize = {
+            'model': lambda p: util.model_from_bytes(self.model, p.open('rb').read()),
+            'vocab': lambda p: self.vocab.from_disk(p)
+        }
+        util.from_disk(path, deserialize, exclude)
+        return self
+

 class NeuralLabeller(NeuralTagger):
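Both components now delegate their disk I/O to util.to_disk and util.from_disk, which this diff never shows. Judging from how every caller passes (path, table-of-lambdas, exclude), a minimal sketch of those helpers might look like the following — an assumption for illustration, not the actual spaCy code:

    from pathlib import Path

    def to_disk(path, serializers, exclude):
        # Create the output directory, then hand each writer its own sub-path.
        path = Path(path)
        if not path.exists():
            path.mkdir()
        for key, serialize in serializers.items():
            if key not in exclude:
                serialize(path / key)

    def from_disk(path, deserializers, exclude):
        # Mirror image: each reader loads its piece from its own sub-path.
        path = Path(path)
        for key, deserialize in deserializers.items():
            if key not in exclude:
                deserialize(path / key)
        return path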

View File

@@ -631,37 +631,53 @@ cdef class Parser:
         with self.model[1].use_params(params):
             yield

-    def to_disk(self, path):
-        path = util.ensure_path(path)
-        with (path / 'model.bin').open('wb') as file_:
-            dill.dump(self.model, file_)
+    def to_disk(self, path, **exclude):
+        serializers = {
+            'model': lambda p: p.open('wb').write(
+                util.model_to_bytes(self.model)),
+            'vocab': lambda p: self.vocab.to_disk(p),
+            'moves': lambda p: self.moves.to_disk(p, strings=False),
+            'cfg': lambda p: ujson.dump(self.cfg, p.open('w'))
+        }
+        util.to_disk(path, serializers, exclude)

-    def from_disk(self, path):
-        path = util.ensure_path(path)
-        with (path / 'model.bin').open('wb') as file_:
-            self.model = dill.load(file_)
+    def from_disk(self, path, **exclude):
+        deserializers = {
+            'vocab': lambda p: self.vocab.from_disk(p),
+            'moves': lambda p: self.moves.from_disk(p, strings=False),
+            'cfg': lambda p: self.cfg.update(ujson.load(p.open())),
+            'model': lambda p: None
+        }
+        util.from_disk(path, deserializers, exclude)
+        if 'model' not in exclude:
+            path = util.ensure_path(path)
+            if self.model is True:
+                self.model = self.Model(**self.cfg)
+            util.model_from_disk(self.model, path / 'model')
+        return self

     def to_bytes(self, **exclude):
-        serialize = {
+        serializers = {
             'model': lambda: util.model_to_bytes(self.model),
             'vocab': lambda: self.vocab.to_bytes(),
-            'moves': lambda: self.moves.to_bytes(),
+            'moves': lambda: self.moves.to_bytes(vocab=False),
             'cfg': lambda: ujson.dumps(self.cfg)
         }
-        return util.to_bytes(serialize, exclude)
+        return util.to_bytes(serializers, exclude)

     def from_bytes(self, bytes_data, **exclude):
-        deserialize = {
+        deserializers = {
             'vocab': lambda b: self.vocab.from_bytes(b),
             'moves': lambda b: self.moves.from_bytes(b),
             'cfg': lambda b: self.cfg.update(ujson.loads(b)),
             'model': lambda b: None
         }
-        msg = util.from_bytes(deserialize, exclude)
+        msg = util.from_bytes(bytes_data, deserializers, exclude)
         if 'model' not in exclude:
             if self.model is True:
-                self.model = self.Model(**msg['cfg'])
-                util.model_from_disk(self.model, msg['model'])
+                self.model = self.Model(self.moves.n_moves)
+            util.model_from_bytes(self.model, msg['model'])
         return self
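One convention worth spelling out: components keep the placeholder True in self.model until a real model exists, so from_bytes/from_disk can build the model lazily before loading weights into it. Schematically (class and argument names here are invented for the sketch):

    class LazyComponent(object):
        def __init__(self, model_factory):
            self.model = True              # sentinel: no model allocated yet
            self._factory = model_factory

        def from_msg(self, msg, load_weights):
            if self.model is True:         # build lazily on first deserialization
                self.model = self._factory()
            load_weights(self.model, msg['model'])
            return self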

View File

@@ -6,7 +6,9 @@ from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport weight_t
 from collections import defaultdict, OrderedDict
+import ujson

+from .. import util
 from ..structs cimport TokenC
 from .stateclass cimport StateClass
 from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
@@ -153,3 +155,48 @@ cdef class TransitionSystem:
         assert self.c[self.n_moves].label == label_id
         self.n_moves += 1
         return 1
+
+    def to_disk(self, path, **exclude):
+        actions = list(self.move_names)
+        serializers = {
+            'actions': lambda p: ujson.dump(actions, p.open('w')),
+            'strings': lambda p: self.strings.to_disk(p)
+        }
+        util.to_disk(path, serializers, exclude)
+
+    def from_disk(self, path, **exclude):
+        actions = []
+        deserializers = {
+            'strings': lambda p: self.strings.from_disk(p),
+            'actions': lambda p: actions.extend(ujson.load(p.open()))
+        }
+        util.from_disk(path, deserializers, exclude)
+        for move, label in actions:
+            self.add_action(move, label)
+        return self
+
+    def to_bytes(self, **exclude):
+        transitions = []
+        for trans in self.c[:self.n_moves]:
+            transitions.append({
+                'clas': trans.clas,
+                'move': trans.move,
+                'label': self.strings[trans.label],
+                'name': self.move_name(trans.move, trans.label)
+            })
+        serializers = {
+            'transitions': lambda: ujson.dumps(transitions),
+            'strings': lambda: self.strings.to_bytes()
+        }
+        return util.to_bytes(serializers, exclude)
+
+    def from_bytes(self, bytes_data, **exclude):
+        transitions = []
+        deserializers = {
+            'transitions': lambda b: transitions.extend(ujson.loads(b)),
+            'strings': lambda b: self.strings.from_bytes(b)
+        }
+        msg = util.from_bytes(bytes_data, deserializers, exclude)
+        for trans in transitions:
+            self.add_action(trans['move'], trans['label'])
+        return self
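Note that deserialization re-registers each move through add_action rather than copying the raw Transition structs, so labels are re-interned in the receiving StringStore and the C transition table is rebuilt consistently. A hedged usage sketch, reusing only the API exercised by the test below:

    from spacy.vocab import Vocab
    from spacy.pipeline import NeuralDependencyParser

    parser = NeuralDependencyParser(Vocab())
    parser.add_label('nsubj')
    moves_bytes = parser.moves.to_bytes()

    fresh = NeuralDependencyParser(Vocab())
    fresh.moves.from_bytes(moves_bytes)
    assert fresh.moves.n_moves == parser.moves.n_moves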

View File

@@ -0,0 +1,34 @@
+import pytest
+
+from ...pipeline import NeuralDependencyParser
+from ...vocab import Vocab
+
+
+@pytest.fixture
+def vocab():
+    return Vocab()
+
+
+@pytest.fixture
+def parser(vocab):
+    parser = NeuralDependencyParser(vocab)
+    parser.add_label('nsubj')
+    parser.model, cfg = parser.Model(parser.moves.n_moves)
+    parser.cfg.update(cfg)
+    return parser
+
+
+@pytest.fixture
+def blank_parser(vocab):
+    parser = NeuralDependencyParser(vocab)
+    return parser
+
+
+def test_to_from_bytes(parser, blank_parser):
+    assert parser.model is not True
+    assert blank_parser.model is True
+    assert blank_parser.moves.n_moves != parser.moves.n_moves
+    bytes_data = parser.to_bytes()
+    blank_parser.from_bytes(bytes_data)
+    assert blank_parser.model is not True
+    assert blank_parser.moves.n_moves == parser.moves.n_moves
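The new test only exercises the bytes round trip; a natural companion for the disk path, sketched here on the assumption that Parser.to_disk can target a pytest tmpdir, would be:

    def test_to_from_disk(parser, blank_parser, tmpdir):
        path = str(tmpdir.join('parser'))
        parser.to_disk(path)
        blank_parser.from_disk(path)
        assert blank_parser.model is not True
        assert blank_parser.moves.n_moves == parser.moves.n_moves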

View File

@@ -417,11 +417,11 @@ def to_bytes(getters, exclude):
     for key, getter in getters.items():
         if key not in exclude:
             serialized[key] = getter()
-    return messagepack.dumps(serialized)
+    return msgpack.dumps(serialized)


 def from_bytes(bytes_data, setters, exclude):
-    msg = messagepack.loads(bytes_data)
+    msg = msgpack.loads(bytes_data)
     for key, setter in setters.items():
         if key not in exclude:
             setter(msg[key])
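A quick self-contained check of the fixed helpers (assuming a modern msgpack-python where loads returns str dictionary keys):

    received = {}
    data = to_bytes({'cfg': lambda: b'{"depth": 4}'}, exclude=[])
    from_bytes(data, {'cfg': lambda b: received.update(cfg=b)}, exclude=[])
    assert received['cfg'] == b'{"depth": 4}'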