From 33e5ec737f89761f54a490371e03fcf924479b84 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 31 May 2017 13:42:39 +0200
Subject: [PATCH] Fix to/from disk methods

---
 spacy/language.py          | 76 +++++++++++++++++++-------------
 spacy/pipeline.pyx         | 24 ++++++------
 spacy/syntax/nn_parser.pyx | 20 +++++++---
 spacy/util.py              | 21 ++++++++++-
 4 files changed, 87 insertions(+), 54 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index d9a888507..324d78622 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -96,6 +96,13 @@ class BaseDefaults(object):
 
     factories = {
         'make_doc': create_tokenizer,
+        'tensorizer': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
+        'tagger': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
+        'parser': lambda nlp, **cfg: [
+            NeuralDependencyParser(nlp.vocab, **cfg),
+            nonproj.deprojectivize],
+        'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
+        # Temporary compatibility -- delete after pivot
         'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
         'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
         'dependencies': lambda nlp, **cfg: [
@@ -358,37 +365,35 @@ class Language(object):
             for doc in docs:
                 yield doc
 
-    def to_disk(self, path, disable=[]):
+    def to_disk(self, path, disable=tuple()):
         """Save the current state to a directory. If a model is loaded, this
         will include the model.
 
         path (unicode or Path): A path to a directory, which will be created if
             it doesn't exist. Paths may be either strings or `Path`-like objects.
-        disable (list): Nameds of pipeline components to disable and prevent
+        disable (list): Names of pipeline components to disable and prevent
             from being saved.
 
         EXAMPLE:
             >>> nlp.to_disk('/path/to/models')
         """
         path = util.ensure_path(path)
-        with path.open('wb') as file_:
-            file_.write(self.to_bytes(disable))
-        #serializers = {
-        #    'vocab': lambda p: self.vocab.to_disk(p),
-        #    'tokenizer': lambda p: self.tokenizer.to_disk(p, vocab=False),
-        #    'meta.json': lambda p: ujson.dump(p.open('w'), self.meta)
-        #}
-        #for proc in self.pipeline:
-        #    if not hasattr(proc, 'name'):
-        #        continue
-        #    if proc.name in disable:
-        #        continue
-        #    if not hasattr(proc, 'to_disk'):
-        #        continue
-        #    serializers[proc.name] = lambda p: proc.to_disk(p, vocab=False)
-        #util.to_disk(serializers, path)
+        serializers = OrderedDict((
+            ('vocab', lambda p: self.vocab.to_disk(p)),
+            ('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)),
+            ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
+        ))
+        for proc in self.pipeline:
+            if not hasattr(proc, 'name'):
+                continue
+            if proc.name in disable:
+                continue
+            if not hasattr(proc, 'to_disk'):
+                continue
+            serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
+        util.to_disk(path, serializers, {p: False for p in disable})
 
-    def from_disk(self, path, disable=[]):
+    def from_disk(self, path, disable=tuple()):
         """Loads state from a directory. Modifies the object in place and
         returns it. If the saved `Language` object contains a model, the
         model will be loaded.
@@ -403,24 +408,21 @@ class Language(object):
             >>> nlp = Language().from_disk('/path/to/models')
         """
         path = util.ensure_path(path)
-        with path.open('rb') as file_:
-            bytes_data = file_.read()
-        return self.from_bytes(bytes_data, disable)
-        #deserializers = {
-        #    'vocab': lambda p: self.vocab.from_disk(p),
-        #    'tokenizer': lambda p: self.tokenizer.from_disk(p, vocab=False),
-        #    'meta.json': lambda p: ujson.dump(p.open('w'), self.meta)
-        #}
-        #for proc in self.pipeline:
-        #    if not hasattr(proc, 'name'):
-        #        continue
-        #    if proc.name in disable:
-        #        continue
-        #    if not hasattr(proc, 'to_disk'):
-        #        continue
-        #    deserializers[proc.name] = lambda p: proc.from_disk(p, vocab=False)
-        #util.from_disk(deserializers, path)
-        #return self
+        deserializers = OrderedDict((
+            ('vocab', lambda p: self.vocab.from_disk(p)),
+            ('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)),
+            ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
+        ))
+        for proc in self.pipeline:
+            if not hasattr(proc, 'name'):
+                continue
+            if proc.name in disable:
+                continue
+            if not hasattr(proc, 'to_disk'):
+                continue
+            deserializers[proc.name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
+        util.from_disk(path, deserializers, {p: False for p in disable})
+        return self
 
     def to_bytes(self, disable=[]):
         """Serialize the current state to a binary string.
diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index 963dd2faa..ff7098439 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -41,7 +41,7 @@ from .parts_of_speech import X
 
 class TokenVectorEncoder(object):
     """Assign position-sensitive vectors to tokens, using a CNN or RNN."""
-    name = 'tok2vec'
+    name = 'tensorizer'
 
     @classmethod
     def Model(cls, width=128, embed_size=7500, **cfg):
@@ -176,17 +176,19 @@ class TokenVectorEncoder(object):
         return self
 
     def to_disk(self, path, **exclude):
-        serialize = {
-            'model': lambda p: p.open('w').write(util.model_to_bytes(self.model)),
-            'vocab': lambda p: self.vocab.to_disk(p)
-        }
+        serialize = OrderedDict((
+            ('model', lambda p: p.open('wb').write(util.model_to_bytes(self.model))),
+            ('vocab', lambda p: self.vocab.to_disk(p))
+        ))
         util.to_disk(path, serialize, exclude)
 
     def from_disk(self, path, **exclude):
-        deserialize = {
-            'model': lambda p: util.model_from_bytes(self.model, p.open('rb').read()),
-            'vocab': lambda p: self.vocab.from_disk(p)
-        }
+        if self.model is True:
+            self.model = self.Model()
+        deserialize = OrderedDict((
+            ('model', lambda p: util.model_from_bytes(self.model, p.open('rb').read())),
+            ('vocab', lambda p: self.vocab.from_disk(p))
+        ))
         util.from_disk(path, deserialize, exclude)
         return self
 
@@ -315,7 +317,7 @@ class NeuralTagger(object):
 
     def to_disk(self, path, **exclude):
         serialize = {
-            'model': lambda p: p.open('w').write(util.model_to_bytes(self.model)),
+            'model': lambda p: p.open('wb').write(util.model_to_bytes(self.model)),
             'vocab': lambda p: self.vocab.to_disk(p)
         }
         util.to_disk(path, serialize, exclude)
@@ -420,7 +422,7 @@ cdef class NeuralDependencyParser(NeuralParser):
 
 
 cdef class NeuralEntityRecognizer(NeuralParser):
-    name = 'entity'
+    name = 'ner'
 
     TransitionSystem = BiluoPushDown
     nr_feature = 6
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index d49e9cdef..d156156d6 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -44,6 +44,7 @@ from .. import util
 from ..util import get_async, get_cuda_stream
 from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
 from .._ml import Tok2Vec, doc2feats, rebatch
+from ..compat import json_dumps
 from . import _parse_features
 from ._parse_features cimport CONTEXT_SIZE
 
@@ -633,11 +634,13 @@ cdef class Parser:
 
     def to_disk(self, path, **exclude):
         serializers = {
-            'model': lambda p: p.open('wb').write(
-                util.model_to_bytes(self.model)),
+            'lower_model': lambda p: p.open('wb').write(
+                util.model_to_bytes(self.model[0])),
+            'upper_model': lambda p: p.open('wb').write(
+                util.model_to_bytes(self.model[1])),
             'vocab': lambda p: self.vocab.to_disk(p),
             'moves': lambda p: self.moves.to_disk(p, strings=False),
-            'cfg': lambda p: ujson.dumps(p.open('w'), self.cfg)
+            'cfg': lambda p: p.open('w').write(json_dumps(self.cfg))
         }
         util.to_disk(path, serializers, exclude)
 
@@ -645,7 +648,7 @@
         deserializers = {
             'vocab': lambda p: self.vocab.from_disk(p),
             'moves': lambda p: self.moves.from_disk(p, strings=False),
-            'cfg': lambda p: self.cfg.update(ujson.load((path/'cfg.json').open())),
+            'cfg': lambda p: self.cfg.update(ujson.load(p.open())),
             'model': lambda p: None
         }
         util.from_disk(path, deserializers, exclude)
@@ -653,7 +656,14 @@
         path = util.ensure_path(path)
         if self.model is True:
             self.model, cfg = self.Model(**self.cfg)
-            util.model_from_disk(self.model, path / 'model')
+        else:
+            cfg = {}
+        with (path / 'lower_model').open('rb') as file_:
+            bytes_data = file_.read()
+        util.model_from_bytes(self.model[0], bytes_data)
+        with (path / 'upper_model').open('rb') as file_:
+            bytes_data = file_.read()
+        util.model_from_bytes(self.model[1], bytes_data)
         self.cfg.update(cfg)
         return self
 
diff --git a/spacy/util.py b/spacy/util.py
index df66b59a8..273293c24 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -13,6 +13,7 @@ import random
 import numpy
 import io
 import dill
+from collections import OrderedDict
 import msgpack
 import msgpack_numpy
 
@@ -408,7 +409,7 @@ def get_raw_input(description, default=False):
 
 
 def to_bytes(getters, exclude):
-    serialized = {}
+    serialized = OrderedDict()
     for key, getter in getters.items():
         if key not in exclude:
             serialized[key] = getter()
@@ -423,6 +424,24 @@ def from_bytes(bytes_data, setters, exclude):
     return msg
 
 
+def to_disk(path, writers, exclude):
+    path = ensure_path(path)
+    if not path.exists():
+        path.mkdir()
+    for key, writer in writers.items():
+        if key not in exclude:
+            writer(path / key)
+    return path
+
+
+def from_disk(path, readers, exclude):
+    path = ensure_path(path)
+    for key, reader in readers.items():
+        if key not in exclude:
+            reader(path / key)
+    return path
+
+
 # This stuff really belongs in thinc -- but I expect
 # to refactor how all this works in thinc anyway.
 # What a mess!
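
Note (not part of the patch itself): below is a minimal standalone sketch of the writer/reader callback pattern the new `util.to_disk`/`util.from_disk` helpers establish. The two helper bodies mirror the additions to spacy/util.py above; the directory path, serializer names, and payloads in the usage section are made up for illustration.

from collections import OrderedDict
from pathlib import Path


def to_disk(path, writers, exclude):
    # Mirrors the helper added to spacy/util.py: each named writer is
    # handed a sub-path inside the target directory; keys present in
    # `exclude` are skipped.
    path = Path(path)
    if not path.exists():
        path.mkdir()
    for key, writer in writers.items():
        if key not in exclude:
            writer(path / key)
    return path


def from_disk(path, readers, exclude):
    # Mirrors the reading helper: each named reader receives the same
    # sub-path its counterpart wrote to.
    path = Path(path)
    for key, reader in readers.items():
        if key not in exclude:
            reader(path / key)
    return path


# Hypothetical usage: write two named artifacts, excluding one, then read
# the surviving artifact back. The exclude argument follows the
# `{p: False for p in disable}` idiom used in Language.to_disk above.
serializers = OrderedDict((
    ('meta.json', lambda p: p.open('w').write('{"lang": "en"}')),
    ('weights', lambda p: p.open('wb').write(b'\x00' * 4)),
))
to_disk('/tmp/demo_model', serializers, {'weights': False})

meta = {}
deserializers = OrderedDict((
    ('meta.json', lambda p: meta.update(raw=p.open().read())),
))
from_disk('/tmp/demo_model', deserializers, {})
print(meta['raw'])  # -> {"lang": "en"}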