From 39f390dba7e4a5e7ac224b27731e8c463cb92f7d Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 25 Sep 2017 16:20:49 +0200
Subject: [PATCH 1/6] Add docstrings for Pipe API

---
 spacy/pipeline.pyx | 37 ++++++++++++++++++++++++++++++++++---
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index dcc06cdf7..fef925d85 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -88,17 +88,30 @@ class BaseThincComponent(object):
 
     @classmethod
     def Model(cls, *shape, **kwargs):
+        '''Initialize a model for the pipe.'''
         raise NotImplementedError
 
     def __init__(self, vocab, model=True, **cfg):
+        '''Create a new pipe instance.'''
         raise NotImplementedError
 
     def __call__(self, doc):
+        '''Apply the pipe to one document. The document is
+        modified in-place, and returned.
+        
+        Both __call__ and pipe should delegate to the `predict()`
+        and `set_annotations()` methods.
+        '''
         scores = self.predict([doc])
         self.set_annotations([doc], scores)
         return doc
 
     def pipe(self, stream, batch_size=128, n_threads=-1):
+        '''Apply the pipe to a stream of documents.
+
+        Both __call__ and pipe should delegate to the `predict()`
+        and `set_annotations()` methods.
+        '''
         for docs in cytoolz.partition_all(batch_size, stream):
             docs = list(docs)
             scores = self.predict(docs)
@@ -106,27 +119,42 @@ class BaseThincComponent(object):
             yield from docs
 
     def predict(self, docs):
+        '''Apply the pipeline's model to a batch of docs, without
+        modifying them.
+        '''
         raise NotImplementedError
 
     def set_annotations(self, docs, scores):
+        '''Modify a batch of documents, using pre-computed scores.'''
         raise NotImplementedError
 
-    def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
+    def update(self, docs, golds, drop=0., sgd=None, losses=None):
+        '''Learn from a batch of documents and gold-standard information,
+        updating the pipe's model.
+
+        Delegates to predict() and get_loss().
+        '''
         raise NotImplementedError
 
     def get_loss(self, docs, golds, scores):
+        '''Find the loss and gradient of loss for the batch of
+        documents and their predicted scores.'''
         raise NotImplementedError
 
     def begin_training(self, gold_tuples=tuple(), pipeline=None):
-        token_vector_width = pipeline[0].model.nO
+        '''Initialize the pipe for training, using data exampes if available.
+        If no model has been initialized yet, the model is added.'''
         if self.model is True:
-            self.model = self.Model(1, token_vector_width)
+            self.model = self.Model(**self.cfg)
 
     def use_params(self, params):
+        '''Modify the pipe's model, to use the given parameter values.
+        '''
         with self.model.use_params(params):
             yield
 
     def to_bytes(self, **exclude):
+        '''Serialize the pipe to a bytestring.'''
         serialize = OrderedDict((
             ('cfg', lambda: json_dumps(self.cfg)),
             ('model', lambda: self.model.to_bytes()),
@@ -135,6 +163,7 @@ class BaseThincComponent(object):
         return util.to_bytes(serialize, exclude)
 
     def from_bytes(self, bytes_data, **exclude):
+        '''Load the pipe from a bytestring.'''
         def load_model(b):
             if self.model is True:
                 self.cfg['pretrained_dims'] = self.vocab.vectors_length
@@ -150,6 +179,7 @@ class BaseThincComponent(object):
         return self
 
     def to_disk(self, path, **exclude):
+        '''Serialize the pipe to disk.'''
         serialize = OrderedDict((
             ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
             ('model', lambda p: p.open('wb').write(self.model.to_bytes())),
@@ -158,6 +188,7 @@ class BaseThincComponent(object):
         util.to_disk(path, serialize, exclude)
 
     def from_disk(self, path, **exclude):
+        '''Load the pipe from disk.'''
         def load_model(p):
             if self.model is True:
                 self.cfg['pretrained_dims'] = self.vocab.vectors_length

From d2d35b63b770c74f80e560fbb2efc5491064608c Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 25 Sep 2017 18:37:13 +0200
Subject: [PATCH 2/6] Fix formatting

---
 spacy/pipeline.pyx | 50 +++++++++++++++++++++++-----------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index f660f88a6..90ff1ad88 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -48,7 +48,7 @@ from .parts_of_speech import X
 
 
 class SentenceSegmenter(object):
-    '''A simple spaCy hook, to allow custom sentence boundary detection logic
+    """A simple spaCy hook, to allow custom sentence boundary detection logic
     (that doesn't require the dependency parse).
 
     To change the sentence boundary detection strategy, pass a generator
@@ -57,7 +57,7 @@ class SentenceSegmenter(object):
 
     Sentence detection strategies should be generators that take `Doc` objects
     and yield `Span` objects for each sentence.
-    '''
+    """
     name = 'sbd'
 
     def __init__(self, vocab, strategy=None):
@@ -89,30 +89,30 @@ class BaseThincComponent(object):
 
     @classmethod
     def Model(cls, *shape, **kwargs):
-        '''Initialize a model for the pipe.'''
+        """Initialize a model for the pipe."""
         raise NotImplementedError
 
     def __init__(self, vocab, model=True, **cfg):
-        '''Create a new pipe instance.'''
+        """Create a new pipe instance."""
         raise NotImplementedError
 
     def __call__(self, doc):
-        '''Apply the pipe to one document. The document is
+        """Apply the pipe to one document. The document is
         modified in-place, and returned.
-        
+
         Both __call__ and pipe should delegate to the `predict()`
         and `set_annotations()` methods.
-        '''
+        """
         scores = self.predict([doc])
         self.set_annotations([doc], scores)
         return doc
 
     def pipe(self, stream, batch_size=128, n_threads=-1):
-        '''Apply the pipe to a stream of documents.
+        """Apply the pipe to a stream of documents.
 
         Both __call__ and pipe should delegate to the `predict()`
         and `set_annotations()` methods.
-        '''
+        """
         for docs in cytoolz.partition_all(batch_size, stream):
             docs = list(docs)
             scores = self.predict(docs)
@@ -120,43 +120,43 @@ class BaseThincComponent(object):
             yield from docs
 
     def predict(self, docs):
-        '''Apply the pipeline's model to a batch of docs, without
+        """Apply the pipeline's model to a batch of docs, without
         modifying them.
-        '''
+        """
         raise NotImplementedError
 
     def set_annotations(self, docs, scores):
-        '''Modify a batch of documents, using pre-computed scores.'''
+        """Modify a batch of documents, using pre-computed scores."""
         raise NotImplementedError
 
     def update(self, docs, golds, drop=0., sgd=None, losses=None):
-        '''Learn from a batch of documents and gold-standard information,
+        """Learn from a batch of documents and gold-standard information,
         updating the pipe's model.
 
         Delegates to predict() and get_loss().
-        '''
+        """
         raise NotImplementedError
 
     def get_loss(self, docs, golds, scores):
-        '''Find the loss and gradient of loss for the batch of
-        documents and their predicted scores.'''
+        """Find the loss and gradient of loss for the batch of
+        documents and their predicted scores."""
         raise NotImplementedError
 
     def begin_training(self, gold_tuples=tuple(), pipeline=None):
-        '''Initialize the pipe for training, using data exampes if available.
-        If no model has been initialized yet, the model is added.'''
+        """Initialize the pipe for training, using data examples if available.
+        If no model has been initialized yet, the model is added."""
         if self.model is True:
             self.model = self.Model(**self.cfg)
         link_vectors_to_models(self.vocab)
 
     def use_params(self, params):
-        '''Modify the pipe's model, to use the given parameter values.
-        '''
+        """Modify the pipe's model, to use the given parameter values.
+        """
         with self.model.use_params(params):
             yield
 
     def to_bytes(self, **exclude):
-        '''Serialize the pipe to a bytestring.'''
+        """Serialize the pipe to a bytestring."""
         serialize = OrderedDict((
             ('cfg', lambda: json_dumps(self.cfg)),
             ('model', lambda: self.model.to_bytes()),
@@ -165,7 +165,7 @@ class BaseThincComponent(object):
         return util.to_bytes(serialize, exclude)
 
     def from_bytes(self, bytes_data, **exclude):
-        '''Load the pipe from a bytestring.'''
+        """Load the pipe from a bytestring."""
         def load_model(b):
             if self.model is True:
                 self.cfg['pretrained_dims'] = self.vocab.vectors_length
@@ -181,7 +181,7 @@ class BaseThincComponent(object):
         return self
 
     def to_disk(self, path, **exclude):
-        '''Serialize the pipe to disk.'''
+        """Serialize the pipe to disk."""
         serialize = OrderedDict((
             ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
             ('vocab', lambda p: self.vocab.to_disk(p)),
@@ -190,7 +190,7 @@ class BaseThincComponent(object):
         util.to_disk(path, serialize, exclude)
 
     def from_disk(self, path, **exclude):
-        '''Load the pipe from disk.'''
+        """Load the pipe from disk."""
         def load_model(p):
             if self.model is True:
                 self.cfg['pretrained_dims'] = self.vocab.vectors_length
@@ -596,7 +596,7 @@ class SimilarityHook(BaseThincComponent):
         return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))
 
     def __call__(self, doc):
-        '''Install similarity hook'''
+        """Install similarity hook"""
         doc.user_hooks['similarity'] = self.predict
         return doc
 

From edf7e4881debb5244e445225a98117f9263d9b0d Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 25 Sep 2017 19:00:47 +0200
Subject: [PATCH 3/6] Add meta.json option to cli.train and add relevant
 properties

Add accuracy scores to meta.json instead of accuracy.json and replace
all relevant properties like lang, pipeline, spacy_version in existing
meta.json. If not present, also add name and version placeholders to
make it packagable.
---
 spacy/cli/train.py | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 96233406d..d71523a9c 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -18,6 +18,7 @@ from ..gold import GoldParse, merge_sents
 from ..gold import GoldCorpus, minibatch
 from ..util import prints
 from .. import util
+from .. import about
 from .. import displacy
 from ..compat import json_dumps
 
@@ -35,10 +36,11 @@ from ..compat import json_dumps
     no_parser=("Don't train parser", "flag", "P", bool),
     no_entities=("Don't train NER", "flag", "N", bool),
     gold_preproc=("Use gold preprocessing", "flag", "G", bool),
+    meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path)
 )
 def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
           use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False,
-          gold_preproc=False):
+          gold_preproc=False, meta_path=None):
     """
     Train a model. Expects data in spaCy's JSON format.
     """
@@ -47,13 +49,19 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
     output_path = util.ensure_path(output_dir)
     train_path = util.ensure_path(train_data)
     dev_path = util.ensure_path(dev_data)
+    meta_path = util.ensure_path(meta_path)
     if not output_path.exists():
         output_path.mkdir()
     if not train_path.exists():
         prints(train_path, title="Training data not found", exits=1)
     if dev_path and not dev_path.exists():
         prints(dev_path, title="Development data not found", exits=1)
-
+    if meta_path is not None and not meta_path.exists():
+        prints(meta_path, title="meta.json not found", exits=1)
+    meta = util.read_json(meta_path) if meta_path else {}
+    if not isinstance(meta, dict):
+        prints("Expected dict but got: {}".format(type(meta)),
+               title="Not a valid meta.json format", exits=1)
 
     pipeline = ['token_vectors', 'tags', 'dependencies', 'entities']
     if no_tagger and 'tags' in pipeline: pipeline.remove('tags')
@@ -105,9 +113,16 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                             corpus.dev_docs(
                                 nlp,
                                 gold_preproc=gold_preproc))
-                acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
-                with acc_loc.open('w') as file_:
-                    file_.write(json_dumps(scorer.scores))
+                meta_loc = output_path / ('model%d' % i) / 'meta.json'
+                meta['accuracy'] = scorer.scores
+                meta['lang'] = nlp.lang
+                meta['pipeline'] = pipeline
+                meta['spacy_version'] = '>=%s' % about.__version__
+                meta.setdefault('name', 'model%d' % i)
+                meta.setdefault('version', '0.0.0')
+
+                with meta_loc.open('w') as file_:
+                    file_.write(json_dumps(meta))
                 util.set_env_log(True)
             print_progress(i, losses, scorer.scores)
     finally:

From 02c65155ababfc16d67714fefbe58723cb595f5f Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 26 Sep 2017 12:50:31 +0200
Subject: [PATCH 4/6] Try to fix crazy travis error

---
 travis.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/travis.sh b/travis.sh
index eed6a96f2..78599665c 100755
--- a/travis.sh
+++ b/travis.sh
@@ -17,6 +17,7 @@ fi
 
 if [ "${VIA}" == "compile" ]; then
   pip install -r requirements.txt
+  python setup.py clean --all
   python setup.py build_ext --inplace
   pip install -e .
 fi

From ddee15cee957956394677b0dbcfa8ec48a76e6e4 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 26 Sep 2017 14:00:25 +0200
Subject: [PATCH 5/6] Try to fix travis

---
 travis.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/travis.sh b/travis.sh
index 78599665c..d1a1b6b29 100755
--- a/travis.sh
+++ b/travis.sh
@@ -18,7 +18,6 @@ fi
 if [ "${VIA}" == "compile" ]; then
   pip install -r requirements.txt
   python setup.py clean --all
-  python setup.py build_ext --inplace
   pip install -e .
 fi
 

From 8c390e23a2ca8c0b24dece18d2faafcbe8066778 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 26 Sep 2017 14:14:46 +0200
Subject: [PATCH 6/6] Require older Cython

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 6298b1982..5d515f7a1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-cython>=0.24
+cython>=0.24,<0.27.0
 pathlib
 numpy>=1.7
 cymem>=1.30,<1.32