From 7f876a7a8284a6a0479beb57c265e13a4bec4ea0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 00:00:21 +0200 Subject: [PATCH 01/49] Clean up some unused code in parser --- spacy/syntax/nn_parser.pxd | 4 --- spacy/syntax/nn_parser.pyx | 50 ++++++-------------------------------- 2 files changed, 8 insertions(+), 46 deletions(-) diff --git a/spacy/syntax/nn_parser.pxd b/spacy/syntax/nn_parser.pxd index 524718965..7ff4b9f9f 100644 --- a/spacy/syntax/nn_parser.pxd +++ b/spacy/syntax/nn_parser.pxd @@ -14,8 +14,4 @@ cdef class Parser: cdef readonly TransitionSystem moves cdef readonly object cfg - cdef void _parse_step(self, StateC* state, - const float* feat_weights, - int nr_class, int nr_feat, int nr_piece) nogil - #cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 0b39e2216..66787c22a 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -248,15 +248,10 @@ cdef class Parser: nI=token_vector_width) with Model.use_device('cpu'): - if depth == 0: - upper = chain() - upper.is_noop = True - else: - upper = chain( - clone(Maxout(hidden_width), (depth-1)), - zero_init(Affine(nr_class, drop_factor=0.0)) - ) - upper.is_noop = False + upper = chain( + clone(Maxout(hidden_width), (depth-1)), + zero_init(Affine(nr_class, drop_factor=0.0)) + ) # TODO: This is an unfortunate hack atm! # Used to set input dimensions in network. lower.begin_training(lower.ops.allocate((500, token_vector_width))) @@ -394,18 +389,11 @@ cdef class Parser: cdef np.ndarray scores c_token_ids = token_ids.data c_is_valid = is_valid.data - cdef int has_hidden = not getattr(vec2scores, 'is_noop', False) while not next_step.empty(): - if not has_hidden: - for i in cython.parallel.prange( - next_step.size(), num_threads=6, nogil=True): - self._parse_step(next_step[i], - feat_weights, nr_class, nr_feat, nr_piece) - else: - for i in range(next_step.size()): - st = next_step[i] - st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat) - self.moves.set_valid(&c_is_valid[i*nr_class], st) + for i in range(next_step.size()): + st = next_step[i] + st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat) + self.moves.set_valid(&c_is_valid[i*nr_class], st) vectors = state2vec(token_ids[:next_step.size()]) scores = vec2scores(vectors) c_scores = scores.data @@ -461,28 +449,6 @@ cdef class Parser: beams.append(beam) return beams - cdef void _parse_step(self, StateC* state, - const float* feat_weights, - int nr_class, int nr_feat, int nr_piece) nogil: - '''This only works with no hidden layers -- fast but inaccurate''' - #for i in cython.parallel.prange(next_step.size(), num_threads=4, nogil=True): - # self._parse_step(next_step[i], feat_weights, nr_class, nr_feat) - token_ids = calloc(nr_feat, sizeof(int)) - scores = calloc(nr_class * nr_piece, sizeof(float)) - is_valid = calloc(nr_class, sizeof(int)) - - state.set_context_tokens(token_ids, nr_feat) - sum_state_features(scores, - feat_weights, token_ids, 1, nr_feat, nr_class * nr_piece) - self.moves.set_valid(is_valid, state) - guess = arg_maxout_if_valid(scores, is_valid, nr_class, nr_piece) - action = self.moves.c[guess] - action.do(state, action.label) - - free(is_valid) - free(scores) - free(token_ids) - def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): if losses is not None and self.name not in losses: losses[self.name] = 0. 
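Note: after this cleanup the parser's greedy loop simply scores each state and applies the best-scoring transition that is still valid (the arg_max_if_valid call kept in the hunk above). As a minimal, purely illustrative numpy sketch of that selection step (the real helper is a C-level function over raw score and validity arrays; the names below are only for illustration):

    import numpy as np

    def arg_max_if_valid(scores, is_valid):
        # Pick the highest-scoring transition whose validity flag is set,
        # as the remaining greedy loop does for each parser state.
        masked = np.where(is_valid, scores, -np.inf)
        return int(masked.argmax())

    # Example: action 2 scores highest overall, but only 0 and 1 are valid.
    scores = np.array([0.1, 0.7, 2.3])
    is_valid = np.array([True, True, False])
    assert arg_max_if_valid(scores, is_valid) == 1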
From 468c138ab3923276d36ace9d03586709a5c7f187 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 01:13:23 +0200 Subject: [PATCH 02/49] WIP: Add fine-tuning logic to tagger model, re #1182 --- spacy/_ml.py | 70 ++++++++++++++++++++++++++++++++++++++++++++-- spacy/pipeline.pyx | 14 ++++------ 2 files changed, 73 insertions(+), 11 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index f1ded666e..5f8ce9470 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -19,7 +19,7 @@ from thinc.api import FeatureExtracter, with_getitem from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool from thinc.neural._classes.attention import ParametricAttention from thinc.linear.linear import LinearModel -from thinc.api import uniqued, wrap +from thinc.api import uniqued, wrap, flatten_add_lengths from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP from .tokens.doc import Doc @@ -53,6 +53,27 @@ def _logistic(X, drop=0.): return Y, logistic_bwd +@layerize +def add_tuples(X, drop=0.): + """Give inputs of sequence pairs, where each sequence is (vals, length), + sum the values, returning a single sequence. + + If input is: + ((vals1, length), (vals2, length) + Output is: + (vals1+vals2, length) + + vals are a single tensor for the whole batch. + """ + (vals1, length1), (vals2, length2) = X + assert length1 == length2 + + def add_tuples_bwd(dY, sgd=None): + return (dY, dY) + + return (vals1+vals2, length), add_tuples_bwd + + def _zero_init(model): def _zero_init_impl(self, X, y): self.W.fill(0) @@ -61,6 +82,7 @@ def _zero_init(model): model.W.fill(0.) return model + @layerize def _preprocess_doc(docs, drop=0.): keys = [doc.to_array([LOWER]) for doc in docs] @@ -72,7 +94,6 @@ def _preprocess_doc(docs, drop=0.): return (keys, vals, lengths), None - def _init_for_precomputed(W, ops): if (W**2).sum() != 0.: return @@ -80,6 +101,7 @@ def _init_for_precomputed(W, ops): ops.xavier_uniform_init(reshaped) W[:] = reshaped.reshape(W.shape) + @describe.on_data(_set_dimensions_if_needed) @describe.attributes( nI=Dimension("Input size"), @@ -323,6 +345,21 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.): return vectors, backward +def fine_tune(model1, combine=None): + def fine_tune_fwd(docs, drop=0.): + X1, bp_X1 = model1.begin_update(docs) + lengths = [len(doc) for doc in docs] + X2 = model1.ops.flatten(X1) + + def fine_tune_bwd(d_output, sgd=None): + bp_X1(d_output, sgd=sgd) + return d_output + + return (X1+X2, lengths), fine_tune_bwd + model = wrap(fine_tune_fwd) + return model + + @layerize def flatten(seqs, drop=0.): if isinstance(seqs[0], numpy.ndarray): @@ -370,6 +407,35 @@ def preprocess_doc(docs, drop=0.): return (keys, vals, lengths), None +def build_tagger_model(nr_class, token_vector_width, **cfg): + with Model.define_operators({'>>': chain, '+': add}): + # Input: (doc, tensor) tuples + embed_docs = with_getitem(0, + FeatureExtracter([NORM]) + >> HashEmbed(token_vector_width, 1000) + >> flatten_add_lengths + ) + + model = ( + fine_tune(embed_docs) + >> + with_getitem(0, + FeatureExtracter([NORM]) + >> HashEmbed(token_vector_width, 1000) + >> flatten_add_lengths + ) + >> with_getitem(1, + flatten_add_lengths) + >> add_tuples + >> with_flatten( + Maxout(token_vector_width, token_vector_width) + >> Softmax(nr_class, token_vector_width) + ) + ) + return model + + + def build_text_classifier(nr_class, width=64, **cfg): nr_vector = cfg.get('nr_vector', 200) with Model.define_operators({'>>': chain, '+': add, '|': concatenate, '**': clone}): diff --git 
a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 947f0a1f1..b96387351 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -42,7 +42,7 @@ from .compat import json_dumps from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats -from ._ml import build_text_classifier +from ._ml import build_text_classifier, build_tagger_model from .parts_of_speech import X @@ -346,10 +346,8 @@ class NeuralTagger(BaseThincComponent): @classmethod def Model(cls, n_tags, token_vector_width): - return with_flatten( - chain(Maxout(token_vector_width, token_vector_width), - Softmax(n_tags, token_vector_width))) - + return build_tagger_model(n_tags, token_vector_width) + def use_params(self, params): with self.model.use_params(params): yield @@ -455,10 +453,8 @@ class NeuralLabeller(NeuralTagger): @classmethod def Model(cls, n_tags, token_vector_width): - return with_flatten( - chain(Maxout(token_vector_width, token_vector_width), - Softmax(n_tags, token_vector_width))) - + return build_tagger_model(n_tags, token_vector_width) + def get_loss(self, docs, golds, scores): scores = self.model.ops.flatten(scores) cdef int idx = 0 From e9ab800e15ba45ba919387107aadb0cec388872a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 01:50:08 +0200 Subject: [PATCH 03/49] Fix tagging model --- spacy/_ml.py | 27 +++++++++------------------ spacy/pipeline.pyx | 12 +++++++----- 2 files changed, 16 insertions(+), 23 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 5f8ce9470..e60e8a610 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -346,16 +346,16 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.): def fine_tune(model1, combine=None): - def fine_tune_fwd(docs, drop=0.): + def fine_tune_fwd(docs_tokvecs, drop=0.): + docs, tokvecs = docs_tokvecs + lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i') X1, bp_X1 = model1.begin_update(docs) - lengths = [len(doc) for doc in docs] - X2 = model1.ops.flatten(X1) def fine_tune_bwd(d_output, sgd=None): - bp_X1(d_output, sgd=sgd) + bp_X1(model1.ops.flatten(d_output), sgd=sgd) return d_output - return (X1+X2, lengths), fine_tune_bwd + return model1.ops.unflatten(X1+X2, lengths), fine_tune_bwd model = wrap(fine_tune_fwd) return model @@ -410,30 +410,21 @@ def preprocess_doc(docs, drop=0.): def build_tagger_model(nr_class, token_vector_width, **cfg): with Model.define_operators({'>>': chain, '+': add}): # Input: (doc, tensor) tuples - embed_docs = with_getitem(0, + embed_docs = ( FeatureExtracter([NORM]) + >> flatten >> HashEmbed(token_vector_width, 1000) - >> flatten_add_lengths ) model = ( fine_tune(embed_docs) - >> - with_getitem(0, - FeatureExtracter([NORM]) - >> HashEmbed(token_vector_width, 1000) - >> flatten_add_lengths - ) - >> with_getitem(1, - flatten_add_lengths) - >> add_tuples >> with_flatten( Maxout(token_vector_width, token_vector_width) >> Softmax(nr_class, token_vector_width) ) ) - return model - + model.nI = None + return model def build_text_classifier(nr_class, width=64, **cfg): diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index b96387351..848653c5c 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -253,23 +253,25 @@ class NeuralTagger(BaseThincComponent): self.cfg = dict(cfg) def __call__(self, doc): - tags = self.predict([doc.tensor]) + tags = self.predict(([doc], [doc.tensor])) self.set_annotations([doc], tags) return doc def pipe(self, stream, batch_size=128, n_threads=-1): for docs in cytoolz.partition_all(batch_size, stream): + 
docs = list(docs) tokvecs = [d.tensor for d in docs] - tag_ids = self.predict(tokvecs) + tag_ids = self.predict((docs, tokvecs)) self.set_annotations(docs, tag_ids) yield from docs - def predict(self, tokvecs): - scores = self.model(tokvecs) + def predict(self, docs_tokvecs): + scores = self.model(docs_tokvecs) scores = self.model.ops.flatten(scores) guesses = scores.argmax(axis=1) if not isinstance(guesses, numpy.ndarray): guesses = guesses.get() + tokvecs = docs_tokvecs[1] guesses = self.model.ops.unflatten(guesses, [tv.shape[0] for tv in tokvecs]) return guesses @@ -295,7 +297,7 @@ class NeuralTagger(BaseThincComponent): if self.model.nI is None: self.model.nI = tokvecs[0].shape[1] - tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop) + tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop) loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd) From 4cfb7a54e78c077dc6ac743ec7ccfe8a3b341ebd Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 01:53:31 +0200 Subject: [PATCH 04/49] Fix tagger --- spacy/_ml.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/_ml.py b/spacy/_ml.py index e60e8a610..c0025e597 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -350,6 +350,7 @@ def fine_tune(model1, combine=None): docs, tokvecs = docs_tokvecs lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i') X1, bp_X1 = model1.begin_update(docs) + X2 = model1.ops.flatten(tokvecs) def fine_tune_bwd(d_output, sgd=None): bp_X1(model1.ops.flatten(d_output), sgd=sgd) From cc19ea0e7ca9c8adfb779dfafb1d534d55d78e5e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 02:17:10 +0200 Subject: [PATCH 05/49] Add update_tensors flag to Language.update. Experimental, re #1182 --- spacy/language.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 0284c4636..4a489387a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -277,7 +277,8 @@ class Language(object): def make_doc(self, text): return self.tokenizer(text) - def update(self, docs, golds, drop=0., sgd=None, losses=None): + def update(self, docs, golds, drop=0., sgd=None, losses=None, + update_tensors=False): """Update the models in the pipeline. docs (iterable): A batch of `Doc` objects. @@ -310,7 +311,7 @@ class Language(object): tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop) d_tokvecses = proc.update((docs, tokvecses), golds, drop=drop, sgd=get_grads, losses=losses) - if d_tokvecses is not None: + if update_tensors and d_tokvecses is not None: bp_tokvecses(d_tokvecses, sgd=sgd) for key, (W, dW) in grads.items(): sgd(W, dW, key=key) From 0a566dc320a1103569df90791e1ede24f513e78e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 02:18:12 +0200 Subject: [PATCH 06/49] Add update_tensors flag to Language.update. 
Experimental, re #1182 --- spacy/cli/train.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index af028dae5..9ed621c12 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -91,7 +91,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, for batch in minibatch(train_docs, size=batch_sizes): docs, golds = zip(*batch) nlp.update(docs, golds, sgd=optimizer, - drop=next(dropout_rates), losses=losses) + drop=next(dropout_rates), losses=losses, + update_tensors=True) pbar.update(sum(len(doc) for doc in docs)) with nlp.use_params(optimizer.averages): From 0eec7c9e9b7bfafaa80633942088ce6016e3c918 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 02:18:31 +0200 Subject: [PATCH 07/49] Fix Language.evaluate --- spacy/language.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 4a489387a..3079249f6 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -382,9 +382,13 @@ class Language(object): return optimizer def evaluate(self, docs_golds): - docs, golds = zip(*docs_golds) scorer = Scorer() - for doc, gold in zip(self.pipe(docs, batch_size=32), golds): + docs, golds = zip(*docs_golds) + docs = list(docs) + golds = list(golds) + for pipe in self.pipeline: + docs = pipe.pipe(docs) + for doc, gold in zip(docs, golds): scorer.score(doc, gold) doc.tensor = None return scorer From bfffdeabb2ad16b65a1d5c2b0c0f088d47e7f7cc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 14:10:48 +0200 Subject: [PATCH 08/49] Fix parser batch-size bug introduced during cleanup --- spacy/syntax/nn_parser.pyx | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 66787c22a..4be31b4de 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -339,12 +339,10 @@ cdef class Parser: The number of threads with which to work on the buffer in parallel. Yields (Doc): Documents, in order. 
""" - cdef StateClass parse_state cdef Doc doc - queue = [] for docs in cytoolz.partition_all(batch_size, docs): docs = list(docs) - tokvecs = [d.tensor for d in docs] + tokvecs = [doc.tensor for doc in docs] if beam_width == 1: parse_states = self.parse_batch(docs, tokvecs) else: @@ -364,6 +362,8 @@ cdef class Parser: int nr_class, nr_feat, nr_piece, nr_dim, nr_state if isinstance(docs, Doc): docs = [docs] + if isinstance(tokvecses, np.ndarray): + tokvecses = [tokvecses] tokvecs = self.model[0].ops.flatten(tokvecses) @@ -395,14 +395,14 @@ cdef class Parser: st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat) self.moves.set_valid(&c_is_valid[i*nr_class], st) vectors = state2vec(token_ids[:next_step.size()]) - scores = vec2scores(vectors) - c_scores = scores.data - for i in range(next_step.size()): - st = next_step[i] - guess = arg_max_if_valid( - &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class) - action = self.moves.c[guess] - action.do(st, action.label) + scores = vec2scores(vectors) + c_scores = scores.data + for i in range(next_step.size()): + st = next_step[i] + guess = arg_max_if_valid( + &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class) + action = self.moves.c[guess] + action.do(st, action.label) this_step, next_step = next_step, this_step next_step.clear() for st in this_step: From 0acce0521b3768dce2029db28168b5ec79aac741 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 14:13:03 +0200 Subject: [PATCH 09/49] Fix Language.update for pipeline --- spacy/language.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index 3079249f6..6d97f41fe 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -387,7 +387,12 @@ class Language(object): docs = list(docs) golds = list(golds) for pipe in self.pipeline: - docs = pipe.pipe(docs) + if not hasattr(pipe, 'pipe'): + for doc in docs: + pipe(doc) + else: + docs = list(pipe.pipe(docs)) + assert len(docs) == len(golds) for doc, gold in zip(docs, golds): scorer.score(doc, gold) doc.tensor = None From 3cb8f06881f5991e4de2be4520e201612debb911 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 14:15:14 +0200 Subject: [PATCH 10/49] Fix NeuralLabeller --- spacy/pipeline.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 848653c5c..b87f73c27 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -296,7 +296,6 @@ class NeuralTagger(BaseThincComponent): if self.model.nI is None: self.model.nI = tokvecs[0].shape[1] - tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop) loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) @@ -432,7 +431,7 @@ class NeuralLabeller(NeuralTagger): @property def labels(self): - return self.cfg.get('labels', {}) + return self.cfg.setdefault('labels', {}) @labels.setter def labels(self, value): From 4a5cc8913845accce3033bd751f503dd72d9c2ff Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 14:15:48 +0200 Subject: [PATCH 11/49] Fix tagger 'fine_tune', to keep private CNN weights --- spacy/_ml.py | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index c0025e597..bc08dfbbc 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -5,6 +5,7 @@ from thinc.neural._classes.hash_embed import HashEmbed from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.util import get_array_module import random +import cytoolz 
from thinc.neural._classes.convolution import ExtractWindow from thinc.neural._classes.static_vectors import StaticVectors @@ -207,9 +208,9 @@ class PrecomputableMaxouts(Model): def Tok2Vec(width, embed_size, preprocess=None): - cols = [ID, NORM, PREFIX, SUFFIX, SHAPE] + cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}): - norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower') + norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower') prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix') suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix') shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape') @@ -218,7 +219,7 @@ def Tok2Vec(width, embed_size, preprocess=None): tok2vec = ( with_flatten( asarray(Model.ops, dtype='uint64') - >> embed + >> uniqued(embed, column=5) >> Maxout(width, width*4, pieces=3) >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) @@ -319,7 +320,7 @@ def zero_init(model): def doc2feats(cols=None): - cols = [ID, NORM, PREFIX, SUFFIX, SHAPE] + cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] def forward(docs, drop=0.): feats = [] for doc in docs: @@ -345,19 +346,26 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.): return vectors, backward -def fine_tune(model1, combine=None): +def fine_tune(embedding, combine=None): + if combine is not None: + raise NotImplementedError( + "fine_tune currently only supports addition. Set combine=None") def fine_tune_fwd(docs_tokvecs, drop=0.): docs, tokvecs = docs_tokvecs lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i') - X1, bp_X1 = model1.begin_update(docs) - X2 = model1.ops.flatten(tokvecs) + + vecs, bp_vecs = embedding.begin_update(docs, drop=drop) + + output = embedding.ops.unflatten( + embedding.ops.flatten(tokvecs) + + embedding.ops.flatten(vecs), + lengths) def fine_tune_bwd(d_output, sgd=None): - bp_X1(model1.ops.flatten(d_output), sgd=sgd) + bp_vecs(d_output, sgd=sgd) return d_output - - return model1.ops.unflatten(X1+X2, lengths), fine_tune_bwd - model = wrap(fine_tune_fwd) + return output, fine_tune_bwd + model = wrap(fine_tune_fwd, embedding) return model @@ -407,18 +415,18 @@ def preprocess_doc(docs, drop=0.): vals = ops.allocate(keys.shape[0]) + 1 return (keys, vals, lengths), None +def getitem(i): + def getitem_fwd(X, drop=0.): + return X[i], None + return layerize(getitem_fwd) def build_tagger_model(nr_class, token_vector_width, **cfg): with Model.define_operators({'>>': chain, '+': add}): # Input: (doc, tensor) tuples - embed_docs = ( - FeatureExtracter([NORM]) - >> flatten - >> HashEmbed(token_vector_width, 1000) - ) + private_tok2vec = Tok2Vec(token_vector_width, 7500, preprocess=doc2feats()) model = ( - fine_tune(embed_docs) + fine_tune(private_tok2vec) >> with_flatten( Maxout(token_vector_width, token_vector_width) >> Softmax(nr_class, token_vector_width) From 78498a072de1104baefe0e5b075303d24a7828f6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 14:16:36 +0200 Subject: [PATCH 12/49] Return Transition for missing actions in lookup_action --- spacy/syntax/arc_eager.pyx | 1 + spacy/syntax/transition_system.pyx | 2 ++ 2 files changed, 3 insertions(+) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 29e8de0aa..9477449a5 100644 --- 
a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -385,6 +385,7 @@ cdef class ArcEager(TransitionSystem): for i in range(self.n_moves): if self.c[i].move == move and self.c[i].label == label: return self.c[i] + return Transition(clas=0, move=MISSING, label=0) def move_name(self, int move, attr_t label): label_str = self.strings[label] diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 27b375bba..d3f64f827 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -107,6 +107,8 @@ cdef class TransitionSystem: def is_valid(self, StateClass stcls, move_name): action = self.lookup_transition(move_name) + if action.move == 0: + return False return action.is_valid(stcls.c, action.label) cdef int set_valid(self, int* is_valid, const StateC* st) nogil: From 3ed203de2504edd2b5470ecfa4ef8a5b2e382b2a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 18:33:18 +0200 Subject: [PATCH 13/49] Use LayerNorm and SELU in Tok2Vec --- spacy/_ml.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index bc08dfbbc..f7ab9b259 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -10,6 +10,7 @@ import cytoolz from thinc.neural._classes.convolution import ExtractWindow from thinc.neural._classes.static_vectors import StaticVectors from thinc.neural._classes.batchnorm import BatchNorm +from thinc.neural._classes.layernorm import LayerNorm as LN from thinc.neural._classes.resnet import Residual from thinc.neural import ReLu from thinc.neural._classes.selu import SELU @@ -220,11 +221,11 @@ def Tok2Vec(width, embed_size, preprocess=None): with_flatten( asarray(Model.ops, dtype='uint64') >> uniqued(embed, column=5) - >> Maxout(width, width*4, pieces=3) - >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)), + >> LN(Maxout(width, width*4, pieces=3)) + >> Residual(ExtractWindow(nW=1) >> SELU(width, width*3)) + >> Residual(ExtractWindow(nW=1) >> SELU(width, width*3)) + >> Residual(ExtractWindow(nW=1) >> SELU(width, width*3)) + >> Residual(ExtractWindow(nW=1) >> SELU(width, width*3)), pad=4) ) if preprocess not in (False, None): From 42bd26f6f397c5234b9b82c3daa14c3c0268bd3c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Aug 2017 18:33:46 +0200 Subject: [PATCH 14/49] Give parser its own tok2vec weights --- spacy/syntax/nn_parser.pyx | 40 +++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 4be31b4de..06c61656b 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -237,6 +237,7 @@ cdef class Parser: token_vector_width = util.env_opt('token_vector_width', token_vector_width) hidden_width = util.env_opt('hidden_width', hidden_width) parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2) + tensors = Tok2Vec(token_vector_width, 7500, preprocess=doc2feats()) if parser_maxout_pieces == 1: lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class, nF=cls.nr_feature, @@ -263,7 +264,7 @@ cdef class Parser: 'hidden_width': hidden_width, 'maxout_pieces': parser_maxout_pieces } - return (lower, upper), cfg + return (tensors, lower, upper), cfg def __init__(self, Vocab vocab, moves=True, model=True, **cfg): """ @@ -366,6 +367,7 @@ cdef 
class Parser: tokvecses = [tokvecses] tokvecs = self.model[0].ops.flatten(tokvecses) + tokvecs += self.model[0].ops.flatten(self.model[0](docs)) nr_state = len(docs) nr_class = self.moves.n_moves @@ -417,6 +419,7 @@ cdef class Parser: cdef int nr_class = self.moves.n_moves cdef StateClass stcls, output tokvecs = self.model[0].ops.flatten(tokvecses) + tokvecs += self.model[0].ops.flatten(self.model[0](docs)) cuda_stream = get_cuda_stream() state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs, cuda_stream, 0.0) @@ -457,6 +460,9 @@ cdef class Parser: if isinstance(docs, Doc) and isinstance(golds, GoldParse): docs = [docs] golds = [golds] + my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs, drop=0.) + my_tokvecs = self.model[0].ops.flatten(my_tokvecs) + tokvecs += my_tokvecs cuda_stream = get_cuda_stream() @@ -506,7 +512,9 @@ cdef class Parser: break self._make_updates(d_tokvecs, backprops, sgd, cuda_stream) - return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) + d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) + #bp_my_tokvecs(d_tokvecs, sgd=sgd) + return d_tokvecs def _init_gold_batch(self, whole_docs, whole_golds): """Make a square batch, of length equal to the shortest doc. A long @@ -569,7 +577,7 @@ cdef class Parser: return names def get_batch_model(self, batch_size, tokvecs, stream, dropout): - lower, upper = self.model + _, lower, upper = self.model state2vec = precompute_hiddens(batch_size, tokvecs, lower, stream, drop=dropout) return state2vec, upper @@ -659,10 +667,12 @@ cdef class Parser: def to_disk(self, path, **exclude): serializers = { - 'lower_model': lambda p: p.open('wb').write( + 'tok2vec_model': lambda p: p.open('wb').write( self.model[0].to_bytes()), - 'upper_model': lambda p: p.open('wb').write( + 'lower_model': lambda p: p.open('wb').write( self.model[1].to_bytes()), + 'upper_model': lambda p: p.open('wb').write( + self.model[2].to_bytes()), 'vocab': lambda p: self.vocab.to_disk(p), 'moves': lambda p: self.moves.to_disk(p, strings=False), 'cfg': lambda p: p.open('w').write(json_dumps(self.cfg)) @@ -683,24 +693,29 @@ cdef class Parser: self.model, cfg = self.Model(**self.cfg) else: cfg = {} - with (path / 'lower_model').open('rb') as file_: + with (path / 'tok2vec_model').open('rb') as file_: bytes_data = file_.read() self.model[0].from_bytes(bytes_data) - with (path / 'upper_model').open('rb') as file_: + with (path / 'lower_model').open('rb') as file_: bytes_data = file_.read() self.model[1].from_bytes(bytes_data) + with (path / 'upper_model').open('rb') as file_: + bytes_data = file_.read() + self.model[2].from_bytes(bytes_data) self.cfg.update(cfg) return self def to_bytes(self, **exclude): serializers = OrderedDict(( - ('lower_model', lambda: self.model[0].to_bytes()), - ('upper_model', lambda: self.model[1].to_bytes()), + ('tok2vec_model', lambda: self.model[0].to_bytes()), + ('lower_model', lambda: self.model[1].to_bytes()), + ('upper_model', lambda: self.model[2].to_bytes()), ('vocab', lambda: self.vocab.to_bytes()), ('moves', lambda: self.moves.to_bytes(strings=False)), ('cfg', lambda: ujson.dumps(self.cfg)) )) if 'model' in exclude: + exclude['tok2vec_model'] = True exclude['lower_model'] = True exclude['upper_model'] = True exclude.pop('model') @@ -711,6 +726,7 @@ cdef class Parser: ('vocab', lambda b: self.vocab.from_bytes(b)), ('moves', lambda b: self.moves.from_bytes(b, strings=False)), ('cfg', lambda b: self.cfg.update(ujson.loads(b))), + ('tok2vec_model', lambda b: None), ('lower_model', 
lambda b: None), ('upper_model', lambda b: None) )) @@ -720,10 +736,12 @@ cdef class Parser: self.model, cfg = self.Model(self.moves.n_moves) else: cfg = {} + if 'tok2vec_model' in msg: + self.model[0].from_bytes(msg['tok2vec_model']) if 'lower_model' in msg: - self.model[0].from_bytes(msg['lower_model']) + self.model[1].from_bytes(msg['lower_model']) if 'upper_model' in msg: - self.model[1].from_bytes(msg['upper_model']) + self.model[2].from_bytes(msg['upper_model']) self.cfg.update(cfg) return self From 5d837c37762cb06a230906be80225e0e421c6cb2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 7 Aug 2017 06:32:59 -0500 Subject: [PATCH 15/49] Add mix weights on fine_tune --- spacy/_ml.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index f7ab9b259..d28f48c42 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -356,17 +356,24 @@ def fine_tune(embedding, combine=None): lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i') vecs, bp_vecs = embedding.begin_update(docs, drop=drop) - + flat_tokvecs = embedding.ops.flatten(tokvecs) + flat_vecs = embedding.ops.flatten(vecs) output = embedding.ops.unflatten( - embedding.ops.flatten(tokvecs) - + embedding.ops.flatten(vecs), + (model.mix[0] * flat_vecs + model.mix[1] * flat_tokvecs), lengths) def fine_tune_bwd(d_output, sgd=None): bp_vecs(d_output, sgd=sgd) + flat_grad = model.ops.flatten(d_output) + model.d_mix[1] += flat_tokvecs.dot(flat_grad.T).sum() + model.d_mix[0] += flat_vecs.dot(flat_grad.T).sum() + sgd(model._mem.weights, model._mem.gradient, key=model.id) return d_output return output, fine_tune_bwd model = wrap(fine_tune_fwd, embedding) + model.mix = model._mem.add((model.id, 'mix'), (2,)) + model.mix.fill(1.) + model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix')) return model From 88bf1cf87c874c2e9fa0d88aa28db07907b6ad90 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 8 Aug 2017 15:34:17 -0500 Subject: [PATCH 16/49] Update parser for fine tuning --- spacy/_ml.py | 18 +++++++++--------- spacy/syntax/nn_parser.pyx | 10 +++++----- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index d28f48c42..01f166b9f 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -222,11 +222,11 @@ def Tok2Vec(width, embed_size, preprocess=None): asarray(Model.ops, dtype='uint64') >> uniqued(embed, column=5) >> LN(Maxout(width, width*4, pieces=3)) - >> Residual(ExtractWindow(nW=1) >> SELU(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> SELU(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> SELU(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> SELU(width, width*3)), - pad=4) + >> Residual(ExtractWindow(nW=1) >> LN(Maxout(width, width*3))) + >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) + >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) + >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)), + pad=4) ) if preprocess not in (False, None): tok2vec = preprocess >> tok2vec @@ -432,8 +432,8 @@ def build_tagger_model(nr_class, token_vector_width, **cfg): with Model.define_operators({'>>': chain, '+': add}): # Input: (doc, tensor) tuples private_tok2vec = Tok2Vec(token_vector_width, 7500, preprocess=doc2feats()) - - model = ( + + model = ( fine_tune(private_tok2vec) >> with_flatten( Maxout(token_vector_width, token_vector_width) @@ -457,7 +457,7 @@ def build_text_classifier(nr_class, width=64, **cfg): >> _flatten_add_lengths >> with_getitem(0, uniqued( - (embed_lower | 
embed_prefix | embed_suffix | embed_shape) + (embed_lower | embed_prefix | embed_suffix | embed_shape) >> Maxout(width, width+(width//2)*3)) >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) @@ -478,7 +478,7 @@ def build_text_classifier(nr_class, width=64, **cfg): >> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0)) >> logistic ) - + model.lsuv = False return model diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 06c61656b..00835f697 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -44,7 +44,7 @@ from thinc.neural.util import get_array_module from .. import util from ..util import get_async, get_cuda_stream from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts -from .._ml import Tok2Vec, doc2feats, rebatch +from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune from ..compat import json_dumps from . import _parse_features @@ -237,7 +237,7 @@ cdef class Parser: token_vector_width = util.env_opt('token_vector_width', token_vector_width) hidden_width = util.env_opt('hidden_width', hidden_width) parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2) - tensors = Tok2Vec(token_vector_width, 7500, preprocess=doc2feats()) + tensors = fine_tune(Tok2Vec(token_vector_width, 7500, preprocess=doc2feats())) if parser_maxout_pieces == 1: lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class, nF=cls.nr_feature, @@ -367,7 +367,7 @@ cdef class Parser: tokvecses = [tokvecses] tokvecs = self.model[0].ops.flatten(tokvecses) - tokvecs += self.model[0].ops.flatten(self.model[0](docs)) + tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) nr_state = len(docs) nr_class = self.moves.n_moves @@ -419,7 +419,7 @@ cdef class Parser: cdef int nr_class = self.moves.n_moves cdef StateClass stcls, output tokvecs = self.model[0].ops.flatten(tokvecses) - tokvecs += self.model[0].ops.flatten(self.model[0](docs)) + tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) cuda_stream = get_cuda_stream() state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs, cuda_stream, 0.0) @@ -460,7 +460,7 @@ cdef class Parser: if isinstance(docs, Doc) and isinstance(golds, GoldParse): docs = [docs] golds = [golds] - my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs, drop=0.) + my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=0.) 
my_tokvecs = self.model[0].ops.flatten(my_tokvecs) tokvecs += my_tokvecs From dbdd8afc4bb4fa56db69ddca584df7505888e46b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 8 Aug 2017 15:46:07 -0500 Subject: [PATCH 17/49] Fix parser fine-tune training --- spacy/syntax/nn_parser.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 00835f697..31c3801a2 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -513,7 +513,7 @@ cdef class Parser: self._make_updates(d_tokvecs, backprops, sgd, cuda_stream) d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) - #bp_my_tokvecs(d_tokvecs, sgd=sgd) + bp_my_tokvecs(d_tokvecs, sgd=sgd) return d_tokvecs def _init_gold_batch(self, whole_docs, whole_golds): From 28e2fec23bf5f654490c8d8f17d551fda190e831 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 9 Aug 2017 11:52:38 +0200 Subject: [PATCH 18/49] Fix autolinking failure on fresh model install (resolves #1138) On fresh install via subprocess, pip.get_installed_distributions() won't show new model, so is_package check in link command fails. Solution for now is to get model package path explicitly and pass it to link command. --- spacy/cli/download.py | 8 ++++++-- spacy/cli/link.py | 4 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index b6e5549da..675ae8cee 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -8,7 +8,7 @@ import subprocess import sys from .link import link -from ..util import prints +from ..util import prints, get_package_path from .. import about @@ -32,7 +32,11 @@ def download(cmd, model, direct=False): version = get_version(model_name, compatibility) download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version)) try: - link(None, model_name, model, force=True) + # Get package path here because link uses + # pip.get_installed_distributions() to check if model is a package, + # which fails if model was just installed via subprocess + package_path = get_package_path(model_name) + link(None, model_name, model, force=True, model_path=package_path) except: # Dirty, but since spacy.download and the auto-linking is mostly # a convenience wrapper, it's best to show a success message and diff --git a/spacy/cli/link.py b/spacy/cli/link.py index a8ee01565..712a05aee 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -14,7 +14,7 @@ from .. import util link_name=("name of shortuct link to create", "positional", None, str), force=("force overwriting of existing link", "flag", "f", bool) ) -def link(cmd, origin, link_name, force=False): +def link(cmd, origin, link_name, force=False, model_path=None): """ Create a symlink for models within the spacy/data directory. 
Accepts either the name of a pip package, or the local path to the model data @@ -23,7 +23,7 @@ def link(cmd, origin, link_name, force=False): if util.is_package(origin): model_path = util.get_package_path(origin) else: - model_path = Path(origin) + model_path = Path(origin) if model_path is None else Path(model_path) if not model_path.exists(): prints("The data should be located in %s" % path2str(model_path), title="Can't locate model data", exits=1) From 764540a6dd36b4a51fc6b9f28786aa5ffeaee202 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 9 Aug 2017 12:16:30 +0200 Subject: [PATCH 19/49] Don't ignore /bin directory --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 52838918c..cb0a8e84e 100644 --- a/.gitignore +++ b/.gitignore @@ -40,7 +40,6 @@ venv/ # Distribution / packaging env/ -bin/ build/ develop-eggs/ dist/ From 495e0424291e95846fcccb679c938a0a1e8f6ff1 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 9 Aug 2017 12:17:30 +0200 Subject: [PATCH 20/49] Add entry point-style auto alias for "spacy" Simplest way to run commands as spacy xxx instead of python -m spacy xxx, while avoiding environment conflicts --- MANIFEST.in | 1 + bin/spacy | 1 + setup.py | 1 + 3 files changed, 3 insertions(+) create mode 100644 bin/spacy diff --git a/MANIFEST.in b/MANIFEST.in index 697748835..4d804a23e 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,4 @@ recursive-include include *.h include LICENSE include README.rst +include bin/spacy diff --git a/bin/spacy b/bin/spacy new file mode 100644 index 000000000..29d9a80e5 --- /dev/null +++ b/bin/spacy @@ -0,0 +1 @@ +python -m spacy "$@" diff --git a/setup.py b/setup.py index ecdf15536..0a3384ed5 100755 --- a/setup.py +++ b/setup.py @@ -187,6 +187,7 @@ def setup_package(): url=about['__uri__'], license=about['__license__'], ext_modules=ext_modules, + scripts=['bin/spacy'], install_requires=[ 'numpy>=1.7', 'murmurhash>=0.28,<0.29', From bcce6f7de0d03c86c5c189381d00de16b6cdbb19 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 9 Aug 2017 16:23:12 -0500 Subject: [PATCH 21/49] Fix parser fine tuning --- spacy/syntax/nn_parser.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 00835f697..31c3801a2 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -513,7 +513,7 @@ cdef class Parser: self._make_updates(d_tokvecs, backprops, sgd, cuda_stream) d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) - #bp_my_tokvecs(d_tokvecs, sgd=sgd) + bp_my_tokvecs(d_tokvecs, sgd=sgd) return d_tokvecs def _init_gold_batch(self, whole_docs, whole_golds): From bbace204bec8160936ff8ce9b50b8194b5d94a23 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 9 Aug 2017 16:40:42 -0500 Subject: [PATCH 22/49] Gate parser fine-tuning behind feature flag --- spacy/syntax/nn_parser.pyx | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 31c3801a2..f1f21134c 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -59,8 +59,9 @@ from ..structs cimport TokenC from ..tokens.doc cimport Doc from ..strings cimport StringStore from ..gold cimport GoldParse -from ..attrs cimport TAG, DEP +from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG +USE_FINE_TUNE = True def get_templates(*args, **kwargs): return [] @@ -237,7 +238,8 @@ cdef class Parser: token_vector_width = 
util.env_opt('token_vector_width', token_vector_width) hidden_width = util.env_opt('hidden_width', hidden_width) parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2) - tensors = fine_tune(Tok2Vec(token_vector_width, 7500, preprocess=doc2feats())) + tensors = fine_tune(Tok2Vec(token_vector_width, 7500, + preprocess=doc2feats(cols=[ID, NORM, PREFIX, SUFFIX, TAG]))) if parser_maxout_pieces == 1: lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class, nF=cls.nr_feature, @@ -367,7 +369,8 @@ cdef class Parser: tokvecses = [tokvecses] tokvecs = self.model[0].ops.flatten(tokvecses) - tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) + if USE_FINE_TUNE: + tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) nr_state = len(docs) nr_class = self.moves.n_moves @@ -419,7 +422,8 @@ cdef class Parser: cdef int nr_class = self.moves.n_moves cdef StateClass stcls, output tokvecs = self.model[0].ops.flatten(tokvecses) - tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) + if USE_FINE_TUNE: + tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) cuda_stream = get_cuda_stream() state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs, cuda_stream, 0.0) @@ -460,9 +464,10 @@ cdef class Parser: if isinstance(docs, Doc) and isinstance(golds, GoldParse): docs = [docs] golds = [golds] - my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=0.) - my_tokvecs = self.model[0].ops.flatten(my_tokvecs) - tokvecs += my_tokvecs + if USE_FINE_TUNE: + my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=0.) + my_tokvecs = self.model[0].ops.flatten(my_tokvecs) + tokvecs += my_tokvecs cuda_stream = get_cuda_stream() @@ -513,7 +518,8 @@ cdef class Parser: self._make_updates(d_tokvecs, backprops, sgd, cuda_stream) d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) - bp_my_tokvecs(d_tokvecs, sgd=sgd) + if USE_FINE_TUNE: + bp_my_tokvecs(d_tokvecs, sgd=sgd) return d_tokvecs def _init_gold_batch(self, whole_docs, whole_golds): From ac2de6dced7a3fa3d224487c61885b334c493392 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 9 Aug 2017 16:41:25 -0500 Subject: [PATCH 23/49] Switch to ReLu layers in Tok2Vec --- spacy/_ml.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 01f166b9f..d08a43b8e 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -220,12 +220,12 @@ def Tok2Vec(width, embed_size, preprocess=None): tok2vec = ( with_flatten( asarray(Model.ops, dtype='uint64') - >> uniqued(embed, column=5) + >> embed >> LN(Maxout(width, width*4, pieces=3)) - >> Residual(ExtractWindow(nW=1) >> LN(Maxout(width, width*3))) - >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)), + >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) + >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) + >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) + >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)), pad=4) ) if preprocess not in (False, None): @@ -321,7 +321,8 @@ def zero_init(model): def doc2feats(cols=None): - cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] + if cols is None: + cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] def forward(docs, drop=0.): feats = [] for doc in docs: From f93f2bed58a5caa8bdfba78c3c3f035c97c790e7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal 
Date: Wed, 9 Aug 2017 17:47:03 -0500 Subject: [PATCH 24/49] Revert use of layer normalization in Tok2Vec --- spacy/_ml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index d08a43b8e..39041cc22 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -221,7 +221,7 @@ def Tok2Vec(width, embed_size, preprocess=None): with_flatten( asarray(Model.ops, dtype='uint64') >> embed - >> LN(Maxout(width, width*4, pieces=3)) + >> Maxout(width, width*4, pieces=3) >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) From f37528ef58c89988eaa8c046d6dd0f0e6144a378 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 9 Aug 2017 17:52:53 -0500 Subject: [PATCH 25/49] Pass embed size for parser fine-tune. Use SELU --- spacy/syntax/nn_parser.pyx | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index f1f21134c..eb6117167 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -37,7 +37,8 @@ from preshed.maps cimport MapStruct from preshed.maps cimport map_get from thinc.api import layerize, chain, noop, clone -from thinc.neural import Model, Affine, ELU, ReLu, Maxout +from thinc.neural import Model, Affine, ReLu, Maxout +from thinc.neural._classes.selu import SELU from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.util import get_array_module @@ -238,8 +239,9 @@ cdef class Parser: token_vector_width = util.env_opt('token_vector_width', token_vector_width) hidden_width = util.env_opt('hidden_width', hidden_width) parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2) - tensors = fine_tune(Tok2Vec(token_vector_width, 7500, - preprocess=doc2feats(cols=[ID, NORM, PREFIX, SUFFIX, TAG]))) + embed_size = util.env_opt('embed_size', 7500) + tensors = fine_tune(Tok2Vec(token_vector_width, embed_size, + preprocess=doc2feats(cols=[ID, NORM, PREFIX, SUFFIX, SHAPE]))) if parser_maxout_pieces == 1: lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class, nF=cls.nr_feature, @@ -252,7 +254,7 @@ cdef class Parser: with Model.use_device('cpu'): upper = chain( - clone(Maxout(hidden_width), (depth-1)), + clone(SELU(hidden_width), (depth-1)), zero_init(Affine(nr_class, drop_factor=0.0)) ) # TODO: This is an unfortunate hack atm! From d01dc3704a5339cfd1f576a83f761ba9d6e62e7a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 9 Aug 2017 20:06:33 -0500 Subject: [PATCH 26/49] Adjust parser model --- spacy/syntax/nn_parser.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index eb6117167..a94b94e83 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -241,7 +241,7 @@ cdef class Parser: parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2) embed_size = util.env_opt('embed_size', 7500) tensors = fine_tune(Tok2Vec(token_vector_width, embed_size, - preprocess=doc2feats(cols=[ID, NORM, PREFIX, SUFFIX, SHAPE]))) + preprocess=doc2feats())) if parser_maxout_pieces == 1: lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class, nF=cls.nr_feature, @@ -254,7 +254,7 @@ cdef class Parser: with Model.use_device('cpu'): upper = chain( - clone(SELU(hidden_width), (depth-1)), + clone(Maxout(hidden_width), (depth-1)), zero_init(Affine(nr_class, drop_factor=0.0)) ) # TODO: This is an unfortunate hack atm! 
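Note: the last few patches swap activations (ReLu, SELU, Maxout, layer norm) inside the same residual window-convolution block of Tok2Vec without changing its overall shape. A rough plain-numpy sketch of what one Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) step computes, under assumed shapes and with illustrative names (not the actual thinc implementation):

    import numpy as np

    def extract_window(X, nW=1):
        # X: (n_tokens, width). Concatenate each token's vector with the
        # vectors of its nW neighbours on either side, zero-padded at the
        # edges, giving shape (n_tokens, (2*nW + 1) * width).
        n, d = X.shape
        padded = np.vstack([np.zeros((nW, d)), X, np.zeros((nW, d))])
        return np.hstack([padded[i:i + n] for i in range(2 * nW + 1)])

    def residual_relu_block(X, W, b):
        # One residual step over a window of three token vectors: project the
        # concatenated window back down to `width`, apply ReLu, add the input.
        return X + np.maximum(0.0, extract_window(X, nW=1) @ W + b)

    # Shapes: X is (n_tokens, width), W is (3 * width, width), b is (width,).
    X = np.random.randn(5, 8)
    W = np.random.randn(24, 8) * 0.1
    b = np.zeros(8)
    assert residual_relu_block(X, W, b).shape == X.shape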
From d42a03b8ded67eabac13122ede58aa5f9e2dd447 Mon Sep 17 00:00:00 2001 From: Nikolai Kruglikov Date: Thu, 10 Aug 2017 14:38:30 +0500 Subject: [PATCH 27/49] Fix small typo in documentation --- website/docs/usage/adding-languages.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index a0b77ad17..4cd65a62d 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -205,7 +205,7 @@ p +infobox("Why lazy-loading?") | Some languages contain large volumes of custom data, like lemmatizer - | loopup tables, or complex regular expression that are expensive to + | lookup tables, or complex regular expression that are expensive to | compute. As of spaCy v2.0, #[code Language] classes are not imported on | initialisation and are only loaded when you import them directly, or load | a model that requires a language to be loaded. To lazy-load languages in From 1a59db1c86537c54b8b59e3a2988c6a24749b7f2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 05:44:39 -0500 Subject: [PATCH 28/49] Fix dropout and learn rate in parser --- spacy/syntax/nn_parser.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index a94b94e83..201b988b9 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -39,6 +39,7 @@ from preshed.maps cimport map_get from thinc.api import layerize, chain, noop, clone from thinc.neural import Model, Affine, ReLu, Maxout from thinc.neural._classes.selu import SELU +from thinc.neural._classes.layernorm import LayerNorm from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.util import get_array_module @@ -467,7 +468,7 @@ cdef class Parser: docs = [docs] golds = [golds] if USE_FINE_TUNE: - my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=0.) + my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) my_tokvecs = self.model[0].ops.flatten(my_tokvecs) tokvecs += my_tokvecs @@ -496,13 +497,13 @@ cdef class Parser: scores, bp_scores = vec2scores.begin_update(vector, drop=drop) d_scores = self.get_batch_loss(states, golds, scores) - d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd) + d_vector = bp_scores(d_scores, sgd=sgd) if drop != 0: d_vector *= mask if isinstance(self.model[0].ops, CupyOps) \ and not isinstance(token_ids, state2vec.ops.xp.ndarray): - # Move token_ids and d_vector to CPU, asynchronously + # Move token_ids and d_vector to GPU, asynchronously backprops.append(( get_async(cuda_stream, token_ids), get_async(cuda_stream, d_vector), From ebe0f7f6418927e92086c1d408c6c9622682efcb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 05:45:20 -0500 Subject: [PATCH 29/49] Pass embed size correctly in tagger, and cache embeddings for efficiency --- spacy/_ml.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 39041cc22..33c6f378b 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -23,8 +23,10 @@ from thinc.neural._classes.attention import ParametricAttention from thinc.linear.linear import LinearModel from thinc.api import uniqued, wrap, flatten_add_lengths + from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP from .tokens.doc import Doc +from . 
import util import numpy import io @@ -208,6 +210,17 @@ class PrecomputableMaxouts(Model): return Yfp, backward +def drop_layer(layer, factor=1.0): + def drop_layer_fwd(X, drop=0.): + drop *= factor + mask = layer.ops.get_dropout_mask((1,), drop) + if mask is not None and mask[0] == 0.: + return X, lambda dX, sgd=None: dX + else: + return layer.begin_update(X, drop=drop) + return wrap(drop_layer_fwd, layer) + + def Tok2Vec(width, embed_size, preprocess=None): cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}): @@ -220,13 +233,13 @@ def Tok2Vec(width, embed_size, preprocess=None): tok2vec = ( with_flatten( asarray(Model.ops, dtype='uint64') - >> embed - >> Maxout(width, width*4, pieces=3) - >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)), - pad=4) + >> uniqued(embed >> Maxout(width, width*4, pieces=3), column=5) + >> Residual( + (ExtractWindow(nW=1) >> ReLu(width, width*3)) + >> (ExtractWindow(nW=1) >> ReLu(width, width*3)) + >> (ExtractWindow(nW=1) >> ReLu(width, width*3)) + >> (ExtractWindow(nW=1) >> ReLu(width, width*3)) + ), pad=4) ) if preprocess not in (False, None): tok2vec = preprocess >> tok2vec @@ -430,9 +443,10 @@ def getitem(i): return layerize(getitem_fwd) def build_tagger_model(nr_class, token_vector_width, **cfg): + embed_size = util.env_opt('embed_size', 7500) with Model.define_operators({'>>': chain, '+': add}): # Input: (doc, tensor) tuples - private_tok2vec = Tok2Vec(token_vector_width, 7500, preprocess=doc2feats()) + private_tok2vec = Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats()) model = ( fine_tune(private_tok2vec) From 680043ebca7b695933d4935e6d189c54e27fa087 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 08:54:21 -0500 Subject: [PATCH 30/49] Improve efficiency of tagger.set_annotations for GPU --- spacy/pipeline.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index b87f73c27..f367d2b5b 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -284,6 +284,8 @@ class NeuralTagger(BaseThincComponent): cdef Vocab vocab = self.vocab for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] + if hasattr(doc_tag_ids, 'get'): + doc_tag_ids = doc_tag_ids.get() for j, tag_id in enumerate(doc_tag_ids): # Don't clobber preset POS tags if doc.c[j].tag == 0 and doc.c[j].pos == 0: From 8870d491f1f4c1b50791484d234c2890f225abef Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 08:55:53 -0500 Subject: [PATCH 31/49] Remove redundant pickling during training --- spacy/cli/train.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 9ed621c12..04aac8319 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -99,8 +99,6 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, util.set_env_log(False) epoch_model_path = output_path / ('model%d' % i) nlp.to_disk(epoch_model_path) - with (output_path / ('model%d.pickle' % i)).open('wb') as file_: - dill.dump(nlp, file_, -1) nlp_loaded = lang_class(pipeline=pipeline) nlp_loaded = nlp_loaded.from_disk(epoch_model_path) scorer = nlp_loaded.evaluate( From cd5ecedf6a02c0ce1fe2c2157e2281751cec98cb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 08:56:33 -0500 Subject: [PATCH 
32/49] Try drop_layer in parser --- spacy/syntax/nn_parser.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 201b988b9..bd56ba40b 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -47,6 +47,7 @@ from .. import util from ..util import get_async, get_cuda_stream from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune +from .._ml import Residual, drop_layer from ..compat import json_dumps from . import _parse_features @@ -255,7 +256,7 @@ cdef class Parser: with Model.use_device('cpu'): upper = chain( - clone(Maxout(hidden_width), (depth-1)), + clone(drop_layer(Residual(Maxout(hidden_width))), (depth-1)), zero_init(Affine(nr_class, drop_factor=0.0)) ) # TODO: This is an unfortunate hack atm! From 4ab0c8c8e9b3320675e6a5e20d39db0be7fa0210 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 08:56:57 -0500 Subject: [PATCH 33/49] Try different drop_layer structure in Tok2Vec --- spacy/_ml.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 33c6f378b..e37bcac52 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -210,14 +210,14 @@ class PrecomputableMaxouts(Model): return Yfp, backward -def drop_layer(layer, factor=1.0): +def drop_layer(layer, factor=2.): def drop_layer_fwd(X, drop=0.): drop *= factor mask = layer.ops.get_dropout_mask((1,), drop) - if mask is not None and mask[0] == 0.: - return X, lambda dX, sgd=None: dX - else: + if mask is None or mask > 0: return layer.begin_update(X, drop=drop) + else: + return X, lambda dX, sgd=None: dX return wrap(drop_layer_fwd, layer) @@ -229,17 +229,17 @@ def Tok2Vec(width, embed_size, preprocess=None): suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix') shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape') - embed = (norm | prefix | suffix | shape ) + embed = (norm | prefix | suffix | shape ) >> Maxout(width, width*4, pieces=3) tok2vec = ( with_flatten( asarray(Model.ops, dtype='uint64') - >> uniqued(embed >> Maxout(width, width*4, pieces=3), column=5) - >> Residual( - (ExtractWindow(nW=1) >> ReLu(width, width*3)) - >> (ExtractWindow(nW=1) >> ReLu(width, width*3)) - >> (ExtractWindow(nW=1) >> ReLu(width, width*3)) - >> (ExtractWindow(nW=1) >> ReLu(width, width*3)) - ), pad=4) + >> uniqued(embed, column=5) + >> drop_layer( + Residual( + (ExtractWindow(nW=1) >> ReLu(width, width*3)) + ) + ) ** 4, pad=4 + ) ) if preprocess not in (False, None): tok2vec = preprocess >> tok2vec From d4f2baf7dd7f0136916aa54c5d2af3ce12a43495 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 12 Aug 2017 21:44:15 +0200 Subject: [PATCH 34/49] Add create_meta option to package command Re-create meta.json in model directory, even if it exists. Especially useful when updating existing spaCy models or training with Prodigy. Ensures user won't end up with multiple "en_core_web_sm" models, and offers easy way to change the model's name and settings without having to edit the meta.json file. --- spacy/cli/package.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 1c720c2b5..9be28d4aa 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -15,10 +15,11 @@ from .. 
import about @plac.annotations( input_dir=("directory with model data", "positional", None, str), output_dir=("output parent directory", "positional", None, str), - meta=("path to meta.json", "option", "m", str), + meta_path=("path to meta.json", "option", "m", str), + create_meta=("create meta.json, even if one exists in directory", "flag", "c", bool), force=("force overwriting of existing folder in output directory", "flag", "f", bool) ) -def package(cmd, input_dir, output_dir, meta=None, force=False): +def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False): """ Generate Python package for model data, including meta and required installation files. A new directory will be created in the specified @@ -26,7 +27,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False): """ input_path = util.ensure_path(input_dir) output_path = util.ensure_path(output_dir) - meta_path = util.ensure_path(meta) + meta_path = util.ensure_path(meta_path) if not input_path or not input_path.exists(): prints(input_path, title="Model directory not found", exits=1) if not output_path or not output_path.exists(): @@ -38,7 +39,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False): template_manifest = get_template('MANIFEST.in') template_init = get_template('xx_model_name/__init__.py') meta_path = meta_path or input_path / 'meta.json' - if meta_path.is_file(): + if not create_meta and meta_path.is_file(): prints(meta_path, title="Reading meta.json from file") meta = util.read_json(meta_path) else: From b353e4d843be9eb55bc89927df1e4d4ec099dc21 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 14:47:45 -0500 Subject: [PATCH 35/49] Work on parser beam training --- setup.py | 1 + spacy/syntax/_beam_utils.pyx | 196 +++++++++++++++++++++++++++++ spacy/syntax/nn_parser.pyx | 27 +++- spacy/tests/parser/test_nn_beam.py | 98 +++++++++++++++ 4 files changed, 321 insertions(+), 1 deletion(-) create mode 100644 spacy/syntax/_beam_utils.pyx create mode 100644 spacy/tests/parser/test_nn_beam.py diff --git a/setup.py b/setup.py index 0a3384ed5..02d4fe0d9 100755 --- a/setup.py +++ b/setup.py @@ -36,6 +36,7 @@ MOD_NAMES = [ 'spacy.syntax.transition_system', 'spacy.syntax.arc_eager', 'spacy.syntax._parse_features', + 'spacy.syntax._beam_utils', 'spacy.gold', 'spacy.tokens.doc', 'spacy.tokens.span', diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx new file mode 100644 index 000000000..4a4b79dad --- /dev/null +++ b/spacy/syntax/_beam_utils.pyx @@ -0,0 +1,196 @@ +# cython: infer_types=True +cimport numpy as np +import numpy +from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF +from thinc.extra.search cimport Beam +from thinc.extra.search import MaxViolation +from thinc.typedefs cimport hash_t, class_t + +from .transition_system cimport TransitionSystem, Transition +from .stateclass cimport StateClass +from ..gold cimport GoldParse +from ..tokens.doc cimport Doc + + +# These are passed as callbacks to thinc.search.Beam +cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: + dest = _dest + src = _src + moves = _moves + dest.clone(src) + moves[clas].do(dest.c, moves[clas].label) + + +cdef int _check_final_state(void* _state, void* extra_args) except -1: + return (_state).is_final() + + +def _cleanup(Beam beam): + for i in range(beam.width): + Py_XDECREF(beam._states[i].content) + Py_XDECREF(beam._parents[i].content) + + +cdef hash_t _hash_state(void* _state, void* _) except 0: + state = _state + if 
state.c.is_final(): + return 1 + else: + return state.c.hash() + + +cdef class ParserBeam(object): + cdef public TransitionSystem moves + cdef public object docs + cdef public object golds + cdef public object beams + + def __init__(self, TransitionSystem moves, docs, golds, + int width=4, float density=0.001): + self.moves = moves + self.docs = docs + self.golds = golds + self.beams = [] + cdef Doc doc + cdef Beam beam + for doc in docs: + beam = Beam(self.moves.n_moves, width, density) + beam.initialize(self.moves.init_beam_state, doc.length, doc.c) + self.beams.append(beam) + + @property + def is_done(self): + return all(beam.is_done for beam in self.beams) + + def __getitem__(self, i): + return self.beams[i] + + def __len__(self): + return len(self.beams) + + def advance(self, scores, follow_gold=False): + cdef Beam beam + for i, beam in enumerate(self.beams): + self._set_scores(beam, scores[i]) + if self.golds is not None: + self._set_costs(beam, self.golds[i], follow_gold=follow_gold) + if follow_gold: + assert self.golds is not None + beam.advance(_transition_state, NULL, self.moves.c) + else: + beam.advance(_transition_state, _hash_state, self.moves.c) + beam.check_done(_check_final_state, NULL) + + def _set_scores(self, Beam beam, scores): + for i in range(beam.size): + state = beam.at(i) + for j in range(beam.nr_class): + beam.scores[i][j] = scores[i, j] + self.moves.set_valid(beam.is_valid[i], state.c) + + def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False): + for i in range(beam.size): + state = beam.at(i) + self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold) + if follow_gold: + for j in range(beam.nr_class): + beam.is_valid[i][j] *= beam.costs[i][j] <= 0 + + +def get_token_ids(states, int n_tokens): + cdef StateClass state + cdef np.ndarray ids = numpy.zeros((len(states), n_tokens), + dtype='i', order='C') + c_ids = ids.data + for i, state in enumerate(states): + if not state.is_final(): + state.c.set_context_tokens(c_ids, n_tokens) + c_ids += ids.shape[1] + return ids + + +def update_beam(TransitionSystem moves, int nr_feature, + docs, tokvecs, golds, + state2vec, vec2scores, drop=0., sgd=None, + losses=None, int width=4, float density=0.001): + pbeam = ParserBeam(moves, docs, golds, + width=width, density=density) + gbeam = ParserBeam(moves, docs, golds, + width=width, density=density) + beam_map = {} + backprops = [] + violns = [MaxViolation() for _ in range(len(docs))] + example_ids = list(range(len(docs))) + while not pbeam.is_done and not gbeam.is_done: + states, p_indices, g_indices = get_states(example_ids, pbeam, gbeam, beam_map) + + token_ids = get_token_ids(states, nr_feature) + vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop) + scores, bp_scores = vec2scores.begin_update(vectors, drop=drop) + + backprops.append((token_ids, bp_vectors, bp_scores)) + + p_scores = [scores[indices] for indices in p_indices] + g_scores = [scores[indices] for indices in g_indices] + pbeam.advance(p_scores) + gbeam.advance(g_scores, follow_gold=True) + + for i, violn in enumerate(violns): + violn.check_crf(pbeam[i], gbeam[i]) + + histories = [(v.p_hist + v.g_hist) for v in violns] + losses = [(v.p_probs + v.g_probs) for v in violns] + states_d_scores = get_gradient(moves.n_moves, beam_map, + histories, losses) + return states_d_scores, backprops + + +def get_states(example_ids, pbeams, gbeams, beam_map): + states = [] + seen = {} + p_indices = [] + g_indices = [] + cdef Beam pbeam, gbeam + for eg_id, pbeam, gbeam in zip(example_ids, 
pbeams, gbeams): + p_indices.append([]) + for j in range(pbeam.size): + key = tuple([eg_id] + pbeam.histories[j]) + seen[key] = len(states) + p_indices[-1].append(len(states)) + states.append(pbeam.at(j)) + beam_map.update(seen) + g_indices.append([]) + for i in range(gbeam.size): + key = tuple([eg_id] + gbeam.histories[i]) + if key in seen: + g_indices[-1].append(seen[key]) + else: + g_indices[-1].append(len(states)) + beam_map[key] = len(states) + states.append(gbeam.at(i)) + + p_indices = numpy.asarray(p_indices, dtype='i') + g_indices = numpy.asarray(g_indices, dtype='i') + return states, p_indices, g_indices + + +def get_gradient(nr_class, beam_map, histories, losses): + """ + The global model assigns a loss to each parse. The beam scores + are additive, so the same gradient is applied to each action + in the history. This gives the gradient of a single *action* + for a beam state -- so we have "the gradient of loss for taking + action i given history H." + """ + nr_step = max(len(hist) for hist in histories) + nr_beam = len(histories) + grads = [numpy.zeros((nr_beam, nr_class), dtype='f') for _ in range(nr_step)] + for hist, loss in zip(histories, losses): + key = tuple() + for j, clas in enumerate(hist): + grads[j][i, clas] = loss + key = key + clas + i = beam_map[key] + return grads + + diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index bd56ba40b..11584e4d2 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -63,6 +63,7 @@ from ..tokens.doc cimport Doc from ..strings cimport StringStore from ..gold cimport GoldParse from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG +from . import _beam_utils USE_FINE_TUNE = True @@ -256,7 +257,7 @@ cdef class Parser: with Model.use_device('cpu'): upper = chain( - clone(drop_layer(Residual(Maxout(hidden_width))), (depth-1)), + clone(Residual(ReLu(hidden_width)), (depth-1)), zero_init(Affine(nr_class, drop_factor=0.0)) ) # TODO: This is an unfortunate hack atm! @@ -526,6 +527,30 @@ cdef class Parser: bp_my_tokvecs(d_tokvecs, sgd=sgd) return d_tokvecs + def update_beam(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): + docs, tokvecs = docs_tokvecs + tokvecs = self.model[0].ops.flatten(tokvecs) + + cuda_stream = get_cuda_stream() + state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs, cuda_stream, 0.0) + + states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, + docs, tokvecs, golds, + state2vec, vec2scores, + drop, sgd, losses) + backprop_lower = [] + for i, d_scores in enumerate(states_d_scores): + ids, bp_vectors, bp_scores = backprops[i] + d_vector = bp_scores(d_scores, sgd=sgd) + backprop_lower.append(( + get_async(cuda_stream, ids), + get_async(cuda_stream, d_vector), + bp_vectors)) + d_tokvecs = self.model[0].ops.allocate(tokvecs.shape) + self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream) + lengths = [len(doc) for doc in docs] + return self.model[0].ops.unflatten(d_tokvecs, lengths) + def _init_gold_batch(self, whole_docs, whole_golds): """Make a square batch, of length equal to the shortest doc. A long doc will get multiple states. 
Let's say we have a doc of length 2*N, diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py new file mode 100644 index 000000000..ad0dfa7a1 --- /dev/null +++ b/spacy/tests/parser/test_nn_beam.py @@ -0,0 +1,98 @@ +from __future__ import unicode_literals +import pytest +import numpy +from thinc.api import layerize + +from ...vocab import Vocab +from ...syntax.arc_eager import ArcEager +from ...tokens import Doc +from ...gold import GoldParse +from ...syntax._beam_utils import ParserBeam, update_beam + + +@pytest.fixture +def vocab(): + return Vocab() + +@pytest.fixture +def moves(vocab): + aeager = ArcEager(vocab.strings, {}) + aeager.add_action(2, 'nsubj') + aeager.add_action(3, 'dobj') + aeager.add_action(2, 'aux') + return aeager + + +@pytest.fixture +def docs(vocab): + return [Doc(vocab, words=['Rats', 'bite', 'things'])] + +@pytest.fixture +def tokvecs(docs, vector_size): + output = [] + for doc in docs: + vec = numpy.random.uniform(-0.1, 0.1, (len(doc), vector_size)) + output.append(numpy.asarray(vec)) + return output + + +@pytest.fixture +def golds(docs): + return [GoldParse(doc) for doc in docs] + + +@pytest.fixture +def batch_size(docs): + return len(docs) + + +@pytest.fixture +def beam_width(): + return 4 + + +@pytest.fixture +def vector_size(): + return 6 + + +@pytest.fixture +def beam(moves, docs, golds, beam_width): + return ParserBeam(moves, docs, golds, width=beam_width) + +@pytest.fixture +def scores(moves, batch_size, beam_width): + return [ + numpy.asarray( + numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)), + dtype='f') + for _ in range(batch_size)] + + +def test_create_beam(beam): + pass + + +def test_beam_advance(beam, scores): + beam.advance(scores) + + +def test_beam_advance_too_few_scores(beam, scores): + with pytest.raises(IndexError): + beam.advance(scores[:-1]) + + +def test_update_beam(moves, docs, tokvecs, golds, vector_size): + @layerize + def state2vec(X, drop=0.): + vec = numpy.ones((X.shape[0], vector_size), dtype='f') + return vec, None + @layerize + def vec2scores(X, drop=0.): + scores = numpy.ones((X.shape[0], moves.n_moves), dtype='f') + return scores, None + d_loss, backprops = update_beam(moves, 13, docs, tokvecs, golds, + state2vec, vec2scores, drop=0.0, sgd=None, + losses={}, width=4, density=0.001) + + From d4308d236356e483c60f0119549f5a4da12fe1cc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 17:14:39 -0500 Subject: [PATCH 36/49] Initialize State offset to 0 --- spacy/syntax/_state.pxd | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd index c06851978..9aeeba441 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/syntax/_state.pxd @@ -37,6 +37,7 @@ cdef cppclass StateC: this.shifted = calloc(length + (PADDING * 2), sizeof(bint)) this._sent = calloc(length + (PADDING * 2), sizeof(TokenC)) this._ents = calloc(length + (PADDING * 2), sizeof(Entity)) + this.offset = 0 cdef int i for i in range(length + (PADDING * 2)): this._ents[i].end = -1 From 4638f4b869ad18c86c227e71e99c462aabd31eba Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 17:15:16 -0500 Subject: [PATCH 37/49] Fix beam update --- spacy/syntax/_beam_utils.pyx | 76 +++++++++++++++++++++--------------- spacy/syntax/nn_parser.pyx | 20 ++++++---- 2 files changed, 58 insertions(+), 38 deletions(-) diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx index 4a4b79dad..10b5e407c 100644 --- a/spacy/syntax/_beam_utils.pyx +++ 
b/spacy/syntax/_beam_utils.pyx @@ -41,21 +41,24 @@ cdef hash_t _hash_state(void* _state, void* _) except 0: cdef class ParserBeam(object): cdef public TransitionSystem moves - cdef public object docs + cdef public object states cdef public object golds cdef public object beams - def __init__(self, TransitionSystem moves, docs, golds, + def __init__(self, TransitionSystem moves, states, golds, int width=4, float density=0.001): self.moves = moves - self.docs = docs + self.states = states self.golds = golds self.beams = [] - cdef Doc doc cdef Beam beam - for doc in docs: + cdef StateClass state, st + for state in states: beam = Beam(self.moves.n_moves, width, density) - beam.initialize(self.moves.init_beam_state, doc.length, doc.c) + beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent) + for i in range(beam.size): + st = beam.at(i) + st.c.offset = state.c.offset self.beams.append(beam) @property @@ -100,34 +103,38 @@ cdef class ParserBeam(object): def get_token_ids(states, int n_tokens): cdef StateClass state cdef np.ndarray ids = numpy.zeros((len(states), n_tokens), - dtype='i', order='C') + dtype='int32', order='C') c_ids = ids.data for i, state in enumerate(states): if not state.is_final(): state.c.set_context_tokens(c_ids, n_tokens) + else: + ids[i] = -1 c_ids += ids.shape[1] return ids -def update_beam(TransitionSystem moves, int nr_feature, - docs, tokvecs, golds, +def update_beam(TransitionSystem moves, int nr_feature, int max_steps, + states, tokvecs, golds, state2vec, vec2scores, drop=0., sgd=None, losses=None, int width=4, float density=0.001): - pbeam = ParserBeam(moves, docs, golds, + pbeam = ParserBeam(moves, states, golds, width=width, density=density) - gbeam = ParserBeam(moves, docs, golds, + gbeam = ParserBeam(moves, states, golds, width=width, density=density) - beam_map = {} + beam_maps = [] backprops = [] - violns = [MaxViolation() for _ in range(len(docs))] - example_ids = list(range(len(docs))) - while not pbeam.is_done and not gbeam.is_done: - states, p_indices, g_indices = get_states(example_ids, pbeam, gbeam, beam_map) + violns = [MaxViolation() for _ in range(len(states))] + for t in range(max_steps): + if pbeam.is_done and gbeam.is_done: + break + beam_maps.append({}) + states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1]) token_ids = get_token_ids(states, nr_feature) vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop) scores, bp_scores = vec2scores.begin_update(vectors, drop=drop) - + backprops.append((token_ids, bp_vectors, bp_scores)) p_scores = [scores[indices] for indices in p_indices] @@ -140,18 +147,18 @@ def update_beam(TransitionSystem moves, int nr_feature, histories = [(v.p_hist + v.g_hist) for v in violns] losses = [(v.p_probs + v.g_probs) for v in violns] - states_d_scores = get_gradient(moves.n_moves, beam_map, + states_d_scores = get_gradient(moves.n_moves, beam_maps, histories, losses) return states_d_scores, backprops -def get_states(example_ids, pbeams, gbeams, beam_map): - states = [] +def get_states(pbeams, gbeams, beam_map): seen = {} + states = [] p_indices = [] g_indices = [] cdef Beam pbeam, gbeam - for eg_id, pbeam, gbeam in zip(example_ids, pbeams, gbeams): + for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)): p_indices.append([]) for j in range(pbeam.size): key = tuple([eg_id] + pbeam.histories[j]) @@ -174,23 +181,30 @@ def get_states(example_ids, pbeams, gbeams, beam_map): return states, p_indices, g_indices -def get_gradient(nr_class, beam_map, histories, losses): +def 
get_gradient(nr_class, beam_maps, histories, losses): """ The global model assigns a loss to each parse. The beam scores are additive, so the same gradient is applied to each action in the history. This gives the gradient of a single *action* for a beam state -- so we have "the gradient of loss for taking action i given history H." + + Histories: Each hitory is a list of actions + Each candidate has a history + Each beam has multiple candidates + Each batch has multiple beams + So history is list of lists of lists of ints """ - nr_step = max(len(hist) for hist in histories) - nr_beam = len(histories) - grads = [numpy.zeros((nr_beam, nr_class), dtype='f') for _ in range(nr_step)] - for hist, loss in zip(histories, losses): - key = tuple() - for j, clas in enumerate(hist): - grads[j][i, clas] = loss - key = key + clas - i = beam_map[key] + nr_step = len(beam_maps) + grads = [numpy.zeros((max(beam_map.values())+1, nr_class), dtype='f') + for beam_map in beam_maps] + for eg_id, hists in enumerate(histories): + for loss, hist in zip(losses[eg_id], hists): + key = tuple([eg_id]) + for j, clas in enumerate(hist): + i = beam_maps[j][key] + grads[j][i, clas] = loss + key = key + tuple([clas]) return grads diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 11584e4d2..c842ef00b 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -529,23 +529,29 @@ cdef class Parser: def update_beam(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): docs, tokvecs = docs_tokvecs + lengths = [len(d) for d in docs] tokvecs = self.model[0].ops.flatten(tokvecs) + states, golds, max_moves = self._init_gold_batch(docs, golds) cuda_stream = get_cuda_stream() - state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs, cuda_stream, 0.0) + state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0) - states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, - docs, tokvecs, golds, + states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, max_moves, + states, tokvecs, golds, state2vec, vec2scores, drop, sgd, losses) backprop_lower = [] for i, d_scores in enumerate(states_d_scores): ids, bp_vectors, bp_scores = backprops[i] d_vector = bp_scores(d_scores, sgd=sgd) - backprop_lower.append(( - get_async(cuda_stream, ids), - get_async(cuda_stream, d_vector), - bp_vectors)) + if isinstance(self.model[0].ops, CupyOps) \ + and not isinstance(ids, state2vec.ops.xp.ndarray): + backprop_lower.append(( + get_async(cuda_stream, ids), + get_async(cuda_stream, d_vector), + bp_vectors)) + else: + backprop_lower.append((ids, d_vector, bp_vectors)) d_tokvecs = self.model[0].ops.allocate(tokvecs.shape) self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream) lengths = [len(doc) for doc in docs] From 24b45b45c6bbbb42443d4eb91ec39062a22039d0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 17:15:28 -0500 Subject: [PATCH 38/49] Add test for beam update --- spacy/tests/parser/test_neural_parser.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index 42b55745f..30a6367c8 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -78,3 +78,16 @@ def test_predict_doc_beam(parser, tok2vec, model, doc): parser(doc, beam_width=32, beam_density=0.001) for word in doc: print(word.text, word.head, word.dep_) + + +def test_update_doc_beam(parser, 
tok2vec, model, doc, gold): + parser.model = model + tokvecs, bp_tokvecs = tok2vec.begin_update([doc]) + d_tokvecs = parser.update_beam(([doc], tokvecs), [gold]) + assert d_tokvecs[0].shape == tokvecs[0].shape + def optimize(weights, gradient, key=None): + weights -= 0.001 * gradient + bp_tokvecs(d_tokvecs, sgd=optimize) + assert d_tokvecs[0].sum() == 0. + + From c96d76983626ed1edcd4f513318469b7b7e6a191 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 18:21:54 -0500 Subject: [PATCH 39/49] Fix beam parse. Not sure if working --- spacy/syntax/_beam_utils.pyx | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx index 10b5e407c..3fcd322e2 100644 --- a/spacy/syntax/_beam_utils.pyx +++ b/spacy/syntax/_beam_utils.pyx @@ -87,8 +87,9 @@ cdef class ParserBeam(object): def _set_scores(self, Beam beam, scores): for i in range(beam.size): state = beam.at(i) - for j in range(beam.nr_class): - beam.scores[i][j] = scores[i, j] + if not state.is_final(): + for j in range(beam.nr_class): + beam.scores[i][j] = scores[i, j] self.moves.set_valid(beam.is_valid[i], state.c) def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False): @@ -137,8 +138,8 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps, backprops.append((token_ids, bp_vectors, bp_scores)) - p_scores = [scores[indices] for indices in p_indices] - g_scores = [scores[indices] for indices in g_indices] + p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices] + g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices] pbeam.advance(p_scores) gbeam.advance(g_scores, follow_gold=True) @@ -176,8 +177,8 @@ def get_states(pbeams, gbeams, beam_map): beam_map[key] = len(states) states.append(gbeam.at(i)) - p_indices = numpy.asarray(p_indices, dtype='i') - g_indices = numpy.asarray(g_indices, dtype='i') + p_indices = [numpy.asarray(idx, dtype='i') for idx in p_indices] + g_indices = [numpy.asarray(idx, dtype='i') for idx in g_indices] return states, p_indices, g_indices @@ -203,7 +204,9 @@ def get_gradient(nr_class, beam_maps, histories, losses): key = tuple([eg_id]) for j, clas in enumerate(hist): i = beam_maps[j][key] - grads[j][i, clas] = loss + # In step j, at state i action clas + # resulted in loss + grads[j][i, clas] += loss key = key + tuple([clas]) return grads From 28e930aae096407cf5dcb0cfda54bcbd881a551c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 19:22:52 -0500 Subject: [PATCH 40/49] Fixes for beam parsing. 
Not working --- spacy/syntax/_beam_utils.pyx | 41 ++++++++++++++++++++++-------------- spacy/syntax/nn_parser.pyx | 27 +++++++++++++++++++----- 2 files changed, 47 insertions(+), 21 deletions(-) diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx index 3fcd322e2..af4aff9fe 100644 --- a/spacy/syntax/_beam_utils.pyx +++ b/spacy/syntax/_beam_utils.pyx @@ -60,10 +60,16 @@ cdef class ParserBeam(object): st = beam.at(i) st.c.offset = state.c.offset self.beams.append(beam) + + def __dealloc__(self): + if self.beams is not None: + for beam in self.beams: + if beam is not None: + _cleanup(beam) @property def is_done(self): - return all(beam.is_done for beam in self.beams) + return all(b.is_done for b in self.beams) def __getitem__(self, i): return self.beams[i] @@ -77,28 +83,31 @@ cdef class ParserBeam(object): self._set_scores(beam, scores[i]) if self.golds is not None: self._set_costs(beam, self.golds[i], follow_gold=follow_gold) - if follow_gold: - assert self.golds is not None - beam.advance(_transition_state, NULL, self.moves.c) - else: - beam.advance(_transition_state, _hash_state, self.moves.c) - beam.check_done(_check_final_state, NULL) + if follow_gold: + assert self.golds is not None + beam.advance(_transition_state, NULL, self.moves.c) + else: + beam.advance(_transition_state, _hash_state, self.moves.c) + beam.check_done(_check_final_state, NULL) - def _set_scores(self, Beam beam, scores): + def _set_scores(self, Beam beam, float[:, ::1] scores): + cdef float* c_scores = &scores[0, 0] for i in range(beam.size): state = beam.at(i) if not state.is_final(): for j in range(beam.nr_class): - beam.scores[i][j] = scores[i, j] - self.moves.set_valid(beam.is_valid[i], state.c) + beam.scores[i][j] = c_scores[i * beam.nr_class + j] + self.moves.set_valid(beam.is_valid[i], state.c) def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False): for i in range(beam.size): state = beam.at(i) - self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold) - if follow_gold: - for j in range(beam.nr_class): - beam.is_valid[i][j] *= beam.costs[i][j] <= 0 + if not state.c.is_final(): + self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold) + if follow_gold: + for j in range(beam.nr_class): + if beam.costs[i][j] >= 1: + beam.is_valid[i][j] = 0 def get_token_ids(states, int n_tokens): @@ -122,7 +131,7 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps, pbeam = ParserBeam(moves, states, golds, width=width, density=density) gbeam = ParserBeam(moves, states, golds, - width=width, density=density) + width=width, density=0.0) beam_maps = [] backprops = [] violns = [MaxViolation() for _ in range(len(states))] @@ -145,7 +154,7 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps, for i, violn in enumerate(violns): violn.check_crf(pbeam[i], gbeam[i]) - + histories = [(v.p_hist + v.g_hist) for v in violns] losses = [(v.p_probs + v.g_probs) for v in violns] states_d_scores = get_gradient(moves.n_moves, beam_maps, diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index c842ef00b..fa954a879 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -66,6 +66,7 @@ from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG from . 
import _beam_utils USE_FINE_TUNE = True +BEAM_PARSE = True def get_templates(*args, **kwargs): return [] @@ -335,7 +336,7 @@ cdef class Parser: return output def pipe(self, docs, int batch_size=1000, int n_threads=2, - beam_width=1, beam_density=0.001): + beam_width=4, beam_density=0.001): """ Process a stream of documents. @@ -348,14 +349,18 @@ cdef class Parser: Yields (Doc): Documents, in order. """ cdef Doc doc + cdef Beam beam for docs in cytoolz.partition_all(batch_size, docs): docs = list(docs) tokvecs = [doc.tensor for doc in docs] if beam_width == 1: parse_states = self.parse_batch(docs, tokvecs) else: - parse_states = self.beam_parse(docs, tokvecs, - beam_width=beam_width, beam_density=beam_density) + beams = self.beam_parse(docs, tokvecs, + beam_width=beam_width, beam_density=beam_density) + parse_states = [] + for beam in beams: + parse_states.append(beam.at(0)) self.set_annotations(docs, parse_states) yield from docs @@ -462,6 +467,9 @@ cdef class Parser: return beams def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): + if BEAM_PARSE: + return self.update_beam(docs_tokvecs, golds, drop=drop, sgd=sgd, + losses=losses) if losses is not None and self.name not in losses: losses[self.name] = 0. docs, tokvec_lists = docs_tokvecs @@ -528,9 +536,16 @@ cdef class Parser: return d_tokvecs def update_beam(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): + if losses is not None and self.name not in losses: + losses[self.name] = 0. docs, tokvecs = docs_tokvecs lengths = [len(d) for d in docs] tokvecs = self.model[0].ops.flatten(tokvecs) + if USE_FINE_TUNE: + my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) + my_tokvecs = self.model[0].ops.flatten(my_tokvecs) + tokvecs += my_tokvecs + states, golds, max_moves = self._init_gold_batch(docs, golds) cuda_stream = get_cuda_stream() @@ -554,8 +569,10 @@ cdef class Parser: backprop_lower.append((ids, d_vector, bp_vectors)) d_tokvecs = self.model[0].ops.allocate(tokvecs.shape) self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream) - lengths = [len(doc) for doc in docs] - return self.model[0].ops.unflatten(d_tokvecs, lengths) + d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths) + if USE_FINE_TUNE: + bp_my_tokvecs(d_tokvecs, sgd=sgd) + return d_tokvecs def _init_gold_batch(self, whole_docs, whole_golds): """Make a square batch, of length equal to the shortest doc. A long From 3e30712b627ea5c5625f4eeeba125b38722bd67a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 19:24:17 -0500 Subject: [PATCH 41/49] Improve defaults --- spacy/pipeline.pyx | 2 +- spacy/syntax/nn_parser.pyx | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index f367d2b5b..634d3e4b5 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -138,7 +138,7 @@ class TokenVectorEncoder(BaseThincComponent): name = 'tensorizer' @classmethod - def Model(cls, width=128, embed_size=7500, **cfg): + def Model(cls, width=128, embed_size=4000, **cfg): """Create a new statistical model for the class. width (int): Output size of the model. diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index fa954a879..8a33a9da1 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -238,12 +238,12 @@ cdef class Parser: Base class of the DependencyParser and EntityRecognizer. 
""" @classmethod - def Model(cls, nr_class, token_vector_width=128, hidden_width=128, depth=1, **cfg): + def Model(cls, nr_class, token_vector_width=128, hidden_width=300, depth=1, **cfg): depth = util.env_opt('parser_hidden_depth', depth) token_vector_width = util.env_opt('token_vector_width', token_vector_width) hidden_width = util.env_opt('hidden_width', hidden_width) parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2) - embed_size = util.env_opt('embed_size', 7500) + embed_size = util.env_opt('embed_size', 4000) tensors = fine_tune(Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats())) if parser_maxout_pieces == 1: From 17874fe4918eeef757bae153342468e166ba9c96 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 12 Aug 2017 19:35:40 -0500 Subject: [PATCH 42/49] Disable beam parsing --- spacy/syntax/nn_parser.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 8a33a9da1..ea61af1df 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -66,7 +66,7 @@ from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG from . import _beam_utils USE_FINE_TUNE = True -BEAM_PARSE = True +BEAM_PARSE = False def get_templates(*args, **kwargs): return [] @@ -336,7 +336,7 @@ cdef class Parser: return output def pipe(self, docs, int batch_size=1000, int n_threads=2, - beam_width=4, beam_density=0.001): + beam_width=1, beam_density=0.001): """ Process a stream of documents. From 92ebab6073f29fd919306e9f5775e8f8842692f8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 13 Aug 2017 08:56:02 +0200 Subject: [PATCH 43/49] Update beam-update tests --- spacy/tests/parser/test_nn_beam.py | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py index ad0dfa7a1..45c85d969 100644 --- a/spacy/tests/parser/test_nn_beam.py +++ b/spacy/tests/parser/test_nn_beam.py @@ -8,6 +8,7 @@ from ...syntax.arc_eager import ArcEager from ...tokens import Doc from ...gold import GoldParse from ...syntax._beam_utils import ParserBeam, update_beam +from ...syntax.stateclass import StateClass @pytest.fixture @@ -27,6 +28,10 @@ def moves(vocab): def docs(vocab): return [Doc(vocab, words=['Rats', 'bite', 'things'])] +@pytest.fixture +def states(docs): + return [StateClass(doc) for doc in docs] + @pytest.fixture def tokvecs(docs, vector_size): output = [] @@ -57,8 +62,8 @@ def vector_size(): @pytest.fixture -def beam(moves, docs, golds, beam_width): - return ParserBeam(moves, docs, golds, width=beam_width) +def beam(moves, states, golds, beam_width): + return ParserBeam(moves, states, golds, width=beam_width) @pytest.fixture def scores(moves, batch_size, beam_width): @@ -80,19 +85,3 @@ def test_beam_advance(beam, scores): def test_beam_advance_too_few_scores(beam, scores): with pytest.raises(IndexError): beam.advance(scores[:-1]) - - -def test_update_beam(moves, docs, tokvecs, golds, vector_size): - @layerize - def state2vec(X, drop=0.): - vec = numpy.ones((X.shape[0], vector_size), dtype='f') - return vec, None - @layerize - def vec2scores(X, drop=0.): - scores = numpy.ones((X.shape[0], moves.n_moves), dtype='f') - return scores, None - d_loss, backprops = update_beam(moves, 13, docs, tokvecs, golds, - state2vec, vec2scores, drop=0.0, sgd=None, - losses={}, width=4, density=0.001) - - From 4ae0d5e1e63903613ed24aa7fea0fe5593c30fe8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 
13 Aug 2017 09:03:38 +0200 Subject: [PATCH 44/49] Set defaults for convert command --- spacy/cli/convert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index a0a76e5ec..fef6753e6 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -21,10 +21,10 @@ CONVERTERS = { @plac.annotations( input_file=("input file", "positional", None, str), output_dir=("output directory for converted file", "positional", None, str), - n_sents=("Number of sentences per doc", "option", "n", float), + n_sents=("Number of sentences per doc", "option", "n", int), morphology=("Enable appending morphology to tags", "flag", "m", bool) ) -def convert(cmd, input_file, output_dir, n_sents, morphology): +def convert(cmd, input_file, output_dir, n_sents=1, morphology=False): """ Convert files into JSON format for use with train command and other experiment management functions. From 12de2638137c1c8c9f86b687d6296138e0aaa0ea Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 13 Aug 2017 09:33:39 +0200 Subject: [PATCH 45/49] Bug fixes to beam parsing. Learns small sample --- spacy/syntax/_beam_utils.pyx | 81 +++++++++++++++++++++++++++--------- 1 file changed, 61 insertions(+), 20 deletions(-) diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx index af4aff9fe..0a513531d 100644 --- a/spacy/syntax/_beam_utils.pyx +++ b/spacy/syntax/_beam_utils.pyx @@ -66,7 +66,7 @@ cdef class ParserBeam(object): for beam in self.beams: if beam is not None: _cleanup(beam) - + @property def is_done(self): return all(b.is_done for b in self.beams) @@ -80,6 +80,8 @@ cdef class ParserBeam(object): def advance(self, scores, follow_gold=False): cdef Beam beam for i, beam in enumerate(self.beams): + if beam.is_done: + continue self._set_scores(beam, scores[i]) if self.golds is not None: self._set_costs(beam, self.golds[i], follow_gold=follow_gold) @@ -108,7 +110,22 @@ cdef class ParserBeam(object): for j in range(beam.nr_class): if beam.costs[i][j] >= 1: beam.is_valid[i][j] = 0 - + + +def is_gold(StateClass state, GoldParse gold, strings): + predicted = set() + truth = set() + for i in range(gold.length): + if gold.cand_to_gold[i] is None: + continue + if state.safe_get(i).dep: + predicted.add((i, state.H(i), strings[state.safe_get(i).dep])) + else: + predicted.add((i, state.H(i), 'ROOT')) + id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]] + truth.add((id_, head, dep)) + return truth == predicted + def get_token_ids(states, int n_tokens): cdef StateClass state @@ -123,11 +140,13 @@ def get_token_ids(states, int n_tokens): c_ids += ids.shape[1] return ids - +nr_update = 0 def update_beam(TransitionSystem moves, int nr_feature, int max_steps, states, tokvecs, golds, state2vec, vec2scores, drop=0., sgd=None, losses=None, int width=4, float density=0.001): + global nr_update + nr_update += 1 pbeam = ParserBeam(moves, states, golds, width=width, density=density) gbeam = ParserBeam(moves, states, golds, @@ -139,8 +158,9 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps, if pbeam.is_done and gbeam.is_done: break beam_maps.append({}) - states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1]) - + states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update) + if not states: + break token_ids = get_token_ids(states, nr_feature) vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop) scores, bp_scores = vec2scores.begin_update(vectors, drop=drop) @@ -154,6 +174,16 @@ 
def update_beam(TransitionSystem moves, int nr_feature, int max_steps, for i, violn in enumerate(violns): violn.check_crf(pbeam[i], gbeam[i]) + # The non-monotonic oracle makes it difficult to ensure final costs are + # correct. Therefore do final correction + cdef Beam pred + for i, (pred, gold_parse) in enumerate(zip(pbeam, golds)): + for j in range(pred.size): + if is_gold(pred.at(j), gold_parse, moves.strings): + pred._states[j].loss = 0.0 + elif pred._states[j].loss == 0.0: + pred._states[j].loss = 1.0 + violn.check_crf(pred, gbeam[i]) histories = [(v.p_hist + v.g_hist) for v in violns] losses = [(v.p_probs + v.g_probs) for v in violns] @@ -162,30 +192,35 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps, return states_d_scores, backprops -def get_states(pbeams, gbeams, beam_map): +def get_states(pbeams, gbeams, beam_map, nr_update): seen = {} states = [] p_indices = [] g_indices = [] cdef Beam pbeam, gbeam for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)): + if pbeam.loss > 0 and pbeam.min_score > (gbeam.score + nr_update): + continue p_indices.append([]) for j in range(pbeam.size): - key = tuple([eg_id] + pbeam.histories[j]) - seen[key] = len(states) - p_indices[-1].append(len(states)) - states.append(pbeam.at(j)) + state = pbeam.at(j) + if not state.is_final(): + key = tuple([eg_id] + pbeam.histories[j]) + seen[key] = len(states) + p_indices[-1].append(len(states)) + states.append(pbeam.at(j)) beam_map.update(seen) g_indices.append([]) for i in range(gbeam.size): - key = tuple([eg_id] + gbeam.histories[i]) - if key in seen: - g_indices[-1].append(seen[key]) - else: - g_indices[-1].append(len(states)) - beam_map[key] = len(states) - states.append(gbeam.at(i)) - + state = gbeam.at(j) + if not state.is_final(): + key = tuple([eg_id] + gbeam.histories[i]) + if key in seen: + g_indices[-1].append(seen[key]) + else: + g_indices[-1].append(len(states)) + beam_map[key] = len(states) + states.append(gbeam.at(i)) p_indices = [numpy.asarray(idx, dtype='i') for idx in p_indices] g_indices = [numpy.asarray(idx, dtype='i') for idx in g_indices] return states, p_indices, g_indices @@ -206,12 +241,18 @@ def get_gradient(nr_class, beam_maps, histories, losses): So history is list of lists of lists of ints """ nr_step = len(beam_maps) - grads = [numpy.zeros((max(beam_map.values())+1, nr_class), dtype='f') - for beam_map in beam_maps] + grads = [] + for beam_map in beam_maps: + if beam_map: + grads.append(numpy.zeros((max(beam_map.values())+1, nr_class), dtype='f')) + else: + grads.append(None) for eg_id, hists in enumerate(histories): for loss, hist in zip(losses[eg_id], hists): key = tuple([eg_id]) for j, clas in enumerate(hist): + if grads[j] is None: + continue i = beam_maps[j][key] # In step j, at state i action clas # resulted in loss From 4363b4aa4a757807831d89e1f1b10bc46e8bc69a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 13 Aug 2017 12:36:55 +0200 Subject: [PATCH 46/49] Fix redundant tokvecs updates during update --- spacy/language.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 6d97f41fe..cb679a2bc 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -305,14 +305,17 @@ class Language(object): grads[key] = (W, dW) pipes = list(self.pipeline[1:]) random.shuffle(pipes) + tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop) + all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses] for proc in pipes: if not hasattr(proc, 'update'): 
continue - tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop) d_tokvecses = proc.update((docs, tokvecses), golds, drop=drop, sgd=get_grads, losses=losses) if update_tensors and d_tokvecses is not None: - bp_tokvecses(d_tokvecses, sgd=sgd) + for i, d_tv in enumerate(d_tokvecses): + all_d_tokvecses[i] += d_tv + bp_tokvecses(all_d_tokvecses, sgd=sgd) for key, (W, dW) in grads.items(): sgd(W, dW, key=key) # Clear the tensor variable, to free GPU memory. From 6a42cc16ff673c738e29aa515c1623dde4cf9566 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 13 Aug 2017 12:37:26 +0200 Subject: [PATCH 47/49] Fix beam parser, improve efficiency of non-beam --- spacy/syntax/_beam_utils.pyx | 39 ++++++++++++------------------------ spacy/syntax/beam_parser.pyx | 14 +------------ spacy/syntax/nn_parser.pyx | 38 +++++++++++++++++++++++------------ 3 files changed, 39 insertions(+), 52 deletions(-) diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx index 0a513531d..6df8d472f 100644 --- a/spacy/syntax/_beam_utils.pyx +++ b/spacy/syntax/_beam_utils.pyx @@ -1,4 +1,5 @@ # cython: infer_types=True +# cython: profile=True cimport numpy as np import numpy from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF @@ -155,8 +156,6 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps, backprops = [] violns = [MaxViolation() for _ in range(len(states))] for t in range(max_steps): - if pbeam.is_done and gbeam.is_done: - break beam_maps.append({}) states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update) if not states: @@ -174,16 +173,6 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps, for i, violn in enumerate(violns): violn.check_crf(pbeam[i], gbeam[i]) - # The non-monotonic oracle makes it difficult to ensure final costs are - # correct. 
Therefore do final correction - cdef Beam pred - for i, (pred, gold_parse) in enumerate(zip(pbeam, golds)): - for j in range(pred.size): - if is_gold(pred.at(j), gold_parse, moves.strings): - pred._states[j].loss = 0.0 - elif pred._states[j].loss == 0.0: - pred._states[j].loss = 1.0 - violn.check_crf(pred, gbeam[i]) histories = [(v.p_hist + v.g_hist) for v in violns] losses = [(v.p_probs + v.g_probs) for v in violns] @@ -199,20 +188,18 @@ def get_states(pbeams, gbeams, beam_map, nr_update): g_indices = [] cdef Beam pbeam, gbeam for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)): - if pbeam.loss > 0 and pbeam.min_score > (gbeam.score + nr_update): - continue p_indices.append([]) - for j in range(pbeam.size): - state = pbeam.at(j) + for i in range(pbeam.size): + state = pbeam.at(i) if not state.is_final(): - key = tuple([eg_id] + pbeam.histories[j]) + key = tuple([eg_id] + pbeam.histories[i]) seen[key] = len(states) p_indices[-1].append(len(states)) - states.append(pbeam.at(j)) + states.append(pbeam.at(i)) beam_map.update(seen) g_indices.append([]) for i in range(gbeam.size): - state = gbeam.at(j) + state = gbeam.at(i) if not state.is_final(): key = tuple([eg_id] + gbeam.histories[i]) if key in seen: @@ -243,17 +230,17 @@ def get_gradient(nr_class, beam_maps, histories, losses): nr_step = len(beam_maps) grads = [] for beam_map in beam_maps: - if beam_map: - grads.append(numpy.zeros((max(beam_map.values())+1, nr_class), dtype='f')) - else: - grads.append(None) + grads.append(numpy.zeros((max(beam_map.values())+1, nr_class), dtype='f')) + assert len(histories) == len(losses) for eg_id, hists in enumerate(histories): for loss, hist in zip(losses[eg_id], hists): key = tuple([eg_id]) for j, clas in enumerate(hist): - if grads[j] is None: - continue - i = beam_maps[j][key] + try: + i = beam_maps[j][key] + except: + print(sorted(beam_maps[j].items())) + raise # In step j, at state i action clas # resulted in loss grads[j][i, clas] += loss diff --git a/spacy/syntax/beam_parser.pyx b/spacy/syntax/beam_parser.pyx index e96e28fcf..f4f66f9fb 100644 --- a/spacy/syntax/beam_parser.pyx +++ b/spacy/syntax/beam_parser.pyx @@ -34,6 +34,7 @@ from ._parse_features cimport CONTEXT_SIZE from ._parse_features cimport fill_context from .stateclass cimport StateClass from .parser cimport Parser +from ._beam_utils import is_gold DEBUG = False @@ -237,16 +238,3 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio raise Exception("Gold parse is not gold-standard") -def is_gold(StateClass state, GoldParse gold, StringStore strings): - predicted = set() - truth = set() - for i in range(gold.length): - if gold.cand_to_gold[i] is None: - continue - if state.safe_get(i).dep: - predicted.add((i, state.H(i), strings[state.safe_get(i).dep])) - else: - predicted.add((i, state.H(i), 'ROOT')) - id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]] - truth.add((id_, head, dep)) - return truth == predicted diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index ea61af1df..51fd61cc1 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -66,7 +66,7 @@ from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG from . import _beam_utils USE_FINE_TUNE = True -BEAM_PARSE = False +BEAM_PARSE = True def get_templates(*args, **kwargs): return [] @@ -348,6 +348,8 @@ cdef class Parser: The number of threads with which to work on the buffer in parallel. Yields (Doc): Documents, in order. 
""" + if BEAM_PARSE: + beam_width = 8 cdef Doc doc cdef Beam beam for docs in cytoolz.partition_all(batch_size, docs): @@ -439,6 +441,8 @@ cdef class Parser: cuda_stream, 0.0) beams = [] cdef int offset = 0 + cdef int j = 0 + cdef int k for doc in docs: beam = Beam(nr_class, beam_width, min_density=beam_density) beam.initialize(self.moves.init_beam_state, doc.length, doc.c) @@ -451,16 +455,22 @@ cdef class Parser: states = [] for i in range(beam.size): stcls = beam.at(i) - states.append(stcls) + # This way we avoid having to score finalized states + # We do have to take care to keep indexes aligned, though + if not stcls.is_final(): + states.append(stcls) token_ids = self.get_token_ids(states) vectors = state2vec(token_ids) scores = vec2scores(vectors) + j = 0 + c_scores = scores.data for i in range(beam.size): stcls = beam.at(i) if not stcls.is_final(): self.moves.set_valid(beam.is_valid[i], stcls.c) - for j in range(nr_class): - beam.scores[i][j] = scores[i, j] + for k in range(nr_class): + beam.scores[i][k] = c_scores[j * scores.shape[1] + k] + j += 1 beam.advance(_transition_state, _hash_state, self.moves.c) beam.check_done(_check_final_state, NULL) beams.append(beam) @@ -540,6 +550,7 @@ cdef class Parser: losses[self.name] = 0. docs, tokvecs = docs_tokvecs lengths = [len(d) for d in docs] + assert min(lengths) >= 1 tokvecs = self.model[0].ops.flatten(tokvecs) if USE_FINE_TUNE: my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) @@ -554,9 +565,14 @@ cdef class Parser: states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, max_moves, states, tokvecs, golds, state2vec, vec2scores, - drop, sgd, losses) + drop, sgd, losses, + width=8) backprop_lower = [] for i, d_scores in enumerate(states_d_scores): + if d_scores is None: + continue + if losses is not None: + losses[self.name] += (d_scores**2).sum() ids, bp_vectors, bp_scores = backprops[i] d_vector = bp_scores(d_scores, sgd=sgd) if isinstance(self.model[0].ops, CupyOps) \ @@ -617,14 +633,10 @@ cdef class Parser: xp = get_array_module(d_tokvecs) for ids, d_vector, bp_vector in backprops: d_state_features = bp_vector(d_vector, sgd=sgd) - active_feats = ids * (ids >= 0) - active_feats = active_feats.reshape((ids.shape[0], ids.shape[1], 1)) - if hasattr(xp, 'scatter_add'): - xp.scatter_add(d_tokvecs, - ids, d_state_features * active_feats) - else: - xp.add.at(d_tokvecs, - ids, d_state_features * active_feats) + mask = ids >= 0 + indices = xp.nonzero(mask) + self.model[0].ops.scatter_add(d_tokvecs, ids[indices], + d_state_features[indices]) @property def move_names(self): From 0ae045256df6f735b0a301914d87b5c26e2520d5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 13 Aug 2017 18:02:05 -0500 Subject: [PATCH 48/49] Fix beam training --- spacy/syntax/_beam_utils.pyx | 59 +++++++++++++++++++++++++----------- spacy/syntax/nn_parser.pyx | 8 ++--- 2 files changed, 45 insertions(+), 22 deletions(-) diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx index 6df8d472f..e77036e55 100644 --- a/spacy/syntax/_beam_utils.pyx +++ b/spacy/syntax/_beam_utils.pyx @@ -57,7 +57,7 @@ cdef class ParserBeam(object): for state in states: beam = Beam(self.moves.n_moves, width, density) beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent) - for i in range(beam.size): + for i in range(beam.width): st = beam.at(i) st.c.offset = state.c.offset self.beams.append(beam) @@ -81,7 +81,7 @@ cdef class ParserBeam(object): def advance(self, scores, follow_gold=False): cdef 
Beam beam for i, beam in enumerate(self.beams): - if beam.is_done: + if beam.is_done or not scores[i].size: continue self._set_scores(beam, scores[i]) if self.golds is not None: @@ -92,6 +92,12 @@ cdef class ParserBeam(object): else: beam.advance(_transition_state, _hash_state, self.moves.c) beam.check_done(_check_final_state, NULL) + if beam.is_done: + for j in range(beam.size): + if is_gold(beam.at(j), self.golds[i], self.moves.strings): + beam._states[j].loss = 0.0 + elif beam._states[j].loss == 0.0: + beam._states[j].loss = 1.0 def _set_scores(self, Beam beam, float[:, ::1] scores): cdef float* c_scores = &scores[0, 0] @@ -152,32 +158,49 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps, width=width, density=density) gbeam = ParserBeam(moves, states, golds, width=width, density=0.0) + cdef StateClass state beam_maps = [] backprops = [] violns = [MaxViolation() for _ in range(len(states))] for t in range(max_steps): + # The beam maps let us find the right row in the flattened scores + # arrays for each state. States are identified by (example id, history). + # We keep a different beam map for each step (since we'll have a flat + # scores array for each step). The beam map will let us take the per-state + # losses, and compute the gradient for each (step, state, class). beam_maps.append({}) + # Gather all states from the two beams in a list. Some stats may occur + # in both beams. To figure out which beam each state belonged to, + # we keep two lists of indices, p_indices and g_indices states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update) if not states: break + # Now that we have our flat list of states, feed them through the model token_ids = get_token_ids(states, nr_feature) vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop) scores, bp_scores = vec2scores.begin_update(vectors, drop=drop) + # Store the callbacks for the backward pass backprops.append((token_ids, bp_vectors, bp_scores)) + # Unpack the flat scores into lists for the two beams. The indices arrays + # tell us which example and state the scores-row refers to. p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices] g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices] + # Now advance the states in the beams. The gold beam is contrained to + # to follow only gold analyses. pbeam.advance(p_scores) gbeam.advance(g_scores, follow_gold=True) - + # Track the "maximum violation", to use in the update. 
for i, violn in enumerate(violns): violn.check_crf(pbeam[i], gbeam[i]) - histories = [(v.p_hist + v.g_hist) for v in violns] - losses = [(v.p_probs + v.g_probs) for v in violns] + # Only make updates if we have non-gold states + histories = [((v.p_hist + v.g_hist) if v.p_hist else []) for v in violns] + losses = [((v.p_probs + v.g_probs) if v.p_probs else []) for v in violns] states_d_scores = get_gradient(moves.n_moves, beam_maps, histories, losses) + assert len(states_d_scores) == len(backprops), (len(states_d_scores), len(backprops)) return states_d_scores, backprops @@ -187,17 +210,20 @@ def get_states(pbeams, gbeams, beam_map, nr_update): p_indices = [] g_indices = [] cdef Beam pbeam, gbeam + assert len(pbeams) == len(gbeams) for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)): p_indices.append([]) + g_indices.append([]) + if pbeam.loss > 0 and pbeam.min_score > gbeam.score: + continue for i in range(pbeam.size): state = pbeam.at(i) if not state.is_final(): key = tuple([eg_id] + pbeam.histories[i]) seen[key] = len(states) p_indices[-1].append(len(states)) - states.append(pbeam.at(i)) + states.append(state) beam_map.update(seen) - g_indices.append([]) for i in range(gbeam.size): state = gbeam.at(i) if not state.is_final(): @@ -207,10 +233,10 @@ def get_states(pbeams, gbeams, beam_map, nr_update): else: g_indices[-1].append(len(states)) beam_map[key] = len(states) - states.append(gbeam.at(i)) - p_indices = [numpy.asarray(idx, dtype='i') for idx in p_indices] - g_indices = [numpy.asarray(idx, dtype='i') for idx in g_indices] - return states, p_indices, g_indices + states.append(state) + p_idx = [numpy.asarray(idx, dtype='i') for idx in p_indices] + g_idx = [numpy.asarray(idx, dtype='i') for idx in g_indices] + return states, p_idx, g_idx def get_gradient(nr_class, beam_maps, histories, losses): @@ -230,20 +256,17 @@ def get_gradient(nr_class, beam_maps, histories, losses): nr_step = len(beam_maps) grads = [] for beam_map in beam_maps: - grads.append(numpy.zeros((max(beam_map.values())+1, nr_class), dtype='f')) + if beam_map: + grads.append(numpy.zeros((max(beam_map.values())+1, nr_class), dtype='f')) assert len(histories) == len(losses) for eg_id, hists in enumerate(histories): for loss, hist in zip(losses[eg_id], hists): key = tuple([eg_id]) for j, clas in enumerate(hist): - try: - i = beam_maps[j][key] - except: - print(sorted(beam_maps[j].items())) - raise + i = beam_maps[j][key] # In step j, at state i action clas # resulted in loss - grads[j][i, clas] += loss + grads[j][i, clas] += loss / len(histories) key = key + tuple([clas]) return grads diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 51fd61cc1..a193c96a3 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -557,20 +557,20 @@ cdef class Parser: my_tokvecs = self.model[0].ops.flatten(my_tokvecs) tokvecs += my_tokvecs - states, golds, max_moves = self._init_gold_batch(docs, golds) + states = self.moves.init_batch(docs) + for gold in golds: + self.moves.preprocess_gold(gold) cuda_stream = get_cuda_stream() state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0) - states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, max_moves, + states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500, states, tokvecs, golds, state2vec, vec2scores, drop, sgd, losses, width=8) backprop_lower = [] for i, d_scores in enumerate(states_d_scores): - if d_scores is None: - continue if losses is not None: 
losses[self.name] += (d_scores**2).sum() ids, bp_vectors, bp_scores = backprops[i] From ac6c25f7629011c9a51692e684b1e1db3422585d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 14 Aug 2017 12:09:18 +0200 Subject: [PATCH 49/49] Check SGD is not None in update --- spacy/_ml.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index e37bcac52..91b530fad 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -381,7 +381,8 @@ def fine_tune(embedding, combine=None): flat_grad = model.ops.flatten(d_output) model.d_mix[1] += flat_tokvecs.dot(flat_grad.T).sum() model.d_mix[0] += flat_vecs.dot(flat_grad.T).sum() - sgd(model._mem.weights, model._mem.gradient, key=model.id) + if sgd is not None: + sgd(model._mem.weights, model._mem.gradient, key=model.id) return d_output return output, fine_tune_bwd model = wrap(fine_tune_fwd, embedding)
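
The drop_layer() wrapper introduced around patches 32-33 is easier to see on toy data. The sketch below is a plain-numpy illustration of the idea, not the thinc API: with probability proportional to the dropout rate the whole layer is skipped and the input passes through unchanged (a stochastic-depth style shortcut), otherwise the layer runs with its usual dropout. The names drop_layer_fwd, relu and the factor default are illustrative only.

    import numpy

    def drop_layer_fwd(layer_fwd, X, drop=0.1, factor=2.0, rng=numpy.random):
        # Skip the layer entirely with probability drop * factor,
        # otherwise apply it with the normal dropout rate.
        if rng.uniform() < drop * factor:
            return X
        return layer_fwd(X, drop=drop)

    relu = lambda X, drop=0.: numpy.maximum(X, 0)
    print(drop_layer_fwd(relu, numpy.array([-1.0, 2.0])))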
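
The gradient bookkeeping for beam training (patches 45-48) is likewise easier to follow on a toy example. The snippet below is a numpy-only restatement of the get_gradient() logic, not spaCy code, and omits details such as empty trailing steps: beam_maps[step] maps an (example id, action, ...) history prefix to the row that state occupied in that step's score matrix, and every action in a candidate's history receives that candidate's loss.

    import numpy

    def toy_beam_gradients(nr_class, beam_maps, histories, losses):
        grads = [numpy.zeros((max(m.values()) + 1, nr_class), dtype='f')
                 for m in beam_maps]
        for eg_id, hists in enumerate(histories):
            for loss, hist in zip(losses[eg_id], hists):
                key = (eg_id,)
                for step, action in enumerate(hist):
                    row = beam_maps[step][key]            # row this state occupied at this step
                    grads[step][row, action] += loss / len(histories)
                    key = key + (action,)                 # extend the history prefix
        return grads

    # One example, two candidates of two actions each; only the candidate
    # that took actions (0, 1) is penalised.
    beam_maps = [{(0,): 0}, {(0, 0): 0, (0, 1): 1}]
    histories = [[[0, 1], [1, 0]]]
    losses = [[1.0, 0.0]]
    for step, grad in enumerate(toy_beam_gradients(2, beam_maps, histories, losses)):
        print(step, grad)

Step 0's gradient puts the loss on action 0 of the single initial state; step 1's puts it on action 1 of the row the penalised candidate had reached.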
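
Patch 47 also switches Parser._make_updates() to a masked scatter-add when pushing the state-feature gradients back onto the token vectors. A minimal numpy analogue of that accumulation, using numpy.add.at as a stand-in for ops.scatter_add (array names and sizes here are made up), looks like this:

    import numpy

    d_tokvecs = numpy.zeros((5, 3), dtype='f')               # one row per token
    ids = numpy.array([[0, 2, -1], [1, -1, -1]])              # feature token ids, -1 = padding
    d_state_features = numpy.ones((2, 3, 3), dtype='f')       # (state, feature, vector width)

    mask = ids >= 0                                            # drop the padding features
    idx = numpy.nonzero(mask)
    numpy.add.at(d_tokvecs, ids[idx], d_state_features[idx])  # accumulate per-token gradients
    print(d_tokvecs)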