From dbf2a4cf577f0e66bf1591289728ed4ec56d1c5c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 25 May 2017 19:46:56 -0500
Subject: [PATCH 1/5] Update all models on each epoch

---
 spacy/language.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index b20bb4617..1d9f232a7 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -223,8 +223,7 @@ class Language(object):
             tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
             d_tokvecses = proc.update((docs, tokvecses), golds,
                                       drop=drop, sgd=get_grads, losses=losses)
-            bp_tokvecses(d_tokvecses, sgd=get_grads)
-            break
+            bp_tokvecses(d_tokvecses, sgd=sgd)
         for key, (W, dW) in grads.items():
             sgd(W, dW, key=key)
         # Clear the tensor variable, to free GPU memory.

From 22d7b448a541863efd62b60e3b674f2a1b356af7 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 25 May 2017 19:47:12 -0500
Subject: [PATCH 2/5] Fix convert command

---
 spacy/cli/convert.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index c7730ab9e..847051e3f 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -25,7 +25,7 @@ CONVERTERS = {
     n_sents=("Number of sentences per doc", "option", "n", float),
     morphology=("Enable appending morphology to tags", "flag", "m", bool)
 )
-def convert(input_file, output_dir, n_sents, morphology):
+def convert(_, input_file, output_dir, n_sents, morphology):
     """Convert files into JSON format for use with train command and other
     experiment management functions.
     """
@@ -39,4 +39,4 @@ def convert(input_file, output_dir, n_sents, morphology):
     if not file_ext in CONVERTERS:
         prints("Can't find converter for %s" % input_path.parts[-1],
                title="Unknown format", exits=1)
-    CONVERTERS[file_ext](input_path, output_path, *args)
+    CONVERTERS[file_ext](input_path, output_path, n_sents, morphology)
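Patch 1's one-line change is easier to follow against a sketch of the
surrounding update loop. The following is a minimal, hypothetical Python
sketch (get_features and the procs list are stand-ins, not spaCy's exact
API) of the accumulate-then-apply pattern the hunk restores: every pipeline
component backprops into the shared token-vector layer, while get_grads
collects weight gradients so the optimizer applies each one exactly once
per batch.

    def update_shared_tok2vec(tok2vec, procs, docs, golds, sgd, drop=0.0):
        grads = {}
        def get_grads(W, dW, key=None):
            grads[key] = (W, dW)    # accumulate; applied once below
        feats = tok2vec.get_features(docs)  # hypothetical helper
        tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
        for proc in procs:
            d_tokvecses = proc.update((docs, tokvecses), golds, drop=drop,
                                      sgd=get_grads)
            # The fix: backprop through tok2vec for every component (the old
            # `break` stopped after the first), and let the real optimizer
            # update the tok2vec weights (sgd=sgd, not the accumulator).
            bp_tokvecses(d_tokvecses, sgd=sgd)
        for key, (W, dW) in grads.items():
            sgd(W, dW, key=key)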
From d65f99a72016cb6eb9b0fe18172abf206dc738a9 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 26 May 2017 05:52:09 -0500
Subject: [PATCH 3/5] Improve model saving in train script

---
 spacy/cli/train.py | 43 +++++++++++++++++++++++++------------------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index ee0ee53a2..b25cdcbd5 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -57,9 +57,9 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
     # starts high and decays sharply, to force the optimizer to explore.
     # Batch size starts at 1 and grows, so that we make updates quickly
     # at the beginning of training.
-    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.5),
+    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                   util.env_opt('dropout_to', 0.2),
-                                  util.env_opt('dropout_decay', 1e-4))
+                                  util.env_opt('dropout_decay', 0.0))
     batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                    util.env_opt('batch_to', 64),
                                    util.env_opt('batch_compound', 1.001))
@@ -71,23 +71,30 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,

     optimizer = nlp.begin_training(lambda: corpus.train_tuples, use_gpu=use_gpu)
     print("Itn.\tDep. Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
-    for i in range(n_iter):
-        with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar:
-            train_docs = corpus.train_docs(nlp, projectivize=True,
-                                           gold_preproc=False, shuffle=i)
-            losses = {}
-            for batch in minibatch(train_docs, size=batch_sizes):
-                docs, golds = zip(*batch)
-                nlp.update(docs, golds, sgd=optimizer,
-                           drop=next(dropout_rates), losses=losses)
-                pbar.update(len(docs))
+    try:
+        for i in range(n_iter):
+            with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar:
+                train_docs = corpus.train_docs(nlp, projectivize=True,
+                                               gold_preproc=False, max_length=1000)
+                losses = {}
+                for batch in minibatch(train_docs, size=batch_sizes):
+                    docs, golds = zip(*batch)
+                    nlp.update(docs, golds, sgd=optimizer,
+                               drop=next(dropout_rates), losses=losses)
+                    pbar.update(len(docs))

-        with nlp.use_params(optimizer.averages):
-            scorer = nlp.evaluate(corpus.dev_docs(nlp, gold_preproc=False))
-        print_progress(i, losses, scorer.scores)
-        with (output_path / 'model.bin').open('wb') as file_:
-            with nlp.use_params(optimizer.averages):
-                dill.dump(nlp, file_, -1)
+            with nlp.use_params(optimizer.averages):
+                scorer = nlp.evaluate(corpus.dev_docs(nlp, gold_preproc=False))
+                with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
+                    dill.dump(nlp, file_, -1)
+
+
+            print_progress(i, losses, scorer.scores)
+    finally:
+        print("Saving model...")
+        with (output_path / 'model-final.pickle').open('wb') as file_:
+            with nlp.use_params(optimizer.averages):
+                dill.dump(nlp, file_, -1)


 def _render_parses(i, to_render):

From daac3e3573c3661d604909ca56c61fcd8e2107eb Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 26 May 2017 11:30:52 -0500
Subject: [PATCH 4/5] Always shuffle gold data, and support length cap

---
 spacy/gold.pyx | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 579010e6d..558e4e008 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -198,15 +198,15 @@ class GoldCorpus(object):
                 n += 1
         return n

-    def train_docs(self, nlp, shuffle=0, gold_preproc=False,
-                   projectivize=False):
+    def train_docs(self, nlp, gold_preproc=False,
+                   projectivize=False, max_length=None):
         train_tuples = self.train_tuples
         if projectivize:
             train_tuples = nonproj.preprocess_training_data(
                 self.train_tuples)
-        if shuffle:
-            random.shuffle(train_tuples)
-        gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc)
+        random.shuffle(train_tuples)
+        gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
+                                        max_length=max_length)
         yield from gold_docs

     def dev_docs(self, nlp, gold_preproc=False):
@@ -215,7 +215,7 @@ class GoldCorpus(object):
         yield from gold_docs

     @classmethod
-    def iter_gold_docs(cls, nlp, tuples, gold_preproc):
+    def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None):
         for raw_text, paragraph_tuples in tuples:
             if gold_preproc:
                 raw_text = None
@@ -226,7 +226,8 @@ class GoldCorpus(object):
                                        gold_preproc)
             golds = cls._make_golds(docs, paragraph_tuples)
             for doc, gold in zip(docs, golds):
-                yield doc, gold
+                if not max_length or len(doc) < max_length:
+                    yield doc, gold

     @classmethod
     def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc):
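The two schedule helpers that patch 3 retunes (dropout_from/dropout_to/
dropout_decay and batch_from/batch_to/batch_compound) are infinite
generators, consumed one value per update. The sketch below illustrates the
behaviour being configured; it is not spacy.util's exact implementation,
just one plausible shape of it.

    import itertools

    def decaying(start, stop, decay):
        # Dropout schedule: starts at `start` and decays toward `stop`.
        # With the new defaults above (0.2, 0.2, 0.0) the rate is constant.
        for t in itertools.count():
            yield max(stop, start / (1.0 + decay * t))

    def compounding(start, stop, compound):
        # Batch-size schedule: start, start*compound, start*compound**2, ...
        # capped at `stop`, so batches grow smoothly from 1 towards 64.
        value = start
        while True:
            yield min(value, stop)
            value *= compound

Patch 4's length cap pairs with these schedules: train_docs(...,
max_length=1000) skips any document of 1000 or more tokens outright rather
than truncating it.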
From 3d5a536eaa49a46a17156ea8ba996f43179a2e13 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 26 May 2017 11:31:23 -0500
Subject: [PATCH 5/5] Improve efficiency of parser batching

---
 spacy/syntax/_state.pxd            |  1 +
 spacy/syntax/arc_eager.pyx         |  9 ++++-
 spacy/syntax/ner.pyx               |  9 ++++-
 spacy/syntax/nn_parser.pyx         | 55 ++++++++++++------------------
 spacy/syntax/stateclass.pyx        |  5 +++
 spacy/syntax/transition_system.pyx | 28 +++++++++++++++
 6 files changed, 72 insertions(+), 35 deletions(-)

diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd
index 829779dc1..4b2b47270 100644
--- a/spacy/syntax/_state.pxd
+++ b/spacy/syntax/_state.pxd
@@ -345,6 +345,7 @@ cdef cppclass StateC:
         this._s_i = src._s_i
         this._e_i = src._e_i
         this._break = src._break
+        this.offset = src.offset

     void fast_forward() nogil:
         # space token attachment policy:

diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index 0a1422088..f7c1c7922 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -350,8 +350,15 @@ cdef class ArcEager(TransitionSystem):
         def __get__(self):
             return (SHIFT, REDUCE, LEFT, RIGHT, BREAK)

+    def has_gold(self, GoldParse gold, start=0, end=None):
+        end = end or len(gold.heads)
+        if all([tag is None for tag in gold.heads[start:end]]):
+            return False
+        else:
+            return True
+
     def preprocess_gold(self, GoldParse gold):
-        if all([h is None for h in gold.heads]):
+        if not self.has_gold(gold):
             return None
         for i in range(gold.length):
             if gold.heads[i] is None: # Missing values

diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx
index 74ab9c26c..af42eded4 100644
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@@ -95,8 +95,15 @@ cdef class BiluoPushDown(TransitionSystem):
         else:
             return MOVE_NAMES[move] + '-' + self.strings[label]

+    def has_gold(self, GoldParse gold, start=0, end=None):
+        end = end or len(gold.ner)
+        if all([tag == '-' for tag in gold.ner[start:end]]):
+            return False
+        else:
+            return True
+
     def preprocess_gold(self, GoldParse gold):
-        if all([tag == '-' for tag in gold.ner]):
+        if not self.has_gold(gold):
             return None
         for i in range(gold.length):
             gold.c.ner[i] = self.lookup_transition(gold.ner[i])
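The twin has_gold methods added to ArcEager and BiluoPushDown above exist to
support the batching rewrite in nn_parser.pyx below: before emitting a
training state over a window of a long document, the parser checks that the
window contains any supervision at all. A toy, pure-Python rendering of the
NER variant's logic (the tag list is invented for illustration):

    ner_tags = ['-', '-', 'B-PERSON', 'L-PERSON', '-', '-', '-', '-']

    def has_gold(tags, start=0, end=None):
        end = end or len(tags)
        return not all(tag == '-' for tag in tags[start:end])

    print(has_gold(ner_tags, 0, 4))   # True: the window covers an entity
    print(has_gold(ner_tags, 4, 8))   # False: nothing here to learn from

One wrinkle worth noting: `end = end or len(...)` treats end=0 the same as
end=None; the caller below always passes end=start+max_length > 0, so this
doesn't bite in practice.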
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 341b8c041..35966d536 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -427,8 +427,7 @@ cdef class Parser:

         cuda_stream = get_cuda_stream()

-        states, golds = self._init_gold_batch(docs, golds)
-        max_length = min([len(doc) for doc in docs])
+        states, golds, max_length = self._init_gold_batch(docs, golds)
         state2vec, vec2scores = self.get_batch_model(len(states), tokvecs,
                                                      cuda_stream, 0.0)
         todo = [(s, g) for (s, g) in zip(states, golds)
@@ -472,46 +471,36 @@ cdef class Parser:
                                backprops, sgd, cuda_stream)
         return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])

-    def _init_gold_batch(self, docs, golds):
+    def _init_gold_batch(self, whole_docs, whole_golds):
        """Make a square batch, of length equal to the shortest doc. A long
        doc will get multiple states. Let's say we have a doc of length 2*N,
        where N is the shortest doc. We'll make two states, one representing
        long_doc[:N], and another representing long_doc[N:]."""
-        cdef StateClass state
-        lengths = [len(doc) for doc in docs]
-        min_length = min(lengths)
-        offset = 0
+        cdef:
+            StateClass state
+            Transition action
+        whole_states = self.moves.init_batch(whole_docs)
+        max_length = max(5, min(20, min([len(doc) for doc in whole_docs])))
         states = []
-        extra_golds = []
-        cdef Pool mem = Pool()
-        costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
-        is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
-        for doc, gold in zip(docs, golds):
+        golds = []
+        for doc, state, gold in zip(whole_docs, whole_states, whole_golds):
             gold = self.moves.preprocess_gold(gold)
-            state = StateClass(doc, offset=offset)
-            self.moves.initialize_state(state.c)
-            if not state.is_final():
-                states.append(state)
-                extra_golds.append(gold)
-            start = min(min_length, len(doc))
+            if gold is None:
+                continue
+            oracle_actions = self.moves.get_oracle_sequence(doc, gold)
+            start = 0
             while start < len(doc):
-                length = min(min_length, len(doc)-start)
-                state = StateClass(doc, offset=offset)
-                self.moves.initialize_state(state.c)
+                state = state.copy()
                 while state.B(0) < start and not state.is_final():
-                    self.moves.set_costs(is_valid, costs, state, gold)
-                    for i in range(self.moves.n_moves):
-                        if is_valid[i] and costs[i] <= 0:
-                            self.moves.c[i].do(state.c, self.moves.c[i].label)
-                            break
-                    else:
-                        raise ValueError("Could not find gold move")
-                start += length
-                if not state.is_final():
+                    action = self.moves.c[oracle_actions.pop(0)]
+                    action.do(state.c, action.label)
+                has_gold = self.moves.has_gold(gold, start=start,
+                                               end=start+max_length)
+                if not state.is_final() and has_gold:
                     states.append(state)
-                    extra_golds.append(gold)
-            offset += len(doc)
-        return states, extra_golds
+                    golds.append(gold)
+                start += min(max_length, len(doc)-start)
+        return states, golds, max_length

     def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None):
         # Tells CUDA to block, so our async copies complete.
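The rewritten _init_gold_batch is easiest to check with toy numbers.
Stripped of the Cython state machinery, the window arithmetic alone behaves
like the sketch below (window_starts is an illustrative name, not a function
in the patch):

    def window_starts(doc_lengths, max_length=None):
        # Mirrors the loop above: max_length is clamped into [5, 20] from
        # the shortest doc, each doc is cut into windows of at most
        # max_length tokens, and each window start becomes a training state.
        if max_length is None:
            max_length = max(5, min(20, min(doc_lengths)))
        starts = []
        for n in doc_lengths:
            start = 0
            while start < n:
                starts.append(start)
                start += min(max_length, n - start)
        return max_length, starts

    print(window_starts([8, 23, 40]))
    # (8, [0, 0, 8, 16, 0, 8, 16, 24, 32])

Each state is advanced to its window's start by replaying a precomputed
zero-cost action sequence, which is what get_oracle_sequence and
StateClass.copy below provide.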
diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx
index fd38710e7..228a3ff91 100644
--- a/spacy/syntax/stateclass.pyx
+++ b/spacy/syntax/stateclass.pyx
@@ -41,6 +41,11 @@ cdef class StateClass:
     def is_final(self):
         return self.c.is_final()

+    def copy(self):
+        cdef StateClass new_state = StateClass.init(self.c._sent, self.c.length)
+        new_state.c.clone(self.c)
+        return new_state
+
     def print_state(self, words):
         words = list(words) + ['_']
         top = words[self.S(0)] + '_%d' % self.S_(0).head

diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx
index d6750d09c..07102aeb0 100644
--- a/spacy/syntax/transition_system.pyx
+++ b/spacy/syntax/transition_system.pyx
@@ -61,6 +61,24 @@ cdef class TransitionSystem:
             offset += len(doc)
         return states

+    def get_oracle_sequence(self, doc, GoldParse gold):
+        cdef Pool mem = Pool()
+        costs = <float*>mem.alloc(self.n_moves, sizeof(float))
+        is_valid = <int*>mem.alloc(self.n_moves, sizeof(int))
+
+        cdef StateClass state = StateClass(doc, offset=0)
+        self.initialize_state(state.c)
+        history = []
+        while not state.is_final():
+            self.set_costs(is_valid, costs, state, gold)
+            for i in range(self.n_moves):
+                if is_valid[i] and costs[i] <= 0:
+                    action = self.c[i]
+                    history.append(i)
+                    action.do(state.c, action.label)
+                    break
+        return history
+
     cdef int initialize_state(self, StateC* state) nogil:
         pass

@@ -92,11 +110,21 @@ cdef class TransitionSystem:
                         StateClass stcls, GoldParse gold) except -1:
         cdef int i
         self.set_valid(is_valid, stcls.c)
+        cdef int n_gold = 0
         for i in range(self.n_moves):
             if is_valid[i]:
                 costs[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label)
+                n_gold += costs[i] <= 0
             else:
                 costs[i] = 9000
+        if n_gold <= 0:
+            print(gold.words)
+            print(gold.ner)
+            raise ValueError(
+                "Could not find a gold-standard action to supervise "
+                "the entity recognizer.\n"
+                "The transition system has %d actions."
+                % (self.n_moves,))

     def add_action(self, int action, label):
         if not isinstance(label, int):
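Read as pure Python, the new get_oracle_sequence is a straightforward
static-oracle loop. The sketch below uses hypothetical is_valid/cost/apply
stand-ins for the Cython internals, and makes explicit the failure mode that
the n_gold check in set_costs now guards against: if no valid action has
cost <= 0, the gold parse is unreachable and the loop must fail loudly
rather than spin on a non-final state.

    def get_oracle_sequence(moves, state, gold):
        history = []
        while not state.is_final():
            # Follow any valid action the gold annotation marks as free.
            for i in range(moves.n_moves):
                if moves.is_valid(i, state) and moves.cost(i, state, gold) <= 0:
                    history.append(i)
                    moves.apply(i, state)
                    break
            else:
                # No zero-cost action: the gold parse cannot be reached
                # from this state, so raise instead of looping forever.
                raise ValueError("Could not find a gold-standard action")
        return history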