From 0ae045256df6f735b0a301914d87b5c26e2520d5 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sun, 13 Aug 2017 18:02:05 -0500
Subject: [PATCH] Fix beam training

---
 spacy/syntax/_beam_utils.pyx | 59 +++++++++++++++++++++++++-----------
 spacy/syntax/nn_parser.pyx   |  8 ++---
 2 files changed, 45 insertions(+), 22 deletions(-)
diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx
index 6df8d472f..e77036e55 100644
--- a/spacy/syntax/_beam_utils.pyx
+++ b/spacy/syntax/_beam_utils.pyx
@@ -57,7 +57,7 @@ cdef class ParserBeam(object):
         for state in states:
             beam = Beam(self.moves.n_moves, width, density)
             beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent)
-            for i in range(beam.size):
+            for i in range(beam.width):
                 st = <StateClass>beam.at(i)
                 st.c.offset = state.c.offset
             self.beams.append(beam)
@@ -81,7 +81,7 @@ cdef class ParserBeam(object):
     def advance(self, scores, follow_gold=False):
         cdef Beam beam
         for i, beam in enumerate(self.beams):
-            if beam.is_done:
+            if beam.is_done or not scores[i].size:
                 continue
             self._set_scores(beam, scores[i])
             if self.golds is not None:
@@ -92,6 +92,12 @@ cdef class ParserBeam(object):
             else:
                 beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
             beam.check_done(_check_final_state, NULL)
+            if beam.is_done:
+                for j in range(beam.size):
+                    if is_gold(<StateClass>beam.at(j), self.golds[i], self.moves.strings):
+                        beam._states[j].loss = 0.0
+                    elif beam._states[j].loss == 0.0:
+                        beam._states[j].loss = 1.0
 
     def _set_scores(self, Beam beam, float[:, ::1] scores):
         cdef float* c_scores = &scores[0, 0]
@@ -152,32 +158,49 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
                        width=width, density=density)
     gbeam = ParserBeam(moves, states, golds,
                        width=width, density=0.0)
+    cdef StateClass state
     beam_maps = []
     backprops = []
     violns = [MaxViolation() for _ in range(len(states))]
     for t in range(max_steps):
+        # The beam maps let us find the right row in the flattened scores
+        # arrays for each state. States are identified by (example id, history).
+        # We keep a different beam map for each step (since we'll have a flat
+        # scores array for each step). The beam map will let us take the per-state
+        # losses, and compute the gradient for each (step, state, class).
         beam_maps.append({})
+        # Gather all states from the two beams in a list. Some states may occur
+        # in both beams. To figure out which beam each state belonged to,
+        # we keep two lists of indices, p_indices and g_indices.
         states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update)
         if not states:
             break
+        # Now that we have our flat list of states, feed them through the model
         token_ids = get_token_ids(states, nr_feature)
         vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
         scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)
 
+        # Store the callbacks for the backward pass
         backprops.append((token_ids, bp_vectors, bp_scores))
 
+        # Unpack the flat scores into lists for the two beams. The indices arrays
+        # tell us which example and state the scores-row refers to.
         p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices]
         g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')  for indices in g_indices]
+        # Now advance the states in the beams. The gold beam is constrained to
+        # follow only gold analyses.
         pbeam.advance(p_scores)
         gbeam.advance(g_scores, follow_gold=True)
-
+        # Track the "maximum violation", to use in the update.
         for i, violn in enumerate(violns):
             violn.check_crf(pbeam[i], gbeam[i])
 
-    histories = [(v.p_hist + v.g_hist) for v in violns]
-    losses = [(v.p_probs + v.g_probs) for v in violns]
+    # Only make updates if we have non-gold states
+    histories = [((v.p_hist + v.g_hist) if v.p_hist else []) for v in violns]
+    losses = [((v.p_probs + v.g_probs) if v.p_probs else []) for v in violns]
     states_d_scores = get_gradient(moves.n_moves, beam_maps,
                                    histories, losses)
+    assert len(states_d_scores) == len(backprops), (len(states_d_scores), len(backprops))
     return states_d_scores, backprops
 
 
@@ -187,17 +210,20 @@ def get_states(pbeams, gbeams, beam_map, nr_update):
     p_indices = []
     g_indices = []
     cdef Beam pbeam, gbeam
+    assert len(pbeams) == len(gbeams)
     for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)):
         p_indices.append([])
+        g_indices.append([])
+        if pbeam.loss > 0 and pbeam.min_score > gbeam.score:
+            continue
         for i in range(pbeam.size):
             state = <StateClass>pbeam.at(i)
             if not state.is_final():
                 key = tuple([eg_id] + pbeam.histories[i])
                 seen[key] = len(states)
                 p_indices[-1].append(len(states))
-                states.append(<StateClass>pbeam.at(i))
+                states.append(state)
         beam_map.update(seen)
-        g_indices.append([])
         for i in range(gbeam.size):
             state = <StateClass>gbeam.at(i)
             if not state.is_final():
@@ -207,10 +233,10 @@ def get_states(pbeams, gbeams, beam_map, nr_update):
                 else:
                     g_indices[-1].append(len(states))
                     beam_map[key] = len(states)
-                    states.append(<StateClass>gbeam.at(i))
-    p_indices = [numpy.asarray(idx, dtype='i') for idx in p_indices]
-    g_indices = [numpy.asarray(idx, dtype='i') for idx in g_indices]
-    return states, p_indices, g_indices
+                    states.append(state)
+    p_idx = [numpy.asarray(idx, dtype='i') for idx in p_indices]
+    g_idx = [numpy.asarray(idx, dtype='i') for idx in g_indices]
+    return states, p_idx, g_idx
 
 
 def get_gradient(nr_class, beam_maps, histories, losses):
@@ -230,20 +256,17 @@ def get_gradient(nr_class, beam_maps, histories, losses):
     nr_step = len(beam_maps)
     grads = []
     for beam_map in beam_maps:
-        grads.append(numpy.zeros((max(beam_map.values())+1, nr_class), dtype='f'))
+        if beam_map:
+            grads.append(numpy.zeros((max(beam_map.values())+1, nr_class), dtype='f'))
     assert len(histories) == len(losses)
     for eg_id, hists in enumerate(histories):
         for loss, hist in zip(losses[eg_id], hists):
             key = tuple([eg_id])
             for j, clas in enumerate(hist):
-                try:
-                    i = beam_maps[j][key]
-                except:
-                    print(sorted(beam_maps[j].items()))
-                    raise
+                i = beam_maps[j][key]
                 # In step j, at state i action clas
                 # resulted in loss
-                grads[j][i, clas] += loss
+                grads[j][i, clas] += loss / len(histories)
                 key = key + tuple([clas])
     return grads
 
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 51fd61cc1..a193c96a3 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -557,20 +557,20 @@ cdef class Parser:
             my_tokvecs = self.model[0].ops.flatten(my_tokvecs)
             tokvecs += my_tokvecs
 
-        states, golds, max_moves = self._init_gold_batch(docs, golds)
+        states = self.moves.init_batch(docs)
+        for gold in golds:
+            self.moves.preprocess_gold(gold)
 
         cuda_stream = get_cuda_stream()
         state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0)
 
-        states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, max_moves,
+        states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
                                         states, tokvecs, golds,
                                         state2vec, vec2scores,
                                         drop, sgd, losses,
                                         width=8)
         backprop_lower = []
         for i, d_scores in enumerate(states_d_scores):
-            if d_scores is None:
-                continue
             if losses is not None:
                 losses[self.name] += (d_scores**2).sum()
             ids, bp_vectors, bp_scores = backprops[i]