From c49e44349aec773bc1b0aafea08e4953d71065b1 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 8 May 2018 02:53:24 +0200
Subject: [PATCH] Fix beam parsing

---
 spacy/syntax/_beam_utils.pyx   |  8 +++-
 spacy/syntax/_parser_model.pyx |  7 ++--
 spacy/syntax/nn_parser.pyx     | 74 ++++++++++++++++++++++++----------
 3 files changed, 62 insertions(+), 27 deletions(-)

diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx
index 20f6bdc5e..b8ca338ee 100644
--- a/spacy/syntax/_beam_utils.pyx
+++ b/spacy/syntax/_beam_utils.pyx
@@ -59,7 +59,7 @@ cdef class ParserBeam(object):
     cdef public object dones
 
     def __init__(self, TransitionSystem moves, states, golds,
-                 int width, float density):
+                 int width, float density=0.):
         self.moves = moves
         self.states = states
         self.golds = golds
@@ -133,8 +133,12 @@ cdef class ParserBeam(object):
                 self.moves.set_costs(beam.is_valid[i], beam.costs[i],
                                      state, gold)
                 if follow_gold:
+                    min_cost = 0
                     for j in range(beam.nr_class):
-                        if beam.costs[i][j] >= 1:
+                        if beam.costs[i][j] < min_cost:
+                            min_cost = beam.costs[i][j]
+                    for j in range(beam.nr_class):
+                        if beam.costs[i][j] > min_cost:
                             beam.is_valid[i][j] = 0
 
 
diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx
index 5a7d3609b..962461417 100644
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@@ -266,13 +266,14 @@ class ParserStepModel(Model):
 
     def get_token_ids(self, batch):
         states = _beam_utils.collect_states(batch)
+        cdef StateClass state
+        states = [state for state in states if not state.is_final()]  # unfinished only
         cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF),
                                           dtype='i', order='C')
+        ids.fill(-1)  # NOTE(review): -1 presumably marks an absent feature; confirm
         c_ids = <int*>ids.data
-        cdef StateClass state
         for state in states:
-            if not state.c.is_final():
-                state.c.set_context_tokens(c_ids, ids.shape[1])
+            state.c.set_context_tokens(c_ids, ids.shape[1])  # c_ids is at this state's row
             c_ids += ids.shape[1]
         return ids
 
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index d7f64d937..9fad36f46 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -208,31 +208,62 @@ cdef class Parser:
             for doc in batch_in_order:
                 yield doc
 
-    def predict(self, docs, beam_width=1):
+    def predict(self, docs, beam_width=1, drop=0.):
         if isinstance(docs, Doc):
             docs = [docs]
+        if beam_width < 2:
+            # Widths below 2 (0 or 1) are handled by the greedy parser.
+            return self.greedy_parse(docs, drop=drop)
+        return self.beam_parse(docs, beam_width=beam_width, drop=drop)
 
+    def greedy_parse(self, docs, drop=0.):  # drop is accepted but not read here
         cdef vector[StateC*] states
         cdef StateClass state
         model = self.model(docs)
-        if beam_width == 1:
-            batch = self.moves.init_batch(docs)
-            weights = get_c_weights(model)
-            for state in batch:
-                if not state.is_final():
-                    states.push_back(state.c)
-            sizes = get_c_sizes(model, states.size())
-            with nogil:
-                self._parseC(&states[0],
-                    weights, sizes)
-        else:
-            batch = self.moves.init_beams(docs, beam_width)
-            unfinished = list(batch)
-            while unfinished:
-                scores = model.predict(unfinished)
-                unfinished = self.transition_beams(batch, scores)
+        batch = self.moves.init_batch(docs)
+        weights = get_c_weights(model)
+        for state in batch:
+            if not state.is_final():  # only unfinished states go to the C loop
+                states.push_back(state.c)
+        sizes = get_c_sizes(model, states.size())
+        if states.size() != 0:  # &states[0] is undefined on an empty vector
+            with nogil:
+                self._parseC(&states[0], weights, sizes)
         return batch
     
+    def beam_parse(self, docs, int beam_width=3, float drop=0.):
+        """Beam-search parse `docs` and return the beams; `drop` is unused."""
+        cdef Beam beam
+        cdef StateC* state
+        cdef np.ndarray token_ids
+        beams = self.moves.init_beams(docs, beam_width)
+        token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
+                                dtype='i', order='C')
+        cdef int* c_ids
+        cdef int nr_feature = self.nr_feature
+        cdef int n_states
+        model = self.model(docs)
+        todo = [beam for beam in beams if not beam.is_done]
+        while todo:
+            token_ids.fill(-1)
+            c_ids = <int*>token_ids.data
+            n_states = 0
+            for beam in todo:
+                for i in range(beam.size):
+                    state = <StateC*>beam.at(i)
+                    # Finalized states are not scored, so rows are packed:
+                    # row k holds the k-th unfinished state in visit order.
+                    if not state.is_final():
+                        state.set_context_tokens(c_ids, nr_feature)
+                        c_ids += nr_feature
+                        n_states += 1
+            if n_states == 0:
+                break
+            vectors = model.state2vec(token_ids[:n_states])
+            scores = model.vec2scores(vectors)
+            todo = self.transition_beams(todo, scores)
+        return beams
+
     cdef void _parseC(self, StateC** states,
             WeightsC weights, SizesC sizes) nogil:
         cdef int i, j
@@ -325,7 +356,7 @@ cdef class Parser:
         beam_update_prob = 1-self.cfg.get('beam_update_prob', 0.5)
         if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= beam_update_prob:
             return self.update_beam(docs, golds,
-                    self.cfg['beam_width'], self.cfg['beam_density'],
+                    self.cfg['beam_width'],
                     drop=drop, sgd=sgd, losses=losses)
         # Chop sequences into lengths of this many transitions, to make the
         # batch uniform length.
@@ -352,12 +383,11 @@ cdef class Parser:
 
     def update_beam(self, docs, golds, width, drop=0., sgd=None, losses=None):
         lengths = [len(d) for d in docs]
-        states = self.moves.init_batch(docs)
-        for gold in golds:
-            self.moves.preprocess_gold(gold)
+        cut_gold = numpy.random.choice(range(20, 100))
+        states, golds, max_steps = self._init_gold_batch(docs, golds, max_length=cut_gold)
         model, finish_update = self.model.begin_update(docs, drop=drop)
         states_d_scores, backprops, beams = _beam_utils.update_beam(
-            self.moves, self.nr_feature, 500, states, golds, model.state2vec,
+            self.moves, self.nr_feature, max_steps, states, golds, model.state2vec,
             model.vec2scores, width, drop=drop, losses=losses)
         for i, d_scores in enumerate(states_d_scores):
             ids, bp_vectors, bp_scores = backprops[i]