diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx
index 4a4b79dad..10b5e407c 100644
--- a/spacy/syntax/_beam_utils.pyx
+++ b/spacy/syntax/_beam_utils.pyx
@@ -41,21 +41,24 @@ cdef hash_t _hash_state(void* _state, void* _) except 0:
 
 cdef class ParserBeam(object):
     cdef public TransitionSystem moves
-    cdef public object docs
+    cdef public object states
     cdef public object golds
     cdef public object beams
 
-    def __init__(self, TransitionSystem moves, docs, golds,
+    def __init__(self, TransitionSystem moves, states, golds,
             int width=4, float density=0.001):
         self.moves = moves
-        self.docs = docs
+        self.states = states
         self.golds = golds
         self.beams = []
-        cdef Doc doc
         cdef Beam beam
-        for doc in docs:
+        cdef StateClass state, st
+        for state in states:
             beam = Beam(self.moves.n_moves, width, density)
-            beam.initialize(self.moves.init_beam_state, doc.length, doc.c)
+            beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent)
+            for i in range(beam.size):
+                st = <StateClass>beam.at(i)
+                st.c.offset = state.c.offset
             self.beams.append(beam)
     
     @property
@@ -100,34 +103,38 @@ cdef class ParserBeam(object):
 def get_token_ids(states, int n_tokens):
     cdef StateClass state
     cdef np.ndarray ids = numpy.zeros((len(states), n_tokens),
-                                      dtype='i', order='C')
+                                      dtype='int32', order='C')
     c_ids = <int*>ids.data
     for i, state in enumerate(states):
         if not state.is_final():
             state.c.set_context_tokens(c_ids, n_tokens)
+        else:
+            ids[i] = -1
         c_ids += ids.shape[1]
     return ids
 
 
-def update_beam(TransitionSystem moves, int nr_feature,
-                docs, tokvecs, golds,
+def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
+                states, tokvecs, golds,
                 state2vec, vec2scores, drop=0., sgd=None,
                 losses=None, int width=4, float density=0.001):
-    pbeam = ParserBeam(moves, docs, golds,
+    pbeam = ParserBeam(moves, states, golds,
                        width=width, density=density)
-    gbeam = ParserBeam(moves, docs, golds,
+    gbeam = ParserBeam(moves, states, golds,
                        width=width, density=density)
-    beam_map = {}
+    beam_maps = []
     backprops = []
-    violns = [MaxViolation() for _ in range(len(docs))]
-    example_ids = list(range(len(docs)))
-    while not pbeam.is_done and not gbeam.is_done:
-        states, p_indices, g_indices = get_states(example_ids, pbeam, gbeam, beam_map)
+    violns = [MaxViolation() for _ in range(len(states))]
+    for t in range(max_steps):
+        if pbeam.is_done and gbeam.is_done:
+            break
+        beam_maps.append({})
+        states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1])
 
         token_ids = get_token_ids(states, nr_feature)
         vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
         scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)
-        
+
         backprops.append((token_ids, bp_vectors, bp_scores))
 
         p_scores = [scores[indices] for indices in p_indices]
@@ -140,18 +147,18 @@ def update_beam(TransitionSystem moves, int nr_feature,
     
     histories = [(v.p_hist + v.g_hist) for v in violns]
     losses = [(v.p_probs + v.g_probs) for v in violns]
-    states_d_scores = get_gradient(moves.n_moves, beam_map,
+    states_d_scores = get_gradient(moves.n_moves, beam_maps,
                                    histories, losses)
     return states_d_scores, backprops
 
 
-def get_states(example_ids, pbeams, gbeams, beam_map):
-    states = []
+def get_states(pbeams, gbeams, beam_map):
     seen = {}
+    states = []
     p_indices = []
     g_indices = []
     cdef Beam pbeam, gbeam
-    for eg_id, pbeam, gbeam in zip(example_ids, pbeams, gbeams):
+    for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)):
         p_indices.append([])
         for j in range(pbeam.size):
             key = tuple([eg_id] + pbeam.histories[j])
@@ -174,23 +181,30 @@ def get_states(example_ids, pbeams, gbeams, beam_map):
     return states, p_indices, g_indices
 
 
-def get_gradient(nr_class, beam_map, histories, losses):
+def get_gradient(nr_class, beam_maps, histories, losses):
     """
     The global model assigns a loss to each parse. The beam scores
     are additive, so the same gradient is applied to each action
     in the history. This gives the gradient of a single *action*
     for a beam state -- so we have "the gradient of loss for taking
     action i given history H."
+
+    Histories: Each history is a list of actions.
+    Each candidate has a history.
+    Each beam has multiple candidates.
+    Each batch has multiple beams.
+    So histories is a list of lists of lists of ints.
     """
-    nr_step = max(len(hist) for hist in histories)
-    nr_beam = len(histories)
-    grads = [numpy.zeros((nr_beam, nr_class), dtype='f') for _ in range(nr_step)]
-    for hist, loss in zip(histories, losses):
-        key = tuple()
-        for j, clas in enumerate(hist):
-            grads[j][i, clas] = loss
-            key = key + clas
-            i = beam_map[key]
+    nr_step = len(beam_maps)
+    grads = [numpy.zeros((max(beam_map.values())+1, nr_class), dtype='f')
+             for beam_map in beam_maps]
+    for eg_id, hists in enumerate(histories):
+        for loss, hist in zip(losses[eg_id], hists):
+            key = tuple([eg_id])
+            for j, clas in enumerate(hist):
+                i = beam_maps[j][key]
+                grads[j][i, clas] = loss
+                key = key + tuple([clas])
     return grads
 
 
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 11584e4d2..c842ef00b 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -529,23 +529,29 @@ cdef class Parser:
 
     def update_beam(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
         docs, tokvecs = docs_tokvecs
+        lengths = [len(d) for d in docs]
         tokvecs = self.model[0].ops.flatten(tokvecs)
+        states, golds, max_moves = self._init_gold_batch(docs, golds)
 
         cuda_stream = get_cuda_stream()
-        state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs, cuda_stream, 0.0)
+        state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0)
 
-        states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature,
-                                        docs, tokvecs, golds,
+        states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, max_moves,
+                                        states, tokvecs, golds,
                                         state2vec, vec2scores,
                                         drop, sgd, losses)
         backprop_lower = []
         for i, d_scores in enumerate(states_d_scores):
             ids, bp_vectors, bp_scores = backprops[i]
             d_vector = bp_scores(d_scores, sgd=sgd)
-            backprop_lower.append((
-                get_async(cuda_stream, ids),
-                get_async(cuda_stream, d_vector),
-                bp_vectors))
+            if isinstance(self.model[0].ops, CupyOps) \
+            and not isinstance(ids, state2vec.ops.xp.ndarray):
+                backprop_lower.append((
+                    get_async(cuda_stream, ids),
+                    get_async(cuda_stream, d_vector),
+                    bp_vectors))
+            else:
+                backprop_lower.append((ids, d_vector, bp_vectors))
         d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
         self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream)
         lengths = [len(doc) for doc in docs]