diff --git a/spacy/syntax/nn_parser.pxd b/spacy/syntax/nn_parser.pxd
index f6963ea18..8692185e5 100644
--- a/spacy/syntax/nn_parser.pxd
+++ b/spacy/syntax/nn_parser.pxd
@@ -15,7 +15,7 @@ cdef class Parser:
     cdef readonly object cfg
 
     cdef void _parse_step(self, StateC* state,
-            int* token_ids, float* scores, int* is_valid,
-            const float* feat_weights, int nr_class, int nr_feat) nogil
+            const float* feat_weights,
+            int nr_class, int nr_feat) nogil
 
     #cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 1b96bae36..995ff5278 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -19,7 +19,6 @@ import numpy.random
 cimport numpy as np
 
 from libcpp.vector cimport vector
-from libcpp.pair cimport pair
 from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
 from cpython.exc cimport PyErr_CheckSignals
 from libc.stdint cimport uint32_t, uint64_t
@@ -69,9 +68,6 @@ def set_debug(val):
     DEBUG = val
 
 
-ctypedef pair[int, StateC*] step_t
-
-
 cdef class precompute_hiddens:
     '''Allow a model to be "primed" by pre-computing input features in bulk.
 
@@ -123,9 +119,6 @@ cdef class precompute_hiddens:
             self._is_synchronized = True
         return <float*>self._cached.data
 
-    def get_bp_hiddens(self):
-        return self._bp_hiddens
-
     def __call__(self, X):
         return self.begin_update(X)[0]
 
@@ -315,6 +308,7 @@ cdef class Parser:
         cdef:
             precompute_hiddens state2vec
             StateClass state
+            Pool mem
             const float* feat_weights
             StateC* st
             vector[StateC*] next_step, this_step
@@ -342,14 +336,7 @@ cdef class Parser:
         cdef int i
         while not next_step.empty():
             for i in cython.parallel.prange(next_step.size(), num_threads=4, nogil=True):
-                token_ids = <int*>calloc(nr_feat, sizeof(int))
-                scores = <float*>calloc(nr_class, sizeof(float))
-                is_valid = <int*>calloc(nr_class, sizeof(int))
-                self._parse_step(next_step[i], token_ids, scores, is_valid,
-                                 feat_weights, nr_class, nr_feat)
-                free(is_valid)
-                free(scores)
-                free(token_ids)
+                self._parse_step(next_step[i], feat_weights, nr_class, nr_feat)
             this_step, next_step = next_step, this_step
             next_step.clear()
             for st in this_step:
@@ -358,8 +345,12 @@ cdef class Parser:
         return states
 
     cdef void _parse_step(self, StateC* state,
-            int* token_ids, float* scores, int* is_valid,
-            const float* feat_weights, int nr_class, int nr_feat) nogil:
+            const float* feat_weights,
+            int nr_class, int nr_feat) nogil:
+        token_ids = <int*>calloc(nr_feat, sizeof(int))
+        scores = <float*>calloc(nr_class, sizeof(float))
+        is_valid = <int*>calloc(nr_class, sizeof(int))
+
         state.set_context_tokens(token_ids, nr_feat)
         sum_state_features(scores,
             feat_weights, token_ids, 1, nr_feat, nr_class)
@@ -368,90 +359,66 @@ cdef class Parser:
         action = self.moves.c[guess]
         action.do(state, action.label)
 
-    def update(self, docs_tokvecs, golds, drop=0., sgd=None):
-        cdef:
-            precompute_hiddens state2vec
-            StateClass state
-            const float* feat_weights
-            StateC* st
-            vector[step_t] next_step, this_step
-            cdef int[:, ::1] is_valid, token_ids
-            cdef float[:, ::1] scores, d_scores, costs
-            int nr_state, nr_feat, nr_class
+        free(is_valid)
+        free(scores)
+        free(token_ids)
 
+    def update(self, docs_tokvecs, golds, drop=0., sgd=None):
         docs, tokvec_lists = docs_tokvecs
+        tokvecs = self.model[0].ops.flatten(tokvec_lists)
         if isinstance(docs, Doc) and isinstance(golds, GoldParse):
             docs = [docs]
             golds = [golds]
-        assert len(docs) == len(golds) == len(tokvec_lists)
 
-        nr_state = len(docs)
-        nr_feat = self.nr_feature
-        nr_class = self.moves.n_moves
-
-        token_ids = numpy.zeros((nr_state, nr_feat), dtype='i')
-        is_valid = numpy.zeros((nr_state, nr_class), dtype='i')
-        scores = numpy.zeros((nr_state, nr_class), dtype='f')
-        d_scores = numpy.zeros((nr_state, nr_class), dtype='f')
-        costs = numpy.zeros((nr_state, nr_class), dtype='f')
-
-        tokvecs = self.model[0].ops.flatten(tokvec_lists)
         cuda_stream = get_cuda_stream()
-        state2vec, vec2scores = self.get_batch_model(nr_state, tokvecs,
-                                                     cuda_stream, drop)
-
         golds = [self.moves.preprocess_gold(g) for g in golds]
+
         states = self.moves.init_batch(docs)
-        cdef step_t step
-        cdef int i
-        for i, state in enumerate(states):
-            if not state.c.is_final():
-                step.first = i
-                step.second = state.c
-                next_step.push_back(step)
-                self.moves.set_costs(&is_valid[i, 0], &costs[i, 0], state, golds[i])
+        state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream,
+                                                      drop)
+
+        todo = [(s, g) for (s, g) in zip(states, golds)
+                if not s.is_final() and g is not None]
 
-        feat_weights = state2vec.get_feat_weights()
-        bp_hiddens = state2vec.get_bp_hiddens()
-        d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
         backprops = []
+        cdef float loss = 0.
+        while len(todo) >= 3:
+            states, golds = zip(*todo)
 
-        while next_step.size():
-            # Allocate these each step, so copy an be async
-            np_token_ids = numpy.zeros((nr_state, nr_feat), dtype='i')
-            np_d_scores = numpy.zeros((nr_state, nr_class), dtype='f')
-            token_ids = np_token_ids
-            d_scores = np_d_scores
-            for step in next_step:
-                i = step.first
-                st = step.second
-                self._parse_step(st, &token_ids[i, 0],
-                    &scores[i, 0], &is_valid[i, 0],
-                    feat_weights, nr_class, nr_feat)
-                cpu_log_loss(&d_scores[i, 0],
-                    &costs[i, 0], &is_valid[i, 0], &scores[i, 0], nr_class)
-            backprops.append((
-                get_async(cuda_stream, np_token_ids),
-                get_async(cuda_stream, np_d_scores)))
-            this_step, next_step = next_step, this_step
-            next_step.clear()
-            for step in this_step:
-                i = step.first
-                st = step.second
-                if not st.is_final():
-                    next_step.push_back(step)
-                    self.moves.set_costs(&is_valid[i, 0], &costs[i, 0],
-                                         states[i], golds[i])
-        cuda_stream.synchronize()
-        for gpu_token_ids, gpu_d_scores in backprops:
-            d_features = bp_hiddens((gpu_d_scores, gpu_token_ids), sgd)
-            d_features *= (gpu_token_ids >= 0).reshape((nr_state, nr_feat, 1))
+            token_ids = self.get_token_ids(states)
+            vector, bp_vector = state2vec.begin_update(token_ids, drop=drop)
+            scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
 
-            xp = self.model[0].ops.xp
-            if hasattr(xp, 'scatter_add'):
-                xp.scatter_add(d_tokvecs, gpu_token_ids, d_features)
+            d_scores = self.get_batch_loss(states, golds, scores)
+            d_vector = bp_scores(d_scores, sgd=sgd)
+
+            if isinstance(self.model[0].ops, CupyOps) \
+            and not isinstance(token_ids, state2vec.ops.xp.ndarray):
+                # Move token_ids and d_vector to CPU, asynchronously
+                backprops.append((
+                    get_async(cuda_stream, token_ids),
+                    get_async(cuda_stream, d_vector),
+                    bp_vector
+                ))
             else:
-                xp.add.at(d_tokvecs, gpu_token_ids, d_features)
+                backprops.append((token_ids, d_vector, bp_vector))
+            self.transition_batch(states, scores)
+            todo = [st for st in todo if not st[0].is_final()]
+        # Tells CUDA to block, so our async copies complete.
+        if cuda_stream is not None:
+            cuda_stream.synchronize()
+        d_tokvecs = state2vec.ops.allocate(tokvecs.shape)
+        xp = state2vec.ops.xp # Handle for numpy/cupy
+        for token_ids, d_vector, bp_vector in backprops:
+            d_state_features = bp_vector(d_vector, sgd=sgd)
+            active_feats = token_ids * (token_ids >= 0)
+            active_feats = active_feats.reshape((token_ids.shape[0], token_ids.shape[1], 1))
+            if hasattr(xp, 'scatter_add'):
+                xp.scatter_add(d_tokvecs,
+                    token_ids, d_state_features * active_feats)
+            else:
+                xp.add.at(d_tokvecs,
+                    token_ids, d_state_features * active_feats)
         return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
 
     def get_batch_model(self, batch_size, tokvecs, stream, dropout):