diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index 4cbb666c0..756dbecc1 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -237,10 +237,9 @@ cdef class NeuralEntityRecognizer(NeuralParser):
 
     nr_feature = 6
 
-    def get_token_ids(self, states):
+    def set_token_ids(self, ids, states):
         cdef StateClass state
         cdef int n_tokens = 6
-        ids = numpy.zeros((len(states), n_tokens), dtype='i', order='c')
         for i, state in enumerate(states):
             ids[i, 0] = state.c.B(0)-1
             ids[i, 1] = state.c.B(0)
@@ -253,7 +252,7 @@ cdef class NeuralEntityRecognizer(NeuralParser):
                 ids[i, j] = -1
             if ids[i, j] != -1:
                 ids[i, j] += state.c.offset
-        return ids
+        ids[i+1:ids.shape[0]] = -1
 
 
 cdef class BeamDependencyParser(BeamParser):
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 5140a41fd..ff558e20b 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -315,7 +315,9 @@ cdef class Parser:
 
         todo = [st for st in states if not st.is_final()]
         while todo:
-            token_ids = self.get_token_ids(todo)
+            token_ids = numpy.zeros((len(todo), self.nr_feature),
+                                    dtype='i', order='C')
+            self.set_token_ids(token_ids, todo)
             vectors = state2vec(token_ids)
             scores = vec2scores(vectors)
             self.transition_batch(todo, scores)
@@ -339,44 +341,53 @@ cdef class Parser:
         todo = [(s, g) for s, g in zip(states, golds) if not s.is_final()]
 
         backprops = []
+        cdef int max_steps = max(len(doc)*3 for doc in docs)
+        # Allocate one buffer for the token_ids and d_vectors
+        # This will make it quicker to copy back to GPU
+        token_ids = numpy.zeros((max_steps, len(todo), self.nr_feature),
+                                dtype='i', order='C')
+        d_vectors = numpy.zeros((max_steps, len(todo), self.model[0].nO),
+                                dtype='f', order='C')
         cdef float loss = 0.
-        while todo:
+        cdef int nr_step = 0
+        while len(todo) >= 4 and nr_step < max_steps:
             states, golds = zip(*todo)
 
-            token_ids = self.get_token_ids(states)
-            vector, bp_vector = state2vec.begin_update(token_ids, drop=drop)
+            self.set_token_ids(token_ids[nr_step], states)
+            length = len(todo)
+            vector, bp_vector = state2vec.begin_update(token_ids[nr_step, :length],
+                                                       drop=drop)
             scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
 
             d_scores = self.get_batch_loss(states, golds, scores)
-            d_vector = bp_scores(d_scores, sgd=sgd)
+            d_vectors[nr_step, :length] = bp_scores(d_scores, sgd=sgd)
 
-            if isinstance(self.model[0].ops, CupyOps) \
-            and not isinstance(token_ids, state2vec.ops.xp.ndarray):
-                # Move token_ids and d_vector to CPU, asynchronously
-                backprops.append((
-                    get_async(cuda_stream, token_ids),
-                    get_async(cuda_stream, d_vector),
-                    bp_vector
-                ))
-            else:
-                backprops.append((token_ids, d_vector, bp_vector))
+            backprops.append((length, bp_vector))
             self.transition_batch(states, scores)
             todo = [st for st in todo if not st[0].is_final()]
-        # Tells CUDA to block, so our async copies complete.
-        if cuda_stream is not None:
-            cuda_stream.synchronize()
+            nr_step += 1
+        d_tokvecs = state2vec.ops.allocate(tokvecs.shape)
+        if type(token_ids) != type(d_tokvecs):
+            token_ids = get_async(cuda_stream, token_ids)
+            d_vectors = get_async(cuda_stream, d_vectors)
+            if cuda_stream is not None:
+                # Tells CUDA to block, so our async copies complete.
+                cuda_stream.synchronize()
         xp = state2vec.ops.xp # Handle for numpy/cupy
-        for token_ids, d_vector, bp_vector in backprops:
+        for i, (length, bp_vector) in enumerate(backprops):
+            d_vector = d_vectors[i, :length]
             d_state_features = bp_vector(d_vector, sgd=sgd)
-            active_feats = token_ids * (token_ids >= 0)
-            active_feats = active_feats.reshape((token_ids.shape[0], token_ids.shape[1], 1))
+            step_token_ids = token_ids[i, :length]
+            active_feats = step_token_ids * (step_token_ids >= 0)
+            active_feats = active_feats.reshape((active_feats.shape[0],
+                                                 active_feats.shape[1], 1))
             if hasattr(xp, 'scatter_add'):
                 xp.scatter_add(d_tokvecs,
-                    token_ids, d_state_features * active_feats)
+                    step_token_ids, d_state_features)
             else:
                 xp.add.at(d_tokvecs,
-                    token_ids, d_state_features * active_feats)
+                    step_token_ids, d_state_features * active_feats)
         return d_tokvecs
 
     def get_batch_model(self, batch_size, tokvecs, stream, dropout):
@@ -387,13 +398,11 @@ cdef class Parser:
 
     nr_feature = 13
 
-    def get_token_ids(self, states):
+    def set_token_ids(self, token_ids, states):
         cdef StateClass state
-        cdef int n_tokens = self.nr_feature
-        ids = numpy.zeros((len(states), n_tokens), dtype='i', order='C')
         for i, state in enumerate(states):
-            state.set_context_tokens(ids[i])
-        return ids
+            state.set_context_tokens(token_ids[i])
+        token_ids[i+1:token_ids.shape[0]] = -1
 
     def transition_batch(self, states, float[:, ::1] scores):
         cdef StateClass state
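
For reference, here is a minimal NumPy sketch of the buffering scheme this patch moves to: one preallocated, C-contiguous buffer for every step's token ids and gradients, a `-1` tail fill for rows past the live states, and a masked scatter-add back into `d_tokvecs`. Only the names `token_ids`, `d_vectors`, `d_tokvecs`, `nr_feature` and `max_steps` mirror the diff; the sizes, the simulated shrinking batch, and the random gradients are invented for illustration.

```python
import numpy

# All sizes here are made up for the example.
nr_feature = 13    # context tokens extracted per state
max_steps = 6      # cap on transition steps (len(doc)*3 in the patch)
batch = 4          # number of parser states in the batch
n_tokens = 50      # total tokens, i.e. rows of d_tokvecs
width = 8          # gradient width per feature (the model's nO)

# One C-contiguous buffer per quantity, allocated once up front. A single
# big array can be moved CPU<->GPU in one asynchronous copy, instead of a
# small transfer on every parsing step.
token_ids = numpy.zeros((max_steps, batch, nr_feature), dtype='i', order='C')
d_vectors = numpy.zeros((max_steps, batch, width), dtype='f', order='C')

rng = numpy.random.RandomState(0)
lengths = []
for step in range(max_steps):
    length = max(1, batch - step)      # the batch shrinks as states finish
    lengths.append(length)
    token_ids[step, :length] = rng.randint(0, n_tokens, (length, nr_feature))
    token_ids[step, :length, 0] = -1   # some features are missing (-1)
    token_ids[step, length:] = -1      # tail fill, as in set_token_ids
    d_vectors[step, :length] = rng.randn(length, width)

# Backward pass: scatter-add each step's feature gradients into d_tokvecs,
# with a 0/1 mask so the -1 padding never contributes. d_feats stands in
# for the (length, nr_feature, width) output of bp_vector.
d_tokvecs = numpy.zeros((n_tokens, width), dtype='f')
for step, length in enumerate(lengths):
    ids = token_ids[step, :length]                     # (length, nr_feature)
    d_feats = numpy.repeat(d_vectors[step, :length, None], nr_feature, axis=1)
    mask = (ids >= 0)[:, :, None]
    # ids * (ids >= 0) clamps the -1 padding to index 0; the mask then
    # zeroes its contribution, so row 0 only receives real gradients.
    numpy.add.at(d_tokvecs, ids * (ids >= 0), d_feats * mask)
```

Keeping the buffers contiguous is what makes the single `get_async` transfer per array possible, and it is why `set_token_ids` can cheaply mark the unused tail with `-1` instead of allocating a fresh array each step. The sketch applies the explicit 0/1 mask in both scatter paths, matching the patch's `xp.add.at` branch.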