diff --git a/spacy/_ml.py b/spacy/_ml.py
index d3c82897f..1f3d50cbd 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -81,31 +81,28 @@ def add_tuples(X, drop=0.):
 def AddHistory(layer, decay=0.0001):
     ops = layer.ops
     nonlocals = []
-    if layer.nI:
-        average_inputs = ops.allocate((layer.nO, layer.nI-layer.nO))
-        nonlocals = []
     def history_fwd(X, drop=0.):
         if not nonlocals:
-            nonlocals.append(ops.allocate((layer.nO, X.shape[1])))
+            if hasattr(layer, 'nO'):
+                nO = layer.nO
+            else:
+                nO = layer._layers[-1].nO
+            nonlocals.append(ops.allocate((nO, X.shape[1])))
             model.history = nonlocals[0]
         average_inputs = nonlocals[0]
         hist = ops.xp.tensordot(X, average_inputs, axes=[[1], [1]])
         X_hist = ops.xp.hstack((X, hist))
         Y, bp_Y = layer.begin_update(X_hist, drop=drop)
-        for i in range(Y.shape[0]):
-            amax = Y[i].argmax()
-            average_inputs[amax] *= 1-decay
-            average_inputs[amax] += decay * X[i]
+        amax = Y.argmax(axis=1)
+        average_inputs *= 1-decay
+        ops.scatter_add(average_inputs, amax, X * decay)
         def history_bwd(dY, sgd=None):
             dX_hist = bp_Y(dY, sgd=sgd)
             dX = dX_hist[:, :X.shape[1]]
-            return dX
+            return ops.xp.ascontiguousarray(dX)
         return Y, history_bwd
     model = wrap(history_fwd, layer)
-    if layer.nI:
-        model.history = average_inputs
-    else:
-        model.history = None
+    model.history = None
     return model