diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py
index 7a25e7574..1c20c622b 100644
--- a/spacy/ml/_precomputable_affine.py
+++ b/spacy/ml/_precomputable_affine.py
@@ -26,7 +26,11 @@ def forward(model, X, is_train):
     Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP, zeros=False)
     model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:])
     Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
-    Yf[0] = model.get_param("pad")
+
+    # Set padding. Yf[0] has shape (nF, nO, nP), but the "pad" parameter has
+    # shape (1, nF, nO, nP). We cannot change the parameter's shape without
+    # breaking existing models, so squeeze its first dimension here instead.
+    Yf[0] = model.ops.xp.squeeze(model.get_param("pad"), 0)
 
     def backward(dY_ids):
         # This backprop is particularly tricky, because we get back a different