diff --git a/spacy/ml/models/coref.py b/spacy/ml/models/coref.py
index 2155d489c..e77797d4a 100644
--- a/spacy/ml/models/coref.py
+++ b/spacy/ml/models/coref.py
@@ -387,7 +387,13 @@ def ant_scorer_forward(
 
         scores = pw_prod + pw_sum + mask
 
-        top_scores, top_scores_idx = topk(xp, scores, min(ant_limit, len(scores)))
+        top_limit = min(ant_limit, len(scores))
+        top_scores, top_scores_idx = topk(xp, scores, top_limit)
+        # now add the placeholder
+        placeholder = ops.alloc2f(scores.shape[0], 1)
+        top_scores = xp.concatenate((placeholder, top_scores), 1)
+        top_scores = ops.softmax(top_scores, axis=1)
+
         out.append((top_scores, top_scores_idx))
 
         # In the full model these scores can be further refined. In the current
@@ -414,6 +420,8 @@ def ant_scorer_forward(
 
         offset = 0
         for dy, (prod_back, pw_sum_back), ll in zip(dYscores, backprops, veclens):
             dyscore, dyidx = dy
+            # remove the placeholder
+            dyscore = dyscore[:, 1:]
             # the full score grid is square
             fullscore = ops.alloc2f(ll, ll)
diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py
index 4caf02359..f0ae62fa9 100644
--- a/spacy/pipeline/coref.py
+++ b/spacy/pipeline/coref.py
@@ -142,10 +142,6 @@ class CoreferenceResolver(TrainablePipe):
             starts = idxs[offset:hi, 0]
             ends = idxs[offset:hi, 1]
 
-            # need to add the placeholder
-            placeholder = self.model.ops.alloc2f(cscores.shape[0], 1)
-            cscores = xp.concatenate((placeholder, cscores), 1)
-
             predicted = get_predicted_clusters(xp, starts, ends, ant_idxs, cscores)
             clusters_by_doc.append(predicted)
         return clusters_by_doc
@@ -291,9 +287,8 @@ class CoreferenceResolver(TrainablePipe):
 
         offset = 0
         gradients = []
-        loss = 0
+        total_loss = 0
         for example, (cscores, cidx) in zip(examples, score_matrix):
-            # assume cids has absolute mention ids
             ll = cscores.shape[0]
             hi = offset + ll
 
@@ -310,20 +305,14 @@ class CoreferenceResolver(TrainablePipe):
             # boolean to float
             top_gscores = ops.asarray2f(top_gscores)
 
-            # add the placeholder to cscores
-            placeholder = self.model.ops.alloc2f(ll, 1)
-            cscores = xp.concatenate((placeholder, cscores), 1)
+            grad, loss = self.loss(cscores.T, top_gscores.T)
 
-            # do softmax to cscores
-            cscores = ops.softmax(cscores, axis=1)
+            gradients.append((grad.T, cidx))
+            total_loss += float(loss)
 
-            diff = self.loss.get_grad(cscores.T, top_gscores.T).T
-            diff = diff[:, 1:]
-            gradients.append((diff, cidx))
+            offset = hi
 
-            loss += float(self.loss.get_loss(cscores.T, top_gscores.T))
-            offset += ll
-        return loss, gradients
+        return total_loss, gradients
 
     def initialize(
         self,
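
For context, here is a minimal sketch of the placeholder trick this diff moves from the pipeline into the model. It uses plain NumPy with a hand-rolled softmax and made-up scores (not thinc's `ops.softmax` or the real `top_scores`): a zero-score column is prepended before the softmax, so each mention can select "no antecedent" as an option, since exp(0) = 1 contributes a fixed unnormalized mass to the normalizer.

```python
import numpy as np

def softmax(x, axis=-1):
    # numerically stable row-wise softmax
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

# hypothetical antecedent scores: one row per mention, one column per
# top-k candidate antecedent (a stand-in for the real top_scores)
scores = np.array([[2.0, 0.5],
                   [0.1, -1.0]])

# the placeholder: a fixed score of 0 for the "no antecedent" option
placeholder = np.zeros((scores.shape[0], 1), dtype=scores.dtype)
probs = softmax(np.concatenate((placeholder, scores), axis=1), axis=1)

# probs[:, 0] is now P(no antecedent) for each mention, and probs[:, 1:]
# stays aligned with top_scores_idx -- which is why the backward pass in
# the first file drops the first column (dyscore[:, 1:]) before scattering
# gradients back into the full score grid
```

Handling the placeholder and softmax inside `ant_scorer_forward` means both `predict` and `get_loss` now receive already-normalized scores with the placeholder column in place, which is what allows the pipeline-side concatenate/softmax/slice bookkeeping to be deleted. It also lets the two `get_grad`/`get_loss` calls collapse into a single `self.loss(...)` call, since thinc loss objects return a `(gradient, loss)` tuple when called directly.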