From 8f66176b2dd1196d90ab7c72b7cca5080ad98314 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Mon, 5 Jul 2021 18:17:10 +0900
Subject: [PATCH] Fix loss?

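This rewrites the loss so that it no longer uses the Thinc crossentropy
code at all. The main difference is that the negative predictions (those
where top_gscores is zero) are masked out (= marginalized over) when
building the target distribution, but the gradient for the negatives is
still reflected, because the target is subtracted from a softmax taken
over all candidates.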

I'm still not sure this is exactly right, but models seem to train
reliably now.
---
 spacy/ml/models/coref.py |  2 +-
 spacy/pipeline/coref.py  | 11 +++++++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/spacy/ml/models/coref.py b/spacy/ml/models/coref.py
index 2545f7325..33c278b3d 100644
--- a/spacy/ml/models/coref.py
+++ b/spacy/ml/models/coref.py
@@ -394,7 +394,7 @@ def ant_scorer_forward(
         # now add the placeholder
         placeholder = ops.alloc2f(scores.shape[0], 1)
         top_scores = xp.concatenate( (placeholder, top_scores), 1)
-        top_scores = ops.softmax(top_scores, axis=1)
+        #top_scores = ops.softmax(top_scores, axis=1)
 
         out.append((top_scores, top_scores_idx))
 
diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py
index 2f9baaeb4..f040e6637 100644
--- a/spacy/pipeline/coref.py
+++ b/spacy/pipeline/coref.py
@@ -1,4 +1,5 @@
 from typing import Iterable, Tuple, Optional, Dict, Callable, Any, List
+import warnings
 
 from thinc.types import Floats2d, Ints2d
 from thinc.api import Model, Config, Optimizer, CategoricalCrossentropy
@@ -305,9 +306,15 @@ class CoreferenceResolver(TrainablePipe):
             # boolean to float
             top_gscores = ops.asarray2f(top_gscores)
 
-            grad, loss = self.loss(cscores.T, top_gscores.T)
+            with warnings.catch_warnings():
+                warnings.filterwarnings('ignore', category=RuntimeWarning)
+                log_marg = ops.softmax(cscores + ops.xp.log(top_gscores), axis=1)
+            log_norm = ops.softmax(cscores, axis=1)
+            grad = log_norm - log_marg
+            # XXX might be better to not square this
+            loss = (grad ** 2).sum()
 
-            gradients.append((grad.T, cidx))
+            gradients.append((grad, cidx))
             total_loss += float(loss)
 
             offset = hi