From db422abf011fb9b0dabde5e22b9d7fa0b05424b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Fri, 18 Mar 2022 16:24:26 +0100 Subject: [PATCH 01/18] remove unnecessary .device --- spacy/ml/models/coref.py | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/spacy/ml/models/coref.py b/spacy/ml/models/coref.py index f40a4c110..fea4bc21a 100644 --- a/spacy/ml/models/coref.py +++ b/spacy/ml/models/coref.py @@ -38,14 +38,11 @@ def build_wl_coref_model( except ValueError: # happens with transformer listener dim = 768 - + with Model.define_operators({">>": chain}): # TODO chain tok2vec with these models - # TODO fix device - should be automatic - device = "cuda:0" coref_scorer = PyTorchWrapper( CorefScorer( - device, dim, embedding_size, hidden_size, @@ -65,7 +62,6 @@ def build_wl_coref_model( # TODO this was hardcoded to 1024, check hidden_size, sp_embedding_size, - device ), convert_inputs=convert_span_predictor_inputs @@ -205,7 +201,6 @@ class CorefScorer(torch.nn.Module): """ def __init__( self, - device: str, dim: int, # tok2vec size dist_emb_size: int, hidden_size: int, @@ -222,8 +217,7 @@ class CorefScorer(torch.nn.Module): epochs_trained (int): the number of epochs finished (useful for warm start) """ - # device, dist_emb_size, hidden_size, n_layers, dropout_rate - self.pw = DistancePairwiseEncoder(dist_emb_size, dropout_rate).to(device) + self.pw = DistancePairwiseEncoder(dist_emb_size, dropout_rate) #TODO clean this up bert_emb = dim pair_emb = bert_emb * 3 + self.pw.shape @@ -232,7 +226,7 @@ class CorefScorer(torch.nn.Module): hidden_size, n_layers, dropout_rate - ).to(device) + ) self.lstm = torch.nn.LSTM( input_size=bert_emb, hidden_size=bert_emb, @@ -243,7 +237,7 @@ class CorefScorer(torch.nn.Module): bert_emb, dropout_rate, roughk - ).to(device) + ) self.batch_size = batch_size def forward( @@ -392,7 +386,6 @@ class AnaphoricityScorer(torch.nn.Module): return out - class RoughScorer(torch.nn.Module): """ Is needed to give a roughly estimate of the anaphoricity of two candidates, @@ -423,7 +416,6 @@ class RoughScorer(torch.nn.Module): pair_mask = torch.arange(mentions.shape[0]) pair_mask = pair_mask.unsqueeze(1) - pair_mask.unsqueeze(0) pair_mask = torch.log((pair_mask > 0).to(torch.float)) - pair_mask = pair_mask.to(mentions.device) bilinear_scores = self.dropout(self.bilinear(mentions)).mm(mentions.T) rough_scores = pair_mask + bilinear_scores @@ -450,7 +442,7 @@ class RoughScorer(torch.nn.Module): class SpanPredictor(torch.nn.Module): - def __init__(self, input_size: int, distance_emb_size: int, device): + def __init__(self, input_size: int, distance_emb_size: int): super().__init__() self.ffnn = torch.nn.Sequential( torch.nn.Linear(input_size * 2 + 64, input_size), @@ -461,7 +453,6 @@ class SpanPredictor(torch.nn.Module): torch.nn.Dropout(0.3), torch.nn.Linear(256, 64), ) - self.device = device self.conv = torch.nn.Sequential( torch.nn.Conv1d(64, 4, 3, 1, 1), torch.nn.Conv1d(4, 2, 3, 1, 1) @@ -529,6 +520,8 @@ class SpanPredictor(torch.nn.Module): valid_positions = torch.stack((valid_starts, valid_ends), dim=2) return scores + valid_positions return scores + + class DistancePairwiseEncoder(torch.nn.Module): def __init__(self, embedding_size, dropout_rate): @@ -538,17 +531,10 @@ class DistancePairwiseEncoder(torch.nn.Module): self.dropout = torch.nn.Dropout(dropout_rate) self.shape = emb_size - @property - def device(self) -> torch.device: - """ A workaround to get current device (which is assumed to be the 
-        device of the first parameter of one of the submodules) """
-        return next(self.distance_emb.parameters()).device
-
-
     def forward(self,  # type: ignore  # pylint: disable=arguments-differ  #35566 in pytorch
                 top_indices: torch.Tensor
                 ) -> torch.Tensor:
-        word_ids = torch.arange(0, top_indices.size(0), device=self.device)
+        word_ids = torch.arange(0, top_indices.size(0))
         distance = (word_ids.unsqueeze(1) - word_ids[top_indices]
                     ).clamp_min_(min=1)
         log_distance = distance.to(torch.float).log2().floor_()

From 1eaf8fb0cf01dec6d6a01f20e109eb21fd5f530d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?=
Date: Wed, 23 Mar 2022 11:24:27 +0100
Subject: [PATCH 02/18] span predictor debug start

---
 spacy/ml/models/coref.py |  9 ++++-----
 spacy/pipeline/coref.py  | 14 +++++++-------
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/spacy/ml/models/coref.py b/spacy/ml/models/coref.py
index 382d7a98b..29f3ad819 100644
--- a/spacy/ml/models/coref.py
+++ b/spacy/ml/models/coref.py
@@ -91,7 +91,7 @@ def build_span_predictor(
     # TODO fix device - should be automatic
     device = "cuda:0"
     span_predictor = PyTorchWrapper(
-        SpanPredictor(hidden_size, dist_emb_size, device),
+        SpanPredictor(dim, dist_emb_size, device),
         convert_inputs=convert_span_predictor_inputs
     )
     # TODO use proper parameter for prefix
@@ -148,7 +148,6 @@ def convert_span_predictor_inputs(
    # Normally we shoudl use the input is_train, but for these two it's not relevant
    sent_ids = xp2torch(sent_ids[0], requires_grad=False)
    head_ids = xp2torch(head_ids[0], requires_grad=False)
-
    word_features = xp2torch(tok2vec[0], requires_grad=is_train)

    argskwargs = ArgsKwargs(args=(sent_ids, word_features, head_ids), kwargs={})
@@ -557,7 +556,6 @@ class SpanPredictor(torch.nn.Module):
         sent_id = torch.tensor(sent_id, device=words.device)
         heads_ids = heads_ids.long()
         same_sent = (sent_id[heads_ids].unsqueeze(1) == sent_id.unsqueeze(0))
-
         # To save memory, only pass candidates from one sentence for each head
         # pair_matrix contains concatenated span_head_emb + candidate_emb + distance_emb
         # for each candidate among the words in the same sentence as span_head
@@ -568,11 +566,11 @@ class SpanPredictor(torch.nn.Module):
             words[cols],
             self.emb(emb_ids[rows, cols]),
         ), dim=1)
-
+        input(len(heads_ids))
         lengths = same_sent.sum(dim=1)
         padding_mask = torch.arange(0, lengths.max().item(), device=words.device).unsqueeze(0)
         padding_mask = (padding_mask < lengths.unsqueeze(1))  # [n_heads, max_sent_len]
-
+        input(padding_mask.shape)
         # [n_heads, max_sent_len, input_size * 2 + distance_emb_size]
         # This is necessary to allow the convolution layer to look at several
         # word scores
@@ -592,6 +590,7 @@ class SpanPredictor(torch.nn.Module):
             valid_positions = torch.stack((valid_starts, valid_ends), dim=2)
             return scores + valid_positions
         return scores
+

 class DistancePairwiseEncoder(torch.nn.Module):
     def __init__(self, embedding_size, dropout_rate):
diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py
index 54e9d8cfd..b3ced454c 100644
--- a/spacy/pipeline/coref.py
+++ b/spacy/pipeline/coref.py
@@ -3,7 +3,7 @@ import warnings

 from thinc.types import Floats2d, Floats3d, Ints2d
 from thinc.api import Model, Config, Optimizer, CategoricalCrossentropy
-from thinc.api import set_dropout_rate
+from thinc.api import set_dropout_rate, to_categorical
 from itertools import islice
 from statistics import mean

@@ -513,10 +513,8 @@ class SpanPredictor(TrainablePipe):
         total_loss = 0

         for eg in examples:
-            preds, backprop = self.model.begin_update([eg.predicted])
-            score_matrix, mention_idx = preds
-
-            loss, d_scores = self.get_loss([eg], score_matrix, mention_idx)
+            span_scores, backprop = self.model.begin_update([eg.predicted])
+            loss, d_scores = self.get_loss([eg], span_scores)
             total_loss += loss
             # TODO check shape here
             backprop((d_scores, mention_idx))
@@ -573,8 +571,10 @@ class SpanPredictor(TrainablePipe):
         for cluster in gold:
             for mention in cluster:
                 starts.append(mention[0])
-                ends.append(mention[1])
-
+                # XXX I think this was missing here
+                ends.append(mention[1] - 1)
+        starts = self.model.ops.xp.asarray(starts)
+        ends = self.model.ops.xp.asarray(ends)
         start_scores = span_scores[:, :, 0]
         end_scores = span_scores[:, :, 1]
         n_classes = start_scores.shape[1]

From 706b2e6f25cc98e4be47adf5c0b8b968158019cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?=
Date: Thu, 24 Mar 2022 16:06:20 +0100
Subject: [PATCH 03/18] gearing up SpanPredictor for gold-heads

---
 spacy/ml/models/coref.py | 34 ++++++++++++++++++++--------------
 spacy/pipeline/coref.py  | 20 +++++++++++++++-----
 2 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/spacy/ml/models/coref.py b/spacy/ml/models/coref.py
index 179de7e58..3350a8dd9 100644
--- a/spacy/ml/models/coref.py
+++ b/spacy/ml/models/coref.py
@@ -53,7 +53,6 @@ def build_wl_coref_model(
         convert_inputs=convert_coref_scorer_inputs,
         convert_outputs=convert_coref_scorer_outputs
     )
-
     coref_model = tok2vec >> coref_scorer
     # XXX just ignore this until the coref scorer is integrated
     span_predictor = PyTorchWrapper(
@@ -62,7 +61,6 @@ def build_wl_coref_model(
             hidden_size,
             sp_embedding_size,
         ),
-
        convert_inputs=convert_span_predictor_inputs
    )
    # TODO combine models so output is uniform (just one forward pass)
@@ -84,14 +82,15 @@ def build_span_predictor(
         dim = 768

     with Model.define_operators({">>": chain, "&": tuplify}):
-        # TODO fix device - should be automatic
-        device = "cuda:0"
         span_predictor = PyTorchWrapper(
-            SpanPredictor(dim, dist_emb_size, device),
+            SpanPredictor(dim, dist_emb_size),
             convert_inputs=convert_span_predictor_inputs
         )
         # TODO use proper parameter for prefix
-        head_info = build_get_head_metadata("coref_head_clusters")
+        head_info = build_get_head_metadata(
+            "span_coref_head_clusters",
+            "coref_head_clusters"
+        )
         model = (tok2vec & head_info) >> span_predictor

     return model
@@ -148,7 +147,7 @@ def convert_span_predictor_inputs(
     argskwargs = ArgsKwargs(args=(sent_ids, word_features, head_ids), kwargs={})
     # TODO actually support backprop
-    return argskwargs, lambda dX: []
+    return argskwargs, lambda dX: [[]]

 # TODO This probably belongs in the component, not the model.
 def predict_span_clusters(span_predictor: Model,
@@ -217,18 +216,27 @@ def _clusterize(
         clusters.append(sorted(cluster))
     return sorted(clusters)

-def build_get_head_metadata(prefix):
+
+def build_get_head_metadata(update_prefix, predict_prefix):
     # TODO this name is awful, fix it
-    model = Model("HeadDataProvider", attrs={"prefix": prefix}, forward=head_data_forward)
+    model = Model("HeadDataProvider",
+                  attrs={
+                      "update_prefix": update_prefix,
+                      "predict_prefix": predict_prefix
+                  },
+                  forward=head_data_forward)
     return model

+
 def head_data_forward(model, docs, is_train):
     """A layer to generate the extra data needed for the span predictor.
""" sent_ids = [] head_ids = [] - prefix = model.attrs["prefix"] - + if is_train: + prefix = model.attrs["update_prefix"] + else: + prefix = model.attrs["predict_prefix"] for doc in docs: sids = model.ops.asarray2i(get_sentence_ids(doc)) sent_ids.append(sids) @@ -241,7 +249,7 @@ def head_data_forward(model, docs, is_train): heads.append(span[0].i) heads = model.ops.asarray2i(heads) head_ids.append(heads) - + # each of these is a list with one entry per doc # backprop is just a placeholder # TODO it would probably be better to have a list of tuples than two lists of arrays @@ -557,11 +565,9 @@ class SpanPredictor(torch.nn.Module): words[cols], self.emb(emb_ids[rows, cols]), ), dim=1) - input(len(heads_ids)) lengths = same_sent.sum(dim=1) padding_mask = torch.arange(0, lengths.max().item(), device=words.device).unsqueeze(0) padding_mask = (padding_mask < lengths.unsqueeze(1)) # [n_heads, max_sent_len] - input(padding_mask.shape) # [n_heads, max_sent_len, input_size * 2 + distance_emb_size] # This is necessary to allow the convolution layer to look at several # word scores diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index b3ced454c..f37f777fc 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -417,6 +417,7 @@ DEFAULT_SPAN_PREDICTOR_MODEL = Config().from_str(default_span_predictor_config)[ default_config={ "model": DEFAULT_SPAN_PREDICTOR_MODEL, "input_prefix": "coref_head_clusters", + "target_prefix": "span_head_target_clusters", "output_prefix": "coref_clusters", }, default_score_weights={"span_predictor_f": 1.0, "span_predictor_p": None, "span_predictor_r": None}, @@ -426,6 +427,7 @@ def make_span_predictor( name: str, model, input_prefix: str = "coref_head_clusters", + target_prefix: str = "span_head_target_clusters", output_prefix: str = "coref_clusters", ) -> "SpanPredictor": """Create a SpanPredictor component.""" @@ -444,12 +446,14 @@ class SpanPredictor(TrainablePipe): name: str = "span_predictor", *, input_prefix: str = "coref_head_clusters", + target_prefix: str = "span_coref_head_clusters", output_prefix: str = "coref_clusters", ) -> None: self.vocab = vocab self.model = model self.name = name self.input_prefix = input_prefix + self.target_prefix = target_prefix self.output_prefix = output_prefix self.cfg = {} @@ -511,13 +515,18 @@ class SpanPredictor(TrainablePipe): set_dropout_rate(self.model, drop) total_loss = 0 - - for eg in examples: - span_scores, backprop = self.model.begin_update([eg.predicted]) + docs = [eg.predicted for eg in examples] + for doc, eg in zip(docs, examples): + # replicates the EntityLinker's behaviour and + # copies annotations over https://bit.ly/3iweDcW + for key, sg in eg.reference.spans.items(): + if key.startswith(self.target_prefix): + doc.spans[key] = [doc[span.start:span.end] for span in sg] + span_scores, backprop = self.model.begin_update([doc]) loss, d_scores = self.get_loss([eg], span_scores) total_loss += loss # TODO check shape here - backprop((d_scores, mention_idx)) + backprop(d_scores) if sgd is not None: self.finish_update(sgd) @@ -564,7 +573,7 @@ class SpanPredictor(TrainablePipe): for eg in examples: # get gold data - gold = doc2clusters(eg.reference, self.output_prefix) + gold = doc2clusters(eg.predicted, self.target_prefix) # flatten the gold data starts = [] ends = [] @@ -605,6 +614,7 @@ class SpanPredictor(TrainablePipe): doc = ex.predicted assert len(doc) > 2, "Coreference requires at least two tokens" doc.spans[f"{self.input_prefix}_0"] = [doc[0:1], doc[1:2]] + 
doc.spans[f"{self.target_prefix}_0"] = [doc[0:1], doc[1:2]] X.append(ex.predicted) Y.append(ex.reference) From 1c5dabcb47f89635a9a5c529f48abaac694fcf4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Thu, 24 Mar 2022 16:23:12 +0100 Subject: [PATCH 04/18] merge SpanPredictor attributes --- spacy/ml/models/coref.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/spacy/ml/models/coref.py b/spacy/ml/models/coref.py index 5fbb64a29..5fe29c25f 100644 --- a/spacy/ml/models/coref.py +++ b/spacy/ml/models/coref.py @@ -83,7 +83,7 @@ def build_span_predictor( with Model.define_operators({">>": chain, "&": tuplify}): span_predictor = PyTorchWrapper( - SpanPredictor(dim, dist_emb_size), + SpanPredictor(dim, hidden_size, dist_emb_size), convert_inputs=convert_span_predictor_inputs ) # TODO use proper parameter for prefix @@ -511,11 +511,7 @@ class RoughScorer(torch.nn.Module): class SpanPredictor(torch.nn.Module): -<<<<<<< HEAD - def __init__(self, input_size: int, distance_emb_size: int): -======= def __init__(self, input_size: int, hidden_size: int, dist_emb_size: int, device): ->>>>>>> eec00ce60d83f500e18f2da7d9feafa7143440f2 super().__init__() # input size = single token size # 64 = probably distance emb size From 83ac0477c8e73b3676a8614368f430d3e9ae6fa4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Thu, 24 Mar 2022 16:44:50 +0100 Subject: [PATCH 05/18] remove useless extra prefix and device from spanpredictor --- spacy/ml/models/coref.py | 41 +++++++++++++++++----------------------- spacy/pipeline/coref.py | 10 +++------- 2 files changed, 20 insertions(+), 31 deletions(-) diff --git a/spacy/ml/models/coref.py b/spacy/ml/models/coref.py index 5fe29c25f..71082e7ac 100644 --- a/spacy/ml/models/coref.py +++ b/spacy/ml/models/coref.py @@ -55,14 +55,14 @@ def build_wl_coref_model( ) coref_model = tok2vec >> coref_scorer # XXX just ignore this until the coref scorer is integrated - span_predictor = PyTorchWrapper( - SpanPredictor( - # TODO this was hardcoded to 1024, check - hidden_size, - sp_embedding_size, - ), - convert_inputs=convert_span_predictor_inputs - ) + # span_predictor = PyTorchWrapper( + # SpanPredictor( + # TODO this was hardcoded to 1024, check + # hidden_size, + # sp_embedding_size, + # ), + # convert_inputs=convert_span_predictor_inputs + # ) # TODO combine models so output is uniform (just one forward pass) # It may be reasonable to have an option to disable span prediction, # and just return words as spans. 
@@ -88,7 +88,6 @@ def build_span_predictor( ) # TODO use proper parameter for prefix head_info = build_get_head_metadata( - "span_coref_head_clusters", "coref_head_clusters" ) model = (tok2vec & head_info) >> span_predictor @@ -217,13 +216,10 @@ def _clusterize( return sorted(clusters) -def build_get_head_metadata(update_prefix, predict_prefix): +def build_get_head_metadata(prefix): # TODO this name is awful, fix it model = Model("HeadDataProvider", - attrs={ - "update_prefix": update_prefix, - "predict_prefix": predict_prefix - }, + attrs={'prefix': prefix}, forward=head_data_forward) return model @@ -233,10 +229,7 @@ def head_data_forward(model, docs, is_train): """ sent_ids = [] head_ids = [] - if is_train: - prefix = model.attrs["update_prefix"] - else: - prefix = model.attrs["predict_prefix"] + prefix = model.attrs["prefix"] for doc in docs: sids = model.ops.asarray2i(get_sentence_ids(doc)) sent_ids.append(sids) @@ -511,7 +504,7 @@ class RoughScorer(torch.nn.Module): class SpanPredictor(torch.nn.Module): - def __init__(self, input_size: int, hidden_size: int, dist_emb_size: int, device): + def __init__(self, input_size: int, hidden_size: int, dist_emb_size: int): super().__init__() # input size = single token size # 64 = probably distance emb size @@ -551,13 +544,13 @@ class SpanPredictor(torch.nn.Module): torch.Tensor: span start/end scores, [n_heads, n_words, 2] """ # Obtain distance embedding indices, [n_heads, n_words] - relative_positions = (heads_ids.unsqueeze(1) - torch.arange(words.shape[0], device=words.device).unsqueeze(0)) + relative_positions = (heads_ids.unsqueeze(1) - torch.arange(words.shape[0]).unsqueeze(0)) # make all valid distances positive emb_ids = relative_positions + 63 # "too_far" emb_ids[(emb_ids < 0) + (emb_ids > 126)] = 127 # Obtain "same sentence" boolean mask, [n_heads, n_words] - sent_id = torch.tensor(sent_id, device=words.device) + sent_id = torch.tensor(sent_id) heads_ids = heads_ids.long() same_sent = (sent_id[heads_ids].unsqueeze(1) == sent_id.unsqueeze(0)) # To save memory, only pass candidates from one sentence for each head @@ -571,18 +564,18 @@ class SpanPredictor(torch.nn.Module): self.emb(emb_ids[rows, cols]), ), dim=1) lengths = same_sent.sum(dim=1) - padding_mask = torch.arange(0, lengths.max().item(), device=words.device).unsqueeze(0) + padding_mask = torch.arange(0, lengths.max().item()).unsqueeze(0) padding_mask = (padding_mask < lengths.unsqueeze(1)) # [n_heads, max_sent_len] # [n_heads, max_sent_len, input_size * 2 + distance_emb_size] # This is necessary to allow the convolution layer to look at several # word scores - padded_pairs = torch.zeros(*padding_mask.shape, pair_matrix.shape[-1], device=words.device) + padded_pairs = torch.zeros(*padding_mask.shape, pair_matrix.shape[-1]) padded_pairs[padding_mask] = pair_matrix res = self.ffnn(padded_pairs) # [n_heads, n_candidates, last_layer_output] res = self.conv(res.permute(0, 2, 1)).permute(0, 2, 1) # [n_heads, n_candidates, 2] - scores = torch.full((heads_ids.shape[0], words.shape[0], 2), float('-inf'), device=words.device) + scores = torch.full((heads_ids.shape[0], words.shape[0], 2), float('-inf')) scores[rows, cols] = res[padding_mask] # Make sure that start <= head <= end during inference diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index f37f777fc..eb05011ec 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -417,7 +417,6 @@ DEFAULT_SPAN_PREDICTOR_MODEL = Config().from_str(default_span_predictor_config)[ default_config={ "model": 
DEFAULT_SPAN_PREDICTOR_MODEL, "input_prefix": "coref_head_clusters", - "target_prefix": "span_head_target_clusters", "output_prefix": "coref_clusters", }, default_score_weights={"span_predictor_f": 1.0, "span_predictor_p": None, "span_predictor_r": None}, @@ -427,7 +426,6 @@ def make_span_predictor( name: str, model, input_prefix: str = "coref_head_clusters", - target_prefix: str = "span_head_target_clusters", output_prefix: str = "coref_clusters", ) -> "SpanPredictor": """Create a SpanPredictor component.""" @@ -446,14 +444,12 @@ class SpanPredictor(TrainablePipe): name: str = "span_predictor", *, input_prefix: str = "coref_head_clusters", - target_prefix: str = "span_coref_head_clusters", output_prefix: str = "coref_clusters", ) -> None: self.vocab = vocab self.model = model self.name = name self.input_prefix = input_prefix - self.target_prefix = target_prefix self.output_prefix = output_prefix self.cfg = {} @@ -519,8 +515,9 @@ class SpanPredictor(TrainablePipe): for doc, eg in zip(docs, examples): # replicates the EntityLinker's behaviour and # copies annotations over https://bit.ly/3iweDcW + # takes 'coref_head_clusters' from the reference. for key, sg in eg.reference.spans.items(): - if key.startswith(self.target_prefix): + if key.startswith(self.input_prefix): doc.spans[key] = [doc[span.start:span.end] for span in sg] span_scores, backprop = self.model.begin_update([doc]) loss, d_scores = self.get_loss([eg], span_scores) @@ -573,7 +570,7 @@ class SpanPredictor(TrainablePipe): for eg in examples: # get gold data - gold = doc2clusters(eg.predicted, self.target_prefix) + gold = doc2clusters(eg.predicted, self.input_prefix) # flatten the gold data starts = [] ends = [] @@ -614,7 +611,6 @@ class SpanPredictor(TrainablePipe): doc = ex.predicted assert len(doc) > 2, "Coreference requires at least two tokens" doc.spans[f"{self.input_prefix}_0"] = [doc[0:1], doc[1:2]] - doc.spans[f"{self.target_prefix}_0"] = [doc[0:1], doc[1:2]] X.append(ex.predicted) Y.append(ex.reference) From 7304604edd6238d16f156b3f30db40d809f1a440 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Fri, 25 Mar 2022 18:29:33 +0100 Subject: [PATCH 06/18] make sure predicted and reference keeps aligned --- spacy/pipeline/coref.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index eb05011ec..99bb611ff 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -130,7 +130,6 @@ class CoreferenceResolver(TrainablePipe): DOCS: https://spacy.io/api/coref#predict (TODO) """ - #print("DOCS", docs) out = [] for doc in docs: scores, idxs = self.model.predict([doc]) @@ -212,7 +211,6 @@ class CoreferenceResolver(TrainablePipe): # TODO check this causes no issues (in practice it runs) preds, backprop = self.model.begin_update([eg.predicted]) score_matrix, mention_idx = preds - loss, d_scores = self.get_loss([eg], score_matrix, mention_idx) total_loss += loss # TODO check shape here @@ -518,7 +516,8 @@ class SpanPredictor(TrainablePipe): # takes 'coref_head_clusters' from the reference. 
for key, sg in eg.reference.spans.items(): if key.startswith(self.input_prefix): - doc.spans[key] = [doc[span.start:span.end] for span in sg] + aligned_spans = eg.get_aligned_spans_x2y(sg) + doc.spans[key] = [doc[span.start:span.end] for span in aligned_spans] span_scores, backprop = self.model.begin_update([doc]) loss, d_scores = self.get_loss([eg], span_scores) total_loss += loss @@ -600,7 +599,7 @@ class SpanPredictor(TrainablePipe): *, nlp: Optional[Language] = None, ) -> None: - validate_get_examples(get_examples, "CoreferenceResolver.initialize") + validate_get_examples(get_examples, "SpanPredictor.initialize") X = [] Y = [] From 4fc40340f94d6dc47398dfa264804723b7e52b65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Mon, 28 Mar 2022 11:28:21 +0200 Subject: [PATCH 07/18] handle empty head_ids --- spacy/ml/models/coref.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/spacy/ml/models/coref.py b/spacy/ml/models/coref.py index 71082e7ac..7972f9160 100644 --- a/spacy/ml/models/coref.py +++ b/spacy/ml/models/coref.py @@ -133,6 +133,7 @@ def convert_coref_scorer_outputs( indices_xp = torch2xp(indices) return (scores_xp, indices_xp), convert_for_torch_backward + def convert_span_predictor_inputs( model: Model, X: Tuple[Ints1d, Floats2d, Ints1d], @@ -141,13 +142,17 @@ def convert_span_predictor_inputs( tok2vec, (sent_ids, head_ids) = X # Normally we shoudl use the input is_train, but for these two it's not relevant sent_ids = xp2torch(sent_ids[0], requires_grad=False) - head_ids = xp2torch(head_ids[0], requires_grad=False) + if not head_ids[0].size: + head_ids = torch.empty(size=(0,)) + else: + head_ids = xp2torch(head_ids[0], requires_grad=False) word_features = xp2torch(tok2vec[0], requires_grad=is_train) argskwargs = ArgsKwargs(args=(sent_ids, word_features, head_ids), kwargs={}) # TODO actually support backprop return argskwargs, lambda dX: [[]] + # TODO This probably belongs in the component, not the model. 
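
The placeholder backprop above (`lambda dX: [[]]`) stands in for the gradient-mapping callback that thinc's PyTorchWrapper expects from `convert_inputs`: the hook must package the thinc arrays into an ArgsKwargs of torch tensors and return a callback that maps the torch gradients back into thinc arrays (a real callback lands in PATCH 15 below). A rough sketch of that contract on a single-array input, with a hypothetical wrapped module:

    import torch
    from thinc.api import PyTorchWrapper, ArgsKwargs, xp2torch, torch2xp

    def convert_inputs(model, X, is_train):
        X_torch = xp2torch(X, requires_grad=is_train)

        def backprop(d_inputs: ArgsKwargs):
            # torch gradients for our args come back inside an ArgsKwargs
            return torch2xp(d_inputs.args[0])

        return ArgsKwargs(args=(X_torch,), kwargs={}), backprop

    wrapped = PyTorchWrapper(torch.nn.Linear(4, 2), convert_inputs=convert_inputs)
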
def predict_span_clusters(span_predictor: Model, sent_ids: Ints1d, @@ -543,6 +548,9 @@ class SpanPredictor(torch.nn.Module): Returns: torch.Tensor: span start/end scores, [n_heads, n_words, 2] """ + # If we don't receive heads, return empty + if heads_ids.nelement() == 0: + return torch.empty(size=(0,)) # Obtain distance embedding indices, [n_heads, n_words] relative_positions = (heads_ids.unsqueeze(1) - torch.arange(words.shape[0]).unsqueeze(0)) # make all valid distances positive @@ -550,7 +558,6 @@ class SpanPredictor(torch.nn.Module): # "too_far" emb_ids[(emb_ids < 0) + (emb_ids > 126)] = 127 # Obtain "same sentence" boolean mask, [n_heads, n_words] - sent_id = torch.tensor(sent_id) heads_ids = heads_ids.long() same_sent = (sent_id[heads_ids].unsqueeze(1) == sent_id.unsqueeze(0)) # To save memory, only pass candidates from one sentence for each head From e4b4b67ef6f627f7cd9cd313ab9274779c16c971 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Mon, 28 Mar 2022 11:29:00 +0200 Subject: [PATCH 08/18] handle empty clusters --- spacy/pipeline/coref.py | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index 99bb611ff..5a4fa1ab9 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -458,27 +458,29 @@ class SpanPredictor(TrainablePipe): out = [] for doc in docs: # TODO check shape here - span_scores = self.model.predict(doc) - span_scores = span_scores[0] - # the information about clustering has to come from the input docs - # first let's convert the scores to a list of span idxs - start_scores = span_scores[:, :, 0] - end_scores = span_scores[:, :, 1] - starts = start_scores.argmax(axis=1) - ends = end_scores.argmax(axis=1) + span_scores = self.model.predict([doc]) + if span_scores.size: + # the information about clustering has to come from the input docs + # first let's convert the scores to a list of span idxs + start_scores = span_scores[:, :, 0] + end_scores = span_scores[:, :, 1] + starts = start_scores.argmax(axis=1) + ends = end_scores.argmax(axis=1) - # TODO check start < end + # TODO check start < end - # get the old clusters (shape will be preserved) - clusters = doc2clusters(doc, self.input_prefix) - cidx = 0 - out_clusters = [] - for cluster in clusters: - ncluster = [] - for mention in cluster: - ncluster.append( (starts[cidx], ends[cidx]) ) - cidx += 1 - out_clusters.append(ncluster) + # get the old clusters (shape will be preserved) + clusters = doc2clusters(doc, self.input_prefix) + cidx = 0 + out_clusters = [] + for cluster in clusters: + ncluster = [] + for mention in cluster: + ncluster.append((starts[cidx], ends[cidx])) + cidx += 1 + out_clusters.append(ncluster) + else: + out_clusters = [] out.append(out_clusters) return out @@ -628,7 +630,6 @@ class SpanPredictor(TrainablePipe): # XXX this is the only different part p_clusters = doc2clusters(ex.predicted, self.output_prefix) g_clusters = doc2clusters(ex.reference, self.output_prefix) - cluster_info = get_cluster_info(p_clusters, g_clusters) evaluator.update(cluster_info) From 06d680b269c87059ca1fd0381f025a2bcc60c5ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Mon, 28 Mar 2022 14:31:51 +0200 Subject: [PATCH 09/18] addressing suggestions by @polm --- spacy/pipeline/coref.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index 5a4fa1ab9..340dde470 100644 --- 
a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -511,20 +511,24 @@ class SpanPredictor(TrainablePipe): set_dropout_rate(self.model, drop) total_loss = 0 - docs = [eg.predicted for eg in examples] - for doc, eg in zip(docs, examples): + old_spans = [eg.predicted.spans for eg in examples] + for eg in examples: # replicates the EntityLinker's behaviour and # copies annotations over https://bit.ly/3iweDcW - # takes 'coref_head_clusters' from the reference. + # https://github.com/explosion/spaCy/blob/master/spacy/pipeline/entity_linker.py#L313 + doc = eg.predicted for key, sg in eg.reference.spans.items(): if key.startswith(self.input_prefix): - aligned_spans = eg.get_aligned_spans_x2y(sg) - doc.spans[key] = [doc[span.start:span.end] for span in aligned_spans] + doc.spans[key] = eg.get_aligned_spans_y2x(sg) span_scores, backprop = self.model.begin_update([doc]) loss, d_scores = self.get_loss([eg], span_scores) total_loss += loss # TODO check shape here backprop(d_scores) + # Restore examples + for spans, eg in zip(old_spans, examples): + for key, sg in spans.items(): + eg.predicted.spans[key] = sg if sgd is not None: self.finish_update(sgd) From 7ff99a3acc38cf7202fc269f32774d3e1f613d43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Mon, 28 Mar 2022 18:16:41 +0200 Subject: [PATCH 10/18] nicer restore --- spacy/pipeline/coref.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index 340dde470..f0862c844 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -364,9 +364,7 @@ class CoreferenceResolver(TrainablePipe): for ex in examples: p_clusters = doc2clusters(ex.predicted, self.span_cluster_prefix) g_clusters = doc2clusters(ex.reference, self.span_cluster_prefix) - cluster_info = get_cluster_info(p_clusters, g_clusters) - evaluator.update(cluster_info) score = { @@ -511,12 +509,12 @@ class SpanPredictor(TrainablePipe): set_dropout_rate(self.model, drop) total_loss = 0 - old_spans = [eg.predicted.spans for eg in examples] for eg in examples: # replicates the EntityLinker's behaviour and # copies annotations over https://bit.ly/3iweDcW # https://github.com/explosion/spaCy/blob/master/spacy/pipeline/entity_linker.py#L313 doc = eg.predicted + old_spans = eg.predicted.spans for key, sg in eg.reference.spans.items(): if key.startswith(self.input_prefix): doc.spans[key] = eg.get_aligned_spans_y2x(sg) @@ -525,9 +523,8 @@ class SpanPredictor(TrainablePipe): total_loss += loss # TODO check shape here backprop(d_scores) - # Restore examples - for spans, eg in zip(old_spans, examples): - for key, sg in spans.items(): + # Restore example + for key, sg in old_spans.items(): eg.predicted.spans[key] = sg if sgd is not None: From 63a41ba50abd16c8b945bb39d8beff2879031cc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Wed, 30 Mar 2022 17:28:20 +0200 Subject: [PATCH 11/18] fix score overwriting bug --- spacy/pipeline/coref.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index f0862c844..25a353405 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -636,14 +636,14 @@ class SpanPredictor(TrainablePipe): evaluator.update(cluster_info) score = { - "coref_f": evaluator.get_f1(), - "coref_p": evaluator.get_precision(), - "coref_r": evaluator.get_recall(), + "coref_span_f": evaluator.get_f1(), + "coref_span_p": evaluator.get_precision(), + "coref_span_r": 
evaluator.get_recall(), } scores.append(score) out = {} for field in ("f", "p", "r"): - fname = f"coref_{field}" + fname = f"coref_span_{field}" out[fname] = mean([ss[fname] for ss in scores]) return out From a1d021990379203a523c4c8683ce1bff620650f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Mon, 4 Apr 2022 15:26:15 +0200 Subject: [PATCH 12/18] prepare for aligned heads-spans training --- spacy/pipeline/coref.py | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index 25a353405..1c0e56521 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -503,29 +503,20 @@ class SpanPredictor(TrainablePipe): losses = {} losses.setdefault(self.name, 0.0) validate_examples(examples, "SpanPredictor.update") - if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples): + if not any(len(eg.reference) if eg.reference else 0 for eg in examples): # Handle cases where there are no tokens in any docs. return losses set_dropout_rate(self.model, drop) total_loss = 0 for eg in examples: - # replicates the EntityLinker's behaviour and - # copies annotations over https://bit.ly/3iweDcW - # https://github.com/explosion/spaCy/blob/master/spacy/pipeline/entity_linker.py#L313 - doc = eg.predicted - old_spans = eg.predicted.spans - for key, sg in eg.reference.spans.items(): - if key.startswith(self.input_prefix): - doc.spans[key] = eg.get_aligned_spans_y2x(sg) - span_scores, backprop = self.model.begin_update([doc]) + # For update we use the gold coref_head_clusters + # in the reference. + span_scores, backprop = self.model.begin_update([eg.reference]) loss, d_scores = self.get_loss([eg], span_scores) total_loss += loss # TODO check shape here backprop(d_scores) - # Restore example - for key, sg in old_spans.items(): - eg.predicted.spans[key] = sg if sgd is not None: self.finish_update(sgd) @@ -570,17 +561,14 @@ class SpanPredictor(TrainablePipe): # span_scores is a Floats3d. What are the axes? 
mention x token x start/end for eg in examples: - - # get gold data - gold = doc2clusters(eg.predicted, self.input_prefix) - # flatten the gold data starts = [] ends = [] - for cluster in gold: - for mention in cluster: - starts.append(mention[0]) - # XXX I think this was missing here - ends.append(mention[1] - 1) + for key, sg in eg.reference.spans.items(): + if key.startswith(self.output_prefix): + for mention in sg: + starts.append(mention.start) + ends.append(mention.end) + starts = self.model.ops.xp.asarray(starts) ends = self.model.ops.xp.asarray(ends) start_scores = span_scores[:, :, 0] From ef141ad3995410d64cd27a615b3f17ee21d59dd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Mon, 4 Apr 2022 18:10:09 +0200 Subject: [PATCH 13/18] span accuracy score --- spacy/pipeline/coref.py | 52 +++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index 1c0e56521..c1db23d68 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -457,6 +457,7 @@ class SpanPredictor(TrainablePipe): for doc in docs: # TODO check shape here span_scores = self.model.predict([doc]) + print(span_scores) if span_scores.size: # the information about clustering has to come from the input docs # first let's convert the scores to a list of span idxs @@ -608,30 +609,35 @@ class SpanPredictor(TrainablePipe): self.model.initialize(X=X, Y=Y) def score(self, examples, **kwargs): - """Score a batch of examples.""" - # TODO This is basically the same as the main coref component - factor out? - + """ + Evaluate on reconstructing the correct spans around + gold heads. + """ scores = [] - for metric in (b_cubed, muc, ceafe): - evaluator = Evaluator(metric) + for eg in examples: + starts = [] + ends = [] + pred_starts = [] + pred_ends = [] + ref = eg.reference + pred = eg.predicted + for key, gold_sg in ref.spans.items(): + if key.startswith(self.input_prefix): + cluster_id = key.split('_')[-1] + # FIXME THIS DOESN'T WORK BECAUSE pred.spans are empty? 
+ pred_sg = pred.spans[f"{self.output_prefix}_{cluster_id}"] + for gold_mention, pred_mention in zip(gold_sg, pred_sg): + starts.append(gold_mention.start) + ends.append(gold_mention.end) + pred_starts.append(pred_mention.start) + pred_ends.append(pred_mention.end) - for ex in examples: - # XXX this is the only different part - p_clusters = doc2clusters(ex.predicted, self.output_prefix) - g_clusters = doc2clusters(ex.reference, self.output_prefix) - cluster_info = get_cluster_info(p_clusters, g_clusters) + starts = self.model.ops.xp.asarray(starts) + ends = self.model.ops.xp.asarray(ends) + pred_starts = self.model.ops.xp.asarray(pred_starts) + pred_ends = self.model.ops.xp.asarray(pred_ends) + correct = ((starts == pred_starts) * (ends == pred_ends)).sum() + scores.append(correct) - evaluator.update(cluster_info) - - score = { - "coref_span_f": evaluator.get_f1(), - "coref_span_p": evaluator.get_precision(), - "coref_span_r": evaluator.get_recall(), - } - scores.append(score) - - out = {} - for field in ("f", "p", "r"): - fname = f"coref_span_{field}" - out[fname] = mean([ss[fname] for ss in scores]) + out = {"span_accuracy": mean(scores)} return out From 3ba913109d27827639eaa2bf91c1693bed7f33f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Thu, 7 Apr 2022 13:20:12 +0200 Subject: [PATCH 14/18] update with eg.predited as other components --- spacy/pipeline/coref.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index c1db23d68..1b062ed9a 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -457,7 +457,6 @@ class SpanPredictor(TrainablePipe): for doc in docs: # TODO check shape here span_scores = self.model.predict([doc]) - print(span_scores) if span_scores.size: # the information about clustering has to come from the input docs # first let's convert the scores to a list of span idxs @@ -513,7 +512,7 @@ class SpanPredictor(TrainablePipe): for eg in examples: # For update we use the gold coref_head_clusters # in the reference. - span_scores, backprop = self.model.begin_update([eg.reference]) + span_scores, backprop = self.model.begin_update([eg.predicted]) loss, d_scores = self.get_loss([eg], span_scores) total_loss += loss # TODO check shape here @@ -622,10 +621,9 @@ class SpanPredictor(TrainablePipe): ref = eg.reference pred = eg.predicted for key, gold_sg in ref.spans.items(): - if key.startswith(self.input_prefix): + if key.startswith(self.output_prefix): cluster_id = key.split('_')[-1] - # FIXME THIS DOESN'T WORK BECAUSE pred.spans are empty? 
- pred_sg = pred.spans[f"{self.output_prefix}_{cluster_id}"] + pred_sg = pred.spans[key] for gold_mention, pred_mention in zip(gold_sg, pred_sg): starts.append(gold_mention.start) ends.append(gold_mention.end) From 2a1ad4c5d294de02af668e07d19894491afc3204 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Fri, 8 Apr 2022 14:56:44 +0200 Subject: [PATCH 15/18] add backprop callback to spanpredictor --- spacy/ml/models/coref.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/spacy/ml/models/coref.py b/spacy/ml/models/coref.py index 7972f9160..0b533daf0 100644 --- a/spacy/ml/models/coref.py +++ b/spacy/ml/models/coref.py @@ -104,13 +104,13 @@ def convert_coref_scorer_inputs( # just use the first # TODO real batching X = X[0] - - word_features = xp2torch(X, requires_grad=is_train) + def backprop(args: ArgsKwargs) -> List[Floats2d]: # convert to xp and wrap in list gradients = torch2xp(args.args[0]) return [gradients] + return ArgsKwargs(args=(word_features, ), kwargs={}), backprop @@ -141,16 +141,22 @@ def convert_span_predictor_inputs( ): tok2vec, (sent_ids, head_ids) = X # Normally we shoudl use the input is_train, but for these two it's not relevant + + def backprop(args: ArgsKwargs) -> List[Floats2d]: + # convert to xp and wrap in list + gradients = torch2xp(args.args[1]) + return [[gradients], None] + + word_features = xp2torch(tok2vec[0], requires_grad=is_train) sent_ids = xp2torch(sent_ids[0], requires_grad=False) if not head_ids[0].size: head_ids = torch.empty(size=(0,)) else: head_ids = xp2torch(head_ids[0], requires_grad=False) - word_features = xp2torch(tok2vec[0], requires_grad=is_train) argskwargs = ArgsKwargs(args=(sent_ids, word_features, head_ids), kwargs={}) # TODO actually support backprop - return argskwargs, lambda dX: [[]] + return argskwargs, backprop # TODO This probably belongs in the component, not the model. @@ -247,7 +253,6 @@ def head_data_forward(model, docs, is_train): heads.append(span[0].i) heads = model.ops.asarray2i(heads) head_ids.append(heads) - # each of these is a list with one entry per doc # backprop is just a placeholder # TODO it would probably be better to have a list of tuples than two lists of arrays @@ -584,7 +589,6 @@ class SpanPredictor(torch.nn.Module): scores = torch.full((heads_ids.shape[0], words.shape[0], 2), float('-inf')) scores[rows, cols] = res[padding_mask] - # Make sure that start <= head <= end during inference if not self.training: valid_starts = torch.log((relative_positions >= 0).to(torch.float)) From 7a239f2ec7c71a494f2380686fdbcfdd421e7fa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Fri, 8 Apr 2022 14:57:19 +0200 Subject: [PATCH 16/18] report start- and end-accuracies separately --- spacy/pipeline/coref.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index 1b062ed9a..02c93f712 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -510,13 +510,11 @@ class SpanPredictor(TrainablePipe): total_loss = 0 for eg in examples: - # For update we use the gold coref_head_clusters - # in the reference. 
span_scores, backprop = self.model.begin_update([eg.predicted]) loss, d_scores = self.get_loss([eg], span_scores) total_loss += loss # TODO check shape here - backprop(d_scores) + backprop((d_scores)) if sgd is not None: self.finish_update(sgd) @@ -612,7 +610,8 @@ class SpanPredictor(TrainablePipe): Evaluate on reconstructing the correct spans around gold heads. """ - scores = [] + start_scores = [] + end_scores = [] for eg in examples: starts = [] ends = [] @@ -622,7 +621,6 @@ class SpanPredictor(TrainablePipe): pred = eg.predicted for key, gold_sg in ref.spans.items(): if key.startswith(self.output_prefix): - cluster_id = key.split('_')[-1] pred_sg = pred.spans[key] for gold_mention, pred_mention in zip(gold_sg, pred_sg): starts.append(gold_mention.start) @@ -634,8 +632,12 @@ class SpanPredictor(TrainablePipe): ends = self.model.ops.xp.asarray(ends) pred_starts = self.model.ops.xp.asarray(pred_starts) pred_ends = self.model.ops.xp.asarray(pred_ends) - correct = ((starts == pred_starts) * (ends == pred_ends)).sum() - scores.append(correct) - - out = {"span_accuracy": mean(scores)} + start_accuracy = (starts == pred_starts).mean() + end_accuracy = (ends == pred_ends).mean() + start_scores.append(float(start_accuracy)) + end_scores.append(float(end_accuracy)) + out = { + "span_start_accuracy": mean(start_scores), + "span_end_accuracy": mean(end_scores) + } return out From 6aedd98d02b55672469556f4d61f2ad6254f3759 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Mon, 11 Apr 2022 16:10:14 +0200 Subject: [PATCH 17/18] fixing scorer --- spacy/pipeline/coref.py | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index 02c93f712..fc04d1a3e 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -511,10 +511,13 @@ class SpanPredictor(TrainablePipe): total_loss = 0 for eg in examples: span_scores, backprop = self.model.begin_update([eg.predicted]) - loss, d_scores = self.get_loss([eg], span_scores) - total_loss += loss - # TODO check shape here - backprop((d_scores)) + # FIXME, this only happens once in the first 1000 docs of OntoNotes + # and I'm not sure yet why. + if span_scores.size: + loss, d_scores = self.get_loss([eg], span_scores) + total_loss += loss + # TODO check shape here + backprop((d_scores)) if sgd is not None: self.finish_update(sgd) @@ -557,7 +560,6 @@ class SpanPredictor(TrainablePipe): assert len(examples) == 1, "Only fake batching is supported." # starts and ends are gold starts and ends (Ints1d) # span_scores is a Floats3d. What are the axes? mention x token x start/end - for eg in examples: starts = [] ends = [] @@ -610,8 +612,8 @@ class SpanPredictor(TrainablePipe): Evaluate on reconstructing the correct spans around gold heads. 
""" - start_scores = [] - end_scores = [] + scores = [] + xp = self.model.ops.xp for eg in examples: starts = [] ends = [] @@ -628,16 +630,11 @@ class SpanPredictor(TrainablePipe): pred_starts.append(pred_mention.start) pred_ends.append(pred_mention.end) - starts = self.model.ops.xp.asarray(starts) - ends = self.model.ops.xp.asarray(ends) - pred_starts = self.model.ops.xp.asarray(pred_starts) - pred_ends = self.model.ops.xp.asarray(pred_ends) - start_accuracy = (starts == pred_starts).mean() - end_accuracy = (ends == pred_ends).mean() - start_scores.append(float(start_accuracy)) - end_scores.append(float(end_accuracy)) - out = { - "span_start_accuracy": mean(start_scores), - "span_end_accuracy": mean(end_scores) - } - return out + starts = xp.asarray(starts) + ends = xp.asarray(ends) + pred_starts = xp.asarray(pred_starts) + pred_ends = xp.asarray(pred_ends) + correct = (starts == pred_starts) * (ends == pred_ends) + accuracy = correct.mean() + scores.append(float(accuracy)) + return {"span_accuracy": mean(scores)} From e512874c809bd35429979c66943af4212486a33e Mon Sep 17 00:00:00 2001 From: kadarakos Date: Tue, 10 May 2022 16:40:31 +0000 Subject: [PATCH 18/18] small refactor and docs --- spacy/ml/models/coref.py | 189 ++++++++++++++------------------------- 1 file changed, 67 insertions(+), 122 deletions(-) diff --git a/spacy/ml/models/coref.py b/spacy/ml/models/coref.py index 4e8e604d8..435c3bc80 100644 --- a/spacy/ml/models/coref.py +++ b/spacy/ml/models/coref.py @@ -1,14 +1,14 @@ from typing import List, Tuple import torch -from thinc.api import Model, chain, tuplify +from thinc.api import Model, chain from thinc.api import PyTorchWrapper, ArgsKwargs -from thinc.types import Floats2d, Ints1d, Ints2d +from thinc.types import Floats2d, Ints2d from thinc.util import xp2torch, torch2xp from ...tokens import Doc from ...util import registry -from .coref_util import add_dummy, get_sentence_ids +from .coref_util import add_dummy @registry.architectures("spacy.Coref.v1") @@ -19,7 +19,6 @@ def build_wl_coref_model( n_hidden_layers: int = 1, # TODO rename to "depth"? dropout: float = 0.3, # pairs to keep per mention after rough scoring - # TODO change to meaningful name rough_k: int = 50, # TODO is this not a training loop setting? a_scoring_batch_size: int = 512, @@ -34,7 +33,6 @@ def build_wl_coref_model( dim = 768 with Model.define_operators({">>": chain}): - # TODO chain tok2vec with these models coref_scorer = PyTorchWrapper( CorefScorer( dim, @@ -49,18 +47,6 @@ def build_wl_coref_model( convert_outputs=convert_coref_scorer_outputs, ) coref_model = tok2vec >> coref_scorer - # XXX just ignore this until the coref scorer is integrated - # span_predictor = PyTorchWrapper( - # SpanPredictor( - # TODO this was hardcoded to 1024, check - # hidden_size, - # sp_embedding_size, - # ), - # convert_inputs=convert_span_predictor_inputs - # ) - # TODO combine models so output is uniform (just one forward pass) - # It may be reasonable to have an option to disable span prediction, - # and just return words as spans. return coref_model @@ -95,46 +81,13 @@ def convert_coref_scorer_outputs(model: Model, inputs_outputs, is_train: bool): return (scores_xp, indices_xp), convert_for_torch_backward -# TODO add docstring for this, maybe move to utils. -# This might belong in the component. 
-def _clusterize(model, scores: Floats2d, top_indices: Ints2d):
-    xp = model.ops.xp
-    antecedents = scores.argmax(axis=1) - 1
-    not_dummy = antecedents >= 0
-    coref_span_heads = xp.arange(0, len(scores))[not_dummy]
-    antecedents = top_indices[coref_span_heads, antecedents[not_dummy]]
-    n_words = scores.shape[0]
-    nodes = [GraphNode(i) for i in range(n_words)]
-    for i, j in zip(coref_span_heads.tolist(), antecedents.tolist()):
-        nodes[i].link(nodes[j])
-        assert nodes[i] is not nodes[j]
-
-    clusters = []
-    for node in nodes:
-        if len(node.links) > 0 and not node.visited:
-            cluster = []
-            stack = [node]
-            while stack:
-                current_node = stack.pop()
-                current_node.visited = True
-                cluster.append(current_node.id)
-                stack.extend(link for link in current_node.links if not link.visited)
-            assert len(cluster) > 1
-            clusters.append(sorted(cluster))
-    return sorted(clusters)
-
-
 class CorefScorer(torch.nn.Module):
-    """Combines all coref modules together to find coreferent spans.
-
-    Attributes:
-        epochs_trained (int): number of epochs the model has been trained for
-
+    """
+    Combines all coref modules together to find coreferent token pairs.
     Submodules (in the order of their usage in the pipeline):
-        rough_scorer (RoughScorer)
-        pw (PairwiseEncoder)
-        a_scorer (AnaphoricityScorer)
-        sp (SpanPredictor)
+        - rough_scorer (RoughScorer) that prunes candidate pairs
+        - pw (DistancePairwiseEncoder) that computes pairwise features
+        - a_scorer (AnaphoricityScorer) produces the final scores
     """

     def __init__(
         self,
         dim: int,  # tok2vec size
         dist_emb_size: int,
         hidden_size: int,
         n_layers: int,
         dropout_rate: float,
         roughk: int,
         batch_size: int
     ):
         super().__init__()
         """
-        A newly created model is set to evaluation mode.
-
-        Args:
-            epochs_trained (int): the number of epochs finished
-                (useful for warm start)
+        dim: Size of the input features.
+        dist_emb_size: Size of the distance embeddings.
+        hidden_size: Size of the coreference candidate embeddings.
+        n_layers: Numbers of layers in the AnaphoricityScorer.
+        dropout_rate: Dropout probability to apply across all modules.
+        roughk: Number of candidates the RoughScorer returns.
+        batch_size: Internal batch-size for the more expensive AnaphoricityScorer.
         """
+        self.dropout = torch.nn.Dropout(dropout_rate)
+        self.batch_size = batch_size
+        # Modules
+        self.lstm = torch.nn.LSTM(
+            input_size=dim,
+            hidden_size=dim,
+            batch_first=True,
+        )
+        self.rough_scorer = RoughScorer(dim, dropout_rate, roughk)
         self.pw = DistancePairwiseEncoder(dist_emb_size, dropout_rate)
-        # TODO clean this up
-        bert_emb = dim
-        pair_emb = bert_emb * 3 + self.pw.shape
+        pair_emb = dim * 3 + self.pw.shape
         self.a_scorer = AnaphoricityScorer(
             pair_emb, hidden_size, n_layers, dropout_rate
         )
-        self.lstm = torch.nn.LSTM(
-            input_size=bert_emb,
-            hidden_size=bert_emb,
-            batch_first=True,
-        )
-        self.dropout = torch.nn.Dropout(dropout_rate)
-        self.rough_scorer = RoughScorer(bert_emb, dropout_rate, roughk)
-        self.batch_size = batch_size

     def forward(self, word_features: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         """
-        This is a massive method, but it made sense to me to not split it into
-        several ones to let one see the data flow.
+        1. LSTM encodes the incoming word_features.
+        2. The RoughScorer scores and prunes the candidates.
+        3. The DistancePairwiseEncoder embeds the distance between remaining pairs.
+        4. The AnaphoricityScorer scores all pairs in mini-batches.
- Args: - word_features: torch.Tensor containing word encodings - Returns: - coreference scores and top indices + word_features: torch.Tensor containing word encodings + + returns: + coref_scores: n_words x roughk floats. + top_indices: n_words x roughk integers. """ - # words [n_words, span_emb] - # cluster_ids [n_words] self.lstm.flatten_parameters() # XXX without this there's a warning word_features = torch.unsqueeze(word_features, dim=0) words, _ = self.lstm(word_features) words = words.squeeze() + # words: n_words x dim words = self.dropout(words) # Obtain bilinear scores and leave only top-k antecedents for each word - # top_rough_scores [n_words, n_ants] - # top_indices [n_words, n_ants] + # top_rough_scores: (n_words x roughk) + # top_indices: (n_words x roughk) top_rough_scores, top_indices = self.rough_scorer(words) - # Get pairwise features [n_words, n_ants, n_pw_features] + # Get pairwise features + # (n_words x roughk x n_pw_features) pw = self.pw(top_indices) batch_size = self.batch_size a_scores_lst: List[torch.Tensor] = [] @@ -272,13 +229,8 @@ class AnaphoricityScorer(torch.nn.Module): def _ffnn(self, x: torch.Tensor) -> torch.Tensor: """ - Calculates anaphoricity scores. - - Args: - x: tensor of shape [batch_size, n_ants, n_features] - - Returns: - tensor of shape [batch_size, n_ants] + x: tensor of shape (batch_size x roughk x n_features + returns: tensor of shape (batch_size x rough_k) """ x = self.out(self.hidden(x)) return x.squeeze(2) @@ -293,21 +245,18 @@ class AnaphoricityScorer(torch.nn.Module): """ Builds the matrix used as input for AnaphoricityScorer. - Args: - all_mentions (torch.Tensor): [n_mentions, mention_emb], - all the valid mentions of the document, - can be on a different device - mentions_batch (torch.Tensor): [batch_size, mention_emb], - the mentions of the current batch, - is expected to be on the current device - pw_batch (torch.Tensor): [batch_size, n_ants, pw_emb], - pairwise features of the current batch, - is expected to be on the current device - top_indices_batch (torch.Tensor): [batch_size, n_ants], - indices of antecedents of each mention + all_mentions: (n_mentions x mention_emb), + all the valid mentions of the document, + can be on a different device + mentions_batch: (batch_size x mention_emb), + the mentions of the current batch. + pw_batch: (batch_size x roughk x pw_emb), + pairwise distance features of the current batch. + top_indices_batch: (batch_size x n_ants), + indices of antecedents of each mention Returns: - torch.Tensor: [batch_size, n_ants, pair_emb] + out: pairwise features (batch_size x n_ants x pair_emb) """ emb_size = mentions_batch.shape[1] n_ants = pw_batch.shape[1] @@ -322,16 +271,15 @@ class AnaphoricityScorer(torch.nn.Module): class RoughScorer(torch.nn.Module): """ - Is needed to give a roughly estimate of the anaphoricity of two candidates, - only top scoring candidates are considered on later steps to reduce - computational complexity. + Cheaper module that gives a rough estimate of the anaphoricity of two + candidates, only top scoring candidates are considered on later + steps to reduce computational cost. 
""" def __init__(self, features: int, dropout_rate: float, rough_k: float): super().__init__() self.dropout = torch.nn.Dropout(dropout_rate) self.bilinear = torch.nn.Linear(features, features) - self.k = rough_k def forward( @@ -348,21 +296,6 @@ class RoughScorer(torch.nn.Module): pair_mask = torch.log((pair_mask > 0).to(torch.float)) bilinear_scores = self.dropout(self.bilinear(mentions)).mm(mentions.T) rough_scores = pair_mask + bilinear_scores - - return self._prune(rough_scores) - - def _prune(self, rough_scores: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Selects top-k rough antecedent scores for each mention. - - Args: - rough_scores: tensor of shape [n_mentions, n_mentions], containing - rough antecedent scores of each mention-antecedent pair. - - Returns: - FloatTensor of shape [n_mentions, k], top rough scores - LongTensor of shape [n_mentions, k], top indices - """ top_scores, indices = torch.topk( rough_scores, k=min(self.k, len(rough_scores)), dim=1, sorted=False ) @@ -371,6 +304,18 @@ class RoughScorer(torch.nn.Module): class DistancePairwiseEncoder(torch.nn.Module): def __init__(self, embedding_size, dropout_rate): + """ + Takes the top_indices indicating, which is a ranked + list for each word and its most likely corresponding + anaphora candidates. For each of these pairs it looks + up a distance embedding from a table, where the distance + corresponds to the log-distance. + + embedding_size: int, + Dimensionality of the distance-embeddings table. + dropout_rate: float, + Dropout probability. + """ super().__init__() emb_size = embedding_size self.distance_emb = torch.nn.Embedding(9, emb_size) @@ -378,7 +323,7 @@ class DistancePairwiseEncoder(torch.nn.Module): self.shape = emb_size def forward( - self, # type: ignore # pylint: disable=arguments-differ #35566 in pytorch + self, top_indices: torch.Tensor, ) -> torch.Tensor: word_ids = torch.arange(0, top_indices.size(0))