From bc081c24fa96474a19d73269fdc875b952029198 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Sun, 18 Jul 2021 20:13:10 +0900
Subject: [PATCH] Add full traditional scoring

This calculates scores as the average of three metrics (MUC, B-cubed,
and CEAF-e). As noted in the code, these metrics all have issues, but
we want to use them to match up with prior work.

This should be replaced with some simpler default scoring, and the
scorer here should be moved to an external project to be passed in just
for generating the traditional scores.
---
 spacy/pipeline/coref.py | 44 ++++++++++++++++++++++++++++++------------------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py
index 3cc6606dd..4e67c9f9f 100644
--- a/spacy/pipeline/coref.py
+++ b/spacy/pipeline/coref.py
@@ -5,6 +5,7 @@ from thinc.types import Floats2d, Ints2d
 from thinc.api import Model, Config, Optimizer, CategoricalCrossentropy
 from thinc.api import set_dropout_rate
 from itertools import islice
+from statistics import mean
 
 from .trainable_pipe import TrainablePipe
 from ..language import Language
@@ -23,7 +24,7 @@ from ..ml.models.coref_util import (
     doc2clusters,
 )
 
-from ..coref_scorer import Evaluator, get_cluster_info, b_cubed
+from ..coref_scorer import Evaluator, get_cluster_info, b_cubed, muc, ceafe
 
 default_config = """
 [model]
@@ -344,28 +345,35 @@ class CoreferenceResolver(TrainablePipe):
         assert len(X) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(X=X, Y=Y)
 
-    # TODO consider whether to use this. It's pretty fast, but it'll be slower if
-    # we use all three methods like the original evaluator does. Also the current
-    # implementation, borrowed from the coval project, uses scipy, which we would
-    # want to avoid. (If that's the only issue we can probably work around it.)
+    # TODO This mirrors the evaluation used in prior work, but we don't want to
+    # include this in the final release. The metrics all have fundamental
+    # issues and the current implementation requires scipy.
     def score(self, examples, **kwargs):
         """Score a batch of examples."""
-        #TODO traditionally coref uses the average of b_cubed, muc, and ceaf.
+        # NOTE: traditionally coref uses the average of b_cubed, muc, and ceaf.
         # we need to handle the average ourselves.
-        evaluator = Evaluator(b_cubed)
+        scores = []
+        for metric in (b_cubed, muc, ceafe):
+            evaluator = Evaluator(metric)
 
-        for ex in examples:
-            p_clusters = doc2clusters(ex.predicted, self.span_cluster_prefix)
-            g_clusters = doc2clusters(ex.reference, self.span_cluster_prefix)
+            for ex in examples:
+                p_clusters = doc2clusters(ex.predicted, self.span_cluster_prefix)
+                g_clusters = doc2clusters(ex.reference, self.span_cluster_prefix)
 
-            cluster_info = get_cluster_info(p_clusters, g_clusters)
+                cluster_info = get_cluster_info(p_clusters, g_clusters)
 
-            evaluator.update(cluster_info)
+                evaluator.update(cluster_info)
 
-        scores ={
-            "coref_f": evaluator.get_f1(),
-            "coref_p": evaluator.get_precision(),
-            "coref_r": evaluator.get_recall(),
-        }
-        return scores
+            score = {
+                "coref_f": evaluator.get_f1(),
+                "coref_p": evaluator.get_precision(),
+                "coref_r": evaluator.get_recall(),
+            }
+            scores.append(score)
+
+        out = {}
+        for field in ("f", "p", "r"):
+            fname = f"coref_{field}"
+            out[fname] = mean([ss[fname] for ss in scores])
+        return out
 
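
For reference, a minimal, self-contained sketch of the averaging step at the
end of score(), not part of the patch: the three dicts below are made-up
stand-ins for the per-metric results produced by the b_cubed, muc, and ceafe
evaluators, not real scorer output.

from statistics import mean

# Hypothetical per-metric score dicts, one each for b_cubed, muc, and ceafe
# (illustrative numbers only).
scores = [
    {"coref_f": 0.62, "coref_p": 0.65, "coref_r": 0.59},
    {"coref_f": 0.58, "coref_p": 0.61, "coref_r": 0.55},
    {"coref_f": 0.54, "coref_p": 0.57, "coref_r": 0.51},
]

# Average each field across the three metrics, mirroring the final loop
# in score().
out = {}
for field in ("f", "p", "r"):
    fname = f"coref_{field}"
    out[fname] = mean([ss[fname] for ss in scores])

print(out)  # {'coref_f': 0.58, 'coref_p': 0.61, 'coref_r': 0.55}, up to float rounding

Averaging the F1 of MUC, B-cubed, and CEAF-e is the conventional CoNLL
coreference score; the patch applies the same averaging to precision and
recall as well.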