From e8af02700f610845a7caa53639120ea0987f6927 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Wed, 13 Apr 2022 21:02:18 +0900
Subject: [PATCH] Remove all coref scoring except LEA

This is necessary because one of the three old methods (CEAF) relied on
scipy's linear_sum_assignment to find the optimal entity alignment. LEA
is generally a better evaluation metric anyway. The downside is that
evaluations are no longer directly comparable with many papers, but
canonical scoring can still be supported using external eval scripts or
other methods.
---
 spacy/coref_scorer.py   | 128 +---------------------------------------
 spacy/pipeline/coref.py |  46 +++++++--------
 2 files changed, 22 insertions(+), 152 deletions(-)

diff --git a/spacy/coref_scorer.py b/spacy/coref_scorer.py
index e00b22fd7..b266ec3b3 100644
--- a/spacy/coref_scorer.py
+++ b/spacy/coref_scorer.py
@@ -1,17 +1,5 @@
 # copied from coval
 # https://github.com/ns-moosavi/coval
-from collections import Counter
-import numpy as np
-
-try:
-    # This is only used in the ceaf methods. If those are necessary we should
-    # implement this locally to avoid a scipy dep.
-    from scipy.optimize import linear_sum_assignment
-except:
-    pass
-
-# Terminology here is consistent with papers in the field but kind of confusing.
-# Key = gold data, System = predictions.
 
 
 def get_cluster_info(predicted_clusters, gold_clusters):
@@ -44,29 +32,6 @@ def f1(p_num, p_den, r_num, r_den, beta=1):
     return 0 if p + r == 0 else (1 + beta * beta) * p * r / (beta * beta * p + r)
 
 
-def evaluate_non_referrings(doc_non_referring_infos):
-    tp, _tn, fp, fn = 0, 0, 0, 0
-
-    for doc_id in doc_non_referring_infos:
-        key_non_referrings, sys_non_referrings = doc_non_referring_infos[doc_id]
-        for m in key_non_referrings:
-            if m in sys_non_referrings:
-                tp += 1
-            else:
-                fn += 1
-        for m in sys_non_referrings:
-            if m not in key_non_referrings:
-                fp += 1
-
-    recall = tp / float(tp + fn) if (tp + fn) > 0 else 0
-    precision = tp / float(tp + fp) if (tp + fp) > 0 else 0
-    f1 = (
-        2 * recall * precision / (recall + precision) if (recall + precision) > 0 else 0
-    )
-
-    return recall, precision, f1
-
-
 class Evaluator:
     def __init__(self, metric, beta=1, keep_aggregated_values=False):
         self.p_num = 0
@@ -91,14 +56,8 @@ class Evaluator:
             sys_mention_key_cluster,
         ) = coref_info
 
-        if self.metric == ceafe or self.metric == ceafm:
-            pn, pd, rn, rd = self.metric(sys_clusters, key_clusters)
-        elif self.metric == lea:
-            pn, pd = self.metric(sys_clusters, key_clusters, sys_mention_key_cluster)
-            rn, rd = self.metric(key_clusters, sys_clusters, key_mention_sys_cluster)
-        else:
-            pn, pd = self.metric(sys_clusters, sys_mention_key_cluster)
-            rn, rd = self.metric(key_clusters, key_mention_sys_cluster)
+        pn, pd = self.metric(sys_clusters, key_clusters, sys_mention_key_cluster)
+        rn, rd = self.metric(key_clusters, sys_clusters, key_mention_sys_cluster)
         self.p_num += pn
         self.p_den += pd
         self.r_num += rn
@@ -134,89 +93,6 @@ class Evaluator:
         )
 
 
-def evaluate_documents(doc_coref_infos, metric, beta=1):
-    evaluator = Evaluator(metric, beta=beta)
-    for doc_id in doc_coref_infos:
-        evaluator.update(doc_coref_infos[doc_id])
-    return (evaluator.get_recall(), evaluator.get_precision(), evaluator.get_f1())
-
-
-def get_document_evaluations(doc_coref_infos, metric, beta=1):
-    evaluator = Evaluator(metric, beta=beta, keep_aggregated_values=True)
-    for doc_id in doc_coref_infos:
-        evaluator.update(doc_coref_infos[doc_id])
-    return evaluator.get_aggregated_values()
-
-
-def mentions(clusters, mention_to_gold):
-    setofmentions = set(mention for cluster in clusters for mention in cluster)
-    correct = setofmentions & set(mention_to_gold.keys())
-    return len(correct), len(setofmentions)
-
-
-def b_cubed(clusters, mention_to_gold):
-    num, den = 0, 0
-
-    for c in clusters:
-        gold_counts = Counter()
-        correct = 0
-        for m in c:
-            if m in mention_to_gold:
-                gold_counts[mention_to_gold[m]] += 1
-        for c2 in gold_counts:
-            correct += gold_counts[c2] * gold_counts[c2]
-
-        num += correct / float(len(c))
-        den += len(c)
-
-    return num, den
-
-
-def muc(clusters, mention_to_gold):
-    tp, p = 0, 0
-    for c in clusters:
-        p += len(c) - 1
-        tp += len(c)
-        linked = set()
-        for m in c:
-            if m in mention_to_gold:
-                linked.add(mention_to_gold[m])
-            else:
-                tp -= 1
-        tp -= len(linked)
-    return tp, p
-
-
-def phi4(c1, c2):
-    return 2 * len([m for m in c1 if m in c2]) / float(len(c1) + len(c2))
-
-
-def phi3(c1, c2):
-    return len([m for m in c1 if m in c2])
-
-
-def ceafe(clusters, gold_clusters):
-    clusters = [c for c in clusters]
-    scores = np.zeros((len(gold_clusters), len(clusters)))
-    for i in range(len(gold_clusters)):
-        for j in range(len(clusters)):
-            scores[i, j] = phi4(gold_clusters[i], clusters[j])
-    row_ind, col_ind = linear_sum_assignment(-scores)
-    similarity = scores[row_ind, col_ind].sum()
-    return similarity, len(clusters), similarity, len(gold_clusters)
-
-
-def ceafm(clusters, gold_clusters):
-    clusters = [c for c in clusters]
-    scores = np.zeros((len(gold_clusters), len(clusters)))
-    for i in range(len(gold_clusters)):
-        for j in range(len(clusters)):
-            scores[i, j] = phi3(gold_clusters[i], clusters[j])
-    row_ind, col_ind = linear_sum_assignment(-scores)
-    similarity = scores[row_ind, col_ind].sum()
-    return similarity, len(clusters), similarity, len(gold_clusters)
-
-
 def lea(input_clusters, output_clusters, mention_to_gold):
     num, den = 0, 0
 
diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py
index 4111f8445..671d65e19 100644
--- a/spacy/pipeline/coref.py
+++ b/spacy/pipeline/coref.py
@@ -25,7 +25,7 @@ from ..ml.models.coref_util import (
     doc2clusters,
 )
 
-from ..coref_scorer import Evaluator, get_cluster_info, b_cubed, muc, ceafe
+from ..coref_scorer import Evaluator, get_cluster_info, lea
 
 
 default_config = """
@@ -349,36 +349,30 @@ class CoreferenceResolver(TrainablePipe):
         assert len(X) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(X=X, Y=Y)
 
-    # TODO This mirrors the evaluation used in prior work, but we don't want to
-    # include this in the final release. The metrics all have fundamental
-    # issues and the current implementation requires scipy.
     def score(self, examples, **kwargs):
-        """Score a batch of examples."""
+        """Score a batch of examples using LEA.
 
-        # NOTE traditionally coref uses the average of b_cubed, muc, and ceaf.
-        # we need to handle the average ourselves.
-        scores = []
-        for metric in (b_cubed, muc, ceafe):
-            evaluator = Evaluator(metric)
+        For details on how LEA works and why to use it see the paper:
 
-            for ex in examples:
-                p_clusters = doc2clusters(ex.predicted, self.span_cluster_prefix)
-                g_clusters = doc2clusters(ex.reference, self.span_cluster_prefix)
-                cluster_info = get_cluster_info(p_clusters, g_clusters)
-                evaluator.update(cluster_info)
+        Which Coreference Evaluation Metric Do You Trust? A Proposal for a Link-based Entity Aware Metric
+        Moosavi and Strube, 2016
+        https://api.semanticscholar.org/CorpusID:17606580
+        """
 
-            score = {
-                "coref_f": evaluator.get_f1(),
-                "coref_p": evaluator.get_precision(),
-                "coref_r": evaluator.get_recall(),
-            }
-            scores.append(score)
+        evaluator = Evaluator(lea)
 
-        out = {}
-        for field in ("f", "p", "r"):
-            fname = f"coref_{field}"
-            out[fname] = mean([ss[fname] for ss in scores])
-        return out
+        for ex in examples:
+            p_clusters = doc2clusters(ex.predicted, self.span_cluster_prefix)
+            g_clusters = doc2clusters(ex.reference, self.span_cluster_prefix)
+            cluster_info = get_cluster_info(p_clusters, g_clusters)
+            evaluator.update(cluster_info)
+
+        score = {
+            "coref_f": evaluator.get_f1(),
+            "coref_p": evaluator.get_precision(),
+            "coref_r": evaluator.get_recall(),
+        }
+        return score
 
 
 default_span_predictor_config = """
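
For readers new to LEA, below is a small, self-contained sketch of the link-based scoring idea behind the metric this patch keeps. It is illustrative only: the helper names (links, lea_prf) and the toy mention spans are invented for this example, singleton clusters are skipped for brevity, and the actual implementation used above is the lea function in spacy/coref_scorer.py, driven by Evaluator and get_cluster_info.

# Standalone sketch of the LEA metric (Moosavi & Strube 2016).
# Names and toy data are illustrative; this is not spaCy's implementation.
from itertools import combinations


def links(cluster):
    # Every unordered pair of mentions in a cluster counts as one coreference link.
    return set(combinations(sorted(cluster), 2))


def lea_prf(key_clusters, sys_clusters):
    # LEA weights each cluster by its size (importance) and scores the fraction
    # of that cluster's links recovered by the other side (resolution).
    def one_direction(these, those):
        those_links = set().union(*(links(c) for c in those))
        num = sum(
            len(c) * len(links(c) & those_links) / len(links(c))
            for c in these
            if len(c) > 1  # singleton clusters skipped in this simplified sketch
        )
        den = sum(len(c) for c in these if len(c) > 1)
        return num / den if den else 0.0

    recall = one_direction(key_clusters, sys_clusters)
    precision = one_direction(sys_clusters, key_clusters)
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1


# Toy example: mentions are (start, end) token offsets, clusters are tuples of mentions.
gold = [((0, 1), (5, 6), (10, 11)), ((20, 21), (25, 26))]
pred = [((0, 1), (5, 6)), ((20, 21), (25, 26))]
print(lea_prf(gold, pred))  # (1.0, 0.6, 0.75): all predicted links are correct, one gold link is missed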