Remove all coref scoring except LEA

This is necessary because one of the three old methods (CEAF) relied on
scipy to solve a linear assignment problem. LEA is also generally a
better metric for evaluation.

The downside is that evaluation results are no longer directly
comparable with many published papers, but canonical scoring can still
be supported using external eval scripts or other methods.
Paul O'Leary McCann 2022-04-13 21:02:18 +09:00
parent 2300f4df3d
commit e8af02700f
2 changed files with 22 additions and 152 deletions
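
To make the change concrete, here is a rough usage sketch of the scorer API that survives this commit, based only on the functions visible in the diff below. The module path and the exact shape of the mention tuples are assumptions on my part, not something this commit specifies.

    # Hedged sketch: assumes the scorer module lives at spacy/coref_scorer.py on
    # this branch and that clusters are lists of (start, end) token-offset tuples.
    from spacy.coref_scorer import Evaluator, get_cluster_info, lea

    gold_clusters = [[(0, 1), (4, 5)], [(7, 9), (12, 13)]]
    pred_clusters = [[(0, 1), (4, 5), (7, 9)], [(12, 13), (15, 16)]]

    # get_cluster_info packs gold/predicted clusters plus mention-to-cluster
    # mappings into the tuple format that Evaluator.update expects.
    cluster_info = get_cluster_info(pred_clusters, gold_clusters)

    evaluator = Evaluator(lea)  # LEA is the only metric left after this commit
    evaluator.update(cluster_info)
    print(evaluator.get_precision(), evaluator.get_recall(), evaluator.get_f1())

With only LEA left, these numbers come from a single metric rather than the b_cubed/muc/ceafe average the old score() computed.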


@@ -1,17 +1,5 @@
 # copied from coval
 # https://github.com/ns-moosavi/coval
-from collections import Counter
-import numpy as np
-
-try:
-    # This is only used in the ceaf methods. If those are necessary we should
-    # implement this locally to avoid a scipy dep.
-    from scipy.optimize import linear_sum_assignment
-except:
-    pass
-
-# Terminology here is consistent with papers in the field but kind of confusing.
-# Key = gold data, System = predictions.
 def get_cluster_info(predicted_clusters, gold_clusters):
@@ -44,29 +32,6 @@ def f1(p_num, p_den, r_num, r_den, beta=1):
     return 0 if p + r == 0 else (1 + beta * beta) * p * r / (beta * beta * p + r)

-def evaluate_non_referrings(doc_non_referring_infos):
-    tp, _tn, fp, fn = 0, 0, 0, 0
-    for doc_id in doc_non_referring_infos:
-        key_non_referrings, sys_non_referrings = doc_non_referring_infos[doc_id]
-        for m in key_non_referrings:
-            if m in sys_non_referrings:
-                tp += 1
-            else:
-                fn += 1
-        for m in sys_non_referrings:
-            if m not in key_non_referrings:
-                fp += 1
-
-    recall = tp / float(tp + fn) if (tp + fn) > 0 else 0
-    precision = tp / float(tp + fp) if (tp + fp) > 0 else 0
-    f1 = (
-        2 * recall * precision / (recall + precision) if (recall + precision) > 0 else 0
-    )
-    return recall, precision, f1
-
 class Evaluator:
     def __init__(self, metric, beta=1, keep_aggregated_values=False):
         self.p_num = 0
@@ -91,14 +56,8 @@ class Evaluator:
             sys_mention_key_cluster,
         ) = coref_info

-        if self.metric == ceafe or self.metric == ceafm:
-            pn, pd, rn, rd = self.metric(sys_clusters, key_clusters)
-        elif self.metric == lea:
-            pn, pd = self.metric(sys_clusters, key_clusters, sys_mention_key_cluster)
-            rn, rd = self.metric(key_clusters, sys_clusters, key_mention_sys_cluster)
-        else:
-            pn, pd = self.metric(sys_clusters, sys_mention_key_cluster)
-            rn, rd = self.metric(key_clusters, key_mention_sys_cluster)
+        pn, pd = self.metric(sys_clusters, key_clusters, sys_mention_key_cluster)
+        rn, rd = self.metric(key_clusters, sys_clusters, key_mention_sys_cluster)
         self.p_num += pn
         self.p_den += pd
         self.r_num += rn
@@ -134,89 +93,6 @@ class Evaluator:
         )

-def evaluate_documents(doc_coref_infos, metric, beta=1):
-    evaluator = Evaluator(metric, beta=beta)
-    for doc_id in doc_coref_infos:
-        evaluator.update(doc_coref_infos[doc_id])
-    return (evaluator.get_recall(), evaluator.get_precision(), evaluator.get_f1())
-
-def get_document_evaluations(doc_coref_infos, metric, beta=1):
-    evaluator = Evaluator(metric, beta=beta, keep_aggregated_values=True)
-    for doc_id in doc_coref_infos:
-        evaluator.update(doc_coref_infos[doc_id])
-    return evaluator.get_aggregated_values()
-
-def mentions(clusters, mention_to_gold):
-    setofmentions = set(mention for cluster in clusters for mention in cluster)
-    correct = setofmentions & set(mention_to_gold.keys())
-    return len(correct), len(setofmentions)
-
-def b_cubed(clusters, mention_to_gold):
-    num, den = 0, 0
-    for c in clusters:
-        gold_counts = Counter()
-        correct = 0
-        for m in c:
-            if m in mention_to_gold:
-                gold_counts[mention_to_gold[m]] += 1
-        for c2 in gold_counts:
-            correct += gold_counts[c2] * gold_counts[c2]
-        num += correct / float(len(c))
-        den += len(c)
-    return num, den
-
-def muc(clusters, mention_to_gold):
-    tp, p = 0, 0
-    for c in clusters:
-        p += len(c) - 1
-        tp += len(c)
-        linked = set()
-        for m in c:
-            if m in mention_to_gold:
-                linked.add(mention_to_gold[m])
-            else:
-                tp -= 1
-        tp -= len(linked)
-    return tp, p
-
-def phi4(c1, c2):
-    return 2 * len([m for m in c1 if m in c2]) / float(len(c1) + len(c2))
-
-def phi3(c1, c2):
-    return len([m for m in c1 if m in c2])
-
-def ceafe(clusters, gold_clusters):
-    clusters = [c for c in clusters]
-    scores = np.zeros((len(gold_clusters), len(clusters)))
-    for i in range(len(gold_clusters)):
-        for j in range(len(clusters)):
-            scores[i, j] = phi4(gold_clusters[i], clusters[j])
-    row_ind, col_ind = linear_sum_assignment(-scores)
-    similarity = scores[row_ind, col_ind].sum()
-    return similarity, len(clusters), similarity, len(gold_clusters)
-
-def ceafm(clusters, gold_clusters):
-    clusters = [c for c in clusters]
-    scores = np.zeros((len(gold_clusters), len(clusters)))
-    for i in range(len(gold_clusters)):
-        for j in range(len(clusters)):
-            scores[i, j] = phi3(gold_clusters[i], clusters[j])
-    row_ind, col_ind = linear_sum_assignment(-scores)
-    similarity = scores[row_ind, col_ind].sum()
-    return similarity, len(clusters), similarity, len(gold_clusters)
-
 def lea(input_clusters, output_clusters, mention_to_gold):
     num, den = 0, 0
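
For context on the scipy dependency the commit message refers to: the removed CEAF metrics build a gold-by-predicted similarity matrix and then need the best one-to-one alignment between gold and predicted clusters, which is the linear assignment problem that scipy.optimize.linear_sum_assignment solves. A toy illustration with made-up similarity values, separate from the code above:

    import numpy as np
    from scipy.optimize import linear_sum_assignment

    # Rows are gold entities, columns are predicted entities; entries are
    # phi3/phi4-style similarities (the values here are invented).
    scores = np.array([
        [0.8, 0.1, 0.0],
        [0.2, 0.9, 0.3],
    ])

    # linear_sum_assignment minimizes total cost, so negate the scores to
    # maximize similarity, exactly as the removed ceafe/ceafm code did.
    row_ind, col_ind = linear_sum_assignment(-scores)
    similarity = scores[row_ind, col_ind].sum()
    print(row_ind, col_ind, similarity)  # gold 0 -> pred 0, gold 1 -> pred 1, total 1.7

Dropping CEAF is what makes it possible to drop this dependency.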


@@ -25,7 +25,7 @@ from ..ml.models.coref_util import (
     doc2clusters,
 )

-from ..coref_scorer import Evaluator, get_cluster_info, b_cubed, muc, ceafe
+from ..coref_scorer import Evaluator, get_cluster_info, lea

 default_config = """
@@ -349,17 +349,17 @@ class CoreferenceResolver(TrainablePipe):
         assert len(X) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(X=X, Y=Y)

-    # TODO This mirrors the evaluation used in prior work, but we don't want to
-    # include this in the final release. The metrics all have fundamental
-    # issues and the current implementation requires scipy.
     def score(self, examples, **kwargs):
-        """Score a batch of examples."""
-        # NOTE traditionally coref uses the average of b_cubed, muc, and ceaf.
-        # we need to handle the average ourselves.
-        scores = []
-        for metric in (b_cubed, muc, ceafe):
-            evaluator = Evaluator(metric)
+        """Score a batch of examples using LEA.
+
+        For details on how LEA works and why to use it see the paper:
+        Which Coreference Evaluation Metric Do You Trust? A Proposal for a Link-based Entity Aware Metric
+        Moosavi and Strube, 2016
+        https://api.semanticscholar.org/CorpusID:17606580
+        """
+        evaluator = Evaluator(lea)
+
         for ex in examples:
             p_clusters = doc2clusters(ex.predicted, self.span_cluster_prefix)
@@ -372,13 +372,7 @@ class CoreferenceResolver(TrainablePipe):
             "coref_p": evaluator.get_precision(),
             "coref_r": evaluator.get_recall(),
         }
-        scores.append(score)
-
-        out = {}
-        for field in ("f", "p", "r"):
-            fname = f"coref_{field}"
-            out[fname] = mean([ss[fname] for ss in scores])
-        return out
+        return score

 default_span_predictor_config = """
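
For readers unfamiliar with the metric the new docstring points to: LEA scores each entity by the fraction of its coreference links (mention pairs) that the other side reproduces, weighted by entity size. The following standalone sketch restates that idea in simplified form; it is not the coval-derived lea function above, and it ignores the singleton handling and mention-mapping details of the real implementation.

    from itertools import combinations

    def lea_sketch(eval_clusters, ref_clusters):
        # For each cluster being evaluated, count how many of its mention pairs
        # (links) also appear together in some reference cluster, weight the
        # resulting resolution score by cluster size, and normalize.
        num, den = 0.0, 0.0
        for cluster in eval_clusters:
            links = list(combinations(cluster, 2))
            if not links:
                continue  # singleton clusters are skipped in this sketch
            correct = sum(
                any(a in ref and b in ref for ref in ref_clusters)
                for a, b in links
            )
            num += len(cluster) * correct / len(links)
            den += len(cluster)
        return num, den

    # Recall evaluates gold clusters against predictions; precision swaps them.
    gold = [["Anna", "she", "her"], ["the dog", "it"]]
    pred = [["Anna", "she"], ["her", "the dog", "it"]]
    r_num, r_den = lea_sketch(gold, pred)
    p_num, p_den = lea_sketch(pred, gold)
    print(r_num / r_den, p_num / p_den)  # 0.6 0.6 for this toy example

This link-based, entity-aware scoring is what the Evaluator(lea) call in score() above computes.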