Remove all coref scoring except LEA

This is necessary because one of the three old methods (CEAF) relied on
scipy to solve a linear assignment problem. LEA is also generally a
better metric for evaluation.

The downside is that evaluation results are no longer directly
comparable with many published papers, but canonical scoring can still
be supported using external eval scripts or other methods.
Paul O'Leary McCann 2022-04-13 21:02:18 +09:00
parent 2300f4df3d
commit e8af02700f
2 changed files with 22 additions and 152 deletions
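
To make the change concrete, here is a rough usage sketch of the scorer API that survives this commit, based only on the functions visible in the diff below. The module path and the exact shape of the mention tuples are assumptions on my part, not something this commit specifies.

    # Hedged sketch: assumes the scorer module lives at spacy/coref_scorer.py on
    # this branch and that clusters are lists of (start, end) token-offset tuples.
    from spacy.coref_scorer import Evaluator, get_cluster_info, lea

    gold_clusters = [[(0, 1), (4, 5)], [(7, 9), (12, 13)]]
    pred_clusters = [[(0, 1), (4, 5), (7, 9)], [(12, 13), (15, 16)]]

    # get_cluster_info packs gold/predicted clusters plus mention-to-cluster
    # mappings into the tuple format that Evaluator.update expects.
    cluster_info = get_cluster_info(pred_clusters, gold_clusters)

    evaluator = Evaluator(lea)  # LEA is the only metric left after this commit
    evaluator.update(cluster_info)
    print(evaluator.get_precision(), evaluator.get_recall(), evaluator.get_f1())

With only LEA left, these numbers come from a single metric rather than the b_cubed/muc/ceafe average the old score() computed.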


@@ -1,17 +1,5 @@
 # copied from coval
 # https://github.com/ns-moosavi/coval
-from collections import Counter
-import numpy as np
-
-try:
-    # This is only used in the ceaf methods. If those are necessary we should
-    # implement this locally to avoid a scipy dep.
-    from scipy.optimize import linear_sum_assignment
-except:
-    pass
-
-# Terminology here is consistent with papers in the field but kind of confusing.
-# Key = gold data, System = predictions.
 def get_cluster_info(predicted_clusters, gold_clusters):
@@ -44,29 +32,6 @@ def f1(p_num, p_den, r_num, r_den, beta=1):
     return 0 if p + r == 0 else (1 + beta * beta) * p * r / (beta * beta * p + r)

-def evaluate_non_referrings(doc_non_referring_infos):
-    tp, _tn, fp, fn = 0, 0, 0, 0
-    for doc_id in doc_non_referring_infos:
-        key_non_referrings, sys_non_referrings = doc_non_referring_infos[doc_id]
-        for m in key_non_referrings:
-            if m in sys_non_referrings:
-                tp += 1
-            else:
-                fn += 1
-        for m in sys_non_referrings:
-            if m not in key_non_referrings:
-                fp += 1
-
-    recall = tp / float(tp + fn) if (tp + fn) > 0 else 0
-    precision = tp / float(tp + fp) if (tp + fp) > 0 else 0
-    f1 = (
-        2 * recall * precision / (recall + precision) if (recall + precision) > 0 else 0
-    )
-    return recall, precision, f1
-
 class Evaluator:
     def __init__(self, metric, beta=1, keep_aggregated_values=False):
         self.p_num = 0
@@ -91,14 +56,8 @@ class Evaluator:
             sys_mention_key_cluster,
         ) = coref_info

-        if self.metric == ceafe or self.metric == ceafm:
-            pn, pd, rn, rd = self.metric(sys_clusters, key_clusters)
-        elif self.metric == lea:
-            pn, pd = self.metric(sys_clusters, key_clusters, sys_mention_key_cluster)
-            rn, rd = self.metric(key_clusters, sys_clusters, key_mention_sys_cluster)
-        else:
-            pn, pd = self.metric(sys_clusters, sys_mention_key_cluster)
-            rn, rd = self.metric(key_clusters, key_mention_sys_cluster)
+        pn, pd = self.metric(sys_clusters, key_clusters, sys_mention_key_cluster)
+        rn, rd = self.metric(key_clusters, sys_clusters, key_mention_sys_cluster)
         self.p_num += pn
         self.p_den += pd
         self.r_num += rn
@@ -134,89 +93,6 @@ class Evaluator:
         )

-def evaluate_documents(doc_coref_infos, metric, beta=1):
-    evaluator = Evaluator(metric, beta=beta)
-    for doc_id in doc_coref_infos:
-        evaluator.update(doc_coref_infos[doc_id])
-    return (evaluator.get_recall(), evaluator.get_precision(), evaluator.get_f1())
-
-def get_document_evaluations(doc_coref_infos, metric, beta=1):
-    evaluator = Evaluator(metric, beta=beta, keep_aggregated_values=True)
-    for doc_id in doc_coref_infos:
-        evaluator.update(doc_coref_infos[doc_id])
-    return evaluator.get_aggregated_values()
-
-def mentions(clusters, mention_to_gold):
-    setofmentions = set(mention for cluster in clusters for mention in cluster)
-    correct = setofmentions & set(mention_to_gold.keys())
-    return len(correct), len(setofmentions)
-
-def b_cubed(clusters, mention_to_gold):
-    num, den = 0, 0
-    for c in clusters:
-        gold_counts = Counter()
-        correct = 0
-        for m in c:
-            if m in mention_to_gold:
-                gold_counts[mention_to_gold[m]] += 1
-        for c2 in gold_counts:
-            correct += gold_counts[c2] * gold_counts[c2]
-        num += correct / float(len(c))
-        den += len(c)
-    return num, den
-
-def muc(clusters, mention_to_gold):
-    tp, p = 0, 0
-    for c in clusters:
-        p += len(c) - 1
-        tp += len(c)
-        linked = set()
-        for m in c:
-            if m in mention_to_gold:
-                linked.add(mention_to_gold[m])
-            else:
-                tp -= 1
-        tp -= len(linked)
-    return tp, p
-
-def phi4(c1, c2):
-    return 2 * len([m for m in c1 if m in c2]) / float(len(c1) + len(c2))
-
-def phi3(c1, c2):
-    return len([m for m in c1 if m in c2])
-
-def ceafe(clusters, gold_clusters):
-    clusters = [c for c in clusters]
-    scores = np.zeros((len(gold_clusters), len(clusters)))
-    for i in range(len(gold_clusters)):
-        for j in range(len(clusters)):
-            scores[i, j] = phi4(gold_clusters[i], clusters[j])
-    row_ind, col_ind = linear_sum_assignment(-scores)
-    similarity = scores[row_ind, col_ind].sum()
-    return similarity, len(clusters), similarity, len(gold_clusters)
-
-def ceafm(clusters, gold_clusters):
-    clusters = [c for c in clusters]
-    scores = np.zeros((len(gold_clusters), len(clusters)))
-    for i in range(len(gold_clusters)):
-        for j in range(len(clusters)):
-            scores[i, j] = phi3(gold_clusters[i], clusters[j])
-    row_ind, col_ind = linear_sum_assignment(-scores)
-    similarity = scores[row_ind, col_ind].sum()
-    return similarity, len(clusters), similarity, len(gold_clusters)
-
 def lea(input_clusters, output_clusters, mention_to_gold):
     num, den = 0, 0
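
For context on the scipy dependency the commit message refers to: the removed CEAF metrics build a gold-by-predicted similarity matrix and then need the best one-to-one alignment between gold and predicted clusters, which is the linear assignment problem that scipy.optimize.linear_sum_assignment solves. A toy illustration with made-up similarity values, separate from the code above:

    import numpy as np
    from scipy.optimize import linear_sum_assignment

    # Rows are gold entities, columns are predicted entities; entries are
    # phi3/phi4-style similarities (the values here are invented).
    scores = np.array([
        [0.8, 0.1, 0.0],
        [0.2, 0.9, 0.3],
    ])

    # linear_sum_assignment minimizes total cost, so negate the scores to
    # maximize similarity, exactly as the removed ceafe/ceafm code did.
    row_ind, col_ind = linear_sum_assignment(-scores)
    similarity = scores[row_ind, col_ind].sum()
    print(row_ind, col_ind, similarity)  # gold 0 -> pred 0, gold 1 -> pred 1, total 1.7

Dropping CEAF is what makes it possible to drop this dependency.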


@@ -25,7 +25,7 @@ from ..ml.models.coref_util import (
     doc2clusters,
 )

-from ..coref_scorer import Evaluator, get_cluster_info, b_cubed, muc, ceafe
+from ..coref_scorer import Evaluator, get_cluster_info, lea

 default_config = """
@@ -349,17 +349,17 @@ class CoreferenceResolver(TrainablePipe):
         assert len(X) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(X=X, Y=Y)

-    # TODO This mirrors the evaluation used in prior work, but we don't want to
-    # include this in the final release. The metrics all have fundamental
-    # issues and the current implementation requires scipy.
     def score(self, examples, **kwargs):
-        """Score a batch of examples."""
-        # NOTE traditionally coref uses the average of b_cubed, muc, and ceaf.
-        # we need to handle the average ourselves.
-        scores = []
-        for metric in (b_cubed, muc, ceafe):
-            evaluator = Evaluator(metric)
+        """Score a batch of examples using LEA.
+
+        For details on how LEA works and why to use it see the paper:
+        Which Coreference Evaluation Metric Do You Trust? A Proposal for a Link-based Entity Aware Metric
+        Moosavi and Strube, 2016
+        https://api.semanticscholar.org/CorpusID:17606580
+        """
+        evaluator = Evaluator(lea)
+
         for ex in examples:
             p_clusters = doc2clusters(ex.predicted, self.span_cluster_prefix)
@@ -372,13 +372,7 @@ class CoreferenceResolver(TrainablePipe):
             "coref_p": evaluator.get_precision(),
             "coref_r": evaluator.get_recall(),
         }
-        scores.append(score)
-
-        out = {}
-        for field in ("f", "p", "r"):
-            fname = f"coref_{field}"
-            out[fname] = mean([ss[fname] for ss in scores])
-        return out
+        return score

 default_span_predictor_config = """
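
For readers unfamiliar with the metric the new docstring points to: LEA scores each entity by the fraction of its coreference links (mention pairs) that the other side reproduces, weighted by entity size. The following standalone sketch restates that idea in simplified form; it is not the coval-derived lea function above, and it ignores the singleton handling and mention-mapping details of the real implementation.

    from itertools import combinations

    def lea_sketch(eval_clusters, ref_clusters):
        # For each cluster being evaluated, count how many of its mention pairs
        # (links) also appear together in some reference cluster, weight the
        # resulting resolution score by cluster size, and normalize.
        num, den = 0.0, 0.0
        for cluster in eval_clusters:
            links = list(combinations(cluster, 2))
            if not links:
                continue  # singleton clusters are skipped in this sketch
            correct = sum(
                any(a in ref and b in ref for ref in ref_clusters)
                for a, b in links
            )
            num += len(cluster) * correct / len(links)
            den += len(cluster)
        return num, den

    # Recall evaluates gold clusters against predictions; precision swaps them.
    gold = [["Anna", "she", "her"], ["the dog", "it"]]
    pred = [["Anna", "she"], ["her", "the dog", "it"]]
    r_num, r_den = lea_sketch(gold, pred)
    p_num, p_den = lea_sketch(pred, gold)
    print(r_num / r_den, p_num / p_den)  # 0.6 0.6 for this toy example

This link-based, entity-aware scoring is what the Evaluator(lea) call in score() above computes.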