Mirror of https://github.com/explosion/spaCy.git (synced 2025-07-19 20:52:23 +03:00)
Remove all coref scoring except LEA
This is necessary because one of the three old methods (CEAF) relied on scipy to solve a linear assignment problem. LEA is generally a better metric for evaluation anyway. The downside is that evaluations are no longer directly comparable with many papers, but canonical scoring can still be supported through external eval scripts or other methods.
This commit is contained in:
parent 2300f4df3d
commit e8af02700f
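
The retained scoring path is small. As a minimal usage sketch, assuming the scorer module is importable as spacy.coref_scorer (the exact module path is not shown in this diff) and that mentions are (start, end) token offsets as produced by doc2clusters in the pipeline code further down:

    # Minimal sketch of the LEA-only evaluator kept by this commit.
    # The import path and the (start, end) mention format are assumptions.
    from spacy.coref_scorer import Evaluator, get_cluster_info, lea

    gold_clusters = [[(0, 1), (5, 6)], [(2, 3), (8, 9)]]
    pred_clusters = [[(0, 1), (5, 6), (8, 9)]]

    # get_cluster_info packs the clusters plus mention-to-cluster maps in the
    # order Evaluator.update expects.
    evaluator = Evaluator(lea)
    evaluator.update(get_cluster_info(pred_clusters, gold_clusters))

    print(evaluator.get_precision(), evaluator.get_recall(), evaluator.get_f1())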
@@ -1,17 +1,5 @@
 # copied from coval
 # https://github.com/ns-moosavi/coval
-from collections import Counter
-import numpy as np
-
-try:
-    # This is only used in the ceaf methods. If those are necessary we should
-    # implement this locally to avoid a scipy dep.
-    from scipy.optimize import linear_sum_assignment
-except:
-    pass
-
-# Terminology here is consistent with papers in the field but kind of confusing.
-# Key = gold data, System = predictions.
 
 
 def get_cluster_info(predicted_clusters, gold_clusters):
@@ -44,29 +32,6 @@ def f1(p_num, p_den, r_num, r_den, beta=1):
     return 0 if p + r == 0 else (1 + beta * beta) * p * r / (beta * beta * p + r)
-
-
-def evaluate_non_referrings(doc_non_referring_infos):
-    tp, _tn, fp, fn = 0, 0, 0, 0
-
-    for doc_id in doc_non_referring_infos:
-        key_non_referrings, sys_non_referrings = doc_non_referring_infos[doc_id]
-        for m in key_non_referrings:
-            if m in sys_non_referrings:
-                tp += 1
-            else:
-                fn += 1
-        for m in sys_non_referrings:
-            if m not in key_non_referrings:
-                fp += 1
-
-    recall = tp / float(tp + fn) if (tp + fn) > 0 else 0
-    precision = tp / float(tp + fp) if (tp + fp) > 0 else 0
-    f1 = (
-        2 * recall * precision / (recall + precision) if (recall + precision) > 0 else 0
-    )
-
-    return recall, precision, f1
 
 
 class Evaluator:
     def __init__(self, metric, beta=1, keep_aggregated_values=False):
         self.p_num = 0
@@ -91,14 +56,8 @@ class Evaluator:
             sys_mention_key_cluster,
         ) = coref_info
 
-        if self.metric == ceafe or self.metric == ceafm:
-            pn, pd, rn, rd = self.metric(sys_clusters, key_clusters)
-        elif self.metric == lea:
-            pn, pd = self.metric(sys_clusters, key_clusters, sys_mention_key_cluster)
-            rn, rd = self.metric(key_clusters, sys_clusters, key_mention_sys_cluster)
-        else:
-            pn, pd = self.metric(sys_clusters, sys_mention_key_cluster)
-            rn, rd = self.metric(key_clusters, key_mention_sys_cluster)
+        pn, pd = self.metric(sys_clusters, key_clusters, sys_mention_key_cluster)
+        rn, rd = self.metric(key_clusters, sys_clusters, key_mention_sys_cluster)
         self.p_num += pn
         self.p_den += pd
         self.r_num += rn
@@ -134,89 +93,6 @@ class Evaluator:
         )
-
-
-def evaluate_documents(doc_coref_infos, metric, beta=1):
-    evaluator = Evaluator(metric, beta=beta)
-    for doc_id in doc_coref_infos:
-        evaluator.update(doc_coref_infos[doc_id])
-    return (evaluator.get_recall(), evaluator.get_precision(), evaluator.get_f1())
-
-
-def get_document_evaluations(doc_coref_infos, metric, beta=1):
-    evaluator = Evaluator(metric, beta=beta, keep_aggregated_values=True)
-    for doc_id in doc_coref_infos:
-        evaluator.update(doc_coref_infos[doc_id])
-    return evaluator.get_aggregated_values()
-
-
-def mentions(clusters, mention_to_gold):
-    setofmentions = set(mention for cluster in clusters for mention in cluster)
-    correct = setofmentions & set(mention_to_gold.keys())
-    return len(correct), len(setofmentions)
-
-
-def b_cubed(clusters, mention_to_gold):
-    num, den = 0, 0
-
-    for c in clusters:
-        gold_counts = Counter()
-        correct = 0
-        for m in c:
-            if m in mention_to_gold:
-                gold_counts[mention_to_gold[m]] += 1
-        for c2 in gold_counts:
-            correct += gold_counts[c2] * gold_counts[c2]
-
-        num += correct / float(len(c))
-        den += len(c)
-
-    return num, den
-
-
-def muc(clusters, mention_to_gold):
-    tp, p = 0, 0
-    for c in clusters:
-        p += len(c) - 1
-        tp += len(c)
-        linked = set()
-        for m in c:
-            if m in mention_to_gold:
-                linked.add(mention_to_gold[m])
-            else:
-                tp -= 1
-        tp -= len(linked)
-    return tp, p
-
-
-def phi4(c1, c2):
-    return 2 * len([m for m in c1 if m in c2]) / float(len(c1) + len(c2))
-
-
-def phi3(c1, c2):
-    return len([m for m in c1 if m in c2])
-
-
-def ceafe(clusters, gold_clusters):
-    clusters = [c for c in clusters]
-    scores = np.zeros((len(gold_clusters), len(clusters)))
-    for i in range(len(gold_clusters)):
-        for j in range(len(clusters)):
-            scores[i, j] = phi4(gold_clusters[i], clusters[j])
-    row_ind, col_ind = linear_sum_assignment(-scores)
-    similarity = scores[row_ind, col_ind].sum()
-    return similarity, len(clusters), similarity, len(gold_clusters)
-
-
-def ceafm(clusters, gold_clusters):
-    clusters = [c for c in clusters]
-    scores = np.zeros((len(gold_clusters), len(clusters)))
-    for i in range(len(gold_clusters)):
-        for j in range(len(clusters)):
-            scores[i, j] = phi3(gold_clusters[i], clusters[j])
-    row_ind, col_ind = linear_sum_assignment(-scores)
-    similarity = scores[row_ind, col_ind].sum()
-    return similarity, len(clusters), similarity, len(gold_clusters)
 
 
 def lea(input_clusters, output_clusters, mention_to_gold):
     num, den = 0, 0
 
@@ -25,7 +25,7 @@ from ..ml.models.coref_util import (
     doc2clusters,
 )
 
-from ..coref_scorer import Evaluator, get_cluster_info, b_cubed, muc, ceafe
+from ..coref_scorer import Evaluator, get_cluster_info, lea
 
 
 default_config = """
@@ -349,17 +349,17 @@ class CoreferenceResolver(TrainablePipe):
         assert len(X) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(X=X, Y=Y)
 
-    # TODO This mirrors the evaluation used in prior work, but we don't want to
-    # include this in the final release. The metrics all have fundamental
-    # issues and the current implementation requires scipy.
     def score(self, examples, **kwargs):
-        """Score a batch of examples."""
-
-        # NOTE traditionally coref uses the average of b_cubed, muc, and ceaf.
-        # we need to handle the average ourselves.
-        scores = []
-        for metric in (b_cubed, muc, ceafe):
-            evaluator = Evaluator(metric)
+        """Score a batch of examples using LEA.
+
+        For details on how LEA works and why to use it see the paper:
+        Which Coreference Evaluation Metric Do You Trust? A Proposal for a Link-based Entity Aware Metric
+        Moosavi and Strube, 2016
+        https://api.semanticscholar.org/CorpusID:17606580
+        """
+        evaluator = Evaluator(lea)
 
         for ex in examples:
             p_clusters = doc2clusters(ex.predicted, self.span_cluster_prefix)
@@ -372,13 +372,7 @@ class CoreferenceResolver(TrainablePipe):
             "coref_p": evaluator.get_precision(),
             "coref_r": evaluator.get_recall(),
         }
-            scores.append(score)
-
-        out = {}
-        for field in ("f", "p", "r"):
-            fname = f"coref_{field}"
-            out[fname] = mean([ss[fname] for ss in scores])
-        return out
+        return score
 
 
 default_span_predictor_config = """
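
The commit message notes that canonical scoring can still be supported with external eval scripts. A hypothetical sketch of that, assuming a local copy of the functions removed above (or the upstream coval scripts at https://github.com/ns-moosavi/coval) together with the pre-change Evaluator that still dispatches on the metric; the module name local_coval_copy is a placeholder:

    # Hypothetical sketch only: reproduces the removed CoNLL-style average of
    # B-cubed, MUC and CEAF-e outside the pipeline. "local_coval_copy" stands
    # in for a local copy of the removed functions or the upstream coval code.
    from statistics import mean

    from local_coval_copy import Evaluator, b_cubed, ceafe, muc


    def conll_f1(doc_coref_infos):
        # doc_coref_infos: {doc_id: get_cluster_info(pred_clusters, gold_clusters)}
        scores = []
        for metric in (b_cubed, muc, ceafe):
            evaluator = Evaluator(metric)
            for doc_id in doc_coref_infos:
                evaluator.update(doc_coref_infos[doc_id])
            scores.append(evaluator.get_f1())
        return mean(scores)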