diff --git a/spacy/coref_scorer.py b/spacy/coref_scorer.py
index e00b22fd7..b266ec3b3 100644
--- a/spacy/coref_scorer.py
+++ b/spacy/coref_scorer.py
@@ -1,17 +1,5 @@
 # copied from coval
 # https://github.com/ns-moosavi/coval
-from collections import Counter
-import numpy as np
-
-try:
-    # This is only used in the ceaf methods. If those are necessary we should
-    # implement this locally to avoid a scipy dep.
-    from scipy.optimize import linear_sum_assignment
-except:
-    pass
-
-# Terminology here is consistent with papers in the field but kind of confusing.
-# Key = gold data, System = predictions.


 def get_cluster_info(predicted_clusters, gold_clusters):
@@ -44,29 +32,6 @@ def f1(p_num, p_den, r_num, r_den, beta=1):
     return 0 if p + r == 0 else (1 + beta * beta) * p * r / (beta * beta * p + r)


-def evaluate_non_referrings(doc_non_referring_infos):
-    tp, _tn, fp, fn = 0, 0, 0, 0
-
-    for doc_id in doc_non_referring_infos:
-        key_non_referrings, sys_non_referrings = doc_non_referring_infos[doc_id]
-        for m in key_non_referrings:
-            if m in sys_non_referrings:
-                tp += 1
-            else:
-                fn += 1
-        for m in sys_non_referrings:
-            if m not in key_non_referrings:
-                fp += 1
-
-    recall = tp / float(tp + fn) if (tp + fn) > 0 else 0
-    precision = tp / float(tp + fp) if (tp + fp) > 0 else 0
-    f1 = (
-        2 * recall * precision / (recall + precision) if (recall + precision) > 0 else 0
-    )
-
-    return recall, precision, f1
-
-
 class Evaluator:
     def __init__(self, metric, beta=1, keep_aggregated_values=False):
         self.p_num = 0
@@ -91,14 +56,8 @@ class Evaluator:
             sys_mention_key_cluster,
         ) = coref_info

-        if self.metric == ceafe or self.metric == ceafm:
-            pn, pd, rn, rd = self.metric(sys_clusters, key_clusters)
-        elif self.metric == lea:
-            pn, pd = self.metric(sys_clusters, key_clusters, sys_mention_key_cluster)
-            rn, rd = self.metric(key_clusters, sys_clusters, key_mention_sys_cluster)
-        else:
-            pn, pd = self.metric(sys_clusters, sys_mention_key_cluster)
-            rn, rd = self.metric(key_clusters, key_mention_sys_cluster)
+        pn, pd = self.metric(sys_clusters, key_clusters, sys_mention_key_cluster)
+        rn, rd = self.metric(key_clusters, sys_clusters, key_mention_sys_cluster)
         self.p_num += pn
         self.p_den += pd
         self.r_num += rn
@@ -134,89 +93,6 @@ class Evaluator:
         )


-def evaluate_documents(doc_coref_infos, metric, beta=1):
-    evaluator = Evaluator(metric, beta=beta)
-    for doc_id in doc_coref_infos:
-        evaluator.update(doc_coref_infos[doc_id])
-    return (evaluator.get_recall(), evaluator.get_precision(), evaluator.get_f1())
-
-
-def get_document_evaluations(doc_coref_infos, metric, beta=1):
-    evaluator = Evaluator(metric, beta=beta, keep_aggregated_values=True)
-    for doc_id in doc_coref_infos:
-        evaluator.update(doc_coref_infos[doc_id])
-    return evaluator.get_aggregated_values()
-
-
-def mentions(clusters, mention_to_gold):
-    setofmentions = set(mention for cluster in clusters for mention in cluster)
-    correct = setofmentions & set(mention_to_gold.keys())
-    return len(correct), len(setofmentions)
-
-
-def b_cubed(clusters, mention_to_gold):
-    num, den = 0, 0
-
-    for c in clusters:
-        gold_counts = Counter()
-        correct = 0
-        for m in c:
-            if m in mention_to_gold:
-                gold_counts[mention_to_gold[m]] += 1
-        for c2 in gold_counts:
-            correct += gold_counts[c2] * gold_counts[c2]
-
-        num += correct / float(len(c))
-        den += len(c)
-
-    return num, den
-
-
-def muc(clusters, mention_to_gold):
-    tp, p = 0, 0
-    for c in clusters:
-        p += len(c) - 1
-        tp += len(c)
-        linked = set()
-        for m in c:
-            if m in mention_to_gold:
-                linked.add(mention_to_gold[m])
-            else:
-                tp -= 1
-        tp -= len(linked)
-    return tp, p
-
-
-def phi4(c1, c2):
-    return 2 * len([m for m in c1 if m in c2]) / float(len(c1) + len(c2))
-
-
-def phi3(c1, c2):
-    return len([m for m in c1 if m in c2])
-
-
-def ceafe(clusters, gold_clusters):
-    clusters = [c for c in clusters]
-    scores = np.zeros((len(gold_clusters), len(clusters)))
-    for i in range(len(gold_clusters)):
-        for j in range(len(clusters)):
-            scores[i, j] = phi4(gold_clusters[i], clusters[j])
-    row_ind, col_ind = linear_sum_assignment(-scores)
-    similarity = scores[row_ind, col_ind].sum()
-    return similarity, len(clusters), similarity, len(gold_clusters)
-
-
-def ceafm(clusters, gold_clusters):
-    clusters = [c for c in clusters]
-    scores = np.zeros((len(gold_clusters), len(clusters)))
-    for i in range(len(gold_clusters)):
-        for j in range(len(clusters)):
-            scores[i, j] = phi3(gold_clusters[i], clusters[j])
-    row_ind, col_ind = linear_sum_assignment(-scores)
-    similarity = scores[row_ind, col_ind].sum()
-    return similarity, len(clusters), similarity, len(gold_clusters)
-
-
 def lea(input_clusters, output_clusters, mention_to_gold):
     num, den = 0, 0

diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py
index 4111f8445..671d65e19 100644
--- a/spacy/pipeline/coref.py
+++ b/spacy/pipeline/coref.py
@@ -25,7 +25,7 @@ from ..ml.models.coref_util import (
     doc2clusters,
 )

-from ..coref_scorer import Evaluator, get_cluster_info, b_cubed, muc, ceafe
+from ..coref_scorer import Evaluator, get_cluster_info, lea


 default_config = """
@@ -349,36 +349,30 @@ class CoreferenceResolver(TrainablePipe):
         assert len(X) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(X=X, Y=Y)

-    # TODO This mirrors the evaluation used in prior work, but we don't want to
-    # include this in the final release. The metrics all have fundamental
-    # issues and the current implementation requires scipy.
     def score(self, examples, **kwargs):
-        """Score a batch of examples."""
+        """Score a batch of examples using LEA.
+
-        # NOTE traditionally coref uses the average of b_cubed, muc, and ceaf.
-        # we need to handle the average ourselves.
-        scores = []
-        for metric in (b_cubed, muc, ceafe):
-            evaluator = Evaluator(metric)
+        For details on how LEA works and why to use it see the paper:
+
-            for ex in examples:
-                p_clusters = doc2clusters(ex.predicted, self.span_cluster_prefix)
-                g_clusters = doc2clusters(ex.reference, self.span_cluster_prefix)
-                cluster_info = get_cluster_info(p_clusters, g_clusters)
-                evaluator.update(cluster_info)
+        Which Coreference Evaluation Metric Do You Trust? A Proposal for a Link-based Entity Aware Metric
+        Moosavi and Strube, 2016
+        https://api.semanticscholar.org/CorpusID:17606580
+        """
-            score = {
-                "coref_f": evaluator.get_f1(),
-                "coref_p": evaluator.get_precision(),
-                "coref_r": evaluator.get_recall(),
-            }
-            scores.append(score)
+        evaluator = Evaluator(lea)
-        out = {}
-        for field in ("f", "p", "r"):
-            fname = f"coref_{field}"
-            out[fname] = mean([ss[fname] for ss in scores])
-        return out
+        for ex in examples:
+            p_clusters = doc2clusters(ex.predicted, self.span_cluster_prefix)
+            g_clusters = doc2clusters(ex.reference, self.span_cluster_prefix)
+            cluster_info = get_cluster_info(p_clusters, g_clusters)
+            evaluator.update(cluster_info)
+
+        score = {
+            "coref_f": evaluator.get_f1(),
+            "coref_p": evaluator.get_precision(),
+            "coref_r": evaluator.get_recall(),
+        }
+        return score


 default_span_predictor_config = """
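
For context, here is a minimal usage sketch of the trimmed-down scorer. It is not part of the diff: it assumes the module lives at spacy/coref_scorer.py as above and that mentions are hashable (start, end) tuples, which is what doc2clusters produces in the pipeline; the toy clusters are invented purely for illustration.

# Hypothetical toy example: gold has clusters {A, B} and {C}; the prediction
# wrongly merges all three mentions into a single cluster.
from spacy.coref_scorer import Evaluator, get_cluster_info, lea

gold_clusters = [[(0, 1), (5, 6)], [(10, 12)]]
predicted_clusters = [[(0, 1), (5, 6), (10, 12)]]

# Same call pattern as CoreferenceResolver.score, but for a single document.
evaluator = Evaluator(lea)
cluster_info = get_cluster_info(predicted_clusters, gold_clusters)
evaluator.update(cluster_info)

print(
    {
        "coref_f": evaluator.get_f1(),
        "coref_p": evaluator.get_precision(),
        "coref_r": evaluator.get_recall(),
    }
)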