diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py
index 3cc6606dd..4e67c9f9f 100644
--- a/spacy/pipeline/coref.py
+++ b/spacy/pipeline/coref.py
@@ -5,6 +5,7 @@ from thinc.types import Floats2d, Ints2d
 from thinc.api import Model, Config, Optimizer, CategoricalCrossentropy
 from thinc.api import set_dropout_rate
 from itertools import islice
+from statistics import mean
 
 from .trainable_pipe import TrainablePipe
 from ..language import Language
@@ -23,7 +24,7 @@ from ..ml.models.coref_util import (
     doc2clusters,
 )
 
-from ..coref_scorer import Evaluator, get_cluster_info, b_cubed
+from ..coref_scorer import Evaluator, get_cluster_info, b_cubed, muc, ceafe
 
 default_config = """
 [model]
@@ -344,28 +345,35 @@ class CoreferenceResolver(TrainablePipe):
         assert len(X) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(X=X, Y=Y)
 
-    # TODO consider whether to use this. It's pretty fast, but it'll be slower if
-    # we use all three methods like the original evaluator does. Also the current
-    # implementation, borrowed from the coval project, uses scipy, which we would
-    # want to avoid. (If that's the only issue we can probably work around it.)
+    # TODO This mirrors the evaluation used in prior work, but we don't want to
+    # include this in the final release. The metrics all have fundamental
+    # issues and the current implementation requires scipy.
    def score(self, examples, **kwargs):
         """Score a batch of examples."""
-        #TODO traditionally coref uses the average of b_cubed, muc, and ceaf.
+        #NOTE traditionally coref uses the average of b_cubed, muc, and ceaf.
         # we need to handle the average ourselves.
-        evaluator = Evaluator(b_cubed)
+        scores = []
+        for metric in (b_cubed, muc, ceafe):
+            evaluator = Evaluator(metric)
 
-        for ex in examples:
-            p_clusters = doc2clusters(ex.predicted, self.span_cluster_prefix)
-            g_clusters = doc2clusters(ex.reference, self.span_cluster_prefix)
+            for ex in examples:
+                p_clusters = doc2clusters(ex.predicted, self.span_cluster_prefix)
+                g_clusters = doc2clusters(ex.reference, self.span_cluster_prefix)
 
-            cluster_info = get_cluster_info(p_clusters, g_clusters)
+                cluster_info = get_cluster_info(p_clusters, g_clusters)
 
-            evaluator.update(cluster_info)
+                evaluator.update(cluster_info)
 
-        scores ={
-            "coref_f": evaluator.get_f1(),
-            "coref_p": evaluator.get_precision(),
-            "coref_r": evaluator.get_recall(),
-        }
-        return scores
+            score = {
+                "coref_f": evaluator.get_f1(),
+                "coref_p": evaluator.get_precision(),
+                "coref_r": evaluator.get_recall(),
+            }
+            scores.append(score)
+
+        out = {}
+        for field in ("f", "p", "r"):
+            fname = f"coref_{field}"
+            out[fname] = mean([ss[fname] for ss in scores])
+        return out
 
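
For reference, the averaging step that the new score() performs can be reduced to the standalone sketch below. The per-metric dictionaries here are hypothetical stand-ins for the precision/recall/F1 that one Evaluator pass per metric (b_cubed, muc, ceafe) would produce; only the final combination with statistics.mean mirrors the logic added in the diff.

    # Minimal sketch of averaging per-metric coref scores, assuming
    # hypothetical example values in place of real Evaluator output.
    from statistics import mean

    per_metric_scores = [
        {"coref_f": 0.62, "coref_p": 0.65, "coref_r": 0.59},  # e.g. b_cubed
        {"coref_f": 0.70, "coref_p": 0.72, "coref_r": 0.68},  # e.g. muc
        {"coref_f": 0.55, "coref_p": 0.57, "coref_r": 0.53},  # e.g. ceafe
    ]

    # Average each field across the three metrics, as score() does.
    averaged = {
        f"coref_{field}": mean(s[f"coref_{field}"] for s in per_metric_scores)
        for field in ("f", "p", "r")
    }
    print(averaged)  # {'coref_f': 0.623..., 'coref_p': 0.646..., 'coref_r': 0.6}

The same shape of result is what the pipeline reports: a single coref_f/coref_p/coref_r triple obtained by averaging the three traditional coreference metrics.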