Mirror of https://github.com/explosion/spaCy.git (synced 2025-07-19 04:32:32 +03:00)
Add full traditional scoring
This calculates scores as the average of three metrics (B³, MUC, and CEAF-e). As noted in the code, these metrics all have issues, but we use them to match up with prior work. This should eventually be replaced with simpler default scoring, and the scorer here should be moved to an external project that can be passed in just for generating the traditional scores.
This commit is contained in:
parent a4531be099
commit bc081c24fa
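
For reference, the "average of three metrics" in the message above is the traditional CoNLL-style coreference score: B³ (b_cubed), MUC, and CEAF-e (ceafe) each yield precision, recall, and F1, and the reported coref_p, coref_r, and coref_f are the unweighted means of those values across the three metrics. A minimal sketch of that arithmetic, with made-up numbers purely for illustration:

from statistics import mean

# Hypothetical per-metric results, keyed the way the pipe reports scores.
per_metric = [
    {"coref_f": 0.62, "coref_p": 0.65, "coref_r": 0.59},  # b_cubed (illustrative)
    {"coref_f": 0.70, "coref_p": 0.72, "coref_r": 0.68},  # muc (illustrative)
    {"coref_f": 0.58, "coref_p": 0.60, "coref_r": 0.56},  # ceafe (illustrative)
]

# The reported value for each field is the plain mean over the three metrics.
out = {
    f"coref_{field}": mean(m[f"coref_{field}"] for m in per_metric)
    for field in ("f", "p", "r")
}
print(out["coref_f"])  # 0.6333... for these made-up numbers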
@@ -5,6 +5,7 @@ from thinc.types import Floats2d, Ints2d
 from thinc.api import Model, Config, Optimizer, CategoricalCrossentropy
 from thinc.api import set_dropout_rate
 from itertools import islice
+from statistics import mean

 from .trainable_pipe import TrainablePipe
 from ..language import Language
@@ -23,7 +24,7 @@ from ..ml.models.coref_util import (
     doc2clusters,
 )

-from ..coref_scorer import Evaluator, get_cluster_info, b_cubed
+from ..coref_scorer import Evaluator, get_cluster_info, b_cubed, muc, ceafe

 default_config = """
 [model]
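
The newly imported muc and ceafe play the same role as b_cubed: each is a metric function handed to the coval-derived Evaluator, which accumulates cluster information per document and exposes precision, recall, and F1. A minimal sketch of scoring a single metric, assuming only the interface visible in this diff (the argument order of get_cluster_info is not shown here and is an assumption):

from ..coref_scorer import Evaluator, get_cluster_info, muc

def score_single_metric(cluster_pairs, metric=muc):
    # cluster_pairs: iterable of (predicted_clusters, gold_clusters), one per doc.
    evaluator = Evaluator(metric)
    for predicted, gold in cluster_pairs:
        # Assumed signature: build per-document cluster info, then accumulate it.
        evaluator.update(get_cluster_info(predicted, gold))
    return {
        "f": evaluator.get_f1(),
        "p": evaluator.get_precision(),
        "r": evaluator.get_recall(),
    }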
@@ -344,15 +345,16 @@ class CoreferenceResolver(TrainablePipe):
         assert len(X) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(X=X, Y=Y)

-    # TODO consider whether to use this. It's pretty fast, but it'll be slower if
-    # we use all three methods like the original evaluator does. Also the current
-    # implementation, borrowed from the coval project, uses scipy, which we would
-    # want to avoid. (If that's the only issue we can probably work around it.)
+    # TODO This mirrors the evaluation used in prior work, but we don't want to
+    # include this in the final release. The metrics all have fundamental
+    # issues and the current implementation requires scipy.
     def score(self, examples, **kwargs):
         """Score a batch of examples."""

-        #TODO traditionally coref uses the average of b_cubed, muc, and ceaf.
+        #NOTE traditionally coref uses the average of b_cubed, muc, and ceaf.
         # we need to handle the average ourselves.
+        scores = []
+        for metric in (b_cubed, muc, ceafe):
             evaluator = Evaluator(b_cubed)

             for ex in examples:
@@ -363,9 +365,15 @@ class CoreferenceResolver(TrainablePipe):

                 evaluator.update(cluster_info)

-        scores ={
+            score ={
                 "coref_f": evaluator.get_f1(),
                 "coref_p": evaluator.get_precision(),
                 "coref_r": evaluator.get_recall(),
             }
-        return scores
+            scores.append(score)
+
+        out = {}
+        for field in ("f", "p", "r"):
+            fname = f"coref_{field}"
+            out[fname] = mean([ss[fname] for ss in scores])
+        return out
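
Read together, the hunks above turn score() into: one Evaluator per metric, a per-metric dict of precision/recall/F1, and a final pass that averages each field with statistics.mean. Below is a consolidated sketch of the resulting method; the per-example lines that build cluster_info fall between the hunks and are left elided, and the sketch constructs the evaluator from the loop variable metric, whereas the diff as shown passes b_cubed on every iteration.

    def score(self, examples, **kwargs):
        """Score a batch of examples (consolidated sketch of the diff above)."""
        scores = []
        for metric in (b_cubed, muc, ceafe):
            # Sketch assumption: use the loop variable; the diff passes b_cubed here.
            evaluator = Evaluator(metric)

            for ex in examples:
                # Building cluster_info from ex is elided in the diff above.
                cluster_info = ...
                evaluator.update(cluster_info)

            scores.append({
                "coref_f": evaluator.get_f1(),
                "coref_p": evaluator.get_precision(),
                "coref_r": evaluator.get_recall(),
            })

        # Average each field over the three metrics: the "traditional" coref score.
        out = {}
        for field in ("f", "p", "r"):
            fname = f"coref_{field}"
            out[fname] = mean([ss[fname] for ss in scores])
        return out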