Add beta parameter to Scorer and PRFScore.

Raphael Mitsch 2022-09-01 16:52:50 +02:00
parent 51863cd267
commit ea9737a664
3 changed files with 50 additions and 32 deletions

spacy/cli/find_threshold.py

@@ -7,6 +7,8 @@ from typing import Optional, Tuple, Any, Dict, List
import numpy
import wasabi.tables
from ..pipeline import TrainablePipe, Pipe
from ..errors import Errors
from ..training import Corpus
from ._util import app, Arg, Opt, import_code, setup_gpu
from .. import util
@@ -47,7 +49,7 @@ def find_threshold_cli(
threshold_key (str): Key of threshold attribute in component's configuration.
scores_key (str): Name of the score metric to optimize.
n_trials (int): Number of trials to determine optimal thresholds.
beta (float): Beta for F1 calculation.
beta (float): Beta for F-score calculation.
code_path (Optional[Path]): Path to Python file with additional code (registered functions) to be imported.
use_gpu (int): GPU ID or -1 for CPU.
gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the
@@ -79,10 +81,10 @@ def find_threshold(
threshold_key: str,
scores_key: str,
*,
n_trials: int = _DEFAULTS["n_trials"],
beta: float = _DEFAULTS["beta"],
use_gpu: int = _DEFAULTS["use_gpu"],
gold_preproc: bool = _DEFAULTS["gold_preproc"],
n_trials: int = _DEFAULTS["n_trials"], # type: ignore
beta: float = _DEFAULTS["beta"], # type: ignore
use_gpu: int = _DEFAULTS["use_gpu"], # type: ignore
gold_preproc: bool = _DEFAULTS["gold_preproc"], # type: ignore
silent: bool = True,
) -> Tuple[float, float]:
"""
@@ -93,7 +95,7 @@ def find_threshold(
threshold_key (str): Key of threshold attribute in component's configuration.
scores_key (str): Name of the score metric to optimize.
n_trials (int): Number of trials to determine optimal thresholds.
beta (float): Beta for F1 calculation.
beta (float): Beta for F-score calculation.
use_gpu (int): GPU ID or -1 for CPU.
gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the
tokenization, and may result in sequences of more consistent length. However, it may reduce runtime accuracy due
@@ -108,10 +110,13 @@ def find_threshold(
wasabi.msg.fail("Evaluation data not found", data_path, exits=1)
nlp = util.load_model(model)
pipe: Optional[Pipe] = None
try:
pipe = nlp.get_pipe(pipe_name)
except KeyError as err:
wasabi.msg.fail(title=str(err), exits=1)
if not isinstance(pipe, TrainablePipe):
raise TypeError(Errors.E1044)
if not silent:
wasabi.msg.info(
@@ -140,7 +145,9 @@ def find_threshold(
scores: Dict[float, float] = {}
for threshold in numpy.linspace(0, 1, n_trials):
pipe.cfg = set_nested_item(pipe.cfg, config_keys, threshold)
scores[threshold] = nlp.evaluate(dev_dataset)[scores_key]
scores[threshold] = nlp.evaluate(dev_dataset, scorer_cfg={"beta": beta})[
scores_key
]
if not (
isinstance(scores[threshold], float) or isinstance(scores[threshold], int)
):
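The beta value is threaded into evaluation through nlp.evaluate's scorer_cfg argument, as in the new call above. A minimal sketch of the same pattern outside the CLI, assuming an already trained pipeline and a dev set in .spacy format; the model name, path, and score key below are placeholders, not part of this commit:

import spacy
from pathlib import Path
from spacy.training import Corpus

nlp = spacy.load("my_textcat_model")                # placeholder pipeline name
dev_dataset = list(Corpus(Path("dev.spacy"))(nlp))  # placeholder dev set

# Score the dev set with F2 (recall-weighted) instead of the default F1.
scores = nlp.evaluate(dev_dataset, scorer_cfg={"beta": 2.0})
print(scores["cats_macro_f"])                       # placeholder scores_key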

spacy/errors.py

@@ -939,6 +939,7 @@ class Errors(metaclass=ErrorsWithCodes):
"`{arg2}`={arg2_values} but these arguments are conflicting.")
E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got "
"{value}.")
E1044 = ("Only components of type `TrainablePipe` are supported by `find_threshold()`.")
# Deprecated model shortcuts, only used in errors and warnings

spacy/scorer.py

@@ -22,15 +22,12 @@ class PRFScore:
"""A precision / recall / F score."""
def __init__(
self,
*,
tp: int = 0,
fp: int = 0,
fn: int = 0,
self, *, tp: int = 0, fp: int = 0, fn: int = 0, beta: float = 1
) -> None:
self.tp = tp
self.fp = fp
self.fn = fn
self.beta = beta
def __len__(self) -> int:
return self.tp + self.fp + self.fn
@@ -42,8 +39,12 @@ class PRFScore:
return self
def __add__(self, other):
assert self.beta == other.beta
return PRFScore(
tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
tp=self.tp + other.tp,
fp=self.fp + other.fp,
fn=self.fn + other.fn,
beta=self.beta,
)
def score_set(self, cand: set, gold: set) -> None:
@@ -63,7 +64,7 @@ class PRFScore:
def fscore(self) -> float:
p = self.precision
r = self.recall
return 2 * ((p * r) / (p + r + 1e-100))
return (1 + self.beta**2) * ((p * r) / ((self.beta**2 * p) + r + 1e-100))
def to_dict(self) -> Dict[str, float]:
return {"p": self.precision, "r": self.recall, "f": self.fscore}
@@ -101,6 +102,8 @@ class ROCAUCScore:
class Scorer:
"""Compute evaluation scores."""
BETA = 1
def __init__(
self,
nlp: Optional["Language"] = None,
@@ -149,8 +152,9 @@ class Scorer:
DOCS: https://spacy.io/api/scorer#score_tokenization
"""
acc_score = PRFScore()
prf_score = PRFScore()
beta = cfg.get("beta", Scorer.BETA)
acc_score = PRFScore(beta=beta)
prf_score = PRFScore(beta=beta)
for example in examples:
gold_doc = example.reference
pred_doc = example.predicted
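Each Scorer method now reads beta from its **cfg kwargs and falls back to the new Scorer.BETA class default of 1, so existing callers are unchanged. A small sketch of that lookup in isolation, assuming this branch's Scorer; the toy text is made up:

import spacy
from spacy.scorer import Scorer
from spacy.training import Example

nlp = spacy.blank("en")
pred_doc = nlp("I ate pizza")
example = Example.from_dict(pred_doc, {"words": ["I", "ate", "pizza"]})

# beta travels in via **cfg; omitting it falls back to Scorer.BETA.
print(Scorer.score_tokenization([example], beta=2.0))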
@@ -210,7 +214,7 @@ class Scorer:
DOCS: https://spacy.io/api/scorer#score_token_attr
"""
tag_score = PRFScore()
tag_score = PRFScore(beta=cfg.get("beta", Scorer.BETA))
for example in examples:
gold_doc = example.reference
pred_doc = example.predicted
@@ -261,7 +265,8 @@ class Scorer:
key attr_micro_p/r/f and the per-feat PRF scores under
attr_per_feat.
"""
micro_score = PRFScore()
beta = cfg.get("beta", Scorer.BETA)
micro_score = PRFScore(beta=beta)
per_feat = {}
for example in examples:
pred_doc = example.predicted
@@ -276,7 +281,7 @@ class Scorer:
for feat in morph.split(Morphology.FEATURE_SEP):
field, values = feat.split(Morphology.FIELD_SEP)
if field not in per_feat:
per_feat[field] = PRFScore()
per_feat[field] = PRFScore(beta=beta)
if field not in gold_per_feat:
gold_per_feat[field] = set()
gold_per_feat[field].add((gold_i, feat))
@@ -298,7 +303,7 @@ class Scorer:
for feat in morph.split(Morphology.FEATURE_SEP):
field, values = feat.split(Morphology.FIELD_SEP)
if field not in per_feat:
per_feat[field] = PRFScore()
per_feat[field] = PRFScore(beta=beta)
if field not in pred_per_feat:
pred_per_feat[field] = set()
pred_per_feat[field].add((gold_i, feat))
@@ -353,7 +358,8 @@ class Scorer:
DOCS: https://spacy.io/api/scorer#score_spans
"""
score = PRFScore()
beta = cfg.get("beta", Scorer.BETA)
score = PRFScore(beta=beta)
score_per_type = dict()
for example in examples:
pred_doc = example.predicted
@@ -372,7 +378,7 @@ class Scorer:
gold_per_type: Dict[str, Set] = {label: set() for label in labels}
for label in labels:
if label not in score_per_type:
score_per_type[label] = PRFScore()
score_per_type[label] = PRFScore(beta=beta)
# Find all predicted labels, for all and per type
gold_spans = set()
pred_spans = set()
@@ -469,9 +475,10 @@ class Scorer:
DOCS: https://spacy.io/api/scorer#score_cats
"""
beta = cfg.get("beta", Scorer.BETA)
if threshold is None:
threshold = 0.5 if multi_label else 0.0
f_per_type = {label: PRFScore() for label in labels}
f_per_type = {label: PRFScore(beta=beta) for label in labels}
auc_per_type = {label: ROCAUCScore() for label in labels}
labels = set(labels)
if labels:
@@ -519,7 +526,7 @@ class Scorer:
pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
if pred_score >= threshold:
f_per_type[pred_label].fp += 1
micro_prf = PRFScore()
micro_prf = PRFScore(beta=beta)
for label_prf in f_per_type.values():
micro_prf.tp += label_prf.tp
micro_prf.fn += label_prf.fn
@@ -576,6 +583,7 @@ class Scorer:
DOCS: https://spacy.io/api/scorer#score_links
"""
beta = cfg.get("beta", Scorer.BETA)
f_per_type = {}
for example in examples:
gold_ent_by_offset = {}
@@ -589,7 +597,7 @@ class Scorer:
if gold_span is not None:
label = gold_span.label_
if label not in f_per_type:
f_per_type[label] = PRFScore()
f_per_type[label] = PRFScore(beta=beta)
gold = gold_span.kb_id_
# only evaluating entities that overlap between gold and pred,
# to disentangle the performance of the NEL from the NER
@@ -608,7 +616,7 @@ class Scorer:
# a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN
f_per_type[label].fp += 1
f_per_type[label].fn += 1
micro_prf = PRFScore()
micro_prf = PRFScore(beta=beta)
for label_prf in f_per_type.values():
micro_prf.tp += label_prf.tp
micro_prf.fn += label_prf.fn
@@ -663,8 +671,9 @@ class Scorer:
DOCS: https://spacy.io/api/scorer#score_deps
"""
unlabelled = PRFScore()
labelled = PRFScore()
beta = cfg.get("beta", Scorer.BETA)
unlabelled = PRFScore(beta=beta)
labelled = PRFScore(beta=beta)
labelled_per_dep = dict()
missing_indices = set()
for example in examples:
@@ -680,7 +689,7 @@ class Scorer:
if dep not in ignore_labels:
gold_deps.add((gold_i, head.i, dep))
if dep not in labelled_per_dep:
labelled_per_dep[dep] = PRFScore()
labelled_per_dep[dep] = PRFScore(beta=beta)
if dep not in gold_deps_per_dep:
gold_deps_per_dep[dep] = set()
gold_deps_per_dep[dep].add((gold_i, head.i, dep))
@@ -711,7 +720,7 @@ class Scorer:
else:
pred_deps.add((gold_i, gold_head, dep))
if dep not in labelled_per_dep:
labelled_per_dep[dep] = PRFScore()
labelled_per_dep[dep] = PRFScore(beta=beta)
if dep not in pred_deps_per_dep:
pred_deps_per_dep[dep] = set()
pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
@@ -742,6 +751,7 @@ class Scorer:
def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Compute micro-PRF and per-entity PRF scores for a sequence of examples."""
score_per_type = defaultdict(PRFScore)
beta = kwargs.get("beta", Scorer.BETA)
for eg in examples:
if not eg.y.has_annotation("ENT_IOB"):
continue
@@ -749,7 +759,7 @@ def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
align_x2y = eg.alignment.x2y
for pred_ent in eg.x.ents:
if pred_ent.label_ not in score_per_type:
score_per_type[pred_ent.label_] = PRFScore()
score_per_type[pred_ent.label_] = PRFScore(beta=beta)
indices = align_x2y[pred_ent.start : pred_ent.end]
if len(indices):
g_span = eg.y[indices[0] : indices[-1] + 1]
@@ -765,7 +775,7 @@ def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
score_per_type[pred_ent.label_].fp += 1
for label, start, end in golds:
score_per_type[label].fn += 1
totals = PRFScore()
totals = PRFScore(beta=beta)
for prf in score_per_type.values():
totals += prf
if len(totals) > 0:
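Because get_ner_prf now pulls beta from its kwargs and passes it to every PRFScore it creates, including the totals that the per-type scores are summed into (which is why __add__ asserts matching betas), a recall-weighted NER evaluation becomes a one-argument change. A sketch assuming this branch's get_ner_prf; the toy doc and entities are made up:

import spacy
from spacy.scorer import get_ner_prf
from spacy.tokens import Span
from spacy.training import Example

nlp = spacy.blank("en")
pred_doc = nlp("Alice visited Berlin")
pred_doc.ents = [Span(pred_doc, 0, 1, label="PERSON")]  # prediction misses "Berlin"
example = Example.from_dict(
    pred_doc, {"entities": [(0, 5, "PERSON"), (14, 20, "GPE")]}
)

f1 = get_ner_prf([example])["ents_f"]            # beta defaults to Scorer.BETA (1)
f2 = get_ner_prf([example], beta=2.0)["ents_f"]  # recall weighted 4x precision
print(f1, f2)  # f2 < f1 here because recall (0.5) is the weak side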