From ea9737a664bfeda6003bd904e914afc26234ecba Mon Sep 17 00:00:00 2001
From: Raphael Mitsch
Date: Thu, 1 Sep 2022 16:52:50 +0200
Subject: [PATCH] Add beta parameter to Scorer and PRFScore.

---
 spacy/cli/find_threshold.py | 21 ++++++++-----
 spacy/errors.py             |  1 +
 spacy/scorer.py             | 60 +++++++++++++++++++++----------
 3 files changed, 50 insertions(+), 32 deletions(-)

diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py
index fe3bdedae..1641c2d04 100644
--- a/spacy/cli/find_threshold.py
+++ b/spacy/cli/find_threshold.py
@@ -7,6 +7,8 @@ from typing import Optional, Tuple, Any, Dict, List
 import numpy
 import wasabi.tables
 
+from ..pipeline import TrainablePipe, Pipe
+from ..errors import Errors
 from ..training import Corpus
 from ._util import app, Arg, Opt, import_code, setup_gpu
 from .. import util
@@ -47,7 +49,7 @@ def find_threshold_cli(
     threshold_key (str): Key of threshold attribute in component's configuration.
     scores_key (str): Name of score to metric to optimize.
     n_trials (int): Number of trials to determine optimal thresholds
-    beta (float): Beta for F1 calculation.
+    beta (float): Beta for F-score calculation.
     code_path (Optional[Path]): Path to Python file with additional code (registered functions) to be imported.
     use_gpu (int): GPU ID or -1 for CPU.
     gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the
@@ -79,10 +81,10 @@ def find_threshold(
     threshold_key: str,
     scores_key: str,
     *,
-    n_trials: int = _DEFAULTS["n_trials"],
-    beta: float = _DEFAULTS["beta"],
-    use_gpu: int = _DEFAULTS["use_gpu"],
-    gold_preproc: bool = _DEFAULTS["gold_preproc"],
+    n_trials: int = _DEFAULTS["n_trials"],  # type: ignore
+    beta: float = _DEFAULTS["beta"],  # type: ignore
+    use_gpu: int = _DEFAULTS["use_gpu"],  # type: ignore
+    gold_preproc: bool = _DEFAULTS["gold_preproc"],  # type: ignore
     silent: bool = True,
 ) -> Tuple[float, float]:
     """
@@ -93,7 +95,7 @@ def find_threshold(
     threshold_key (str): Key of threshold attribute in component's configuration.
     scores_key (str): Name of score to metric to optimize.
     n_trials (int): Number of trials to determine optimal thresholds.
-    beta (float): Beta for F1 calculation.
+    beta (float): Beta for F-score calculation.
     use_gpu (int): GPU ID or -1 for CPU.
     gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the
         tokenization, and may result in sequences of more consistent length. However, it may reduce runtime accuracy due
@@ -108,10 +110,13 @@ def find_threshold(
         wasabi.msg.fail("Evaluation data not found", data_path, exits=1)
     nlp = util.load_model(model)
 
+    pipe: Optional[Pipe] = None
     try:
         pipe = nlp.get_pipe(pipe_name)
     except KeyError as err:
         wasabi.msg.fail(title=str(err), exits=1)
+    if not isinstance(pipe, TrainablePipe):
+        raise TypeError(Errors.E1044)
 
     if not silent:
         wasabi.msg.info(
@@ -140,7 +145,9 @@ def find_threshold(
     scores: Dict[float, float] = {}
     for threshold in numpy.linspace(0, 1, n_trials):
         pipe.cfg = set_nested_item(pipe.cfg, config_keys, threshold)
-        scores[threshold] = nlp.evaluate(dev_dataset)[scores_key]
+        scores[threshold] = nlp.evaluate(dev_dataset, scorer_cfg={"beta": beta})[
+            scores_key
+        ]
         if not (
             isinstance(scores[threshold], float) or isinstance(scores[threshold], int)
         ):
diff --git a/spacy/errors.py b/spacy/errors.py
index fd412a4da..18d3cd5f2 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -939,6 +939,7 @@ class Errors(metaclass=ErrorsWithCodes):
             "`{arg2}`={arg2_values} but these arguments are conflicting.")
     E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got "
              "{value}.")
+    E1044 = ("Only components of type `TrainablePipe` are supported by `find_threshold()`.")
 
 
 # Deprecated model shortcuts, only used in errors and warnings
diff --git a/spacy/scorer.py b/spacy/scorer.py
index 8cd755ac4..3bb3c5cab 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -22,15 +22,12 @@ class PRFScore:
     """A precision / recall / F score."""
 
     def __init__(
-        self,
-        *,
-        tp: int = 0,
-        fp: int = 0,
-        fn: int = 0,
+        self, *, tp: int = 0, fp: int = 0, fn: int = 0, beta: float = 1
     ) -> None:
         self.tp = tp
         self.fp = fp
         self.fn = fn
+        self.beta = beta
 
     def __len__(self) -> int:
         return self.tp + self.fp + self.fn
@@ -42,8 +39,12 @@ class PRFScore:
         return self
 
     def __add__(self, other):
+        assert self.beta == other.beta
         return PRFScore(
-            tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
+            tp=self.tp + other.tp,
+            fp=self.fp + other.fp,
+            fn=self.fn + other.fn,
+            beta=self.beta,
         )
 
     def score_set(self, cand: set, gold: set) -> None:
@@ -63,7 +64,7 @@ class PRFScore:
     def fscore(self) -> float:
         p = self.precision
         r = self.recall
-        return 2 * ((p * r) / (p + r + 1e-100))
+        return (1 + self.beta**2) * ((p * r) / ((self.beta**2 * p) + r + 1e-100))
 
     def to_dict(self) -> Dict[str, float]:
         return {"p": self.precision, "r": self.recall, "f": self.fscore}
@@ -101,6 +102,8 @@ class ROCAUCScore:
 class Scorer:
     """Compute evaluation scores."""
 
+    BETA = 1
+
     def __init__(
         self,
         nlp: Optional["Language"] = None,
@@ -149,8 +152,9 @@ class Scorer:
 
         DOCS: https://spacy.io/api/scorer#score_tokenization
         """
-        acc_score = PRFScore()
-        prf_score = PRFScore()
+        beta = cfg.get("beta", Scorer.BETA)
+        acc_score = PRFScore(beta=beta)
+        prf_score = PRFScore(beta=beta)
         for example in examples:
             gold_doc = example.reference
             pred_doc = example.predicted
@@ -210,7 +214,7 @@ class Scorer:
 
         DOCS: https://spacy.io/api/scorer#score_token_attr
         """
-        tag_score = PRFScore()
+        tag_score = PRFScore(beta=cfg.get("beta", Scorer.BETA))
         for example in examples:
             gold_doc = example.reference
             pred_doc = example.predicted
@@ -261,7 +265,8 @@ class Scorer:
             key attr_micro_p/r/f and the per-feat PRF scores under
             attr_per_feat.
         """
-        micro_score = PRFScore()
+        beta = cfg.get("beta", Scorer.BETA)
+        micro_score = PRFScore(beta=beta)
         per_feat = {}
         for example in examples:
             pred_doc = example.predicted
@@ -276,7 +281,7 @@ class Scorer:
                     for feat in morph.split(Morphology.FEATURE_SEP):
                         field, values = feat.split(Morphology.FIELD_SEP)
                         if field not in per_feat:
-                            per_feat[field] = PRFScore()
+                            per_feat[field] = PRFScore(beta=beta)
                         if field not in gold_per_feat:
                             gold_per_feat[field] = set()
                         gold_per_feat[field].add((gold_i, feat))
@@ -298,7 +303,7 @@ class Scorer:
                     for feat in morph.split(Morphology.FEATURE_SEP):
                         field, values = feat.split(Morphology.FIELD_SEP)
                         if field not in per_feat:
-                            per_feat[field] = PRFScore()
+                            per_feat[field] = PRFScore(beta=beta)
                         if field not in pred_per_feat:
                             pred_per_feat[field] = set()
                         pred_per_feat[field].add((gold_i, feat))
@@ -353,7 +358,8 @@ class Scorer:
 
         DOCS: https://spacy.io/api/scorer#score_spans
         """
-        score = PRFScore()
+        beta = cfg.get("beta", Scorer.BETA)
+        score = PRFScore(beta=beta)
         score_per_type = dict()
         for example in examples:
             pred_doc = example.predicted
@@ -372,7 +378,7 @@ class Scorer:
             gold_per_type: Dict[str, Set] = {label: set() for label in labels}
             for label in labels:
                 if label not in score_per_type:
-                    score_per_type[label] = PRFScore()
+                    score_per_type[label] = PRFScore(beta=beta)
             # Find all predidate labels, for all and per type
             gold_spans = set()
             pred_spans = set()
@@ -469,9 +475,10 @@ class Scorer:
 
         DOCS: https://spacy.io/api/scorer#score_cats
         """
+        beta = cfg.get("beta", Scorer.BETA)
         if threshold is None:
             threshold = 0.5 if multi_label else 0.0
-        f_per_type = {label: PRFScore() for label in labels}
+        f_per_type = {label: PRFScore(beta=beta) for label in labels}
         auc_per_type = {label: ROCAUCScore() for label in labels}
         labels = set(labels)
         if labels:
@@ -519,7 +526,7 @@ class Scorer:
                 pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
                 if pred_score >= threshold:
                     f_per_type[pred_label].fp += 1
-        micro_prf = PRFScore()
+        micro_prf = PRFScore(beta=beta)
         for label_prf in f_per_type.values():
             micro_prf.tp += label_prf.tp
             micro_prf.fn += label_prf.fn
@@ -576,6 +583,7 @@ class Scorer:
 
         DOCS: https://spacy.io/api/scorer#score_links
         """
+        beta = cfg.get("beta", Scorer.BETA)
         f_per_type = {}
         for example in examples:
             gold_ent_by_offset = {}
@@ -589,7 +597,7 @@ class Scorer:
                 if gold_span is not None:
                     label = gold_span.label_
                     if label not in f_per_type:
-                        f_per_type[label] = PRFScore()
+                        f_per_type[label] = PRFScore(beta=beta)
                     gold = gold_span.kb_id_
                     # only evaluating entities that overlap between gold and pred,
                     # to disentangle the performance of the NEL from the NER
@@ -608,7 +616,7 @@ class Scorer:
                             # a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN
                             f_per_type[label].fp += 1
                             f_per_type[label].fn += 1
-        micro_prf = PRFScore()
+        micro_prf = PRFScore(beta=beta)
         for label_prf in f_per_type.values():
             micro_prf.tp += label_prf.tp
             micro_prf.fn += label_prf.fn
@@ -663,8 +671,9 @@ class Scorer:
 
         DOCS: https://spacy.io/api/scorer#score_deps
         """
-        unlabelled = PRFScore()
-        labelled = PRFScore()
+        beta = cfg.get("beta", Scorer.BETA)
+        unlabelled = PRFScore(beta=beta)
+        labelled = PRFScore(beta=beta)
         labelled_per_dep = dict()
         missing_indices = set()
         for example in examples:
@@ -680,7 +689,7 @@ class Scorer:
                 if dep not in ignore_labels:
                     gold_deps.add((gold_i, head.i, dep))
                     if dep not in labelled_per_dep:
-                        labelled_per_dep[dep] = PRFScore()
+                        labelled_per_dep[dep] = PRFScore(beta=beta)
                     if dep not in gold_deps_per_dep:
                         gold_deps_per_dep[dep] = set()
                     gold_deps_per_dep[dep].add((gold_i, head.i, dep))
@@ -711,7 +720,7 @@ class Scorer:
                     else:
                         pred_deps.add((gold_i, gold_head, dep))
                         if dep not in labelled_per_dep:
-                            labelled_per_dep[dep] = PRFScore()
+                            labelled_per_dep[dep] = PRFScore(beta=beta)
                         if dep not in pred_deps_per_dep:
                             pred_deps_per_dep[dep] = set()
                         pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
@@ -742,6 +751,7 @@ class Scorer:
 def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
     """Compute micro-PRF and per-entity PRF scores for a sequence of examples."""
-    score_per_type = defaultdict(PRFScore)
+    beta = kwargs.get("beta", Scorer.BETA)
+    score_per_type = defaultdict(lambda: PRFScore(beta=beta))
     for eg in examples:
         if not eg.y.has_annotation("ENT_IOB"):
             continue
@@ -749,7 +759,7 @@ def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
         align_x2y = eg.alignment.x2y
         for pred_ent in eg.x.ents:
             if pred_ent.label_ not in score_per_type:
-                score_per_type[pred_ent.label_] = PRFScore()
+                score_per_type[pred_ent.label_] = PRFScore(beta=beta)
             indices = align_x2y[pred_ent.start : pred_ent.end]
             if len(indices):
                 g_span = eg.y[indices[0] : indices[-1] + 1]
@@ -765,7 +775,7 @@ def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
                 score_per_type[pred_ent.label_].fp += 1
         for label, start, end in golds:
             score_per_type[label].fn += 1
-    totals = PRFScore()
+    totals = PRFScore(beta=beta)
     for prf in score_per_type.values():
         totals += prf
     if len(totals) > 0:
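Note on the new `fscore` property: with `beta == 1` the generalized formula reduces to
the previous hard-coded F1, so existing scores are unchanged by default. A minimal
standalone check of the formula as implemented above; the function name and the
precision/recall values are invented for illustration:

    def fbeta(p: float, r: float, beta: float = 1.0) -> float:
        # Same expression as PRFScore.fscore in this patch: beta > 1 weights
        # recall more heavily, beta < 1 weights precision more heavily.
        return (1 + beta**2) * ((p * r) / ((beta**2 * p) + r + 1e-100))

    p, r = 0.8, 0.4
    print(fbeta(p, r, beta=1.0))  # 0.533..., equals 2 * p * r / (p + r), the old F1
    print(fbeta(p, r, beta=2.0))  # 0.444..., pulled toward the lower recall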
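The `beta` value now has to be consistent across aggregated scores: each
`Scorer.score_*` method reads it via `cfg.get("beta", Scorer.BETA)`, and
`PRFScore.__add__` asserts that both operands share the same `beta` before
propagating it into the result. A usage sketch assuming this patch is applied;
the per-label entity counts are invented:

    from spacy.scorer import PRFScore

    person = PRFScore(tp=6, fp=2, fn=2, beta=2.0)
    org = PRFScore(tp=3, fp=1, fn=4, beta=2.0)
    totals = person + org  # betas match, so the result keeps beta=2.0
    print(totals.fscore)   # 0.625: micro F2 over both labels
    # person + PRFScore(tp=1, fp=1, fn=1)  # default beta=1, would fail the assert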
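At the CLI level, `beta` is forwarded to `nlp.evaluate()` through
`scorer_cfg={"beta": beta}`, so every trial threshold is scored with the same
F-beta metric. A hypothetical call assuming this patch is applied; the paths,
component name, and config keys below are invented, and the unpacking of the
`Tuple[float, float]` return value as (best threshold, best score) is an
assumption:

    from pathlib import Path
    from spacy.cli.find_threshold import find_threshold

    # Optimize a multilabel textcat cutoff for a recall-weighted F2 score.
    best_threshold, best_score = find_threshold(
        model="training/model-best",
        data_path=Path("corpus/dev.spacy"),
        pipe_name="textcat_multilabel",
        threshold_key="threshold",
        scores_key="cats_macro_f",
        beta=2.0,
    )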