From ea9737a664bfeda6003bd904e914afc26234ecba Mon Sep 17 00:00:00 2001
From: Raphael Mitsch
Date: Thu, 1 Sep 2022 16:52:50 +0200
Subject: [PATCH] Add beta parameter to Scorer and PRFScore.

---
 spacy/cli/find_threshold.py | 21 ++++++++-----
 spacy/errors.py             |  1 +
 spacy/scorer.py             | 60 +++++++++++++++++++++----------
 3 files changed, 50 insertions(+), 32 deletions(-)

diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py
index fe3bdedae..1641c2d04 100644
--- a/spacy/cli/find_threshold.py
+++ b/spacy/cli/find_threshold.py
@@ -7,6 +7,8 @@ from typing import Optional, Tuple, Any, Dict, List
 import numpy
 import wasabi.tables
 
+from ..pipeline import TrainablePipe, Pipe
+from ..errors import Errors
 from ..training import Corpus
 from ._util import app, Arg, Opt, import_code, setup_gpu
 from .. import util
@@ -47,7 +49,7 @@ def find_threshold_cli(
     threshold_key (str): Key of threshold attribute in component's configuration.
     scores_key (str): Name of score to metric to optimize.
     n_trials (int): Number of trials to determine optimal thresholds
-    beta (float): Beta for F1 calculation.
+    beta (float): Beta for F-score calculation.
     code_path (Optional[Path]): Path to Python file with additional code (registered functions) to be imported.
     use_gpu (int): GPU ID or -1 for CPU.
     gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the
@@ -79,10 +81,10 @@ def find_threshold(
     threshold_key: str,
     scores_key: str,
     *,
-    n_trials: int = _DEFAULTS["n_trials"],
-    beta: float = _DEFAULTS["beta"],
-    use_gpu: int = _DEFAULTS["use_gpu"],
-    gold_preproc: bool = _DEFAULTS["gold_preproc"],
+    n_trials: int = _DEFAULTS["n_trials"],  # type: ignore
+    beta: float = _DEFAULTS["beta"],  # type: ignore
+    use_gpu: int = _DEFAULTS["use_gpu"],  # type: ignore
+    gold_preproc: bool = _DEFAULTS["gold_preproc"],  # type: ignore
     silent: bool = True,
 ) -> Tuple[float, float]:
     """
@@ -93,7 +95,7 @@ def find_threshold(
     threshold_key (str): Key of threshold attribute in component's configuration.
     scores_key (str): Name of score to metric to optimize.
     n_trials (int): Number of trials to determine optimal thresholds.
-    beta (float): Beta for F1 calculation.
+    beta (float): Beta for F-score calculation.
     use_gpu (int): GPU ID or -1 for CPU.
     gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the
         tokenization, and may result in sequences of more consistent length. However, it may reduce runtime accuracy due
@@ -108,10 +110,13 @@ def find_threshold(
         wasabi.msg.fail("Evaluation data not found", data_path, exits=1)
     nlp = util.load_model(model)
 
+    pipe: Optional[Pipe] = None
     try:
         pipe = nlp.get_pipe(pipe_name)
     except KeyError as err:
         wasabi.msg.fail(title=str(err), exits=1)
+    if not isinstance(pipe, TrainablePipe):
+        raise TypeError(Errors.E1044)
 
     if not silent:
         wasabi.msg.info(
@@ -140,7 +145,9 @@ def find_threshold(
     scores: Dict[float, float] = {}
     for threshold in numpy.linspace(0, 1, n_trials):
         pipe.cfg = set_nested_item(pipe.cfg, config_keys, threshold)
-        scores[threshold] = nlp.evaluate(dev_dataset)[scores_key]
+        scores[threshold] = nlp.evaluate(dev_dataset, scorer_cfg={"beta": beta})[
+            scores_key
+        ]
         if not (
             isinstance(scores[threshold], float) or isinstance(scores[threshold], int)
         ):
diff --git a/spacy/errors.py b/spacy/errors.py
index fd412a4da..18d3cd5f2 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -939,6 +939,7 @@ class Errors(metaclass=ErrorsWithCodes):
             "`{arg2}`={arg2_values} but these arguments are conflicting.")
     E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got "
              "{value}.")
+    E1044 = ("Only components of type `TrainablePipe` are supported by `find_threshold()`.")
 
 
 # Deprecated model shortcuts, only used in errors and warnings
diff --git a/spacy/scorer.py b/spacy/scorer.py
index 8cd755ac4..3bb3c5cab 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -22,15 +22,12 @@ class PRFScore:
     """A precision / recall / F score."""
 
     def __init__(
-        self,
-        *,
-        tp: int = 0,
-        fp: int = 0,
-        fn: int = 0,
+        self, *, tp: int = 0, fp: int = 0, fn: int = 0, beta: float = 1
     ) -> None:
         self.tp = tp
         self.fp = fp
         self.fn = fn
+        self.beta = beta
 
     def __len__(self) -> int:
         return self.tp + self.fp + self.fn
@@ -42,8 +39,12 @@ class PRFScore:
         return self
 
     def __add__(self, other):
+        assert self.beta == other.beta
         return PRFScore(
-            tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
+            tp=self.tp + other.tp,
+            fp=self.fp + other.fp,
+            fn=self.fn + other.fn,
+            beta=self.beta,
         )
 
     def score_set(self, cand: set, gold: set) -> None:
@@ -63,7 +64,7 @@ class PRFScore:
     def fscore(self) -> float:
         p = self.precision
         r = self.recall
-        return 2 * ((p * r) / (p + r + 1e-100))
+        return (1 + self.beta**2) * ((p * r) / ((self.beta**2 * p) + r + 1e-100))
 
     def to_dict(self) -> Dict[str, float]:
         return {"p": self.precision, "r": self.recall, "f": self.fscore}
@@ -101,6 +102,8 @@ class ROCAUCScore:
 class Scorer:
     """Compute evaluation scores."""
 
+    BETA = 1
+
     def __init__(
         self,
         nlp: Optional["Language"] = None,
@@ -149,8 +152,9 @@ class Scorer:
 
         DOCS: https://spacy.io/api/scorer#score_tokenization
         """
-        acc_score = PRFScore()
-        prf_score = PRFScore()
+        beta = cfg.get("beta", Scorer.BETA)
+        acc_score = PRFScore(beta=beta)
+        prf_score = PRFScore(beta=beta)
         for example in examples:
             gold_doc = example.reference
             pred_doc = example.predicted
@@ -210,7 +214,7 @@ class Scorer:
 
         DOCS: https://spacy.io/api/scorer#score_token_attr
         """
-        tag_score = PRFScore()
+        tag_score = PRFScore(beta=cfg.get("beta", Scorer.BETA))
         for example in examples:
             gold_doc = example.reference
             pred_doc = example.predicted
@@ -261,7 +265,8 @@ class Scorer:
             key attr_micro_p/r/f and the per-feat PRF scores under
             attr_per_feat.
         """
-        micro_score = PRFScore()
+        beta = cfg.get("beta", Scorer.BETA)
+        micro_score = PRFScore(beta=beta)
         per_feat = {}
         for example in examples:
             pred_doc = example.predicted
@@ -276,7 +281,7 @@ class Scorer:
                     for feat in morph.split(Morphology.FEATURE_SEP):
                         field, values = feat.split(Morphology.FIELD_SEP)
                         if field not in per_feat:
-                            per_feat[field] = PRFScore()
+                            per_feat[field] = PRFScore(beta=beta)
                         if field not in gold_per_feat:
                             gold_per_feat[field] = set()
                         gold_per_feat[field].add((gold_i, feat))
@@ -298,7 +303,7 @@ class Scorer:
                     for feat in morph.split(Morphology.FEATURE_SEP):
                         field, values = feat.split(Morphology.FIELD_SEP)
                         if field not in per_feat:
-                            per_feat[field] = PRFScore()
+                            per_feat[field] = PRFScore(beta=beta)
                         if field not in pred_per_feat:
                             pred_per_feat[field] = set()
                         pred_per_feat[field].add((gold_i, feat))
@@ -353,7 +358,8 @@ class Scorer:
 
         DOCS: https://spacy.io/api/scorer#score_spans
         """
-        score = PRFScore()
+        beta = cfg.get("beta", Scorer.BETA)
+        score = PRFScore(beta=beta)
         score_per_type = dict()
         for example in examples:
             pred_doc = example.predicted
@@ -372,7 +378,7 @@ class Scorer:
             gold_per_type: Dict[str, Set] = {label: set() for label in labels}
             for label in labels:
                 if label not in score_per_type:
-                    score_per_type[label] = PRFScore()
+                    score_per_type[label] = PRFScore(beta=beta)
             # Find all predidate labels, for all and per type
             gold_spans = set()
             pred_spans = set()
@@ -469,9 +475,10 @@ class Scorer:
 
         DOCS: https://spacy.io/api/scorer#score_cats
         """
+        beta = cfg.get("beta", Scorer.BETA)
         if threshold is None:
             threshold = 0.5 if multi_label else 0.0
-        f_per_type = {label: PRFScore() for label in labels}
+        f_per_type = {label: PRFScore(beta=beta) for label in labels}
         auc_per_type = {label: ROCAUCScore() for label in labels}
         labels = set(labels)
         if labels:
@@ -519,7 +526,7 @@ class Scorer:
                 pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
                 if pred_score >= threshold:
                     f_per_type[pred_label].fp += 1
-        micro_prf = PRFScore()
+        micro_prf = PRFScore(beta=beta)
         for label_prf in f_per_type.values():
             micro_prf.tp += label_prf.tp
             micro_prf.fn += label_prf.fn
@@ -576,6 +583,7 @@ class Scorer:
 
         DOCS: https://spacy.io/api/scorer#score_links
         """
+        beta = cfg.get("beta", Scorer.BETA)
         f_per_type = {}
         for example in examples:
             gold_ent_by_offset = {}
@@ -589,7 +597,7 @@ class Scorer:
                 if gold_span is not None:
                     label = gold_span.label_
                     if label not in f_per_type:
-                        f_per_type[label] = PRFScore()
+                        f_per_type[label] = PRFScore(beta=beta)
                     gold = gold_span.kb_id_
                     # only evaluating entities that overlap between gold and pred,
                     # to disentangle the performance of the NEL from the NER
@@ -608,7 +616,7 @@ class Scorer:
                             # a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN
                             f_per_type[label].fp += 1
                             f_per_type[label].fn += 1
-        micro_prf = PRFScore()
+        micro_prf = PRFScore(beta=beta)
         for label_prf in f_per_type.values():
             micro_prf.tp += label_prf.tp
             micro_prf.fn += label_prf.fn
@@ -663,8 +671,9 @@ class Scorer:
 
         DOCS: https://spacy.io/api/scorer#score_deps
         """
-        unlabelled = PRFScore()
-        labelled = PRFScore()
+        beta = cfg.get("beta", Scorer.BETA)
+        unlabelled = PRFScore(beta=beta)
+        labelled = PRFScore(beta=beta)
         labelled_per_dep = dict()
         missing_indices = set()
         for example in examples:
@@ -680,7 +689,7 @@ class Scorer:
                 if dep not in ignore_labels:
                     gold_deps.add((gold_i, head.i, dep))
                     if dep not in labelled_per_dep:
-                        labelled_per_dep[dep] = PRFScore()
+                        labelled_per_dep[dep] = PRFScore(beta=beta)
                     if dep not in gold_deps_per_dep:
                         gold_deps_per_dep[dep] = set()
                     gold_deps_per_dep[dep].add((gold_i, head.i, dep))
@@ -711,7 +720,7 @@ class Scorer:
                     else:
                         pred_deps.add((gold_i, gold_head, dep))
                         if dep not in labelled_per_dep:
-                            labelled_per_dep[dep] = PRFScore()
+                            labelled_per_dep[dep] = PRFScore(beta=beta)
                         if dep not in pred_deps_per_dep:
                             pred_deps_per_dep[dep] = set()
                         pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
@@ -742,6 +751,7 @@ class Scorer:
 def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
     """Compute micro-PRF and per-entity PRF scores for a sequence of examples."""
-    score_per_type = defaultdict(PRFScore)
+    beta = kwargs.get("beta", Scorer.BETA)
+    score_per_type = defaultdict(lambda: PRFScore(beta=beta))
     for eg in examples:
         if not eg.y.has_annotation("ENT_IOB"):
             continue
@@ -749,7 +759,7 @@ def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
         align_x2y = eg.alignment.x2y
         for pred_ent in eg.x.ents:
             if pred_ent.label_ not in score_per_type:
-                score_per_type[pred_ent.label_] = PRFScore()
+                score_per_type[pred_ent.label_] = PRFScore(beta=beta)
             indices = align_x2y[pred_ent.start : pred_ent.end]
             if len(indices):
                 g_span = eg.y[indices[0] : indices[-1] + 1]
@@ -765,7 +775,7 @@ def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
                 score_per_type[pred_ent.label_].fp += 1
         for label, start, end in golds:
             score_per_type[label].fn += 1
-    totals = PRFScore()
+    totals = PRFScore(beta=beta)
     for prf in score_per_type.values():
         totals += prf
     if len(totals) > 0:
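Note on the new `fscore` property: with `beta == 1` the generalized formula reduces to
the previous hard-coded F1, so existing scores are unchanged by default. A minimal
standalone check of the formula as implemented above; the function name and the
precision/recall values are invented for illustration:

    def fbeta(p: float, r: float, beta: float = 1.0) -> float:
        # Same expression as PRFScore.fscore in this patch: beta > 1 weights
        # recall more heavily, beta < 1 weights precision more heavily.
        return (1 + beta**2) * ((p * r) / ((beta**2 * p) + r + 1e-100))

    p, r = 0.8, 0.4
    print(fbeta(p, r, beta=1.0))  # 0.533..., equals 2 * p * r / (p + r), the old F1
    print(fbeta(p, r, beta=2.0))  # 0.444..., pulled toward the lower recall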
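The `beta` value now has to be consistent across aggregated scores: each
`Scorer.score_*` method reads it via `cfg.get("beta", Scorer.BETA)`, and
`PRFScore.__add__` asserts that both operands share the same `beta` before
propagating it into the result. A usage sketch assuming this patch is applied;
the per-label entity counts are invented:

    from spacy.scorer import PRFScore

    person = PRFScore(tp=6, fp=2, fn=2, beta=2.0)
    org = PRFScore(tp=3, fp=1, fn=4, beta=2.0)
    totals = person + org  # betas match, so the result keeps beta=2.0
    print(totals.fscore)   # 0.625: micro F2 over both labels
    # person + PRFScore(tp=1, fp=1, fn=1)  # default beta=1, would fail the assert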
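At the CLI level, `beta` is forwarded to `nlp.evaluate()` through
`scorer_cfg={"beta": beta}`, so every trial threshold is scored with the same
F-beta metric. A hypothetical call assuming this patch is applied; the paths,
component name, and config keys below are invented, and the unpacking of the
`Tuple[float, float]` return value as (best threshold, best score) is an
assumption:

    from pathlib import Path
    from spacy.cli.find_threshold import find_threshold

    # Optimize a multilabel textcat cutoff for a recall-weighted F2 score.
    best_threshold, best_score = find_threshold(
        model="training/model-best",
        data_path=Path("corpus/dev.spacy"),
        pipe_name="textcat_multilabel",
        threshold_key="threshold",
        scores_key="cats_macro_f",
        beta=2.0,
    )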