From 3a0a3854f72dc726261658bc3701931f23bb9ef1 Mon Sep 17 00:00:00 2001
From: Raphael Mitsch
Date: Thu, 1 Sep 2022 14:00:05 +0200
Subject: [PATCH] Generalize component and threshold handling. Harmonize
 arguments with 'spacy evaluate' CLI.

---
 spacy/cli/find_threshold.py | 201 ++++++++++++++----------------------
 spacy/tests/test_cli.py     | 159 ++++++++++++++--------------
 2 files changed, 157 insertions(+), 203 deletions(-)

diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py
index d5cd9e3de..fe3bdedae 100644
--- a/spacy/cli/find_threshold.py
+++ b/spacy/cli/find_threshold.py
@@ -1,16 +1,23 @@
+import functools
+import operator
 from pathlib import Path
 import logging
-from typing import Optional, Tuple
+from typing import Optional, Tuple, Any, Dict, List
 import numpy
 import wasabi.tables

+from ..training import Corpus
 from ._util import app, Arg, Opt, import_code, setup_gpu
 from .. import util
-from ..pipeline import MultiLabel_TextCategorizer, Pipe
-from ..tokens import DocBin

-_DEFAULTS = {"average": "micro", "n_trials": 10, "beta": 1, "use_gpu": -1}
+_DEFAULTS = {
+    "average": "micro",
+    "n_trials": 10,
+    "beta": 1,
+    "use_gpu": -1,
+    "gold_preproc": False,
+}


 @app.command(
@@ -21,12 +28,14 @@ def find_threshold_cli(
     # fmt: off
     model: str = Arg(..., help="Model name or path"),
     data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
-    pipe_name: str = Opt(..., "--pipe_name", "-p", help="Name of pipe to examine thresholds for"),
-    average: str = Arg(_DEFAULTS["average"], help="How to aggregate F-scores over labels. One of ('micro', 'macro')", exists=True, allow_dash=True),
+    pipe_name: str = Arg(..., help="Name of pipe to examine thresholds for"),
+    threshold_key: str = Arg(..., help="Key of threshold attribute in component's configuration"),
+    scores_key: str = Arg(..., help="Name of score metric to optimize"),
     n_trials: int = Opt(_DEFAULTS["n_trials"], "--n_trials", "-n", help="Number of trials to determine optimal thresholds"),
     beta: float = Opt(_DEFAULTS["beta"], "--beta", help="Beta for F1 calculation. Ignored if different metric is used"),
     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+    gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"),
     verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"),
     # fmt: on
 ):
@@ -35,24 +44,30 @@ def find_threshold_cli(
     model (Path): Path to file with trained model.
     data_path (Path): Path to file with DocBin with docs to use for threshold search.
     pipe_name (str): Name of pipe to examine thresholds for.
-    average (str): How to average F-scores across labels. One of ('micro', 'macro').
+    threshold_key (str): Key of threshold attribute in component's configuration.
+    scores_key (str): Name of score metric to optimize.
     n_trials (int): Number of trials to determine optimal thresholds
     beta (float): Beta for F1 calculation.
     code_path (Optional[Path]): Path to Python file with additional code (registered functions) to be imported.
     use_gpu (int): GPU ID or -1 for CPU.
+    gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the
+        tokenization, and may result in sequences of more consistent length.
+        However, it may reduce runtime accuracy due to train/test skew.
     silent (bool): Display more information for debugging purposes
     """
     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
     import_code(code_path)
     find_threshold(
-        model,
-        data_path,
+        model=model,
+        data_path=data_path,
         pipe_name=pipe_name,
-        average=average,
+        threshold_key=threshold_key,
+        scores_key=scores_key,
         n_trials=n_trials,
         beta=beta,
         use_gpu=use_gpu,
+        gold_preproc=gold_preproc,
         silent=False,
     )

@@ -60,12 +75,14 @@ def find_threshold_cli(
 def find_threshold(
     model: str,
     data_path: Path,
+    pipe_name: str,
+    threshold_key: str,
+    scores_key: str,
     *,
-    pipe_name: str,  # type: ignore
-    average: str = _DEFAULTS["average"],  # type: ignore
-    n_trials: int = _DEFAULTS["n_trials"],  # type: ignore
-    beta: float = _DEFAULTS["beta"],  # type: ignore,
+    n_trials: int = _DEFAULTS["n_trials"],
+    beta: float = _DEFAULTS["beta"],
     use_gpu: int = _DEFAULTS["use_gpu"],
+    gold_preproc: bool = _DEFAULTS["gold_preproc"],
     silent: bool = True,
 ) -> Tuple[float, float]:
     """
@@ -73,10 +90,14 @@ def find_threshold(
     model (Union[str, Path]): Path to file with trained model.
     data_path (Union[str, Path]): Path to file with DocBin with docs to use for threshold search.
     pipe_name (str): Name of pipe to examine thresholds for.
-    average (str): How to average F-scores across labels. One of ('micro', 'macro').
+    threshold_key (str): Key of threshold attribute in component's configuration.
+    scores_key (str): Name of score metric to optimize.
     n_trials (int): Number of trials to determine optimal thresholds.
     beta (float): Beta for F1 calculation.
     use_gpu (int): GPU ID or -1 for CPU.
+    gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the
+        tokenization, and may result in sequences of more consistent length. However, it may reduce runtime accuracy due
+        to train/test skew.
     silent (bool): Whether to print non-error-related output to stdout.
     RETURNS (Tuple[float, float]): Best found threshold with corresponding F-score.
     """
@@ -86,127 +107,57 @@ def find_threshold(
     if not data_path.exists():
         wasabi.msg.fail("Evaluation data not found", data_path, exits=1)
     nlp = util.load_model(model)
-    pipe: Optional[Pipe] = None
-    selected_pipe_name: Optional[str] = pipe_name
-    if average not in ("micro", "macro"):
-        wasabi.msg.fail(
-            "Expected 'micro' or 'macro' for F-score averaging method, received '{avg_method}'.",
-            exits=1,
-        )
+    try:
+        pipe = nlp.get_pipe(pipe_name)
+    except KeyError as err:
+        wasabi.msg.fail(title=str(err), exits=1)

-    for _pipe_name, _pipe in nlp.pipeline:
-        # todo instead of instance check, assert _pipe has a .threshold arg
-        # won't work, actually. e.g. spancat doesn't .threshold.
-        if _pipe_name == pipe_name:
-            if not isinstance(_pipe, MultiLabel_TextCategorizer):
-                wasabi.msg.fail(
-                    "Specified component '{component}' is not of type `MultiLabel_TextCategorizer`.".format(
-                        component=pipe_name
-                    ),
-                    exits=1,
-                )
-            pipe = _pipe
-            break
-
-    if pipe is None:
-        wasabi.msg.fail(
-            f"No component with name {pipe_name} found in pipeline.", exits=1
-        )
-    # This is purely for MyPy. Type checking is done in loop above already.
-    assert isinstance(pipe, MultiLabel_TextCategorizer)
-
-    if silent:
-        print(
-            f"Searching threshold with the best {average} F-score for component '{selected_pipe_name}' with {n_trials} "
+    if not silent:
+        wasabi.msg.info(
+            title=f"Optimizing for {scores_key} for component '{pipe_name}' with {n_trials} "
             f"trials and beta = {beta}."
         )

-    thresholds = numpy.linspace(0, 1, n_trials)
-    # todo use Scorer.score_cats. possibly to be extended?
-    ref_pos_counts = {label: 0 for label in pipe.labels}
-    pred_pos_counts = {
-        t: {True: ref_pos_counts.copy(), False: ref_pos_counts.copy()}
-        for t in thresholds
-    }
-    f_scores_per_label = {t: {label: 0.0 for label in pipe.labels} for t in thresholds}
-    f_scores = {t: 0.0 for t in thresholds}
+    # Load evaluation corpus.
+    corpus = Corpus(data_path, gold_preproc=gold_preproc)
+    dev_dataset = list(corpus(nlp))
+    config_keys = threshold_key.split(".")

-    # Count true/false positives for provided docs.
-    doc_bin = DocBin()
-    doc_bin.from_disk(data_path)
-    for ref_doc in doc_bin.get_docs(nlp.vocab):
-        for label, score in ref_doc.cats.items():
-            if score not in (0, 1):
-                wasabi.msg.fail(
-                    f"Expected category scores in evaluation dataset to be 0 <= x <= 1, received {score}.",
-                    exits=1,
-                )
-            ref_pos_counts[label] += ref_doc.cats[label] == 1
+    def set_nested_item(
+        config: Dict[str, Any], keys: List[str], value: float
+    ) -> Dict[str, Any]:
+        """Set item in nested dictionary. Adapted from https://stackoverflow.com/a/54138200.
+        config (Dict[str, Any]): Configuration dictionary.
+        keys (List[str]): List of keys leading to the value to set.
+        value (float): Value to set.
+        RETURNS (Dict[str, Any]): Updated dictionary.
+        """
+        functools.reduce(operator.getitem, keys[:-1], config)[keys[-1]] = value
+        return config

-        pred_doc = nlp(ref_doc.text)
-        # Collect count stats per threshold value and label.
-        for threshold in thresholds:
-            for label, score in pred_doc.cats.items():
-                if label not in pipe.labels:
-                    continue
-                label_value = int(score >= threshold)
-                if label_value == ref_doc.cats[label] == 1:
-                    pred_pos_counts[threshold][True][label] += 1
-                elif label_value == 1 and ref_doc.cats[label] == 0:
-                    pred_pos_counts[threshold][False][label] += 1
-
-    # Compute F-scores.
-    for threshold in thresholds:
-        for label in ref_pos_counts:
-            n_pos_preds = (
-                pred_pos_counts[threshold][True][label]
-                + pred_pos_counts[threshold][False][label]
-            )
-            precision = (
-                (pred_pos_counts[threshold][True][label] / n_pos_preds)
-                if n_pos_preds > 0
-                else 0
-            )
-            recall = pred_pos_counts[threshold][True][label] / ref_pos_counts[label]
-            f_scores_per_label[threshold][label] = (
-                (
-                    (1 + beta**2)
-                    * (precision * recall / (precision * beta**2 + recall))
-                )
-                if precision
-                else 0
+    # Evaluate with varying threshold values.
+    scores: Dict[float, float] = {}
+    for threshold in numpy.linspace(0, 1, n_trials):
+        pipe.cfg = set_nested_item(pipe.cfg, config_keys, threshold)
+        scores[threshold] = nlp.evaluate(dev_dataset)[scores_key]
+        if not (
+            isinstance(scores[threshold], float) or isinstance(scores[threshold], int)
+        ):
+            wasabi.msg.fail(
+                f"Returned score for key '{scores_key}' is not numeric. Threshold optimization only works for numeric "
+                f"scores.",
+                exits=1,
             )
-        # Aggregate F-scores.
-        if average == "micro":
-            f_scores[threshold] = sum(
-                [
-                    f_scores_per_label[threshold][label] * ref_pos_counts[label]
-                    for label in ref_pos_counts
-                ]
-            ) / sum(ref_pos_counts.values())
-        else:
-            f_scores[threshold] = sum(
-                [f_scores_per_label[threshold][label] for label in ref_pos_counts]
-            ) / len(ref_pos_counts)
-
-    best_threshold = max(f_scores.keys(), key=(lambda key: f_scores[key]))
-    if silent:
+    best_threshold = max(scores.keys(), key=(lambda key: scores[key]))
+    if not silent:
         print(
-            f"Best threshold: {round(best_threshold, ndigits=4)} with F-score of {f_scores[best_threshold]}.",
+            f"Best threshold: {round(best_threshold, ndigits=4)} with value of {scores[best_threshold]}.",
             wasabi.tables.table(
-                data=[
-                    (threshold, label, f_score)
-                    for threshold, label_f_scores in f_scores_per_label.items()
-                    for label, f_score in label_f_scores.items()
-                ],
-                header=["Threshold", "Label", "F-Score"],
-            ),
-            wasabi.tables.table(
-                data=[(threshold, f_score) for threshold, f_score in f_scores.items()],
-                header=["Threshold", f"F-Score ({average})"],
+                data=[(threshold, score) for threshold, score in scores.items()],
+                header=["Threshold", f"{scores_key}"],
             ),
         )

-    return best_threshold, f_scores[best_threshold]
+    return best_threshold, scores[best_threshold]
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 48cc364f0..b0d173fdf 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -1,6 +1,6 @@
 import os
 import math
-from typing import Counter, Iterable, Tuple, List
+from typing import Counter, Tuple, List, Dict, Any

 import numpy
 import pytest
@@ -36,7 +36,7 @@ from spacy.tokens.span import Span
 from spacy.training import Example, docs_to_json, offsets_to_biluo_tags
 from spacy.training.converters import conll_ner_to_docs, conllu_to_docs
 from spacy.training.converters import iob_to_docs
-from spacy.pipeline import TextCategorizer, Pipe
+from spacy.pipeline import TextCategorizer, Pipe, SpanCategorizer
 from spacy.util import ENV_VARS, get_minor_version, load_model_from_config, load_config
 from ..cli.init_pipeline import _init_labels
@@ -860,38 +860,55 @@ def test_span_length_freq_dist_output_must_be_correct():


 def test_cli_find_threshold(capsys):
-    def make_get_examples_multi_label(_nlp: Language) -> List[Example]:
-        return [
-            Example.from_dict(_nlp.make_doc(t[0]), t[1])
-            for t in [
-                (
-                    "I'm angry and confused",
-                    {"cats": {"ANGRY": 1.0, "CONFUSED": 1.0, "HAPPY": 0.0}},
-                ),
-                (
-                    "I'm confused but happy",
-                    {"cats": {"ANGRY": 0.0, "CONFUSED": 1.0, "HAPPY": 1.0}},
-                ),
-            ]
-        ]
+    def make_examples(_nlp: Language) -> List[Example]:
+        docs: List[Example] = []
+
+        for t in [
+            (
+                "I'm angry and confused in the Bank of America.",
+                {
+                    "cats": {"ANGRY": 1.0, "CONFUSED": 1.0, "HAPPY": 0.0},
+                    "spans": {"sc": [(7, 10, "ORG")]},
+                },
+            ),
+            (
+                "I'm confused but happy in New York.",
+                {
+                    "cats": {"ANGRY": 0.0, "CONFUSED": 1.0, "HAPPY": 1.0},
+                    "spans": {"sc": [(6, 7, "GPE")]},
+                },
+            ),
+        ]:
+            doc = _nlp.make_doc(t[0])
+            docs.append(Example.from_dict(doc, t[1]))
+
+        return docs

     def init_nlp(
-        component_factory_names: Tuple[str, ...] = (),
+        components: Tuple[Tuple[str, Dict[str, Any]], ...] = ()
     ) -> Tuple[Language, List[Example]]:
         _nlp = English()
+        textcat: TextCategorizer = _nlp.add_pipe(  # type: ignore
+            factory_name="textcat_multilabel",
+            name="tc_multi",
+            config={"threshold": 0.9},
+        )
+        textcat_labels = ("ANGRY", "CONFUSED", "HAPPY")
+        for label in textcat_labels:
+            textcat.add_label(label)

-        textcat: TextCategorizer = _nlp.add_pipe(factory_name="textcat_multilabel", name="tc_multi")  # type: ignore
-        textcat.add_label("ANGRY")
-        textcat.add_label("CONFUSED")
-        textcat.add_label("HAPPY")
-        for cfn in component_factory_names:
-            comp = _nlp.add_pipe(cfn)
+        # Append additional components to pipeline.
+        for cfn, comp_config in components:
+            comp = _nlp.add_pipe(cfn, config=comp_config)
             if isinstance(comp, TextCategorizer):
-                comp.add_label("dummy")
+                for label in textcat_labels:
+                    comp.add_label(label)
+            if isinstance(comp, SpanCategorizer):
+                comp.add_label("GPE")
+                comp.add_label("ORG")
         _nlp.initialize()
-
-        _examples = make_get_examples_multi_label(_nlp)
+        _examples = make_examples(_nlp)
         for i in range(5):
             _nlp.update(_examples)

@@ -903,77 +920,63 @@ def test_cli_find_threshold(capsys):
         # mostly as a smoke test.
         nlp, examples = init_nlp()
         DocBin(docs=[example.reference for example in examples]).to_disk(
-            docs_dir / "docs"
+            docs_dir / "docs.spacy"
         )
         with make_tempdir() as nlp_dir:
             nlp.to_disk(nlp_dir)
             assert (
-                find_threshold(nlp_dir, docs_dir / "docs", verbose=False)[0]
+                find_threshold(
+                    model=nlp_dir,
+                    data_path=docs_dir / "docs.spacy",
+                    pipe_name="tc_multi",
+                    threshold_key="threshold",
+                    scores_key="cats_macro_f",
+                    silent=True,
+                )[0]
                 == numpy.linspace(0, 1, 10)[1]
             )

+        # todo fix spancat test
         # Specifying name of non-MultiLabel_TextCategorizer component should fail.
-        nlp, _ = init_nlp(("sentencizer",))
-        with make_tempdir() as nlp_dir:
-            nlp.to_disk(nlp_dir)
-            with pytest.raises(SystemExit) as error:
-                find_threshold(nlp_dir, docs_dir / "docs", pipe_name="sentencizer")
-        assert error.value.code == 1
-
-        # Having multiple textcat_multilabel components without specifying the name should fail.
-        nlp, _ = init_nlp(("textcat_multilabel",))
-        with make_tempdir() as nlp_dir:
-            nlp.to_disk(nlp_dir)
-            with pytest.raises(SystemExit) as error:
-                find_threshold(nlp_dir, docs_dir / "docs")
-        assert error.value.code == 1
-
-        # Having multiple textcat_multilabel components should work when specifying the name.
-        nlp, _ = init_nlp(("textcat_multilabel",))
+        nlp, _ = init_nlp((("spancat", {"spans_key": "sc", "threshold": 0.5}),))
         with make_tempdir() as nlp_dir:
             nlp.to_disk(nlp_dir)
             assert (
                 find_threshold(
-                    nlp_dir, docs_dir / "docs", pipe_name="tc_multi", verbose=False
+                    model=nlp_dir,
+                    data_path=docs_dir / "docs.spacy",
+                    pipe_name="spancat",
+                    threshold_key="threshold",
+                    scores_key="spans_sc_f",
+                    silent=True,
                 )[0]
                 == numpy.linspace(0, 1, 10)[1]
             )

+        # Having multiple textcat_multilabel components should work, since the name has to be specified.
+        nlp, _ = init_nlp((("textcat_multilabel", {}),))
+        with make_tempdir() as nlp_dir:
+            nlp.to_disk(nlp_dir)
+            assert find_threshold(
+                model=nlp_dir,
+                data_path=docs_dir / "docs.spacy",
+                pipe_name="tc_multi",
+                threshold_key="threshold",
+                scores_key="cats_macro_f",
+                silent=True,
+            )
+
         # Specifying the name of an non-existing pipe should fail.
         nlp, _ = init_nlp()
         with make_tempdir() as nlp_dir:
             nlp.to_disk(nlp_dir)
             with pytest.raises(SystemExit) as error:
-                find_threshold(nlp_dir, docs_dir / "docs", pipe_name="_")
-        assert error.value.code == 1
-
-        # Using a pipe with no textcat components should fail.
-        nlp = English()
-        with make_tempdir() as nlp_dir:
-            nlp.to_disk(nlp_dir)
-            with pytest.raises(SystemExit) as error:
-                find_threshold(nlp_dir, docs_dir / "docs")
-        assert error.value.code == 1
-
-        # Specifying scores not in range 0 <= x <= 1 should fail.
-        nlp, _ = init_nlp()
-        DocBin(
-            docs=[
-                Example.from_dict(nlp.make_doc(t[0]), t[1]).reference
-                for t in [
-                    (
-                        "I'm angry and confused",
-                        {"cats": {"ANGRY": 1.0, "CONFUSED": 2.0, "HAPPY": 0.0}},
-                    ),
-                    (
-                        "I'm confused but happy",
-                        {"cats": {"ANGRY": 0.0, "CONFUSED": 1.0, "HAPPY": 1.0}},
-                    ),
-                ]
-            ]
-        ).to_disk(docs_dir / "docs")
-        with make_tempdir() as nlp_dir:
-            nlp.to_disk(nlp_dir)
-            with pytest.raises(SystemExit) as error:
-                find_threshold(nlp_dir, docs_dir / "docs")
+                find_threshold(
+                    model=nlp_dir,
+                    data_path=docs_dir / "docs.spacy",
+                    pipe_name="_",
+                    threshold_key="threshold",
+                    scores_key="cats_macro_f",
+                    silent=True,
+                )
         assert error.value.code == 1
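
Usage sketch (not part of the diff above): a minimal example of how the generalized threshold search could be driven once this patch is applied. The CLI spelling "find-threshold" is an assumption, since the @app.command(...) registration is cut off in this hunk; the positional order (model, data path, pipe name, threshold key, scores key) and the "--n_trials" option are taken from find_threshold_cli, and the "threshold" / "cats_macro_f" keys mirror the updated tests. Model and data paths are hypothetical placeholders.

    # Assumed CLI command name; positional arguments as declared in find_threshold_cli:
    #   python -m spacy find-threshold ./my_textcat_model ./dev.spacy textcat_multilabel threshold cats_macro_f --n_trials 20

    # Calling the Python API from spacy/cli/find_threshold.py directly:
    from pathlib import Path

    from spacy.cli.find_threshold import find_threshold

    best_threshold, best_score = find_threshold(
        model="./my_textcat_model",      # trained pipeline name or path (placeholder)
        data_path=Path("./dev.spacy"),   # DocBin with reference annotations (placeholder)
        pipe_name="textcat_multilabel",  # component whose config is patched per trial
        threshold_key="threshold",       # (possibly nested) config key, split on "."
        scores_key="cats_macro_f",       # numeric key returned by nlp.evaluate()
        n_trials=20,
        silent=False,
    )
    print(f"Best threshold: {best_threshold} (score: {best_score})")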