From 3a0a3854f72dc726261658bc3701931f23bb9ef1 Mon Sep 17 00:00:00 2001
From: Raphael Mitsch
Date: Thu, 1 Sep 2022 14:00:05 +0200
Subject: [PATCH] Generalize component and threshold handling. Harmonize
 arguments with 'spacy evaluate' CLI.

---
 spacy/cli/find_threshold.py | 201 ++++++++++++++----------------------
 spacy/tests/test_cli.py     | 159 ++++++++++++++--------------
 2 files changed, 157 insertions(+), 203 deletions(-)

diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py
index d5cd9e3de..fe3bdedae 100644
--- a/spacy/cli/find_threshold.py
+++ b/spacy/cli/find_threshold.py
@@ -1,16 +1,23 @@
+import functools
+import operator
 from pathlib import Path
 import logging
-from typing import Optional, Tuple
+from typing import Optional, Tuple, Any, Dict, List
 import numpy
 import wasabi.tables

+from ..training import Corpus
 from ._util import app, Arg, Opt, import_code, setup_gpu
 from .. import util
-from ..pipeline import MultiLabel_TextCategorizer, Pipe
-from ..tokens import DocBin

-_DEFAULTS = {"average": "micro", "n_trials": 10, "beta": 1, "use_gpu": -1}
+_DEFAULTS = {
+    "average": "micro",
+    "n_trials": 10,
+    "beta": 1,
+    "use_gpu": -1,
+    "gold_preproc": False,
+}


 @app.command(
@@ -21,12 +28,14 @@ def find_threshold_cli(
     # fmt: off
     model: str = Arg(..., help="Model name or path"),
     data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
-    pipe_name: str = Opt(..., "--pipe_name", "-p", help="Name of pipe to examine thresholds for"),
-    average: str = Arg(_DEFAULTS["average"], help="How to aggregate F-scores over labels. One of ('micro', 'macro')", exists=True, allow_dash=True),
+    pipe_name: str = Arg(..., help="Name of pipe to examine thresholds for"),
+    threshold_key: str = Arg(..., help="Key of threshold attribute in component's configuration"),
+    scores_key: str = Arg(..., help="Name of score metric to optimize"),
     n_trials: int = Opt(_DEFAULTS["n_trials"], "--n_trials", "-n", help="Number of trials to determine optimal thresholds"),
     beta: float = Opt(_DEFAULTS["beta"], "--beta", help="Beta for F1 calculation. Ignored if different metric is used"),
     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+    gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"),
     verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"),
     # fmt: on
 ):
@@ -35,24 +44,30 @@ def find_threshold_cli(
     model (Path): Path to file with trained model.
     data_path (Path): Path to file with DocBin with docs to use for threshold search.
     pipe_name (str): Name of pipe to examine thresholds for.
-    average (str): How to average F-scores across labels. One of ('micro', 'macro').
+    threshold_key (str): Key of threshold attribute in component's configuration.
+    scores_key (str): Name of score metric to optimize.
     n_trials (int): Number of trials to determine optimal thresholds
     beta (float): Beta for F1 calculation.
     code_path (Optional[Path]): Path to Python file with additional code (registered functions) to be imported.
     use_gpu (int): GPU ID or -1 for CPU.
+    gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the
+        tokenization, and may result in sequences of more consistent length.
+        However, it may reduce runtime accuracy due to train/test skew.
     silent (bool): Display more information for debugging purposes
     """
     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
     import_code(code_path)
     find_threshold(
-        model,
-        data_path,
+        model=model,
+        data_path=data_path,
         pipe_name=pipe_name,
-        average=average,
+        threshold_key=threshold_key,
+        scores_key=scores_key,
         n_trials=n_trials,
         beta=beta,
         use_gpu=use_gpu,
+        gold_preproc=gold_preproc,
         silent=False,
     )

@@ -60,12 +75,14 @@ def find_threshold_cli(
 def find_threshold(
     model: str,
     data_path: Path,
+    pipe_name: str,
+    threshold_key: str,
+    scores_key: str,
     *,
-    pipe_name: str,  # type: ignore
-    average: str = _DEFAULTS["average"],  # type: ignore
-    n_trials: int = _DEFAULTS["n_trials"],  # type: ignore
-    beta: float = _DEFAULTS["beta"],  # type: ignore,
+    n_trials: int = _DEFAULTS["n_trials"],
+    beta: float = _DEFAULTS["beta"],
     use_gpu: int = _DEFAULTS["use_gpu"],
+    gold_preproc: bool = _DEFAULTS["gold_preproc"],
     silent: bool = True,
 ) -> Tuple[float, float]:
     """
@@ -73,10 +90,14 @@ def find_threshold(
     model (Union[str, Path]): Path to file with trained model.
     data_path (Union[str, Path]): Path to file with DocBin with docs to use for threshold search.
     pipe_name (str): Name of pipe to examine thresholds for.
-    average (str): How to average F-scores across labels. One of ('micro', 'macro').
+    threshold_key (str): Key of threshold attribute in component's configuration.
+    scores_key (str): Name of score metric to optimize.
     n_trials (int): Number of trials to determine optimal thresholds.
     beta (float): Beta for F1 calculation.
     use_gpu (int): GPU ID or -1 for CPU.
+    gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the
+        tokenization, and may result in sequences of more consistent length. However, it may reduce runtime accuracy due
+        to train/test skew.
     silent (bool): Whether to print non-error-related output to stdout.
     RETURNS (Tuple[float, float]): Best found threshold with corresponding F-score.
     """
@@ -86,127 +107,57 @@ def find_threshold(
     if not data_path.exists():
         wasabi.msg.fail("Evaluation data not found", data_path, exits=1)
     nlp = util.load_model(model)
-    pipe: Optional[Pipe] = None
-    selected_pipe_name: Optional[str] = pipe_name
-    if average not in ("micro", "macro"):
-        wasabi.msg.fail(
-            "Expected 'micro' or 'macro' for F-score averaging method, received '{avg_method}'.",
-            exits=1,
-        )
+    try:
+        pipe = nlp.get_pipe(pipe_name)
+    except KeyError as err:
+        wasabi.msg.fail(title=str(err), exits=1)

-    for _pipe_name, _pipe in nlp.pipeline:
-        # todo instead of instance check, assert _pipe has a .threshold arg
-        # won't work, actually. e.g. spancat doesn't .threshold.
-        if _pipe_name == pipe_name:
-            if not isinstance(_pipe, MultiLabel_TextCategorizer):
-                wasabi.msg.fail(
-                    "Specified component '{component}' is not of type `MultiLabel_TextCategorizer`.".format(
-                        component=pipe_name
-                    ),
-                    exits=1,
-                )
-            pipe = _pipe
-            break
-
-    if pipe is None:
-        wasabi.msg.fail(
-            f"No component with name {pipe_name} found in pipeline.", exits=1
-        )
-    # This is purely for MyPy. Type checking is done in loop above already.
-    assert isinstance(pipe, MultiLabel_TextCategorizer)
-
-    if silent:
-        print(
-            f"Searching threshold with the best {average} F-score for component '{selected_pipe_name}' with {n_trials} "
+    if not silent:
+        wasabi.msg.info(
+            title=f"Optimizing for {scores_key} for component '{pipe_name}' with {n_trials} "
             f"trials and beta = {beta}."
         )

-    thresholds = numpy.linspace(0, 1, n_trials)
-    # todo use Scorer.score_cats. possibly to be extended?
-    ref_pos_counts = {label: 0 for label in pipe.labels}
-    pred_pos_counts = {
-        t: {True: ref_pos_counts.copy(), False: ref_pos_counts.copy()}
-        for t in thresholds
-    }
-    f_scores_per_label = {t: {label: 0.0 for label in pipe.labels} for t in thresholds}
-    f_scores = {t: 0.0 for t in thresholds}
+    # Load evaluation corpus.
+    corpus = Corpus(data_path, gold_preproc=gold_preproc)
+    dev_dataset = list(corpus(nlp))
+    config_keys = threshold_key.split(".")

-    # Count true/false positives for provided docs.
-    doc_bin = DocBin()
-    doc_bin.from_disk(data_path)
-    for ref_doc in doc_bin.get_docs(nlp.vocab):
-        for label, score in ref_doc.cats.items():
-            if score not in (0, 1):
-                wasabi.msg.fail(
-                    f"Expected category scores in evaluation dataset to be 0 <= x <= 1, received {score}.",
-                    exits=1,
-                )
-            ref_pos_counts[label] += ref_doc.cats[label] == 1
+    def set_nested_item(
+        config: Dict[str, Any], keys: List[str], value: float
+    ) -> Dict[str, Any]:
+        """Set item in nested dictionary. Adapted from https://stackoverflow.com/a/54138200.
+        config (Dict[str, Any]): Configuration dictionary.
+        keys (List[str]): List of keys leading to the value to set.
+        value (float): Value to set.
+        RETURNS (Dict[str, Any]): Updated dictionary.
+        """
+        functools.reduce(operator.getitem, keys[:-1], config)[keys[-1]] = value
+        return config

-        pred_doc = nlp(ref_doc.text)
-        # Collect count stats per threshold value and label.
-        for threshold in thresholds:
-            for label, score in pred_doc.cats.items():
-                if label not in pipe.labels:
-                    continue
-                label_value = int(score >= threshold)
-                if label_value == ref_doc.cats[label] == 1:
-                    pred_pos_counts[threshold][True][label] += 1
-                elif label_value == 1 and ref_doc.cats[label] == 0:
-                    pred_pos_counts[threshold][False][label] += 1
-
-    # Compute F-scores.
-    for threshold in thresholds:
-        for label in ref_pos_counts:
-            n_pos_preds = (
-                pred_pos_counts[threshold][True][label]
-                + pred_pos_counts[threshold][False][label]
-            )
-            precision = (
-                (pred_pos_counts[threshold][True][label] / n_pos_preds)
-                if n_pos_preds > 0
-                else 0
-            )
-            recall = pred_pos_counts[threshold][True][label] / ref_pos_counts[label]
-            f_scores_per_label[threshold][label] = (
-                (
-                    (1 + beta**2)
-                    * (precision * recall / (precision * beta**2 + recall))
-                )
-                if precision
-                else 0
+    # Evaluate with varying threshold values.
+    scores: Dict[float, float] = {}
+    for threshold in numpy.linspace(0, 1, n_trials):
+        pipe.cfg = set_nested_item(pipe.cfg, config_keys, threshold)
+        scores[threshold] = nlp.evaluate(dev_dataset)[scores_key]
+        if not (
+            isinstance(scores[threshold], float) or isinstance(scores[threshold], int)
+        ):
+            wasabi.msg.fail(
+                f"Returned score for key '{scores_key}' is not numeric. Threshold optimization only works for numeric "
+                f"scores.",
+                exits=1,
             )
-        # Aggregate F-scores.
-        if average == "micro":
-            f_scores[threshold] = sum(
-                [
-                    f_scores_per_label[threshold][label] * ref_pos_counts[label]
-                    for label in ref_pos_counts
-                ]
-            ) / sum(ref_pos_counts.values())
-        else:
-            f_scores[threshold] = sum(
-                [f_scores_per_label[threshold][label] for label in ref_pos_counts]
-            ) / len(ref_pos_counts)
-
-    best_threshold = max(f_scores.keys(), key=(lambda key: f_scores[key]))
-    if silent:
+    best_threshold = max(scores.keys(), key=(lambda key: scores[key]))
+    if not silent:
         print(
-            f"Best threshold: {round(best_threshold, ndigits=4)} with F-score of {f_scores[best_threshold]}.",
+            f"Best threshold: {round(best_threshold, ndigits=4)} with value of {scores[best_threshold]}.",
             wasabi.tables.table(
-                data=[
-                    (threshold, label, f_score)
-                    for threshold, label_f_scores in f_scores_per_label.items()
-                    for label, f_score in label_f_scores.items()
-                ],
-                header=["Threshold", "Label", "F-Score"],
-            ),
-            wasabi.tables.table(
-                data=[(threshold, f_score) for threshold, f_score in f_scores.items()],
-                header=["Threshold", f"F-Score ({average})"],
+                data=[(threshold, score) for threshold, score in scores.items()],
+                header=["Threshold", f"{scores_key}"],
             ),
         )

-    return best_threshold, f_scores[best_threshold]
+    return best_threshold, scores[best_threshold]
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 48cc364f0..b0d173fdf 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -1,6 +1,6 @@
 import os
 import math
-from typing import Counter, Iterable, Tuple, List
+from typing import Counter, Tuple, List, Dict, Any

 import numpy
 import pytest
@@ -36,7 +36,7 @@ from spacy.tokens.span import Span
 from spacy.training import Example, docs_to_json, offsets_to_biluo_tags
 from spacy.training.converters import conll_ner_to_docs, conllu_to_docs
 from spacy.training.converters import iob_to_docs
-from spacy.pipeline import TextCategorizer, Pipe
+from spacy.pipeline import TextCategorizer, Pipe, SpanCategorizer
 from spacy.util import ENV_VARS, get_minor_version, load_model_from_config, load_config
 from ..cli.init_pipeline import _init_labels
@@ -860,38 +860,55 @@ def test_span_length_freq_dist_output_must_be_correct():


 def test_cli_find_threshold(capsys):
-    def make_get_examples_multi_label(_nlp: Language) -> List[Example]:
-        return [
-            Example.from_dict(_nlp.make_doc(t[0]), t[1])
-            for t in [
-                (
-                    "I'm angry and confused",
-                    {"cats": {"ANGRY": 1.0, "CONFUSED": 1.0, "HAPPY": 0.0}},
-                ),
-                (
-                    "I'm confused but happy",
-                    {"cats": {"ANGRY": 0.0, "CONFUSED": 1.0, "HAPPY": 1.0}},
-                ),
-            ]
-        ]
+    def make_examples(_nlp: Language) -> List[Example]:
+        docs: List[Example] = []
+
+        for t in [
+            (
+                "I'm angry and confused in the Bank of America.",
+                {
+                    "cats": {"ANGRY": 1.0, "CONFUSED": 1.0, "HAPPY": 0.0},
+                    "spans": {"sc": [(7, 10, "ORG")]},
+                },
+            ),
+            (
+                "I'm confused but happy in New York.",
+                {
+                    "cats": {"ANGRY": 0.0, "CONFUSED": 1.0, "HAPPY": 1.0},
+                    "spans": {"sc": [(6, 7, "GPE")]},
+                },
+            ),
+        ]:
+            doc = _nlp.make_doc(t[0])
+            docs.append(Example.from_dict(doc, t[1]))
+
+        return docs

     def init_nlp(
-        component_factory_names: Tuple[str, ...] = (),
+        components: Tuple[Tuple[str, Dict[str, Any]], ...] = ()
     ) -> Tuple[Language, List[Example]]:
         _nlp = English()
+        textcat: TextCategorizer = _nlp.add_pipe(  # type: ignore
+            factory_name="textcat_multilabel",
+            name="tc_multi",
+            config={"threshold": 0.9},
+        )
+        textcat_labels = ("ANGRY", "CONFUSED", "HAPPY")
+        for label in textcat_labels:
+            textcat.add_label(label)

-        textcat: TextCategorizer = _nlp.add_pipe(factory_name="textcat_multilabel", name="tc_multi")  # type: ignore
-        textcat.add_label("ANGRY")
-        textcat.add_label("CONFUSED")
-        textcat.add_label("HAPPY")
-        for cfn in component_factory_names:
-            comp = _nlp.add_pipe(cfn)
+        # Append additional components to pipeline.
+        for cfn, comp_config in components:
+            comp = _nlp.add_pipe(cfn, config=comp_config)
             if isinstance(comp, TextCategorizer):
-                comp.add_label("dummy")
+                for label in textcat_labels:
+                    comp.add_label(label)
+            if isinstance(comp, SpanCategorizer):
+                comp.add_label("GPE")
+                comp.add_label("ORG")
         _nlp.initialize()
-
-        _examples = make_get_examples_multi_label(_nlp)
+        _examples = make_examples(_nlp)
         for i in range(5):
             _nlp.update(_examples)

@@ -903,77 +920,63 @@ def test_cli_find_threshold(capsys):
         # mostly as a smoke test.
         nlp, examples = init_nlp()
         DocBin(docs=[example.reference for example in examples]).to_disk(
-            docs_dir / "docs"
+            docs_dir / "docs.spacy"
         )
         with make_tempdir() as nlp_dir:
             nlp.to_disk(nlp_dir)
             assert (
-                find_threshold(nlp_dir, docs_dir / "docs", verbose=False)[0]
+                find_threshold(
+                    model=nlp_dir,
+                    data_path=docs_dir / "docs.spacy",
+                    pipe_name="tc_multi",
+                    threshold_key="threshold",
+                    scores_key="cats_macro_f",
+                    silent=True,
+                )[0]
                 == numpy.linspace(0, 1, 10)[1]
             )

+        # todo fix spancat test
         # Specifying name of non-MultiLabel_TextCategorizer component should fail.
-        nlp, _ = init_nlp(("sentencizer",))
-        with make_tempdir() as nlp_dir:
-            nlp.to_disk(nlp_dir)
-            with pytest.raises(SystemExit) as error:
-                find_threshold(nlp_dir, docs_dir / "docs", pipe_name="sentencizer")
-        assert error.value.code == 1
-
-        # Having multiple textcat_multilabel components without specifying the name should fail.
-        nlp, _ = init_nlp(("textcat_multilabel",))
-        with make_tempdir() as nlp_dir:
-            nlp.to_disk(nlp_dir)
-            with pytest.raises(SystemExit) as error:
-                find_threshold(nlp_dir, docs_dir / "docs")
-        assert error.value.code == 1
-
-        # Having multiple textcat_multilabel components should work when specifying the name.
-        nlp, _ = init_nlp(("textcat_multilabel",))
+        nlp, _ = init_nlp((("spancat", {"spans_key": "sc", "threshold": 0.5}),))
         with make_tempdir() as nlp_dir:
             nlp.to_disk(nlp_dir)
             assert (
                 find_threshold(
-                    nlp_dir, docs_dir / "docs", pipe_name="tc_multi", verbose=False
+                    model=nlp_dir,
+                    data_path=docs_dir / "docs.spacy",
+                    pipe_name="spancat",
+                    threshold_key="threshold",
+                    scores_key="spans_sc_f",
+                    silent=True,
                 )[0]
                 == numpy.linspace(0, 1, 10)[1]
             )

+        # Having multiple textcat_multilabel components should work, since the name has to be specified.
+        nlp, _ = init_nlp((("textcat_multilabel", {}),))
+        with make_tempdir() as nlp_dir:
+            nlp.to_disk(nlp_dir)
+            assert find_threshold(
+                model=nlp_dir,
+                data_path=docs_dir / "docs.spacy",
+                pipe_name="tc_multi",
+                threshold_key="threshold",
+                scores_key="cats_macro_f",
+                silent=True,
+            )
+
         # Specifying the name of an non-existing pipe should fail.
         nlp, _ = init_nlp()
         with make_tempdir() as nlp_dir:
             nlp.to_disk(nlp_dir)
             with pytest.raises(SystemExit) as error:
-                find_threshold(nlp_dir, docs_dir / "docs", pipe_name="_")
-        assert error.value.code == 1
-
-        # Using a pipe with no textcat components should fail.
-        nlp = English()
-        with make_tempdir() as nlp_dir:
-            nlp.to_disk(nlp_dir)
-            with pytest.raises(SystemExit) as error:
-                find_threshold(nlp_dir, docs_dir / "docs")
-        assert error.value.code == 1
-
-        # Specifying scores not in range 0 <= x <= 1 should fail.
-        nlp, _ = init_nlp()
-        DocBin(
-            docs=[
-                Example.from_dict(nlp.make_doc(t[0]), t[1]).reference
-                for t in [
-                    (
-                        "I'm angry and confused",
-                        {"cats": {"ANGRY": 1.0, "CONFUSED": 2.0, "HAPPY": 0.0}},
-                    ),
-                    (
-                        "I'm confused but happy",
-                        {"cats": {"ANGRY": 0.0, "CONFUSED": 1.0, "HAPPY": 1.0}},
-                    ),
-                ]
-            ]
-        ).to_disk(docs_dir / "docs")
-        with make_tempdir() as nlp_dir:
-            nlp.to_disk(nlp_dir)
-            with pytest.raises(SystemExit) as error:
-                find_threshold(nlp_dir, docs_dir / "docs")
+                find_threshold(
+                    model=nlp_dir,
+                    data_path=docs_dir / "docs.spacy",
+                    pipe_name="_",
+                    threshold_key="threshold",
+                    scores_key="cats_macro_f",
+                    silent=True,
+                )
         assert error.value.code == 1
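
Usage sketch (not part of the diff above): a minimal example of how the generalized threshold search could be driven once this patch is applied. The CLI spelling "find-threshold" is an assumption, since the @app.command(...) registration is cut off in this hunk; the positional order (model, data path, pipe name, threshold key, scores key) and the "--n_trials" option are taken from find_threshold_cli, and the "threshold" / "cats_macro_f" keys mirror the updated tests. Model and data paths are hypothetical placeholders.

    # Assumed CLI command name; positional arguments as declared in find_threshold_cli:
    #   python -m spacy find-threshold ./my_textcat_model ./dev.spacy textcat_multilabel threshold cats_macro_f --n_trials 20

    # Calling the Python API from spacy/cli/find_threshold.py directly:
    from pathlib import Path

    from spacy.cli.find_threshold import find_threshold

    best_threshold, best_score = find_threshold(
        model="./my_textcat_model",      # trained pipeline name or path (placeholder)
        data_path=Path("./dev.spacy"),   # DocBin with reference annotations (placeholder)
        pipe_name="textcat_multilabel",  # component whose config is patched per trial
        threshold_key="threshold",       # (possibly nested) config key, split on "."
        scores_key="cats_macro_f",       # numeric key returned by nlp.evaluate()
        n_trials=20,
        silent=False,
    )
    print(f"Best threshold: {best_threshold} (score: {best_score})")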