From 809588de308d5c3053c9900842cdf43f46e4a540 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 17 Nov 2022 12:37:22 +0100 Subject: [PATCH] Update docs & docstring. --- spacy/cli/find_threshold.py | 21 +++++++++++---------- website/docs/api/cli.md | 35 +++++++++++++++++++++++++++++++++-- 2 files changed, 44 insertions(+), 12 deletions(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index 32cb7a555..f75b3aac8 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -39,15 +39,16 @@ def find_threshold_cli( # fmt: on ): """ - Runs prediction trials for models with varying tresholds to maximize the - specified metric. The search space for the threshold is traversed - linearly from 0 to 1 in n_trials steps. + Runs prediction trials for a trained model with varying tresholds to maximize + the specified metric. The search space for the threshold is traversed linearly + from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout` + (the corresponding API call to `spacy.cli.find_threshold.find_threshold()` + returns all results). - This is applicable only for components whose predictions are influenced - by thresholds (e.g. textcat_multilabel and spancat, but not textcat). - - Note that the full path to the corresponding threshold attribute in the - config has to be provided. + This is applicable only for components whose predictions are influenced by + thresholds - e.g. `textcat_multilabel` and `spancat`, but not `textcat`. Note + that the full path to the corresponding threshold attribute in the config has to + be provided. DOCS: https://spacy.io/api/cli#find-threshold """ @@ -81,8 +82,8 @@ def find_threshold( ) -> Tuple[float, float, Dict[float, float]]: """ Runs prediction trials for models with varying tresholds to maximize the specified metric. - model (Union[str, Path]): Path to file with trained model. - data_path (Union[str, Path]): Path to file with DocBin with docs to use for threshold search. + model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory. + data_path (Path): Path to file with DocBin with docs to use for threshold search. pipe_name (str): Name of pipe to examine thresholds for. threshold_key (str): Key of threshold attribute in component's configuration. scores_key (str): Name of score to metric to optimize. diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index fc2c46022..23a7e7a8d 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -12,6 +12,7 @@ menu: - ['train', 'train'] - ['pretrain', 'pretrain'] - ['evaluate', 'evaluate'] + - ['find-threshold', 'find-threshold'] - ['assemble', 'assemble'] - ['package', 'package'] - ['project', 'project'] @@ -474,8 +475,7 @@ report span characteristics such as the average span length and the span (or span boundary) distinctiveness. The distinctiveness measure shows how different the tokens are with respect to the rest of the corpus using the KL-divergence of the token distributions. To learn more, you can check out Papay et al.'s work on -[*Dissecting Span Identification Tasks with Performance Prediction* (EMNLP -2020)](https://aclanthology.org/2020.emnlp-main.396/). +[_Dissecting Span Identification Tasks with Performance Prediction_ (EMNLP 2020)](https://aclanthology.org/2020.emnlp-main.396/). @@ -1163,6 +1163,37 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--code] [--gold-prepr | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | **CREATES** | Training results and optional metrics and visualizations. | +## find-threshold {#find-threshold new="3.5" tag="command"} + +Runs prediction trials for a trained model with varying tresholds to maximize +the specified metric. The search space for the threshold is traversed linearly +from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout` +(the corresponding API call to `spacy.cli.find_threshold.find_threshold()` +returns all results). + +This is applicable only for components whose predictions are influenced by +thresholds - e.g. `textcat_multilabel` and `spancat`, but not `textcat`. Note +that the full path to the corresponding threshold attribute in the config has to +be provided. + +```cli +$ python -m spacy find-threshold [model] [data_path] [pipe_name] [threshold_key] [scores_key] [--n_trials] [--code] [--use-gpu] [--gold-preproc] [--verbose] +``` + +| Name | Description | +| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ | +| `data_path` | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~ | +| `pipe_name` | Name of pipe to examine thresholds for. ~~str (positional)~~ | +| `threshold_key` | Key of threshold attribute in component's configuration. ~~str (positional)~~ | +| `scores_key` | Name of score to metric to optimize. ~~str (positional)~~ | +| `--n_trials`, `-n` | Number of trials to determine optimal thresholds. ~~int (option)~~ | +| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | +| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ | +| `--silent`, `-V`, `-VV` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | + ## assemble {#assemble tag="command"} Assemble a pipeline from a config file without additional training. Expects a