diff --git a/spacy/cli/rehearse.py b/spacy/cli/rehearse.py deleted file mode 100644 index 23af22fdf..000000000 --- a/spacy/cli/rehearse.py +++ /dev/null @@ -1,83 +0,0 @@ -from typing import Optional, Dict, Any, Union -from pathlib import Path -from wasabi import msg -import typer -import logging -import sys - -from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error -from ._util import import_code, setup_gpu -from ..training.loop import train as train_nlp -from ..training.initialize import init_nlp -from .. import util - - -@app.command( - "rehearse", - context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, -) -def rehearse_cli( - # fmt: off - ctx: typer.Context, # This is only used to read additional arguments - config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), - output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"), - code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), - verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), - use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") - # fmt: on -): - """ - Rehearse a spaCy pipeline. Requires data in spaCy's binary format. To - convert data from other formats, use the `spacy convert` command. The - config file includes all settings and hyperparameters used during training. - To override settings in the config, e.g. settings that point to local - paths or that you want to experiment with, you can override them as - command line options. For instance, --training.batch_size 128 overrides - the value of "batch_size" in the block "[training]". The --code argument - lets you pass in a Python file that's imported before training. It can be - used to register custom functions and architectures that can then be - referenced in the config. - - DOCS: https://spacy.io/api/cli#rehearse - """ - util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) - overrides = parse_config_overrides(ctx.args) - import_code(code_path) - rehearse(config_path, output_path, use_gpu=use_gpu, overrides=overrides) - - -def rehearse( - config_path: Union[str, Path], - output_path: Optional[Union[str, Path]] = None, - *, - use_gpu: int = -1, - overrides: Dict[str, Any] = util.SimpleFrozenDict(), -): - config_path = util.ensure_path(config_path) - output_path = util.ensure_path(output_path) - # Make sure all files and paths exists if they are needed - if not config_path or (str(config_path) != "-" and not config_path.exists()): - msg.fail("Config file not found", config_path, exits=1) - if not output_path: - msg.info("No output directory provided") - else: - if not output_path.exists(): - output_path.mkdir(parents=True) - msg.good(f"Created output directory: {output_path}") - msg.info(f"Saving to output directory: {output_path}") - setup_gpu(use_gpu) - with show_validation_error(config_path): - config = util.load_config(config_path, overrides=overrides, interpolate=False) - msg.divider("Initializing pipeline") - with show_validation_error(config_path, hint_fill=False): - nlp = init_nlp(config, use_gpu=use_gpu) - msg.good("Initialized pipeline") - msg.divider("Training pipeline") - train_nlp( - nlp, - output_path, - use_gpu=use_gpu, - use_rehearse=True, - stdout=sys.stdout, - stderr=sys.stderr, - ) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index cc22cbba6..b3462caa6 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -22,7 +22,8 @@ def train_cli( output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"), code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), - use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") + use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), + use_rehearse: bool = Opt(False, "--use_rehearse", "-r", help="Perform 'rehearsal updates' on a pre-trained model") # fmt: on ): """ @@ -42,7 +43,13 @@ def train_cli( util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) overrides = parse_config_overrides(ctx.args) import_code(code_path) - train(config_path, output_path, use_gpu=use_gpu, overrides=overrides) + train( + config_path, + output_path, + use_gpu=use_gpu, + overrides=overrides, + use_rehearse=use_rehearse, + ) def train( @@ -51,6 +58,7 @@ def train( *, use_gpu: int = -1, overrides: Dict[str, Any] = util.SimpleFrozenDict(), + use_rehearse: bool = False, ): config_path = util.ensure_path(config_path) output_path = util.ensure_path(output_path) @@ -72,4 +80,11 @@ def train( nlp = init_nlp(config, use_gpu=use_gpu) msg.good("Initialized pipeline") msg.divider("Training pipeline") - train_nlp(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr) + train_nlp( + nlp, + output_path, + use_gpu=use_gpu, + stdout=sys.stdout, + stderr=sys.stderr, + use_rehearse=use_rehearse, + ) diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 5e4aa8af6..d5b2d54d2 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -1050,7 +1050,7 @@ in the section `[paths]`. > ``` ```bash -$ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id] [overrides] +$ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id] [--use_rehearse] [overrides] ``` | Name | Description | @@ -1060,6 +1060,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id] | `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ | | `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ | +| `--use_rehearse`, `-r` | Use 'rehearsal' updates on a pre-trained model to address the catastrophic forgetting problem. Defaults to `False`. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | | **CREATES** | The final trained pipeline and the best trained pipeline. | @@ -1135,38 +1136,6 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [ | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ | | **CREATES** | The pretrained weights that can be used to initialize `spacy train`. | -## rehearse {id="rehearse",tag="command, experimental"} - -This command is designed to fine-tune pre-trained models while also trying to address the “catastrophic forgetting” problem. -It uses "rehearsal" updates that teach the current model to make predictions similar to an initial model. This feature is experimental. - - - -The `rehearse` command outputs the sum of both losses from the `TrainablePipe.update` and `TrainablePipe.rehearse`. -This can potentially cause the loss to increase drastically, even while the scores also increasing. It's likely due to the model making more different predictions than the intital model. - - - -> #### Example -> -> ```bash -> $ python -m spacy rehearse config.cfg --output ./output --paths.train ./train --paths.dev ./dev -> ``` - -```bash -$ python -m spacy rehearse [config_path] [--output] [--code] [--verbose] [--gpu-id] [overrides] -``` - -| Name | Description | -| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ | -| `--output`, `-o` | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~ | -| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | -| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ | -| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | -| **CREATES** | The final rehearse pipeline and the best rehearsed pipeline. ## evaluate {id="evaluate",version="2",tag="command"}