Move rehearse functionality to existing train cli

2025-11-06 10:57:34 +03:00 · 2023-01-19 13:10:56 +01:00 · 2023-01-19 13:10:56 +01:00 · e18a183f54
commit e18a183f54
parent 97c86072e5
3 changed files with 20 additions and 119 deletions
--- a/spacy/cli/rehearse.py
+++ b/spacy/cli/rehearse.py
@ -1,83 +0,0 @@
 from typing import Optional, Dict, Any, Union
 from pathlib import Path
 from wasabi import msg
 import typer
 import logging
 import sys
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code, setup_gpu
 from ..training.loop import train as train_nlp
 from ..training.initialize import init_nlp
 from .. import util
@app.command(
    "rehearse",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def rehearse_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
):
    """
    Rehearse a spaCy pipeline. Requires data in spaCy's binary format. To
    convert data from other formats, use the `spacy convert` command. The
    config file includes all settings and hyperparameters used during training.
    To override settings in the config, e.g. settings that point to local
    paths or that you want to experiment with, you can override them as
    command line options. For instance, --training.batch_size 128 overrides
    the value of "batch_size" in the block "[training]". The --code argument
    lets you pass in a Python file that's imported before training. It can be
    used to register custom functions and architectures that can then be
    referenced in the config.
    DOCS: https://spacy.io/api/cli#rehearse
    """
    # --verbose switches the shared spaCy logger from INFO to DEBUG.
    log_level = logging.DEBUG if verbose else logging.INFO
    util.logger.setLevel(log_level)
    # Extra arguments collected on the context (allow_extra_args /
    # ignore_unknown_options above) are parsed as config overrides.
    config_overrides = parse_config_overrides(ctx.args)
    # Import the user's code file after the overrides are parsed and before
    # the config is loaded inside rehearse().
    import_code(code_path)
    rehearse(
        config_path,
        output_path,
        use_gpu=use_gpu,
        overrides=config_overrides,
    )
 def rehearse(
     config_path: Union[str, Path],
     output_path: Optional[Union[str, Path]] = None,
     *,
     use_gpu: int = -1,
     overrides: Dict[str, Any] = util.SimpleFrozenDict(),
 ):
     """Load a config, initialize the pipeline and run training with
     rehearsal enabled.

     config_path (Union[str, Path]): Path to the config file, or "-" (in
         which case the existence check is skipped).
     output_path (Optional[Union[str, Path]]): Directory passed on to the
         training loop. Created (including parents) if it does not exist.
     use_gpu (int): Value forwarded to `setup_gpu`, `init_nlp` and the
         training loop.
     overrides (Dict[str, Any]): Config overrides applied when the config
         is loaded.
     """
     config_path = util.ensure_path(config_path)
     output_path = util.ensure_path(output_path)
     # The config is usable when it is "-" or an existing file; anything
     # else aborts with exit code 1.
     if not (config_path and (str(config_path) == "-" or config_path.exists())):
         msg.fail("Config file not found", config_path, exits=1)
     if output_path:
         if not output_path.exists():
             output_path.mkdir(parents=True)
             msg.good(f"Created output directory: {output_path}")
         msg.info(f"Saving to output directory: {output_path}")
     else:
         msg.info("No output directory provided")
     setup_gpu(use_gpu)
     # The config is loaded without interpolation here.
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides, interpolate=False)
     msg.divider("Initializing pipeline")
     with show_validation_error(config_path, hint_fill=False):
         nlp = init_nlp(config, use_gpu=use_gpu)
     msg.good("Initialized pipeline")
     msg.divider("Training pipeline")
     # use_rehearse=True is the only difference from a regular training run
     # as far as this call is concerned.
     train_nlp(
         nlp,
         output_path,
         use_gpu=use_gpu,
         stdout=sys.stdout,
         stderr=sys.stderr,
         use_rehearse=True,
     )
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -22,7 +22,8 @@ def train_cli(
    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
-    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    use_rehearse: bool = Opt(False, "--use_rehearse", "-r", help="Perform 'rehearsal updates' on a pre-trained model")
    # fmt: on
 ):
    """
@ -42,7 +43,13 @@ def train_cli(
    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
-    train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
+    train(
        config_path,
        output_path,
        use_gpu=use_gpu,
        overrides=overrides,
        use_rehearse=use_rehearse,
    )
 def train(
@ -51,6 +58,7 @@ def train(
    *,
    use_gpu: int = -1,
    overrides: Dict[str, Any] = util.SimpleFrozenDict(),
    use_rehearse: bool = False,
 ):
    config_path = util.ensure_path(config_path)
    output_path = util.ensure_path(output_path)
@ -72,4 +80,11 @@ def train(
        nlp = init_nlp(config, use_gpu=use_gpu)
    msg.good("Initialized pipeline")
    msg.divider("Training pipeline")
-    train_nlp(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
+    train_nlp(
        nlp,
        output_path,
        use_gpu=use_gpu,
        stdout=sys.stdout,
        stderr=sys.stderr,
        use_rehearse=use_rehearse,
    )
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@ -1050,7 +1050,7 @@ in the section `[paths]`.
 > ```
 ```bash
-$ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id] [overrides]
+$ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id] [--use_rehearse] [overrides]
 ```
 | Name              | Description                                                                                                                                                                                                        |
@ -1060,6 +1060,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id]
 | `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               |
 | `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~                                                                                                                                                       |
 | `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                                         |
 | `--use_rehearse`, `-r` | Use 'rehearsal' updates on a pre-trained model to address the catastrophic forgetting problem. Defaults to `False`. ~~bool (flag)~~ |
 | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                         |
 | overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~                         |
 | **CREATES**       | The final trained pipeline and the best trained pipeline.                                                                                                                                                          |
@ -1135,38 +1136,6 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [
 | overrides               | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~                              |
 | **CREATES**             | The pretrained weights that can be used to initialize `spacy train`.                                                                                                                                               |
 ## rehearse {id="rehearse",tag="command, experimental"}
 This command is designed to fine-tune pre-trained models while also trying to address the “catastrophic forgetting” problem. 
 It uses "rehearsal" updates that teach the current model to make predictions similar to an initial model. This feature is experimental.
 <Infobox title="Please note" variant="warning">
 The `rehearse` command outputs the sum of the losses from `TrainablePipe.update` and `TrainablePipe.rehearse`.
 This can cause the loss to increase drastically, even while the scores are also increasing. This is likely because the model's predictions diverge further from those of the initial model.
 </Infobox>
 > #### Example
 >
 > ```bash
 > $ python -m spacy rehearse config.cfg --output ./output --paths.train ./train --paths.dev ./dev
 > ```
 ```bash
 $ python -m spacy rehearse [config_path] [--output] [--code] [--verbose] [--gpu-id] [overrides]
 ```
 | Name              | Description                                                                                                                                                                                                        |
 | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`     | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
 | `--output`, `-o`  | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~                                                                                                          |
 | `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               |
 | `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~                                                                                                                                                       |
 | `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                                         |
 | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                         |
 | overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~                         |
 | **CREATES**       | The final rehearsed pipeline and the best rehearsed pipeline.                                                                                                                                                      |
 ## evaluate {id="evaluate",version="2",tag="command"}