Move rehearse functionality to existing train cli

2025-12-10 19:54:17 +03:00 · 2023-01-19 13:10:56 +01:00 · 2023-01-19 13:10:56 +01:00 · e18a183f54
commit e18a183f54
parent 97c86072e5
3 changed files with 20 additions and 119 deletions
--- a/spacy/cli/rehearse.py
+++ b/spacy/cli/rehearse.py
@@ -1,83 +0,0 @@
-from typing import Optional, Dict, Any, Union
-from pathlib import Path
-from wasabi import msg
-import typer
-import logging
-import sys
-
-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu
-from ..training.loop import train as train_nlp
-from ..training.initialize import init_nlp
-from .. import util
-
-
-@app.command(
-    "rehearse",
-    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
-)
-def rehearse_cli(
-    # fmt: off
-    ctx: typer.Context,  # This is only used to read additional arguments
-    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
-    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
-    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
-    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
-    # fmt: on
-):
-    """
-    Rehearse a spaCy pipeline. Requires data in spaCy's binary format. To
-    convert data from other formats, use the `spacy convert` command. The
-    config file includes all settings and hyperparameters used during training.
-    To override settings in the config, e.g. settings that point to local
-    paths or that you want to experiment with, you can override them as
-    command line options. For instance, --training.batch_size 128 overrides
-    the value of "batch_size" in the block "[training]". The --code argument
-    lets you pass in a Python file that's imported before training. It can be
-    used to register custom functions and architectures that can then be
-    referenced in the config.
-
-    DOCS: https://spacy.io/api/cli#rehearse
-    """
-    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
-    overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
-    rehearse(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
-
-
-def rehearse(
-    config_path: Union[str, Path],
-    output_path: Optional[Union[str, Path]] = None,
-    *,
-    use_gpu: int = -1,
-    overrides: Dict[str, Any] = util.SimpleFrozenDict(),
-):
-    config_path = util.ensure_path(config_path)
-    output_path = util.ensure_path(output_path)
-    # Make sure all files and paths exists if they are needed
-    if not config_path or (str(config_path) != "-" and not config_path.exists()):
-        msg.fail("Config file not found", config_path, exits=1)
-    if not output_path:
-        msg.info("No output directory provided")
-    else:
-        if not output_path.exists():
-            output_path.mkdir(parents=True)
-            msg.good(f"Created output directory: {output_path}")
-        msg.info(f"Saving to output directory: {output_path}")
-    setup_gpu(use_gpu)
-    with show_validation_error(config_path):
-        config = util.load_config(config_path, overrides=overrides, interpolate=False)
-    msg.divider("Initializing pipeline")
-    with show_validation_error(config_path, hint_fill=False):
-        nlp = init_nlp(config, use_gpu=use_gpu)
-    msg.good("Initialized pipeline")
-    msg.divider("Training pipeline")
-    train_nlp(
-        nlp,
-        output_path,
-        use_gpu=use_gpu,
-        use_rehearse=True,
-        stdout=sys.stdout,
-        stderr=sys.stderr,
-    )
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -22,7 +22,8 @@ def train_cli(
    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
-    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+    use_rehearse: bool = Opt(False, "--use_rehearse", "-r", help="Perform 'rehearsal updates' on a pre-trained model")
    # fmt: on
 ):
    """
@@ -42,7 +43,13 @@ def train_cli(
    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
-    train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
+    train(
+        config_path,
+        output_path,
+        use_gpu=use_gpu,
+        overrides=overrides,
+        use_rehearse=use_rehearse,
+    )


 def train(
@@ -51,6 +58,7 @@ def train(
    *,
    use_gpu: int = -1,
    overrides: Dict[str, Any] = util.SimpleFrozenDict(),
+    use_rehearse: bool = False,
 ):
    config_path = util.ensure_path(config_path)
    output_path = util.ensure_path(output_path)
@@ -72,4 +80,11 @@ def train(
        nlp = init_nlp(config, use_gpu=use_gpu)
    msg.good("Initialized pipeline")
    msg.divider("Training pipeline")
-    train_nlp(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
+    train_nlp(
+        nlp,
+        output_path,
+        use_gpu=use_gpu,
+        stdout=sys.stdout,
+        stderr=sys.stderr,
+        use_rehearse=use_rehearse,
+    )
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -1050,7 +1050,7 @@ in the section `[paths]`.
 > ```

 ```bash
-$ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id] [overrides]
+$ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id] [--use_rehearse] [overrides]
 ```

 | Name              | Description                                                                                                                                                                                                        |
@@ -1060,6 +1060,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id]
 | `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               |
 | `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~                                                                                                                                                       |
 | `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                                         |
+| `--use_rehearse`, `-r` | Use 'rehearsal' updates on a pre-trained model to address the catastrophic forgetting problem. Defaults to `False`. ~~bool (flag)~~ |
 | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                         |
 | overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~                         |
 | **CREATES**       | The final trained pipeline and the best trained pipeline.                                                                                                                                                          |
@@ -1135,38 +1136,6 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [
 | overrides               | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~                              |
 | **CREATES**             | The pretrained weights that can be used to initialize `spacy train`.                                                                                                                                               |

-## rehearse {id="rehearse",tag="command, experimental"}
-
-This command is designed to fine-tune pre-trained models while also trying to address the “catastrophic forgetting” problem. 
-It uses "rehearsal" updates that teach the current model to make predictions similar to an initial model. This feature is experimental.
-
-<Infobox title="Please note" variant="warning">
-
-The `rehearse` command outputs the sum of both losses from the `TrainablePipe.update` and `TrainablePipe.rehearse`. 
-This can potentially cause the loss to increase drastically, even while the scores also increasing. It's likely due to the model making more different predictions than the intital model.
-
-</Infobox>
-
-> #### Example
->
-> ```bash
-> $ python -m spacy rehearse config.cfg --output ./output --paths.train ./train --paths.dev ./dev
-> ```
-
-```bash
-$ python -m spacy rehearse [config_path] [--output] [--code] [--verbose] [--gpu-id] [overrides]
-```
-
-| Name              | Description                                                                                                                                                                                                        |
-| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `config_path`     | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
-| `--output`, `-o`  | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~                                                                                                          |
-| `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               |
-| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~                                                                                                                                                       |
-| `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                                         |
-| `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                         |
-| overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~                         |
-| **CREATES**       | The final rehearse pipeline and the best rehearsed pipeline.   

 ## evaluate {id="evaluate",version="2",tag="command"}