From 60661ab0fac39927dbe7550885f4b2b0305a1682 Mon Sep 17 00:00:00 2001
From: thomashacker
Date: Thu, 19 Jan 2023 12:35:14 +0100
Subject: [PATCH] Init

---
 spacy/cli/__init__.py         |  1 +
 spacy/cli/rehearse.py         | 83 +++++++++++++++++++++++++++++++++++
 spacy/language.py             | 27 +++++-------
 spacy/pipeline/tagger.pyx     |  3 +-
 spacy/training/loop.py        | 15 ++++++-
 website/docs/api/cli.mdx      | 34 ++++++++++++++
 website/docs/api/language.mdx |  2 +-
 website/docs/api/pipe.mdx     |  2 +-
 8 files changed, 146 insertions(+), 21 deletions(-)
 create mode 100644 spacy/cli/rehearse.py

diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index 868526b42..128de2d41 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -10,6 +10,7 @@ from .info import info  # noqa: F401
 from .package import package  # noqa: F401
 from .profile import profile  # noqa: F401
 from .train import train_cli  # noqa: F401
+from .rehearse import rehearse_cli  # noqa: F401
 from .assemble import assemble_cli  # noqa: F401
 from .pretrain import pretrain  # noqa: F401
 from .debug_data import debug_data  # noqa: F401
diff --git a/spacy/cli/rehearse.py b/spacy/cli/rehearse.py
new file mode 100644
index 000000000..23af22fdf
--- /dev/null
+++ b/spacy/cli/rehearse.py
@@ -0,0 +1,83 @@
+from typing import Optional, Dict, Any, Union
+from pathlib import Path
+from wasabi import msg
+import typer
+import logging
+import sys
+
+from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
+from ._util import import_code, setup_gpu
+from ..training.loop import train as train_nlp
+from ..training.initialize import init_nlp
+from .. import util
+
+
+@app.command(
+    "rehearse",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def rehearse_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
+    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
+    # fmt: on
+):
+    """
+    Rehearse a spaCy pipeline. Requires data in spaCy's binary format. To
+    convert data from other formats, use the `spacy convert` command. The
+    config file includes all settings and hyperparameters used during training.
+    To override settings in the config, e.g. settings that point to local
+    paths or that you want to experiment with, you can override them as
+    command line options. For instance, --training.batch_size 128 overrides
+    the value of "batch_size" in the block "[training]". The --code argument
+    lets you pass in a Python file that's imported before training. It can be
+    used to register custom functions and architectures that can then be
+    referenced in the config.
+
+    DOCS: https://spacy.io/api/cli#rehearse
+    """
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    overrides = parse_config_overrides(ctx.args)
+    import_code(code_path)
+    rehearse(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
+
+
+def rehearse(
+    config_path: Union[str, Path],
+    output_path: Optional[Union[str, Path]] = None,
+    *,
+    use_gpu: int = -1,
+    overrides: Dict[str, Any] = util.SimpleFrozenDict(),
+):
+    config_path = util.ensure_path(config_path)
+    output_path = util.ensure_path(output_path)
+    # Make sure all files and paths exist if they are needed
+    if not config_path or (str(config_path) != "-" and not config_path.exists()):
+        msg.fail("Config file not found", config_path, exits=1)
+    if not output_path:
+        msg.info("No output directory provided")
+    else:
+        if not output_path.exists():
+            output_path.mkdir(parents=True)
+            msg.good(f"Created output directory: {output_path}")
+        msg.info(f"Saving to output directory: {output_path}")
+    setup_gpu(use_gpu)
+    with show_validation_error(config_path):
+        config = util.load_config(config_path, overrides=overrides, interpolate=False)
+    msg.divider("Initializing pipeline")
+    with show_validation_error(config_path, hint_fill=False):
+        nlp = init_nlp(config, use_gpu=use_gpu)
+    msg.good("Initialized pipeline")
+    msg.divider("Training pipeline")
+    train_nlp(
+        nlp,
+        output_path,
+        use_gpu=use_gpu,
+        use_rehearse=True,
+        stdout=sys.stdout,
+        stderr=sys.stderr,
+    )
diff --git a/spacy/language.py b/spacy/language.py
index e0abfd5e7..5977ed17f 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1211,32 +1211,25 @@ class Language:
         if isinstance(examples, list) and len(examples) == 0:
             return losses
         validate_examples(examples, "Language.rehearse")
-        if sgd is None:
-            if self._optimizer is None:
-                self._optimizer = self.create_optimizer()
-            sgd = self._optimizer
         pipes = list(self.pipeline)
-        random.shuffle(pipes)
         if component_cfg is None:
             component_cfg = {}
-        grads = {}
 
-        def get_grads(key, W, dW):
-            grads[key] = (W, dW)
-            return W, dW
-
-        get_grads.learn_rate = sgd.learn_rate  # type: ignore[attr-defined, union-attr]
-        get_grads.b1 = sgd.b1  # type: ignore[attr-defined, union-attr]
-        get_grads.b2 = sgd.b2  # type: ignore[attr-defined, union-attr]
         for name, proc in pipes:
             if name in exclude or not hasattr(proc, "rehearse"):
                 continue
-            grads = {}
             proc.rehearse(  # type: ignore[attr-defined]
-                examples, sgd=get_grads, losses=losses, **component_cfg.get(name, {})
+                examples, sgd=None, losses=losses, **component_cfg.get(name, {})
             )
-            for key, (W, dW) in grads.items():
-                sgd(key, W, dW)  # type: ignore[call-arg, misc]
+            if isinstance(sgd, Optimizer):
+                if (
+                    name not in exclude
+                    and isinstance(proc, ty.TrainableComponent)
+                    and proc.is_trainable
+                    and proc.model not in (True, False, None)
+                ):
+                    proc.finish_update(sgd)
+
         return losses
 
     def begin_training(
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index d6ecbf084..b1698d11c 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -241,7 +241,8 @@ class Tagger(TrainablePipe):
         tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs)
         grads, loss = loss_func(tag_scores, tutor_tag_scores)
         bp_tag_scores(grads)
-        self.finish_update(sgd)
+        if sgd is not None:
+            self.finish_update(sgd)
         losses[self.name] += loss
         return losses
 
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 885257772..44c7023d3 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -26,6 +26,7 @@ def train(
     output_path: Optional[Path] = None,
     *,
     use_gpu: int = -1,
+    use_rehearse: bool = False,
     stdout: IO = sys.stdout,
     stderr: IO = sys.stderr,
 ) -> Tuple["Language", Optional[Path]]:
@@ -35,6 +36,7 @@ def train(
     output_path (Optional[Path]): Optional output path to save trained model to.
     use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
         before calling this function.
+    use_rehearse (bool): Whether to call nlp.rehearse after each nlp.update.
     stdout (file): A file-like object to write output messages. To disable
         printing, set to io.StringIO.
     stderr (file): A second file-like object to write output messages. To disable
@@ -54,7 +56,10 @@ def train(
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     dot_names = [T["train_corpus"], T["dev_corpus"]]
     train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
-    optimizer = T["optimizer"]
+    if use_rehearse:
+        optimizer = nlp.resume_training()
+    else:
+        optimizer = T["optimizer"]
     score_weights = T["score_weights"]
     batcher = T["batcher"]
     train_logger = T["logger"]
@@ -88,6 +93,7 @@ def train(
         patience=T["patience"],
         max_steps=T["max_steps"],
         eval_frequency=T["eval_frequency"],
+        use_rehearse=use_rehearse,
         exclude=frozen_components,
         annotating_components=annotating_components,
         before_update=before_update,
@@ -150,6 +156,7 @@ def train_while_improving(
     accumulate_gradient: int,
     patience: int,
     max_steps: int,
+    use_rehearse: bool = False,
     exclude: List[str],
     annotating_components: List[str],
     before_update: Optional[Callable[["Language", Dict[str, Any]], None]],
@@ -214,6 +221,12 @@ def train_while_improving(
                 exclude=exclude,
                 annotates=annotating_components,
             )
+            nlp.rehearse(
+                subbatch,
+                losses=losses,
+                sgd=False,  # type: ignore[arg-type]
+                exclude=exclude,
+            )
         # TODO: refactor this so we don't have to run it separately in here
         for name, proc in nlp.pipeline:
             if (
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index ca4023101..5e4aa8af6 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -11,6 +11,7 @@ menu:
   - ['debug', 'debug']
   - ['train', 'train']
   - ['pretrain', 'pretrain']
+  - ['rehearse', 'rehearse']
   - ['evaluate', 'evaluate']
   - ['benchmark', 'benchmark']
   - ['apply', 'apply']
@@ -1134,6 +1135,39 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [
 | overrides   | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
 | **CREATES** | The pretrained weights that can be used to initialize `spacy train`. |
 
+## rehearse {id="rehearse",tag="command, experimental"}
+
+This command fine-tunes an already trained pipeline while trying to address the "catastrophic forgetting" problem.
+It uses "rehearsal" updates that teach the current model to make predictions similar to an initial model. This feature is experimental.
+
+<Infobox variant="warning">
+
+The `rehearse` command reports the sum of the losses from `TrainablePipe.update` and `TrainablePipe.rehearse`.
+This can cause the reported loss to increase sharply even while the scores improve, which usually means the current model's predictions are diverging further from those of the initial model.
+
+</Infobox>
+
+> #### Example
+>
+> ```bash
+> $ python -m spacy rehearse config.cfg --output ./output --paths.train ./train --paths.dev ./dev
+> ```
+
+```bash
+$ python -m spacy rehearse [config_path] [--output] [--code] [--verbose] [--gpu-id] [overrides]
+```
+
+| Name              | Description |
+| ----------------- | ----------- |
+| `config_path`     | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
+| `--output`, `-o`  | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~ |
+| `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
+| `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
+| `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~ |
+| overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
+| **CREATES**       | The final rehearsed pipeline and the best rehearsed pipeline. |
+
 ## evaluate {id="evaluate",version="2",tag="command"}
 
 The `evaluate` subcommand is superseded by
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index 93ddd79a2..45f64e346 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -346,7 +346,7 @@ and custom registered functions if needed. See the
 
 Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
 current model to make predictions similar to an initial model, to try to address
-the "catastrophic forgetting" problem. This feature is experimental.
+the "catastrophic forgetting" problem. Note that this method needs to be used together with `Language.update`. This feature is experimental.
 
 > #### Example
 >
diff --git a/website/docs/api/pipe.mdx b/website/docs/api/pipe.mdx
index c2777edf0..8004d4d76 100644
--- a/website/docs/api/pipe.mdx
+++ b/website/docs/api/pipe.mdx
@@ -244,7 +244,7 @@ predictions and gold-standard annotations, and update the component's model.
 
 Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
 current model to make predictions similar to an initial model, to try to address
-the "catastrophic forgetting" problem. This feature is experimental.
+the "catastrophic forgetting" problem. Note that this method needs to be used together with `TrainablePipe.update`. This feature is experimental.
 
 > #### Example
 >
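
For reference, the sketch below shows how rehearsal is meant to be combined with regular updates in a user-level training loop, mirroring what the patched `train_while_improving` does (an `nlp.update` step followed by an `nlp.rehearse` step). It is a minimal illustration and not part of the patch: the `en_core_web_sm` package, the single toy example and the choice to update only the NER are placeholders, and `resume_training` is what keeps the copy of the initial model that the rehearsal updates imitate.

```python
# Minimal usage sketch (illustration only, not part of this patch).
# Assumes an already trained pipeline; the package name and example data are placeholders.
import spacy
from spacy.training import Example

nlp = spacy.load("en_core_web_sm")

# Keep a copy of the current ("initial") model that rehearsal will imitate,
# and get an optimizer for the following updates.
optimizer = nlp.resume_training()

examples = [
    Example.from_dict(
        nlp.make_doc("I like London."), {"entities": [(7, 13, "GPE")]}
    )
]

# Only update the NER (and its tok2vec), since the toy example carries no
# tag or parse annotations.
with nlp.select_pipes(enable=["tok2vec", "ner"]):
    for epoch in range(5):
        losses = {}
        # Regular update on the new annotations ...
        nlp.update(examples, sgd=optimizer, losses=losses)
        # ... followed by a rehearsal update that pulls predictions back
        # towards the initial model, to reduce catastrophic forgetting.
        nlp.rehearse(examples, sgd=optimizer, losses=losses)
        print(epoch, losses)
```

Because both calls write into the same `losses` dict, the values printed per component are the summed `update` and `rehearse` losses described in the `cli.mdx` note above.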