From 60661ab0fac39927dbe7550885f4b2b0305a1682 Mon Sep 17 00:00:00 2001
From: thomashacker
Date: Thu, 19 Jan 2023 12:35:14 +0100
Subject: [PATCH] Init

---
 spacy/cli/__init__.py         |  1 +
 spacy/cli/rehearse.py         | 83 +++++++++++++++++++++++++++++++++++
 spacy/language.py             | 27 +++++-------
 spacy/pipeline/tagger.pyx     |  3 +-
 spacy/training/loop.py        | 15 ++++++-
 website/docs/api/cli.mdx      | 34 ++++++++++++++
 website/docs/api/language.mdx |  2 +-
 website/docs/api/pipe.mdx     |  2 +-
 8 files changed, 146 insertions(+), 21 deletions(-)
 create mode 100644 spacy/cli/rehearse.py

diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index 868526b42..128de2d41 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -10,6 +10,7 @@ from .info import info  # noqa: F401
 from .package import package  # noqa: F401
 from .profile import profile  # noqa: F401
 from .train import train_cli  # noqa: F401
+from .rehearse import rehearse_cli  # noqa: F401
 from .assemble import assemble_cli  # noqa: F401
 from .pretrain import pretrain  # noqa: F401
 from .debug_data import debug_data  # noqa: F401
diff --git a/spacy/cli/rehearse.py b/spacy/cli/rehearse.py
new file mode 100644
index 000000000..23af22fdf
--- /dev/null
+++ b/spacy/cli/rehearse.py
@@ -0,0 +1,83 @@
+from typing import Optional, Dict, Any, Union
+from pathlib import Path
+from wasabi import msg
+import typer
+import logging
+import sys
+
+from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
+from ._util import import_code, setup_gpu
+from ..training.loop import train as train_nlp
+from ..training.initialize import init_nlp
+from .. import util
+
+
+@app.command(
+    "rehearse",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def rehearse_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
+    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
+    # fmt: on
+):
+    """
+    Rehearse a spaCy pipeline. Requires data in spaCy's binary format. To
+    convert data from other formats, use the `spacy convert` command. The
+    config file includes all settings and hyperparameters used during training.
+    To override settings in the config, e.g. settings that point to local
+    paths or that you want to experiment with, you can override them as
+    command line options. For instance, --training.batch_size 128 overrides
+    the value of "batch_size" in the block "[training]". The --code argument
+    lets you pass in a Python file that's imported before training. It can be
+    used to register custom functions and architectures that can then be
+    referenced in the config.
+
+    DOCS: https://spacy.io/api/cli#rehearse
+    """
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    overrides = parse_config_overrides(ctx.args)
+    import_code(code_path)
+    rehearse(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
+
+
+def rehearse(
+    config_path: Union[str, Path],
+    output_path: Optional[Union[str, Path]] = None,
+    *,
+    use_gpu: int = -1,
+    overrides: Dict[str, Any] = util.SimpleFrozenDict(),
+):
+    config_path = util.ensure_path(config_path)
+    output_path = util.ensure_path(output_path)
+    # Make sure all files and paths exist if they are needed
+    if not config_path or (str(config_path) != "-" and not config_path.exists()):
+        msg.fail("Config file not found", config_path, exits=1)
+    if not output_path:
+        msg.info("No output directory provided")
+    else:
+        if not output_path.exists():
+            output_path.mkdir(parents=True)
+            msg.good(f"Created output directory: {output_path}")
+        msg.info(f"Saving to output directory: {output_path}")
+    setup_gpu(use_gpu)
+    with show_validation_error(config_path):
+        config = util.load_config(config_path, overrides=overrides, interpolate=False)
+    msg.divider("Initializing pipeline")
+    with show_validation_error(config_path, hint_fill=False):
+        nlp = init_nlp(config, use_gpu=use_gpu)
+    msg.good("Initialized pipeline")
+    msg.divider("Training pipeline")
+    train_nlp(
+        nlp,
+        output_path,
+        use_gpu=use_gpu,
+        use_rehearse=True,
+        stdout=sys.stdout,
+        stderr=sys.stderr,
+    )
diff --git a/spacy/language.py b/spacy/language.py
index e0abfd5e7..5977ed17f 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1211,32 +1211,25 @@ class Language:
         if isinstance(examples, list) and len(examples) == 0:
             return losses
         validate_examples(examples, "Language.rehearse")
-        if sgd is None:
-            if self._optimizer is None:
-                self._optimizer = self.create_optimizer()
-            sgd = self._optimizer
         pipes = list(self.pipeline)
-        random.shuffle(pipes)
         if component_cfg is None:
             component_cfg = {}
-        grads = {}
 
-        def get_grads(key, W, dW):
-            grads[key] = (W, dW)
-            return W, dW
-
-        get_grads.learn_rate = sgd.learn_rate  # type: ignore[attr-defined, union-attr]
-        get_grads.b1 = sgd.b1  # type: ignore[attr-defined, union-attr]
-        get_grads.b2 = sgd.b2  # type: ignore[attr-defined, union-attr]
         for name, proc in pipes:
             if name in exclude or not hasattr(proc, "rehearse"):
                 continue
-            grads = {}
             proc.rehearse(  # type: ignore[attr-defined]
-                examples, sgd=get_grads, losses=losses, **component_cfg.get(name, {})
+                examples, sgd=None, losses=losses, **component_cfg.get(name, {})
             )
-            for key, (W, dW) in grads.items():
-                sgd(key, W, dW)  # type: ignore[call-arg, misc]
+            if isinstance(sgd, Optimizer):
+                if (
+                    name not in exclude
+                    and isinstance(proc, ty.TrainableComponent)
+                    and proc.is_trainable
+                    and proc.model not in (True, False, None)
+                ):
+                    proc.finish_update(sgd)
+
         return losses
 
     def begin_training(
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index d6ecbf084..b1698d11c 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -241,7 +241,8 @@ class Tagger(TrainablePipe):
         tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs)
         grads, loss = loss_func(tag_scores, tutor_tag_scores)
         bp_tag_scores(grads)
-        self.finish_update(sgd)
+        if sgd is not None:
+            self.finish_update(sgd)
         losses[self.name] += loss
         return losses
 
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 885257772..44c7023d3 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -26,6 +26,7 @@ def train(
     output_path: Optional[Path] = None,
     *,
     use_gpu: int = -1,
+    use_rehearse: bool = False,
     stdout: IO = sys.stdout,
     stderr: IO = sys.stderr,
 ) -> Tuple["Language", Optional[Path]]:
@@ -35,6 +36,7 @@ def train(
     output_path (Optional[Path]): Optional output path to save trained model to.
     use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
         before calling this function.
+    use_rehearse (bool): Whether to call nlp.rehearse after each nlp.update.
     stdout (file): A file-like object to write output messages. To disable
         printing, set to io.StringIO.
     stderr (file): A second file-like object to write output messages. To disable
@@ -54,7 +56,10 @@ def train(
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     dot_names = [T["train_corpus"], T["dev_corpus"]]
     train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
-    optimizer = T["optimizer"]
+    if use_rehearse:
+        optimizer = nlp.resume_training()
+    else:
+        optimizer = T["optimizer"]
     score_weights = T["score_weights"]
     batcher = T["batcher"]
     train_logger = T["logger"]
@@ -88,6 +93,7 @@ def train(
         patience=T["patience"],
         max_steps=T["max_steps"],
         eval_frequency=T["eval_frequency"],
+        use_rehearse=use_rehearse,
         exclude=frozen_components,
         annotating_components=annotating_components,
         before_update=before_update,
@@ -150,6 +156,7 @@ def train_while_improving(
     accumulate_gradient: int,
     patience: int,
     max_steps: int,
+    use_rehearse: bool = False,
     exclude: List[str],
     annotating_components: List[str],
     before_update: Optional[Callable[["Language", Dict[str, Any]], None]],
@@ -214,6 +221,12 @@ def train_while_improving(
                 exclude=exclude,
                 annotates=annotating_components,
             )
+            nlp.rehearse(
+                subbatch,
+                losses=losses,
+                sgd=False,  # type: ignore[arg-type]
+                exclude=exclude,
+            )
         # TODO: refactor this so we don't have to run it separately in here
         for name, proc in nlp.pipeline:
             if (
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index ca4023101..5e4aa8af6 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -11,6 +11,7 @@ menu:
   - ['debug', 'debug']
   - ['train', 'train']
   - ['pretrain', 'pretrain']
+  - ['rehearse', 'rehearse']
   - ['evaluate', 'evaluate']
   - ['benchmark', 'benchmark']
   - ['apply', 'apply']
@@ -1134,6 +1135,39 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [
 | overrides   | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
 | **CREATES** | The pretrained weights that can be used to initialize `spacy train`. |
 
+## rehearse {id="rehearse",tag="command, experimental"}
+
+This command fine-tunes an already trained pipeline while trying to address the "catastrophic forgetting" problem.
+It uses "rehearsal" updates that teach the current model to make predictions similar to an initial model. This feature is experimental.
+
+<Infobox variant="warning">
+
+The `rehearse` command reports the sum of the losses from `TrainablePipe.update` and `TrainablePipe.rehearse`.
+This can cause the reported loss to increase sharply even while the scores improve, which usually means the current model's predictions are diverging further from those of the initial model.
+
+</Infobox>
+
+> #### Example
+>
+> ```bash
+> $ python -m spacy rehearse config.cfg --output ./output --paths.train ./train --paths.dev ./dev
+> ```
+
+```bash
+$ python -m spacy rehearse [config_path] [--output] [--code] [--verbose] [--gpu-id] [overrides]
+```
+
+| Name              | Description |
+| ----------------- | ----------- |
+| `config_path`     | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
+| `--output`, `-o`  | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~ |
+| `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
+| `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
+| `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~ |
+| overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
+| **CREATES**       | The final rehearsed pipeline and the best rehearsed pipeline. |
+
 ## evaluate {id="evaluate",version="2",tag="command"}
 
 The `evaluate` subcommand is superseded by
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index 93ddd79a2..45f64e346 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -346,7 +346,7 @@ and custom registered functions if needed. See the
 
 Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
 current model to make predictions similar to an initial model, to try to address
-the "catastrophic forgetting" problem. This feature is experimental.
+the "catastrophic forgetting" problem. Note that this method needs to be used together with `Language.update`. This feature is experimental.
 
 > #### Example
 >
diff --git a/website/docs/api/pipe.mdx b/website/docs/api/pipe.mdx
index c2777edf0..8004d4d76 100644
--- a/website/docs/api/pipe.mdx
+++ b/website/docs/api/pipe.mdx
@@ -244,7 +244,7 @@ predictions and gold-standard annotations, and update the component's model.
 
 Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
 current model to make predictions similar to an initial model, to try to address
-the "catastrophic forgetting" problem. This feature is experimental.
+the "catastrophic forgetting" problem. Note that this method needs to be used together with `TrainablePipe.update`. This feature is experimental.
 
 > #### Example
 >
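
For reference, the sketch below shows how rehearsal is meant to be combined with regular updates in a user-level training loop, mirroring what the patched `train_while_improving` does (an `nlp.update` step followed by an `nlp.rehearse` step). It is a minimal illustration and not part of the patch: the `en_core_web_sm` package, the single toy example and the choice to update only the NER are placeholders, and `resume_training` is what keeps the copy of the initial model that the rehearsal updates imitate.

```python
# Minimal usage sketch (illustration only, not part of this patch).
# Assumes an already trained pipeline; the package name and example data are placeholders.
import spacy
from spacy.training import Example

nlp = spacy.load("en_core_web_sm")

# Keep a copy of the current ("initial") model that rehearsal will imitate,
# and get an optimizer for the following updates.
optimizer = nlp.resume_training()

examples = [
    Example.from_dict(
        nlp.make_doc("I like London."), {"entities": [(7, 13, "GPE")]}
    )
]

# Only update the NER (and its tok2vec), since the toy example carries no
# tag or parse annotations.
with nlp.select_pipes(enable=["tok2vec", "ner"]):
    for epoch in range(5):
        losses = {}
        # Regular update on the new annotations ...
        nlp.update(examples, sgd=optimizer, losses=losses)
        # ... followed by a rehearsal update that pulls predictions back
        # towards the initial model, to reduce catastrophic forgetting.
        nlp.rehearse(examples, sgd=optimizer, losses=losses)
        print(epoch, losses)
```

Because both calls write into the same `losses` dict, the values printed per component are the summed `update` and `rehearse` losses described in the `cli.mdx` note above.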