diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 7b11217c5..aa671646e 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -22,8 +22,7 @@ def train_cli( output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"), code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), - use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), - use_rehearse: bool = Opt(False, "--use_rehearse", "-r", help="Perform 'rehearsal updates' on a pre-trained model") + use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") # fmt: on ): """ @@ -43,13 +42,7 @@ def train_cli( util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) overrides = parse_config_overrides(ctx.args) import_code(code_path) - train( - config_path, - output_path, - use_gpu=use_gpu, - overrides=overrides, - use_rehearse=use_rehearse, - ) + train(config_path, output_path, use_gpu=use_gpu, overrides=overrides) def train( @@ -88,5 +81,4 @@ def train( use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr, - use_rehearse=use_rehearse, ) diff --git a/spacy/language.py b/spacy/language.py index 5977ed17f..133c9aa93 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1183,6 +1183,7 @@ class Language: losses: Optional[Dict[str, float]] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, exclude: Iterable[str] = SimpleFrozenList(), + rehearse_components: List[str] = [], ) -> Dict[str, float]: """Make a "rehearsal" update to the models in the pipeline, to prevent forgetting. Rehearsal updates run an initial copy of the model over some @@ -1195,6 +1196,7 @@ class Language: component_cfg (Dict[str, Dict]): Config parameters for specific pipeline components, keyed by component name. 
exclude (Iterable[str]): Names of components that shouldn't be updated. + rehearse_components (List[str]): Names of components that should be rehearsed. RETURNS (dict): Results from the update. EXAMPLE: @@ -1216,7 +1218,11 @@ class Language: component_cfg = {} for name, proc in pipes: - if name in exclude or not hasattr(proc, "rehearse"): + if ( + name in exclude + or not hasattr(proc, "rehearse") + or name not in rehearse_components + ): continue proc.rehearse( # type: ignore[attr-defined] examples, sgd=None, losses=losses, **component_cfg.get(name, {}) diff --git a/spacy/schemas.py b/spacy/schemas.py index 140592dcd..cab909e21 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -356,6 +356,7 @@ class ConfigSchemaTraining(BaseModel): logger: Logger = Field(..., title="The logger to track training progress") frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training") annotating_components: List[str] = Field(..., title="Pipeline components that should set annotations during training") + rehearse_components: List[str] = Field(..., title="Pipeline components that should be rehearsed during training") before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk") before_update: Optional[Callable[["Language", Dict[str, Any]], None]] = Field(..., title="Optional callback that is invoked at the start of each training step") # fmt: on diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 6304e4a84..2315c1140 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -60,6 +60,8 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": optimizer = T["optimizer"] # Components that shouldn't be updated during training frozen_components = T["frozen_components"] + # Components that should be rehearsed during training + rehearse_components = T["rehearse_components"] # 
Sourced components that require resume_training resume_components = [p for p in sourced if p not in frozen_components] logger.info(f"Pipeline: {nlp.pipe_names}") @@ -67,6 +69,8 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": with nlp.select_pipes(enable=resume_components): logger.info(f"Resuming training for: {resume_components}") nlp.resume_training(sgd=optimizer) + if rehearse_components: + logger.info(f"Rehearsing components: {rehearse_components}") # Make sure that listeners are defined before initializing further nlp._link_components() with nlp.select_pipes(disable=[*frozen_components, *resume_components]): diff --git a/spacy/training/loop.py b/spacy/training/loop.py index d7c3504fd..b624d9be1 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -26,7 +26,6 @@ def train( output_path: Optional[Path] = None, *, use_gpu: int = -1, - use_rehearse: bool = False, stdout: IO = sys.stdout, stderr: IO = sys.stderr, ) -> Tuple["Language", Optional[Path]]: @@ -36,7 +35,6 @@ def train( output_path (Optional[Path]): Optional output path to save trained model to. use_gpu (int): Whether to train on GPU. Make sure to call require_gpu before calling this function. - use_rehearse (bool): Use nlp.rehearse after nlp.update stdout (file): A file-like object to write output messages. To disable printing, set to io.StringIO. stderr (file): A second file-like object to write output messages. 
To disable @@ -56,10 +54,7 @@ def train( T = registry.resolve(config["training"], schema=ConfigSchemaTraining) dot_names = [T["train_corpus"], T["dev_corpus"]] train_corpus, dev_corpus = resolve_dot_names(config, dot_names) - if use_rehearse: - optimizer = nlp.resume_training() - else: - optimizer = T["optimizer"] + optimizer = T["optimizer"] score_weights = T["score_weights"] batcher = T["batcher"] train_logger = T["logger"] @@ -82,6 +77,8 @@ def train( frozen_components = T["frozen_components"] # Components that should set annotations on update annotating_components = T["annotating_components"] + # Components that should be rehearsed after update + rehearse_components = T["rehearse_components"] # Create iterator, which yields out info after each optimization step. training_step_iterator = train_while_improving( nlp, @@ -93,9 +90,9 @@ def train( patience=T["patience"], max_steps=T["max_steps"], eval_frequency=T["eval_frequency"], - use_rehearse=use_rehearse, exclude=frozen_components, annotating_components=annotating_components, + rehearse_components=rehearse_components, before_update=before_update, ) clean_output_dir(output_path) @@ -156,9 +153,9 @@ def train_while_improving( accumulate_gradient: int, patience: int, max_steps: int, - use_rehearse: bool = False, exclude: List[str], annotating_components: List[str], + rehearse_components: List[str], before_update: Optional[Callable[["Language", Dict[str, Any]], None]], ): """Train until an evaluation stops improving. 
Works as a generator, @@ -217,17 +214,17 @@ def train_while_improving( subbatch, drop=dropout, losses=losses, - sgd=False, # type: ignore[arg-type] + sgd=None, # type: ignore[arg-type] exclude=exclude, annotates=annotating_components, ) - if use_rehearse: - nlp.rehearse( - subbatch, - losses=losses, - sgd=False, # type: ignore[arg-type] - exclude=exclude, - ) + nlp.rehearse( + subbatch, + losses=losses, + sgd=None, # type: ignore[arg-type] + exclude=exclude, + rehearse_components=rehearse_components, + ) # TODO: refactor this so we don't have to run it separately in here for name, proc in nlp.pipeline: if (