diff --git a/spacy/cli/train.py b/spacy/cli/train.py index eabc82be0..6d61c2425 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -97,6 +97,7 @@ def train( dev_corpus = dot_to_object(config, T_cfg["dev_corpus"]) batcher = T_cfg["batcher"] train_logger = T_cfg["logger"] + before_to_disk = create_before_to_disk_callback(T_cfg["before_to_disk"]) # Components that shouldn't be updated during training frozen_components = T_cfg["frozen_components"] # Sourced components that require resume_training @@ -167,6 +168,7 @@ def train( with nlp.select_pipes(disable=frozen_components): update_meta(T_cfg, nlp, info) with nlp.use_params(optimizer.averages): + nlp = before_to_disk(nlp) nlp.to_disk(output_path / "model-best") progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False) progress.set_description(f"Epoch {info['epoch']}") @@ -179,6 +181,7 @@ def train( f"Aborting and saving the final best model. " f"Encountered exception: {str(e)}" ) + nlp = before_to_disk(nlp) nlp.to_disk(output_path / "model-final") raise e finally: @@ -233,6 +236,21 @@ def create_evaluation_callback( return evaluate +def create_before_to_disk_callback( + callback: Optional[Callable[[Language], Language]] +) -> Callable[[Language], Language]: + def before_to_disk(nlp: Language) -> Language: + if not callback: + return nlp + modified_nlp = callback(nlp) + if not isinstance(modified_nlp, Language): + err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp)) + raise ValueError(err) + return modified_nlp + + return before_to_disk + + def train_while_improving( nlp: Language, optimizer: Optimizer, diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 5cd97a0eb..6f8c0aa00 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -72,6 +72,8 @@ frozen_components = [] dev_corpus = "corpora.dev" # Location in the config where the train corpus is defined train_corpus = "corpora.train" +# Optional callback before nlp object is saved to disk after training +before_to_disk = null [training.logger] @loggers = "spacy.ConsoleLogger.v1" diff --git a/spacy/errors.py b/spacy/errors.py index dce5cf51c..708b7fda8 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -480,6 +480,9 @@ class Errors: E201 = ("Span index out of range.") # TODO: fix numbering after merging develop into master + E914 = ("Executing {name} callback failed. Expected the function to " + "return the nlp object but got: {value}. Maybe you forgot to return " + "the modified object in your function?") E915 = ("Can't use score '{name}' to calculate final weighted score. Expected " "float or int but got: {score_type}. To exclude the score from the " "final score, set its weight to null in the [training.score_weights] " diff --git a/spacy/schemas.py b/spacy/schemas.py index 1ff73bccc..eea6639d3 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -216,6 +216,7 @@ class ConfigSchemaTraining(BaseModel): optimizer: Optimizer = Field(..., title="The optimizer to use") logger: Logger = Field(..., title="The logger to track training progress") frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training") + before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk") # fmt: on class Config: diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 0fc3481a4..420c09237 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -180,26 +180,27 @@ single corpus once and then divide it up into `train` and `dev` partitions. This section defines settings and controls for the training and evaluation process that are used when you run [`spacy train`](/api/cli#train). -| Name | Description | -| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | -| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | -| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | -| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | -| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | -| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ | -| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | -| `lookups` | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~ | -| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | -| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | -| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | -| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | -| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | -| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | -| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | -| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | +| Name | Description | +| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | +| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | +| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | +| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | +| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | +| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | +| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | +| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ | +| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | +| `lookups` | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~ | +| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | +| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | +| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | +| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | +| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | +| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | +| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | +| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | ### pretraining {#config-pretraining tag="section,optional"}