mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Merge pull request #6134 from explosion/feature/training_before_to_disk
This commit is contained in:
commit
74e1f192b4
|
@ -97,6 +97,7 @@ def train(
|
||||||
dev_corpus = dot_to_object(config, T_cfg["dev_corpus"])
|
dev_corpus = dot_to_object(config, T_cfg["dev_corpus"])
|
||||||
batcher = T_cfg["batcher"]
|
batcher = T_cfg["batcher"]
|
||||||
train_logger = T_cfg["logger"]
|
train_logger = T_cfg["logger"]
|
||||||
|
before_to_disk = create_before_to_disk_callback(T_cfg["before_to_disk"])
|
||||||
# Components that shouldn't be updated during training
|
# Components that shouldn't be updated during training
|
||||||
frozen_components = T_cfg["frozen_components"]
|
frozen_components = T_cfg["frozen_components"]
|
||||||
# Sourced components that require resume_training
|
# Sourced components that require resume_training
|
||||||
|
@ -167,6 +168,7 @@ def train(
|
||||||
with nlp.select_pipes(disable=frozen_components):
|
with nlp.select_pipes(disable=frozen_components):
|
||||||
update_meta(T_cfg, nlp, info)
|
update_meta(T_cfg, nlp, info)
|
||||||
with nlp.use_params(optimizer.averages):
|
with nlp.use_params(optimizer.averages):
|
||||||
|
nlp = before_to_disk(nlp)
|
||||||
nlp.to_disk(output_path / "model-best")
|
nlp.to_disk(output_path / "model-best")
|
||||||
progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
|
progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
|
||||||
progress.set_description(f"Epoch {info['epoch']}")
|
progress.set_description(f"Epoch {info['epoch']}")
|
||||||
|
@ -179,6 +181,7 @@ def train(
|
||||||
f"Aborting and saving the final best model. "
|
f"Aborting and saving the final best model. "
|
||||||
f"Encountered exception: {str(e)}"
|
f"Encountered exception: {str(e)}"
|
||||||
)
|
)
|
||||||
|
nlp = before_to_disk(nlp)
|
||||||
nlp.to_disk(output_path / "model-final")
|
nlp.to_disk(output_path / "model-final")
|
||||||
raise e
|
raise e
|
||||||
finally:
|
finally:
|
||||||
|
@ -233,6 +236,21 @@ def create_evaluation_callback(
|
||||||
return evaluate
|
return evaluate
|
||||||
|
|
||||||
|
|
||||||
|
def create_before_to_disk_callback(
|
||||||
|
callback: Optional[Callable[[Language], Language]]
|
||||||
|
) -> Callable[[Language], Language]:
|
||||||
|
def before_to_disk(nlp: Language) -> Language:
|
||||||
|
if not callback:
|
||||||
|
return nlp
|
||||||
|
modified_nlp = callback(nlp)
|
||||||
|
if not isinstance(modified_nlp, Language):
|
||||||
|
err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp))
|
||||||
|
raise ValueError(err)
|
||||||
|
return modified_nlp
|
||||||
|
|
||||||
|
return before_to_disk
|
||||||
|
|
||||||
|
|
||||||
def train_while_improving(
|
def train_while_improving(
|
||||||
nlp: Language,
|
nlp: Language,
|
||||||
optimizer: Optimizer,
|
optimizer: Optimizer,
|
||||||
|
|
|
@ -72,6 +72,8 @@ frozen_components = []
|
||||||
dev_corpus = "corpora.dev"
|
dev_corpus = "corpora.dev"
|
||||||
# Location in the config where the train corpus is defined
|
# Location in the config where the train corpus is defined
|
||||||
train_corpus = "corpora.train"
|
train_corpus = "corpora.train"
|
||||||
|
# Optional callback before nlp object is saved to disk after training
|
||||||
|
before_to_disk = null
|
||||||
|
|
||||||
[training.logger]
|
[training.logger]
|
||||||
@loggers = "spacy.ConsoleLogger.v1"
|
@loggers = "spacy.ConsoleLogger.v1"
|
||||||
|
|
|
@ -480,6 +480,9 @@ class Errors:
|
||||||
E201 = ("Span index out of range.")
|
E201 = ("Span index out of range.")
|
||||||
|
|
||||||
# TODO: fix numbering after merging develop into master
|
# TODO: fix numbering after merging develop into master
|
||||||
|
E914 = ("Executing {name} callback failed. Expected the function to "
|
||||||
|
"return the nlp object but got: {value}. Maybe you forgot to return "
|
||||||
|
"the modified object in your function?")
|
||||||
E915 = ("Can't use score '{name}' to calculate final weighted score. Expected "
|
E915 = ("Can't use score '{name}' to calculate final weighted score. Expected "
|
||||||
"float or int but got: {score_type}. To exclude the score from the "
|
"float or int but got: {score_type}. To exclude the score from the "
|
||||||
"final score, set its weight to null in the [training.score_weights] "
|
"final score, set its weight to null in the [training.score_weights] "
|
||||||
|
|
|
@ -216,6 +216,7 @@ class ConfigSchemaTraining(BaseModel):
|
||||||
optimizer: Optimizer = Field(..., title="The optimizer to use")
|
optimizer: Optimizer = Field(..., title="The optimizer to use")
|
||||||
logger: Logger = Field(..., title="The logger to track training progress")
|
logger: Logger = Field(..., title="The logger to track training progress")
|
||||||
frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
|
frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
|
||||||
|
before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
|
|
|
@ -180,26 +180,27 @@ single corpus once and then divide it up into `train` and `dev` partitions.
|
||||||
This section defines settings and controls for the training and evaluation
|
This section defines settings and controls for the training and evaluation
|
||||||
process that are used when you run [`spacy train`](/api/cli#train).
|
process that are used when you run [`spacy train`](/api/cli#train).
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
|
| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
|
||||||
| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
|
| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
|
||||||
| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
|
| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
|
||||||
| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
|
| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
|
||||||
| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
|
| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
|
||||||
| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
|
| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
|
||||||
| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ |
|
| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
|
||||||
| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ |
|
| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ |
|
||||||
| `lookups` | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~ |
|
| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ |
|
||||||
| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ |
|
| `lookups` | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~ |
|
||||||
| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ |
|
| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ |
|
||||||
| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
|
| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ |
|
||||||
| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ |
|
| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
|
||||||
| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ |
|
| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ |
|
||||||
| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
|
| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ |
|
||||||
| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
|
| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
|
||||||
| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ |
|
| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
|
||||||
| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ |
|
| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ |
|
||||||
|
| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ |
|
||||||
|
|
||||||
### pretraining {#config-pretraining tag="section,optional"}
|
### pretraining {#config-pretraining tag="section,optional"}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user