mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
Add model-last saving mechanism to pretraining (#12459)
* Adjust pretrain command * change naming and add finally block * Add unit test * Add unit test assertions * Update spacy/training/pretrain.py Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * change finally block * Add to docs * Update website/docs/usage/embeddings-transformers.mdx * Add flag to skip saving model-last --------- Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
parent
4a1ec332de
commit
de32011e4c
|
@ -23,6 +23,7 @@ def pretrain_cli(
|
||||||
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
|
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
|
||||||
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
|
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
|
||||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
||||||
|
skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
@ -74,6 +75,7 @@ def pretrain_cli(
|
||||||
epoch_resume=epoch_resume,
|
epoch_resume=epoch_resume,
|
||||||
use_gpu=use_gpu,
|
use_gpu=use_gpu,
|
||||||
silent=False,
|
silent=False,
|
||||||
|
skip_last=skip_last,
|
||||||
)
|
)
|
||||||
msg.good("Successfully finished pretrain")
|
msg.good("Successfully finished pretrain")
|
||||||
|
|
||||||
|
|
|
@ -165,7 +165,8 @@ def test_pretraining_default():
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("objective", CHAR_OBJECTIVES)
|
@pytest.mark.parametrize("objective", CHAR_OBJECTIVES)
|
||||||
def test_pretraining_tok2vec_characters(objective):
|
@pytest.mark.parametrize("skip_last", (True, False))
|
||||||
|
def test_pretraining_tok2vec_characters(objective, skip_last):
|
||||||
"""Test that pretraining works with the character objective"""
|
"""Test that pretraining works with the character objective"""
|
||||||
config = Config().from_str(pretrain_string_listener)
|
config = Config().from_str(pretrain_string_listener)
|
||||||
config["pretraining"]["objective"] = objective
|
config["pretraining"]["objective"] = objective
|
||||||
|
@ -178,10 +179,14 @@ def test_pretraining_tok2vec_characters(objective):
|
||||||
filled["paths"]["raw_text"] = file_path
|
filled["paths"]["raw_text"] = file_path
|
||||||
filled = filled.interpolate()
|
filled = filled.interpolate()
|
||||||
assert filled["pretraining"]["component"] == "tok2vec"
|
assert filled["pretraining"]["component"] == "tok2vec"
|
||||||
pretrain(filled, tmp_dir)
|
pretrain(filled, tmp_dir, skip_last=skip_last)
|
||||||
assert Path(tmp_dir / "model0.bin").exists()
|
assert Path(tmp_dir / "model0.bin").exists()
|
||||||
assert Path(tmp_dir / "model4.bin").exists()
|
assert Path(tmp_dir / "model4.bin").exists()
|
||||||
assert not Path(tmp_dir / "model5.bin").exists()
|
assert not Path(tmp_dir / "model5.bin").exists()
|
||||||
|
if skip_last:
|
||||||
|
assert not Path(tmp_dir / "model-last.bin").exists()
|
||||||
|
else:
|
||||||
|
assert Path(tmp_dir / "model-last.bin").exists()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("objective", VECTOR_OBJECTIVES)
|
@pytest.mark.parametrize("objective", VECTOR_OBJECTIVES)
|
||||||
|
@ -237,6 +242,7 @@ def test_pretraining_tagger_tok2vec(config):
|
||||||
pretrain(filled, tmp_dir)
|
pretrain(filled, tmp_dir)
|
||||||
assert Path(tmp_dir / "model0.bin").exists()
|
assert Path(tmp_dir / "model0.bin").exists()
|
||||||
assert Path(tmp_dir / "model4.bin").exists()
|
assert Path(tmp_dir / "model4.bin").exists()
|
||||||
|
assert Path(tmp_dir / "model-last.bin").exists()
|
||||||
assert not Path(tmp_dir / "model5.bin").exists()
|
assert not Path(tmp_dir / "model5.bin").exists()
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -24,6 +24,7 @@ def pretrain(
|
||||||
epoch_resume: Optional[int] = None,
|
epoch_resume: Optional[int] = None,
|
||||||
use_gpu: int = -1,
|
use_gpu: int = -1,
|
||||||
silent: bool = True,
|
silent: bool = True,
|
||||||
|
skip_last: bool = False,
|
||||||
):
|
):
|
||||||
msg = Printer(no_print=silent)
|
msg = Printer(no_print=silent)
|
||||||
if config["training"]["seed"] is not None:
|
if config["training"]["seed"] is not None:
|
||||||
|
@ -60,10 +61,14 @@ def pretrain(
|
||||||
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
|
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
|
||||||
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
|
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
|
||||||
|
|
||||||
def _save_model(epoch, is_temp=False):
|
def _save_model(epoch, is_temp=False, is_last=False):
|
||||||
is_temp_str = ".temp" if is_temp else ""
|
is_temp_str = ".temp" if is_temp else ""
|
||||||
with model.use_params(optimizer.averages):
|
with model.use_params(optimizer.averages):
|
||||||
with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
|
if is_last:
|
||||||
|
save_path = output_dir / f"model-last.bin"
|
||||||
|
else:
|
||||||
|
save_path = output_dir / f"model{epoch}{is_temp_str}.bin"
|
||||||
|
with (save_path).open("wb") as file_:
|
||||||
file_.write(model.get_ref("tok2vec").to_bytes())
|
file_.write(model.get_ref("tok2vec").to_bytes())
|
||||||
log = {
|
log = {
|
||||||
"nr_word": tracker.nr_word,
|
"nr_word": tracker.nr_word,
|
||||||
|
@ -76,22 +81,26 @@ def pretrain(
|
||||||
|
|
||||||
# TODO: I think we probably want this to look more like the
|
# TODO: I think we probably want this to look more like the
|
||||||
# 'create_train_batches' function?
|
# 'create_train_batches' function?
|
||||||
for epoch in range(epoch_resume, P["max_epochs"]):
|
try:
|
||||||
for batch_id, batch in enumerate(batcher(corpus(nlp))):
|
for epoch in range(epoch_resume, P["max_epochs"]):
|
||||||
docs = ensure_docs(batch)
|
for batch_id, batch in enumerate(batcher(corpus(nlp))):
|
||||||
loss = make_update(model, docs, optimizer, objective)
|
docs = ensure_docs(batch)
|
||||||
progress = tracker.update(epoch, loss, docs)
|
loss = make_update(model, docs, optimizer, objective)
|
||||||
if progress:
|
progress = tracker.update(epoch, loss, docs)
|
||||||
msg.row(progress, **row_settings)
|
if progress:
|
||||||
if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
|
msg.row(progress, **row_settings)
|
||||||
_save_model(epoch, is_temp=True)
|
if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
|
||||||
|
_save_model(epoch, is_temp=True)
|
||||||
|
|
||||||
if P["n_save_epoch"]:
|
if P["n_save_epoch"]:
|
||||||
if epoch % P["n_save_epoch"] == 0 or epoch == P["max_epochs"] - 1:
|
if epoch % P["n_save_epoch"] == 0 or epoch == P["max_epochs"] - 1:
|
||||||
|
_save_model(epoch)
|
||||||
|
else:
|
||||||
_save_model(epoch)
|
_save_model(epoch)
|
||||||
else:
|
tracker.epoch_loss = 0.0
|
||||||
_save_model(epoch)
|
finally:
|
||||||
tracker.epoch_loss = 0.0
|
if not skip_last:
|
||||||
|
_save_model(P["max_epochs"], is_last=True)
|
||||||
|
|
||||||
|
|
||||||
def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
|
def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
|
||||||
|
|
|
@ -1122,17 +1122,18 @@ auto-generated by setting `--pretraining` on
|
||||||
$ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [--gpu-id] [overrides]
|
$ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [--gpu-id] [--skip-last] [overrides]
|
||||||
```
|
```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| -------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
|
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
|
||||||
| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
|
| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
|
||||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||||
| `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
|
| `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
|
||||||
| `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ |
|
| `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ |
|
||||||
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
|
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
|
||||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
| `--skip-last`, `-L` <Tag variant="new">3.5.2</Tag> | Skip saving `model-last.bin`. Defaults to `False`. ~~bool (flag)~~ |
|
||||||
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
|
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||||
| **CREATES** | The pretrained weights that can be used to initialize `spacy train`. |
|
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
|
||||||
|
| **CREATES** | The pretrained weights that can be used to initialize `spacy train`. |
|
||||||
|
|
||||||
## evaluate {id="evaluate",version="2",tag="command"}
|
## evaluate {id="evaluate",version="2",tag="command"}
|
||||||
|
|
||||||
|
|
|
@ -746,13 +746,16 @@ this by setting `initialize.init_tok2vec` to the filename of the `.bin` file
|
||||||
that you want to use from pretraining.
|
that you want to use from pretraining.
|
||||||
|
|
||||||
A pretraining step that runs for 5 epochs with an output path of `pretrain/`, as
|
A pretraining step that runs for 5 epochs with an output path of `pretrain/`, as
|
||||||
an example, produces `pretrain/model0.bin` through `pretrain/model4.bin`. To
|
an example, produces `pretrain/model0.bin` through `pretrain/model4.bin` plus a
|
||||||
make use of the final output, you could fill in this value in your config file:
|
copy of the last iteration as `pretrain/model-last.bin`. Additionally, you can
|
||||||
|
configure `n_save_epoch` to tell pretraining in which epoch interval it should
|
||||||
|
save the current training progress. To use the final output to initialize your
|
||||||
|
`tok2vec` layer, you could fill in this value in your config file:
|
||||||
|
|
||||||
```ini {title="config.cfg"}
|
```ini {title="config.cfg"}
|
||||||
|
|
||||||
[paths]
|
[paths]
|
||||||
init_tok2vec = "pretrain/model4.bin"
|
init_tok2vec = "pretrain/model-last.bin"
|
||||||
|
|
||||||
[initialize]
|
[initialize]
|
||||||
init_tok2vec = ${paths.init_tok2vec}
|
init_tok2vec = ${paths.init_tok2vec}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user