spaCy/spacy/cli/pretrain.py
Paul O'Leary McCann b4e457d9fe
Accept multiple code files in all CLI commands (#12101)
* Add support for multiple code files to all relevant commands

Prior to this, only the package command supported multiple code files.

* Update docs

* Add debug data test, plus generic fixtures

One tricky thing here: it's tempting to create the config by creating a
pipeline in code, but that requires declaring the custom components
here. However the CliRunner appears to be run in the same process or
otherwise have access to our registry, so it works even without any
code arguments. So it's necessary to avoid declaring the components in
the tests.

* Add debug config test and restructure

The code argument imports the provided file. If it adds item to the
registry, that affects global state, which CliRunner doesn't isolate.
Since there's no standard way to remove things from the registry, this
instead uses subprocess.run to run commands.

* Use a more generic, parametrized test

* Add output arg for assemble and pretrain

Assemble and pretrain require an output argument. This commit adds
assemble testing, but not pretrain, as that requires an actual trainable
component, which is not currently in the test config.

* Add evaluate test and some cleanup

* Mark tests as slow

* Revert argument name change

* Apply suggestions from code review

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Format API CLI docs

* isort

* Fix imports in tests

* isort

* Undo changes to package CLI help

* Fix python executable and lang code in test

* Fix executable in another test

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
2023-08-01 15:24:02 +02:00

126 lines
5.2 KiB
Python

import re
from pathlib import Path
from typing import Optional
import typer
from wasabi import msg
from ..training.pretrain import pretrain
from ..util import load_config
from ._util import (
Arg,
Opt,
app,
import_code_paths,
parse_config_overrides,
setup_gpu,
show_validation_error,
)
@app.command(
"pretrain",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def pretrain_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True),
output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"),
# fmt: on
):
"""
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
using an approximate language-modelling objective. Two objective types
are available, vector-based and character-based.
In the vector-based objective, we load word vectors that have been trained
using a word2vec-style distributional similarity algorithm, and train a
component like a CNN, BiLSTM, etc to predict vectors which match the
pretrained ones. The weights are saved to a directory after each epoch. You
can then pass a path to one of these pretrained weights files to the
'spacy train' command.
This technique may be especially helpful if you have little labelled data.
However, it's still quite experimental, so your mileage may vary.
To load the weights back in during 'spacy train', you need to ensure
all settings are the same between pretraining and training. Ideally,
this is done by using the same config file for both commands.
DOCS: https://spacy.io/api/cli#pretrain
"""
config_overrides = parse_config_overrides(ctx.args)
import_code_paths(code_path)
verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
setup_gpu(use_gpu)
msg.info(f"Loading config from: {config_path}")
with show_validation_error(config_path):
raw_config = load_config(
config_path, overrides=config_overrides, interpolate=False
)
config = raw_config.interpolate()
if not config.get("pretraining"):
# TODO: What's the solution here? How do we handle optional blocks?
msg.fail("The [pretraining] block in your config is empty", exits=1)
if not output_dir.exists():
output_dir.mkdir(parents=True)
msg.good(f"Created output directory: {output_dir}")
# Save non-interpolated config
raw_config.to_disk(output_dir / "config.cfg")
msg.good("Saved config file in the output directory")
pretrain(
config,
output_dir,
resume_path=resume_path,
epoch_resume=epoch_resume,
use_gpu=use_gpu,
silent=False,
skip_last=skip_last,
)
msg.good("Successfully finished pretrain")
def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
if not config_path or (str(config_path) != "-" and not config_path.exists()):
msg.fail("Config file not found", config_path, exits=1)
if output_dir.exists() and [p for p in output_dir.iterdir()]:
if resume_path:
msg.warn(
"Output directory is not empty.",
"If you're resuming a run in this directory, the old weights "
"for the consecutive epochs will be overwritten with the new ones.",
)
else:
msg.warn(
"Output directory is not empty. ",
"It is better to use an empty directory or refer to a new output path, "
"then the new directory will be created for you.",
)
if resume_path is not None:
if resume_path.is_dir():
# This is necessary because Windows gives a Permission Denied when we
# try to open the directory later, which is confusing. See #7878
msg.fail(
"--resume-path should be a weights file, but {resume_path} is a directory.",
exits=True,
)
model_name = re.search(r"model\d+\.bin", str(resume_path))
if not model_name and not epoch_resume:
msg.fail(
"You have to use the --epoch-resume setting when using a renamed weight file for --resume-path",
exits=True,
)
elif not model_name and epoch_resume < 0:
msg.fail(
f"The argument --epoch-resume has to be greater or equal to 0. {epoch_resume} is invalid",
exits=True,
)