mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
b4e457d9fe
* Add support for multiple code files to all relevant commands Prior to this, only the package command supported multiple code files. * Update docs * Add debug data test, plus generic fixtures One tricky thing here: it's tempting to create the config by creating a pipeline in code, but that requires declaring the custom components here. However the CliRunner appears to be run in the same process or otherwise have access to our registry, so it works even without any code arguments. So it's necessary to avoid declaring the components in the tests. * Add debug config test and restructure The code argument imports the provided file. If it adds item to the registry, that affects global state, which CliRunner doesn't isolate. Since there's no standard way to remove things from the registry, this instead uses subprocess.run to run commands. * Use a more generic, parametrized test * Add output arg for assemble and pretrain Assemble and pretrain require an output argument. This commit adds assemble testing, but not pretrain, as that requires an actual trainable component, which is not currently in the test config. * Add evaluate test and some cleanup * Mark tests as slow * Revert argument name change * Apply suggestions from code review Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * Format API CLI docs * isort * Fix imports in tests * isort * Undo changes to package CLI help * Fix python executable and lang code in test * Fix executable in another test --------- Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
126 lines
5.2 KiB
Python
126 lines
5.2 KiB
Python
import re
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
import typer
|
|
from wasabi import msg
|
|
|
|
from ..training.pretrain import pretrain
|
|
from ..util import load_config
|
|
from ._util import (
|
|
Arg,
|
|
Opt,
|
|
app,
|
|
import_code_paths,
|
|
parse_config_overrides,
|
|
setup_gpu,
|
|
show_validation_error,
|
|
)
|
|
|
|
|
|
@app.command(
|
|
"pretrain",
|
|
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
|
)
|
|
def pretrain_cli(
|
|
# fmt: off
|
|
ctx: typer.Context, # This is only used to read additional arguments
|
|
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True),
|
|
output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
|
|
code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
|
|
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
|
|
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
|
|
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
|
skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"),
|
|
# fmt: on
|
|
):
|
|
"""
|
|
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
|
|
using an approximate language-modelling objective. Two objective types
|
|
are available, vector-based and character-based.
|
|
|
|
In the vector-based objective, we load word vectors that have been trained
|
|
using a word2vec-style distributional similarity algorithm, and train a
|
|
component like a CNN, BiLSTM, etc to predict vectors which match the
|
|
pretrained ones. The weights are saved to a directory after each epoch. You
|
|
can then pass a path to one of these pretrained weights files to the
|
|
'spacy train' command.
|
|
|
|
This technique may be especially helpful if you have little labelled data.
|
|
However, it's still quite experimental, so your mileage may vary.
|
|
|
|
To load the weights back in during 'spacy train', you need to ensure
|
|
all settings are the same between pretraining and training. Ideally,
|
|
this is done by using the same config file for both commands.
|
|
|
|
DOCS: https://spacy.io/api/cli#pretrain
|
|
"""
|
|
config_overrides = parse_config_overrides(ctx.args)
|
|
import_code_paths(code_path)
|
|
verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
|
|
setup_gpu(use_gpu)
|
|
msg.info(f"Loading config from: {config_path}")
|
|
|
|
with show_validation_error(config_path):
|
|
raw_config = load_config(
|
|
config_path, overrides=config_overrides, interpolate=False
|
|
)
|
|
config = raw_config.interpolate()
|
|
if not config.get("pretraining"):
|
|
# TODO: What's the solution here? How do we handle optional blocks?
|
|
msg.fail("The [pretraining] block in your config is empty", exits=1)
|
|
if not output_dir.exists():
|
|
output_dir.mkdir(parents=True)
|
|
msg.good(f"Created output directory: {output_dir}")
|
|
# Save non-interpolated config
|
|
raw_config.to_disk(output_dir / "config.cfg")
|
|
msg.good("Saved config file in the output directory")
|
|
|
|
pretrain(
|
|
config,
|
|
output_dir,
|
|
resume_path=resume_path,
|
|
epoch_resume=epoch_resume,
|
|
use_gpu=use_gpu,
|
|
silent=False,
|
|
skip_last=skip_last,
|
|
)
|
|
msg.good("Successfully finished pretrain")
|
|
|
|
|
|
def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
|
|
if not config_path or (str(config_path) != "-" and not config_path.exists()):
|
|
msg.fail("Config file not found", config_path, exits=1)
|
|
if output_dir.exists() and [p for p in output_dir.iterdir()]:
|
|
if resume_path:
|
|
msg.warn(
|
|
"Output directory is not empty.",
|
|
"If you're resuming a run in this directory, the old weights "
|
|
"for the consecutive epochs will be overwritten with the new ones.",
|
|
)
|
|
else:
|
|
msg.warn(
|
|
"Output directory is not empty. ",
|
|
"It is better to use an empty directory or refer to a new output path, "
|
|
"then the new directory will be created for you.",
|
|
)
|
|
if resume_path is not None:
|
|
if resume_path.is_dir():
|
|
# This is necessary because Windows gives a Permission Denied when we
|
|
# try to open the directory later, which is confusing. See #7878
|
|
msg.fail(
|
|
"--resume-path should be a weights file, but {resume_path} is a directory.",
|
|
exits=True,
|
|
)
|
|
model_name = re.search(r"model\d+\.bin", str(resume_path))
|
|
if not model_name and not epoch_resume:
|
|
msg.fail(
|
|
"You have to use the --epoch-resume setting when using a renamed weight file for --resume-path",
|
|
exits=True,
|
|
)
|
|
elif not model_name and epoch_resume < 0:
|
|
msg.fail(
|
|
f"The argument --epoch-resume has to be greater or equal to 0. {epoch_resume} is invalid",
|
|
exits=True,
|
|
)
|