2020-09-28 16:09:59 +03:00
|
|
|
from typing import Optional
|
2020-01-29 19:06:46 +03:00
|
|
|
from pathlib import Path
|
2020-02-27 20:42:27 +03:00
|
|
|
from wasabi import msg
|
2020-09-28 16:09:59 +03:00
|
|
|
from thinc.api import Config
|
2020-07-10 18:57:40 +03:00
|
|
|
import typer
|
2020-08-14 16:00:52 +03:00
|
|
|
import logging
|
2020-01-29 19:06:46 +03:00
|
|
|
|
2020-07-11 00:34:17 +03:00
|
|
|
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
2020-09-28 22:17:10 +03:00
|
|
|
from ._util import import_code, setup_gpu
|
2020-07-22 14:42:59 +03:00
|
|
|
from ..language import Language
|
2020-09-28 16:09:59 +03:00
|
|
|
from ..training.loop import train
|
|
|
|
from ..training.initialize import init_nlp, must_reinitialize
|
2020-02-27 20:42:27 +03:00
|
|
|
from .. import util
|
2020-07-10 14:31:27 +03:00
|
|
|
|
2020-06-21 14:44:00 +03:00
|
|
|
|
2020-07-10 18:57:40 +03:00
|
|
|
@app.command(
    "train", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
)
def train_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True),
    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
):
    """
    Train or update a spaCy pipeline. Requires data in spaCy's binary format. To
    convert data from other formats, use the `spacy convert` command. The
    config file includes all settings and hyperparameters used during training.
    To override settings in the config, e.g. settings that point to local
    paths or that you want to experiment with, you can override them as
    command line options. For instance, --training.batch_size 128 overrides
    the value of "batch_size" in the block "[training]". The --code argument
    lets you pass in a Python file that's imported before training. It can be
    used to register custom functions and architectures that can then be
    referenced in the config.

    DOCS: https://nightly.spacy.io/api/cli#train
    """
    # Debug-level logging only when --verbose is passed; otherwise errors only.
    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
    verify_cli_args(config_path, output_path)
    # The command accepts unknown options (see context_settings above); the
    # leftover arguments are handed over to be parsed as config overrides.
    overrides = parse_config_overrides(ctx.args)
    # The user code is imported before the config is loaded, so that anything
    # it registers can be referenced from the config (see docstring).
    import_code(code_path)
    setup_gpu(use_gpu)
    # Config loading runs inside show_validation_error, keyed on the config path.
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=overrides, interpolate=False)
    msg.divider("Initializing pipeline")
    nlp = init_pipeline(config, output_path, use_gpu=use_gpu)
    msg.divider("Training pipeline")
    train(nlp, output_path, use_gpu=use_gpu, silent=False)
|
2020-09-28 12:06:07 +03:00
|
|
|
|
|
|
|
|
2020-09-28 16:09:59 +03:00
|
|
|
def init_pipeline(
    config: Config, output_path: Optional[Path], *, use_gpu: int = -1
) -> Language:
    """Return an initialized nlp object, reusing a saved one when possible.

    config: The config to initialize the pipeline from.
    output_path: Output directory, or None. When given, the initialized
        pipeline is stored in (and on later runs loaded from) the
        "model-initial" subdirectory.
    use_gpu: Forwarded to init_nlp as the "use_gpu" keyword argument.
    RETURNS: The initialized (or loaded) nlp object.
    """
    init_kwargs = {"use_gpu": use_gpu, "silent": False}

    # No output directory: nothing can be saved or reused, so initialize from
    # scratch and tell the user what they are missing out on.
    if output_path is None:
        msg.warn(
            "Not saving initialized model: no output directory specified. "
            "To speed up training, spaCy can save the initialized nlp object with "
            "the vocabulary, vectors and label scheme. To take advantage of this, "
            "provide an output directory."
        )
        return init_nlp(config, **init_kwargs)

    init_path = output_path / "model-initial"
    if init_path.exists():
        nlp = util.load_model(init_path)
        if not must_reinitialize(config, nlp.config):
            # The saved pipeline is still usable as-is.
            msg.good(f"Loaded initialized pipeline from {init_path}")
            return nlp
        msg.warn("Config has changed: need to re-initialize pipeline")
        done_message = f"Re-initialized pipeline in {init_path}"
    else:
        msg.info(f"Initializing the pipeline in {init_path}")
        done_message = f"Saved initialized pipeline to {init_path}"

    # Shared tail for "first initialization" and "re-initialization": build
    # the pipeline, write it to disk and report what happened.
    nlp = init_nlp(config, **init_kwargs)
    nlp.to_disk(init_path)
    msg.good(done_message)
    return nlp
|
2020-06-26 20:34:12 +03:00
|
|
|
|
|
|
|
|
2020-08-26 16:24:33 +03:00
|
|
|
def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
    """Check the paths passed to the CLI and prepare the output directory.

    config_path: Path to the config file. If it is falsy or does not exist,
        the failure is reported via msg.fail(..., exits=1).
    output_path: Optional output directory. If given and missing, it is
        created together with any missing parent directories.
    """
    # Make sure all files and paths exist if they are needed
    if not config_path or not config_path.exists():
        msg.fail("Config file not found", config_path, exits=1)
    if output_path is not None:
        if not output_path.exists():
            # parents=True so that nested output paths such as
            # "runs/exp1/model" work even if "runs/exp1" does not exist yet.
            output_path.mkdir(parents=True)
            msg.good(f"Created output directory: {output_path}")
|