spaCy/spacy/cli/init_pipeline.py
2020-09-28 09:47:34 +02:00

112 lines
4.5 KiB
Python

from typing import Optional, Dict, Any, Tuple, Union, Callable, List
import logging
import srsly
from pathlib import Path
from wasabi import msg
import typer
from thinc.api import Config, fix_random_seed
from .train import create_before_to_disk_callback
from .. import util
from ..util import registry
from ..schemas import ConfigSchemaTraining
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, get_sourced_components
from ..util import resolve_dot_names
@init_cli.command(
    "pipeline",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def init_pipeline_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True),
    output_path: Path = Arg(..., help="Output directory for the prepared data"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    # fmt: on
):
    # CLI wrapper around init_pipeline: load the config file (applying any
    # overrides given as extra command-line arguments), build the initialized
    # pipeline from it and write the result to output_path.
    # (Kept as comments on purpose, so the command's help output is unchanged.)
    log_level = logging.DEBUG if verbose else logging.ERROR
    util.logger.setLevel(log_level)
    # Arguments not declared above are collected in ctx.args and parsed as
    # config overrides
    config_overrides = parse_config_overrides(ctx.args)
    # The optional --code file is imported before the config is loaded
    import_code(code_path)
    loaded_config = util.load_config(config_path, overrides=config_overrides)
    # Only the pipeline initialization runs inside the validation error handler
    with show_validation_error(config_path):
        pipeline = init_pipeline(loaded_config)
    pipeline.to_disk(output_path)
def must_initialize(init_path: Path, config_path: Path, overrides: Dict) -> bool:
    """Check whether the directory at init_path has to be (re-)initialized.

    init_path (Path): Directory expected to contain a saved "config.cfg".
    config_path (Path): Path of the config file to compare against.
    overrides (Dict): Overrides applied when loading the config file.
    RETURNS (bool): True if init_path or its "config.cfg" doesn't exist, or
        if the string form of the config loaded from config_path differs from
        the string form of the saved config. False otherwise.
    """
    # Loaded up front, before any existence checks, so problems with the
    # config file surface regardless of the state of init_path
    requested_cfg = util.load_config(config_path, overrides=overrides)
    if not init_path.exists():
        return True
    saved_cfg_path = init_path / "config.cfg"
    if not saved_cfg_path.exists():
        return True
    # NOTE(review): the saved config is loaded with interpolate=True while the
    # requested config uses the loader's default – confirm both serialize
    # comparably.
    saved_cfg = util.load_config(saved_cfg_path, interpolate=True)
    return requested_cfg.to_str() != saved_cfg.to_str()
def init_pipeline(config: Config, use_gpu=-1, weights_data: Optional[bytes] = None):
    """Create an nlp object from a config and initialize its components.

    config (Config): The config to build the pipeline from. It's interpolated
        internally; the original object is what the nlp object is loaded from.
    use_gpu (int): The GPU allocator from [training.gpu_allocator] is only set
        if this value is >= 0.
    weights_data (Optional[bytes]): Serialized pretrained tok2vec weights. If
        provided, they're loaded into the component named in
        [pretraining.component] (optionally into the reference named in
        [pretraining.layer]). Defaults to None, i.e. no weights are loaded.
    RETURNS: The initialized nlp object.
    """
    raw_config = config
    config = raw_config.interpolate()
    seed = config["training"]["seed"]
    if seed is not None:
        fix_random_seed(seed)
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        # Imported here because the module-level thinc.api import doesn't
        # include this name
        from thinc.api import set_gpu_allocator

        set_gpu_allocator(allocator)
    # Use original config here before it's resolved to functions
    sourced_components = get_sourced_components(config)
    nlp = util.load_model_from_config(raw_config)
    # Resolve all training-relevant sections using the filled nlp config
    T = registry.resolve(
        config["training"],
        schema=ConfigSchemaTraining,
        validate=True,
    )
    dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]]
    # All three names are still resolved, but only the training corpus is
    # needed in this function
    train_corpus, _dev_corpus, _raw_text = resolve_dot_names(config, dot_names)
    util.load_vocab_data_into_model(nlp, lookups=T["lookups"])
    if T["vectors"] is not None:
        # NOTE(review): add_vectors is neither defined nor imported in the
        # visible part of this module – confirm where it should come from.
        add_vectors(nlp, T["vectors"])
    optimizer = T["optimizer"]
    # NOTE(review): the callback isn't used in this function. The call is kept
    # because it can't be confirmed here that creating it has no side effects.
    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
    # Components that shouldn't be updated during training
    frozen_components = T["frozen_components"]
    # Sourced components that require resume_training
    resume_components = [p for p in sourced_components if p not in frozen_components]
    msg.info(f"Pipeline: {nlp.pipe_names}")
    if resume_components:
        with nlp.select_pipes(enable=resume_components):
            msg.info(f"Resuming training for: {resume_components}")
            nlp.resume_training(sgd=optimizer)
    # Frozen and resumed components are disabled for begin_training
    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
        nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
    # Verify the config after calling 'begin_training' to ensure labels
    # are properly initialized
    # NOTE(review): verify_config is neither defined nor imported in the
    # visible part of this module – confirm where it should come from.
    verify_config(nlp)
    # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
    if weights_data is not None:
        pretrain_config = config["pretraining"]
        tok2vec_component = pretrain_config["component"]
        if tok2vec_component is None:
            msg.fail(
                f"To use pretrained tok2vec weights, [pretraining.component] "
                f"needs to specify the component that should load them.",
                exits=1,
            )
        layer = nlp.get_pipe(tok2vec_component).model
        tok2vec_layer = pretrain_config["layer"]
        if tok2vec_layer:
            # Load into a named reference of the model instead of the model
            # itself
            layer = layer.get_ref(tok2vec_layer)
        layer.from_bytes(weights_data)
        msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'")
    return nlp