spaCy/spacy/cli/init_config.py

216 lines
8.7 KiB
Python
Raw Normal View History

2020-08-14 17:49:26 +03:00
from typing import Optional, List, Tuple
2020-08-13 18:38:30 +03:00
from enum import Enum
from pathlib import Path
2020-08-14 17:49:26 +03:00
from wasabi import Printer, diff_strings
from thinc.api import Config
2020-08-13 18:38:30 +03:00
import srsly
import re
from jinja2 import Template
2020-08-14 15:06:22 +03:00
from .. import util
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
from ..schemas import RecommendationSchema
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND, string_to_list
2020-08-13 18:38:30 +03:00
# Directory of templates located next to this module
ROOT = Path(__file__).parent / "templates"
# Jinja template that init_config renders into the base training config
TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
# Recommendations read from YAML once at import time; init_config looks up
# the entry for the requested language code
RECOMMENDATIONS = srsly.read_yaml(ROOT / "quickstart_training_recommendations.yml")
2020-08-13 18:38:30 +03:00
class Optimizations(str, Enum):
    """Optimization targets accepted by the --optimize CLI option.

    Mixing in str makes every member compare equal to its plain string value.
    """

    # init_config_cli unwraps the selected member to its string via .value
    efficiency = "efficiency"
    accuracy = "accuracy"
@init_cli.command("config")
def init_config_cli(
    # fmt: off
    output_file: Path = Arg(..., help="File to save config.cfg to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
    lang: Optional[str] = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
    pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
    optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
    cpu: bool = Opt(False, "--cpu", "-C", help="Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
    pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
    # fmt: on
):
    """
    Generate a starter config.cfg for training. Based on your requirements
    specified via the CLI arguments, this command generates a config with the
    optimal settings for your use case. This includes the choice of architecture,
    pretrained weights and related hyperparameters.

    DOCS: https://nightly.spacy.io/api/cli#init-config
    """
    # The CLI hands over an enum member; init_config expects the plain string
    optimize_name = optimize.value if isinstance(optimize, Optimizations) else optimize
    # Split the comma-separated option value into a list of component names
    components = string_to_list(pipeline)
    init_config(
        output_file,
        lang=lang,
        pipeline=components,
        optimize=optimize_name,
        cpu=cpu,
        pretraining=pretraining,
    )
2020-08-14 17:49:26 +03:00
@init_cli.command("fill-config")
def init_fill_config_cli(
    # fmt: off
    base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False),
    output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),
    pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
    diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes")
    # fmt: on
):
    """
    Fill partial config.cfg with default values. Will add all missing settings
    from the default config and will create all objects, check the registered
    functions for their default values and update the base config. This command
    can be used with a config generated via the training quickstart widget:
    https://nightly.spacy.io/usage/training#quickstart

    DOCS: https://nightly.spacy.io/api/cli#init-fill-config
    """
    # Thin CLI wrapper: fill_config does the work, its return value is unused
    fill_options = {"pretraining": pretraining, "diff": diff}
    fill_config(output_file, base_path, **fill_options)
2020-08-14 17:49:26 +03:00
def fill_config(
    output_file: Path,
    base_path: Path,
    *,
    pretraining: bool = False,
    diff: bool = False,
    silent: bool = False,
) -> Tuple[Config, Config]:
    """Auto-fill a partial config and save the completed result.

    Args:
        output_file: Destination file, or "-" to print the config to stdout.
        base_path: Path of the base config to load and fill.
        pretraining: Also merge the filled config into the default
            pretraining config.
        diff: Print a diff of the base config against the filled config.
            Ignored when output goes to stdout or ``silent`` is set.
        silent: Suppress all log messages.

    Returns:
        The base config as loaded from ``base_path`` and the filled config.
    """
    writes_to_stdout = str(output_file) == "-"
    # When the config itself goes to stdout, nothing else may be printed
    quiet = writes_to_stdout or silent
    msg = Printer(no_print=quiet)
    with show_validation_error(hint_fill=False):
        config = util.load_config(base_path)
        unvalidated_nlp = util.load_model_from_config(
            config, auto_fill=True, validate=False
        )
        # Load a second time with validation to be extra sure that the
        # produced config result is a valid config
        nlp = util.load_model_from_config(unvalidated_nlp.config)
    filled = nlp.config
    if pretraining:
        validate_config_for_pretrain(filled, msg)
        filled = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH).merge(filled)
    before = config.to_str()
    after = filled.to_str()
    show_diff = diff and not quiet
    if before == after:
        msg.warn("Nothing to auto-fill: base config is already complete")
        if show_diff:
            msg.warn("No diff to show: nothing was auto-filled")
    else:
        msg.good("Auto-filled config with all values")
        if show_diff:
            msg.divider("START CONFIG DIFF")
            print("")
            print(diff_strings(before, after))
            msg.divider("END CONFIG DIFF")
            print("")
    save_config(filled, output_file, is_stdout=writes_to_stdout, silent=silent)
    return config, filled
2020-08-14 17:49:26 +03:00
def init_config(
    output_file: Path,
    *,
    lang: str,
    pipeline: List[str],
    optimize: str,
    cpu: bool,
    pretraining: bool = False,
) -> None:
    """Render the quickstart template into a filled config and save it.

    Args:
        output_file: Destination file, or "-" to print the config to stdout
            (which also silences all log messages).
        lang: Language code, used for the template and to look up the
            recommendations for that language.
        pipeline: Names of the pipeline components to include. "tok2vec" and
            "transformer" are dropped because the template adds them itself.
        optimize: Optimization target passed to the template.
        cpu: Whether to render the template for CPU instead of GPU hardware.
        pretraining: Also merge the result into the default pretraining config.
    """
    is_stdout = str(output_file) == "-"
    msg = Printer(no_print=is_stdout)
    # Decode the template explicitly as UTF-8 instead of relying on the
    # platform's locale encoding
    with TEMPLATE_PATH.open("r", encoding="utf8") as f:
        template = Template(f.read())
    # Filter out duplicates since tok2vec and transformer are added by template
    pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
    # Languages without an entry fall back to the schema's defaults
    reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, {})).dict()
    variables = {
        "lang": lang,
        "components": pipeline,
        "optimize": optimize,
        "hardware": "cpu" if cpu else "gpu",
        "transformer_data": reco["transformer"],
        "word_vectors": reco["word_vectors"],
        "has_letters": reco["has_letters"],
    }
    if variables["transformer_data"] and not has_spacy_transformers():
        msg.warn(
            "To generate a more effective transformer-based config (GPU-only), "
            "install the spacy-transformers package and re-run this command. "
            "The config generated now does not use transformers."
        )
        # Render the template as if no transformer were recommended
        variables["transformer_data"] = None
    base_template = template.render(variables).strip()
    # Giving up on getting the newlines right in jinja for now
    base_template = re.sub(r"\n\n\n+", "\n\n", base_template)
    # Access variables declared in templates
    template_vars = template.make_module(variables)
    use_case = {
        "Language": lang,
        "Pipeline": ", ".join(pipeline),
        "Optimize for": optimize,
        "Hardware": variables["hardware"].upper(),
        "Transformer": template_vars.transformer.get("name", False),
    }
    msg.info("Generated template specific for your use case")
    for label, value in use_case.items():
        msg.text(f"- {label}: {value}")
    with show_validation_error(hint_fill=False):
        config = util.load_config_from_str(base_template)
        nlp = util.load_model_from_config(config, auto_fill=True)
        config = nlp.config
        if pretraining:
            validate_config_for_pretrain(config, msg)
            pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
            config = pretrain_config.merge(config)
    msg.good("Auto-filled config with all values")
    save_config(config, output_file, is_stdout=is_stdout)
2020-08-14 17:49:26 +03:00
def save_config(
    config: Config, output_file: Path, is_stdout: bool = False, silent: bool = False
) -> None:
    """Print the config to stdout or write it to disk with a usage hint.

    Args:
        config: The config to output.
        output_file: Destination file. Not used for writing when ``is_stdout``
            is set.
        is_stdout: Print the config string to stdout instead of writing a
            file. No other output is produced in that case.
        silent: Suppress the log messages and the example train command.
    """
    no_print = is_stdout or silent
    msg = Printer(no_print=no_print)
    if is_stdout:
        print(config.to_str())
        return
    # exist_ok avoids the check-then-create race of testing exists() first
    output_file.parent.mkdir(parents=True, exist_ok=True)
    config.to_disk(output_file, interpolate=False)
    msg.good("Saved config", output_file)
    msg.text("You can now add your data and train your pipeline:")
    variables = ["--paths.train ./train.spacy", "--paths.dev ./dev.spacy"]
    if not no_print:
        # Only the file name is shown, not the full output path
        print(f"{COMMAND} train {output_file.name} {' '.join(variables)}")
2020-08-21 13:06:19 +03:00
def has_spacy_transformers() -> bool:
    """Report whether the spacy_transformers package can be imported."""
    try:
        import spacy_transformers  # noqa: F401
    except ImportError:
        return False
    else:
        return True
def validate_config_for_pretrain(config: Config, msg: Printer) -> None:
    """Warn if the pipeline lacks a component named "tok2vec".

    Args:
        config: Config whose ``config["nlp"]["pipeline"]`` entry is checked.
        msg: Printer used to emit the warning.
    """
    component_names = config["nlp"]["pipeline"]
    if "tok2vec" in component_names:
        return
    msg.warn(
        "No tok2vec component found in the pipeline. If your tok2vec "
        "component has a different name, you may need to adjust the "
        "tok2vec_model reference in the [pretraining] block. If you don't "
        "have a tok2vec component, make sure to add it to your [components] "
        "and the pipeline specified in the [nlp] block, so you can pretrain "
        "weights for it."
    )