mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-03 22:06:37 +03:00
270 lines
11 KiB
Python
270 lines
11 KiB
Python
import re
|
|
from enum import Enum
|
|
from pathlib import Path
|
|
from typing import List, Optional, Tuple
|
|
|
|
import srsly
|
|
from jinja2 import Template
|
|
from thinc.api import Config
|
|
from wasabi import Printer, diff_strings
|
|
|
|
from .. import util
|
|
from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH
|
|
from ..schemas import RecommendationSchema
|
|
from ..util import SimpleFrozenList
|
|
from ._util import (
|
|
COMMAND,
|
|
Arg,
|
|
Opt,
|
|
_handle_renamed_language_codes,
|
|
import_code,
|
|
init_cli,
|
|
show_validation_error,
|
|
string_to_list,
|
|
)
|
|
|
|
ROOT = Path(__file__).parent / "templates"
|
|
TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
|
|
RECOMMENDATIONS = srsly.read_yaml(ROOT / "quickstart_training_recommendations.yml")
|
|
|
|
|
|
class Optimizations(str, Enum):
|
|
efficiency = "efficiency"
|
|
accuracy = "accuracy"
|
|
|
|
|
|
class InitValues:
|
|
"""
|
|
Default values for initialization. Dedicated class to allow synchronized default values for init_config_cli() and
|
|
init_config(), i.e. initialization calls via CLI respectively Python.
|
|
"""
|
|
|
|
lang = "en"
|
|
pipeline = SimpleFrozenList(["tagger", "parser", "ner"])
|
|
optimize = Optimizations.efficiency
|
|
gpu = False
|
|
pretraining = False
|
|
force_overwrite = False
|
|
|
|
|
|
@init_cli.command("config")
|
|
def init_config_cli(
|
|
# fmt: off
|
|
output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
|
|
lang: str = Opt(InitValues.lang, "--lang", "-l", help="Code of the language to use"),
|
|
pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
|
|
optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
|
gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
|
pretraining: bool = Opt(InitValues.pretraining, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
|
|
force_overwrite: bool = Opt(InitValues.force_overwrite, "--force", "-F", help="Force overwriting the output file"),
|
|
# fmt: on
|
|
):
|
|
"""
|
|
Generate a starter config file for training. Based on your requirements
|
|
specified via the CLI arguments, this command generates a config with the
|
|
optimal settings for your use case. This includes the choice of architecture,
|
|
pretrained weights and related hyperparameters.
|
|
|
|
DOCS: https://spacy.io/api/cli#init-config
|
|
"""
|
|
pipeline = string_to_list(pipeline)
|
|
is_stdout = str(output_file) == "-"
|
|
if not is_stdout and output_file.exists() and not force_overwrite:
|
|
msg = Printer()
|
|
msg.fail(
|
|
"The provided output file already exists. To force overwriting the config file, set the --force or -F flag.",
|
|
exits=1,
|
|
)
|
|
config = init_config(
|
|
lang=lang,
|
|
pipeline=pipeline,
|
|
optimize=optimize.value,
|
|
gpu=gpu,
|
|
pretraining=pretraining,
|
|
silent=is_stdout,
|
|
)
|
|
save_config(config, output_file, is_stdout=is_stdout)
|
|
|
|
|
|
@init_cli.command("fill-config")
|
|
def init_fill_config_cli(
|
|
# fmt: off
|
|
base_path: Path = Arg(..., help="Path to base config to fill", exists=True, dir_okay=False),
|
|
output_file: Path = Arg("-", help="Path to output .cfg file (or - for stdout)", allow_dash=True),
|
|
distillation: bool = Opt(False, "--distillation", "-dt", help="Include config for distillation (with 'spacy distill')"),
|
|
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
|
|
diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"),
|
|
code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
|
# fmt: on
|
|
):
|
|
"""
|
|
Fill partial config file with default values. Will add all missing settings
|
|
from the default config and will create all objects, check the registered
|
|
functions for their default values and update the base config. This command
|
|
can be used with a config generated via the training quickstart widget:
|
|
https://spacy.io/usage/training#quickstart
|
|
|
|
DOCS: https://spacy.io/api/cli#init-fill-config
|
|
"""
|
|
import_code(code_path)
|
|
fill_config(
|
|
output_file,
|
|
base_path,
|
|
distillation=distillation,
|
|
pretraining=pretraining,
|
|
diff=diff,
|
|
)
|
|
|
|
|
|
def fill_config(
|
|
output_file: Path,
|
|
base_path: Path,
|
|
*,
|
|
distillation: bool = False,
|
|
pretraining: bool = False,
|
|
diff: bool = False,
|
|
silent: bool = False,
|
|
) -> Tuple[Config, Config]:
|
|
is_stdout = str(output_file) == "-"
|
|
no_print = is_stdout or silent
|
|
msg = Printer(no_print=no_print)
|
|
with show_validation_error(hint_fill=False):
|
|
config = util.load_config(base_path)
|
|
nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
|
|
# Load a second time with validation to be extra sure that the produced
|
|
# config result is a valid config
|
|
nlp = util.load_model_from_config(nlp.config)
|
|
filled = nlp.config
|
|
# If we have sourced components in the base config, those will have been
|
|
# replaced with their actual config after loading, so we have to re-add them
|
|
sourced = util.get_sourced_components(config)
|
|
filled["components"].update(sourced)
|
|
if distillation:
|
|
distillation_config = util.load_config(DEFAULT_CONFIG_DISTILL_PATH)
|
|
filled = distillation_config.merge(filled)
|
|
if pretraining:
|
|
validate_config_for_pretrain(filled, msg)
|
|
pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
|
|
filled = pretrain_config.merge(filled)
|
|
before = config.to_str()
|
|
after = filled.to_str()
|
|
if before == after:
|
|
msg.warn("Nothing to auto-fill: base config is already complete")
|
|
else:
|
|
msg.good("Auto-filled config with all values")
|
|
if diff and not no_print:
|
|
if before == after:
|
|
msg.warn("No diff to show: nothing was auto-filled")
|
|
else:
|
|
msg.divider("START CONFIG DIFF")
|
|
print("")
|
|
print(diff_strings(before, after))
|
|
msg.divider("END CONFIG DIFF")
|
|
print("")
|
|
save_config(filled, output_file, is_stdout=is_stdout, silent=silent)
|
|
return config, filled
|
|
|
|
|
|
def init_config(
|
|
*,
|
|
lang: str = InitValues.lang,
|
|
pipeline: List[str] = InitValues.pipeline,
|
|
optimize: str = InitValues.optimize,
|
|
gpu: bool = InitValues.gpu,
|
|
pretraining: bool = InitValues.pretraining,
|
|
silent: bool = True,
|
|
) -> Config:
|
|
msg = Printer(no_print=silent)
|
|
with TEMPLATE_PATH.open("r") as f:
|
|
template = Template(f.read())
|
|
|
|
# Throw error for renamed language codes in v4
|
|
_handle_renamed_language_codes(lang)
|
|
|
|
# Filter out duplicates since tok2vec and transformer are added by template
|
|
pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
|
|
defaults = RECOMMENDATIONS["__default__"]
|
|
reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, defaults)).dict()
|
|
variables = {
|
|
"lang": lang,
|
|
"components": pipeline,
|
|
"optimize": optimize,
|
|
"hardware": "gpu" if gpu else "cpu",
|
|
"transformer_data": reco["transformer"],
|
|
"word_vectors": reco["word_vectors"],
|
|
"has_letters": reco["has_letters"],
|
|
}
|
|
if variables["transformer_data"] and not has_spacy_transformers():
|
|
msg.warn(
|
|
"To generate a more effective transformer-based config (GPU-only), "
|
|
"install the spacy-transformers package and re-run this command. "
|
|
"The config generated now does not use transformers."
|
|
)
|
|
variables["transformer_data"] = None
|
|
base_template = template.render(variables).strip()
|
|
# Giving up on getting the newlines right in jinja for now
|
|
base_template = re.sub(r"\n\n\n+", "\n\n", base_template)
|
|
# Access variables declared in templates
|
|
template_vars = template.make_module(variables)
|
|
use_case = {
|
|
"Language": lang,
|
|
"Pipeline": ", ".join(pipeline),
|
|
"Optimize for": optimize,
|
|
"Hardware": variables["hardware"].upper(),
|
|
"Transformer": template_vars.transformer.get("name") # type: ignore[attr-defined]
|
|
if template_vars.use_transformer # type: ignore[attr-defined]
|
|
else None,
|
|
}
|
|
msg.info("Generated config template specific for your use case")
|
|
for label, value in use_case.items():
|
|
msg.text(f"- {label}: {value}")
|
|
with show_validation_error(hint_fill=False):
|
|
config = util.load_config_from_str(base_template)
|
|
nlp = util.load_model_from_config(config, auto_fill=True)
|
|
config = nlp.config
|
|
if pretraining:
|
|
validate_config_for_pretrain(config, msg)
|
|
pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
|
|
config = pretrain_config.merge(config)
|
|
msg.good("Auto-filled config with all values")
|
|
return config
|
|
|
|
|
|
def save_config(
|
|
config: Config, output_file: Path, is_stdout: bool = False, silent: bool = False
|
|
) -> None:
|
|
no_print = is_stdout or silent
|
|
msg = Printer(no_print=no_print)
|
|
if is_stdout:
|
|
print(config.to_str())
|
|
else:
|
|
if not output_file.parent.exists():
|
|
output_file.parent.mkdir(parents=True)
|
|
config.to_disk(output_file, interpolate=False)
|
|
msg.good("Saved config", output_file)
|
|
msg.text("You can now add your data and train your pipeline:")
|
|
variables = ["--paths.train ./train.spacy", "--paths.dev ./dev.spacy"]
|
|
if not no_print:
|
|
print(f"{COMMAND} train {output_file.parts[-1]} {' '.join(variables)}")
|
|
|
|
|
|
def has_spacy_transformers() -> bool:
|
|
try:
|
|
import spacy_transformers # noqa: F401
|
|
|
|
return True
|
|
except ImportError:
|
|
return False
|
|
|
|
|
|
def validate_config_for_pretrain(config: Config, msg: Printer) -> None:
|
|
if "tok2vec" not in config["nlp"]["pipeline"]:
|
|
msg.warn(
|
|
"No tok2vec component found in the pipeline. If your tok2vec "
|
|
"component has a different name, you may need to adjust the "
|
|
"tok2vec_model reference in the [pretraining] block. If you don't "
|
|
"have a tok2vec component, make sure to add it to your [components] "
|
|
"and the pipeline specified in the [nlp] block, so you can pretrain "
|
|
"weights for it."
|
|
)
|