Add init CLI and init config (#5854)

* Add init CLI and init config draft * Improve config validation * Auto-format * Don't export anything in debug config * Update docs
2025-11-06 10:57:34 +03:00 · 2020-08-02 15:18:30 +02:00 · 2020-08-02 15:18:30 +02:00 · 4c055f0aa7
commit 4c055f0aa7
parent e393ebd78b
12 changed files with 245 additions and 77 deletions
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@ -15,6 +15,7 @@ from .debug_model import debug_model  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
 from .init_model import init_model  # noqa: F401
 from .init_config import init_config  # noqa: F401
 from .validate import validate  # noqa: F401
 from .project.clone import project_clone  # noqa: F401
 from .project.assets import project_assets  # noqa: F401
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@ -31,6 +31,7 @@ DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes
 commands to check and validate your config files, training and evaluation data,
 and custom model implementations.
 """
 INIT_HELP = """Commands for initializing configs and models."""
 # Wrappers for Typer's annotations. Initially created to set defaults and to
 # keep the names short, but not needed at the moment.
@ -40,9 +41,11 @@ Opt = typer.Option
 app = typer.Typer(name=NAME, help=HELP)
 project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
 debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
 init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
 app.add_typer(project_cli)
 app.add_typer(debug_cli)
 app.add_typer(init_cli)
 def setup_cli() -> None:
@ -172,16 +175,34 @@ def get_checksum(path: Union[Path, str]) -> str:
@contextmanager
-def show_validation_error(title: str = "Config validation error"):
+def show_validation_error(
    file_path: Optional[Union[str, Path]] = None,
    *,
    title: str = "Config validation error",
    hint_init: bool = True,
 ):
    """Helper to show custom config validation errors on the CLI.
    file_path (str / Path): Optional file path of config file, used in hints.
    title (str): Title of the custom formatted error.
    hint_init (bool): Show hint about filling config.
    """
    try:
        yield
    except (ConfigValidationError, InterpolationError) as e:
        msg.fail(title, spaced=True)
-        print(str(e).replace("Config validation error", "").strip())
+        # TODO: This is kinda hacky and we should probably provide a better
        # helper for this in Thinc
        err_text = str(e).replace("Config validation error", "").strip()
        print(err_text)
        if hint_init and "field required" in err_text:
            config_path = file_path if file_path is not None else "config.cfg"
            msg.text(
                "If your config contains missing values, you can run the 'init "
                "config' command to fill in all the defaults, if possible:",
                spaced=True,
            )
            print(f"{COMMAND} init config {config_path} --base {config_path}\n")
        sys.exit(1)
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@ -33,7 +33,6 @@ def debug_config_cli(
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True),
    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    output_path: Optional[Path] = Opt(None, "--output", "-o", help="Output path for filled config or '-' for standard output", allow_dash=True),
    auto_fill: bool = Opt(False, "--auto-fill", "-F", help="Whether or not to auto-fill the config with built-in defaults if possible"),
    diff: bool = Opt(False, "--diff", "-D", help="Show a visual diff if config was auto-filled")
    # fmt: on
@ -49,7 +48,7 @@ def debug_config_cli(
    """
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
-    with show_validation_error():
+    with show_validation_error(config_path):
        config = Config().from_disk(config_path)
        try:
            nlp, _ = util.load_model_from_config(
@ -57,7 +56,6 @@ def debug_config_cli(
            )
        except ValueError as e:
            msg.fail(str(e), exits=1)
    is_stdout = output_path is not None and str(output_path) == "-"
    if auto_fill:
        orig_config = config.to_str()
        filled_config = nlp.config.to_str()
@ -68,12 +66,7 @@ def debug_config_cli(
            if diff:
                print(diff_strings(config.to_str(), nlp.config.to_str()))
    else:
-        msg.good("Original config is valid", show=not is_stdout)
+        msg.good("Original config is valid")
    if is_stdout:
        print(nlp.config.to_str())
    elif output_path is not None:
        nlp.config.to_disk(output_path)
        msg.good(f"Saved updated config to {output_path}")
@debug_cli.command(
@ -142,7 +135,7 @@ def debug_data(
        msg.fail("Development data not found", dev_path, exits=1)
    if not config_path.exists():
        msg.fail("Config file not found", config_path, exists=1)
-    with show_validation_error():
+    with show_validation_error(config_path):
        cfg = Config().from_disk(config_path)
        nlp, config = util.load_model_from_config(cfg, overrides=config_overrides)
    # TODO: handle base model
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@ -50,8 +50,8 @@ def debug_model_cli(
        "print_prediction": P3,
    }
    config_overrides = parse_config_overrides(ctx.args)
    with show_validation_error(config_path):
        cfg = Config().from_disk(config_path)
    with show_validation_error():
        try:
            _, config = util.load_model_from_config(cfg, overrides=config_overrides)
        except ValueError as e:
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@ -0,0 +1,81 @@
 from typing import Optional, List
 from pathlib import Path
 from thinc.api import Config
 from wasabi import msg
 from ..util import load_model_from_config, get_lang_class, load_model
 from ._util import init_cli, Arg, Opt, show_validation_error
@init_cli.command("config")
 def init_config_cli(
    # fmt: off
    output_path: Path = Arg("-", help="Output path or - for stdout", allow_dash=True),
    base_path: Optional[Path] = Opt(None, "--base", "-b", help="Optional base config to fill", exists=True, dir_okay=False),
    model: Optional[str] = Opt(None, "--model", "-m", help="Optional model to copy config from"),
    lang: Optional[str] = Opt(None, "--lang", "-l", help="Optional language code for blank config"),
    pipeline: Optional[str] = Opt(None, "--pipeline", "-p", help="Optional pipeline components to use")
    # fmt: on
 ):
    """Generate a starter config.cfg for training."""
    validate_cli_args(base_path, model, lang)
    is_stdout = str(output_path) == "-"
    pipeline = [p.strip() for p in pipeline.split(",")] if pipeline else []
    cfg = init_config(output_path, base_path, model, lang, pipeline, silent=is_stdout)
    if is_stdout:
        print(cfg.to_str())
    else:
        cfg.to_disk(output_path)
        msg.good("Saved config", output_path)
 def init_config(
    output_path: Path,
    config_path: Optional[Path],
    model: Optional[str],
    lang: Optional[str],
    pipeline: Optional[List[str]],
    silent: bool = False,
 ) -> Config:
    """Build a config from a base config, an existing model or a language.

    The sources are checked in the order config_path, model, lang and the
    first one that is not None is used.

    output_path (Path): Accepted as part of the signature but not read here.
    config_path (Optional[Path]): Base config file, loaded from disk and
        passed to load_model_from_config with auto_fill=True.
    model (Optional[str]): Model to load and take the config from.
    lang (Optional[str]): Language code used to create a blank nlp object.
    pipeline (Optional[List[str]]): Component names. With a model, only these
        components are kept and the ones not yet in the model are added. With
        a language, they are added in order. None or an empty list leaves the
        pipeline untouched.
    silent (bool): Don't print info messages.
    RETURNS (Config): The config of the resulting nlp object.
    RAISES (ValueError): If none of config_path, model and lang is set.
    """
    # Treat a missing pipeline like an empty one so the membership checks and
    # loops below never operate on None.
    pipeline = [] if pipeline is None else pipeline
    if config_path is not None:
        msg.info("Generating config from base config", show=not silent)
        with show_validation_error(config_path, hint_init=False):
            config = Config().from_disk(config_path)
            try:
                nlp, _ = load_model_from_config(config, auto_fill=True)
            except ValueError as e:
                msg.fail(str(e), exits=1)
        return nlp.config
    if model is not None:
        ext = f" with pipeline {pipeline}" if pipeline else ""
        msg.info(f"Generating config from model {model}{ext}", show=not silent)
        nlp = load_model(model)
        # Only reshape the pipeline if one was requested. Without this guard
        # an empty pipeline would strip every component from the model.
        if pipeline:
            # Iterate over a copy since components are removed in the loop.
            for existing_pipe_name in list(nlp.pipe_names):
                if existing_pipe_name not in pipeline:
                    nlp.remove_pipe(existing_pipe_name)
            for pipe_name in pipeline:
                if pipe_name not in nlp.pipe_names:
                    nlp.add_pipe(pipe_name)
        return nlp.config
    if lang is not None:
        ext = f" with pipeline {pipeline}" if pipeline else ""
        msg.info(f"Generating config for language '{lang}'{ext}", show=not silent)
        nlp = get_lang_class(lang)()
        for pipe_name in pipeline:
            nlp.add_pipe(pipe_name)
        return nlp.config
    # Fail loudly instead of implicitly returning None, which would only
    # surface later as an attribute error in the caller.
    raise ValueError(
        "Can't generate config: expected one of config_path, model or lang "
        "to be set."
    )
 def validate_cli_args(
    config_path: Optional[Path], model: Optional[str], lang: Optional[str]
 ) -> None:
    """Check that exactly one config source was passed on the command line.

    config_path (Optional[Path]): Value of the --base option.
    model (Optional[str]): Value of the --model option.
    lang (Optional[str]): Value of the --lang option.

    Prints an error via msg.fail with exits=1 unless exactly one of the three
    values is not None.
    """
    options = {"--base": config_path, "--model": model, "--lang": lang}
    # Render every option that was actually provided as "--flag value".
    provided = [
        f"{flag} {value}" for flag, value in options.items() if value is not None
    ]
    if len(provided) == 1:
        return
    received = " ".join(provided)
    msg.fail(
        "The init config command expects only one of the following arguments: "
        "--base (base config to fill and update), --lang (language code to "
        "use for blank config) or --model (base model to copy config from).",
        f"Got: {received if received else 'no arguments'}",
        exits=1,
    )
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@ -10,14 +10,14 @@ import gzip
 import zipfile
 import srsly
 import warnings
-from wasabi import Printer
+from wasabi import msg, Printer
 import typer
-from ._util import app, Arg, Opt
+from ._util import app, init_cli, Arg, Opt
 from ..vectors import Vectors
 from ..errors import Errors, Warnings
 from ..language import Language
 from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
 from ..lookups import Lookups
 try:
    import ftfy
@ -28,9 +28,15 @@ except ImportError:
 DEFAULT_OOV_PROB = -20
-@app.command("init-model")
+@init_cli.command("model")
@app.command(
    "init-model",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
    hidden=True,  # hide this from main CLI help but still allow it to work with warning
 )
 def init_model_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    lang: str = Arg(..., help="Model language"),
    output_dir: Path = Arg(..., help="Model output directory"),
    freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
@ -48,6 +54,12 @@ def init_model_cli(
    Create a new model from raw data. If vectors are provided in Word2Vec format,
    they can be either a .txt or zipped as a .zip or .tar.gz.
    """
    if ctx.command.name == "init-model":
        msg.warn(
            "The init-model command is now available via the 'init model' "
            "subcommand (without the hyphen). You can run python -m spacy init "
            "--help for an overview of the other available initialization commands."
        )
    init_model(
        lang,
        output_dir,
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@ -87,8 +87,8 @@ def pretrain(
    else:
        msg.info("Using CPU")
    msg.info(f"Loading config from: {config_path}")
    with show_validation_error(config_path):
        config = Config().from_disk(config_path)
    with show_validation_error():
        nlp, config = util.load_model_from_config(config, overrides=config_overrides)
    # TODO: validate that [pretraining] block exists
    if not output_dir.exists():
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -79,10 +79,11 @@ def train(
    else:
        msg.info("Using CPU")
    msg.info(f"Loading config and nlp from: {config_path}")
    with show_validation_error(config_path):
        config = Config().from_disk(config_path)
    if config.get("training", {}).get("seed") is not None:
        fix_random_seed(config["training"]["seed"])
-    with show_validation_error():
+    with show_validation_error(config_path):
        nlp, config = util.load_model_from_config(config, overrides=config_overrides)
    if config["training"]["base_model"]:
        # TODO: do something to check base_nlp against regular nlp described in config?
@ -245,9 +246,7 @@ def create_evaluation_callback(
    cfg: Union[Config, Dict[str, Any]],
 ) -> Callable[[], Tuple[float, Dict[str, float]]]:
    def evaluate() -> Tuple[float, Dict[str, float]]:
-        dev_examples = corpus.dev_dataset(
+        dev_examples = corpus.dev_dataset(nlp, gold_preproc=cfg["gold_preproc"])
            nlp, gold_preproc=cfg["gold_preproc"]
        )
        dev_examples = list(dev_examples)
        n_words = sum(len(ex.predicted) for ex in dev_examples)
        batch_size = cfg["eval_batch_size"]
--- a/spacy/cli/validate.py
+++ b/spacy/cli/validate.py
@ -13,8 +13,9 @@ from ..util import get_package_path, get_model_meta, is_compatible_version
@app.command("validate")
 def validate_cli():
    """
-    Validate that the currently installed version of spaCy is compatible
+    Validate the currently installed models and spaCy version. Checks if the
-    with the installed models. Should be run after `pip install -U spacy`.
+    installed models are compatible and shows upgrade instructions if available.
    Should be run after `pip install -U spacy`.
    """
    validate()
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@ -6,11 +6,11 @@ menu:
  - ['Download', 'download']
  - ['Info', 'info']
  - ['Validate', 'validate']
  - ['Init', 'init']
  - ['Convert', 'convert']
  - ['Debug', 'debug']
  - ['Train', 'train']
  - ['Pretrain', 'pretrain']
  - ['Init Model', 'init-model']
  - ['Evaluate', 'evaluate']
  - ['Package', 'package']
  - ['Project', 'project']
@ -94,6 +94,80 @@ $ python -m spacy validate
 | ---------- | -------- | --------------------------------------------------------- |
 | **PRINTS** | `stdout` | Details about the compatibility of your installed models. |
 ## Init {#init new="3"}
 The `spacy init` CLI includes helpful commands for initializing training config
 files and model directories.
 ### init config {#init-config new="3"}
 Initialize and export a [`config.cfg` file](/usage/training#config) for training
 and update it with all default values, if possible. Config files used for
 training should always be complete and not contain any hidden defaults or
 missing values, so this command helps you create your final config. It takes
 **one** of the following options:
 - `--base`: Base **config** to auto-fill, e.g. created using the
  [training quickstart](/usage/training#quickstart) widget.
 - `--lang`: Base **language** code to use for blank config.
 - `--model`: Base **model** to copy config from.
 > ```bash
 > ### with base config {wrap="true"}
 > $ python -m spacy init config config.cfg --base base.cfg
 > ```
 >
 > ```bash
 > ### blank language {wrap="true"}
 > $ python -m spacy init config config.cfg --lang en --pipeline tagger,parser
 > ```
 ```bash
 $ python -m spacy init config [output] [--base] [--lang] [--model] [--pipeline]
 ```
 | Argument           | Type       | Description                                                                                                                                                           |
 | ------------------ | ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `output`           | positional | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file.                                                         |
 | `--base`, `-b`     | option     | Optional base config file to auto-fill with defaults.                                                                                                                 |
 | `--lang`, `-l`     | option     | Optional language code to use for blank config. If a `--pipeline` is specified, the components will be added in order.                                                |
 | `--model`, `-m`    | option     | Optional base model to copy config from. If a `--pipeline` is specified, only those components will be kept, and all other components not in the model will be added. |
 | `--pipeline`, `-p` | option     | Optional comma-separated pipeline of components to add to blank language or model.                                                                                    |
 | **CREATES**        | config     | Complete and auto-filled config file for training.                                                                                                                    |
 ### init model {#init-model new="2"}
 <!-- TODO: update for v3 -->
 Create a new model directory from raw data, like word frequencies, Brown
 clusters and word vectors. This command is similar to the `spacy model` command
 in v1.x. Note that in order to populate the model's vocab, you need to pass in a
 JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) as
 `--jsonl-loc` with optional `id` values that correspond to the vectors table.
 Just loading in vectors will not automatically populate the vocab.
 <Infobox title="New in v3.0" variant="warning">
 The `init-model` command is now available as a subcommand of `spacy init`.
 </Infobox>
 ```bash
 $ python -m spacy init model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
 [--prune-vectors]
 ```
 | Argument                                                | Type       | Description                                                                                                                                                                                                                                            |
 | ------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `lang`                                                  | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`.                                                                                                                                                           |
 | `output_dir`                                            | positional | Model output directory. Will be created if it doesn't exist.                                                                                                                                                                                           |
 | `--jsonl-loc`, `-j`                                     | option     | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes.                                                                                                                                         |
 | `--vectors-loc`, `-v`                                   | option     | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
 | `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag> | option     | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation.                                                                                                                                                      |
 | `--prune-vectors`, `-V`                                 | option     | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning.                                                                                                                                                                         |
 | `--vectors-name`, `-vn`                                 | option     | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`.                                                                                                                                                                  |
 | **CREATES**                                             | model      | A spaCy model containing the vocab and vectors.                                                                                                                                                                                                        |
 ## Convert {#convert}
 Convert files into spaCy's
@ -469,32 +543,6 @@ tokenization can be provided.
 {"tokens": ["If", "tokens", "are", "provided", "then", "we", "can", "skip", "the", "raw", "input", "text"]}
 ```
 ## Init Model {#init-model new="2"}
 Create a new model directory from raw data, like word frequencies, Brown
 clusters and word vectors. This command is similar to the `spacy model` command
 in v1.x. Note that in order to populate the model's vocab, you need to pass in a
 JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) as
 `--jsonl-loc` with optional `id` values that correspond to the vectors table.
 Just loading in vectors will not automatically populate the vocab.
 ```bash
 $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
 [--prune-vectors]
 ```
 | Argument                                                    | Type       | Description                                                                                                                                                                                                                                            |
 | ----------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `lang`                                                      | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`.                                                                                                                                                           |
 | `output_dir`                                                | positional | Model output directory. Will be created if it doesn't exist.                                                                                                                                                                                           |
 | `--jsonl-loc`, `-j`                                         | option     | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes.                                                                                                                                         |
 | `--vectors-loc`, `-v`                                       | option     | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
 | `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag>     | option     | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation.                                                                                                                                                      |
 | `--prune-vectors`, `-V`                                     | option     | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning.                                                                                                                                                                         |
 | `--vectors-name`, `-vn`                                     | option     | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`.                                                                                                                                                                  |
 | `--omit-extra-lookups`, `-OEL` <Tag variant="new">2.3</Tag> | flag       | Do not include any of the extra lookups tables (`cluster`/`prob`/`sentiment`) from `spacy-lookups-data` in the model.                                                                                                                                  |
 | **CREATES**                                                 | model      | A spaCy model containing the vocab and vectors.                                                                                                                                                                                                        |
 ## Evaluate {#evaluate new="2"}
 <!-- TODO: document new evaluate command -->
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@ -44,24 +44,12 @@ following data and information:
 2. A [`config.cfg`](#config) **configuration file** with all settings and
   hyperparameters.
 3. An optional **Python file** to register
-   [custom models and architectures](#custom-models).
+   [custom functions and architectures](#custom-code).
 <!-- TODO: decide how we want to present the "getting started" workflow here, get a default config etc. -->
 ```bash
 $ python -m spacy train train.spacy dev.spacy config.cfg --output ./output
 ```
 > #### Tip: Debug your data
 >
 > The [`debug-data` command](/api/cli#debug-data) lets you analyze and validate
 > your training and development data, get useful stats, and find problems like
 > invalid entity annotations, cyclic dependencies, low data labels and more.
 >
 > ```bash
 > $ python -m spacy debug-data en train.spacy dev.spacy --verbose
 > ```
 <Project id="some_example_project">
 The easiest way to get started with an end-to-end training process is to clone a
@ -74,16 +62,42 @@ workflows, from data preprocessing to training and packaging your model.
 > #### Instructions
 >
-> 1. Select your requirements and settings. The quickstart widget will
+> 1. Select your requirements and settings.
 >    auto-generate a recommended starter config for you.
 > 2. Use the buttons at the bottom to save the result to your clipboard or a
->    file `config.cfg`.
+>    file `base_config.cfg`.
-> 3. TOOD: recommended approach for filling config
+> 3. Run [`init config`](/api/cli#init-config) to create a full training config.
-> 4. Run [`spacy train`](/api/cli#train) with your config and data.
+> 4. Run [`train`](/api/cli#train) with your config and data.
 import QuickstartTraining from 'widgets/quickstart-training.js'
-<QuickstartTraining />
+<QuickstartTraining download="base_config.cfg" />
 After you've saved the starter config to a file `base_config.cfg`, you can use
 the [`init config`](/api/cli#init-config) command to fill in the remaining
 defaults. Training configs should always be **complete and without hidden
 defaults**, to keep your experiments reproducible.
 ```bash
 $ python -m spacy init config config.cfg --base base_config.cfg
 ```
 > #### Tip: Debug your data
 >
 > The [`debug-data` command](/api/cli#debug-data) lets you analyze and validate
 > your training and development data, get useful stats, and find problems like
 > invalid entity annotations, cyclic dependencies, low data labels and more.
 >
 > ```bash
 > $ python -m spacy debug-data en train.spacy dev.spacy --verbose
 > ```
 You can now run [`train`](/api/cli#train) with your training and development
 data and the training config. See the [`convert`](/api/cli#convert) command for
 details on how to convert your data to spaCy's binary `.spacy` format.
 ```bash
 $ python -m spacy train train.spacy dev.spacy config.cfg --output ./output
 ```
 ## Training config {#config}
--- a/website/docs/usage/transformers.md
+++ b/website/docs/usage/transformers.md
@ -165,10 +165,8 @@ resolved, the function is created and passed into the model as an argument.
 Remember that the `config.cfg` used for training should contain **no missing
 values** and requires all settings to be defined. You don't want any hidden
 defaults creeping in and changing your results! spaCy will tell you if settings
-are missing, and you can run [`spacy debug config`](/api/cli#debug-config) with
+are missing, and you can run [`spacy init config`](/api/cli#init-config) to
-`--auto-fill` to automatically fill in all defaults.
+automatically fill in all defaults.
 <!-- TODO: update with details on getting started with a config -->
 </Infobox>