Add init CLI and init config (#5854)

* Add init CLI and init config draft * Improve config validation * Auto-format * Don't export anything in debug config * Update docs
2025-11-06 10:57:34 +03:00 · 2020-08-02 15:18:30 +02:00 · 2020-08-02 15:18:30 +02:00 · 4c055f0aa7
commit 4c055f0aa7
parent e393ebd78b
12 changed files with 245 additions and 77 deletions
--- a/spacy/cli/init.py
+++ b/spacy/cli/init.py
@ -15,6 +15,7 @@ from .debug_model import debug_model  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
 from .init_model import init_model  # noqa: F401
+from .init_config import init_config  # noqa: F401
 from .validate import validate  # noqa: F401
 from .project.clone import project_clone  # noqa: F401
 from .project.assets import project_assets  # noqa: F401
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@ -31,6 +31,7 @@ DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes
 commands to check and validate your config files, training and evaluation data,
 and custom model implementations.
 """
+INIT_HELP = """Commands for initializing configs and models."""

 # Wrappers for Typer's annotations. Initially created to set defaults and to
 # keep the names short, but not needed at the moment.
@ -40,9 +41,11 @@ Opt = typer.Option
 app = typer.Typer(name=NAME, help=HELP)
 project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
 debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
+init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)

 app.add_typer(project_cli)
 app.add_typer(debug_cli)
+app.add_typer(init_cli)


 def setup_cli() -> None:
@ -172,16 +175,34 @@ def get_checksum(path: Union[Path, str]) -> str:


@contextmanager
-def show_validation_error(title: str = "Config validation error"):
+def show_validation_error(
+    file_path: Optional[Union[str, Path]] = None,
+    *,
+    title: str = "Config validation error",
+    hint_init: bool = True,
+):
    """Helper to show custom config validation errors on the CLI.

+    file_path (str / Path): Optional file path of config file, used in hints.
    title (str): Title of the custom formatted error.
+    hint_init (bool): Show hint about filling config.
    """
    try:
        yield
    except (ConfigValidationError, InterpolationError) as e:
        msg.fail(title, spaced=True)
-        print(str(e).replace("Config validation error", "").strip())
+        # TODO: This is kinda hacky and we should probably provide a better
+        # helper for this in Thinc
+        err_text = str(e).replace("Config validation error", "").strip()
+        print(err_text)
+        if hint_init and "field required" in err_text:
+            config_path = file_path if file_path is not None else "config.cfg"
+            msg.text(
+                "If your config contains missing values, you can run the 'init "
+                "config' command to fill in all the defaults, if possible:",
+                spaced=True,
+            )
+            print(f"{COMMAND} init config {config_path} --base {config_path}\n")
        sys.exit(1)


--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@ -33,7 +33,6 @@ def debug_config_cli(
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True),
    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
-    output_path: Optional[Path] = Opt(None, "--output", "-o", help="Output path for filled config or '-' for standard output", allow_dash=True),
    auto_fill: bool = Opt(False, "--auto-fill", "-F", help="Whether or not to auto-fill the config with built-in defaults if possible"),
    diff: bool = Opt(False, "--diff", "-D", help="Show a visual diff if config was auto-filled")
    # fmt: on
@ -49,7 +48,7 @@ def debug_config_cli(
    """
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
-    with show_validation_error():
+    with show_validation_error(config_path):
        config = Config().from_disk(config_path)
        try:
            nlp, _ = util.load_model_from_config(
@ -57,7 +56,6 @@ def debug_config_cli(
            )
        except ValueError as e:
            msg.fail(str(e), exits=1)
-    is_stdout = output_path is not None and str(output_path) == "-"
    if auto_fill:
        orig_config = config.to_str()
        filled_config = nlp.config.to_str()
@ -68,12 +66,7 @@ def debug_config_cli(
            if diff:
                print(diff_strings(config.to_str(), nlp.config.to_str()))
    else:
-        msg.good("Original config is valid", show=not is_stdout)
-    if is_stdout:
-        print(nlp.config.to_str())
-    elif output_path is not None:
-        nlp.config.to_disk(output_path)
-        msg.good(f"Saved updated config to {output_path}")
+        msg.good("Original config is valid")


@debug_cli.command(
@ -142,7 +135,7 @@ def debug_data(
        msg.fail("Development data not found", dev_path, exits=1)
    if not config_path.exists():
        msg.fail("Config file not found", config_path, exists=1)
-    with show_validation_error():
+    with show_validation_error(config_path):
        cfg = Config().from_disk(config_path)
        nlp, config = util.load_model_from_config(cfg, overrides=config_overrides)
    # TODO: handle base model
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@ -50,8 +50,8 @@ def debug_model_cli(
        "print_prediction": P3,
    }
    config_overrides = parse_config_overrides(ctx.args)
+    with show_validation_error(config_path):
        cfg = Config().from_disk(config_path)
-    with show_validation_error():
        try:
            _, config = util.load_model_from_config(cfg, overrides=config_overrides)
        except ValueError as e:
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@ -0,0 +1,81 @@
+from typing import Optional, List
+from pathlib import Path
+from thinc.api import Config
+from wasabi import msg
+
+from ..util import load_model_from_config, get_lang_class, load_model
+from ._util import init_cli, Arg, Opt, show_validation_error
+
+
+@init_cli.command("config")
+def init_config_cli(
+    # fmt: off
+    output_path: Path = Arg("-", help="Output path or - for stdout", allow_dash=True),
+    base_path: Optional[Path] = Opt(None, "--base", "-b", help="Optional base config to fill", exists=True, dir_okay=False),
+    model: Optional[str] = Opt(None, "--model", "-m", help="Optional model to copy config from"),
+    lang: Optional[str] = Opt(None, "--lang", "-l", help="Optional language code for blank config"),
+    pipeline: Optional[str] = Opt(None, "--pipeline", "-p", help="Optional pipeline components to use")
+    # fmt: on
+):
+    """Generate a starter config.cfg for training."""
+    # NOTE(review): the docstring above is presumably what Typer shows as the
+    # command's help text, so it is kept short and user-facing.
+    # Exits with an error unless exactly one of --base/--model/--lang is set.
+    validate_cli_args(base_path, model, lang)
+    # --pipeline arrives as one comma-separated string; split it into names.
+    pipe_names = []
+    if pipeline:
+        pipe_names = [name.strip() for name in pipeline.split(",")]
+    # "-" selects stdout, in which case status messages are silenced so that
+    # only the config itself is printed.
+    to_stdout = str(output_path) == "-"
+    cfg = init_config(output_path, base_path, model, lang, pipe_names, silent=to_stdout)
+    if to_stdout:
+        print(cfg.to_str())
+        return
+    cfg.to_disk(output_path)
+    msg.good("Saved config", output_path)
+
+
+def init_config(
+    output_path: Path,
+    config_path: Optional[Path],
+    model: Optional[str],
+    lang: Optional[str],
+    pipeline: Optional[List[str]],
+    silent: bool = False,
+) -> Config:
+    """Create a config from a base config, an existing model or a language.
+
+    The sources are checked in the order config_path, model, lang and the
+    first one that is not None is used.
+
+    output_path (Path): Not used by this function; writing the config is left
+        to the caller.
+    config_path (Optional[Path]): Base config to load and auto-fill.
+    model (Optional[str]): Model to load and copy the config from.
+    lang (Optional[str]): Language code to create a blank config for.
+    pipeline (Optional[List[str]]): Component names. For a model, components
+        not listed are removed and listed ones that are missing are added;
+        if empty or None, the model's pipeline is left untouched. For a
+        language, the components are added in order. Ignored for a base config.
+    silent (bool): Don't print informational messages.
+    RETURNS (Config): The generated config.
+    RAISES (ValueError): If config_path, model and lang are all None.
+    """
+    # Treat None like "no pipeline given" so the membership checks and loops
+    # below never fail on a missing list.
+    pipeline = pipeline or []
+    if config_path is not None:
+        msg.info("Generating config from base config", show=not silent)
+        with show_validation_error(config_path, hint_init=False):
+            config = Config().from_disk(config_path)
+            try:
+                nlp, _ = load_model_from_config(config, auto_fill=True)
+            except ValueError as e:
+                msg.fail(str(e), exits=1)
+        return nlp.config
+    ext = f" with pipeline {pipeline}" if pipeline else ""
+    if model is not None:
+        msg.info(f"Generating config from model {model}{ext}", show=not silent)
+        nlp = load_model(model)
+        # Only reshape the pipeline if one was requested. Without this guard
+        # an empty pipeline would strip every component from the model.
+        if pipeline:
+            # Iterate over a copy, since components are removed in the loop.
+            for existing_pipe_name in list(nlp.pipe_names):
+                if existing_pipe_name not in pipeline:
+                    nlp.remove_pipe(existing_pipe_name)
+            for pipe_name in pipeline:
+                if pipe_name not in nlp.pipe_names:
+                    nlp.add_pipe(pipe_name)
+        return nlp.config
+    if lang is not None:
+        msg.info(f"Generating config for language '{lang}'{ext}", show=not silent)
+        nlp = get_lang_class(lang)()
+        for pipe_name in pipeline:
+            nlp.add_pipe(pipe_name)
+        return nlp.config
+    # Fail loudly instead of silently returning None for a Config return type.
+    raise ValueError(
+        "Can't generate config: expected one of config_path, model or lang "
+        "to be set, but all of them are None."
+    )
+
+
+def validate_cli_args(
+    config_path: Optional[Path], model: Optional[str], lang: Optional[str]
+) -> None:
+    """Check that exactly one of --base, --model and --lang was provided.
+
+    Reports a failure via msg.fail (with exits=1) if none or more than one of
+    the values is set.
+
+    config_path (Optional[Path]): Value of the --base option.
+    model (Optional[str]): Value of the --model option.
+    lang (Optional[str]): Value of the --lang option.
+    """
+    options = (("--base", config_path), ("--model", model), ("--lang", lang))
+    provided = [(flag, value) for flag, value in options if value is not None]
+    if len(provided) == 1:
+        return
+    # Echo back what was actually passed so the user can spot the conflict.
+    existing = " ".join(f"{flag} {value}" for flag, value in provided)
+    msg.fail(
+        "The init config command expects only one of the following arguments: "
+        "--base (base config to fill and update), --lang (language code to "
+        "use for blank config) or --model (base model to copy config from).",
+        f"Got: {existing if existing else 'no arguments'}",
+        exits=1,
+    )
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@ -10,14 +10,14 @@ import gzip
 import zipfile
 import srsly
 import warnings
-from wasabi import Printer
+from wasabi import msg, Printer
+import typer

-from ._util import app, Arg, Opt
+from ._util import app, init_cli, Arg, Opt
 from ..vectors import Vectors
 from ..errors import Errors, Warnings
 from ..language import Language
 from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
-from ..lookups import Lookups

 try:
    import ftfy
@ -28,9 +28,15 @@ except ImportError:
 DEFAULT_OOV_PROB = -20


-@app.command("init-model")
+@init_cli.command("model")
+@app.command(
+    "init-model",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+    hidden=True,  # hide this from main CLI help but still allow it to work with warning
+)
 def init_model_cli(
    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
    lang: str = Arg(..., help="Model language"),
    output_dir: Path = Arg(..., help="Model output directory"),
    freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
@ -48,6 +54,12 @@ def init_model_cli(
    Create a new model from raw data. If vectors are provided in Word2Vec format,
    they can be either a .txt or zipped as a .zip or .tar.gz.
    """
+    if ctx.command.name == "init-model":
+        msg.warn(
+            "The init-model command is now available via the 'init model' "
+            "subcommand (without the hyphen). You can run python -m spacy init "
+            "--help for an overview of the other available initialization commands."
+        )
    init_model(
        lang,
        output_dir,
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@ -87,8 +87,8 @@ def pretrain(
    else:
        msg.info("Using CPU")
    msg.info(f"Loading config from: {config_path}")
+    with show_validation_error(config_path):
        config = Config().from_disk(config_path)
-    with show_validation_error():
        nlp, config = util.load_model_from_config(config, overrides=config_overrides)
    # TODO: validate that [pretraining] block exists
    if not output_dir.exists():
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -79,10 +79,11 @@ def train(
    else:
        msg.info("Using CPU")
    msg.info(f"Loading config and nlp from: {config_path}")
+    with show_validation_error(config_path):
        config = Config().from_disk(config_path)
    if config.get("training", {}).get("seed") is not None:
        fix_random_seed(config["training"]["seed"])
-    with show_validation_error():
+    with show_validation_error(config_path):
        nlp, config = util.load_model_from_config(config, overrides=config_overrides)
    if config["training"]["base_model"]:
        # TODO: do something to check base_nlp against regular nlp described in config?
@ -245,9 +246,7 @@ def create_evaluation_callback(
    cfg: Union[Config, Dict[str, Any]],
 ) -> Callable[[], Tuple[float, Dict[str, float]]]:
    def evaluate() -> Tuple[float, Dict[str, float]]:
-        dev_examples = corpus.dev_dataset(
-            nlp, gold_preproc=cfg["gold_preproc"]
-        )
+        dev_examples = corpus.dev_dataset(nlp, gold_preproc=cfg["gold_preproc"])
        dev_examples = list(dev_examples)
        n_words = sum(len(ex.predicted) for ex in dev_examples)
        batch_size = cfg["eval_batch_size"]
--- a/spacy/cli/validate.py
+++ b/spacy/cli/validate.py
@ -13,8 +13,9 @@ from ..util import get_package_path, get_model_meta, is_compatible_version
@app.command("validate")
 def validate_cli():
    """
-    Validate that the currently installed version of spaCy is compatible
-    with the installed models. Should be run after `pip install -U spacy`.
+    Validate the currently installed models and spaCy version. Checks if the
+    installed models are compatible and shows upgrade instructions if available.
+    Should be run after `pip install -U spacy`.
    """
    validate()

--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@ -6,11 +6,11 @@ menu:
  - ['Download', 'download']
  - ['Info', 'info']
  - ['Validate', 'validate']
+  - ['Init', 'init']
  - ['Convert', 'convert']
  - ['Debug', 'debug']
  - ['Train', 'train']
  - ['Pretrain', 'pretrain']
-  - ['Init Model', 'init-model']
  - ['Evaluate', 'evaluate']
  - ['Package', 'package']
  - ['Project', 'project']
@ -94,6 +94,80 @@ $ python -m spacy validate
 | ---------- | -------- | --------------------------------------------------------- |
 | **PRINTS** | `stdout` | Details about the compatibility of your installed models. |

+## Init {#init new="3"}
+
+The `spacy init` CLI includes helpful commands for initializing training config
+files and model directories.
+
+### init config {#init-config new="3"}
+
+Initialize and export a [`config.cfg` file](/usage/training#config) for training
+and update it with all default values, if possible. Config files used for
+training should always be complete and not contain any hidden defaults or
+missing values, so this command helps you create your final config. It takes
+**one** of the following options:
+
+- `--base`: Base **config** to auto-fill, e.g. created using the
+  [training quickstart](/usage/training#quickstart) widget.
+- `--lang`: Base **language** code to use for blank config.
+- `--model`: Base **model** to copy config from.
+
+> ```bash
+> ### with base config {wrap="true"}
+> $ python -m spacy init config config.cfg --base base.cfg
+> ```
+>
+> ```bash
+> ### blank language {wrap="true"}
+> $ python -m spacy init config config.cfg --lang en --pipeline tagger,parser
+> ```
+
+```bash
+$ python -m spacy init config [output] [--base] [--lang] [--model] [--pipeline]
+```
+
+| Argument           | Type       | Description                                                                                                                                                           |
+| ------------------ | ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `output`           | positional | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file.                                                         |
+| `--base`, `-b`     | option     | Optional base config file to auto-fill with defaults.                                                                                                                 |
+| `--lang`, `-l`     | option     | Optional language code to use for blank config. If a `--pipeline` is specified, the components will be added in order.                                                |
+| `--model`, `-m`    | option     | Optional base model to copy config from. If a `--pipeline` is specified, only those components will be kept, and all other components not in the model will be added. |
+| `--pipeline`, `-p` | option     | Optional comma-separated pipeline of components to add to blank language or model.                                                                                    |
+| **CREATES**        | config     | Complete and auto-filled config file for training.                                                                                                                    |
+
+### init model {#init-model new="2"}
+
+<!-- TODO: update for v3 -->
+
+Create a new model directory from raw data, like word frequencies, Brown
+clusters and word vectors. This command is similar to the `spacy model` command
+in v1.x. Note that in order to populate the model's vocab, you need to pass in a
+JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) as
+`--jsonl-loc` with optional `id` values that correspond to the vectors table.
+Just loading in vectors will not automatically populate the vocab.
+
+<Infobox title="New in v3.0" variant="warning">
+
+The `init-model` command is now available as a subcommand of `spacy init`.
+
+</Infobox>
+
+```bash
+$ python -m spacy init model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
+[--prune-vectors]
+```
+
+| Argument                                                | Type       | Description                                                                                                                                                                                                                                            |
+| ------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `lang`                                                  | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`.                                                                                                                                                           |
+| `output_dir`                                            | positional | Model output directory. Will be created if it doesn't exist.                                                                                                                                                                                           |
+| `--jsonl-loc`, `-j`                                     | option     | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes.                                                                                                                                         |
+| `--vectors-loc`, `-v`                                   | option     | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
+| `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag> | option     | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation.                                                                                                                                                      |
+| `--prune-vectors`, `-V`                                 | option     | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning.                                                                                                                                                                         |
+| `--vectors-name`, `-vn`                                 | option     | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`.                                                                                                                                                                  |
+| **CREATES**                                             | model      | A spaCy model containing the vocab and vectors.                                                                                                                                                                                                        |
+
 ## Convert {#convert}

 Convert files into spaCy's
@ -469,32 +543,6 @@ tokenization can be provided.
 {"tokens": ["If", "tokens", "are", "provided", "then", "we", "can", "skip", "the", "raw", "input", "text"]}
 ```

-## Init Model {#init-model new="2"}
-
-Create a new model directory from raw data, like word frequencies, Brown
-clusters and word vectors. This command is similar to the `spacy model` command
-in v1.x. Note that in order to populate the model's vocab, you need to pass in a
-JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) as
-`--jsonl-loc` with optional `id` values that correspond to the vectors table.
-Just loading in vectors will not automatically populate the vocab.
-
-```bash
-$ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
-[--prune-vectors]
-```
-
-| Argument                                                    | Type       | Description                                                                                                                                                                                                                                            |
-| ----------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `lang`                                                      | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`.                                                                                                                                                           |
-| `output_dir`                                                | positional | Model output directory. Will be created if it doesn't exist.                                                                                                                                                                                           |
-| `--jsonl-loc`, `-j`                                         | option     | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes.                                                                                                                                         |
-| `--vectors-loc`, `-v`                                       | option     | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
-| `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag>     | option     | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation.                                                                                                                                                      |
-| `--prune-vectors`, `-V`                                     | option     | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning.                                                                                                                                                                         |
-| `--vectors-name`, `-vn`                                     | option     | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`.                                                                                                                                                                  |
-| `--omit-extra-lookups`, `-OEL` <Tag variant="new">2.3</Tag> | flag       | Do not include any of the extra lookups tables (`cluster`/`prob`/`sentiment`) from `spacy-lookups-data` in the model.                                                                                                                                  |
-| **CREATES**                                                 | model      | A spaCy model containing the vocab and vectors.                                                                                                                                                                                                        |
-
 ## Evaluate {#evaluate new="2"}

 <!-- TODO: document new evaluate command -->
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@ -44,24 +44,12 @@ following data and information:
 2. A [`config.cfg`](#config) **configuration file** with all settings and
   hyperparameters.
 3. An optional **Python file** to register
-   [custom models and architectures](#custom-models).
-
-<!-- TODO: decide how we want to present the "getting started" workflow here, get a default config etc. -->
+   [custom functions and architectures](#custom-code).

 ```bash
 $ python -m spacy train train.spacy dev.spacy config.cfg --output ./output
 ```

-> #### Tip: Debug your data
->
-> The [`debug-data` command](/api/cli#debug-data) lets you analyze and validate
-> your training and development data, get useful stats, and find problems like
-> invalid entity annotations, cyclic dependencies, low data labels and more.
->
-> ```bash
-> $ python -m spacy debug-data en train.spacy dev.spacy --verbose
-> ```
-
 <Project id="some_example_project">

 The easiest way to get started with an end-to-end training process is to clone a
@ -74,16 +62,42 @@ workflows, from data preprocessing to training and packaging your model.

 > #### Instructions
 >
-> 1. Select your requirements and settings. The quickstart widget will
->    auto-generate a recommended starter config for you.
+> 1. Select your requirements and settings.
 > 2. Use the buttons at the bottom to save the result to your clipboard or a
->    file `config.cfg`.
-> 3. TOOD: recommended approach for filling config
-> 4. Run [`spacy train`](/api/cli#train) with your config and data.
+>    file `base_config.cfg`.
+> 3. Run [`init config`](/api/cli#init-config) to create a full training config.
+> 4. Run [`train`](/api/cli#train) with your config and data.

 import QuickstartTraining from 'widgets/quickstart-training.js'

-<QuickstartTraining />
+<QuickstartTraining download="base_config.cfg" />
+
+After you've saved the starter config to a file `base_config.cfg`, you can use
+the [`init config`](/api/cli#init-config) command to fill in the remaining
+defaults. Training configs should always be **complete and without hidden
+defaults**, to keep your experiments reproducible.
+
+```bash
+$ python -m spacy init config config.cfg --base base_config.cfg
+```
+
+> #### Tip: Debug your data
+>
+> The [`debug-data` command](/api/cli#debug-data) lets you analyze and validate
+> your training and development data, get useful stats, and find problems like
+> invalid entity annotations, cyclic dependencies, low data labels and more.
+>
+> ```bash
+> $ python -m spacy debug-data en train.spacy dev.spacy --verbose
+> ```
+
+You can now run [`train`](/api/cli#train) with your training and development
+data and the training config. See the [`convert`](/api/cli#convert) command for
+details on how to convert your data to spaCy's binary `.spacy` format.
+
+```bash
+$ python -m spacy train train.spacy dev.spacy config.cfg --output ./output
+```

 ## Training config {#config}

--- a/website/docs/usage/transformers.md
+++ b/website/docs/usage/transformers.md
@ -165,10 +165,8 @@ resolved, the function is created and passed into the model as an argument.
 Remember that the `config.cfg` used for training should contain **no missing
 values** and requires all settings to be defined. You don't want any hidden
 defaults creeping in and changing your results! spaCy will tell you if settings
-are missing, and you can run [`spacy debug config`](/api/cli#debug-config) with
-`--auto-fill` to automatically fill in all defaults.
-
-<!-- TODO: update with details on getting started with a config -->
+are missing, and you can run [`spacy init config`](/api/cli#init-config) to
+automatically fill in all defaults.

 </Infobox>