diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index 72fac05a6..bc47ffdef 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -15,6 +15,7 @@ from .debug_model import debug_model # noqa: F401
from .evaluate import evaluate # noqa: F401
from .convert import convert # noqa: F401
from .init_model import init_model # noqa: F401
+from .init_config import init_config # noqa: F401
from .validate import validate # noqa: F401
from .project.clone import project_clone # noqa: F401
from .project.assets import project_assets # noqa: F401
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index f277988f8..0130e60bb 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -31,6 +31,7 @@ DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes
commands to check and validate your config files, training and evaluation data,
and custom model implementations.
"""
+INIT_HELP = """Commands for initializing configs and models."""
# Wrappers for Typer's annotations. Initially created to set defaults and to
# keep the names short, but not needed at the moment.
@@ -40,9 +41,11 @@ Opt = typer.Option
app = typer.Typer(name=NAME, help=HELP)
project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
+init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
app.add_typer(project_cli)
app.add_typer(debug_cli)
+app.add_typer(init_cli)
def setup_cli() -> None:
@@ -172,16 +175,34 @@ def get_checksum(path: Union[Path, str]) -> str:
@contextmanager
-def show_validation_error(title: str = "Config validation error"):
+def show_validation_error(
+ file_path: Optional[Union[str, Path]] = None,
+ *,
+ title: str = "Config validation error",
+ hint_init: bool = True,
+):
"""Helper to show custom config validation errors on the CLI.
+ file_path (str / Path): Optional file path of config file, used in hints.
title (str): Title of the custom formatted error.
+ hint_init (bool): Show hint about filling config.
"""
try:
yield
except (ConfigValidationError, InterpolationError) as e:
msg.fail(title, spaced=True)
- print(str(e).replace("Config validation error", "").strip())
+ # TODO: This is kinda hacky and we should probably provide a better
+ # helper for this in Thinc
+ err_text = str(e).replace("Config validation error", "").strip()
+ print(err_text)
+ if hint_init and "field required" in err_text:
+ config_path = file_path if file_path is not None else "config.cfg"
+ msg.text(
+ "If your config contains missing values, you can run the 'init "
+ "config' command to fill in all the defaults, if possible:",
+ spaced=True,
+ )
+ print(f"{COMMAND} init config {config_path} --base {config_path}\n")
sys.exit(1)
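
For reference, a minimal usage sketch of the updated helper (assuming the spaCy v3 nightly layout, where it lives in `spacy.cli._util`): config loading happens inside the context manager, so an invalid or incomplete config exits with the formatted error and, when required fields are missing, the `init config` hint.

```python
# Minimal sketch, assuming spaCy v3 nightly with this change applied.
from pathlib import Path
from thinc.api import Config
from spacy.cli._util import show_validation_error

config_path = Path("config.cfg")
with show_validation_error(config_path):
    # On ConfigValidationError/InterpolationError, the context manager prints
    # the formatted error (plus the 'init config' hint for missing required
    # fields) and calls sys.exit(1).
    config = Config().from_disk(config_path)
```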
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index fa6f7a7d5..0701992da 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -33,7 +33,6 @@ def debug_config_cli(
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
- output_path: Optional[Path] = Opt(None, "--output", "-o", help="Output path for filled config or '-' for standard output", allow_dash=True),
auto_fill: bool = Opt(False, "--auto-fill", "-F", help="Whether or not to auto-fill the config with built-in defaults if possible"),
diff: bool = Opt(False, "--diff", "-D", help="Show a visual diff if config was auto-filled")
# fmt: on
@@ -49,7 +48,7 @@ def debug_config_cli(
"""
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
- with show_validation_error():
+ with show_validation_error(config_path):
config = Config().from_disk(config_path)
try:
nlp, _ = util.load_model_from_config(
@@ -57,7 +56,6 @@ def debug_config_cli(
)
except ValueError as e:
msg.fail(str(e), exits=1)
- is_stdout = output_path is not None and str(output_path) == "-"
if auto_fill:
orig_config = config.to_str()
filled_config = nlp.config.to_str()
@@ -68,12 +66,7 @@ def debug_config_cli(
if diff:
print(diff_strings(config.to_str(), nlp.config.to_str()))
else:
- msg.good("Original config is valid", show=not is_stdout)
- if is_stdout:
- print(nlp.config.to_str())
- elif output_path is not None:
- nlp.config.to_disk(output_path)
- msg.good(f"Saved updated config to {output_path}")
+ msg.good("Original config is valid")
@debug_cli.command(
@@ -142,7 +135,7 @@ def debug_data(
msg.fail("Development data not found", dev_path, exits=1)
if not config_path.exists():
msg.fail("Config file not found", config_path, exists=1)
- with show_validation_error():
+ with show_validation_error(config_path):
cfg = Config().from_disk(config_path)
nlp, config = util.load_model_from_config(cfg, overrides=config_overrides)
# TODO: handle base model
diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index 88e060238..e98ddbe05 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -50,8 +50,8 @@ def debug_model_cli(
"print_prediction": P3,
}
config_overrides = parse_config_overrides(ctx.args)
- cfg = Config().from_disk(config_path)
- with show_validation_error():
+ with show_validation_error(config_path):
+ cfg = Config().from_disk(config_path)
try:
_, config = util.load_model_from_config(cfg, overrides=config_overrides)
except ValueError as e:
diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py
new file mode 100644
index 000000000..01664ee40
--- /dev/null
+++ b/spacy/cli/init_config.py
@@ -0,0 +1,81 @@
+from typing import Optional, List
+from pathlib import Path
+from thinc.api import Config
+from wasabi import msg
+
+from ..util import load_model_from_config, get_lang_class, load_model
+from ._util import init_cli, Arg, Opt, show_validation_error
+
+
+@init_cli.command("config")
+def init_config_cli(
+ # fmt: off
+ output_path: Path = Arg("-", help="Output path or - for stdout", allow_dash=True),
+ base_path: Optional[Path] = Opt(None, "--base", "-b", help="Optional base config to fill", exists=True, dir_okay=False),
+ model: Optional[str] = Opt(None, "--model", "-m", help="Optional model to copy config from"),
+ lang: Optional[str] = Opt(None, "--lang", "-l", help="Optional language code for blank config"),
+ pipeline: Optional[str] = Opt(None, "--pipeline", "-p", help="Optional pipeline components to use")
+ # fmt: on
+):
+ """Generate a starter config.cfg for training."""
+ validate_cli_args(base_path, model, lang)
+ is_stdout = str(output_path) == "-"
+ pipeline = [p.strip() for p in pipeline.split(",")] if pipeline else []
+ cfg = init_config(output_path, base_path, model, lang, pipeline, silent=is_stdout)
+ if is_stdout:
+ print(cfg.to_str())
+ else:
+ cfg.to_disk(output_path)
+ msg.good("Saved config", output_path)
+
+
+def init_config(
+ output_path: Path,
+ config_path: Optional[Path],
+ model: Optional[str],
+ lang: Optional[str],
+ pipeline: Optional[List[str]],
+ silent: bool = False,
+) -> Config:
+ if config_path is not None:
+ msg.info("Generating config from base config", show=not silent)
+ with show_validation_error(config_path, hint_init=False):
+ config = Config().from_disk(config_path)
+ try:
+ nlp, _ = load_model_from_config(config, auto_fill=True)
+ except ValueError as e:
+ msg.fail(str(e), exits=1)
+ return nlp.config
+ if model is not None:
+ ext = f" with pipeline {pipeline}" if pipeline else ""
+ msg.info(f"Generating config from model {model}{ext}", show=not silent)
+ nlp = load_model(model)
+ for existing_pipe_name in nlp.pipe_names:
+ if existing_pipe_name not in pipeline:
+ nlp.remove_pipe(existing_pipe_name)
+ for pipe_name in pipeline:
+ if pipe_name not in nlp.pipe_names:
+ nlp.add_pipe(pipe_name)
+ return nlp.config
+ if lang is not None:
+ ext = f" with pipeline {pipeline}" if pipeline else ""
+ msg.info(f"Generating config for language '{lang}'{ext}", show=not silent)
+ nlp = get_lang_class(lang)()
+ for pipe_name in pipeline:
+ nlp.add_pipe(pipe_name)
+ return nlp.config
+
+
+def validate_cli_args(
+ config_path: Optional[Path], model: Optional[str], lang: Optional[str]
+) -> None:
+ args = {"--base": config_path, "--model": model, "--lang": lang}
+ if sum(arg is not None for arg in args.values()) != 1:
+ existing = " ".join(f"{a} {v}" for a, v in args.items() if v is not None)
+ msg.fail(
+ "The init config command expects only one of the following arguments: "
+ "--base (base config to fill and update), --lang (language code to "
+ "use for blank config) or --model (base model to copy config from).",
+ f"Got: {existing if existing else 'no arguments'}",
+ exits=1,
+ )
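
For context, a hedged sketch of calling the new helper programmatically rather than via the CLI (argument names as defined above; assumes a spaCy v3 nightly install where `spacy.cli.init_config` is importable):

```python
# Rough equivalent of:
#   python -m spacy init config config.cfg --lang en --pipeline tagger,parser
from pathlib import Path
from spacy.cli.init_config import init_config

config = init_config(
    output_path=Path("config.cfg"),
    config_path=None,   # no --base config to fill
    model=None,         # no --model to copy from
    lang="en",          # blank English pipeline
    pipeline=["tagger", "parser"],
    silent=True,
)
config.to_disk("config.cfg")  # the CLI wrapper normally handles writing
```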
diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index e1dca2395..4fdd2bbbc 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -10,14 +10,14 @@ import gzip
import zipfile
import srsly
import warnings
-from wasabi import Printer
+from wasabi import msg, Printer
+import typer
-from ._util import app, Arg, Opt
+from ._util import app, init_cli, Arg, Opt
from ..vectors import Vectors
from ..errors import Errors, Warnings
from ..language import Language
from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
-from ..lookups import Lookups
try:
import ftfy
@@ -28,9 +28,15 @@ except ImportError:
DEFAULT_OOV_PROB = -20
-@app.command("init-model")
+@init_cli.command("model")
+@app.command(
+ "init-model",
+ context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+ hidden=True, # hide this from main CLI help but still allow it to work with warning
+)
def init_model_cli(
# fmt: off
+ ctx: typer.Context, # This is only used to read additional arguments
lang: str = Arg(..., help="Model language"),
output_dir: Path = Arg(..., help="Model output directory"),
freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
@@ -48,6 +54,12 @@ def init_model_cli(
Create a new model from raw data. If vectors are provided in Word2Vec format,
they can be either a .txt or zipped as a .zip or .tar.gz.
"""
+ if ctx.command.name == "init-model":
+ msg.warn(
+ "The init-model command is now available via the 'init model' "
+ "subcommand (without the hyphen). You can run python -m spacy init "
+ "--help for an overview of the other available initialization commands."
+ )
init_model(
lang,
output_dir,
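
The hidden alias above uses a generic Typer pattern: one callback registered both on the new `init` sub-app and, hidden, under its legacy top-level name, with `ctx.command.name` telling the two invocations apart. A small self-contained sketch (hypothetical app and messages, not spaCy code):

```python
import typer

app = typer.Typer(name="demo")
init_cli = typer.Typer(name="init")
app.add_typer(init_cli)


@init_cli.command("model")               # new spelling: "demo init model"
@app.command("init-model", hidden=True)  # legacy spelling, hidden from --help
def init_model_cli(ctx: typer.Context, lang: str):
    if ctx.command.name == "init-model":
        typer.echo("'init-model' is deprecated, use 'init model' instead")
    typer.echo(f"Initializing a '{lang}' model")


if __name__ == "__main__":
    app()
```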
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 7d1a217be..23de5f452 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -87,8 +87,8 @@ def pretrain(
else:
msg.info("Using CPU")
msg.info(f"Loading config from: {config_path}")
- config = Config().from_disk(config_path)
- with show_validation_error():
+ with show_validation_error(config_path):
+ config = Config().from_disk(config_path)
nlp, config = util.load_model_from_config(config, overrides=config_overrides)
# TODO: validate that [pretraining] block exists
if not output_dir.exists():
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index b0bc145ff..5a89ed6e8 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -79,10 +79,11 @@ def train(
else:
msg.info("Using CPU")
msg.info(f"Loading config and nlp from: {config_path}")
- config = Config().from_disk(config_path)
+ with show_validation_error(config_path):
+ config = Config().from_disk(config_path)
if config.get("training", {}).get("seed") is not None:
fix_random_seed(config["training"]["seed"])
- with show_validation_error():
+ with show_validation_error(config_path):
nlp, config = util.load_model_from_config(config, overrides=config_overrides)
if config["training"]["base_model"]:
# TODO: do something to check base_nlp against regular nlp described in config?
@@ -245,9 +246,7 @@ def create_evaluation_callback(
cfg: Union[Config, Dict[str, Any]],
) -> Callable[[], Tuple[float, Dict[str, float]]]:
def evaluate() -> Tuple[float, Dict[str, float]]:
- dev_examples = corpus.dev_dataset(
- nlp, gold_preproc=cfg["gold_preproc"]
- )
+ dev_examples = corpus.dev_dataset(nlp, gold_preproc=cfg["gold_preproc"])
dev_examples = list(dev_examples)
n_words = sum(len(ex.predicted) for ex in dev_examples)
batch_size = cfg["eval_batch_size"]
diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py
index 0580d34c5..e6ba284df 100644
--- a/spacy/cli/validate.py
+++ b/spacy/cli/validate.py
@@ -13,8 +13,9 @@ from ..util import get_package_path, get_model_meta, is_compatible_version
@app.command("validate")
def validate_cli():
"""
- Validate that the currently installed version of spaCy is compatible
- with the installed models. Should be run after `pip install -U spacy`.
+ Validate the currently installed models and spaCy version. Checks if the
+ installed models are compatible and shows upgrade instructions if available.
+ Should be run after `pip install -U spacy`.
"""
validate()
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 4690029aa..0ea67747e 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -6,11 +6,11 @@ menu:
- ['Download', 'download']
- ['Info', 'info']
- ['Validate', 'validate']
+ - ['Init', 'init']
- ['Convert', 'convert']
- ['Debug', 'debug']
- ['Train', 'train']
- ['Pretrain', 'pretrain']
- - ['Init Model', 'init-model']
- ['Evaluate', 'evaluate']
- ['Package', 'package']
- ['Project', 'project']
@@ -94,6 +94,80 @@ $ python -m spacy validate
| ---------- | -------- | --------------------------------------------------------- |
| **PRINTS** | `stdout` | Details about the compatibility of your installed models. |
+## Init {#init new="3"}
+
+The `spacy init` CLI includes helpful commands for initializing training config
+files and model directories.
+
+### init config {#init-config new="3"}
+
+Initialize and export a [`config.cfg` file](/usage/training#config) for training
+and update it with all default values, if possible. Config files used for
+training should always be complete and not contain any hidden defaults or
+missing values, so this command helps you create your final config. It takes
+**one** of the following options:
+
+- `--base`: Base **config** to auto-fill, e.g. created using the
+ [training quickstart](/usage/training#quickstart) widget.
+- `--lang`: Base **language** code to use for blank config.
+- `--model`: Base **model** to copy config from.
+
+> ```bash
+> ### with base config {wrap="true"}
+> $ python -m spacy init config config.cfg --base base.cfg
+> ```
+>
+> ```bash
+> ### blank language {wrap="true"}
+> $ python -m spacy init config config.cfg --lang en --pipeline tagger,parser
+> ```
+
+```bash
+$ python -m spacy init config [output] [--base] [--lang] [--model] [--pipeline]
+```
+
+| Argument | Type | Description |
+| ------------------ | ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `output` | positional | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. |
+| `--base`, `-b` | option | Optional base config file to auto-fill with defaults. |
+| `--lang`, `-l` | option | Optional language code to use for blank config. If a `--pipeline` is specified, the components will be added in order. |
+| `--model`, `-m`    | option     | Optional base model to copy the config from. If a `--pipeline` is specified, only those components will be kept, and any listed components missing from the model will be added. |
+| `--pipeline`, `-p` | option     | Optional comma-separated pipeline of components to add to the blank language or model.                                                                                            |
+| **CREATES** | config | Complete and auto-filled config file for training. |
+
+### init model {#init-model new="2"}
+
+
+
+Create a new model directory from raw data, like word frequencies, Brown
+clusters and word vectors. This command is similar to the `spacy model` command
+in v1.x. Note that in order to populate the model's vocab, you need to pass in a
+JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) as
+`--jsonl-loc` with optional `id` values that correspond to the vectors table.
+Just loading in vectors will not automatically populate the vocab.
+
+
+
+The `init-model` command is now available as a subcommand of `spacy init`.
+
+
+
+```bash
+$ python -m spacy init model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
+[--prune-vectors]
+```
+
+| Argument | Type | Description |
+| ------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. |
+| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. |
+| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. |
+| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
+| `--truncate-vectors`, `-t` 2.3 | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. |
+| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
+| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. |
+| **CREATES** | model | A spaCy model containing the vocab and vectors. |
+
## Convert {#convert}
Convert files into spaCy's
@@ -469,32 +543,6 @@ tokenization can be provided.
{"tokens": ["If", "tokens", "are", "provided", "then", "we", "can", "skip", "the", "raw", "input", "text"]}
```
-## Init Model {#init-model new="2"}
-
-Create a new model directory from raw data, like word frequencies, Brown
-clusters and word vectors. This command is similar to the `spacy model` command
-in v1.x. Note that in order to populate the model's vocab, you need to pass in a
-JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) as
-`--jsonl-loc` with optional `id` values that correspond to the vectors table.
-Just loading in vectors will not automatically populate the vocab.
-
-```bash
-$ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
-[--prune-vectors]
-```
-
-| Argument | Type | Description |
-| ----------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. |
-| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. |
-| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. |
-| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
-| `--truncate-vectors`, `-t` 2.3 | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. |
-| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
-| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. |
-| `--omit-extra-lookups`, `-OEL` 2.3 | flag | Do not include any of the extra lookups tables (`cluster`/`prob`/`sentiment`) from `spacy-lookups-data` in the model. |
-| **CREATES** | model | A spaCy model containing the vocab and vectors. |
-
## Evaluate {#evaluate new="2"}
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 635b52c89..955e484fb 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -44,24 +44,12 @@ following data and information:
2. A [`config.cfg`](#config) **configuration file** with all settings and
hyperparameters.
3. An optional **Python file** to register
- [custom models and architectures](#custom-models).
-
-
+ [custom functions and architectures](#custom-code).
```bash
$ python -m spacy train train.spacy dev.spacy config.cfg --output ./output
```
-> #### Tip: Debug your data
->
-> The [`debug-data` command](/api/cli#debug-data) lets you analyze and validate
-> your training and development data, get useful stats, and find problems like
-> invalid entity annotations, cyclic dependencies, low data labels and more.
->
-> ```bash
-> $ python -m spacy debug-data en train.spacy dev.spacy --verbose
-> ```
-
The easiest way to get started with an end-to-end training process is to clone a
@@ -74,16 +62,42 @@ workflows, from data preprocessing to training and packaging your model.
> #### Instructions
>
-> 1. Select your requirements and settings. The quickstart widget will
-> auto-generate a recommended starter config for you.
+> 1. Select your requirements and settings.
> 2. Use the buttons at the bottom to save the result to your clipboard or a
-> file `config.cfg`.
-> 3. TOOD: recommended approach for filling config
-> 4. Run [`spacy train`](/api/cli#train) with your config and data.
+> file `base_config.cfg`.
+> 3. Run [`init config`](/api/cli#init-config) to create a full training config.
+> 4. Run [`train`](/api/cli#train) with your config and data.
import QuickstartTraining from 'widgets/quickstart-training.js'
-
+
+
+After you've saved the starter config to a file `base_config.cfg`, you can use
+the [`init config`](/api/cli#init-config) command to fill in the remaining
+defaults. Training configs should always be **complete and without hidden
+defaults**, to keep your experiments reproducible.
+
+```bash
+$ python -m spacy init config config.cfg --base base_config.cfg
+```
+
+> #### Tip: Debug your data
+>
+> The [`debug-data` command](/api/cli#debug-data) lets you analyze and validate
+> your training and development data, get useful stats, and find problems like
+> invalid entity annotations, cyclic dependencies, low data labels and more.
+>
+> ```bash
+> $ python -m spacy debug-data en train.spacy dev.spacy --verbose
+> ```
+
+You can now run [`train`](/api/cli#train) with your training and development
+data and the training config. See the [`convert`](/api/cli#convert) command for
+details on how to convert your data to spaCy's binary `.spacy` format.
+
+```bash
+$ python -m spacy train train.spacy dev.spacy config.cfg --output ./output
+```
## Training config {#config}
diff --git a/website/docs/usage/transformers.md b/website/docs/usage/transformers.md
index 81bd45f58..b837c62de 100644
--- a/website/docs/usage/transformers.md
+++ b/website/docs/usage/transformers.md
@@ -165,10 +165,8 @@ resolved, the function is created and passed into the model as an argument.
Remember that the `config.cfg` used for training should contain **no missing
values** and requires all settings to be defined. You don't want any hidden
defaults creeping in and changing your results! spaCy will tell you if settings
-are missing, and you can run [`spacy debug config`](/api/cli#debug-config) with
-`--auto-fill` to automatically fill in all defaults.
-
-
+are missing, and you can run [`spacy init config`](/api/cli#init-config) to
+automatically fill in all defaults.
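
A rough sketch of what filling in all defaults amounts to at the API level, assuming the v3 nightly API used elsewhere in this diff (`load_model_from_config(..., auto_fill=True)` returning the constructed `nlp`):

```python
# Rough sketch, assuming the spaCy v3 nightly API shown in this diff.
from thinc.api import Config
from spacy.util import load_model_from_config

partial = Config().from_disk("base_config.cfg")            # config with missing values
nlp, _ = load_model_from_config(partial, auto_fill=True)   # fill built-in defaults
nlp.config.to_disk("config.cfg")                           # complete, reproducible config
```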