Add init CLI and init config (#5854)

* Add init CLI and init config draft

* Improve config validation

* Auto-format

* Don't export anything in debug config

* Update docs
Ines Montani 2020-08-02 15:18:30 +02:00 committed by GitHub
parent e393ebd78b
commit 4c055f0aa7
12 changed files with 245 additions and 77 deletions


@@ -15,6 +15,7 @@ from .debug_model import debug_model # noqa: F401
from .evaluate import evaluate # noqa: F401
from .convert import convert # noqa: F401
from .init_model import init_model # noqa: F401
from .init_config import init_config # noqa: F401
from .validate import validate # noqa: F401
from .project.clone import project_clone # noqa: F401
from .project.assets import project_assets # noqa: F401


@@ -31,6 +31,7 @@ DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes
commands to check and validate your config files, training and evaluation data,
and custom model implementations.
"""
INIT_HELP = """Commands for initializing configs and models."""
# Wrappers for Typer's annotations. Initially created to set defaults and to
# keep the names short, but not needed at the moment.
@@ -40,9 +41,11 @@ Opt = typer.Option
app = typer.Typer(name=NAME, help=HELP)
project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
app.add_typer(project_cli)
app.add_typer(debug_cli)
app.add_typer(init_cli)
def setup_cli() -> None:
@@ -172,16 +175,34 @@ def get_checksum(path: Union[Path, str]) -> str:
@contextmanager
def show_validation_error(title: str = "Config validation error"):
def show_validation_error(
file_path: Optional[Union[str, Path]] = None,
*,
title: str = "Config validation error",
hint_init: bool = True,
):
"""Helper to show custom config validation errors on the CLI.
file_path (str / Path): Optional file path of config file, used in hints.
title (str): Title of the custom formatted error.
hint_init (bool): Show hint about filling config.
"""
try:
yield
except (ConfigValidationError, InterpolationError) as e:
msg.fail(title, spaced=True)
print(str(e).replace("Config validation error", "").strip())
# TODO: This is kinda hacky and we should probably provide a better
# helper for this in Thinc
err_text = str(e).replace("Config validation error", "").strip()
print(err_text)
if hint_init and "field required" in err_text:
config_path = file_path if file_path is not None else "config.cfg"
msg.text(
"If your config contains missing values, you can run the 'init "
"config' command to fill in all the defaults, if possible:",
spaced=True,
)
print(f"{COMMAND} init config {config_path} --base {config_path}\n")
sys.exit(1)


@@ -33,7 +33,6 @@ def debug_config_cli(
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
output_path: Optional[Path] = Opt(None, "--output", "-o", help="Output path for filled config or '-' for standard output", allow_dash=True),
auto_fill: bool = Opt(False, "--auto-fill", "-F", help="Whether or not to auto-fill the config with built-in defaults if possible"),
diff: bool = Opt(False, "--diff", "-D", help="Show a visual diff if config was auto-filled")
# fmt: on
@@ -49,7 +48,7 @@ def debug_config_cli(
"""
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
with show_validation_error():
with show_validation_error(config_path):
config = Config().from_disk(config_path)
try:
nlp, _ = util.load_model_from_config(
@@ -57,7 +56,6 @@
)
except ValueError as e:
msg.fail(str(e), exits=1)
is_stdout = output_path is not None and str(output_path) == "-"
if auto_fill:
orig_config = config.to_str()
filled_config = nlp.config.to_str()
@@ -68,12 +66,7 @@
if diff:
print(diff_strings(config.to_str(), nlp.config.to_str()))
else:
msg.good("Original config is valid", show=not is_stdout)
if is_stdout:
print(nlp.config.to_str())
elif output_path is not None:
nlp.config.to_disk(output_path)
msg.good(f"Saved updated config to {output_path}")
msg.good("Original config is valid")
@debug_cli.command(
@@ -142,7 +135,7 @@ def debug_data(
msg.fail("Development data not found", dev_path, exits=1)
if not config_path.exists():
msg.fail("Config file not found", config_path, exists=1)
with show_validation_error():
with show_validation_error(config_path):
cfg = Config().from_disk(config_path)
nlp, config = util.load_model_from_config(cfg, overrides=config_overrides)
# TODO: handle base model


@@ -50,8 +50,8 @@ def debug_model_cli(
"print_prediction": P3,
}
config_overrides = parse_config_overrides(ctx.args)
cfg = Config().from_disk(config_path)
with show_validation_error():
with show_validation_error(config_path):
cfg = Config().from_disk(config_path)
try:
_, config = util.load_model_from_config(cfg, overrides=config_overrides)
except ValueError as e:

spacy/cli/init_config.py (new file, 81 lines)

@@ -0,0 +1,81 @@
from typing import Optional, List
from pathlib import Path
from thinc.api import Config
from wasabi import msg

from ..util import load_model_from_config, get_lang_class, load_model
from ._util import init_cli, Arg, Opt, show_validation_error


@init_cli.command("config")
def init_config_cli(
    # fmt: off
    output_path: Path = Arg("-", help="Output path or - for stdout", allow_dash=True),
    base_path: Optional[Path] = Opt(None, "--base", "-b", help="Optional base config to fill", exists=True, dir_okay=False),
    model: Optional[str] = Opt(None, "--model", "-m", help="Optional model to copy config from"),
    lang: Optional[str] = Opt(None, "--lang", "-l", help="Optional language code for blank config"),
    pipeline: Optional[str] = Opt(None, "--pipeline", "-p", help="Optional pipeline components to use")
    # fmt: on
):
    """Generate a starter config.cfg for training."""
    validate_cli_args(base_path, model, lang)
    is_stdout = str(output_path) == "-"
    pipeline = [p.strip() for p in pipeline.split(",")] if pipeline else []
    cfg = init_config(output_path, base_path, model, lang, pipeline, silent=is_stdout)
    if is_stdout:
        print(cfg.to_str())
    else:
        cfg.to_disk(output_path)
        msg.good("Saved config", output_path)


def init_config(
    output_path: Path,
    config_path: Optional[Path],
    model: Optional[str],
    lang: Optional[str],
    pipeline: Optional[List[str]],
    silent: bool = False,
) -> Config:
    if config_path is not None:
        msg.info("Generating config from base config", show=not silent)
        with show_validation_error(config_path, hint_init=False):
            config = Config().from_disk(config_path)
        try:
            nlp, _ = load_model_from_config(config, auto_fill=True)
        except ValueError as e:
            msg.fail(str(e), exits=1)
        return nlp.config
    if model is not None:
        ext = f" with pipeline {pipeline}" if pipeline else ""
        msg.info(f"Generating config from model {model}{ext}", show=not silent)
        nlp = load_model(model)
        for existing_pipe_name in nlp.pipe_names:
            if existing_pipe_name not in pipeline:
                nlp.remove_pipe(existing_pipe_name)
        for pipe_name in pipeline:
            if pipe_name not in nlp.pipe_names:
                nlp.add_pipe(pipe_name)
        return nlp.config
    if lang is not None:
        ext = f" with pipeline {pipeline}" if pipeline else ""
        msg.info(f"Generating config for language '{lang}'{ext}", show=not silent)
        nlp = get_lang_class(lang)()
        for pipe_name in pipeline:
            nlp.add_pipe(pipe_name)
        return nlp.config


def validate_cli_args(
    config_path: Optional[Path], model: Optional[str], lang: Optional[str]
) -> None:
    args = {"--base": config_path, "--model": model, "--lang": lang}
    if sum(arg is not None for arg in args.values()) != 1:
        existing = " ".join(f"{a} {v}" for a, v in args.items() if v is not None)
        msg.fail(
            "The init config command expects only one of the following arguments: "
            "--base (base config to fill and update), --lang (language code to "
            "use for blank config) or --model (base model to copy config from).",
            f"Got: {existing if existing else 'no arguments'}",
            exits=1,
        )
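
The new command expects exactly one of `--base`, `--lang` or `--model` (enforced by `validate_cli_args` above). A rough sketch of the three invocations, using placeholder file and package names (`base_config.cfg`, `en_core_web_sm`):

```bash
# fill in a partial base config with all defaults
$ python -m spacy init config config.cfg --base base_config.cfg
# start from a blank language and add components in order
$ python -m spacy init config config.cfg --lang en --pipeline tagger,parser
# copy the config from an installed model, keeping only the listed components
$ python -m spacy init config config.cfg --model en_core_web_sm --pipeline ner
```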


@@ -10,14 +10,14 @@ import gzip
import zipfile
import srsly
import warnings
from wasabi import Printer
from wasabi import msg, Printer
import typer
from ._util import app, Arg, Opt
from ._util import app, init_cli, Arg, Opt
from ..vectors import Vectors
from ..errors import Errors, Warnings
from ..language import Language
from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
from ..lookups import Lookups
try:
import ftfy
@@ -28,9 +28,15 @@ except ImportError:
DEFAULT_OOV_PROB = -20
@app.command("init-model")
@init_cli.command("model")
@app.command(
"init-model",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
hidden=True, # hide this from main CLI help but still allow it to work with warning
)
def init_model_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
lang: str = Arg(..., help="Model language"),
output_dir: Path = Arg(..., help="Model output directory"),
freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
@@ -48,6 +54,12 @@ def init_model_cli(
Create a new model from raw data. If vectors are provided in Word2Vec format,
they can be either a .txt or zipped as a .zip or .tar.gz.
"""
if ctx.command.name == "init-model":
msg.warn(
"The init-model command is now available via the 'init model' "
"subcommand (without the hyphen). You can run python -m spacy init "
"--help for an overview of the other available initialization commands."
)
init_model(
lang,
output_dir,


@@ -87,8 +87,8 @@ def pretrain(
else:
msg.info("Using CPU")
msg.info(f"Loading config from: {config_path}")
config = Config().from_disk(config_path)
with show_validation_error():
with show_validation_error(config_path):
config = Config().from_disk(config_path)
nlp, config = util.load_model_from_config(config, overrides=config_overrides)
# TODO: validate that [pretraining] block exists
if not output_dir.exists():


@@ -79,10 +79,11 @@ def train(
else:
msg.info("Using CPU")
msg.info(f"Loading config and nlp from: {config_path}")
config = Config().from_disk(config_path)
with show_validation_error(config_path):
config = Config().from_disk(config_path)
if config.get("training", {}).get("seed") is not None:
fix_random_seed(config["training"]["seed"])
with show_validation_error():
with show_validation_error(config_path):
nlp, config = util.load_model_from_config(config, overrides=config_overrides)
if config["training"]["base_model"]:
# TODO: do something to check base_nlp against regular nlp described in config?
@@ -245,9 +246,7 @@ def create_evaluation_callback(
cfg: Union[Config, Dict[str, Any]],
) -> Callable[[], Tuple[float, Dict[str, float]]]:
def evaluate() -> Tuple[float, Dict[str, float]]:
dev_examples = corpus.dev_dataset(
nlp, gold_preproc=cfg["gold_preproc"]
)
dev_examples = corpus.dev_dataset(nlp, gold_preproc=cfg["gold_preproc"])
dev_examples = list(dev_examples)
n_words = sum(len(ex.predicted) for ex in dev_examples)
batch_size = cfg["eval_batch_size"]


@@ -13,8 +13,9 @@ from ..util import get_package_path, get_model_meta, is_compatible_version
@app.command("validate")
def validate_cli():
"""
Validate that the currently installed version of spaCy is compatible
with the installed models. Should be run after `pip install -U spacy`.
Validate the currently installed models and spaCy version. Checks if the
installed models are compatible and shows upgrade instructions if available.
Should be run after `pip install -U spacy`.
"""
validate()


@@ -6,11 +6,11 @@ menu:
- ['Download', 'download']
- ['Info', 'info']
- ['Validate', 'validate']
- ['Init', 'init']
- ['Convert', 'convert']
- ['Debug', 'debug']
- ['Train', 'train']
- ['Pretrain', 'pretrain']
- ['Init Model', 'init-model']
- ['Evaluate', 'evaluate']
- ['Package', 'package']
- ['Project', 'project']
@@ -94,6 +94,80 @@ $ python -m spacy validate
| ---------- | -------- | --------------------------------------------------------- |
| **PRINTS** | `stdout` | Details about the compatibility of your installed models. |
## Init {#init new="3"}
The `spacy init` CLI includes helpful commands for initializing training config
files and model directories.
### init config {#init-config new="3"}
Initialize and export a [`config.cfg` file](/usage/training#config) for training
and update it with all default values, if possible. Config files used for
training should always be complete and not contain any hidden defaults or
missing values, so this command helps you create your final config. It takes
**one** of the following options:
- `--base`: Base **config** to auto-fill, e.g. created using the
[training quickstart](/usage/training#quickstart) widget.
- `--lang`: Base **language** code to use for blank config.
- `--model`: Base **model** to copy config from.
> ```bash
> ### with base config {wrap="true"}
> $ python -m spacy init config config.cfg --base base.cfg
> ```
>
> ```bash
> ### blank language {wrap="true"}
> $ python -m spacy init config config.cfg --lang en --pipeline tagger,parser
> ```
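
> To copy the config from an installed model instead, keeping only the listed
> components (the `en_core_web_sm` package name here is just an example of an
> installed model):
>
> ```bash
> ### with model {wrap="true"}
> $ python -m spacy init config config.cfg --model en_core_web_sm --pipeline ner
> ```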
```bash
$ python -m spacy init config [output] [--base] [--lang] [--model] [--pipeline]
```
| Argument | Type | Description |
| ------------------ | ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `output` | positional | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. |
| `--base`, `-b` | option | Optional base config file to auto-fill with defaults. |
| `--lang`, `-l`     | option     | Optional language code to use for a blank config. If `--pipeline` is specified, the components are added in order.                                                      |
| `--model`, `-m`    | option     | Optional base model to copy the config from. If `--pipeline` is specified, only the listed components are kept, and listed components missing from the model are added. |
| `--pipeline`, `-p` | option     | Optional comma-separated list of pipeline components to add to the blank language or model.                                                                              |
| **CREATES** | config | Complete and auto-filled config file for training. |
### init model {#init-model new="2"}
<!-- TODO: update for v3 -->
Create a new model directory from raw data, like word frequencies, Brown
clusters and word vectors. This command is similar to the `spacy model` command
in v1.x. Note that in order to populate the model's vocab, you need to pass in a
JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) as
`--jsonl-loc` with optional `id` values that correspond to the vectors table.
Just loading in vectors will not automatically populate the vocab.
<Infobox title="New in v3.0" variant="warning">
The `init-model` command is now available as a subcommand of `spacy init`.
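
For example, the following two calls are equivalent (the paths are illustrative); the hyphenated form is kept as a hidden alias but prints a deprecation warning:

```bash
# new subcommand spelling
$ python -m spacy init model en ./model-out --jsonl-loc vocab.jsonl
# legacy spelling, hidden alias that prints a deprecation warning
$ python -m spacy init-model en ./model-out --jsonl-loc vocab.jsonl
```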
</Infobox>
```bash
$ python -m spacy init model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
[--prune-vectors]
```
| Argument | Type | Description |
| ------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. |
| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. |
| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. |
| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
| `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag> | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. |
| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. |
| **CREATES** | model | A spaCy model containing the vocab and vectors. |
## Convert {#convert}
Convert files into spaCy's
@@ -469,32 +543,6 @@ tokenization can be provided.
{"tokens": ["If", "tokens", "are", "provided", "then", "we", "can", "skip", "the", "raw", "input", "text"]}
```
## Init Model {#init-model new="2"}
Create a new model directory from raw data, like word frequencies, Brown
clusters and word vectors. This command is similar to the `spacy model` command
in v1.x. Note that in order to populate the model's vocab, you need to pass in a
JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) as
`--jsonl-loc` with optional `id` values that correspond to the vectors table.
Just loading in vectors will not automatically populate the vocab.
```bash
$ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
[--prune-vectors]
```
| Argument | Type | Description |
| ----------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. |
| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. |
| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. |
| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
| `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag> | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. |
| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. |
| `--omit-extra-lookups`, `-OEL` <Tag variant="new">2.3</Tag> | flag | Do not include any of the extra lookups tables (`cluster`/`prob`/`sentiment`) from `spacy-lookups-data` in the model. |
| **CREATES** | model | A spaCy model containing the vocab and vectors. |
## Evaluate {#evaluate new="2"}
<!-- TODO: document new evaluate command -->


@@ -44,24 +44,12 @@ following data and information:
2. A [`config.cfg`](#config) **configuration file** with all settings and
hyperparameters.
3. An optional **Python file** to register
[custom models and architectures](#custom-models).
<!-- TODO: decide how we want to present the "getting started" workflow here, get a default config etc. -->
[custom functions and architectures](#custom-code).
```bash
$ python -m spacy train train.spacy dev.spacy config.cfg --output ./output
```
> #### Tip: Debug your data
>
> The [`debug-data` command](/api/cli#debug-data) lets you analyze and validate
> your training and development data, get useful stats, and find problems like
> invalid entity annotations, cyclic dependencies, low data labels and more.
>
> ```bash
> $ python -m spacy debug-data en train.spacy dev.spacy --verbose
> ```
<Project id="some_example_project">
The easiest way to get started with an end-to-end training process is to clone a
@@ -74,16 +62,42 @@ workflows, from data preprocessing to training and packaging your model.
> #### Instructions
>
> 1. Select your requirements and settings. The quickstart widget will
> auto-generate a recommended starter config for you.
> 1. Select your requirements and settings.
> 2. Use the buttons at the bottom to save the result to your clipboard or a
> file `config.cfg`.
> 3. TODO: recommended approach for filling config
> 4. Run [`spacy train`](/api/cli#train) with your config and data.
> file `base_config.cfg`.
> 3. Run [`init config`](/api/cli#init-config) to create a full training config.
> 4. Run [`train`](/api/cli#train) with your config and data.
import QuickstartTraining from 'widgets/quickstart-training.js'
<QuickstartTraining />
<QuickstartTraining download="base_config.cfg" />
After you've saved the starter config to a file `base_config.cfg`, you can use
the [`init config`](/api/cli#init-config) command to fill in the remaining
defaults. Training configs should always be **complete and without hidden
defaults**, to keep your experiments reproducible.
```bash
$ python -m spacy init config config.cfg --base base_config.cfg
```
> #### Tip: Debug your data
>
> The [`debug-data` command](/api/cli#debug-data) lets you analyze and validate
> your training and development data, get useful stats, and find problems like
> invalid entity annotations, cyclic dependencies, low data labels and more.
>
> ```bash
> $ python -m spacy debug-data en train.spacy dev.spacy --verbose
> ```
You can now run [`train`](/api/cli#train) with your training and development
data and the training config. See the [`convert`](/api/cli#convert) command for
details on how to convert your data to spaCy's binary `.spacy` format.
```bash
$ python -m spacy train train.spacy dev.spacy config.cfg --output ./output
```
## Training config {#config}


@@ -165,10 +165,8 @@ resolved, the function is created and passed into the model as an argument.
Remember that the `config.cfg` used for training should contain **no missing
values** and requires all settings to be defined. You don't want any hidden
defaults creeping in and changing your results! spaCy will tell you if settings
are missing, and you can run [`spacy debug config`](/api/cli#debug-config) with
`--auto-fill` to automatically fill in all defaults.
<!-- TODO: update with details on getting started with a config -->
are missing, and you can run [`spacy init config`](/api/cli#init-config) to
automatically fill in all defaults.
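
For example, a partial config saved as `base_config.cfg` (the file name is illustrative) can be completed with:

```bash
$ python -m spacy init config config.cfg --base base_config.cfg
```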
</Infobox>