From 4c055f0aa703974ff3d14fb4ea5966c226013a1d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 2 Aug 2020 15:18:30 +0200 Subject: [PATCH] Add init CLI and init config (#5854) * Add init CLI and init config draft * Improve config validation * Auto-format * Don't export anything in debug config * Update docs --- spacy/cli/__init__.py | 1 + spacy/cli/_util.py | 25 ++++++- spacy/cli/debug_data.py | 13 +--- spacy/cli/debug_model.py | 4 +- spacy/cli/init_config.py | 81 +++++++++++++++++++++++ spacy/cli/init_model.py | 20 ++++-- spacy/cli/pretrain.py | 4 +- spacy/cli/train.py | 9 ++- spacy/cli/validate.py | 5 +- website/docs/api/cli.md | 102 +++++++++++++++++++++-------- website/docs/usage/training.md | 52 +++++++++------ website/docs/usage/transformers.md | 6 +- 12 files changed, 245 insertions(+), 77 deletions(-) create mode 100644 spacy/cli/init_config.py diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 72fac05a6..bc47ffdef 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -15,6 +15,7 @@ from .debug_model import debug_model # noqa: F401 from .evaluate import evaluate # noqa: F401 from .convert import convert # noqa: F401 from .init_model import init_model # noqa: F401 +from .init_config import init_config # noqa: F401 from .validate import validate # noqa: F401 from .project.clone import project_clone # noqa: F401 from .project.assets import project_assets # noqa: F401 diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index f277988f8..0130e60bb 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -31,6 +31,7 @@ DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes commands to check and validate your config files, training and evaluation data, and custom model implementations. """ +INIT_HELP = """Commands for initializing configs and models.""" # Wrappers for Typer's annotations. Initially created to set defaults and to # keep the names short, but not needed at the moment. 
@@ -40,9 +41,11 @@ Opt = typer.Option app = typer.Typer(name=NAME, help=HELP) project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True) debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True) +init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True) app.add_typer(project_cli) app.add_typer(debug_cli) +app.add_typer(init_cli) def setup_cli() -> None: @@ -172,16 +175,34 @@ def get_checksum(path: Union[Path, str]) -> str: @contextmanager -def show_validation_error(title: str = "Config validation error"): +def show_validation_error( + file_path: Optional[Union[str, Path]] = None, + *, + title: str = "Config validation error", + hint_init: bool = True, +): """Helper to show custom config validation errors on the CLI. + file_path (str / Path): Optional file path of config file, used in hints. title (str): Title of the custom formatted error. + hint_init (bool): Show hint about filling config. """ try: yield except (ConfigValidationError, InterpolationError) as e: msg.fail(title, spaced=True) - print(str(e).replace("Config validation error", "").strip()) + # TODO: This is kinda hacky and we should probably provide a better + # helper for this in Thinc + err_text = str(e).replace("Config validation error", "").strip() + print(err_text) + if hint_init and "field required" in err_text: + config_path = file_path if file_path is not None else "config.cfg" + msg.text( + "If your config contains missing values, you can run the 'init " + "config' command to fill in all the defaults, if possible:", + spaced=True, + ) + print(f"{COMMAND} init config {config_path} --base {config_path}\n") sys.exit(1) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index fa6f7a7d5..0701992da 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -33,7 +33,6 @@ def debug_config_cli( ctx: typer.Context, # This is only used to read additional arguments config_path: Path = Arg(..., help="Path to config file", 
exists=True), code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), - output_path: Optional[Path] = Opt(None, "--output", "-o", help="Output path for filled config or '-' for standard output", allow_dash=True), auto_fill: bool = Opt(False, "--auto-fill", "-F", help="Whether or not to auto-fill the config with built-in defaults if possible"), diff: bool = Opt(False, "--diff", "-D", help="Show a visual diff if config was auto-filled") # fmt: on @@ -49,7 +48,7 @@ def debug_config_cli( """ overrides = parse_config_overrides(ctx.args) import_code(code_path) - with show_validation_error(): + with show_validation_error(config_path): config = Config().from_disk(config_path) try: nlp, _ = util.load_model_from_config( @@ -57,7 +56,6 @@ def debug_config_cli( ) except ValueError as e: msg.fail(str(e), exits=1) - is_stdout = output_path is not None and str(output_path) == "-" if auto_fill: orig_config = config.to_str() filled_config = nlp.config.to_str() @@ -68,12 +66,7 @@ def debug_config_cli( if diff: print(diff_strings(config.to_str(), nlp.config.to_str())) else: - msg.good("Original config is valid", show=not is_stdout) - if is_stdout: - print(nlp.config.to_str()) - elif output_path is not None: - nlp.config.to_disk(output_path) - msg.good(f"Saved updated config to {output_path}") + msg.good("Original config is valid") @debug_cli.command( @@ -142,7 +135,7 @@ def debug_data( msg.fail("Development data not found", dev_path, exits=1) if not config_path.exists(): msg.fail("Config file not found", config_path, exists=1) - with show_validation_error(): + with show_validation_error(config_path): cfg = Config().from_disk(config_path) nlp, config = util.load_model_from_config(cfg, overrides=config_overrides) # TODO: handle base model diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 88e060238..e98ddbe05 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py 
@@ -50,8 +50,8 @@ def debug_model_cli( "print_prediction": P3, } config_overrides = parse_config_overrides(ctx.args) - cfg = Config().from_disk(config_path) - with show_validation_error(): + with show_validation_error(config_path): + cfg = Config().from_disk(config_path) try: _, config = util.load_model_from_config(cfg, overrides=config_overrides) except ValueError as e: diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py new file mode 100644 index 000000000..01664ee40 --- /dev/null +++ b/spacy/cli/init_config.py @@ -0,0 +1,81 @@ +from typing import Optional, List +from pathlib import Path +from thinc.api import Config +from wasabi import msg + +from ..util import load_model_from_config, get_lang_class, load_model +from ._util import init_cli, Arg, Opt, show_validation_error + + +@init_cli.command("config") +def init_config_cli( + # fmt: off + output_path: Path = Arg("-", help="Output path or - for stdout", allow_dash=True), + base_path: Optional[Path] = Opt(None, "--base", "-b", help="Optional base config to fill", exists=True, dir_okay=False), + model: Optional[str] = Opt(None, "--model", "-m", help="Optional model to copy config from"), + lang: Optional[str] = Opt(None, "--lang", "-l", help="Optional language code for blank config"), + pipeline: Optional[str] = Opt(None, "--pipeline", "-p", help="Optional pipeline components to use") + # fmt: on +): + """Generate a starter config.cfg for training.""" + validate_cli_args(base_path, model, lang) + is_stdout = str(output_path) == "-" + pipeline = [p.strip() for p in pipeline.split(",")] if pipeline else [] + cfg = init_config(output_path, base_path, model, lang, pipeline, silent=is_stdout) + if is_stdout: + print(cfg.to_str()) + else: + cfg.to_disk(output_path) + msg.good("Saved config", output_path) + + +def init_config( + output_path: Path, + config_path: Optional[Path], + model: Optional[str], + lang: Optional[str], + pipeline: Optional[List[str]], + silent: bool = False, +) -> Config: + if 
config_path is not None: + msg.info("Generating config from base config", show=not silent) + with show_validation_error(config_path, hint_init=False): + config = Config().from_disk(config_path) + try: + nlp, _ = load_model_from_config(config, auto_fill=True) + except ValueError as e: + msg.fail(str(e), exits=1) + return nlp.config + if model is not None: + ext = f" with pipeline {pipeline}" if pipeline else "" + msg.info(f"Generating config from model {model}{ext}", show=not silent) + nlp = load_model(model) + for existing_pipe_name in nlp.pipe_names: + if existing_pipe_name not in pipeline: + nlp.remove_pipe(existing_pipe_name) + for pipe_name in pipeline: + if pipe_name not in nlp.pipe_names: + nlp.add_pipe(pipe_name) + return nlp.config + if lang is not None: + ext = f" with pipeline {pipeline}" if pipeline else "" + msg.info(f"Generating config for language '{lang}'{ext}", show=not silent) + nlp = get_lang_class(lang)() + for pipe_name in pipeline: + nlp.add_pipe(pipe_name) + return nlp.config + + +def validate_cli_args( + config_path: Optional[Path], model: Optional[str], lang: Optional[str] +) -> None: + args = {"--base": config_path, "--model": model, "--lang": lang} + if sum(arg is not None for arg in args.values()) != 1: + existing = " ".join(f"{a} {v}" for a, v in args.items() if v is not None) + msg.fail( + "The init config command expects only one of the following arguments: " + "--base (base config to fill and update), --lang (language code to " + "use for blank config) or --model (base model to copy config from).", + f"Got: {existing if existing else 'no arguments'}", + exits=1, + ) diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index e1dca2395..4fdd2bbbc 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -10,14 +10,14 @@ import gzip import zipfile import srsly import warnings -from wasabi import Printer +from wasabi import msg, Printer +import typer -from ._util import app, Arg, Opt +from ._util import app, 
init_cli, Arg, Opt from ..vectors import Vectors from ..errors import Errors, Warnings from ..language import Language from ..util import ensure_path, get_lang_class, load_model, OOV_RANK -from ..lookups import Lookups try: import ftfy @@ -28,9 +28,15 @@ except ImportError: DEFAULT_OOV_PROB = -20 -@app.command("init-model") +@init_cli.command("model") +@app.command( + "init-model", + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, + hidden=True, # hide this from main CLI help but still allow it to work with warning +) def init_model_cli( # fmt: off + ctx: typer.Context, # This is only used to read additional arguments lang: str = Arg(..., help="Model language"), output_dir: Path = Arg(..., help="Model output directory"), freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True), @@ -48,6 +54,12 @@ def init_model_cli( Create a new model from raw data. If vectors are provided in Word2Vec format, they can be either a .txt or zipped as a .zip or .tar.gz. """ + if ctx.command.name == "init-model": + msg.warn( + "The init-model command is now available via the 'init model' " + "subcommand (without the hyphen). You can run python -m spacy init " + "--help for an overview of the other available initialization commands." 
+ ) init_model( lang, output_dir, diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 7d1a217be..23de5f452 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -87,8 +87,8 @@ def pretrain( else: msg.info("Using CPU") msg.info(f"Loading config from: {config_path}") - config = Config().from_disk(config_path) - with show_validation_error(): + with show_validation_error(config_path): + config = Config().from_disk(config_path) nlp, config = util.load_model_from_config(config, overrides=config_overrides) # TODO: validate that [pretraining] block exists if not output_dir.exists(): diff --git a/spacy/cli/train.py b/spacy/cli/train.py index b0bc145ff..5a89ed6e8 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -79,10 +79,11 @@ def train( else: msg.info("Using CPU") msg.info(f"Loading config and nlp from: {config_path}") - config = Config().from_disk(config_path) + with show_validation_error(config_path): + config = Config().from_disk(config_path) if config.get("training", {}).get("seed") is not None: fix_random_seed(config["training"]["seed"]) - with show_validation_error(): + with show_validation_error(config_path): nlp, config = util.load_model_from_config(config, overrides=config_overrides) if config["training"]["base_model"]: # TODO: do something to check base_nlp against regular nlp described in config? 
@@ -245,9 +246,7 @@ def create_evaluation_callback( cfg: Union[Config, Dict[str, Any]], ) -> Callable[[], Tuple[float, Dict[str, float]]]: def evaluate() -> Tuple[float, Dict[str, float]]: - dev_examples = corpus.dev_dataset( - nlp, gold_preproc=cfg["gold_preproc"] - ) + dev_examples = corpus.dev_dataset(nlp, gold_preproc=cfg["gold_preproc"]) dev_examples = list(dev_examples) n_words = sum(len(ex.predicted) for ex in dev_examples) batch_size = cfg["eval_batch_size"] diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py index 0580d34c5..e6ba284df 100644 --- a/spacy/cli/validate.py +++ b/spacy/cli/validate.py @@ -13,8 +13,9 @@ from ..util import get_package_path, get_model_meta, is_compatible_version @app.command("validate") def validate_cli(): """ - Validate that the currently installed version of spaCy is compatible - with the installed models. Should be run after `pip install -U spacy`. + Validate the currently installed models and spaCy version. Checks if the + installed models are compatible and shows upgrade instructions if available. + Should be run after `pip install -U spacy`. """ validate() diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 4690029aa..0ea67747e 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -6,11 +6,11 @@ menu: - ['Download', 'download'] - ['Info', 'info'] - ['Validate', 'validate'] + - ['Init', 'init'] - ['Convert', 'convert'] - ['Debug', 'debug'] - ['Train', 'train'] - ['Pretrain', 'pretrain'] - - ['Init Model', 'init-model'] - ['Evaluate', 'evaluate'] - ['Package', 'package'] - ['Project', 'project'] @@ -94,6 +94,80 @@ $ python -m spacy validate | ---------- | -------- | --------------------------------------------------------- | | **PRINTS** | `stdout` | Details about the compatibility of your installed models. | +## Init {#init new="3"} + +The `spacy init` CLI includes helpful commands for initializing training config +files and model directories. 
+ +### init config {#init-config new="3"} + +Initialize and export a [`config.cfg` file](/usage/training#config) for training +and update it with all default values, if possible. Config files used for +training should always be complete and not contain any hidden defaults or +missing values, so this command helps you create your final config. It takes +**one** of the following options: + +- `--base`: Base **config** to auto-fill, e.g. created using the + [training quickstart](/usage/training#quickstart) widget. +- `--lang`: Base **language** code to use for blank config. +- `--model`: Base **model** to copy config from. + +> ```bash +> ### with base config {wrap="true"} +> $ python -m spacy init config config.cfg --base base.cfg +> ``` +> +> ```bash +> ### blank language {wrap="true"} +> $ python -m spacy init config config.cfg --lang en --pipeline tagger,parser +> ``` + +```bash +$ python -m spacy init config [output] [--base] [--lang] [--model] [--pipeline] +``` + +| Argument | Type | Description | +| ------------------ | ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `output` | positional | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. | +| `--base`, `-b` | option | Optional base config file to auto-fill with defaults. | +| `--lang`, `-l` | option | Optional language code to use for blank config. If a `--pipeline` is specified, the components will be added in order. | +| `--model`, `-m` | option | Optional base model to copy config from. If a `--pipeline` is specified, only those components will be kept, and all other components not in the model will be added. | +| `--pipeline`, `-p` | option | Optional comma-separated pipeline of components to add to blank language or model. | +| **CREATES** | config | Complete and auto-filled config file for training.
| + +### init model {#init-model new="2"} + + + +Create a new model directory from raw data, like word frequencies, Brown +clusters and word vectors. This command is similar to the `spacy model` command +in v1.x. Note that in order to populate the model's vocab, you need to pass in a +JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) as +`--jsonl-loc` with optional `id` values that correspond to the vectors table. +Just loading in vectors will not automatically populate the vocab. + + + +The `init-model` command is now available as a subcommand of `spacy init`. + + + +```bash +$ python -m spacy init model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] +[--prune-vectors] +``` + +| Argument | Type | Description | +| ------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | +| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | +| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. | +| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | +| `--truncate-vectors`, `-t` 2.3 | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | +| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. 
| +| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. | +| **CREATES** | model | A spaCy model containing the vocab and vectors. | + ## Convert {#convert} Convert files into spaCy's @@ -469,32 +543,6 @@ tokenization can be provided. {"tokens": ["If", "tokens", "are", "provided", "then", "we", "can", "skip", "the", "raw", "input", "text"]} ``` -## Init Model {#init-model new="2"} - -Create a new model directory from raw data, like word frequencies, Brown -clusters and word vectors. This command is similar to the `spacy model` command -in v1.x. Note that in order to populate the model's vocab, you need to pass in a -JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) as -`--jsonl-loc` with optional `id` values that correspond to the vectors table. -Just loading in vectors will not automatically populate the vocab. - -```bash -$ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] -[--prune-vectors] -``` - -| Argument | Type | Description | -| ----------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | -| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | -| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. | -| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. 
File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | -| `--truncate-vectors`, `-t` 2.3 | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | -| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | -| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. | -| `--omit-extra-lookups`, `-OEL` 2.3 | flag | Do not include any of the extra lookups tables (`cluster`/`prob`/`sentiment`) from `spacy-lookups-data` in the model. | -| **CREATES** | model | A spaCy model containing the vocab and vectors. | - ## Evaluate {#evaluate new="2"} diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 635b52c89..955e484fb 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -44,24 +44,12 @@ following data and information: 2. A [`config.cfg`](#config) **configuration file** with all settings and hyperparameters. 3. An optional **Python file** to register - [custom models and architectures](#custom-models). - - + [custom functions and architectures](#custom-code). ```bash $ python -m spacy train train.spacy dev.spacy config.cfg --output ./output ``` -> #### Tip: Debug your data -> -> The [`debug-data` command](/api/cli#debug-data) lets you analyze and validate -> your training and development data, get useful stats, and find problems like -> invalid entity annotations, cyclic dependencies, low data labels and more. -> -> ```bash -> $ python -m spacy debug-data en train.spacy dev.spacy --verbose -> ``` - The easiest way to get started with an end-to-end training process is to clone a @@ -74,16 +62,42 @@ workflows, from data preprocessing to training and packaging your model. > #### Instructions > -> 1. Select your requirements and settings. 
The quickstart widget will -> auto-generate a recommended starter config for you. +> 1. Select your requirements and settings. > 2. Use the buttons at the bottom to save the result to your clipboard or a -> file `config.cfg`. -> 3. TOOD: recommended approach for filling config -> 4. Run [`spacy train`](/api/cli#train) with your config and data. +> file `base_config.cfg`. +> 3. Run [`init config`](/api/cli#init-config) to create a full training config. +> 4. Run [`train`](/api/cli#train) with your config and data. import QuickstartTraining from 'widgets/quickstart-training.js' - + + +After you've saved the starter config to a file `base_config.cfg`, you can use +the [`init config`](/api/cli#init-config) command to fill in the remaining +defaults. Training configs should always be **complete and without hidden +defaults**, to keep your experiments reproducible. + +```bash +$ python -m spacy init config config.cfg --base base_config.cfg +``` + +> #### Tip: Debug your data +> +> The [`debug-data` command](/api/cli#debug-data) lets you analyze and validate +> your training and development data, get useful stats, and find problems like +> invalid entity annotations, cyclic dependencies, low data labels and more. +> +> ```bash +> $ python -m spacy debug-data en train.spacy dev.spacy --verbose +> ``` + +You can now run [`train`](/api/cli#train) with your training and development +data and the training config. See the [`convert`](/api/cli#convert) command for +details on how to convert your data to spaCy's binary `.spacy` format. + +```bash +$ python -m spacy train train.spacy dev.spacy config.cfg --output ./output +``` ## Training config {#config} diff --git a/website/docs/usage/transformers.md b/website/docs/usage/transformers.md index 81bd45f58..b837c62de 100644 --- a/website/docs/usage/transformers.md +++ b/website/docs/usage/transformers.md @@ -165,10 +165,8 @@ resolved, the function is created and passed into the model as an argument. 
Remember that the `config.cfg` used for training should contain **no missing values** and requires all settings to be defined. You don't want any hidden defaults creeping in and changing your results! spaCy will tell you if settings -are missing, and you can run [`spacy debug config`](/api/cli#debug-config) with -`--auto-fill` to automatically fill in all defaults. - - +are missing, and you can run [`spacy init config`](/api/cli#init-config) to +automatically fill in all defaults.