mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
Add init CLI and init config (#5854)
* Add init CLI and init config draft * Improve config validation * Auto-format * Don't export anything in debug config * Update docs
This commit is contained in:
parent
e393ebd78b
commit
4c055f0aa7
|
@ -15,6 +15,7 @@ from .debug_model import debug_model # noqa: F401
|
|||
from .evaluate import evaluate # noqa: F401
|
||||
from .convert import convert # noqa: F401
|
||||
from .init_model import init_model # noqa: F401
|
||||
from .init_config import init_config # noqa: F401
|
||||
from .validate import validate # noqa: F401
|
||||
from .project.clone import project_clone # noqa: F401
|
||||
from .project.assets import project_assets # noqa: F401
|
||||
|
|
|
@ -31,6 +31,7 @@ DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes
|
|||
commands to check and validate your config files, training and evaluation data,
|
||||
and custom model implementations.
|
||||
"""
|
||||
INIT_HELP = """Commands for initializing configs and models."""
|
||||
|
||||
# Wrappers for Typer's annotations. Initially created to set defaults and to
|
||||
# keep the names short, but not needed at the moment.
|
||||
|
@ -40,9 +41,11 @@ Opt = typer.Option
|
|||
app = typer.Typer(name=NAME, help=HELP)
|
||||
project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
|
||||
debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
|
||||
init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
|
||||
|
||||
app.add_typer(project_cli)
|
||||
app.add_typer(debug_cli)
|
||||
app.add_typer(init_cli)
|
||||
|
||||
|
||||
def setup_cli() -> None:
|
||||
|
@ -172,16 +175,34 @@ def get_checksum(path: Union[Path, str]) -> str:
|
|||
|
||||
|
||||
@contextmanager
def show_validation_error(
    file_path: Optional[Union[str, Path]] = None,
    *,
    title: str = "Config validation error",
    hint_init: bool = True,
):
    """Helper to show custom config validation errors on the CLI.

    Wraps a block of code and, if it raises a ConfigValidationError or an
    InterpolationError, prints a formatted error and exits with status 1.
    Other exceptions propagate unchanged.

    file_path (str / Path): Optional file path of config file, used in hints.
        Falls back to "config.cfg" in the hint if not provided.
    title (str): Title of the custom formatted error.
    hint_init (bool): Show hint about filling config.
    """
    try:
        yield
    except (ConfigValidationError, InterpolationError) as e:
        msg.fail(title, spaced=True)
        # TODO: This is kinda hacky and we should probably provide a better
        # helper for this in Thinc
        # The title was already printed above, so strip it from the error text
        # and print the remaining details exactly once.
        err_text = str(e).replace("Config validation error", "").strip()
        print(err_text)
        # Only suggest "init config" when the error text mentions a missing
        # required field.
        if hint_init and "field required" in err_text:
            config_path = file_path if file_path is not None else "config.cfg"
            msg.text(
                "If your config contains missing values, you can run the 'init "
                "config' command to fill in all the defaults, if possible:",
                spaced=True,
            )
            print(f"{COMMAND} init config {config_path} --base {config_path}\n")
        sys.exit(1)
|
||||
|
||||
|
||||
|
|
|
@ -33,7 +33,6 @@ def debug_config_cli(
|
|||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
||||
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
output_path: Optional[Path] = Opt(None, "--output", "-o", help="Output path for filled config or '-' for standard output", allow_dash=True),
|
||||
auto_fill: bool = Opt(False, "--auto-fill", "-F", help="Whether or not to auto-fill the config with built-in defaults if possible"),
|
||||
diff: bool = Opt(False, "--diff", "-D", help="Show a visual diff if config was auto-filled")
|
||||
# fmt: on
|
||||
|
@ -49,7 +48,7 @@ def debug_config_cli(
|
|||
"""
|
||||
overrides = parse_config_overrides(ctx.args)
|
||||
import_code(code_path)
|
||||
with show_validation_error():
|
||||
with show_validation_error(config_path):
|
||||
config = Config().from_disk(config_path)
|
||||
try:
|
||||
nlp, _ = util.load_model_from_config(
|
||||
|
@ -57,7 +56,6 @@ def debug_config_cli(
|
|||
)
|
||||
except ValueError as e:
|
||||
msg.fail(str(e), exits=1)
|
||||
is_stdout = output_path is not None and str(output_path) == "-"
|
||||
if auto_fill:
|
||||
orig_config = config.to_str()
|
||||
filled_config = nlp.config.to_str()
|
||||
|
@ -68,12 +66,7 @@ def debug_config_cli(
|
|||
if diff:
|
||||
print(diff_strings(config.to_str(), nlp.config.to_str()))
|
||||
else:
|
||||
msg.good("Original config is valid", show=not is_stdout)
|
||||
if is_stdout:
|
||||
print(nlp.config.to_str())
|
||||
elif output_path is not None:
|
||||
nlp.config.to_disk(output_path)
|
||||
msg.good(f"Saved updated config to {output_path}")
|
||||
msg.good("Original config is valid")
|
||||
|
||||
|
||||
@debug_cli.command(
|
||||
|
@ -142,7 +135,7 @@ def debug_data(
|
|||
msg.fail("Development data not found", dev_path, exits=1)
|
||||
if not config_path.exists():
    # The keyword is "exits" (exit code), as used by the other msg.fail calls;
    # "exists" is not a msg.fail argument and would raise a TypeError.
    msg.fail("Config file not found", config_path, exits=1)
|
||||
with show_validation_error():
|
||||
with show_validation_error(config_path):
|
||||
cfg = Config().from_disk(config_path)
|
||||
nlp, config = util.load_model_from_config(cfg, overrides=config_overrides)
|
||||
# TODO: handle base model
|
||||
|
|
|
@ -50,8 +50,8 @@ def debug_model_cli(
|
|||
"print_prediction": P3,
|
||||
}
|
||||
config_overrides = parse_config_overrides(ctx.args)
|
||||
with show_validation_error(config_path):
|
||||
cfg = Config().from_disk(config_path)
|
||||
with show_validation_error():
|
||||
try:
|
||||
_, config = util.load_model_from_config(cfg, overrides=config_overrides)
|
||||
except ValueError as e:
|
||||
|
|
81
spacy/cli/init_config.py
Normal file
81
spacy/cli/init_config.py
Normal file
|
@ -0,0 +1,81 @@
|
|||
from typing import Optional, List
|
||||
from pathlib import Path
|
||||
from thinc.api import Config
|
||||
from wasabi import msg
|
||||
|
||||
from ..util import load_model_from_config, get_lang_class, load_model
|
||||
from ._util import init_cli, Arg, Opt, show_validation_error
|
||||
|
||||
|
||||
@init_cli.command("config")
def init_config_cli(
    # fmt: off
    output_path: Path = Arg("-", help="Output path or - for stdout", allow_dash=True),
    base_path: Optional[Path] = Opt(None, "--base", "-b", help="Optional base config to fill", exists=True, dir_okay=False),
    model: Optional[str] = Opt(None, "--model", "-m", help="Optional model to copy config from"),
    lang: Optional[str] = Opt(None, "--lang", "-l", help="Optional language code for blank config"),
    pipeline: Optional[str] = Opt(None, "--pipeline", "-p", help="Optional pipeline components to use")
    # fmt: on
):
    """Generate a starter config.cfg for training."""
    # Exits with an error unless exactly one of --base / --model / --lang is set.
    validate_cli_args(base_path, model, lang)
    # When writing to stdout, informational messages are silenced so that the
    # output only contains the config itself and can be piped to a file.
    is_stdout = str(output_path) == "-"
    # Split the comma-separated option into component names and drop empty
    # entries (e.g. from a trailing comma), which are not valid component names.
    raw_names = pipeline.split(",") if pipeline else []
    pipe_names = [name.strip() for name in raw_names if name.strip()]
    cfg = init_config(output_path, base_path, model, lang, pipe_names, silent=is_stdout)
    if is_stdout:
        print(cfg.to_str())
    else:
        cfg.to_disk(output_path)
        msg.good("Saved config", output_path)
|
||||
|
||||
|
||||
def init_config(
    output_path: Path,
    config_path: Optional[Path],
    model: Optional[str],
    lang: Optional[str],
    pipeline: Optional[List[str]],
    silent: bool = False,
) -> Config:
    """Create a config from a base config, an existing model or a language.

    The first source that is not None is used, checked in the order
    config_path, model, lang.

    output_path (Path): Not used by this function; the caller writes the
        returned config to its destination.
    config_path (Path): Optional base config to load and auto-fill.
    model (str): Optional name of a model to load and copy the config from.
    lang (str): Optional language code to create a blank pipeline for.
    pipeline (List[str]): Optional component names. For a model, only these
        components are kept and missing ones are added; if empty or None, the
        model's pipeline is left as it is. For a language, the components are
        added in order. Ignored for a base config.
    silent (bool): Don't print informational messages.
    RETURNS (Config): The config of the created nlp object.
    RAISES (ValueError): If none of config_path, model and lang is provided.
    """
    # Treat None like an empty pipeline so membership tests and loops are safe.
    pipeline = list(pipeline) if pipeline else []
    if config_path is not None:
        msg.info("Generating config from base config", show=not silent)
        with show_validation_error(config_path, hint_init=False):
            config = Config().from_disk(config_path)
            try:
                nlp, _ = load_model_from_config(config, auto_fill=True)
            except ValueError as e:
                msg.fail(str(e), exits=1)
        return nlp.config
    if model is not None:
        ext = f" with pipeline {pipeline}" if pipeline else ""
        msg.info(f"Generating config from model {model}{ext}", show=not silent)
        nlp = load_model(model)
        # Only reshape the pipeline if one was requested. Without this guard an
        # empty pipeline would strip every component from the loaded model.
        if pipeline:
            # Iterate over a snapshot, since components are removed in the loop.
            for existing_pipe_name in list(nlp.pipe_names):
                if existing_pipe_name not in pipeline:
                    nlp.remove_pipe(existing_pipe_name)
            for pipe_name in pipeline:
                if pipe_name not in nlp.pipe_names:
                    nlp.add_pipe(pipe_name)
        return nlp.config
    if lang is not None:
        ext = f" with pipeline {pipeline}" if pipeline else ""
        msg.info(f"Generating config for language '{lang}'{ext}", show=not silent)
        nlp = get_lang_class(lang)()
        for pipe_name in pipeline:
            nlp.add_pipe(pipe_name)
        return nlp.config
    # Fail loudly instead of implicitly returning None, which would violate the
    # declared return type and only surface later as an AttributeError.
    raise ValueError(
        "Can't create config: expected one of config_path, model or lang, "
        "but all of them are None."
    )
|
||||
|
||||
|
||||
def validate_cli_args(
    config_path: Optional[Path], model: Optional[str], lang: Optional[str]
) -> None:
    """Check that exactly one config source was passed to 'init config'.

    Prints an error listing the received options and exits with status 1 if
    none or more than one of the arguments is set.

    config_path (Path): Value of the --base option, if any.
    model (str): Value of the --model option, if any.
    lang (str): Value of the --lang option, if any.
    """
    sources = {"--base": config_path, "--model": model, "--lang": lang}
    # Keep only the options that were actually provided, in declaration order.
    provided = {flag: value for flag, value in sources.items() if value is not None}
    if len(provided) == 1:
        return
    existing = " ".join(f"{flag} {value}" for flag, value in provided.items())
    msg.fail(
        "The init config command expects only one of the following arguments: "
        "--base (base config to fill and update), --lang (language code to "
        "use for blank config) or --model (base model to copy config from).",
        f"Got: {existing if existing else 'no arguments'}",
        exits=1,
    )
|
|
@ -10,14 +10,14 @@ import gzip
|
|||
import zipfile
|
||||
import srsly
|
||||
import warnings
|
||||
from wasabi import Printer
|
||||
from wasabi import msg, Printer
|
||||
import typer
|
||||
|
||||
from ._util import app, Arg, Opt
|
||||
from ._util import app, init_cli, Arg, Opt
|
||||
from ..vectors import Vectors
|
||||
from ..errors import Errors, Warnings
|
||||
from ..language import Language
|
||||
from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
|
||||
from ..lookups import Lookups
|
||||
|
||||
try:
|
||||
import ftfy
|
||||
|
@ -28,9 +28,15 @@ except ImportError:
|
|||
DEFAULT_OOV_PROB = -20
|
||||
|
||||
|
||||
@app.command("init-model")
|
||||
@init_cli.command("model")
|
||||
@app.command(
|
||||
"init-model",
|
||||
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||
hidden=True, # hide this from main CLI help but still allow it to work with warning
|
||||
)
|
||||
def init_model_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
lang: str = Arg(..., help="Model language"),
|
||||
output_dir: Path = Arg(..., help="Model output directory"),
|
||||
freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
|
||||
|
@ -48,6 +54,12 @@ def init_model_cli(
|
|||
Create a new model from raw data. If vectors are provided in Word2Vec format,
|
||||
they can be either a .txt or zipped as a .zip or .tar.gz.
|
||||
"""
|
||||
if ctx.command.name == "init-model":
|
||||
msg.warn(
|
||||
"The init-model command is now available via the 'init model' "
|
||||
"subcommand (without the hyphen). You can run python -m spacy init "
|
||||
"--help for an overview of the other available initialization commands."
|
||||
)
|
||||
init_model(
|
||||
lang,
|
||||
output_dir,
|
||||
|
|
|
@ -87,8 +87,8 @@ def pretrain(
|
|||
else:
|
||||
msg.info("Using CPU")
|
||||
msg.info(f"Loading config from: {config_path}")
|
||||
with show_validation_error(config_path):
|
||||
config = Config().from_disk(config_path)
|
||||
with show_validation_error():
|
||||
nlp, config = util.load_model_from_config(config, overrides=config_overrides)
|
||||
# TODO: validate that [pretraining] block exists
|
||||
if not output_dir.exists():
|
||||
|
|
|
@ -79,10 +79,11 @@ def train(
|
|||
else:
|
||||
msg.info("Using CPU")
|
||||
msg.info(f"Loading config and nlp from: {config_path}")
|
||||
with show_validation_error(config_path):
|
||||
config = Config().from_disk(config_path)
|
||||
if config.get("training", {}).get("seed") is not None:
|
||||
fix_random_seed(config["training"]["seed"])
|
||||
with show_validation_error():
|
||||
with show_validation_error(config_path):
|
||||
nlp, config = util.load_model_from_config(config, overrides=config_overrides)
|
||||
if config["training"]["base_model"]:
|
||||
# TODO: do something to check base_nlp against regular nlp described in config?
|
||||
|
@ -245,9 +246,7 @@ def create_evaluation_callback(
|
|||
cfg: Union[Config, Dict[str, Any]],
|
||||
) -> Callable[[], Tuple[float, Dict[str, float]]]:
|
||||
def evaluate() -> Tuple[float, Dict[str, float]]:
|
||||
dev_examples = corpus.dev_dataset(
|
||||
nlp, gold_preproc=cfg["gold_preproc"]
|
||||
)
|
||||
dev_examples = corpus.dev_dataset(nlp, gold_preproc=cfg["gold_preproc"])
|
||||
dev_examples = list(dev_examples)
|
||||
n_words = sum(len(ex.predicted) for ex in dev_examples)
|
||||
batch_size = cfg["eval_batch_size"]
|
||||
|
|
|
@ -13,8 +13,9 @@ from ..util import get_package_path, get_model_meta, is_compatible_version
|
|||
@app.command("validate")
|
||||
def validate_cli():
|
||||
"""
|
||||
Validate that the currently installed version of spaCy is compatible
|
||||
with the installed models. Should be run after `pip install -U spacy`.
|
||||
Validate the currently installed models and spaCy version. Checks if the
|
||||
installed models are compatible and shows upgrade instructions if available.
|
||||
Should be run after `pip install -U spacy`.
|
||||
"""
|
||||
validate()
|
||||
|
||||
|
|
|
@ -6,11 +6,11 @@ menu:
|
|||
- ['Download', 'download']
|
||||
- ['Info', 'info']
|
||||
- ['Validate', 'validate']
|
||||
- ['Init', 'init']
|
||||
- ['Convert', 'convert']
|
||||
- ['Debug', 'debug']
|
||||
- ['Train', 'train']
|
||||
- ['Pretrain', 'pretrain']
|
||||
- ['Init Model', 'init-model']
|
||||
- ['Evaluate', 'evaluate']
|
||||
- ['Package', 'package']
|
||||
- ['Project', 'project']
|
||||
|
@ -94,6 +94,80 @@ $ python -m spacy validate
|
|||
| ---------- | -------- | --------------------------------------------------------- |
|
||||
| **PRINTS** | `stdout` | Details about the compatibility of your installed models. |
|
||||
|
||||
## Init {#init new="3"}
|
||||
|
||||
The `spacy init` CLI includes helpful commands for initializing training config
|
||||
files and model directories.
|
||||
|
||||
### init config {#init-config new="3"}
|
||||
|
||||
Initialize and export a [`config.cfg` file](/usage/training#config) for training
|
||||
and update it with all default values, if possible. Config files used for
|
||||
training should always be complete and not contain any hidden defaults or
|
||||
missing values, so this command helps you create your final config. It takes
|
||||
**one** of the following options:
|
||||
|
||||
- `--base`: Base **config** to auto-fill, e.g. created using the
|
||||
[training quickstart](/usage/training#quickstart) widget.
|
||||
- `--lang`: Base **language** code to use for blank config.
|
||||
- `--model`: Base **model** to copy config from.
|
||||
|
||||
> ```bash
|
||||
> ### with base config {wrap="true"}
|
||||
> $ python -m spacy init config config.cfg --base base.cfg
|
||||
> ```
|
||||
>
|
||||
> ```bash
|
||||
> ### blank language {wrap="true"}
|
||||
> $ python -m spacy init config config.cfg --lang en --pipeline tagger,parser
|
||||
> ```
|
||||
|
||||
```bash
|
||||
$ python -m spacy init config [output] [--base] [--lang] [--model] [--pipeline]
|
||||
```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ------------------ | ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `output` | positional | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. |
|
||||
| `--base`, `-b` | option | Optional base config file to auto-fill with defaults. |
|
||||
| `--lang`, `-l` | option | Optional language code to use for blank config. If a `--pipeline` is specified, the components will be added in order. |
|
||||
| `--model`, `-m` | option | Optional base model to copy config from. If a `--pipeline` is specified, only those components will be kept, and all other components not in the model will be added. |
|
||||
| `--pipeline`, `-p` | option | Optional comma-separated pipeline of components to add to blank language or model. |
|
||||
| **CREATES** | config | Complete and auto-filled config file for training. |
|
||||
|
||||
### init model {#init-model new="2"}
|
||||
|
||||
<!-- TODO: update for v3 -->
|
||||
|
||||
Create a new model directory from raw data, like word frequencies, Brown
|
||||
clusters and word vectors. This command is similar to the `spacy model` command
|
||||
in v1.x. Note that in order to populate the model's vocab, you need to pass in a
|
||||
JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) as
|
||||
`--jsonl-loc` with optional `id` values that correspond to the vectors table.
|
||||
Just loading in vectors will not automatically populate the vocab.
|
||||
|
||||
<Infobox title="New in v3.0" variant="warning">
|
||||
|
||||
The `init-model` command is now available as a subcommand of `spacy init`.
|
||||
|
||||
</Infobox>
|
||||
|
||||
```bash
|
||||
$ python -m spacy init model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
|
||||
[--prune-vectors]
|
||||
```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. |
|
||||
| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. |
|
||||
| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. |
|
||||
| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
|
||||
| `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag> | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. |
|
||||
| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
|
||||
| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. |
|
||||
| **CREATES** | model | A spaCy model containing the vocab and vectors. |
|
||||
|
||||
## Convert {#convert}
|
||||
|
||||
Convert files into spaCy's
|
||||
|
@ -469,32 +543,6 @@ tokenization can be provided.
|
|||
{"tokens": ["If", "tokens", "are", "provided", "then", "we", "can", "skip", "the", "raw", "input", "text"]}
|
||||
```
|
||||
|
||||
## Init Model {#init-model new="2"}
|
||||
|
||||
Create a new model directory from raw data, like word frequencies, Brown
|
||||
clusters and word vectors. This command is similar to the `spacy model` command
|
||||
in v1.x. Note that in order to populate the model's vocab, you need to pass in a
|
||||
JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) as
|
||||
`--jsonl-loc` with optional `id` values that correspond to the vectors table.
|
||||
Just loading in vectors will not automatically populate the vocab.
|
||||
|
||||
```bash
|
||||
$ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
|
||||
[--prune-vectors]
|
||||
```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ----------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. |
|
||||
| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. |
|
||||
| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. |
|
||||
| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
|
||||
| `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag> | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. |
|
||||
| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
|
||||
| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. |
|
||||
| `--omit-extra-lookups`, `-OEL` <Tag variant="new">2.3</Tag> | flag | Do not include any of the extra lookups tables (`cluster`/`prob`/`sentiment`) from `spacy-lookups-data` in the model. |
|
||||
| **CREATES** | model | A spaCy model containing the vocab and vectors. |
|
||||
|
||||
## Evaluate {#evaluate new="2"}
|
||||
|
||||
<!-- TODO: document new evaluate command -->
|
||||
|
|
|
@ -44,24 +44,12 @@ following data and information:
|
|||
2. A [`config.cfg`](#config) **configuration file** with all settings and
|
||||
hyperparameters.
|
||||
3. An optional **Python file** to register
|
||||
[custom models and architectures](#custom-models).
|
||||
|
||||
<!-- TODO: decide how we want to present the "getting started" workflow here, get a default config etc. -->
|
||||
[custom functions and architectures](#custom-code).
|
||||
|
||||
```bash
|
||||
$ python -m spacy train train.spacy dev.spacy config.cfg --output ./output
|
||||
```
|
||||
|
||||
> #### Tip: Debug your data
|
||||
>
|
||||
> The [`debug-data` command](/api/cli#debug-data) lets you analyze and validate
|
||||
> your training and development data, get useful stats, and find problems like
|
||||
> invalid entity annotations, cyclic dependencies, low data labels and more.
|
||||
>
|
||||
> ```bash
|
||||
> $ python -m spacy debug-data en train.spacy dev.spacy --verbose
|
||||
> ```
|
||||
|
||||
<Project id="some_example_project">
|
||||
|
||||
The easiest way to get started with an end-to-end training process is to clone a
|
||||
|
@ -74,16 +62,42 @@ workflows, from data preprocessing to training and packaging your model.
|
|||
|
||||
> #### Instructions
|
||||
>
|
||||
> 1. Select your requirements and settings. The quickstart widget will
|
||||
> auto-generate a recommended starter config for you.
|
||||
> 1. Select your requirements and settings.
|
||||
> 2. Use the buttons at the bottom to save the result to your clipboard or a
|
||||
> file `config.cfg`.
|
||||
> 3. TOOD: recommended approach for filling config
|
||||
> 4. Run [`spacy train`](/api/cli#train) with your config and data.
|
||||
> file `base_config.cfg`.
|
||||
> 3. Run [`init config`](/api/cli#init-config) to create a full training config.
|
||||
> 4. Run [`train`](/api/cli#train) with your config and data.
|
||||
|
||||
import QuickstartTraining from 'widgets/quickstart-training.js'
|
||||
|
||||
<QuickstartTraining />
|
||||
<QuickstartTraining download="base_config.cfg" />
|
||||
|
||||
After you've saved the starter config to a file `base_config.cfg`, you can use
|
||||
the [`init config`](/api/cli#init-config) command to fill in the remaining
|
||||
defaults. Training configs should always be **complete and without hidden
|
||||
defaults**, to keep your experiments reproducible.
|
||||
|
||||
```bash
|
||||
$ python -m spacy init config config.cfg --base base_config.cfg
|
||||
```
|
||||
|
||||
> #### Tip: Debug your data
|
||||
>
|
||||
> The [`debug-data` command](/api/cli#debug-data) lets you analyze and validate
|
||||
> your training and development data, get useful stats, and find problems like
|
||||
> invalid entity annotations, cyclic dependencies, low data labels and more.
|
||||
>
|
||||
> ```bash
|
||||
> $ python -m spacy debug-data en train.spacy dev.spacy --verbose
|
||||
> ```
|
||||
|
||||
You can now run [`train`](/api/cli#train) with your training and development
|
||||
data and the training config. See the [`convert`](/api/cli#convert) command for
|
||||
details on how to convert your data to spaCy's binary `.spacy` format.
|
||||
|
||||
```bash
|
||||
$ python -m spacy train train.spacy dev.spacy config.cfg --output ./output
|
||||
```
|
||||
|
||||
## Training config {#config}
|
||||
|
||||
|
|
|
@ -165,10 +165,8 @@ resolved, the function is created and passed into the model as an argument.
|
|||
Remember that the `config.cfg` used for training should contain **no missing
|
||||
values** and requires all settings to be defined. You don't want any hidden
|
||||
defaults creeping in and changing your results! spaCy will tell you if settings
|
||||
are missing, and you can run [`spacy debug config`](/api/cli#debug-config) with
|
||||
`--auto-fill` to automatically fill in all defaults.
|
||||
|
||||
<!-- TODO: update with details on getting started with a config -->
|
||||
are missing, and you can run [`spacy init config`](/api/cli#init-config) to
|
||||
automatically fill in all defaults.
|
||||
|
||||
</Infobox>
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user