mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Add init CLI and init config (#5854)
* Add init CLI and init config draft * Improve config validation * Auto-format * Don't export anything in debug config * Update docs
This commit is contained in:
parent
e393ebd78b
commit
4c055f0aa7
|
@ -15,6 +15,7 @@ from .debug_model import debug_model # noqa: F401
|
||||||
from .evaluate import evaluate # noqa: F401
|
from .evaluate import evaluate # noqa: F401
|
||||||
from .convert import convert # noqa: F401
|
from .convert import convert # noqa: F401
|
||||||
from .init_model import init_model # noqa: F401
|
from .init_model import init_model # noqa: F401
|
||||||
|
from .init_config import init_config # noqa: F401
|
||||||
from .validate import validate # noqa: F401
|
from .validate import validate # noqa: F401
|
||||||
from .project.clone import project_clone # noqa: F401
|
from .project.clone import project_clone # noqa: F401
|
||||||
from .project.assets import project_assets # noqa: F401
|
from .project.assets import project_assets # noqa: F401
|
||||||
|
|
|
@ -31,6 +31,7 @@ DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes
|
||||||
commands to check and validate your config files, training and evaluation data,
|
commands to check and validate your config files, training and evaluation data,
|
||||||
and custom model implementations.
|
and custom model implementations.
|
||||||
"""
|
"""
|
||||||
|
INIT_HELP = """Commands for initializing configs and models."""
|
||||||
|
|
||||||
# Wrappers for Typer's annotations. Initially created to set defaults and to
|
# Wrappers for Typer's annotations. Initially created to set defaults and to
|
||||||
# keep the names short, but not needed at the moment.
|
# keep the names short, but not needed at the moment.
|
||||||
|
@ -40,9 +41,11 @@ Opt = typer.Option
|
||||||
app = typer.Typer(name=NAME, help=HELP)
|
app = typer.Typer(name=NAME, help=HELP)
|
||||||
project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
|
project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
|
||||||
debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
|
debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
|
||||||
|
init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
|
||||||
|
|
||||||
app.add_typer(project_cli)
|
app.add_typer(project_cli)
|
||||||
app.add_typer(debug_cli)
|
app.add_typer(debug_cli)
|
||||||
|
app.add_typer(init_cli)
|
||||||
|
|
||||||
|
|
||||||
def setup_cli() -> None:
|
def setup_cli() -> None:
|
||||||
|
@ -172,16 +175,34 @@ def get_checksum(path: Union[Path, str]) -> str:
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
def show_validation_error(title: str = "Config validation error"):
|
def show_validation_error(
|
||||||
|
file_path: Optional[Union[str, Path]] = None,
|
||||||
|
*,
|
||||||
|
title: str = "Config validation error",
|
||||||
|
hint_init: bool = True,
|
||||||
|
):
|
||||||
"""Helper to show custom config validation errors on the CLI.
|
"""Helper to show custom config validation errors on the CLI.
|
||||||
|
|
||||||
|
file_path (str / Path): Optional file path of config file, used in hints.
|
||||||
title (str): Title of the custom formatted error.
|
title (str): Title of the custom formatted error.
|
||||||
|
hint_init (bool): Show hint about filling config.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
yield
|
yield
|
||||||
except (ConfigValidationError, InterpolationError) as e:
|
except (ConfigValidationError, InterpolationError) as e:
|
||||||
msg.fail(title, spaced=True)
|
msg.fail(title, spaced=True)
|
||||||
print(str(e).replace("Config validation error", "").strip())
|
# TODO: This is kinda hacky and we should probably provide a better
|
||||||
|
# helper for this in Thinc
|
||||||
|
err_text = str(e).replace("Config validation error", "").strip()
|
||||||
|
print(err_text)
|
||||||
|
if hint_init and "field required" in err_text:
|
||||||
|
config_path = file_path if file_path is not None else "config.cfg"
|
||||||
|
msg.text(
|
||||||
|
"If your config contains missing values, you can run the 'init "
|
||||||
|
"config' command to fill in all the defaults, if possible:",
|
||||||
|
spaced=True,
|
||||||
|
)
|
||||||
|
print(f"{COMMAND} init config {config_path} --base {config_path}\n")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -33,7 +33,6 @@ def debug_config_cli(
|
||||||
ctx: typer.Context, # This is only used to read additional arguments
|
ctx: typer.Context, # This is only used to read additional arguments
|
||||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
||||||
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||||
output_path: Optional[Path] = Opt(None, "--output", "-o", help="Output path for filled config or '-' for standard output", allow_dash=True),
|
|
||||||
auto_fill: bool = Opt(False, "--auto-fill", "-F", help="Whether or not to auto-fill the config with built-in defaults if possible"),
|
auto_fill: bool = Opt(False, "--auto-fill", "-F", help="Whether or not to auto-fill the config with built-in defaults if possible"),
|
||||||
diff: bool = Opt(False, "--diff", "-D", help="Show a visual diff if config was auto-filled")
|
diff: bool = Opt(False, "--diff", "-D", help="Show a visual diff if config was auto-filled")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
@ -49,7 +48,7 @@ def debug_config_cli(
|
||||||
"""
|
"""
|
||||||
overrides = parse_config_overrides(ctx.args)
|
overrides = parse_config_overrides(ctx.args)
|
||||||
import_code(code_path)
|
import_code(code_path)
|
||||||
with show_validation_error():
|
with show_validation_error(config_path):
|
||||||
config = Config().from_disk(config_path)
|
config = Config().from_disk(config_path)
|
||||||
try:
|
try:
|
||||||
nlp, _ = util.load_model_from_config(
|
nlp, _ = util.load_model_from_config(
|
||||||
|
@ -57,7 +56,6 @@ def debug_config_cli(
|
||||||
)
|
)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
msg.fail(str(e), exits=1)
|
msg.fail(str(e), exits=1)
|
||||||
is_stdout = output_path is not None and str(output_path) == "-"
|
|
||||||
if auto_fill:
|
if auto_fill:
|
||||||
orig_config = config.to_str()
|
orig_config = config.to_str()
|
||||||
filled_config = nlp.config.to_str()
|
filled_config = nlp.config.to_str()
|
||||||
|
@ -68,12 +66,7 @@ def debug_config_cli(
|
||||||
if diff:
|
if diff:
|
||||||
print(diff_strings(config.to_str(), nlp.config.to_str()))
|
print(diff_strings(config.to_str(), nlp.config.to_str()))
|
||||||
else:
|
else:
|
||||||
msg.good("Original config is valid", show=not is_stdout)
|
msg.good("Original config is valid")
|
||||||
if is_stdout:
|
|
||||||
print(nlp.config.to_str())
|
|
||||||
elif output_path is not None:
|
|
||||||
nlp.config.to_disk(output_path)
|
|
||||||
msg.good(f"Saved updated config to {output_path}")
|
|
||||||
|
|
||||||
|
|
||||||
@debug_cli.command(
|
@debug_cli.command(
|
||||||
|
@ -142,7 +135,7 @@ def debug_data(
|
||||||
msg.fail("Development data not found", dev_path, exits=1)
|
msg.fail("Development data not found", dev_path, exits=1)
|
||||||
if not config_path.exists():
|
if not config_path.exists():
|
||||||
msg.fail("Config file not found", config_path, exists=1)
|
msg.fail("Config file not found", config_path, exists=1)
|
||||||
with show_validation_error():
|
with show_validation_error(config_path):
|
||||||
cfg = Config().from_disk(config_path)
|
cfg = Config().from_disk(config_path)
|
||||||
nlp, config = util.load_model_from_config(cfg, overrides=config_overrides)
|
nlp, config = util.load_model_from_config(cfg, overrides=config_overrides)
|
||||||
# TODO: handle base model
|
# TODO: handle base model
|
||||||
|
|
|
@ -50,8 +50,8 @@ def debug_model_cli(
|
||||||
"print_prediction": P3,
|
"print_prediction": P3,
|
||||||
}
|
}
|
||||||
config_overrides = parse_config_overrides(ctx.args)
|
config_overrides = parse_config_overrides(ctx.args)
|
||||||
|
with show_validation_error(config_path):
|
||||||
cfg = Config().from_disk(config_path)
|
cfg = Config().from_disk(config_path)
|
||||||
with show_validation_error():
|
|
||||||
try:
|
try:
|
||||||
_, config = util.load_model_from_config(cfg, overrides=config_overrides)
|
_, config = util.load_model_from_config(cfg, overrides=config_overrides)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
|
|
81
spacy/cli/init_config.py
Normal file
81
spacy/cli/init_config.py
Normal file
|
@ -0,0 +1,81 @@
|
||||||
|
from typing import Optional, List
|
||||||
|
from pathlib import Path
|
||||||
|
from thinc.api import Config
|
||||||
|
from wasabi import msg
|
||||||
|
|
||||||
|
from ..util import load_model_from_config, get_lang_class, load_model
|
||||||
|
from ._util import init_cli, Arg, Opt, show_validation_error
|
||||||
|
|
||||||
|
|
||||||
|
@init_cli.command("config")
def init_config_cli(
    # fmt: off
    output_path: Path = Arg("-", help="Output path or - for stdout", allow_dash=True),
    base_path: Optional[Path] = Opt(None, "--base", "-b", help="Optional base config to fill", exists=True, dir_okay=False),
    model: Optional[str] = Opt(None, "--model", "-m", help="Optional model to copy config from"),
    lang: Optional[str] = Opt(None, "--lang", "-l", help="Optional language code for blank config"),
    pipeline: Optional[str] = Opt(None, "--pipeline", "-p", help="Optional pipeline components to use")
    # fmt: on
):
    """Generate a starter config.cfg for training."""
    # The docstring above is kept short on purpose: it is the command's help
    # text on the CLI.
    # Exits with an error unless exactly one of --base/--model/--lang is set.
    validate_cli_args(base_path, model, lang)
    # "-" means: print the config instead of writing it to a file. In that
    # case the info messages are silenced so stdout only contains the config
    # and can be piped forward.
    is_stdout = str(output_path) == "-"
    # Split the comma-separated component names and drop empty entries, so a
    # trailing or doubled comma ("tagger,parser,") does not produce an empty
    # component name.
    pipe_names = []
    if pipeline:
        stripped = (name.strip() for name in pipeline.split(","))
        pipe_names = [name for name in stripped if name]
    cfg = init_config(
        output_path, base_path, model, lang, pipe_names, silent=is_stdout
    )
    if is_stdout:
        print(cfg.to_str())
    else:
        cfg.to_disk(output_path)
        msg.good("Saved config", output_path)
|
||||||
|
|
||||||
|
|
||||||
|
def init_config(
    output_path: Path,
    config_path: Optional[Path],
    model: Optional[str],
    lang: Optional[str],
    pipeline: Optional[List[str]],
    silent: bool = False,
) -> Config:
    """Build a config from a base config, an existing model or a blank language.

    The sources are checked in the order base config, model, language, and
    the first one that is not None is used.

    output_path (Path): Not used by this function; kept in the signature for
        existing callers.
    config_path (Optional[Path]): Base config file to load and auto-fill.
    model (Optional[str]): Model to load and copy the config from.
    lang (Optional[str]): Language code to create a blank pipeline for.
    pipeline (Optional[List[str]]): Component names. With a model, only these
        components are kept and missing ones are added; if empty or None, the
        model's components are left untouched. With a language, the
        components are added in order. Ignored for a base config.
    silent (bool): Don't print info messages.
    RETURNS (Config): The config of the resulting nlp object.
    RAISES (ValueError): If none of config_path, model and lang is provided.
    """
    # Treat a missing pipeline like an empty one so the membership checks and
    # loops below never operate on None.
    pipe_names = list(pipeline) if pipeline else []
    if config_path is not None:
        msg.info("Generating config from base config", show=not silent)
        # hint_init=False: the "run init config" hint would be circular here.
        with show_validation_error(config_path, hint_init=False):
            config = Config().from_disk(config_path)
            try:
                nlp, _ = load_model_from_config(config, auto_fill=True)
            except ValueError as e:
                msg.fail(str(e), exits=1)
        return nlp.config
    if model is not None:
        ext = f" with pipeline {pipeline}" if pipeline else ""
        msg.info(f"Generating config from model {model}{ext}", show=not silent)
        nlp = load_model(model)
        if pipe_names:
            # Only filter the model's components if a pipeline was actually
            # requested; otherwise the copied config would lose all of them.
            # Iterate over a snapshot, since removing changes the pipeline.
            for existing_pipe_name in list(nlp.pipe_names):
                if existing_pipe_name not in pipe_names:
                    nlp.remove_pipe(existing_pipe_name)
        for pipe_name in pipe_names:
            if pipe_name not in nlp.pipe_names:
                nlp.add_pipe(pipe_name)
        return nlp.config
    if lang is not None:
        ext = f" with pipeline {pipeline}" if pipeline else ""
        msg.info(f"Generating config for language '{lang}'{ext}", show=not silent)
        nlp = get_lang_class(lang)()
        for pipe_name in pipe_names:
            nlp.add_pipe(pipe_name)
        return nlp.config
    # Previously this fell through and implicitly returned None, which only
    # surfaced later as an attribute error in the caller.
    raise ValueError(
        "Can't generate config: expected one of config_path, model or lang, "
        "but all of them are None."
    )
|
||||||
|
|
||||||
|
|
||||||
|
def validate_cli_args(
    config_path: Optional[Path], model: Optional[str], lang: Optional[str]
) -> None:
    """Check that exactly one of the config sources was provided.

    config_path (Optional[Path]): Value of the --base option.
    model (Optional[str]): Value of the --model option.
    lang (Optional[str]): Value of the --lang option.

    Prints an error listing the received options and exits with status 1 if
    none or more than one of them is set.
    """
    candidates = (("--base", config_path), ("--model", model), ("--lang", lang))
    provided = [(flag, value) for flag, value in candidates if value is not None]
    if len(provided) == 1:
        return
    received = " ".join(f"{flag} {value}" for flag, value in provided)
    msg.fail(
        "The init config command expects only one of the following arguments: "
        "--base (base config to fill and update), --lang (language code to "
        "use for blank config) or --model (base model to copy config from).",
        f"Got: {received if received else 'no arguments'}",
        exits=1,
    )
|
|
@ -10,14 +10,14 @@ import gzip
|
||||||
import zipfile
|
import zipfile
|
||||||
import srsly
|
import srsly
|
||||||
import warnings
|
import warnings
|
||||||
from wasabi import Printer
|
from wasabi import msg, Printer
|
||||||
|
import typer
|
||||||
|
|
||||||
from ._util import app, Arg, Opt
|
from ._util import app, init_cli, Arg, Opt
|
||||||
from ..vectors import Vectors
|
from ..vectors import Vectors
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
|
from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
|
||||||
from ..lookups import Lookups
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import ftfy
|
import ftfy
|
||||||
|
@ -28,9 +28,15 @@ except ImportError:
|
||||||
DEFAULT_OOV_PROB = -20
|
DEFAULT_OOV_PROB = -20
|
||||||
|
|
||||||
|
|
||||||
@app.command("init-model")
|
@init_cli.command("model")
|
||||||
|
@app.command(
|
||||||
|
"init-model",
|
||||||
|
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||||
|
hidden=True, # hide this from main CLI help but still allow it to work with warning
|
||||||
|
)
|
||||||
def init_model_cli(
|
def init_model_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
|
ctx: typer.Context, # This is only used to read additional arguments
|
||||||
lang: str = Arg(..., help="Model language"),
|
lang: str = Arg(..., help="Model language"),
|
||||||
output_dir: Path = Arg(..., help="Model output directory"),
|
output_dir: Path = Arg(..., help="Model output directory"),
|
||||||
freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
|
freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
|
||||||
|
@ -48,6 +54,12 @@ def init_model_cli(
|
||||||
Create a new model from raw data. If vectors are provided in Word2Vec format,
|
Create a new model from raw data. If vectors are provided in Word2Vec format,
|
||||||
they can be either a .txt or zipped as a .zip or .tar.gz.
|
they can be either a .txt or zipped as a .zip or .tar.gz.
|
||||||
"""
|
"""
|
||||||
|
if ctx.command.name == "init-model":
|
||||||
|
msg.warn(
|
||||||
|
"The init-model command is now available via the 'init model' "
|
||||||
|
"subcommand (without the hyphen). You can run python -m spacy init "
|
||||||
|
"--help for an overview of the other available initialization commands."
|
||||||
|
)
|
||||||
init_model(
|
init_model(
|
||||||
lang,
|
lang,
|
||||||
output_dir,
|
output_dir,
|
||||||
|
|
|
@ -87,8 +87,8 @@ def pretrain(
|
||||||
else:
|
else:
|
||||||
msg.info("Using CPU")
|
msg.info("Using CPU")
|
||||||
msg.info(f"Loading config from: {config_path}")
|
msg.info(f"Loading config from: {config_path}")
|
||||||
|
with show_validation_error(config_path):
|
||||||
config = Config().from_disk(config_path)
|
config = Config().from_disk(config_path)
|
||||||
with show_validation_error():
|
|
||||||
nlp, config = util.load_model_from_config(config, overrides=config_overrides)
|
nlp, config = util.load_model_from_config(config, overrides=config_overrides)
|
||||||
# TODO: validate that [pretraining] block exists
|
# TODO: validate that [pretraining] block exists
|
||||||
if not output_dir.exists():
|
if not output_dir.exists():
|
||||||
|
|
|
@ -79,10 +79,11 @@ def train(
|
||||||
else:
|
else:
|
||||||
msg.info("Using CPU")
|
msg.info("Using CPU")
|
||||||
msg.info(f"Loading config and nlp from: {config_path}")
|
msg.info(f"Loading config and nlp from: {config_path}")
|
||||||
|
with show_validation_error(config_path):
|
||||||
config = Config().from_disk(config_path)
|
config = Config().from_disk(config_path)
|
||||||
if config.get("training", {}).get("seed") is not None:
|
if config.get("training", {}).get("seed") is not None:
|
||||||
fix_random_seed(config["training"]["seed"])
|
fix_random_seed(config["training"]["seed"])
|
||||||
with show_validation_error():
|
with show_validation_error(config_path):
|
||||||
nlp, config = util.load_model_from_config(config, overrides=config_overrides)
|
nlp, config = util.load_model_from_config(config, overrides=config_overrides)
|
||||||
if config["training"]["base_model"]:
|
if config["training"]["base_model"]:
|
||||||
# TODO: do something to check base_nlp against regular nlp described in config?
|
# TODO: do something to check base_nlp against regular nlp described in config?
|
||||||
|
@ -245,9 +246,7 @@ def create_evaluation_callback(
|
||||||
cfg: Union[Config, Dict[str, Any]],
|
cfg: Union[Config, Dict[str, Any]],
|
||||||
) -> Callable[[], Tuple[float, Dict[str, float]]]:
|
) -> Callable[[], Tuple[float, Dict[str, float]]]:
|
||||||
def evaluate() -> Tuple[float, Dict[str, float]]:
|
def evaluate() -> Tuple[float, Dict[str, float]]:
|
||||||
dev_examples = corpus.dev_dataset(
|
dev_examples = corpus.dev_dataset(nlp, gold_preproc=cfg["gold_preproc"])
|
||||||
nlp, gold_preproc=cfg["gold_preproc"]
|
|
||||||
)
|
|
||||||
dev_examples = list(dev_examples)
|
dev_examples = list(dev_examples)
|
||||||
n_words = sum(len(ex.predicted) for ex in dev_examples)
|
n_words = sum(len(ex.predicted) for ex in dev_examples)
|
||||||
batch_size = cfg["eval_batch_size"]
|
batch_size = cfg["eval_batch_size"]
|
||||||
|
|
|
@ -13,8 +13,9 @@ from ..util import get_package_path, get_model_meta, is_compatible_version
|
||||||
@app.command("validate")
|
@app.command("validate")
|
||||||
def validate_cli():
|
def validate_cli():
|
||||||
"""
|
"""
|
||||||
Validate that the currently installed version of spaCy is compatible
|
Validate the currently installed models and spaCy version. Checks if the
|
||||||
with the installed models. Should be run after `pip install -U spacy`.
|
installed models are compatible and shows upgrade instructions if available.
|
||||||
|
Should be run after `pip install -U spacy`.
|
||||||
"""
|
"""
|
||||||
validate()
|
validate()
|
||||||
|
|
||||||
|
|
|
@ -6,11 +6,11 @@ menu:
|
||||||
- ['Download', 'download']
|
- ['Download', 'download']
|
||||||
- ['Info', 'info']
|
- ['Info', 'info']
|
||||||
- ['Validate', 'validate']
|
- ['Validate', 'validate']
|
||||||
|
- ['Init', 'init']
|
||||||
- ['Convert', 'convert']
|
- ['Convert', 'convert']
|
||||||
- ['Debug', 'debug']
|
- ['Debug', 'debug']
|
||||||
- ['Train', 'train']
|
- ['Train', 'train']
|
||||||
- ['Pretrain', 'pretrain']
|
- ['Pretrain', 'pretrain']
|
||||||
- ['Init Model', 'init-model']
|
|
||||||
- ['Evaluate', 'evaluate']
|
- ['Evaluate', 'evaluate']
|
||||||
- ['Package', 'package']
|
- ['Package', 'package']
|
||||||
- ['Project', 'project']
|
- ['Project', 'project']
|
||||||
|
@ -94,6 +94,80 @@ $ python -m spacy validate
|
||||||
| ---------- | -------- | --------------------------------------------------------- |
|
| ---------- | -------- | --------------------------------------------------------- |
|
||||||
| **PRINTS** | `stdout` | Details about the compatibility of your installed models. |
|
| **PRINTS** | `stdout` | Details about the compatibility of your installed models. |
|
||||||
|
|
||||||
|
## Init {#init new="3"}
|
||||||
|
|
||||||
|
The `spacy init` CLI includes helpful commands for initializing training config
|
||||||
|
files and model directories.
|
||||||
|
|
||||||
|
### init config {#init-config new="3"}
|
||||||
|
|
||||||
|
Initialize and export a [`config.cfg` file](/usage/training#config) for training
|
||||||
|
and update it with all default values, if possible. Config files used for
|
||||||
|
training should always be complete and not contain any hidden defaults or
|
||||||
|
missing values, so this command helps you create your final config. It takes
|
||||||
|
**one** of the following options:
|
||||||
|
|
||||||
|
- `--base`: Base **config** to auto-fill, e.g. created using the
|
||||||
|
[training quickstart](/usage/training#quickstart) widget.
|
||||||
|
- `--lang`: Base **language** code to use for blank config.
|
||||||
|
- `--model`: Base **model** to copy config from.
|
||||||
|
|
||||||
|
> ```bash
|
||||||
|
> ### with base config {wrap="true"}
|
||||||
|
> $ python -m spacy init config config.cfg --base base.cfg
|
||||||
|
> ```
|
||||||
|
>
|
||||||
|
> ```bash
|
||||||
|
> ### blank language {wrap="true"}
|
||||||
|
> $ python -m spacy init config config.cfg --lang en --pipeline tagger,parser
|
||||||
|
> ```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ python -m spacy init config [output] [--base] [--lang] [--model] [--pipeline]
|
||||||
|
```
|
||||||
|
|
||||||
|
| Argument | Type | Description |
|
||||||
|
| ------------------ | ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `output` | positional | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. |
|
||||||
|
| `--base`, `-b` | option | Optional base config file to auto-fill with defaults. |
|
||||||
|
| `--lang`, `-l` | option | Optional language code to use for blank config. If a `--pipeline` is specified, the components will be added in order. |
|
||||||
|
| `--model`, `-m` | option | Optional base model to copy config from. If a `--pipeline` is specified, only those components will be kept, and all other components not in the model will be added. |
|
||||||
|
| `--pipeline`, `-p` | option | Optional comma-separated pipeline of components to add to blank language or model. |
|
||||||
|
| **CREATES** | config | Complete and auto-filled config file for training. |
|
||||||
|
|
||||||
|
### init model {#init-model new="2"}
|
||||||
|
|
||||||
|
<!-- TODO: update for v3 -->
|
||||||
|
|
||||||
|
Create a new model directory from raw data, like word frequencies, Brown
|
||||||
|
clusters and word vectors. This command is similar to the `spacy model` command
|
||||||
|
in v1.x. Note that in order to populate the model's vocab, you need to pass in a
|
||||||
|
JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) as
|
||||||
|
`--jsonl-loc` with optional `id` values that correspond to the vectors table.
|
||||||
|
Just loading in vectors will not automatically populate the vocab.
|
||||||
|
|
||||||
|
<Infobox title="New in v3.0" variant="warning">
|
||||||
|
|
||||||
|
The `init-model` command is now available as a subcommand of `spacy init`.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ python -m spacy init model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
|
||||||
|
[--prune-vectors]
|
||||||
|
```
|
||||||
|
|
||||||
|
| Argument | Type | Description |
|
||||||
|
| ------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
|
| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. |
|
||||||
|
| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. |
|
||||||
|
| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. |
|
||||||
|
| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
|
||||||
|
| `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag> | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. |
|
||||||
|
| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
|
||||||
|
| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. |
|
||||||
|
| **CREATES** | model | A spaCy model containing the vocab and vectors. |
|
||||||
|
|
||||||
## Convert {#convert}
|
## Convert {#convert}
|
||||||
|
|
||||||
Convert files into spaCy's
|
Convert files into spaCy's
|
||||||
|
@ -469,32 +543,6 @@ tokenization can be provided.
|
||||||
{"tokens": ["If", "tokens", "are", "provided", "then", "we", "can", "skip", "the", "raw", "input", "text"]}
|
{"tokens": ["If", "tokens", "are", "provided", "then", "we", "can", "skip", "the", "raw", "input", "text"]}
|
||||||
```
|
```
|
||||||
|
|
||||||
## Init Model {#init-model new="2"}
|
|
||||||
|
|
||||||
Create a new model directory from raw data, like word frequencies, Brown
|
|
||||||
clusters and word vectors. This command is similar to the `spacy model` command
|
|
||||||
in v1.x. Note that in order to populate the model's vocab, you need to pass in a
|
|
||||||
JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) as
|
|
||||||
`--jsonl-loc` with optional `id` values that correspond to the vectors table.
|
|
||||||
Just loading in vectors will not automatically populate the vocab.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
$ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
|
|
||||||
[--prune-vectors]
|
|
||||||
```
|
|
||||||
|
|
||||||
| Argument | Type | Description |
|
|
||||||
| ----------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
|
||||||
| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. |
|
|
||||||
| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. |
|
|
||||||
| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. |
|
|
||||||
| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
|
|
||||||
| `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag> | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. |
|
|
||||||
| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
|
|
||||||
| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. |
|
|
||||||
| `--omit-extra-lookups`, `-OEL` <Tag variant="new">2.3</Tag> | flag | Do not include any of the extra lookups tables (`cluster`/`prob`/`sentiment`) from `spacy-lookups-data` in the model. |
|
|
||||||
| **CREATES** | model | A spaCy model containing the vocab and vectors. |
|
|
||||||
|
|
||||||
## Evaluate {#evaluate new="2"}
|
## Evaluate {#evaluate new="2"}
|
||||||
|
|
||||||
<!-- TODO: document new evaluate command -->
|
<!-- TODO: document new evaluate command -->
|
||||||
|
|
|
@ -44,24 +44,12 @@ following data and information:
|
||||||
2. A [`config.cfg`](#config) **configuration file** with all settings and
|
2. A [`config.cfg`](#config) **configuration file** with all settings and
|
||||||
hyperparameters.
|
hyperparameters.
|
||||||
3. An optional **Python file** to register
|
3. An optional **Python file** to register
|
||||||
[custom models and architectures](#custom-models).
|
[custom functions and architectures](#custom-code).
|
||||||
|
|
||||||
<!-- TODO: decide how we want to present the "getting started" workflow here, get a default config etc. -->
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ python -m spacy train train.spacy dev.spacy config.cfg --output ./output
|
$ python -m spacy train train.spacy dev.spacy config.cfg --output ./output
|
||||||
```
|
```
|
||||||
|
|
||||||
> #### Tip: Debug your data
|
|
||||||
>
|
|
||||||
> The [`debug-data` command](/api/cli#debug-data) lets you analyze and validate
|
|
||||||
> your training and development data, get useful stats, and find problems like
|
|
||||||
> invalid entity annotations, cyclic dependencies, low data labels and more.
|
|
||||||
>
|
|
||||||
> ```bash
|
|
||||||
> $ python -m spacy debug-data en train.spacy dev.spacy --verbose
|
|
||||||
> ```
|
|
||||||
|
|
||||||
<Project id="some_example_project">
|
<Project id="some_example_project">
|
||||||
|
|
||||||
The easiest way to get started with an end-to-end training process is to clone a
|
The easiest way to get started with an end-to-end training process is to clone a
|
||||||
|
@ -74,16 +62,42 @@ workflows, from data preprocessing to training and packaging your model.
|
||||||
|
|
||||||
> #### Instructions
|
> #### Instructions
|
||||||
>
|
>
|
||||||
> 1. Select your requirements and settings. The quickstart widget will
|
> 1. Select your requirements and settings.
|
||||||
> auto-generate a recommended starter config for you.
|
|
||||||
> 2. Use the buttons at the bottom to save the result to your clipboard or a
|
> 2. Use the buttons at the bottom to save the result to your clipboard or a
|
||||||
> file `config.cfg`.
|
> file `base_config.cfg`.
|
||||||
> 3. TOOD: recommended approach for filling config
|
> 3. Run [`init config`](/api/cli#init-config) to create a full training config.
|
||||||
> 4. Run [`spacy train`](/api/cli#train) with your config and data.
|
> 4. Run [`train`](/api/cli#train) with your config and data.
|
||||||
|
|
||||||
import QuickstartTraining from 'widgets/quickstart-training.js'
|
import QuickstartTraining from 'widgets/quickstart-training.js'
|
||||||
|
|
||||||
<QuickstartTraining />
|
<QuickstartTraining download="base_config.cfg" />
|
||||||
|
|
||||||
|
After you've saved the starter config to a file `base_config.cfg`, you can use
|
||||||
|
the [`init config`](/api/cli#init-config) command to fill in the remaining
|
||||||
|
defaults. Training configs should always be **complete and without hidden
|
||||||
|
defaults**, to keep your experiments reproducible.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ python -m spacy init config config.cfg --base base_config.cfg
|
||||||
|
```
|
||||||
|
|
||||||
|
> #### Tip: Debug your data
|
||||||
|
>
|
||||||
|
> The [`debug-data` command](/api/cli#debug-data) lets you analyze and validate
|
||||||
|
> your training and development data, get useful stats, and find problems like
|
||||||
|
> invalid entity annotations, cyclic dependencies, low data labels and more.
|
||||||
|
>
|
||||||
|
> ```bash
|
||||||
|
> $ python -m spacy debug-data en train.spacy dev.spacy --verbose
|
||||||
|
> ```
|
||||||
|
|
||||||
|
You can now run [`train`](/api/cli#train) with your training and development
|
||||||
|
data and the training config. See the [`convert`](/api/cli#convert) command for
|
||||||
|
details on how to convert your data to spaCy's binary `.spacy` format.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ python -m spacy train train.spacy dev.spacy config.cfg --output ./output
|
||||||
|
```
|
||||||
|
|
||||||
## Training config {#config}
|
## Training config {#config}
|
||||||
|
|
||||||
|
|
|
@ -165,10 +165,8 @@ resolved, the function is created and passed into the model as an argument.
|
||||||
Remember that the `config.cfg` used for training should contain **no missing
|
Remember that the `config.cfg` used for training should contain **no missing
|
||||||
values** and requires all settings to be defined. You don't want any hidden
|
values** and requires all settings to be defined. You don't want any hidden
|
||||||
defaults creeping in and changing your results! spaCy will tell you if settings
|
defaults creeping in and changing your results! spaCy will tell you if settings
|
||||||
are missing, and you can run [`spacy debug config`](/api/cli#debug-config) with
|
are missing, and you can run [`spacy init config`](/api/cli#init-config) to
|
||||||
`--auto-fill` to automatically fill in all defaults.
|
automatically fill in all defaults.
|
||||||
|
|
||||||
<!-- TODO: update with details on getting started with a config -->
|
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user