From b4e457d9fe43c186b4540107ca70e1d1bb60c75a Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 1 Aug 2023 22:24:02 +0900 Subject: [PATCH] Accept multiple code files in all CLI commands (#12101) * Add support for multiple code files to all relevant commands Prior to this, only the package command supported multiple code files. * Update docs * Add debug data test, plus generic fixtures One tricky thing here: it's tempting to create the config by creating a pipeline in code, but that requires declaring the custom components here. However, the CliRunner appears to run in the same process or otherwise have access to our registry, so it works even without any code arguments. So it's necessary to avoid declaring the components in the tests. * Add debug config test and restructure The code argument imports the provided file. If it adds items to the registry, that affects global state, which CliRunner doesn't isolate. Since there's no standard way to remove things from the registry, this instead uses subprocess.run to run commands. * Use a more generic, parametrized test * Add output arg for assemble and pretrain Assemble and pretrain require an output argument. This commit adds assemble testing, but not pretrain, as that requires an actual trainable component, which is not currently in the test config. 
* Add evaluate test and some cleanup * Mark tests as slow * Revert argument name change * Apply suggestions from code review Co-authored-by: Adriane Boyd * Format API CLI docs * isort * Fix imports in tests * isort * Undo changes to package CLI help * Fix python executable and lang code in test * Fix executable in another test --------- Co-authored-by: Adriane Boyd Co-authored-by: Raphael Mitsch --- spacy/cli/_util.py | 7 ++ spacy/cli/assemble.py | 6 +- spacy/cli/debug_config.py | 6 +- spacy/cli/debug_data.py | 6 +- spacy/cli/evaluate.py | 6 +- spacy/cli/package.py | 2 +- spacy/cli/pretrain.py | 6 +- spacy/cli/train.py | 6 +- spacy/tests/test_cli_app.py | 207 ++++++++++++++++++++++++++++++++++++ website/docs/api/cli.mdx | 107 ++++++++++--------- 10 files changed, 287 insertions(+), 72 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index f2ac20ae5..ca92cdd23 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -350,6 +350,13 @@ def show_validation_error( msg.fail("Config validation error", e, exits=1) +def import_code_paths(code_paths: str) -> None: + """Helper to import comma-separated list of code paths.""" + code_paths = [Path(p.strip()) for p in string_to_list(code_paths)] + for code_path in code_paths: + import_code(code_path) + + def import_code(code_path: Optional[Union[Path, str]]) -> None: """Helper to import Python file provided in training commands / commands using the config. This makes custom registered functions available. 
diff --git a/spacy/cli/assemble.py b/spacy/cli/assemble.py index ee2500b27..f552c8459 100644 --- a/spacy/cli/assemble.py +++ b/spacy/cli/assemble.py @@ -11,7 +11,7 @@ from ._util import ( Arg, Opt, app, - import_code, + import_code_paths, parse_config_overrides, show_validation_error, ) @@ -26,7 +26,7 @@ def assemble_cli( ctx: typer.Context, # This is only used to read additional arguments config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), output_path: Path = Arg(..., help="Output directory to store assembled pipeline in"), - code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), # fmt: on ): @@ -45,7 +45,7 @@ def assemble_cli( if not config_path or (str(config_path) != "-" and not config_path.exists()): msg.fail("Config file not found", config_path, exits=1) overrides = parse_config_overrides(ctx.args) - import_code(code_path) + import_code_paths(code_path) with show_validation_error(config_path): config = util.load_config(config_path, overrides=overrides, interpolate=False) msg.divider("Initializing pipeline") diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py index 0e5382cd9..7818b4087 100644 --- a/spacy/cli/debug_config.py +++ b/spacy/cli/debug_config.py @@ -13,7 +13,7 @@ from ._util import ( Arg, Opt, debug_cli, - import_code, + import_code_paths, parse_config_overrides, show_validation_error, ) @@ -27,7 +27,7 @@ def debug_config_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), - code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", 
help="Path to Python file with additional code (registered functions) to be imported"), + code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"), show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"), show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.") # fmt: on @@ -44,7 +44,7 @@ def debug_config_cli( DOCS: https://spacy.io/api/cli#debug-config """ overrides = parse_config_overrides(ctx.args) - import_code(code_path) + import_code_paths(code_path) debug_config( config_path, overrides=overrides, show_funcs=show_funcs, show_vars=show_vars ) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 4c44a8c0e..714969be1 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -40,7 +40,7 @@ from ._util import ( _format_number, app, debug_cli, - import_code, + import_code_paths, parse_config_overrides, show_validation_error, ) @@ -72,7 +72,7 @@ def debug_data_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), - code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"), ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"), verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"), no_format: bool = Opt(False, "--no-format", "-NF", 
help="Don't pretty-print the results"), @@ -92,7 +92,7 @@ def debug_data_cli( "--help for an overview of the other available debugging commands." ) overrides = parse_config_overrides(ctx.args) - import_code(code_path) + import_code_paths(code_path) debug_data( config_path, config_overrides=overrides, diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 6235b658d..f035aa3ce 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -10,7 +10,7 @@ from .. import displacy, util from ..scorer import Scorer from ..tokens import Doc from ..training import Corpus -from ._util import Arg, Opt, app, benchmark_cli, import_code, setup_gpu +from ._util import Arg, Opt, app, benchmark_cli, import_code_paths, setup_gpu @benchmark_cli.command( @@ -22,7 +22,7 @@ def evaluate_cli( model: str = Arg(..., help="Model name or path"), data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True), output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False), - code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"), displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False), @@ -42,7 +42,7 @@ def evaluate_cli( DOCS: https://spacy.io/api/cli#benchmark-accuracy """ - import_code(code_path) + import_code_paths(code_path) evaluate( model, data_path, diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 4545578e6..01449f957 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -20,7 +20,7 @@ def 
package_cli( # fmt: off input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False), output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False), - code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"), + code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be included in the package"), meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False), create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"), name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"), diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 446c40510..73337a7ca 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -11,7 +11,7 @@ from ._util import ( Arg, Opt, app, - import_code, + import_code_paths, parse_config_overrides, setup_gpu, show_validation_error, @@ -27,7 +27,7 @@ def pretrain_cli( ctx: typer.Context, # This is only used to read additional arguments config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True), output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"), - code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"), resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"), epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. 
Prevents unintended overwriting of existing weight files."), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), @@ -56,7 +56,7 @@ def pretrain_cli( DOCS: https://spacy.io/api/cli#pretrain """ config_overrides = parse_config_overrides(ctx.args) - import_code(code_path) + import_code_paths(code_path) verify_cli_args(config_path, output_dir, resume_path, epoch_resume) setup_gpu(use_gpu) msg.info(f"Loading config from: {config_path}") diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 8bdabd39c..eb1a1a2c1 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -13,7 +13,7 @@ from ._util import ( Arg, Opt, app, - import_code, + import_code_paths, parse_config_overrides, setup_gpu, show_validation_error, @@ -28,7 +28,7 @@ def train_cli( ctx: typer.Context, # This is only used to read additional arguments config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"), - code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") # fmt: on @@ -49,7 +49,7 @@ def train_cli( """ util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) overrides = parse_config_overrides(ctx.args) - import_code(code_path) + import_code_paths(code_path) train(config_path, output_path, use_gpu=use_gpu, overrides=overrides) diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py index 3a426113b..2424138d3 100644 --- a/spacy/tests/test_cli_app.py +++ b/spacy/tests/test_cli_app.py @@ -1,10 
+1,13 @@ import os +import subprocess +import sys from pathlib import Path import pytest import srsly from typer.testing import CliRunner +import spacy from spacy.cli._util import app, get_git_version from spacy.tokens import Doc, DocBin @@ -46,6 +49,210 @@ def test_convert_auto_conflict(): assert len(out_files) == 0 +NOOP_CONFIG = """ +[paths] +train = null +dev = null +vectors = null +init_tok2vec = null + +[system] +seed = 0 +gpu_allocator = null + +[nlp] +lang = "mul" +pipeline = ["noop", "noop2"] +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +batch_size = 1000 +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} + +[components] + +[components.noop] +factory = "noop" + +[components.noop2] +factory = "noop2" + +[corpora] + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +gold_preproc = false +max_length = 0 +limit = 0 +augmenter = null + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +gold_preproc = false +max_length = 0 +limit = 0 +augmenter = null + +[training] +seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} +dropout = 0.1 +accumulate_gradient = 1 +patience = 1600 +max_epochs = 0 +max_steps = 100 +eval_frequency = 200 +frozen_components = [] +annotating_components = [] +dev_corpus = "corpora.dev" + +train_corpus = "corpora.train" +before_to_disk = null +before_update = null + +[training.batcher] +@batchers = "spacy.batch_by_words.v1" +discard_oversize = false +tolerance = 0.2 +get_length = null + +[training.batcher.size] +@schedules = "compounding.v1" +start = 100 +stop = 1000 +compound = 1.001 +t = 0.0 + +[training.logger] +@loggers = "spacy.ConsoleLogger.v1" +progress_bar = false + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = false +eps = 0.00000001 +learn_rate = 0.001 + +[training.score_weights] + +[pretraining] + +[initialize] +vectors = ${paths.vectors} 
+init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.tokenizer] +""" + + +@pytest.fixture +def data_paths(): + nlp = spacy.blank("mul") + doc = nlp("ok") + with make_tempdir() as tdir: + db = DocBin() + # debug data will *fail* if there aren't enough docs + for ii in range(100): + db.add(doc) + fpath = tdir / "data.spacy" + db.to_disk(fpath) + + args = [ + "--paths.train", + str(fpath), + "--paths.dev", + str(fpath), + ] + yield args + + +@pytest.fixture +def code_paths(): + noop_base = """ +from spacy.language import Language + +@Language.component("{}") +def noop(doc): + return doc +""" + + with make_tempdir() as temp_d: + # write code files to load + paths = [] + for ff in ["noop", "noop2"]: + pyfile = temp_d / f"{ff}.py" + pyfile.write_text(noop_base.format(ff)) + paths.append(pyfile) + + args = ["--code", ",".join([str(pp) for pp in paths])] + yield args + + +@pytest.fixture +def noop_config(): + with make_tempdir() as temp_d: + cfg = temp_d / "config.cfg" + cfg.write_text(NOOP_CONFIG) + + yield cfg + + +@pytest.mark.slow +@pytest.mark.parametrize( + "cmd", + ["debug config", "debug data", "train", "assemble"], +) +def test_multi_code(cmd, code_paths, data_paths, noop_config): + # check that it fails without the code arg + cmd = cmd.split() + output = ["."] if cmd[0] == "assemble" else [] + cmd = [sys.executable, "-m", "spacy"] + cmd + result = subprocess.run([*cmd, str(noop_config), *output, *data_paths]) + assert result.returncode == 1 + + # check that it succeeds with the code arg + result = subprocess.run([*cmd, str(noop_config), *output, *data_paths, *code_paths]) + assert result.returncode == 0 + + +@pytest.mark.slow +def test_multi_code_evaluate(code_paths, data_paths, noop_config): + # Evaluation requires a model, not a config, so this works differently from + # the other commands. 
+ + # Train a model to evaluate + cmd = f"{sys.executable} -m spacy train {noop_config} -o model".split() + result = subprocess.run([*cmd, *data_paths, *code_paths]) + assert result.returncode == 0 + + # now do the evaluation + + eval_data = data_paths[-1] + cmd = f"{sys.executable} -m spacy evaluate model/model-best {eval_data}".split() + + # check that it fails without the code arg + result = subprocess.run(cmd) + assert result.returncode == 1 + + # check that it succeeds with the code arg + result = subprocess.run([*cmd, *code_paths]) + assert result.returncode == 0 + + def test_benchmark_accuracy_alias(): # Verify that the `evaluate` alias works correctly. result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"]) diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index b6a32f722..002b0b39d 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -175,15 +175,15 @@ validation error with more details. $ python -m spacy init fill-config [base_path] [output_file] [--diff] ``` -| Name | Description | -| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `base_path` | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ | -| `output_file` | Path to output `.cfg` file or "-" to write to stdout so you can pipe it to a file. Defaults to "-" (stdout). ~~Path (positional)~~ | -| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | -| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~ | -| `--diff`, `-D` | Print a visual diff highlighting the changes. 
~~bool (flag)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | Complete and auto-filled config file for training. | +| Name | Description | +| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `base_path` | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ | +| `output_file` | Path to output `.cfg` file or "-" to write to stdout so you can pipe it to a file. Defaults to "-" (stdout). ~~Path (positional)~~ | +| `--code`, `-c` | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~ | +| `--diff`, `-D` | Print a visual diff highlighting the changes. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | Complete and auto-filled config file for training. | ### init vectors {id="init-vectors",version="3",tag="command"} @@ -242,7 +242,7 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [ | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). 
~~Union[Path, str] \(positional)~~ | | `output_path` | Output directory for the label files. Will create one JSON file per component. ~~Path (positional)~~ | -| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--code`, `-c` | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | `--verbose`, `-V` | Show more detailed messages for debugging purposes. ~~bool (flag)~~ | | `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | @@ -446,7 +446,7 @@ File /path/to/thinc/thinc/schedules.py (line 91) | Name | Description | | ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ | -| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--code`, `-c` | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. 
~~Optional[Path] \(option)~~ | | `--show-functions`, `-F` | Show an overview of all registered function blocks used in the config and where those functions come from, including the module name, Python file and line number. ~~bool (flag)~~ | | `--show-variables`, `-V` | Show an overview of all variables referenced in the config, e.g. `${paths.train}` and their values that will be used. This also reflects any config overrides provided on the CLI, e.g. `--paths.train /path`. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | @@ -631,7 +631,7 @@ will not be available. | Name | Description | | -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ | -| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--code`, `-c` | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | `--ignore-warnings`, `-IW` | Ignore warnings, only show stats and errors. ~~bool (flag)~~ | | `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ | | `--no-format`, `-NF` | Don't pretty-print the results. Use this if you want to write to a file. 
~~bool (flag)~~ | @@ -1055,7 +1055,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id] | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ | | `--output`, `-o` | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~ | -| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--code`, `-c` | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ | | `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | @@ -1125,6 +1125,7 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [ | -------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). 
~~Union[Path, str] \(positional)~~ | | `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ | +| `--code`, `-c` | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ | | `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ | @@ -1162,19 +1163,19 @@ skew. To render a sample of dependency parses in a HTML file using the $ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit] ``` -| Name | Description | -| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ | -| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ | -| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ | -| `--code`, `-c` 3 | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | -| `--gold-preproc`, `-G` | Use gold preprocessing. 
~~bool (flag)~~ | -| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | -| `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ | -| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ | -| `--per-component`, `-P` 3.6 | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool (flag)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | Training results and optional metrics and visualizations. | +| Name | Description | +| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ | +| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ | +| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ | +| `--code`, `-c` 3 | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ | +| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | +| `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ | +| `--displacy-limit`, `-dl` | Number of parses to generate per file. 
Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ | +| `--per-component`, `-P` 3.6 | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | Training results and optional metrics and visualizations. | ### speed {id="benchmark-speed", version="3.5", tag="command"} @@ -1216,19 +1217,19 @@ When a directory is provided it is traversed recursively to collect all files. $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process] ``` -| Name | Description | -| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ | -| `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ | -| `output-file` | Output `DocBin` path. ~~str (positional)~~ | -| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | -| `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ | -| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ | -| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | -| `--batch-size`, `-b` | Batch size to use for prediction. Defaults to `1`. 
~~int (option)~~ | -| `--n-process`, `-n` | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | A `DocBin` with the annotations from the `model` for all the files found in `data-path`. | +| Name | Description | +| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ | +| `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ | +| `output-file` | Output `DocBin` path. ~~str (positional)~~ | +| `--code`, `-c` | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ | +| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ | +| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | +| `--batch-size`, `-b` | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~ | +| `--n-process`, `-n` | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | A `DocBin` with the annotations from the `model` for all the files found in `data-path`. 
| ## find-threshold {id="find-threshold",version="3.5",tag="command"} @@ -1255,19 +1256,19 @@ be provided. > $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f > ``` -| Name | Description | -| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ | -| `data_path` | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~ | -| `pipe_name` | Name of pipe to examine thresholds for. ~~str (positional)~~ | -| `threshold_key` | Key of threshold attribute in component's configuration. ~~str (positional)~~ | -| `scores_key` | Name of score to metric to optimize. ~~str (positional)~~ | -| `--n_trials`, `-n` | Number of trials to determine optimal thresholds. ~~int (option)~~ | -| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | -| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | -| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ | -| `--verbose`, `-V`, `-VV` | Display more information for debugging purposes. ~~bool (flag)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| Name | Description | +| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ | +| `data_path` | Path to file with DocBin with docs to use for threshold search. 
~~Path (positional)~~ | +| `pipe_name` | Name of pipe to examine thresholds for. ~~str (positional)~~ | +| `threshold_key` | Key of threshold attribute in component's configuration. ~~str (positional)~~ | +| `scores_key` | Name of the score metric to optimize. ~~str (positional)~~ | +| `--n_trials`, `-n` | Number of trials to determine optimal thresholds. ~~int (option)~~ | +| `--code`, `-c` | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | +| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ | +| `--verbose`, `-V`, `-VV` | Display more information for debugging purposes. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | ## assemble {id="assemble",tag="command"} @@ -1291,7 +1292,7 @@ $ python -m spacy assemble [config_path] [output_dir] [--code] [--verbose] [over | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `config_path` | Path to the [config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ | | `output_dir` | Directory to store the final pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~ | -| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions). ~~Optional[Path] \(option)~~ | +| `--code`, `-c` | Comma-separated paths to Python files with additional code to be imported.
Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | `--verbose`, `-V` | Show more detailed messages during processing. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.data ./data`. ~~Any (option/flag)~~ |