diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index f2ac20ae5..ca92cdd23 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -350,6 +350,13 @@ def show_validation_error( msg.fail("Config validation error", e, exits=1) +def import_code_paths(code_paths: str) -> None: + """Helper to import comma-separated list of code paths.""" + code_paths = [Path(p.strip()) for p in string_to_list(code_paths)] + for code_path in code_paths: + import_code(code_path) + + def import_code(code_path: Optional[Union[Path, str]]) -> None: """Helper to import Python file provided in training commands / commands using the config. This makes custom registered functions available. diff --git a/spacy/cli/assemble.py b/spacy/cli/assemble.py index ee2500b27..f552c8459 100644 --- a/spacy/cli/assemble.py +++ b/spacy/cli/assemble.py @@ -11,7 +11,7 @@ from ._util import ( Arg, Opt, app, - import_code, + import_code_paths, parse_config_overrides, show_validation_error, ) @@ -26,7 +26,7 @@ def assemble_cli( ctx: typer.Context, # This is only used to read additional arguments config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), output_path: Path = Arg(..., help="Output directory to store assembled pipeline in"), - code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), # fmt: on ): @@ -45,7 +45,7 @@ def assemble_cli( if not config_path or (str(config_path) != "-" and not config_path.exists()): msg.fail("Config file not found", config_path, exits=1) overrides = parse_config_overrides(ctx.args) - import_code(code_path) + import_code_paths(code_path) with show_validation_error(config_path): config = util.load_config(config_path, overrides=overrides, interpolate=False) msg.divider("Initializing pipeline") diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py index 0e5382cd9..7818b4087 100644 --- a/spacy/cli/debug_config.py +++ b/spacy/cli/debug_config.py @@ -13,7 +13,7 @@ from ._util import ( Arg, Opt, debug_cli, - import_code, + import_code_paths, parse_config_overrides, show_validation_error, ) @@ -27,7 +27,7 @@ def debug_config_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), - code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"), show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"), show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.") # fmt: on @@ -44,7 +44,7 @@ def debug_config_cli( DOCS: https://spacy.io/api/cli#debug-config """ overrides = parse_config_overrides(ctx.args) - import_code(code_path) + import_code_paths(code_path) debug_config( config_path, overrides=overrides, show_funcs=show_funcs, show_vars=show_vars ) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 4c44a8c0e..714969be1 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -40,7 +40,7 @@ from ._util import ( _format_number, app, debug_cli, - import_code, + import_code_paths, parse_config_overrides, show_validation_error, ) @@ -72,7 +72,7 @@ def debug_data_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), - code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"), ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"), verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"), no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"), @@ -92,7 +92,7 @@ def debug_data_cli( "--help for an overview of the other available debugging commands." ) overrides = parse_config_overrides(ctx.args) - import_code(code_path) + import_code_paths(code_path) debug_data( config_path, config_overrides=overrides, diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 6235b658d..f035aa3ce 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -10,7 +10,7 @@ from .. import displacy, util from ..scorer import Scorer from ..tokens import Doc from ..training import Corpus -from ._util import Arg, Opt, app, benchmark_cli, import_code, setup_gpu +from ._util import Arg, Opt, app, benchmark_cli, import_code_paths, setup_gpu @benchmark_cli.command( @@ -22,7 +22,7 @@ def evaluate_cli( model: str = Arg(..., help="Model name or path"), data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True), output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False), - code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"), displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False), @@ -42,7 +42,7 @@ def evaluate_cli( DOCS: https://spacy.io/api/cli#benchmark-accuracy """ - import_code(code_path) + import_code_paths(code_path) evaluate( model, data_path, diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 4545578e6..01449f957 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -20,7 +20,7 @@ def package_cli( # fmt: off input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False), output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False), - code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"), + code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be included in the package"), meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False), create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"), name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"), diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 446c40510..73337a7ca 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -11,7 +11,7 @@ from ._util import ( Arg, Opt, app, - import_code, + import_code_paths, parse_config_overrides, setup_gpu, show_validation_error, @@ -27,7 +27,7 @@ def pretrain_cli( ctx: typer.Context, # This is only used to read additional arguments config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True), output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"), - code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"), resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"), epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), @@ -56,7 +56,7 @@ def pretrain_cli( DOCS: https://spacy.io/api/cli#pretrain """ config_overrides = parse_config_overrides(ctx.args) - import_code(code_path) + import_code_paths(code_path) verify_cli_args(config_path, output_dir, resume_path, epoch_resume) setup_gpu(use_gpu) msg.info(f"Loading config from: {config_path}") diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 8bdabd39c..eb1a1a2c1 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -13,7 +13,7 @@ from ._util import ( Arg, Opt, app, - import_code, + import_code_paths, parse_config_overrides, setup_gpu, show_validation_error, @@ -28,7 +28,7 @@ def train_cli( ctx: typer.Context, # This is only used to read additional arguments config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"), - code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") # fmt: on @@ -49,7 +49,7 @@ def train_cli( """ util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) overrides = parse_config_overrides(ctx.args) - import_code(code_path) + import_code_paths(code_path) train(config_path, output_path, use_gpu=use_gpu, overrides=overrides) diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py index 3a426113b..2424138d3 100644 --- a/spacy/tests/test_cli_app.py +++ b/spacy/tests/test_cli_app.py @@ -1,10 +1,13 @@ import os +import subprocess +import sys from pathlib import Path import pytest import srsly from typer.testing import CliRunner +import spacy from spacy.cli._util import app, get_git_version from spacy.tokens import Doc, DocBin @@ -46,6 +49,210 @@ def test_convert_auto_conflict(): assert len(out_files) == 0 +NOOP_CONFIG = """ +[paths] +train = null +dev = null +vectors = null +init_tok2vec = null + +[system] +seed = 0 +gpu_allocator = null + +[nlp] +lang = "mul" +pipeline = ["noop", "noop2"] +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +batch_size = 1000 +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} + +[components] + +[components.noop] +factory = "noop" + +[components.noop2] +factory = "noop2" + +[corpora] + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +gold_preproc = false +max_length = 0 +limit = 0 +augmenter = null + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +gold_preproc = false +max_length = 0 +limit = 0 +augmenter = null + +[training] +seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} +dropout = 0.1 +accumulate_gradient = 1 +patience = 1600 +max_epochs = 0 +max_steps = 100 +eval_frequency = 200 +frozen_components = [] +annotating_components = [] +dev_corpus = "corpora.dev" + +train_corpus = "corpora.train" +before_to_disk = null +before_update = null + +[training.batcher] +@batchers = "spacy.batch_by_words.v1" +discard_oversize = false +tolerance = 0.2 +get_length = null + +[training.batcher.size] +@schedules = "compounding.v1" +start = 100 +stop = 1000 +compound = 1.001 +t = 0.0 + +[training.logger] +@loggers = "spacy.ConsoleLogger.v1" +progress_bar = false + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = false +eps = 0.00000001 +learn_rate = 0.001 + +[training.score_weights] + +[pretraining] + +[initialize] +vectors = ${paths.vectors} +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.tokenizer] +""" + + +@pytest.fixture +def data_paths(): + nlp = spacy.blank("mul") + doc = nlp("ok") + with make_tempdir() as tdir: + db = DocBin() + # debug data will *fail* if there aren't enough docs + for ii in range(100): + db.add(doc) + fpath = tdir / "data.spacy" + db.to_disk(fpath) + + args = [ + "--paths.train", + str(fpath), + "--paths.dev", + str(fpath), + ] + yield args + + +@pytest.fixture +def code_paths(): + noop_base = """ +from spacy.language import Language + +@Language.component("{}") +def noop(doc): + return doc +""" + + with make_tempdir() as temp_d: + # write code files to load + paths = [] + for ff in ["noop", "noop2"]: + pyfile = temp_d / f"{ff}.py" + pyfile.write_text(noop_base.format(ff)) + paths.append(pyfile) + + args = ["--code", ",".join([str(pp) for pp in paths])] + yield args + + +@pytest.fixture +def noop_config(): + with make_tempdir() as temp_d: + cfg = temp_d / "config.cfg" + cfg.write_text(NOOP_CONFIG) + + yield cfg + + +@pytest.mark.slow +@pytest.mark.parametrize( + "cmd", + ["debug config", "debug data", "train", "assemble"], +) +def test_multi_code(cmd, code_paths, data_paths, noop_config): + # check that it fails without the code arg + cmd = cmd.split() + output = ["."] if cmd[0] == "assemble" else [] + cmd = [sys.executable, "-m", "spacy"] + cmd + result = subprocess.run([*cmd, str(noop_config), *output, *data_paths]) + assert result.returncode == 1 + + # check that it succeeds with the code arg + result = subprocess.run([*cmd, str(noop_config), *output, *data_paths, *code_paths]) + assert result.returncode == 0 + + +@pytest.mark.slow +def test_multi_code_evaluate(code_paths, data_paths, noop_config): + # Evaluation requires a model, not a config, so this works differently from + # the other commands. + + # Train a model to evaluate + cmd = f"{sys.executable} -m spacy train {noop_config} -o model".split() + result = subprocess.run([*cmd, *data_paths, *code_paths]) + assert result.returncode == 0 + + # now do the evaluation + + eval_data = data_paths[-1] + cmd = f"{sys.executable} -m spacy evaluate model/model-best {eval_data}".split() + + # check that it fails without the code arg + result = subprocess.run(cmd) + assert result.returncode == 1 + + # check that it succeeds with the code arg + result = subprocess.run([*cmd, *code_paths]) + assert result.returncode == 0 + + def test_benchmark_accuracy_alias(): # Verify that the `evaluate` alias works correctly. result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"]) diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index b6a32f722..002b0b39d 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -175,15 +175,15 @@ validation error with more details. $ python -m spacy init fill-config [base_path] [output_file] [--diff] ``` -| Name | Description | -| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `base_path` | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ | -| `output_file` | Path to output `.cfg` file or "-" to write to stdout so you can pipe it to a file. Defaults to "-" (stdout). ~~Path (positional)~~ | -| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | -| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~ | -| `--diff`, `-D` | Print a visual diff highlighting the changes. ~~bool (flag)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | Complete and auto-filled config file for training. | +| Name | Description | +| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `base_path` | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ | +| `output_file` | Path to output `.cfg` file or "-" to write to stdout so you can pipe it to a file. Defaults to "-" (stdout). ~~Path (positional)~~ | +| `--code`, `-c` | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~ | +| `--diff`, `-D` | Print a visual diff highlighting the changes. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | Complete and auto-filled config file for training. | ### init vectors {id="init-vectors",version="3",tag="command"} @@ -242,7 +242,7 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [ | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ | | `output_path` | Output directory for the label files. Will create one JSON file per component. ~~Path (positional)~~ | -| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--code`, `-c` | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | `--verbose`, `-V` | Show more detailed messages for debugging purposes. ~~bool (flag)~~ | | `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | @@ -446,7 +446,7 @@ File /path/to/thinc/thinc/schedules.py (line 91) | Name | Description | | ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ | -| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--code`, `-c` | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | `--show-functions`, `-F` | Show an overview of all registered function blocks used in the config and where those functions come from, including the module name, Python file and line number. ~~bool (flag)~~ | | `--show-variables`, `-V` | Show an overview of all variables referenced in the config, e.g. `${paths.train}` and their values that will be used. This also reflects any config overrides provided on the CLI, e.g. `--paths.train /path`. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | @@ -631,7 +631,7 @@ will not be available. | Name | Description | | -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ | -| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--code`, `-c` | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | `--ignore-warnings`, `-IW` | Ignore warnings, only show stats and errors. ~~bool (flag)~~ | | `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ | | `--no-format`, `-NF` | Don't pretty-print the results. Use this if you want to write to a file. ~~bool (flag)~~ | @@ -1055,7 +1055,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id] | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ | | `--output`, `-o` | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~ | -| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--code`, `-c` | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ | | `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | @@ -1125,6 +1125,7 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [ | -------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ | | `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ | +| `--code`, `-c` | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ | | `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ | @@ -1162,19 +1163,19 @@ skew. To render a sample of dependency parses in a HTML file using the $ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit] ``` -| Name | Description | -| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ | -| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ | -| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ | -| `--code`, `-c` 3 | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | -| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ | -| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | -| `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ | -| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ | -| `--per-component`, `-P` 3.6 | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool (flag)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | Training results and optional metrics and visualizations. | +| Name | Description | +| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ | +| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ | +| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ | +| `--code`, `-c` 3 | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ | +| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | +| `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ | +| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ | +| `--per-component`, `-P` 3.6 | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | Training results and optional metrics and visualizations. | ### speed {id="benchmark-speed", version="3.5", tag="command"} @@ -1216,19 +1217,19 @@ When a directory is provided it is traversed recursively to collect all files. $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process] ``` -| Name | Description | -| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ | -| `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ | -| `output-file` | Output `DocBin` path. ~~str (positional)~~ | -| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | -| `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ | -| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ | -| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | -| `--batch-size`, `-b` | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~ | -| `--n-process`, `-n` | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | A `DocBin` with the annotations from the `model` for all the files found in `data-path`. | +| Name | Description | +| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ | +| `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ | +| `output-file` | Output `DocBin` path. ~~str (positional)~~ | +| `--code`, `-c` | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ | +| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ | +| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | +| `--batch-size`, `-b` | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~ | +| `--n-process`, `-n` | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | A `DocBin` with the annotations from the `model` for all the files found in `data-path`. | ## find-threshold {id="find-threshold",version="3.5",tag="command"} @@ -1255,19 +1256,19 @@ be provided. > $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f > ``` -| Name | Description | -| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ | -| `data_path` | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~ | -| `pipe_name` | Name of pipe to examine thresholds for. ~~str (positional)~~ | -| `threshold_key` | Key of threshold attribute in component's configuration. ~~str (positional)~~ | -| `scores_key` | Name of score to metric to optimize. ~~str (positional)~~ | -| `--n_trials`, `-n` | Number of trials to determine optimal thresholds. ~~int (option)~~ | -| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | -| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | -| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ | -| `--verbose`, `-V`, `-VV` | Display more information for debugging purposes. ~~bool (flag)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| Name | Description | +| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ | +| `data_path` | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~ | +| `pipe_name` | Name of pipe to examine thresholds for. ~~str (positional)~~ | +| `threshold_key` | Key of threshold attribute in component's configuration. ~~str (positional)~~ | +| `scores_key` | Name of score to metric to optimize. ~~str (positional)~~ | +| `--n_trials`, `-n` | Number of trials to determine optimal thresholds. ~~int (option)~~ | +| `--code`, `-c` | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | +| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ | +| `--verbose`, `-V`, `-VV` | Display more information for debugging purposes. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | ## assemble {id="assemble",tag="command"} @@ -1291,7 +1292,7 @@ $ python -m spacy assemble [config_path] [output_dir] [--code] [--verbose] [over | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `config_path` | Path to the [config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ | | `output_dir` | Directory to store the final pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~ | -| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions). ~~Optional[Path] \(option)~~ | +| `--code`, `-c` | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | `--verbose`, `-V` | Show more detailed messages during processing. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.data ./data`. ~~Any (option/flag)~~ |