Update CLI to use radicli

Ines Montani 2023-02-08 16:17:10 +01:00
parent eec5ccd72f
commit d292c6fc78
31 changed files with 677 additions and 750 deletions

View File

@ -9,9 +9,9 @@ murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.8.0
pathy>=0.10.0
smart-open>=5.2.1,<7.0.0
radicli>=0.0.1,<1.0.0
# Third party dependencies
numpy>=1.15.0
requests>=2.13.0,<3.0.0

View File

@ -41,8 +41,8 @@ install_requires =
wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
radicli>=0.0.1,<1.0.0
# Third-party dependencies
typer>=0.3.0,<0.8.0
pathy>=0.10.0
smart-open>=5.2.1,<7.0.0
tqdm>=4.38.0,<5.0.0

View File

@ -1,10 +1,8 @@
from wasabi import msg
from ._util import app, setup_cli # noqa: F401
from ._util import cli, setup_cli # noqa: F401
# These are the actual functions, NOT the wrapped CLI commands. The CLI commands
# are registered automatically and won't have to be imported here.
from .benchmark_speed import benchmark_speed_cli # noqa: F401
from .benchmark_speed import benchmark_speed # noqa: F401
from .download import download # noqa: F401
from .info import info # noqa: F401
from .package import package # noqa: F401
@ -25,18 +23,7 @@ from .validate import validate # noqa: F401
from .project.clone import project_clone # noqa: F401
from .project.assets import project_assets # noqa: F401
from .project.run import project_run # noqa: F401
from .project.dvc import project_update_dvc # noqa: F401
from .project.push import project_push # noqa: F401
from .project.pull import project_pull # noqa: F401
from .project.document import project_document # noqa: F401
from .find_threshold import find_threshold # noqa: F401
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
def link(*args, **kwargs):
"""As of spaCy v3.0, symlinks like "en" are not supported anymore. You can load trained
pipeline packages using their full names or from a directory path."""
msg.warn(
"As of spaCy v3.0, model symlinks are not supported anymore. You can load trained "
"pipeline packages using their full names or from a directory path."
)
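
Per the comment at the top of this file, importing the modules is enough: the @cli.command(...) decorators register each command as a side effect, so this module only re-exports the plain functions for programmatic use. A usage illustration (not part of the commit):

# The importable names are the actual functions, so they can be called
# directly from Python as well as via the CLI:
from spacy.cli import download

download("en_core_web_sm")  # same function the `spacy download` command runs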

View File

@ -6,15 +6,13 @@ from pathlib import Path
from wasabi import msg, Printer
import srsly
import hashlib
import typer
from click import NoSuchOption
from click.parser import split_arg_string
from typer.main import get_command
import shlex
from contextlib import contextmanager
from thinc.api import Config, ConfigValidationError, require_gpu
from thinc.util import gpu_is_available
from configparser import InterpolationError
import os
import radicli
from ..schemas import ProjectConfigSchema, validate
from ..util import import_file, run_command, make_tempdir, registry, logger
@ -37,6 +35,8 @@ HELP = """spaCy Command-line Interface
DOCS: https://spacy.io/api/cli
"""
# TODO: need to find a way to inject these now
PROJECT_HELP = f"""Command-line interface for spaCy projects and templates.
You'd typically start by cloning a project template to a local directory and
fetching its assets like datasets etc. See the project's {PROJECT_FILE} for the
@ -49,29 +49,14 @@ and custom model implementations.
BENCHMARK_HELP = """Commands for benchmarking pipelines."""
INIT_HELP = """Commands for initializing configs and pipeline packages."""
# Wrappers for Typer's annotations. Initially created to set defaults and to
# keep the names short, but not needed at the moment.
Arg = typer.Argument
Opt = typer.Option
app = typer.Typer(name=NAME, help=HELP)
benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True)
project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
app.add_typer(project_cli)
app.add_typer(debug_cli)
app.add_typer(benchmark_cli)
app.add_typer(init_cli)
# CLI
cli = radicli.Radicli(prog=COMMAND, help=HELP)
def setup_cli() -> None:
    # Make sure the entry points for the CLI run, so that they get imported.
registry.cli.get_all()
# Ensure that the help messages always display the correct prompt
command = get_command(app)
command(prog_name=COMMAND)
cli.run()
def parse_config_overrides(
@ -106,7 +91,7 @@ def _parse_overrides(args: List[str], is_cli: bool = False) -> Dict[str, Any]:
opt = opt.replace("--", "")
if "." not in opt:
if is_cli:
raise NoSuchOption(orig_opt)
raise radicli.CliParseError(f"unrecognized argument: {orig_opt}")
else:
msg.fail(f"{err}: can't override top-level sections", exits=1)
if "=" in opt: # we have --opt=value
@ -510,7 +495,7 @@ def get_git_version(
"""
try:
ret = run_command("git --version", capture=True)
except:
except Exception:
raise RuntimeError(error)
stdout = ret.stdout.strip()
if not stdout or not stdout.startswith("git version"):
@ -580,6 +565,18 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in
return result
def convert_string_list(value: str) -> List[str]:
return string_to_list(value)
def convert_int_list(value: str) -> List[int]:
return string_to_list(value, intify=True)
def convert_path_list(value: str) -> List[Path]:
return [Path(p) for p in string_to_list(value)]
def setup_gpu(use_gpu: int, silent=None) -> None:
"""Configure the GPU and log info."""
if silent is None:
@ -629,3 +626,20 @@ def _format_number(number: Union[int, float], ndigits: int = 2) -> str:
return f"{number:.{ndigits}f}"
else:
return str(number)
def split_arg_string(string: str) -> List[str]:
# Adapted from: https://github.com/pallets/click/blob/8b48450d5d63c747600e069d4c3e2274f41c8360/src/click/parser.py#L125
lex = shlex.shlex(string, posix=True)
lex.whitespace_split = True
lex.commenters = ""
out = []
try:
for token in lex:
out.append(token)
except ValueError:
# Raised when end-of-string is reached in an invalid state. Use
# the partial token as-is. The quote or escape character is in
# lex.state, not lex.token.
out.append(lex.token)
return out
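
A quick illustration of the vendored helper above; since it wraps shlex in POSIX mode with whitespace_split enabled, quoted values should survive as single tokens (the expected output is an assumption based on those shlex semantics, not taken from the commit):

tokens = split_arg_string('--training.seed 0 --paths.train "my data.spacy"')
# expected: ['--training.seed', '0', '--paths.train', 'my data.spacy']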

View File

@ -1,14 +1,12 @@
import tqdm
import srsly
from itertools import chain
from pathlib import Path
from typing import Optional, List, Iterable, cast, Union
from wasabi import msg
from radicli import Arg, ExistingPath, ExistingFilePath
from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
from ._util import cli, setup_gpu, import_code, walk_directory
from ..tokens import Doc, DocBin
from ..vocab import Vocab
from ..util import ensure_path, load_model
@ -37,49 +35,30 @@ force_msg = (
DocOrStrStream = Union[Iterable[str], Iterable[Doc]]
def _stream_docbin(path: Path, vocab: Vocab) -> Iterable[Doc]:
"""
Stream Doc objects from DocBin.
"""
docbin = DocBin().from_disk(path)
for doc in docbin.get_docs(vocab):
yield doc
def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
"""
Stream "text" field from JSONL. If the field "text" is
not found, an error is raised.
"""
for entry in srsly.read_jsonl(path):
if field not in entry:
msg.fail(f"{path} does not contain the required '{field}' field.", exits=1)
else:
yield entry[field]
def _stream_texts(paths: Iterable[Path]) -> Iterable[str]:
"""
Yields strings from text files in paths.
"""
for path in paths:
with open(path, "r") as fin:
text = fin.read()
yield text
@app.command("apply")
def apply_cli(
@cli.command(
"apply",
# fmt: off
model: str = Arg(..., help="Model name or path"),
data_path: Path = Arg(..., help=path_help, exists=True),
output_file: Path = Arg(..., help=out_help, dir_okay=False),
code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
text_key: str = Opt("text", "--text-key", "-tk", help="Key containing text string for JSONL"),
force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."),
batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size."),
n_process: int = Opt(1, "--n-process", "-n", help="number of processors to use.")
model=Arg(help="Model name or path"),
data_path=Arg(help=path_help),
output_file=Arg(help=out_help),
code_path=Arg("--code", "-c", help=code_help),
text_key=Arg("--text-key", "-tk", help="Key containing text string for JSONL"),
force_overwrite=Arg("--force", "-F", help="Force overwriting the output file"),
use_gpu=Arg("--gpu-id", "-g", help="GPU ID or -1 for CPU"),
batch_size=Arg("--batch-size", "-b", help="Batch size"),
n_process=Arg("--n-process", "-n", help="Number of processors to use"),
# fmt: on
)
def apply_cli(
model: str,
data_path: ExistingPath,
output_file: Path,
code_path: Optional[ExistingFilePath] = None,
text_key: str = "text",
force_overwrite: bool = False,
use_gpu: int = -1,
batch_size: int = 1,
n_process: int = 1,
):
"""
Apply a trained pipeline to documents to get predictions.
@ -122,7 +101,6 @@ def apply(
)
return
nlp = load_model(model)
msg.good(f"Loaded model {model}")
vocab = nlp.vocab
streams: List[DocOrStrStream] = []
text_files = []
@ -141,3 +119,32 @@ def apply(
if output_file.suffix == "":
output_file = output_file.with_suffix(".spacy")
docbin.to_disk(output_file)
def _stream_docbin(path: Path, vocab: Vocab) -> Iterable[Doc]:
"""
Stream Doc objects from DocBin.
"""
docbin = DocBin().from_disk(path)
for doc in docbin.get_docs(vocab):
yield doc
def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
"""
Stream "text" field from JSONL. If the field "text" is
not found, an error is raised.
"""
for entry in srsly.read_jsonl(path):
if field not in entry:
msg.fail(f"{path} does not contain the required '{field}' field.", exits=1)
else:
yield entry[field]
def _stream_texts(paths: Iterable[Path]) -> Iterable[str]:
"""Yields strings from text files in paths."""
for path in paths:
with open(path, "r") as fin:
text = fin.read()
yield text
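
The hunk above shows the migration pattern repeated throughout this commit: Typer declared flags and defaults inside Arg(...)/Opt(...) parameter annotations, while radicli moves the flag strings and help text into the decorator and leaves plain Python defaults on the signature. A minimal sketch of the pattern (command and argument names are illustrative, not from spaCy):

from radicli import Radicli, Arg, ExistingPath

cli = Radicli(prog="example")

@cli.command(
    "apply",
    # Positional arguments take no flag strings; options list their flags.
    model=Arg(help="Model name or path"),
    data_path=Arg(help="Input path"),
    batch_size=Arg("--batch-size", "-b", help="Batch size"),
)
def apply(model: str, data_path: ExistingPath, batch_size: int = 1):
    ...  # types and defaults now live on the signature itself

cli.run()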

View File

@ -1,27 +1,30 @@
from typing import Optional
from typing import Optional, List
from pathlib import Path
from wasabi import msg
import typer
import logging
from radicli import Arg, ExistingFilePathOrDash, ExistingFilePath
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import cli, parse_config_overrides, show_validation_error
from ._util import import_code
from .. import util
from ..util import get_sourced_components, load_model_from_config
@app.command(
@cli.command_with_extra(
"assemble",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
# fmt: off
config_path=Arg(help="Path to config file"),
output_path=Arg(help="Output directory to store assembled pipeline in"),
code_path=Arg("--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
verbose=Arg("--verbose", "-V", help="Display more information for debugging purposes"),
# fmt: on
)
def assemble_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
output_path: Path = Arg(..., help="Output directory to store assembled pipeline in"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
# fmt: on
config_path: ExistingFilePathOrDash,
output_path: Optional[Path] = None,
code_path: Optional[ExistingFilePath] = None,
verbose: bool = False,
_extra: List[str] = [],
):
"""
Assemble a spaCy pipeline from a config file. The config file includes
@ -37,7 +40,7 @@ def assemble_cli(
# Make sure all files and paths exists if they are needed
if not config_path or (str(config_path) != "-" and not config_path.exists()):
msg.fail("Config file not found", config_path, exits=1)
overrides = parse_config_overrides(ctx.args)
overrides = parse_config_overrides(_extra)
import_code(code_path)
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides, interpolate=False)
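
command_with_extra replaces Typer's context_settings={"allow_extra_args": True, "ignore_unknown_options": True}: unrecognized arguments are collected into the _extra list instead of a typer.Context, then handed to parse_config_overrides. Roughly (exact value parsing is an assumption; spaCy interprets override values as JSON where possible):

# e.g. `spacy assemble cfg.cfg ./out --training.seed 0 --paths.train data.spacy`
_extra = ["--training.seed", "0", "--paths.train", "data.spacy"]
overrides = parse_config_overrides(_extra)
# roughly: {"training.seed": 0, "paths.train": "data.spacy"}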

View File

@ -2,56 +2,56 @@ from typing import Iterable, List, Optional
import random
from itertools import islice
import numpy
from pathlib import Path
import time
from tqdm import tqdm
import typer
from wasabi import msg
from radicli import Arg, ExistingPath
from .. import util
from ..language import Language
from ..tokens import Doc
from ..training import Corpus
from ._util import Arg, Opt, benchmark_cli, setup_gpu
from ._util import cli, setup_gpu
@benchmark_cli.command(
@cli.subcommand(
"benchmark",
"speed",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def benchmark_speed_cli(
# fmt: off
ctx: typer.Context,
model: str = Arg(..., help="Model name or path"),
data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
batch_size: Optional[int] = Opt(None, "--batch-size", "-b", min=1, help="Override the pipeline batch size"),
no_shuffle: bool = Opt(False, "--no-shuffle", help="Do not shuffle benchmark data"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
model=Arg(help="Model name or path"),
data_path=Arg(help="Location of binary evaluation data in .spacy format"),
batch_size=Arg("--batch-size", "-b", help="Override the pipeline batch size"),
no_shuffle=Arg("--no-shuffle", help="Do not shuffle benchmark data"),
use_gpu=Arg("--gpu-id", "-g", help="GPU ID or -1 for CPU"),
n_batches=Arg("--batches", help="Minimum number of batches to benchmark"),
warmup_epochs=Arg("--warmup", "-w", help="Number of iterations over the data for warmup"),
# fmt: on
)
def benchmark_speed(
model: str,
data_path: ExistingPath,
batch_size: Optional[int] = None,
no_shuffle: bool = False,
use_gpu: int = -1,
n_batches: int = 50,
warmup_epochs: int = 3,
):
"""
Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
data in the binary .spacy format.
"""
setup_gpu(use_gpu=use_gpu, silent=False)
nlp = util.load_model(model)
batch_size = batch_size if batch_size is not None else nlp.batch_size
corpus = Corpus(data_path)
docs = [eg.predicted for eg in corpus(nlp)]
if len(docs) == 0:
msg.fail("Cannot benchmark speed using an empty corpus.", exits=1)
print(f"Warming up for {warmup_epochs} epochs...")
warmup(nlp, docs, warmup_epochs, batch_size)
print()
print(f"Benchmarking {n_batches} batches...")
wps = benchmark(nlp, docs, n_batches, batch_size, not no_shuffle)
print()
print_outliers(wps)
print_mean_with_ci(wps)
@ -120,7 +120,6 @@ def benchmark(
nlp.make_doc(docs[i % len(docs)].text)
for i in range(n_batches * batch_size)
]
return annotate(nlp, bench_docs, batch_size)
@ -143,17 +142,14 @@ def print_mean_with_ci(sample: numpy.ndarray):
mean = numpy.mean(sample)
bootstrap_means = bootstrap(sample)
bootstrap_means.sort()
# 95% confidence interval
low = bootstrap_means[int(len(bootstrap_means) * 0.025)]
high = bootstrap_means[int(len(bootstrap_means) * 0.975)]
print(f"Mean: {mean:.1f} words/s (95% CI: {low-mean:.1f} +{high-mean:.1f})")
def print_outliers(sample: numpy.ndarray):
quartiles = Quartiles(sample)
n_outliers = numpy.sum(
(sample < (quartiles.q1 - 1.5 * quartiles.iqr))
| (sample > (quartiles.q3 + 1.5 * quartiles.iqr))
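
One behavioral note on this hunk: the old Typer options carried min= bounds (--batch-size >= 1, --batches >= 30, --warmup >= 0) that the radicli Arg declarations no longer express. If those guards are still wanted, a hedged sketch of equivalent in-body validation (not part of the commit; uses the module's wasabi msg):

def _check_benchmark_args(n_batches: int, warmup_epochs: int) -> None:
    # Reinstate the bounds the typer.Opt(min=...) declarations enforced.
    if n_batches < 30:
        msg.fail("--batches must be at least 30", exits=1)
    if warmup_epochs < 0:
        msg.fail("--warmup must be >= 0", exits=1)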

View File

@ -1,4 +1,4 @@
from typing import Callable, Iterable, Mapping, Optional, Any, Union
from typing import Callable, Iterable, Mapping, Optional, Any, Union, Literal
from enum import Enum
from pathlib import Path
from wasabi import Printer
@ -6,8 +6,9 @@ import srsly
import re
import sys
import itertools
from radicli import Arg, ExistingFilePath, ExistingPathOrDash, ExistingDirPathOrDash
from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory
from ._util import cli, _handle_renamed_language_codes, walk_directory
from ..training import docs_to_json
from ..tokens import Doc, DocBin
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
@ -27,8 +28,8 @@ CONVERTERS: Mapping[str, Callable[..., Iterable[Doc]]] = {
"iob": iob_to_docs,
"json": json_to_docs,
}
AUTO = "auto"
ConvertersType = Literal["auto", "conllubio", "conllu", "conll", "ner", "iob", "json"]
# File types that can be written to stdout
@ -40,22 +41,36 @@ class FileTypes(str, Enum):
spacy = "spacy"
@app.command("convert")
def convert_cli(
@cli.command(
"convert",
# fmt: off
input_path: str = Arg(..., help="Input file or directory", exists=True),
output_dir: Path = Arg("-", help="Output directory. '-' for stdout.", allow_dash=True, exists=True),
file_type: FileTypes = Opt("spacy", "--file-type", "-t", help="Type of data to produce"),
n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"),
seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"),
model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
converter: str = Opt(AUTO, "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),
input_path=Arg(help="Input file or directory"),
output_dir=Arg(help="Output directory. '-' for stdout."),
file_type=Arg("--file-type", "-t", help="Type of data to produce"),
n_sents=Arg("--n-sents", "-n", help="Number of sentences per doc (0 to disable)"),
seg_sents=Arg("--seg-sents", "-s", help="Segment sentences (for -c ner)"),
model=Arg("--model", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
morphology=Arg("--morphology", "-m", help="Enable appending morphology to tags"),
merge_subtokens=Arg("--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
converter=Arg("--converter", "-c", help="Converter to use"),
ner_map=Arg("--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)"),
lang=Arg("--lang", "-l", help="Language (if tokenizer required)"),
concatenate=Arg("--concatenate", "-C", help="Concatenate output to a single file"),
# fmt: on
)
def convert_cli(
input_path: ExistingPathOrDash,
output_dir: ExistingDirPathOrDash = "-",
file_type: Literal["json", "spacy"] = "spacy",
n_sents: int = 1,
seg_sents: bool = False,
model: Optional[str] = None,
morphology: bool = False,
merge_subtokens: bool = False,
converter: ConvertersType = AUTO,
ner_map: Optional[ExistingFilePath] = None,
lang: Optional[str] = None,
concatenate: bool = False,
):
"""
Convert files into json or DocBin format for training. The resulting .spacy
@ -69,15 +84,14 @@ def convert_cli(
DOCS: https://spacy.io/api/cli#convert
"""
input_path = Path(input_path)
output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir
silent = output_dir == "-"
msg = Printer(no_print=silent)
converter = _get_converter(msg, converter, input_path)
verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
verify_cli_args(msg, input_path, output_dir, file_type, converter, ner_map)
convert(
input_path,
output_dir,
file_type=file_type.value,
file_type=file_type,
n_sents=n_sents,
seg_sents=seg_sents,
model=model,
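
This file also shows the Enum-to-Literal move: FileTypes(str, Enum) gives way to a plain Literal["json", "spacy"] annotation, so the .value unwrapping disappears. Presumably radicli derives the permitted choices from the annotation; a minimal sketch of the idea (hypothetical command name):

from typing import Literal
from radicli import Radicli, Arg

cli = Radicli(prog="example")

@cli.command("export", file_type=Arg("--file-type", "-t", help="Type of data to produce"))
def export(file_type: Literal["json", "spacy"] = "spacy"):
    # file_type arrives as a plain, pre-validated string -- no .value needed
    print(file_type)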

View File

@ -3,27 +3,31 @@ from pathlib import Path
from wasabi import msg, table
from thinc.api import Config
from thinc.config import VARIABLE_RE
import typer
from radicli import Arg, ExistingFilePathOrDash, ExistingFilePath
from ._util import Arg, Opt, show_validation_error, parse_config_overrides
from ._util import import_code, debug_cli
from ._util import cli, show_validation_error, parse_config_overrides
from ._util import import_code
from ..schemas import ConfigSchemaInit, ConfigSchemaTraining
from ..util import registry
from .. import util
@debug_cli.command(
@cli.subcommand_with_extra(
"debug",
"config",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
# fmt: off
config_path=Arg(help="Path to config file"),
code_path=Arg("--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
show_funcs=Arg("--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),
show_vars=Arg("--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI"),
# fmt: on
)
def debug_config_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),
show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")
# fmt: on
config_path: ExistingFilePathOrDash,
code_path: Optional[ExistingFilePath] = None,
show_funcs: bool = False,
show_vars: bool = False,
_extra: List[str] = [],
):
"""Debug a config file and show validation errors. The command will
create all objects in the tree and validate them. Note that some config
@ -36,7 +40,7 @@ def debug_config_cli(
DOCS: https://spacy.io/api/cli#debug-config
"""
overrides = parse_config_overrides(ctx.args)
overrides = parse_config_overrides(_extra)
import_code(code_path)
debug_config(
config_path, overrides=overrides, show_funcs=show_funcs, show_vars=show_vars

View File

@ -5,11 +5,11 @@ from collections import Counter
import sys
import srsly
from wasabi import Printer, MESSAGES, msg
import typer
import math
from radicli import Arg, ExistingFilePathOrDash, ExistingFilePath
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
from ._util import import_code, debug_cli, _format_number
from ._util import cli, show_validation_error, parse_config_overrides
from ._util import import_code, _format_number
from ..training import Example, remove_bilu_prefix
from ..training.initialize import get_sourced_components
from ..schemas import ConfigSchemaTraining
@ -40,23 +40,24 @@ BOUNDARY_DISTINCT_THRESHOLD = 1
SPAN_LENGTH_THRESHOLD_PERCENTAGE = 90
@debug_cli.command(
"data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
)
@app.command(
"debug-data",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
hidden=True, # hide this from main CLI help but still allow it to work with warning
@cli.subcommand_with_extra(
"debug",
"data",
# fmt: off
config_path=Arg(help="Path to config file"),
code_path=Arg("--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
ignore_warnings=Arg("--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
verbose=Arg("--verbose", "-V", help="Print additional information and explanations"),
no_format=Arg("--no-format", "-NF", help="Don't pretty-print the results"),
# fmt: on
)
def debug_data_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),
no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"),
# fmt: on
config_path: ExistingFilePathOrDash,
code_path: Optional[ExistingFilePath] = None,
ignore_warnings: bool = False,
verbose: bool = False,
no_format: bool = False,
_extra: List[str] = [],
):
"""
Analyze, debug and validate your training and development data. Outputs
@ -65,13 +66,7 @@ def debug_data_cli(
DOCS: https://spacy.io/api/cli#debug-data
"""
if ctx.command.name == "debug-data":
msg.warn(
"The debug-data command is now available via the 'debug data' "
"subcommand (without the hyphen). You can run python -m spacy debug "
"--help for an overview of the other available debugging commands."
)
overrides = parse_config_overrides(ctx.args)
overrides = parse_config_overrides(_extra)
import_code(code_path)
debug_data(
config_path,

View File

@ -1,29 +1,32 @@
from typing import Optional
import typer
from wasabi import Printer, diff_strings, MarkdownRenderer
from pathlib import Path
from thinc.api import Config
from radicli import Arg, ExistingFilePathOrDash, ExistingFilePath
from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
from ._util import cli, show_validation_error
from .init_config import init_config, OptimizationsType
from ..util import load_config
from .init_config import init_config, Optimizations
@debug_cli.command(
@cli.subcommand(
"debug",
"diff-config",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
# fmt: off
config_path=Arg(help="Path to config file"),
compare_to=Arg(help="Path to a config file to diff against, or `None` to compare against default settings"),
optimize=Arg("--optimize", "-o", help="Whether the user config was optimized for efficiency or accuracy. Only relevant when comparing against the default config"),
gpu=Arg("--gpu", "-G", help="Whether the original config can run on a GPU. Only relevant when comparing against the default config"),
pretraining=Arg("--pretraining", "--pt", help="Whether to compare on a config with pretraining involved. Only relevant when comparing against the default config"),
markdown=Arg("--markdown", "-md", help="Generate Markdown for GitHub issues"),
# fmt: on
)
def debug_diff_cli(
# fmt: off
ctx: typer.Context,
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
compare_to: Optional[Path] = Opt(None, help="Path to a config file to diff against, or `None` to compare against default settings", exists=True, allow_dash=True),
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether the user config was optimized for efficiency or accuracy. Only relevant when comparing against the default config."),
gpu: bool = Opt(False, "--gpu", "-G", help="Whether the original config can run on a GPU. Only relevant when comparing against the default config."),
pretraining: bool = Opt(False, "--pretraining", "--pt", help="Whether to compare on a config with pretraining involved. Only relevant when comparing against the default config."),
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues")
# fmt: on
config_path: ExistingFilePathOrDash,
compare_to: Optional[ExistingFilePath] = None,
optimize: OptimizationsType = "efficiency",
gpu: bool = False,
pretraining: bool = False,
markdown: bool = False,
):
"""Show a diff of a config file with respect to spaCy's defaults or another config file. If
additional settings were used in the creation of the config file, then you
@ -50,7 +53,7 @@ def debug_diff(
config_path: Path,
compare_to: Optional[Path],
gpu: bool,
optimize: Optimizations,
optimize: OptimizationsType,
pretraining: bool,
markdown: bool,
):
@ -68,7 +71,7 @@ def debug_diff(
other_config = init_config(
lang=lang,
pipeline=pipeline,
optimize=optimize.value,
optimize=optimize,
gpu=gpu,
pretraining=pretraining,
silent=True,

View File

@ -1,41 +1,51 @@
from typing import Dict, Any, Optional
from pathlib import Path
from typing import Dict, Any, Optional, List
import itertools
from spacy.training import Example
from spacy.util import resolve_dot_names
from wasabi import msg
from thinc.api import fix_random_seed, set_dropout_rate
from thinc.api import Model, data_validation, set_gpu_allocator
import typer
from radicli import Arg, ExistingFilePathOrDash
from ._util import Arg, Opt, debug_cli, show_validation_error
from ._util import parse_config_overrides, string_to_list, setup_gpu
from ._util import cli, show_validation_error
from ._util import parse_config_overrides, convert_int_list, setup_gpu
from ..schemas import ConfigSchemaTraining
from ..util import registry
from .. import util
@debug_cli.command(
@cli.subcommand_with_extra(
"debug",
"model",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
# fmt: off
config_path=Arg(help="Path to config file"),
component=Arg(help="Name of the pipeline component of which the model should be analyzed"),
layers=Arg(help="Comma-separated names of layer IDs to print", converter=convert_int_list),
dimensions=Arg("--dimensions", "-DIM", help="Show dimensions"),
parameters=Arg("--parameters", "-PAR", help="Show parameters"),
gradients=Arg("--gradients", "-GRAD", help="Show gradients"),
attributes=Arg("--attributes", "-ATTR", help="Show attributes"),
P0=Arg("--print-step0", "-P0", help="Print model before training"),
P1=Arg("--print-step1", "-P1", help="Print model after initialization"),
P2=Arg("--print-step2", "-P2", help="Print model after training"),
P3=Arg("--print-step3", "-P3", help="Print final predictions"),
use_gpu=Arg("--gpu-id", "-g", help="GPU ID or -1 for CPU"),
# fmt: on
)
def debug_model_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
component: str = Arg(..., help="Name of the pipeline component of which the model should be analysed"),
layers: str = Opt("", "--layers", "-l", help="Comma-separated names of layer IDs to print"),
dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"),
gradients: bool = Opt(False, "--gradients", "-GRAD", help="Show gradients"),
attributes: bool = Opt(False, "--attributes", "-ATTR", help="Show attributes"),
P0: bool = Opt(False, "--print-step0", "-P0", help="Print model before training"),
P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"),
P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"),
P3: bool = Opt(False, "--print-step3", "-P3", help="Print final predictions"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on
config_path: ExistingFilePathOrDash,
component: str,
layers: List[int] = [],
dimensions: bool = False,
parameters: bool = False,
gradients: bool = False,
attributes: bool = False,
P0: bool = False,
P1: bool = False,
P2: bool = False,
P3: bool = False,
use_gpu: int = -1,
_extra: List[str] = [],
):
"""
Analyze a Thinc model implementation. Includes checks for internal structure
@ -44,7 +54,6 @@ def debug_model_cli(
DOCS: https://spacy.io/api/cli#debug-model
"""
setup_gpu(use_gpu)
layers = string_to_list(layers, intify=True)
print_settings = {
"dimensions": dimensions,
"parameters": parameters,
@ -56,7 +65,7 @@ def debug_model_cli(
"print_after_training": P2,
"print_prediction": P3,
}
config_overrides = parse_config_overrides(ctx.args)
config_overrides = parse_config_overrides(_extra)
with show_validation_error(config_path):
raw_config = util.load_config(
config_path, overrides=config_overrides, interpolate=False
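
Arg(..., converter=...) is how list-valued options survive the migration: the converter callable receives the raw CLI string before the function body runs, replacing the old in-body string_to_list(layers, intify=True) call. Assuming string_to_list splits on commas, as its usage suggests:

# --layers "0,1,2" on the command line should reach debug_model_cli as:
assert convert_int_list("0,1,2") == [0, 1, 2]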

View File

@ -1,27 +1,26 @@
from typing import Optional, Sequence
from typing import Optional, Sequence, List
import requests
import sys
from wasabi import msg
import typer
from radicli import Arg
from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
from ._util import cli, WHEEL_SUFFIX, SDIST_SUFFIX
from .. import about
from ..util import is_package, get_minor_version, run_command
from ..util import is_prerelease_version, get_installed_models
from ..util import get_package_version
@app.command(
@cli.command_with_extra(
"download",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
# fmt: off
model=Arg(help="Name of pipeline package to download"),
direct=Arg("--direct", "-D", help="Force direct download of name + version"),
sdist=Arg("--sdist", "-S", help="Download sdist (.tar.gz) archive instead of pre-built binary wheel")
# fmt: on
)
def download_cli(
# fmt: off
ctx: typer.Context,
model: str = Arg(..., help="Name of pipeline package to download"),
direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"),
sdist: bool = Opt(False, "--sdist", "-S", help="Download sdist (.tar.gz) archive instead of pre-built binary wheel"),
# fmt: on
model: str, direct: bool = False, sdist: bool = False, _extra: List[str] = []
):
"""
Download compatible trained pipeline from the default download path using
@ -33,7 +32,7 @@ def download_cli(
DOCS: https://spacy.io/api/cli#download
AVAILABLE PACKAGES: https://spacy.io/models
"""
download(model, direct, sdist, *ctx.args)
download(model, direct, sdist, *_extra)
def download(

View File

@ -1,33 +1,42 @@
from typing import Optional, List, Dict, Any, Union
from typing import Optional, List, Dict, Any
from wasabi import Printer
from pathlib import Path
import re
import srsly
from thinc.api import fix_random_seed
from radicli import Arg, ExistingPath, ExistingDirPath, ExistingFilePath
from ..training import Corpus
from ..tokens import Doc
from ._util import app, Arg, Opt, setup_gpu, import_code, benchmark_cli
from ..scorer import Scorer
from ._util import cli, setup_gpu, import_code
from .. import util
from .. import displacy
@benchmark_cli.command(
"accuracy",
)
@app.command("evaluate")
def evaluate_cli(
args = dict(
# fmt: off
model: str = Arg(..., help="Model name or path"),
data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
model=Arg(help="Model name or path"),
data_path=Arg(help="Location of binary evaluation data in .spacy format"),
output=Arg("--output", "-o", help="Output JSON file for metrics"),
code_path=Arg("--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
use_gpu=Arg("--gpu-id", "-g", help="GPU ID or -1 for CPU"),
gold_preproc=Arg("--gold-preproc", "-G", help="Use gold preprocessing"),
displacy_path=Arg("--displacy-path", "-dp", help="Directory to output rendered parses as HTML"),
displacy_limit=Arg("--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
# fmt: on
)
@cli.subcommand("benchmark", "accuracy", **args)
@cli.command("evaluate", **args)
def evaluate_cli(
model: str,
data_path: ExistingPath,
output: Optional[ExistingFilePath] = None,
code_path: Optional[ExistingFilePath] = None,
use_gpu: int = -1,
gold_preproc: bool = False,
displacy_path: Optional[ExistingDirPath] = None,
displacy_limit: int = 25,
):
"""
Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation
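
Worth noting in this hunk: the argument specs are built once in a plain args dict and the same function is stacked under two decorators, so `spacy evaluate` and `spacy benchmark accuracy` share one implementation. A minimal sketch of the double-registration pattern:

args = dict(model=Arg(help="Model name or path"))

@cli.subcommand("benchmark", "accuracy", **args)
@cli.command("evaluate", **args)
def evaluate_cli(model: str):
    ...  # reachable as both `evaluate` and `benchmark accuracy`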

View File

@ -3,14 +3,14 @@ import operator
from pathlib import Path
import logging
from typing import Optional, Tuple, Any, Dict, List
import numpy
import wasabi.tables
from radicli import Arg, ExistingPath, ExistingFilePath
from ..pipeline import TextCategorizer, MultiLabel_TextCategorizer
from ..errors import Errors
from ..training import Corpus
from ._util import app, Arg, Opt, import_code, setup_gpu
from ._util import cli, import_code, setup_gpu
from .. import util
_DEFAULTS = {
@ -20,23 +20,32 @@ _DEFAULTS = {
}
@app.command(
@cli.command(
"find-threshold",
context_settings={"allow_extra_args": False, "ignore_unknown_options": True},
# fmt: off
model=Arg(help="Model name or path"),
data_path=Arg(help="Location of binary evaluation data in .spacy format"),
pipe_name=Arg(help="Name of pipe to examine thresholds for"),
threshold_key=Arg(help="Key of threshold attribute in component's configuration"),
scores_key=Arg(help="Metric to optimize"),
n_trials=Arg("--n_trials", "-n", help="Number of trials to determine optimal thresholds"),
code_path=Arg("--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
use_gpu=Arg("--gpu-id", "-g", help="GPU ID or -1 for CPU"),
gold_preproc=Arg("--gold-preproc", "-G", help="Use gold preprocessing"),
verbose=Arg("--verbose", "-V", help="Display more information for debugging purposes"),
# fmt: on
)
def find_threshold_cli(
# fmt: off
model: str = Arg(..., help="Model name or path"),
data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
pipe_name: str = Arg(..., help="Name of pipe to examine thresholds for"),
threshold_key: str = Arg(..., help="Key of threshold attribute in component's configuration"),
scores_key: str = Arg(..., help="Metric to optimize"),
n_trials: int = Opt(_DEFAULTS["n_trials"], "--n_trials", "-n", help="Number of trials to determine optimal thresholds"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"),
verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"),
# fmt: on
model: str,
data_path: ExistingPath,
pipe_name: str,
threshold_key: str,
scores_key: str,
n_trials: int = _DEFAULTS["n_trials"],
code_path: Optional[ExistingFilePath] = None,
use_gpu: int = _DEFAULTS["use_gpu"],
gold_preproc: bool = _DEFAULTS["gold_preproc"],
verbose: bool = False,
):
"""
Runs prediction trials for a trained model with varying thresholds to maximize
@ -52,7 +61,6 @@ def find_threshold_cli(
DOCS: https://spacy.io/api/cli#find-threshold
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
import_code(code_path)
find_threshold(
@ -110,19 +118,16 @@ def find_threshold(
pipe = nlp.get_pipe(pipe_name)
if not hasattr(pipe, "scorer"):
raise AttributeError(Errors.E1045)
if type(pipe) == TextCategorizer:
wasabi.msg.warn(
"The `textcat` component doesn't use a threshold as it's not applicable to the concept of "
"exclusive classes. All thresholds will yield the same results."
)
if not silent:
wasabi.msg.info(
title=f"Optimizing for {scores_key} for component '{pipe_name}' with {n_trials} "
f"trials."
)
# Load evaluation corpus.
corpus = Corpus(data_path, gold_preproc=gold_preproc)
dev_dataset = list(corpus(nlp))
@ -209,9 +214,7 @@ def find_threshold(
widths=table_col_widths,
)
)
best_threshold = max(scores.keys(), key=(lambda key: scores[key]))
# If all scores are identical, emit warning.
if len(set(scores.values())) == 1:
wasabi.msg.warn(
@ -223,7 +226,6 @@ def find_threshold(
)
else "Use `cats_macro_f` or `cats_micro_f` when optimizing the threshold for `textcat_multilabel`.",
)
else:
if not silent:
print(

View File

@ -5,23 +5,31 @@ import json
from pathlib import Path
from wasabi import Printer, MarkdownRenderer
import srsly
from radicli import Arg
from ._util import app, Arg, Opt, string_to_list
from ._util import cli
from .download import get_model_filename, get_latest_version
from .. import util
from .. import about
@app.command("info")
def info_cli(
@cli.command(
"info",
# fmt: off
model: Optional[str] = Arg(None, help="Optional loadable spaCy pipeline"),
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
exclude: str = Opt("labels", "--exclude", "-e", help="Comma-separated keys to exclude from the print-out"),
url: bool = Opt(False, "--url", "-u", help="Print the URL to download the most recent compatible version of the pipeline"),
model=Arg(help="Optional loadable spaCy pipeline"),
markdown=Arg("--markdown", "-md", help="Generate Markdown for GitHub issues"),
silent=Arg("--silent", "-S", help="Don't print anything (just return)"),
exclude=Arg("--exclude", "-e", help="Keys to exclude from the print-out"),
url=Arg("--url", "-u", help="Print the URL to download the most recent compatible version of the pipeline"),
# fmt: on
):
)
def info(
model: Optional[str] = None,
markdown: bool = False,
silent: bool = False,
exclude: List[str] = [],
url: bool = False,
) -> Union[str, dict]:
"""
Print info about spaCy installation. If a pipeline is specified as an argument,
print its meta information. Flag --markdown prints details in Markdown for easy
@ -32,24 +40,6 @@ def info_cli(
DOCS: https://spacy.io/api/cli#info
"""
exclude = string_to_list(exclude)
info(
model,
markdown=markdown,
silent=silent,
exclude=exclude,
url=url,
)
def info(
model: Optional[str] = None,
*,
markdown: bool = False,
silent: bool = True,
exclude: Optional[List[str]] = None,
url: bool = False,
) -> Union[str, dict]:
msg = Printer(no_print=silent, pretty=not silent)
if not exclude:
exclude = []
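
Here the old two-layer design (an info_cli wrapper delegating to info) collapses into a single decorated function, matching the note in __init__.py that the importable names are the actual functions rather than wrapped CLI commands. The exclude parameter is now annotated List[str], so the manual string_to_list(exclude) step is gone; presumably radicli splits the comma-separated value itself. The same callable stays usable from Python, e.g. (an illustration, not from the commit):

# Returns a dict here since markdown=False; with markdown=True it returns str.
meta = info("en_core_web_sm", silent=True, exclude=["labels"])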

View File

@ -1,18 +1,18 @@
from typing import Optional, List, Tuple
from enum import Enum
from typing import Optional, List, Tuple, Literal
from pathlib import Path
from wasabi import Printer, diff_strings
from thinc.api import Config
import srsly
import re
from jinja2 import Template
from radicli import Arg, PathOrDash, ExistingFilePath
from .. import util
from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH
from ..schemas import RecommendationSchema
from ..util import SimpleFrozenList
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
from ._util import string_to_list, import_code, _handle_renamed_language_codes
from ._util import cli, convert_string_list, show_validation_error, COMMAND
from ._util import import_code, _handle_renamed_language_codes
ROOT = Path(__file__).parent / "templates"
@ -20,9 +20,7 @@ TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
RECOMMENDATIONS = srsly.read_yaml(ROOT / "quickstart_training_recommendations.yml")
class Optimizations(str, Enum):
efficiency = "efficiency"
accuracy = "accuracy"
OptimizationsType = Literal["efficiency", "accuracy"]
class InitValues:
@ -33,23 +31,33 @@ class InitValues:
lang = "en"
pipeline = SimpleFrozenList(["tagger", "parser", "ner"])
optimize = Optimizations.efficiency
optimize = "efficiency"
gpu = False
pretraining = False
force_overwrite = False
@init_cli.command("config")
def init_config_cli(
@cli.subcommand(
"init",
"config",
# fmt: off
output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
lang: str = Opt(InitValues.lang, "--lang", "-l", help="Code of the language to use"),
pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
pretraining: bool = Opt(InitValues.pretraining, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
force_overwrite: bool = Opt(InitValues.force_overwrite, "--force", "-F", help="Force overwriting the output file"),
output_file=Arg(help="File to save the config to or - for stdout (will only output config and no additional logging info)"),
lang=Arg("--lang", "-l", help="Code of the language to use"),
pipeline=Arg("--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')", converter=convert_string_list),
optimize=Arg("--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
gpu=Arg("--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
pretraining=Arg("--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
force_overwrite=Arg("--force", "-F", help="Force overwriting the output file"),
# fmt: on
)
def init_config_cli(
output_file: PathOrDash,
lang: str = InitValues.lang,
pipeline: List[str] = InitValues.pipeline,
optimize: OptimizationsType = InitValues.optimize,
gpu: bool = InitValues.gpu,
pretraining: bool = InitValues.pretraining,
force_overwrite: bool = InitValues.force_overwrite,
):
"""
Generate a starter config file for training. Based on your requirements
@ -59,8 +67,7 @@ def init_config_cli(
DOCS: https://spacy.io/api/cli#init-config
"""
pipeline = string_to_list(pipeline)
is_stdout = str(output_file) == "-"
is_stdout = output_file == "-"
if not is_stdout and output_file.exists() and not force_overwrite:
msg = Printer()
msg.fail(
@ -70,7 +77,7 @@ def init_config_cli(
config = init_config(
lang=lang,
pipeline=pipeline,
optimize=optimize.value,
optimize=optimize,
gpu=gpu,
pretraining=pretraining,
silent=is_stdout,
@ -78,16 +85,25 @@ def init_config_cli(
save_config(config, output_file, is_stdout=is_stdout)
@init_cli.command("fill-config")
def init_fill_config_cli(
@cli.subcommand(
"init",
"fill-config",
# fmt: off
base_path: Path = Arg(..., help="Path to base config to fill", exists=True, dir_okay=False),
output_file: Path = Arg("-", help="Path to output .cfg file (or - for stdout)", allow_dash=True),
distillation: bool = Opt(False, "--distillation", "-dt", help="Include config for distillation (with 'spacy distill')"),
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"),
code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
base_path=Arg(help="Path to base config to fill"),
output_file=Arg(help="Path to output .cfg file (or - for stdout)"),
distillation=Arg("--distillation", "-dt", help="Include config for distillation (with 'spacy distill')"),
pretraining=Arg("--pretraining", "-pt", help="Include config for pretraining (with `spacy pretrain`)"),
diff=Arg("--diff", "-D", help="Print a visual diff highlighting the changes"),
code_path=Arg("--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
# fmt: on
)
def init_fill_config_cli(
base_path: ExistingFilePath,
output_file: PathOrDash = "-",
distillation: bool = False,
pretraining: bool = False,
diff: bool = False,
code_path: Optional[ExistingFilePath] = None,
):
"""
Fill partial config file with default values. Will add all missing settings
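
Two details above follow the conversions seen elsewhere: pipeline is declared with converter=convert_string_list, so the comma-separated value arrives pre-split and the in-body string_to_list call disappears; and output_file is typed PathOrDash, meaning the parsed value is either a real Path or the literal string "-", hence the simplified is_stdout check. Assuming convert_string_list splits on commas:

assert convert_string_list("tagger,parser,ner") == ["tagger", "parser", "ner"]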

View File

@ -1,30 +1,42 @@
from typing import Optional
from typing import Optional, Literal, List
import logging
from pathlib import Path
from wasabi import msg
import typer
import srsly
from radicli import Arg, ExistingPath, ExistingFilePathOrDash, ExistingFilePath
from .. import util
from ..training.initialize import init_nlp, convert_vectors
from ..language import Language
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import cli, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu, _handle_renamed_language_codes
@init_cli.command("vectors")
def init_vectors_cli(
@cli.subcommand(
"init",
"vectors",
# fmt: off
lang: str = Arg(..., help="The language of the nlp object to create"),
vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True),
output_dir: Path = Arg(..., help="Pipeline output directory"),
prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
lang=Arg(help="The language of the nlp object to create"),
vectors_loc=Arg(help="Vectors file in Word2Vec format"),
output_dir=Arg(help="Pipeline output directory"),
prune=Arg("--prune", "-p", help="Optional number of vectors to prune to"),
truncate=Arg("--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
mode=Arg("--mode", "-m", help="Vectors mode: default or floret"),
name=Arg("--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
verbose=Arg("--verbose", "-V", help="Display more information for debugging purposes"),
jsonl_loc=Arg("--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file"),
# fmt: on
)
def init_vectors_cli(
lang: str,
vectors_loc: ExistingPath,
output_dir: Path,
prune: int = -1,
truncate: int = 0,
mode: Literal["default", "floret"] = "default",
name: Optional[str] = None,
verbose: bool = False,
jsonl_loc: Optional[Path] = None,
):
"""Convert word vectors for use with spaCy. Will export an nlp object that
you can use in the [initialize] block of your config to initialize
@ -66,23 +78,28 @@ def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
lexeme.set_attrs(**attrs)
@init_cli.command(
@cli.subcommand_with_extra(
"init",
"nlp",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
hidden=True,
# fmt: off
config_path=Arg(help="Path to config file"),
output_path=Arg(help="Output directory for the prepared data"),
code_path=Arg("--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
verbose=Arg("--verbose", "-V", help="Display more information for debugging purposes"),
use_gpu=Arg("--gpu-id", "-g", help="GPU ID or -1 for CPU"),
# fmt: on
)
def init_pipeline_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
output_path: Path = Arg(..., help="Output directory for the prepared data"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on
config_path: ExistingFilePathOrDash,
output_path: Path,
code_path: Optional[ExistingFilePath] = None,
verbose: bool = False,
use_gpu: int = -1,
_extra: List[str] = [],
):
"""Initialize a pipeline."""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
overrides = parse_config_overrides(ctx.args)
overrides = parse_config_overrides(_extra)
import_code(code_path)
setup_gpu(use_gpu)
with show_validation_error(config_path):
@ -93,19 +110,24 @@ def init_pipeline_cli(
msg.good(f"Saved initialized pipeline to {output_path}")
@init_cli.command(
@cli.subcommand_with_extra(
"init",
"labels",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
# fmt: off
config_path=Arg(help="Path to config file"),
output_path=Arg(help="Output directory for the labels"),
code_path=Arg("--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
verbose=Arg("--verbose", "-V", help="Display more information for debugging purposes"),
use_gpu=Arg("--gpu-id", "-g", help="GPU ID or -1 for CPU"),
# fmt: on
)
def init_labels_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
output_path: Path = Arg(..., help="Output directory for the labels"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on
config_path: ExistingFilePathOrDash,
output_path: Path,
code_path: Optional[ExistingFilePath] = None,
verbose: bool = False,
use_gpu: int = -1,
_extra: List[str] = [],
):
"""Generate JSON files for the labels in the data. This helps speed up the
training process, since spaCy won't have to preprocess the data to
@ -113,7 +135,7 @@ def init_labels_cli(
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if not output_path.exists():
output_path.mkdir(parents=True)
overrides = parse_config_overrides(ctx.args)
overrides = parse_config_overrides(_extra)
import_code(code_path)
setup_gpu(use_gpu)
with show_validation_error(config_path):

View File

@ -1,4 +1,4 @@
from typing import Optional, Union, Any, Dict, List, Tuple, cast
from typing import Optional, Union, Any, Dict, List, Tuple, Literal, cast
import shutil
from pathlib import Path
from wasabi import Printer, MarkdownRenderer, get_raw_input
@ -8,26 +8,38 @@ from catalogue import RegistryError
import srsly
import sys
import re
from radicli import Arg, ExistingDirPath, ExistingFilePath
from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX
from ._util import cli, convert_path_list, WHEEL_SUFFIX, SDIST_SUFFIX
from ..schemas import validate, ModelMetaSchema
from .. import util
from .. import about
@app.command("package")
def package_cli(
@cli.command(
"package",
# fmt: off
input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False),
output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"),
meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"),
name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
build: str = Opt("sdist", "--build", "-b", help="Comma-separated formats to build: sdist and/or wheel, or none."),
force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing data in output directory"),
input_dir=Arg(help="Directory with pipeline data"),
output_dir=Arg(help="Output parent directory"),
code_paths=Arg("--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be included in the package", converter=convert_path_list),
meta_path=Arg("--meta", "-m", help="Path to meta.json"),
create_meta=Arg("--create-meta", "-C", help="Create meta.json, even if one exists"),
name=Arg("--name", "-n", help="Package name to override meta"),
version=Arg("--version", "-v", help="Package version to override meta"),
build=Arg("--build", "-b", help="Artifact to build. Can be set multiple times, 'sdist', 'wheel' or 'none'"),
force=Arg("--force", "-F", help="Force overwriting existing data in output directory"),
# fmt: on
)
def package_cli(
input_dir: ExistingDirPath,
output_dir: ExistingDirPath,
code_paths: List[Path] = [],
meta_path: Optional[ExistingFilePath] = None,
create_meta: bool = False,
name: Optional[str] = None,
version: Optional[str] = None,
build: List[Literal["sdist", "wheel", "none"]] = ["sdist"],
force: bool = False,
):
"""
Generate an installable Python package for a pipeline. Includes binary data,
@ -44,8 +56,6 @@ def package_cli(
DOCS: https://spacy.io/api/cli#package
"""
create_sdist, create_wheel = get_build_formats(string_to_list(build))
code_paths = [Path(p.strip()) for p in string_to_list(code_paths)]
package(
input_dir,
output_dir,
@ -54,8 +64,8 @@ def package_cli(
name=name,
version=version,
create_meta=create_meta,
create_sdist=create_sdist,
create_wheel=create_wheel,
create_sdist="sdist" in build and "none" not in build,
create_wheel="sdist" in build and "none" not in build,
force=force,
silent=False,
)

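The decorator above wires `--code` through `converter=convert_path_list`,
replacing the removed in-body `string_to_list` call. The converter itself is
defined in `_util.py` and not shown in this hunk; one plausible implementation,
assuming radicli converters are plain callables from the raw option string to
the annotated type, mirroring the deleted list comprehension:

from pathlib import Path
from typing import List

def convert_path_list(value: str) -> List[Path]:
    # Sketch: "extra.py, funcs.py" -> [Path("extra.py"), Path("funcs.py")]
    return [Path(p.strip()) for p in value.split(",") if p.strip()]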
View File

@ -1,29 +1,34 @@
from typing import Optional
from typing import Optional, List
from pathlib import Path
from wasabi import msg
import typer
import re
from radicli import Arg, ExistingFilePathOrDash, ExistingFilePath
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import cli, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu
from ..training.pretrain import pretrain
from ..util import load_config
@app.command(
@cli.command_with_extra(
"pretrain",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
# fmt: off
config_path=Arg(help="Path to config file"),
output_dir=Arg(help="Directory to write weights to on each epoch"),
code_path=Arg("--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
resume_path=Arg("--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
epoch_resume=Arg("--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
use_gpu=Arg("--gpu-id", "-g", help="GPU ID or -1 for CPU"),
# fmt: on
)
def pretrain_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True),
output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
# fmt: on
config_path: ExistingFilePathOrDash,
output_dir: Path,
code_path: Optional[ExistingFilePath] = None,
resume_path: Optional[ExistingFilePath] = None,
epoch_resume: Optional[int] = None,
use_gpu: int = -1,
_extra: List[str] = [],
):
"""
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
@ -46,7 +51,7 @@ def pretrain_cli(
DOCS: https://spacy.io/api/cli#pretrain
"""
config_overrides = parse_config_overrides(ctx.args)
config_overrides = parse_config_overrides(_extra)
import_code(code_path)
verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
setup_gpu(use_gpu)

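With `command_with_extra`, flags that don't match the declared arguments land
in `_extra` and become config overrides. A hedged usage sketch (paths, epoch
number and the override key are illustrative):

from spacy.cli._util import cli

# Resume pretraining from saved weights; "--pretraining.dropout" is unknown
# to the command signature, so it flows into _extra and then into
# config_overrides.
cli.run([
    "spacy", "pretrain", "config.cfg", "pretrain_output/",
    "--resume-path", "pretrain_output/model5.bin",
    "--epoch-resume", "6",
    "--pretraining.dropout", "0.2",
])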
View File

@ -7,23 +7,25 @@ import pstats
import sys
import itertools
from wasabi import msg, Printer
import typer
from radicli import Arg, ExistingPathOrDash
from ._util import app, debug_cli, Arg, Opt, NAME
from ._util import cli
from ..language import Language
from ..util import load_model
@debug_cli.command("profile")
@app.command("profile", hidden=True)
def profile_cli(
@cli.subcommand(
"debug",
"profile",
# fmt: off
ctx: typer.Context, # This is only used to read current calling context
model: str = Arg(..., help="Trained pipeline to load"),
inputs: Optional[Path] = Arg(None, help="Location of input file. '-' for stdin.", exists=True, allow_dash=True),
n_texts: int = Opt(10000, "--n-texts", "-n", help="Maximum number of texts to use if available"),
model=Arg(help="Trained pipeline to load"),
inputs=Arg(help="Location of input file. '-' for stdin."),
n_texts=Arg("--n-texts", "-n", help="Maximum number of texts to use if available"),
# fmt: on
):
)
def profile(
model: str, inputs: Optional[ExistingPathOrDash] = None, n_texts: int = 10000
) -> None:
"""
Profile which functions take the most time in a spaCy pipeline.
Input should be formatted as one JSON object per line with a key "text".
@ -32,16 +34,6 @@ def profile_cli(
DOCS: https://spacy.io/api/cli#debug-profile
"""
if ctx.parent.command.name == NAME: # type: ignore[union-attr] # called as top-level command
msg.warn(
"The profile command is now available via the 'debug profile' "
"subcommand. You can run python -m spacy debug --help for an "
"overview of the other available debugging commands."
)
profile(model, inputs=inputs, n_texts=n_texts)
def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None:
if inputs is not None:
texts = _read_inputs(inputs, msg)
texts = list(itertools.islice(texts, n_texts))

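The Typer-era split between `profile_cli` and `profile` is gone: the command
function is the implementation, so it can also be called directly from Python.
A sketch (model name and input file are placeholders):

from pathlib import Path
from spacy.cli.profile import profile

# Expects one JSON object per line with a "text" key; "-" would read stdin.
profile("en_core_web_sm", inputs=Path("texts.jsonl"), n_texts=500)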
View File

@ -1,14 +1,14 @@
from typing import Any, Dict, Optional
from typing import Any, Dict, Optional, List
from pathlib import Path
from wasabi import msg
import os
import re
import shutil
import requests
import typer
from radicli import Arg, ExistingDirPath
from ...util import ensure_path, working_dir
from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
from .._util import cli, PROJECT_FILE, load_project_config
from .._util import get_checksum, download_file, git_checkout, get_git_version
from .._util import SimpleFrozenDict, parse_config_overrides
@ -16,17 +16,20 @@ from .._util import SimpleFrozenDict, parse_config_overrides
EXTRA_DEFAULT = False
@project_cli.command(
@cli.subcommand_with_extra(
"project",
"assets",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
# fmt: off
project_dir=Arg(help="Path to cloned project. Defaults to current working directory"),
sparse_checkout=Arg("--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v2.22+"),
extra=Arg("--extra", "-e", help="Download all assets, including those marked as 'extra'"),
# fmt: on
)
def project_assets_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v2.22+."),
extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.")
# fmt: on
project_dir: ExistingDirPath = Path.cwd(),
sparse_checkout: bool = False,
extra: bool = False,
_extra: List[str] = [],
):
"""Fetch project assets like datasets and pretrained weights. Assets are
defined in the "assets" section of the project.yml. If a checksum is
@ -35,7 +38,7 @@ def project_assets_cli(
DOCS: https://spacy.io/api/cli#project-assets
"""
overrides = parse_config_overrides(ctx.args)
overrides = parse_config_overrides(_extra)
project_assets(
project_dir,
overrides=overrides,

View File

@ -3,10 +3,11 @@ from pathlib import Path
from wasabi import msg
import subprocess
import re
from radicli import Arg
from ... import about
from ...util import ensure_path
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
from .._util import cli, COMMAND, PROJECT_FILE
from .._util import git_checkout, get_git_version, git_repo_branch_exists
DEFAULT_REPO = about.__projects__
@ -14,15 +15,23 @@ DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
DEFAULT_BRANCHES = ["main", "master"]
@project_cli.command("clone")
def project_clone_cli(
@cli.subcommand(
"project",
"clone",
# fmt: off
name: str = Arg(..., help="The name of the template to clone"),
dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"),
branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"),
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v2.22+.")
name=Arg(help="The name of the template to clone"),
dest=Arg(help="Where to clone the project. Defaults to current working directory"),
repo=Arg("--repo", "-r", help="The repository to clone from"),
branch=Arg("--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"),
sparse_checkout=Arg("--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v2.22+"),
# fmt: on
)
def project_clone_cli(
name: str,
dest: Optional[Path] = None,
repo: str = DEFAULT_REPO,
branch: Optional[str] = None,
sparse_checkout: bool = False,
):
"""Clone a project template from a repository. Calls into "git" and will
only download the files from the given subdirectory. The GitHub repo

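Behavior is unchanged; cloning a template from the default projects repo still
looks like this (template name is illustrative):

from spacy.cli._util import cli

cli.run(["spacy", "project", "clone", "pipelines/tagger_parser_ud"])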
View File

@ -1,8 +1,9 @@
from pathlib import Path
from wasabi import msg, MarkdownRenderer
from radicli import Arg, ExistingDirPath, PathOrDash
from ...util import working_dir
from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
from .._util import cli, PROJECT_FILE, load_project_config
DOCS_URL = "https://spacy.io"
@ -27,14 +28,20 @@ MARKER_END = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS END (do not remove) -->"
MARKER_IGNORE = "<!-- SPACY PROJECT: IGNORE -->"
@project_cli.command("document")
def project_document_cli(
@cli.subcommand(
"project",
"document",
# fmt: off
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
output_file: Path = Opt("-", "--output", "-o", help="Path to output Markdown file for output. Defaults to - for standard output"),
no_emoji: bool = Opt(False, "--no-emoji", "-NE", help="Don't use emoji")
project_dir=Arg(help="Path to cloned project. Defaults to current working directory."),
output_file=Arg("--output", "-o", help="Path to output Markdown file for output. Defaults to - for standard output"),
no_emoji=Arg("--no-emoji", "-NE", help="Don't use emoji"),
# fmt: on
):
)
def project_document(
project_dir: ExistingDirPath = Path.cwd(),
output_file: PathOrDash = "-",
no_emoji: bool = False,
) -> None:
"""
Auto-generate a README.md for a project. If the content is saved to a file,
hidden markers are added so you can add custom content before or after the
@ -43,13 +50,7 @@ def project_document_cli(
DOCS: https://spacy.io/api/cli#project-document
"""
project_document(project_dir, output_file, no_emoji=no_emoji)
def project_document(
project_dir: Path, output_file: Path, *, no_emoji: bool = False
) -> None:
is_stdout = str(output_file) == "-"
is_stdout = output_file == "-"
config = load_project_config(project_dir)
md = MarkdownRenderer(no_emoji=no_emoji)
md.add(MARKER_START)

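`PathOrDash` means `output_file` arrives either as a `Path` or as the literal
string "-", which is why the `str()` call around the comparison could be
dropped. A sketch of the pattern (the helper name is hypothetical):

import sys
from pathlib import Path

def open_output(output_file):
    # "-" selects stdout; anything else is treated as a real path.
    if output_file == "-":
        return sys.stdout
    return Path(output_file).open("w", encoding="utf8")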
View File

@ -1,207 +0,0 @@
"""This module contains helpers and subcommands for integrating spaCy projects
with Data Version Control (DVC). https://dvc.org"""
from typing import Dict, Any, List, Optional, Iterable
import subprocess
from pathlib import Path
from wasabi import msg
from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli
from .._util import Arg, Opt, NAME, COMMAND
from ...util import working_dir, split_command, join_command, run_command
from ...util import SimpleFrozenList
DVC_CONFIG = "dvc.yaml"
DVC_DIR = ".dvc"
UPDATE_COMMAND = "dvc"
DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
# edited your {PROJECT_FILE}, you can regenerate this file by running:
# {COMMAND} project {UPDATE_COMMAND}"""
@project_cli.command(UPDATE_COMMAND)
def project_update_dvc_cli(
# fmt: off
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
# fmt: on
):
"""Auto-generate Data Version Control (DVC) config. A DVC
project can only define one pipeline, so you need to specify one workflow
defined in the project.yml. If no workflow is specified, the first defined
workflow is used. The DVC config will only be updated if the project.yml
changed.
DOCS: https://spacy.io/api/cli#project-dvc
"""
project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)
def project_update_dvc(
project_dir: Path,
workflow: Optional[str] = None,
*,
verbose: bool = False,
quiet: bool = False,
force: bool = False,
) -> None:
"""Update the auto-generated Data Version Control (DVC) config file. A DVC
project can only define one pipeline, so you need to specify one workflow
defined in the project.yml. Will only update the file if the checksum changed.
project_dir (Path): The project directory.
workflow (Optional[str]): Optional name of workflow defined in project.yml.
If not set, the first workflow will be used.
verbose (bool): Print more info.
quiet (bool): Print less info.
force (bool): Force update DVC config.
"""
config = load_project_config(project_dir)
updated = update_dvc_config(
project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
)
help_msg = "To execute the workflow with DVC, run: dvc repro"
if updated:
msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg)
else:
msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg)
def update_dvc_config(
path: Path,
config: Dict[str, Any],
workflow: Optional[str] = None,
verbose: bool = False,
quiet: bool = False,
force: bool = False,
) -> bool:
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
project directory. The file is auto-generated based on the config. The
first line of the auto-generated file specifies the hash of the config
dict, so if any of the config values change, the DVC config is regenerated.
path (Path): The path to the project directory.
config (Dict[str, Any]): The loaded project.yml.
verbose (bool): Whether to print additional info (via DVC).
quiet (bool): Don't output anything (via DVC).
force (bool): Force update, even if hashes match.
RETURNS (bool): Whether the DVC config file was updated.
"""
ensure_dvc(path)
workflows = config.get("workflows", {})
workflow_names = list(workflows.keys())
check_workflows(workflow_names, workflow)
if not workflow:
workflow = workflow_names[0]
config_hash = get_hash(config)
path = path.resolve()
dvc_config_path = path / DVC_CONFIG
if dvc_config_path.exists():
# Check if the file was generated using the current config, if not, redo
with dvc_config_path.open("r", encoding="utf8") as f:
ref_hash = f.readline().strip().replace("# ", "")
if ref_hash == config_hash and not force:
return False # Nothing has changed in project.yml, don't need to update
dvc_config_path.unlink()
dvc_commands = []
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
# some flags that apply to every command
flags = []
if verbose:
flags.append("--verbose")
if quiet:
flags.append("--quiet")
for name in workflows[workflow]:
command = config_commands[name]
deps = command.get("deps", [])
outputs = command.get("outputs", [])
outputs_no_cache = command.get("outputs_no_cache", [])
if not deps and not outputs and not outputs_no_cache:
continue
# Default to the working dir as the project path since dvc.yaml is auto-generated
# and we don't want arbitrary paths in there
project_cmd = ["python", "-m", NAME, "project", "run", name]
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
if command.get("no_skip"):
dvc_cmd.append("--always-changed")
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
dvc_commands.append(join_command(full_cmd))
if not dvc_commands:
# If we don't check for this, then there will be an error when reading the
# config, since DVC wouldn't create it.
msg.fail(
"No usable commands for DVC found. This can happen if none of your "
"commands have dependencies or outputs.",
exits=1,
)
with working_dir(path):
for c in dvc_commands:
dvc_command = "dvc " + c
run_command(dvc_command)
with dvc_config_path.open("r+", encoding="utf8") as f:
content = f.read()
f.seek(0, 0)
f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
return True
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
"""Validate workflows provided in project.yml and check that a given
workflow can be used to generate a DVC config.
workflows (List[str]): Names of the available workflows.
workflow (Optional[str]): The name of the workflow to convert.
"""
if not workflows:
msg.fail(
f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
f"define at least one list of commands.",
exits=1,
)
if workflow is not None and workflow not in workflows:
msg.fail(
f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
f"Available workflows: {', '.join(workflows)}",
exits=1,
)
if not workflow:
msg.warn(
f"No workflow specified for DVC pipeline. Using the first workflow "
f"defined in {PROJECT_FILE}: '{workflows[0]}'"
)
def ensure_dvc(project_dir: Path) -> None:
"""Ensure that the "dvc" command is available and that the current project
directory is an initialized DVC project.
"""
try:
subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
except Exception:
msg.fail(
"To use spaCy projects with DVC (Data Version Control), DVC needs "
"to be installed and the 'dvc' command needs to be available",
"You can install the Python package from pip (pip install dvc) or "
"conda (conda install -c conda-forge dvc). For more details, see the "
"documentation: https://dvc.org/doc/install",
exits=1,
)
if not (project_dir / ".dvc").exists():
msg.fail(
"Project not initialized as a DVC project",
"To initialize a DVC project, you can run 'dvc init' in the project "
"directory. For more details, see the documentation: "
"https://dvc.org/doc/command-reference/init",
exits=1,
)

View File

@ -1,18 +1,23 @@
from pathlib import Path
from wasabi import msg
from radicli import Arg, ExistingDirPath
from .remote_storage import RemoteStorage
from .remote_storage import get_command_hash
from .._util import project_cli, Arg, logger
from .._util import load_project_config
from .._util import cli, load_project_config, logger
from .run import update_lockfile
@project_cli.command("pull")
def project_pull_cli(
@cli.subcommand(
"project",
"pull",
# fmt: off
remote: str = Arg("default", help="Name or path of remote storage"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
remote=Arg(help="Name or path of remote storage"),
project_dir=Arg(help="Location of project directory. Defaults to current working directory."),
# fmt: on
)
def project_pull_cli(
remote: str = "default", project_dir: ExistingDirPath = Path.cwd()
):
"""Retrieve available precomputed outputs from a remote storage.
You can alias remotes in your project.yml by mapping them to storage paths.

View File

@ -1,19 +1,25 @@
from pathlib import Path
from wasabi import msg
from radicli import Arg, ExistingDirPath
from .remote_storage import RemoteStorage
from .remote_storage import get_content_hash, get_command_hash
from .._util import load_project_config
from .._util import project_cli, Arg, logger
from .._util import cli, load_project_config, logger
@project_cli.command("push")
def project_push_cli(
@cli.subcommand(
"project",
"push",
# fmt: off
remote: str = Arg("default", help="Name or path of remote storage"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
remote=Arg(help="Name or path of remote storage"),
project_dir=Arg(help="Location of project directory. Defaults to current working directory."),
# fmt: on
)
def project_push_cli(
remote: str = "default", project_dir: ExistingDirPath = Path.cwd()
):
"""Persist outputs to a remote storage. You can alias remotes in your
"""
Persist outputs to a remote storage. You can alias remotes in your
project.yml by mapping them to storage paths. A storage can be anything that
the smart-open library can upload to, e.g. AWS, Google Cloud Storage, SSH,
local directories etc.

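Both remote-storage commands keep their defaults as plain Python defaults.
Usage is unchanged; the remote name is resolved against the "remotes" section
of the project.yml (names here are illustrative):

from spacy.cli._util import cli

cli.run(["spacy", "project", "push", "default"])
cli.run(["spacy", "project", "pull", "default"])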
View File

@ -1,13 +1,12 @@
from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
import os.path
from pathlib import Path
import pkg_resources
from wasabi import msg
from wasabi.util import locale_escape
import sys
import srsly
import typer
from radicli import Arg, ExistingDirPath
from ... import about
from ...git_info import GIT_VERSION
@ -15,21 +14,27 @@ from ...util import working_dir, run_command, split_command, is_cwd, join_comman
from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
from ...util import check_bool_env_var, SimpleFrozenDict
from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides
from .._util import cli, get_checksum, COMMAND, parse_config_overrides
@project_cli.command(
"run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
@cli.subcommand_with_extra(
"project",
"run",
# fmt: off
subcommand=Arg(help=f"Name of command defined in the {PROJECT_FILE}"),
project_dir=Arg(help="Location of project directory. Defaults to current working directory."),
force=Arg("--force", "-F", help="Force re-running steps, even if nothing changed"),
dry=Arg("--dry", "-D", help="Perform a dry run and don't execute scripts"),
show_help=Arg("--help", help="Show help message and available subcommands"),
# fmt: on
)
def project_run_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
# fmt: on
subcommand: Optional[str] = None,
project_dir: ExistingDirPath = Path.cwd(),
force: bool = False,
dry: bool = False,
show_help: bool = False,
_extra: List[str] = [],
):
"""Run a named command or workflow defined in the project.yml. If a workflow
name is specified, all commands in the workflow are run, in order. If
@ -41,7 +46,7 @@ def project_run_cli(
if show_help or not subcommand:
print_run_help(project_dir, subcommand)
else:
overrides = parse_config_overrides(ctx.args)
overrides = parse_config_overrides(_extra)
project_run(project_dir, subcommand, overrides=overrides, force=force, dry=dry)

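`subcommand_with_extra` plays the same role for subcommands as
`command_with_extra` does for top-level commands: trailing unknown flags are
collected into `_extra` and parsed into overrides, e.g. for variables defined
in the project.yml (variable name is illustrative):

from spacy.cli._util import cli

# Equivalent to: python -m spacy project run train . --vars.gpu_id 0
cli.run(["spacy", "project", "run", "train", ".", "--vars.gpu_id", "0"])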
View File

@ -1,29 +1,34 @@
from typing import Optional, Dict, Any, Union
from typing import Optional, Dict, Any, Union, List
from pathlib import Path
from wasabi import msg
import typer
import logging
import sys
from radicli import Arg, ExistingFilePathOrDash
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import cli, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu
from ..training.loop import train as train_nlp
from ..training.initialize import init_nlp
from .. import util
@app.command(
"train", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
@cli.command_with_extra(
"train",
# fmt: off
config_path=Arg(help="Path to config file"),
output_path=Arg("--output", "-o", help="Output directory to store trained pipeline in"),
code_path=Arg("--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
verbose=Arg("--verbose", "-V", help="Display more information for debugging purposes"),
use_gpu=Arg("--gpu-id", "-g", help="GPU ID or -1 for CPU"),
# fmt: on
)
def train_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on
config_path: ExistingFilePathOrDash,
output_path: Optional[Path] = None,
code_path: Optional[Path] = None,
verbose: bool = False,
use_gpu: int = -1,
_extra: List[str] = [],
):
"""
Train or update a spaCy pipeline. Requires data in spaCy's binary format. To
@ -40,7 +45,7 @@ def train_cli(
DOCS: https://spacy.io/api/cli#train
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
overrides = parse_config_overrides(ctx.args)
overrides = parse_config_overrides(_extra)
import_code(code_path)
train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)

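End to end, the new entry point is driven the same way the updated tests below
drive it. A hedged sketch (paths are placeholders):

from spacy.cli._util import cli

cli.run([
    "spacy", "train", "config.cfg",
    "--output", "training/",
    "--paths.train", "corpus/train.spacy",
    "--paths.dev", "corpus/dev.spacy",
])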
View File

@ -5,14 +5,14 @@ import requests
from wasabi import msg, Printer
import warnings
from ._util import app
from ._util import cli
from .. import about
from ..util import get_package_version, get_installed_models, get_minor_version
from ..util import get_package_path, get_model_meta, is_compatible_version
@app.command("validate")
def validate_cli():
@cli.command("validate")
def validate() -> None:
"""
Validate the currently installed pipeline packages and spaCy version. Checks
if the installed packages are compatible and shows upgrade instructions if
@ -20,10 +20,6 @@ def validate_cli():
DOCS: https://spacy.io/api/cli#validate
"""
validate()
def validate() -> None:
model_pkgs, compat = get_model_pkgs()
spacy_version = get_minor_version(about.__version__)
current_compat = compat.get(spacy_version, {})

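With the wrapper removed, `validate` is registered directly and takes no
arguments; invocation is unchanged:

from spacy.cli._util import cli

cli.run(["spacy", "validate"])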
View File

@ -1,20 +1,39 @@
import pytest
import os
from pathlib import Path
from typer.testing import CliRunner
from spacy.tokens import DocBin, Doc
from spacy.cli._util import cli
from spacy.cli._util import app
from .util import make_tempdir, normalize_whitespace
def test_convert_auto():
@pytest.fixture(scope="session")
def all_commands():
result = [*cli.commands.values()]
for subcommands in cli.subcommands.values():
result.extend(subcommands.values())
return result
def test_help_texts(all_commands):
"""Test that all commands provide docstrings and argument help texts."""
for command in all_commands:
assert command.description, f"no docstring for {command.display_name}"
for arg in command.args:
if arg.id == cli.extra_key:
continue
assert arg.arg.help, f"no help text for {command.display_name} -> {arg.id}"
def test_convert_auto(capsys):
with make_tempdir() as d_in, make_tempdir() as d_out:
for f in ["data1.iob", "data2.iob", "data3.iob"]:
Path(d_in / f).touch()
# ensure that "automatic" suffix detection works
result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
assert "Generated output file" in result.stdout
cli.run(["spacy", "convert", str(d_in), str(d_out)])
captured = capsys.readouterr()
assert "Generated output file" in captured.out
out_files = os.listdir(d_out)
assert len(out_files) == 3
assert "data1.spacy" in out_files
@ -22,28 +41,36 @@ def test_convert_auto():
assert "data3.spacy" in out_files
def test_convert_auto_conflict():
def test_convert_auto_conflict(capsys):
with make_tempdir() as d_in, make_tempdir() as d_out:
for f in ["data1.iob", "data2.iob", "data3.json"]:
Path(d_in / f).touch()
# ensure that "automatic" suffix detection warns when there are different file types
result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
assert "All input files must be same type" in result.stdout
with pytest.raises(SystemExit):
cli.run(["spacy", "convert", str(d_in), str(d_out)])
captured = capsys.readouterr()
assert "All input files must be same type" in captured.out
out_files = os.listdir(d_out)
assert len(out_files) == 0
def test_benchmark_accuracy_alias():
def test_benchmark_accuracy_alias(capsys):
# Verify that the `evaluate` alias works correctly.
result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])
result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"])
assert normalize_whitespace(result_benchmark.stdout) == normalize_whitespace(
result_evaluate.stdout.replace("spacy evaluate", "spacy benchmark accuracy")
with pytest.raises(SystemExit):
cli.run(["spacy", "benchmark", "accuracy", "--help"])
captured = capsys.readouterr()
result_benchmark = normalize_whitespace(str(captured.out))
with pytest.raises(SystemExit):
cli.run(["spacy", "evaluate", "--help"])
captured = capsys.readouterr()
result_evaluate = normalize_whitespace(str(captured.out))
assert result_benchmark == result_evaluate.replace(
"spacy evaluate", "spacy benchmark accuracy"
)
def test_debug_data_trainable_lemmatizer_cli(en_vocab):
def test_debug_data_trainable_lemmatizer_cli(en_vocab, capsys):
train_docs = [
Doc(en_vocab, words=["I", "like", "cats"], lemmas=["I", "like", "cat"]),
Doc(
@ -62,30 +89,30 @@ def test_debug_data_trainable_lemmatizer_cli(en_vocab):
dev_bin = DocBin(docs=dev_docs)
dev_bin.to_disk(d_in / "dev.spacy")
# `debug data` requires an input pipeline config
CliRunner().invoke(
app,
[
"init",
"config",
f"{d_in}/config.cfg",
"--lang",
"en",
"--pipeline",
"trainable_lemmatizer",
],
)
result_debug_data = CliRunner().invoke(
app,
[
"debug",
"data",
f"{d_in}/config.cfg",
"--paths.train",
f"{d_in}/train.spacy",
"--paths.dev",
f"{d_in}/dev.spacy",
],
)
args = [
"spacy",
"init",
"config",
f"{d_in}/config.cfg",
"--lang",
"en",
"--pipeline",
"trainable_lemmatizer",
]
cli.run(args)
args = [
"spacy",
"debug",
"data",
f"{d_in}/config.cfg",
"--paths.train",
f"{d_in}/train.spacy",
"--paths.dev",
f"{d_in}/dev.spacy",
]
with pytest.raises(SystemExit):
cli.run(args)
captured = capsys.readouterr()
# Instead of checking specific wording of the output, which may change,
# we'll check that this section of the debug output is present.
assert "= Trainable Lemmatizer =" in result_debug_data.stdout
assert "= Trainable Lemmatizer =" in captured.out