Update CLI to use radicli
Commit d292c6fc78 (parent eec5ccd72f)

requirements.txt

@@ -9,9 +9,9 @@ murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
-typer>=0.3.0,<0.8.0
 pathy>=0.10.0
 smart-open>=5.2.1,<7.0.0
+radicli>=0.0.1,<1.0.0
 # Third party dependencies
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0

setup.cfg

@@ -41,8 +41,8 @@ install_requires =
 wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
+radicli>=0.0.1,<1.0.0
 # Third-party dependencies
-typer>=0.3.0,<0.8.0
 pathy>=0.10.0
 smart-open>=5.2.1,<7.0.0
 tqdm>=4.38.0,<5.0.0

spacy/cli/__init__.py

@@ -1,10 +1,8 @@
 from wasabi import msg

-from ._util import app, setup_cli  # noqa: F401
+from ._util import cli, setup_cli  # noqa: F401

+# These are the actual functions, NOT the wrapped CLI commands. The CLI commands
+# are registered automatically and won't have to be imported here.
-from .benchmark_speed import benchmark_speed_cli  # noqa: F401
+from .benchmark_speed import benchmark_speed  # noqa: F401
 from .download import download  # noqa: F401
 from .info import info  # noqa: F401
 from .package import package  # noqa: F401
@@ -25,18 +23,7 @@ from .validate import validate  # noqa: F401
 from .project.clone import project_clone  # noqa: F401
 from .project.assets import project_assets  # noqa: F401
 from .project.run import project_run  # noqa: F401
 from .project.dvc import project_update_dvc  # noqa: F401
 from .project.push import project_push  # noqa: F401
 from .project.pull import project_pull  # noqa: F401
 from .project.document import project_document  # noqa: F401
 from .find_threshold import find_threshold  # noqa: F401
-
-
-@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
-def link(*args, **kwargs):
-    """As of spaCy v3.0, symlinks like "en" are not supported anymore. You can load trained
-    pipeline packages using their full names or from a directory path."""
-    msg.warn(
-        "As of spaCy v3.0, model symlinks are not supported anymore. You can load trained "
-        "pipeline packages using their full names or from a directory path."
-    )

spacy/cli/_util.py

@@ -6,15 +6,13 @@ from pathlib import Path
 from wasabi import msg, Printer
 import srsly
 import hashlib
-import typer
-from click import NoSuchOption
-from click.parser import split_arg_string
-from typer.main import get_command
+import shlex
 from contextlib import contextmanager
 from thinc.api import Config, ConfigValidationError, require_gpu
 from thinc.util import gpu_is_available
 from configparser import InterpolationError
 import os
+import radicli

 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
@@ -37,6 +35,8 @@ HELP = """spaCy Command-line Interface

 DOCS: https://spacy.io/api/cli
 """

+# TODO: need to find a way to inject these now
 PROJECT_HELP = f"""Command-line interface for spaCy projects and templates.
 You'd typically start by cloning a project template to a local directory and
 fetching its assets like datasets etc. See the project's {PROJECT_FILE} for the
@@ -49,29 +49,14 @@ and custom model implementations.
 BENCHMARK_HELP = """Commands for benchmarking pipelines."""
 INIT_HELP = """Commands for initializing configs and pipeline packages."""

-# Wrappers for Typer's annotations. Initially created to set defaults and to
-# keep the names short, but not needed at the moment.
-Arg = typer.Argument
-Opt = typer.Option
-
-app = typer.Typer(name=NAME, help=HELP)
-benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True)
-project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
-debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
-init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
-
-app.add_typer(project_cli)
-app.add_typer(debug_cli)
-app.add_typer(benchmark_cli)
-app.add_typer(init_cli)
+# CLI
+cli = radicli.Radicli(prog=COMMAND, help=HELP)


 def setup_cli() -> None:
     # Make sure the entry-point for CLI runs, so that they get imported.
     registry.cli.get_all()
-    # Ensure that the help messages always display the correct prompt
-    command = get_command(app)
-    command(prog_name=COMMAND)
+    cli.run()


 def parse_config_overrides(

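Note: for readers unfamiliar with radicli, the registration pattern used
throughout this commit looks roughly like the sketch below. Only API surface
visible in this diff is assumed (Radicli, Arg, cli.run); the "greet" command
itself is invented for illustration.

    from radicli import Radicli, Arg

    cli = Radicli(prog="mytool", help="Example CLI")

    @cli.command(
        "greet",
        # Positional argument: no flag strings. Option: flag strings given.
        name=Arg(help="Name to greet"),
        excited=Arg("--excited", "-e", help="Greet enthusiastically"),
    )
    def greet(name: str, excited: bool = False):
        # Types and defaults come from the plain Python signature.
        print(f"Hello, {name}{'!' if excited else '.'}")

    if __name__ == "__main__":
        cli.run()
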
@@ -106,7 +91,7 @@ def _parse_overrides(args: List[str], is_cli: bool = False) -> Dict[str, Any]:
             opt = opt.replace("--", "")
             if "." not in opt:
                 if is_cli:
-                    raise NoSuchOption(orig_opt)
+                    raise radicli.CliParseError(f"unrecognized argument: {orig_opt}")
                 else:
                     msg.fail(f"{err}: can't override top-level sections", exits=1)
             if "=" in opt:  # we have --opt=value

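Note: the effect of the override parser, illustrated (value coercion lives in
the surrounding code, which tries to read values as JSON and falls back to
plain strings — a sketch, not the verbatim implementation):

    overrides = parse_config_overrides(["--training.max_steps=100", "--nlp.lang", "de"])
    # -> {"training.max_steps": 100, "nlp.lang": "de"}
    # An option without a dotted section name, e.g. "--foo=1", now raises
    # radicli.CliParseError instead of click's NoSuchOption.
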
@@ -510,7 +495,7 @@ def get_git_version(
     """
     try:
         ret = run_command("git --version", capture=True)
-    except:
+    except Exception:
         raise RuntimeError(error)
     stdout = ret.stdout.strip()
     if not stdout or not stdout.startswith("git version"):

@@ -580,6 +565,18 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
     return result


+def convert_string_list(value: str) -> List[str]:
+    return string_to_list(value)
+
+
+def convert_int_list(value: str) -> List[int]:
+    return string_to_list(value, intify=True)
+
+
+def convert_path_list(value: str) -> List[Path]:
+    return [Path(p) for p in string_to_list(value)]
+
+
 def setup_gpu(use_gpu: int, silent=None) -> None:
     """Configure the GPU and log info."""
     if silent is None:

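Note: these small wrappers exist so comma-separated CLI strings can be turned
into typed lists through radicli's converter hook — they are attached further
down in this commit as, e.g., layers=Arg(help="...", converter=convert_int_list).
Expected behavior:

    convert_int_list("1,2,3")         # -> [1, 2, 3]
    convert_path_list("a.cfg,b.cfg")  # -> [Path("a.cfg"), Path("b.cfg")]
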
@@ -629,3 +626,20 @@ def _format_number(number: Union[int, float], ndigits: int = 2) -> str:
         return f"{number:.{ndigits}f}"
     else:
         return str(number)
+
+
+def split_arg_string(string: str) -> List[str]:
+    # Adapted from: https://github.com/pallets/click/blob/8b48450d5d63c747600e069d4c3e2274f41c8360/src/click/parser.py#L125
+    lex = shlex.shlex(string, posix=True)
+    lex.whitespace_split = True
+    lex.commenters = ""
+    out = []
+    try:
+        for token in lex:
+            out.append(token)
+    except ValueError:
+        # Raised when end-of-string is reached in an invalid state. Use
+        # the partial token as-is. The quote or escape character is in
+        # lex.state, not lex.token.
+        out.append(lex.token)
+    return out

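Note: split_arg_string replaces the previous import from click.parser with a
vendored copy; it follows standard shlex semantics. Expected behavior:

    split_arg_string('--code "my file.py" -n 2')  # -> ['--code', 'my file.py', '-n', '2']
    split_arg_string('--msg "unterminated')       # -> ['--msg', 'unterminated']  (partial token kept)
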
spacy/cli/apply.py

@@ -1,14 +1,12 @@
 import tqdm
 import srsly

 from itertools import chain
 from pathlib import Path
 from typing import Optional, List, Iterable, cast, Union

 from wasabi import msg
+from radicli import Arg, ExistingPath, ExistingFilePath

-from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
+from ._util import cli, setup_gpu, import_code, walk_directory
 from ..tokens import Doc, DocBin
 from ..vocab import Vocab
 from ..util import ensure_path, load_model
@@ -37,49 +35,30 @@ force_msg = (
 DocOrStrStream = Union[Iterable[str], Iterable[Doc]]


-def _stream_docbin(path: Path, vocab: Vocab) -> Iterable[Doc]:
-    """
-    Stream Doc objects from DocBin.
-    """
-    docbin = DocBin().from_disk(path)
-    for doc in docbin.get_docs(vocab):
-        yield doc
-
-
-def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
-    """
-    Stream "text" field from JSONL. If the field "text" is
-    not found it raises error.
-    """
-    for entry in srsly.read_jsonl(path):
-        if field not in entry:
-            msg.fail(f"{path} does not contain the required '{field}' field.", exits=1)
-        else:
-            yield entry[field]
-
-
-def _stream_texts(paths: Iterable[Path]) -> Iterable[str]:
-    """
-    Yields strings from text files in paths.
-    """
-    for path in paths:
-        with open(path, "r") as fin:
-            text = fin.read()
-            yield text
-
-
-@app.command("apply")
-def apply_cli(
+@cli.command(
+    "apply",
     # fmt: off
-    model: str = Arg(..., help="Model name or path"),
-    data_path: Path = Arg(..., help=path_help, exists=True),
-    output_file: Path = Arg(..., help=out_help, dir_okay=False),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
-    text_key: str = Opt("text", "--text-key", "-tk", help="Key containing text string for JSONL"),
-    force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
-    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."),
-    batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size."),
-    n_process: int = Opt(1, "--n-process", "-n", help="number of processors to use.")
+    model=Arg(help="Model name or path"),
+    data_path=Arg(help=path_help),
+    output_file=Arg(help=out_help),
+    code_path=Arg("--code", "-c", help=code_help),
+    text_key=Arg("--text-key", "-tk", help="Key containing text string for JSONL"),
+    force_overwrite=Arg("--force", "-F", help="Force overwriting the output file"),
+    use_gpu=Arg("--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+    batch_size=Arg("--batch-size", "-b", help="Batch size"),
+    n_process=Arg("--n-process", "-n", help="Number of processors to use"),
     # fmt: on
 )
+def apply_cli(
+    model: str,
+    data_path: ExistingPath,
+    output_file: Path,
+    code_path: Optional[ExistingFilePath] = None,
+    text_key: str = "text",
+    force_overwrite: bool = False,
+    use_gpu: int = -1,
+    batch_size: int = 1,
+    n_process: int = 1,
+):
     """
     Apply a trained pipeline to documents to get predictions.

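Note: one consequence of moving the argument metadata into the decorator is
that the decorated function keeps a plain Python signature and stays directly
callable, which is what the new __init__.py comment refers to. Hypothetical
values; radicli's path validation only runs when invoked via the CLI:

    apply_cli(
        "en_core_web_sm",           # model
        Path("texts/"),             # data_path
        Path("predictions.spacy"),  # output_file
        batch_size=32,
    )
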
@@ -122,7 +101,6 @@ def apply(
         )
         return
     nlp = load_model(model)
-    msg.good(f"Loaded model {model}")
     vocab = nlp.vocab
     streams: List[DocOrStrStream] = []
     text_files = []

@@ -141,3 +119,32 @@ def apply(
     if output_file.suffix == "":
         output_file = output_file.with_suffix(".spacy")
     docbin.to_disk(output_file)
+
+
+def _stream_docbin(path: Path, vocab: Vocab) -> Iterable[Doc]:
+    """
+    Stream Doc objects from DocBin.
+    """
+    docbin = DocBin().from_disk(path)
+    for doc in docbin.get_docs(vocab):
+        yield doc
+
+
+def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
+    """
+    Stream "text" field from JSONL. If the field "text" is
+    not found it raises error.
+    """
+    for entry in srsly.read_jsonl(path):
+        if field not in entry:
+            msg.fail(f"{path} does not contain the required '{field}' field.", exits=1)
+        else:
+            yield entry[field]
+
+
+def _stream_texts(paths: Iterable[Path]) -> Iterable[str]:
+    """Yields strings from text files in paths."""
+    for path in paths:
+        with open(path, "r") as fin:
+            text = fin.read()
+            yield text

spacy/cli/assemble.py

@@ -1,27 +1,30 @@
-from typing import Optional
+from typing import Optional, List
 from pathlib import Path
 from wasabi import msg
-import typer
 import logging
+from radicli import Arg, ExistingFilePathOrDash, ExistingFilePath

-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
+from ._util import cli, parse_config_overrides, show_validation_error
 from ._util import import_code
 from .. import util
 from ..util import get_sourced_components, load_model_from_config


-@app.command(
+@cli.command_with_extra(
     "assemble",
-    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+    # fmt: off
+    config_path=Arg(help="Path to config file"),
+    output_path=Arg(help="Output directory to store assembled pipeline in"),
+    code_path=Arg("--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    verbose=Arg("--verbose", "-V", help="Display more information for debugging purposes"),
+    # fmt: on
 )
 def assemble_cli(
-    # fmt: off
-    ctx: typer.Context,  # This is only used to read additional arguments
-    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
-    output_path: Path = Arg(..., help="Output directory to store assembled pipeline in"),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
-    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
-    # fmt: on
+    config_path: ExistingFilePathOrDash,
+    output_path: Optional[Path] = None,
+    code_path: Optional[ExistingFilePath] = None,
+    verbose: bool = False,
+    _extra: List[str] = [],
 ):
     """
     Assemble a spaCy pipeline from a config file. The config file includes

@@ -37,7 +40,7 @@ def assemble_cli(
     # Make sure all files and paths exists if they are needed
     if not config_path or (str(config_path) != "-" and not config_path.exists()):
         msg.fail("Config file not found", config_path, exits=1)
-    overrides = parse_config_overrides(ctx.args)
+    overrides = parse_config_overrides(_extra)
     import_code(code_path)
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides, interpolate=False)

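Note: command_with_extra / subcommand_with_extra replace typer's
context_settings={"allow_extra_args": True, "ignore_unknown_options": True}.
Arguments the parser does not recognize are collected into the _extra
parameter instead of being rejected. A minimal sketch (command name invented):

    @cli.command_with_extra("train", config_path=Arg(help="Path to config"))
    def train(config_path: Path, _extra: List[str] = []):
        # e.g. ["--training.max_steps=100"] ends up here and feeds the
        # config override parser.
        overrides = parse_config_overrides(_extra)
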
spacy/cli/benchmark_speed.py

@@ -2,56 +2,56 @@ from typing import Iterable, List, Optional
 import random
 from itertools import islice
 import numpy
 from pathlib import Path
 import time
 from tqdm import tqdm
-import typer
 from wasabi import msg
+from radicli import Arg, ExistingPath

 from .. import util
 from ..language import Language
 from ..tokens import Doc
 from ..training import Corpus
-from ._util import Arg, Opt, benchmark_cli, setup_gpu
+from ._util import cli, setup_gpu


-@benchmark_cli.command(
+@cli.subcommand(
+    "benchmark",
     "speed",
-    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
-)
-def benchmark_speed_cli(
-    # fmt: off
-    ctx: typer.Context,
-    model: str = Arg(..., help="Model name or path"),
-    data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
-    batch_size: Optional[int] = Opt(None, "--batch-size", "-b", min=1, help="Override the pipeline batch size"),
-    no_shuffle: bool = Opt(False, "--no-shuffle", help="Do not shuffle benchmark data"),
-    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
-    n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
-    warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
+    # fmt: off
+    model=Arg(help="Model name or path"),
+    data_path=Arg(help="Location of binary evaluation data in .spacy format"),
+    batch_size=Arg("--batch-size", "-b", help="Override the pipeline batch size"),
+    no_shuffle=Arg("--no-shuffle", help="Do not shuffle benchmark data"),
+    use_gpu=Arg("--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+    n_batches=Arg("--batches", help="Minimum number of batches to benchmark"),
+    warmup_epochs=Arg("--warmup", "-w", help="Number of iterations over the data for warmup"),
     # fmt: on
 )
+def benchmark_speed(
+    model: str,
+    data_path: ExistingPath,
+    batch_size: Optional[int] = None,
+    no_shuffle: bool = False,
+    use_gpu: int = -1,
+    n_batches: int = 50,
+    warmup_epochs: int = 3,
+):
     """
     Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
     data in the binary .spacy format.
     """
     setup_gpu(use_gpu=use_gpu, silent=False)

     nlp = util.load_model(model)
     batch_size = batch_size if batch_size is not None else nlp.batch_size
     corpus = Corpus(data_path)
     docs = [eg.predicted for eg in corpus(nlp)]

     if len(docs) == 0:
         msg.fail("Cannot benchmark speed using an empty corpus.", exits=1)

     print(f"Warming up for {warmup_epochs} epochs...")
     warmup(nlp, docs, warmup_epochs, batch_size)

     print()
     print(f"Benchmarking {n_batches} batches...")
     wps = benchmark(nlp, docs, n_batches, batch_size, not no_shuffle)

     print()
     print_outliers(wps)
     print_mean_with_ci(wps)

@@ -120,7 +120,6 @@ def benchmark(
         nlp.make_doc(docs[i % len(docs)].text)
         for i in range(n_batches * batch_size)
     ]
-
     return annotate(nlp, bench_docs, batch_size)

@@ -143,17 +142,14 @@ def print_mean_with_ci(sample: numpy.ndarray):
     mean = numpy.mean(sample)
     bootstrap_means = bootstrap(sample)
     bootstrap_means.sort()

     # 95% confidence interval
     low = bootstrap_means[int(len(bootstrap_means) * 0.025)]
     high = bootstrap_means[int(len(bootstrap_means) * 0.975)]

     print(f"Mean: {mean:.1f} words/s (95% CI: {low-mean:.1f} +{high-mean:.1f})")


 def print_outliers(sample: numpy.ndarray):
     quartiles = Quartiles(sample)

     n_outliers = numpy.sum(
         (sample < (quartiles.q1 - 1.5 * quartiles.iqr))
         | (sample > (quartiles.q3 + 1.5 * quartiles.iqr))

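Note: the reporting above is a percentile bootstrap: resample the measured
speeds with replacement, take the mean of each resample, and read the 2.5th
and 97.5th percentiles off the sorted means. The bootstrap helper itself is
not part of this hunk, so the sketch below assumes the standard resampling
approach:

    import numpy

    def bootstrap(sample, iterations=10000):
        rng = numpy.random.default_rng(0)
        # Each row is one resample (with replacement) of the original sample.
        resamples = rng.choice(sample, size=(iterations, len(sample)), replace=True)
        return resamples.mean(axis=1)

    wps = numpy.array([1200.0, 1150.0, 1300.0, 1250.0, 1100.0])
    means = numpy.sort(bootstrap(wps))
    low = means[int(len(means) * 0.025)]
    high = means[int(len(means) * 0.975)]
    print(f"Mean: {wps.mean():.1f} words/s (95% CI: {low - wps.mean():.1f} +{high - wps.mean():.1f})")
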
spacy/cli/convert.py

@@ -1,4 +1,4 @@
-from typing import Callable, Iterable, Mapping, Optional, Any, Union
+from typing import Callable, Iterable, Mapping, Optional, Any, Union, Literal
 from enum import Enum
 from pathlib import Path
 from wasabi import Printer

@@ -6,8 +6,9 @@ import srsly
 import re
 import sys
 import itertools
+from radicli import Arg, ExistingFilePath, ExistingPathOrDash, ExistingDirPathOrDash

-from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory
+from ._util import cli, _handle_renamed_language_codes, walk_directory
 from ..training import docs_to_json
 from ..tokens import Doc, DocBin
 from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs

@@ -27,8 +28,8 @@ CONVERTERS: Mapping[str, Callable[..., Iterable[Doc]]] = {
     "iob": iob_to_docs,
     "json": json_to_docs,
 }

 AUTO = "auto"
+ConvertersType = Literal["auto", "conllubio", "conllu", "conll", "ner", "iob", "json"]


 # File types that can be written to stdout

@@ -40,22 +41,36 @@ class FileTypes(str, Enum):
     spacy = "spacy"


-@app.command("convert")
-def convert_cli(
+@cli.command(
+    "convert",
     # fmt: off
-    input_path: str = Arg(..., help="Input file or directory", exists=True),
-    output_dir: Path = Arg("-", help="Output directory. '-' for stdout.", allow_dash=True, exists=True),
-    file_type: FileTypes = Opt("spacy", "--file-type", "-t", help="Type of data to produce"),
-    n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"),
-    seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"),
-    model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
-    morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
-    merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
-    converter: str = Opt(AUTO, "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
-    ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
-    lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
-    concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),
+    input_path=Arg(help="Input file or directory"),
+    output_dir=Arg(help="Output directory. '-' for stdout."),
+    file_type=Arg("--file-type", "-t", help="Type of data to produce"),
+    n_sents=Arg("--n-sents", "-n", help="Number of sentences per doc (0 to disable)"),
+    seg_sents=Arg("--seg-sents", "-s", help="Segment sentences (for -c ner)"),
+    model=Arg("--model", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
+    morphology=Arg("--morphology", "-m", help="Enable appending morphology to tags"),
+    merge_subtokens=Arg("--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
+    converter=Arg("--converter", "-c", help="Converter to use"),
+    ner_map=Arg("--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)"),
+    lang=Arg("--lang", "-l", help="Language (if tokenizer required)"),
+    concatenate=Arg("--concatenate", "-C", help="Concatenate output to a single file"),
     # fmt: on
 )
+def convert_cli(
+    input_path: ExistingPathOrDash,
+    output_dir: ExistingDirPathOrDash = "-",
+    file_type: Literal["json", "spacy"] = "spacy",
+    n_sents: int = 1,
+    seg_sents: bool = False,
+    model: Optional[str] = None,
+    morphology: bool = False,
+    merge_subtokens: bool = False,
+    converter: ConvertersType = AUTO,
+    ner_map: Optional[ExistingFilePath] = None,
+    lang: Optional[str] = None,
+    concatenate: bool = False,
+):
     """
     Convert files into json or DocBin format for training. The resulting .spacy

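Note: the FileTypes(str, Enum) pattern gives way to Literal annotations, with
the literal values serving as CLI choices (assuming radicli derives choices
from Literal types, which is what the new convert signature relies on). In
miniature, with an invented command name:

    from typing import Literal

    @cli.command("export", fmt=Arg("--fmt", "-t", help="Type of data to produce"))
    def export(fmt: Literal["json", "spacy"] = "spacy"):
        ...  # invalid values are rejected at parse time, no .value unwrapping needed
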
@@ -69,15 +84,14 @@ def convert_cli(
     DOCS: https://spacy.io/api/cli#convert
     """
     input_path = Path(input_path)
-    output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir
     silent = output_dir == "-"
     msg = Printer(no_print=silent)
     converter = _get_converter(msg, converter, input_path)
-    verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
+    verify_cli_args(msg, input_path, output_dir, file_type, converter, ner_map)
     convert(
         input_path,
         output_dir,
-        file_type=file_type.value,
+        file_type=file_type,
         n_sents=n_sents,
         seg_sents=seg_sents,
         model=model,

spacy/cli/debug_config.py

@@ -3,27 +3,31 @@ from pathlib import Path
 from wasabi import msg, table
 from thinc.api import Config
 from thinc.config import VARIABLE_RE
-import typer
+from radicli import Arg, ExistingFilePathOrDash, ExistingFilePath

-from ._util import Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli
+from ._util import cli, show_validation_error, parse_config_overrides
+from ._util import import_code
 from ..schemas import ConfigSchemaInit, ConfigSchemaTraining
 from ..util import registry
 from .. import util


-@debug_cli.command(
+@cli.subcommand_with_extra(
+    "debug",
     "config",
-    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+    # fmt: off
+    config_path=Arg(help="Path to config file"),
+    code_path=Arg("--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    show_funcs=Arg("--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),
+    show_vars=Arg("--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI"),
+    # fmt: on
 )
 def debug_config_cli(
-    # fmt: off
-    ctx: typer.Context,  # This is only used to read additional arguments
-    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
-    code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
-    show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),
-    show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")
-    # fmt: on
+    config_path: ExistingFilePathOrDash,
+    code_path: Optional[ExistingFilePath] = None,
+    show_funcs: bool = False,
+    show_vars: bool = False,
+    _extra: List[str] = [],
 ):
     """Debug a config file and show validation errors. The command will
     create all objects in the tree and validate them. Note that some config

@@ -36,7 +40,7 @@ def debug_config_cli(

     DOCS: https://spacy.io/api/cli#debug-config
     """
-    overrides = parse_config_overrides(ctx.args)
+    overrides = parse_config_overrides(_extra)
     import_code(code_path)
     debug_config(
         config_path, overrides=overrides, show_funcs=show_funcs, show_vars=show_vars

spacy/cli/debug_data.py

@@ -5,11 +5,11 @@ from collections import Counter
 import sys
 import srsly
 from wasabi import Printer, MESSAGES, msg
-import typer
 import math
+from radicli import Arg, ExistingFilePathOrDash, ExistingFilePath

-from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli, _format_number
+from ._util import cli, show_validation_error, parse_config_overrides
+from ._util import import_code, _format_number
 from ..training import Example, remove_bilu_prefix
 from ..training.initialize import get_sourced_components
 from ..schemas import ConfigSchemaTraining

@@ -40,23 +40,24 @@ BOUNDARY_DISTINCT_THRESHOLD = 1
 SPAN_LENGTH_THRESHOLD_PERCENTAGE = 90


-@debug_cli.command(
-    "data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
-)
-@app.command(
-    "debug-data",
-    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
-    hidden=True,  # hide this from main CLI help but still allow it to work with warning
+@cli.subcommand_with_extra(
+    "debug",
+    "data",
+    # fmt: off
+    config_path=Arg(help="Path to config file"),
+    code_path=Arg("--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    ignore_warnings=Arg("--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
+    verbose=Arg("--verbose", "-V", help="Print additional information and explanations"),
+    no_format=Arg("--no-format", "-NF", help="Don't pretty-print the results"),
+    # fmt: on
 )
 def debug_data_cli(
-    # fmt: off
-    ctx: typer.Context,  # This is only used to read additional arguments
-    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
-    code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
-    ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
-    verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),
-    no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"),
-    # fmt: on
+    config_path: ExistingFilePathOrDash,
+    code_path: Optional[ExistingFilePath] = None,
+    ignore_warnings: bool = False,
+    verbose: bool = False,
+    no_format: bool = False,
+    _extra: List[str] = [],
 ):
     """
     Analyze, debug and validate your training and development data. Outputs

@@ -65,13 +66,7 @@ def debug_data_cli(

     DOCS: https://spacy.io/api/cli#debug-data
     """
-    if ctx.command.name == "debug-data":
-        msg.warn(
-            "The debug-data command is now available via the 'debug data' "
-            "subcommand (without the hyphen). You can run python -m spacy debug "
-            "--help for an overview of the other available debugging commands."
-        )
-    overrides = parse_config_overrides(ctx.args)
+    overrides = parse_config_overrides(_extra)
     import_code(code_path)
     debug_data(
         config_path,

spacy/cli/debug_diff.py

@@ -1,29 +1,32 @@
 from typing import Optional

-import typer
 from wasabi import Printer, diff_strings, MarkdownRenderer
 from pathlib import Path
 from thinc.api import Config
+from radicli import Arg, ExistingFilePathOrDash, ExistingFilePath

-from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
+from ._util import cli, show_validation_error
+from .init_config import init_config, OptimizationsType
 from ..util import load_config
-from .init_config import init_config, Optimizations


-@debug_cli.command(
+@cli.subcommand(
+    "debug",
     "diff-config",
-    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+    # fmt: off
+    config_path=Arg(help="Path to config file"),
+    compare_to=Arg(help="Path to a config file to diff against, or `None` to compare against default settings"),
+    optimize=Arg("--optimize", "-o", help="Whether the user config was optimized for efficiency or accuracy. Only relevant when comparing against the default config"),
+    gpu=Arg("--gpu", "-G", help="Whether the original config can run on a GPU. Only relevant when comparing against the default config"),
+    pretraining=Arg("--pretraining", "--pt", help="Whether to compare on a config with pretraining involved. Only relevant when comparing against the default config"),
+    markdown=Arg("--markdown", "-md", help="Generate Markdown for GitHub issues"),
+    # fmt: on
 )
 def debug_diff_cli(
-    # fmt: off
-    ctx: typer.Context,
-    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
-    compare_to: Optional[Path] = Opt(None, help="Path to a config file to diff against, or `None` to compare against default settings", exists=True, allow_dash=True),
-    optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether the user config was optimized for efficiency or accuracy. Only relevant when comparing against the default config."),
-    gpu: bool = Opt(False, "--gpu", "-G", help="Whether the original config can run on a GPU. Only relevant when comparing against the default config."),
-    pretraining: bool = Opt(False, "--pretraining", "--pt", help="Whether to compare on a config with pretraining involved. Only relevant when comparing against the default config."),
-    markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues")
-    # fmt: on
+    config_path: ExistingFilePathOrDash,
+    compare_to: Optional[ExistingFilePath] = None,
+    optimize: OptimizationsType = "efficiency",
+    gpu: bool = False,
+    pretraining: bool = False,
+    markdown: bool = False,
 ):
     """Show a diff of a config file with respect to spaCy's defaults or another config file. If
     additional settings were used in the creation of the config file, then you

@@ -50,7 +53,7 @@ def debug_diff(
     config_path: Path,
     compare_to: Optional[Path],
     gpu: bool,
-    optimize: Optimizations,
+    optimize: OptimizationsType,
     pretraining: bool,
     markdown: bool,
 ):

@@ -68,7 +71,7 @@ def debug_diff(
     other_config = init_config(
         lang=lang,
         pipeline=pipeline,
-        optimize=optimize.value,
+        optimize=optimize,
         gpu=gpu,
         pretraining=pretraining,
         silent=True,

spacy/cli/debug_model.py

@@ -1,41 +1,51 @@
-from typing import Dict, Any, Optional
-from pathlib import Path
+from typing import Dict, Any, Optional, List
 import itertools

 from spacy.training import Example
 from spacy.util import resolve_dot_names
 from wasabi import msg
 from thinc.api import fix_random_seed, set_dropout_rate
 from thinc.api import Model, data_validation, set_gpu_allocator
-import typer
+from radicli import Arg, ExistingFilePathOrDash

-from ._util import Arg, Opt, debug_cli, show_validation_error
-from ._util import parse_config_overrides, string_to_list, setup_gpu
+from ._util import cli, show_validation_error
+from ._util import parse_config_overrides, convert_int_list, setup_gpu
 from ..schemas import ConfigSchemaTraining
 from ..util import registry
 from .. import util


-@debug_cli.command(
+@cli.subcommand_with_extra(
+    "debug",
     "model",
-    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+    # fmt: off
+    config_path=Arg(help="Path to config file"),
+    component=Arg(help="Name of the pipeline component of which the model should be analyzed"),
+    layers=Arg(help="Comma-separated names of layer IDs to print", converter=convert_int_list),
+    dimensions=Arg("--dimensions", "-DIM", help="Show dimensions"),
+    parameters=Arg("--parameters", "-PAR", help="Show parameters"),
+    gradients=Arg("--gradients", "-GRAD", help="Show gradients"),
+    attributes=Arg("--attributes", "-ATTR", help="Show attributes"),
+    P0=Arg("--print-step0", "-P0", help="Print model before training"),
+    P1=Arg("--print-step1", "-P1", help="Print model after initialization"),
+    P2=Arg("--print-step2", "-P2", help="Print model after training"),
+    P3=Arg("--print-step3", "-P3", help="Print final predictions"),
+    use_gpu=Arg("--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+    # fmt: on
 )
 def debug_model_cli(
-    # fmt: off
-    ctx: typer.Context,  # This is only used to read additional arguments
-    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
-    component: str = Arg(..., help="Name of the pipeline component of which the model should be analysed"),
-    layers: str = Opt("", "--layers", "-l", help="Comma-separated names of layer IDs to print"),
-    dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
-    parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"),
-    gradients: bool = Opt(False, "--gradients", "-GRAD", help="Show gradients"),
-    attributes: bool = Opt(False, "--attributes", "-ATTR", help="Show attributes"),
-    P0: bool = Opt(False, "--print-step0", "-P0", help="Print model before training"),
-    P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"),
-    P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"),
-    P3: bool = Opt(False, "--print-step3", "-P3", help="Print final predictions"),
-    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
-    # fmt: on
+    config_path: ExistingFilePathOrDash,
+    component: str,
+    layers: List[int] = [],
+    dimensions: bool = False,
+    parameters: bool = False,
+    gradients: bool = False,
+    attributes: bool = False,
+    P0: bool = False,
+    P1: bool = False,
+    P2: bool = False,
+    P3: bool = False,
+    use_gpu: int = -1,
+    _extra: List[str] = [],
 ):
     """
     Analyze a Thinc model implementation. Includes checks for internal structure

@@ -44,7 +54,6 @@ def debug_model_cli(
     DOCS: https://spacy.io/api/cli#debug-model
     """
     setup_gpu(use_gpu)
-    layers = string_to_list(layers, intify=True)
     print_settings = {
         "dimensions": dimensions,
         "parameters": parameters,

@@ -56,7 +65,7 @@ def debug_model_cli(
         "print_after_training": P2,
         "print_prediction": P3,
     }
-    config_overrides = parse_config_overrides(ctx.args)
+    config_overrides = parse_config_overrides(_extra)
     with show_validation_error(config_path):
         raw_config = util.load_config(
             config_path, overrides=config_overrides, interpolate=False

spacy/cli/download.py

@@ -1,27 +1,26 @@
-from typing import Optional, Sequence
+from typing import Optional, Sequence, List
 import requests
 import sys
 from wasabi import msg
-import typer
+from radicli import Arg

-from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
+from ._util import cli, WHEEL_SUFFIX, SDIST_SUFFIX
 from .. import about
 from ..util import is_package, get_minor_version, run_command
 from ..util import is_prerelease_version, get_installed_models
 from ..util import get_package_version


-@app.command(
+@cli.command_with_extra(
     "download",
-    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+    # fmt: off
+    model=Arg(help="Name of pipeline package to download"),
+    direct=Arg("--direct", "-D", help="Force direct download of name + version"),
+    sdist=Arg("--sdist", "-S", help="Download sdist (.tar.gz) archive instead of pre-built binary wheel"),
+    # fmt: on
 )
 def download_cli(
-    # fmt: off
-    ctx: typer.Context,
-    model: str = Arg(..., help="Name of pipeline package to download"),
-    direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"),
-    sdist: bool = Opt(False, "--sdist", "-S", help="Download sdist (.tar.gz) archive instead of pre-built binary wheel"),
-    # fmt: on
+    model: str, direct: bool = False, sdist: bool = False, _extra: List[str] = []
 ):
     """
     Download compatible trained pipeline from the default download path using

@@ -33,7 +32,7 @@ def download_cli(
     DOCS: https://spacy.io/api/cli#download
     AVAILABLE PACKAGES: https://spacy.io/models
     """
-    download(model, direct, sdist, *ctx.args)
+    download(model, direct, sdist, *_extra)


 def download(

spacy/cli/evaluate.py

@@ -1,33 +1,42 @@
-from typing import Optional, List, Dict, Any, Union
+from typing import Optional, List, Dict, Any
 from wasabi import Printer
 from pathlib import Path
 import re
 import srsly
 from thinc.api import fix_random_seed
+from radicli import Arg, ExistingPath, ExistingDirPath, ExistingFilePath

 from ..training import Corpus
 from ..tokens import Doc
-from ._util import app, Arg, Opt, setup_gpu, import_code, benchmark_cli
 from ..scorer import Scorer
+from ._util import cli, setup_gpu, import_code
 from .. import util
 from .. import displacy


-@benchmark_cli.command(
-    "accuracy",
-)
-@app.command("evaluate")
-def evaluate_cli(
+args = dict(
     # fmt: off
-    model: str = Arg(..., help="Model name or path"),
-    data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
-    output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
-    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
-    gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
-    displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
-    displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
+    model=Arg(help="Model name or path"),
+    data_path=Arg(help="Location of binary evaluation data in .spacy format"),
+    output=Arg("--output", "-o", help="Output JSON file for metrics"),
+    code_path=Arg("--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    use_gpu=Arg("--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+    gold_preproc=Arg("--gold-preproc", "-G", help="Use gold preprocessing"),
+    displacy_path=Arg("--displacy-path", "-dp", help="Directory to output rendered parses as HTML"),
+    displacy_limit=Arg("--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
     # fmt: on
 )
+
+
+@cli.subcommand("benchmark", "accuracy", **args)
+@cli.command("evaluate", **args)
+def evaluate_cli(
+    model: str,
+    data_path: ExistingPath,
+    output: Optional[ExistingFilePath] = None,
+    code_path: Optional[ExistingFilePath] = None,
+    use_gpu: int = -1,
+    gold_preproc: bool = False,
+    displacy_path: Optional[ExistingDirPath] = None,
+    displacy_limit: int = 25,
+):
     """
     Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation

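Note: registering one function under two names is now done by building the Arg
spec once as a plain dict and splatting it into both decorators, instead of
stacking typer sub-app decorators. The same pattern in miniature (names
invented):

    shared = dict(n=Arg("--n", help="Number of items"))

    @cli.subcommand("bench", "things", **shared)
    @cli.command("things", **shared)
    def things(n: int = 1):
        ...
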
spacy/cli/find_threshold.py

@@ -3,14 +3,14 @@ import operator
 from pathlib import Path
 import logging
 from typing import Optional, Tuple, Any, Dict, List

 import numpy
 import wasabi.tables
+from radicli import Arg, ExistingPath, ExistingFilePath

 from ..pipeline import TextCategorizer, MultiLabel_TextCategorizer
 from ..errors import Errors
 from ..training import Corpus
-from ._util import app, Arg, Opt, import_code, setup_gpu
+from ._util import cli, import_code, setup_gpu
 from .. import util

 _DEFAULTS = {

@@ -20,23 +20,32 @@ _DEFAULTS = {
 }


-@app.command(
+@cli.command(
     "find-threshold",
-    context_settings={"allow_extra_args": False, "ignore_unknown_options": True},
+    # fmt: off
+    model=Arg(help="Model name or path"),
+    data_path=Arg(help="Location of binary evaluation data in .spacy format"),
+    pipe_name=Arg(help="Name of pipe to examine thresholds for"),
+    threshold_key=Arg(help="Key of threshold attribute in component's configuration"),
+    scores_key=Arg(help="Metric to optimize"),
+    n_trials=Arg("--n_trials", "-n", help="Number of trials to determine optimal thresholds"),
+    code_path=Arg("--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    use_gpu=Arg("--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+    gold_preproc=Arg("--gold-preproc", "-G", help="Use gold preprocessing"),
+    verbose=Arg("--verbose", "-V", help="Display more information for debugging purposes"),
+    # fmt: on
 )
 def find_threshold_cli(
-    # fmt: off
-    model: str = Arg(..., help="Model name or path"),
-    data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
-    pipe_name: str = Arg(..., help="Name of pipe to examine thresholds for"),
-    threshold_key: str = Arg(..., help="Key of threshold attribute in component's configuration"),
-    scores_key: str = Arg(..., help="Metric to optimize"),
-    n_trials: int = Opt(_DEFAULTS["n_trials"], "--n_trials", "-n", help="Number of trials to determine optimal thresholds"),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
-    use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
-    gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"),
-    verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"),
-    # fmt: on
+    model: str,
+    data_path: ExistingPath,
+    pipe_name: str,
+    threshold_key: str,
+    scores_key: str,
+    n_trials: int = _DEFAULTS["n_trials"],
+    code_path: Optional[ExistingFilePath] = None,
+    use_gpu: int = _DEFAULTS["use_gpu"],
+    gold_preproc: bool = _DEFAULTS["gold_preproc"],
+    verbose: bool = False,
 ):
     """
     Runs prediction trials for a trained model with varying thresholds to maximize

@@ -52,7 +61,7 @@ def find_threshold_cli(

     DOCS: https://spacy.io/api/cli#find-threshold
     """
     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
     import_code(code_path)
     find_threshold(

@@ -110,19 +118,16 @@ def find_threshold(
     pipe = nlp.get_pipe(pipe_name)
     if not hasattr(pipe, "scorer"):
         raise AttributeError(Errors.E1045)
-
     if type(pipe) == TextCategorizer:
         wasabi.msg.warn(
             "The `textcat` component doesn't use a threshold as it's not applicable to the concept of "
             "exclusive classes. All thresholds will yield the same results."
         )
-
     if not silent:
         wasabi.msg.info(
             title=f"Optimizing for {scores_key} for component '{pipe_name}' with {n_trials} "
             f"trials."
         )
-
     # Load evaluation corpus.
     corpus = Corpus(data_path, gold_preproc=gold_preproc)
     dev_dataset = list(corpus(nlp))

@@ -209,9 +214,7 @@ def find_threshold(
             widths=table_col_widths,
         )
     )
-
     best_threshold = max(scores.keys(), key=(lambda key: scores[key]))
-
     # If all scores are identical, emit warning.
     if len(set(scores.values())) == 1:
         wasabi.msg.warn(

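Note: the search that fills `scores` amounts to evaluating the metric at
n_trials thresholds between 0 and 1 and keeping the argmax. Roughly, with
evaluate_with_threshold as a hypothetical stand-in for the real evaluation
logic (the exact grid lives outside this diff):

    thresholds = numpy.linspace(0, 1, n_trials)
    scores = {t: evaluate_with_threshold(t) for t in thresholds}
    best_threshold = max(scores.keys(), key=(lambda key: scores[key]))
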
@@ -223,7 +226,6 @@ def find_threshold(
             )
             else "Use `cats_macro_f` or `cats_micro_f` when optimizing the threshold for `textcat_multilabel`.",
         )
-
     else:
         if not silent:
             print(

spacy/cli/info.py

@@ -5,23 +5,31 @@ import json
 from pathlib import Path
 from wasabi import Printer, MarkdownRenderer
 import srsly
+from radicli import Arg

-from ._util import app, Arg, Opt, string_to_list
+from ._util import cli
 from .download import get_model_filename, get_latest_version
 from .. import util
 from .. import about


-@app.command("info")
-def info_cli(
+@cli.command(
+    "info",
     # fmt: off
-    model: Optional[str] = Arg(None, help="Optional loadable spaCy pipeline"),
-    markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
-    silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
-    exclude: str = Opt("labels", "--exclude", "-e", help="Comma-separated keys to exclude from the print-out"),
-    url: bool = Opt(False, "--url", "-u", help="Print the URL to download the most recent compatible version of the pipeline"),
+    model=Arg(help="Optional loadable spaCy pipeline"),
+    markdown=Arg("--markdown", "-md", help="Generate Markdown for GitHub issues"),
+    silent=Arg("--silent", "-S", help="Don't print anything (just return)"),
+    exclude=Arg("--exclude", "-e", help="Keys to exclude from the print-out"),
+    url=Arg("--url", "-u", help="Print the URL to download the most recent compatible version of the pipeline"),
     # fmt: on
-):
+)
+def info(
+    model: Optional[str] = None,
+    markdown: bool = False,
+    silent: bool = False,
+    exclude: List[str] = [],
+    url: bool = False,
+) -> Union[str, dict]:
     """
     Print info about spaCy installation. If a pipeline is specified as an argument,
     print its meta information. Flag --markdown prints details in Markdown for easy

@@ -32,24 +40,6 @@ def info_cli(

     DOCS: https://spacy.io/api/cli#info
     """
-    exclude = string_to_list(exclude)
-    info(
-        model,
-        markdown=markdown,
-        silent=silent,
-        exclude=exclude,
-        url=url,
-    )
-
-
-def info(
-    model: Optional[str] = None,
-    *,
-    markdown: bool = False,
-    silent: bool = True,
-    exclude: Optional[List[str]] = None,
-    url: bool = False,
-) -> Union[str, dict]:
     msg = Printer(no_print=silent, pretty=not silent)
     if not exclude:
         exclude = []

spacy/cli/init_config.py

@@ -1,18 +1,18 @@
-from typing import Optional, List, Tuple
-from enum import Enum
+from typing import Optional, List, Tuple, Literal
 from pathlib import Path
 from wasabi import Printer, diff_strings
 from thinc.api import Config
 import srsly
 import re
 from jinja2 import Template
+from radicli import Arg, PathOrDash, ExistingFilePath

 from .. import util
 from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
 from ..util import SimpleFrozenList
-from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
-from ._util import string_to_list, import_code, _handle_renamed_language_codes
+from ._util import cli, convert_string_list, show_validation_error, COMMAND
+from ._util import import_code, _handle_renamed_language_codes


 ROOT = Path(__file__).parent / "templates"

@@ -20,9 +20,7 @@ TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
 RECOMMENDATIONS = srsly.read_yaml(ROOT / "quickstart_training_recommendations.yml")


-class Optimizations(str, Enum):
-    efficiency = "efficiency"
-    accuracy = "accuracy"
+OptimizationsType = Literal["efficiency", "accuracy"]


 class InitValues:

@@ -33,23 +31,33 @@ class InitValues:

     lang = "en"
     pipeline = SimpleFrozenList(["tagger", "parser", "ner"])
-    optimize = Optimizations.efficiency
+    optimize = "efficiency"
     gpu = False
     pretraining = False
     force_overwrite = False


-@init_cli.command("config")
-def init_config_cli(
+@cli.subcommand(
+    "init",
+    "config",
     # fmt: off
-    output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
-    lang: str = Opt(InitValues.lang, "--lang", "-l", help="Code of the language to use"),
-    pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
-    optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
-    gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
-    pretraining: bool = Opt(InitValues.pretraining, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
-    force_overwrite: bool = Opt(InitValues.force_overwrite, "--force", "-F", help="Force overwriting the output file"),
+    output_file=Arg(help="File to save the config to or - for stdout (will only output config and no additional logging info)"),
+    lang=Arg("--lang", "-l", help="Code of the language to use"),
+    pipeline=Arg("--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')", converter=convert_string_list),
+    optimize=Arg("--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
+    gpu=Arg("--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
+    pretraining=Arg("--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
+    force_overwrite=Arg("--force", "-F", help="Force overwriting the output file"),
     # fmt: on
 )
+def init_config_cli(
+    output_file: PathOrDash,
+    lang: str = InitValues.lang,
+    pipeline: List[str] = InitValues.pipeline,
+    optimize: OptimizationsType = InitValues.optimize,
+    gpu: bool = InitValues.gpu,
+    pretraining: bool = InitValues.pretraining,
+    force_overwrite: bool = InitValues.force_overwrite,
+):
     """
     Generate a starter config file for training. Based on your requirements

@@ -59,8 +67,7 @@ def init_config_cli(

     DOCS: https://spacy.io/api/cli#init-config
     """
-    pipeline = string_to_list(pipeline)
-    is_stdout = str(output_file) == "-"
+    is_stdout = output_file == "-"
     if not is_stdout and output_file.exists() and not force_overwrite:
         msg = Printer()
         msg.fail(

@@ -70,7 +77,7 @@ def init_config_cli(
     config = init_config(
         lang=lang,
         pipeline=pipeline,
-        optimize=optimize.value,
+        optimize=optimize,
         gpu=gpu,
         pretraining=pretraining,
         silent=is_stdout,

@ -78,16 +85,25 @@ def init_config_cli(
|
|||
save_config(config, output_file, is_stdout=is_stdout)
|
||||
|
||||
|
||||
@init_cli.command("fill-config")
|
||||
def init_fill_config_cli(
|
||||
@cli.subcommand(
|
||||
"init",
|
||||
"fill-config",
|
||||
# fmt: off
|
||||
base_path: Path = Arg(..., help="Path to base config to fill", exists=True, dir_okay=False),
|
||||
output_file: Path = Arg("-", help="Path to output .cfg file (or - for stdout)", allow_dash=True),
|
||||
distillation: bool = Opt(False, "--distillation", "-dt", help="Include config for distillation (with 'spacy distill')"),
|
||||
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
|
||||
diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"),
|
||||
code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
base_path=Arg(help="Path to base config to fill"),
|
||||
output_file=Arg(help="Path to output .cfg file (or - for stdout)"),
|
||||
distillation=Arg("--distillation", "-dt", help="Include config for distillation (with 'spacy distill')"),
|
||||
pretraining=Arg("--pretraining", "-pt", help="Include config for pretraining (with `spacy pretrain`)"),
|
||||
diff=Arg("--diff", "-D", help="Print a visual diff highlighting the changes"),
|
||||
code_path=Arg("--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
# fmt: on
|
||||
)
|
||||
def init_fill_config_cli(
|
||||
base_path: ExistingFilePath,
|
||||
output_file: PathOrDash = "-",
|
||||
distillation: bool = False,
|
||||
pretraining: bool = False,
|
||||
diff: bool = False,
|
||||
code_path: Optional[ExistingFilePath] = None,
|
||||
):
|
||||
"""
|
||||
Fill partial config file with default values. Will add all missing settings
|
||||
|
|
|
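The registration pattern above recurs throughout this diff: with radicli, flag strings and help texts move into the decorator as `Arg(...)` entries keyed by parameter name, while the function signature keeps plain type hints and defaults that drive argument conversion. A minimal self-contained sketch of that shape (the `greet` command and its arguments are hypothetical, for illustration only; only the `Radicli`/`Arg` API used in this diff is assumed):

```python
from radicli import Radicli, Arg

cli = Radicli()


@cli.command(
    "greet",
    # An Arg without a flag string becomes a positional argument.
    name=Arg(help="Name to greet"),
    times=Arg("--times", "-t", help="How many times to repeat the greeting"),
    shout=Arg("--shout", "-S", help="Uppercase the greeting"),
)
def greet(name: str, times: int = 1, shout: bool = False):
    """Greet somebody on the command line."""
    text = f"Hello, {name}!"
    for _ in range(times):
        print(text.upper() if shout else text)


if __name__ == "__main__":
    cli.run()  # e.g. python demo.py greet Ada --times 2 -S
```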
@@ -1,30 +1,42 @@
from typing import Optional
from typing import Optional, Literal, List
import logging
from pathlib import Path
from wasabi import msg
import typer
import srsly
from radicli import Arg, ExistingPath, ExistingFilePathOrDash, ExistingFilePath

from .. import util
from ..training.initialize import init_nlp, convert_vectors
from ..language import Language
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import cli, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu, _handle_renamed_language_codes


@init_cli.command("vectors")
def init_vectors_cli(
@cli.subcommand(
    "init",
    "vectors",
    # fmt: off
    lang: str = Arg(..., help="The language of the nlp object to create"),
    vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True),
    output_dir: Path = Arg(..., help="Pipeline output directory"),
    prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
    truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
    mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
    name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
    lang=Arg(help="The language of the nlp object to create"),
    vectors_loc=Arg(help="Vectors file in Word2Vec format"),
    output_dir=Arg(help="Pipeline output directory"),
    prune=Arg("--prune", "-p", help="Optional number of vectors to prune to"),
    truncate=Arg("--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
    mode=Arg("--mode", "-m", help="Vectors mode: default or floret"),
    name=Arg("--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
    verbose=Arg("--verbose", "-V", help="Display more information for debugging purposes"),
    jsonl_loc=Arg("--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file"),
    # fmt: on
)
def init_vectors_cli(
    lang: str,
    vectors_loc: ExistingPath,
    output_dir: Path,
    prune: int = -1,
    truncate: int = 0,
    mode: Literal["default", "floret"] = "default",
    name: Optional[str] = None,
    verbose: bool = False,
    jsonl_loc: Optional[Path] = None,
):
    """Convert word vectors for use with spaCy. Will export an nlp object that
    you can use in the [initialize] block of your config to initialize
@@ -66,23 +78,28 @@ def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
        lexeme.set_attrs(**attrs)


@init_cli.command(
@cli.subcommand_with_extra(
    "init",
    "nlp",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
    hidden=True,
    # fmt: off
    config_path=Arg(help="Path to config file"),
    output_path=Arg(help="Output directory for the prepared data"),
    code_path=Arg("--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose=Arg("--verbose", "-V", help="Display more information for debugging purposes"),
    use_gpu=Arg("--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    # fmt: on
)
def init_pipeline_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    output_path: Path = Arg(..., help="Output directory for the prepared data"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
    config_path: ExistingFilePathOrDash,
    output_path: Path,
    code_path: Optional[ExistingFilePath] = None,
    verbose: bool = False,
    use_gpu: int = -1,
    _extra: List[str] = [],
):
    """Initialize a pipeline."""
    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
    overrides = parse_config_overrides(ctx.args)
    overrides = parse_config_overrides(_extra)
    import_code(code_path)
    setup_gpu(use_gpu)
    with show_validation_error(config_path):
@@ -93,19 +110,24 @@ def init_pipeline_cli(
    msg.good(f"Saved initialized pipeline to {output_path}")


@init_cli.command(
@cli.subcommand_with_extra(
    "init",
    "labels",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
    # fmt: off
    config_path=Arg(help="Path to config file"),
    output_path=Arg(help="Output directory for the labels"),
    code_path=Arg("--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose=Arg("--verbose", "-V", help="Display more information for debugging purposes"),
    use_gpu=Arg("--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    # fmt: on
)
def init_labels_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    output_path: Path = Arg(..., help="Output directory for the labels"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
    config_path: ExistingFilePathOrDash,
    output_path: Path,
    code_path: Optional[ExistingFilePath] = None,
    verbose: bool = False,
    use_gpu: int = -1,
    _extra: List[str] = [],
):
    """Generate JSON files for the labels in the data. This helps speed up the
    training process, since spaCy won't have to preprocess the data to
@@ -113,7 +135,7 @@ def init_labels_cli(
    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
    if not output_path.exists():
        output_path.mkdir(parents=True)
    overrides = parse_config_overrides(ctx.args)
    overrides = parse_config_overrides(_extra)
    import_code(code_path)
    setup_gpu(use_gpu)
    with show_validation_error(config_path):
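Commands like `init nlp` and `init labels` above still accept arbitrary trailing `--key value` pairs as config overrides. The radicli spelling for this is `command_with_extra`/`subcommand_with_extra`, which collects unrecognized arguments into the `_extra` list instead of rejecting them, so the body can hand them to `parse_config_overrides`. A rough sketch of the same mechanism with a hypothetical command and a deliberately simplified override parser (spaCy's real parser does more validation):

```python
from typing import Any, Dict, List

from radicli import Radicli, Arg

cli = Radicli()


def parse_overrides(extra: List[str]) -> Dict[str, Any]:
    # Simplified stand-in for spaCy's parse_config_overrides: pairs of
    # "--dotted.key value" become {"dotted.key": "value"}.
    return {key.lstrip("-"): value for key, value in zip(extra[::2], extra[1::2])}


@cli.command_with_extra(
    "show-overrides",
    config=Arg(help="Path to config file"),
)
def show_overrides(config: str, _extra: List[str] = []):
    """Print the config path and any extra --key value overrides."""
    print(config, parse_overrides(_extra))


if __name__ == "__main__":
    cli.run()  # e.g. python demo.py show-overrides cfg.cfg --paths.train x.spacy
```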
@@ -1,4 +1,4 @@
from typing import Optional, Union, Any, Dict, List, Tuple, cast
from typing import Optional, Union, Any, Dict, List, Tuple, Literal, cast
import shutil
from pathlib import Path
from wasabi import Printer, MarkdownRenderer, get_raw_input
@@ -8,26 +8,38 @@ from catalogue import RegistryError
import srsly
import sys
import re
from radicli import Arg, ExistingDirPath, ExistingFilePath

from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX
from ._util import cli, convert_path_list, WHEEL_SUFFIX, SDIST_SUFFIX
from ..schemas import validate, ModelMetaSchema
from .. import util
from .. import about


@app.command("package")
def package_cli(
@cli.command(
    "package",
    # fmt: off
    input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False),
    output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
    code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"),
    meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
    create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"),
    name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
    version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
    build: str = Opt("sdist", "--build", "-b", help="Comma-separated formats to build: sdist and/or wheel, or none."),
    force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing data in output directory"),
    input_dir=Arg(help="Directory with pipeline data"),
    output_dir=Arg(help="Output parent directory"),
    code_paths=Arg("--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package", converter=convert_path_list),
    meta_path=Arg("--meta", "-m", help="Path to meta.json"),
    create_meta=Arg("--create-meta", "-C", help="Create meta.json, even if one exists"),
    name=Arg("--name", "-n", help="Package name to override meta"),
    version=Arg("--version", "-v", help="Package version to override meta"),
    build=Arg("--build", "-b", help="Artifact to build. Can be set multiple times, 'sdist', 'wheel' or 'none'"),
    force=Arg("--force", "-F", help="Force overwriting existing data in output directory"),
    # fmt: on
)
def package_cli(
    input_dir: ExistingDirPath,
    output_dir: ExistingDirPath,
    code_paths: List[Path] = [],
    meta_path: Optional[ExistingFilePath] = None,
    create_meta: bool = False,
    name: Optional[str] = None,
    version: Optional[str] = None,
    build: List[Literal["sdist", "wheel", "none"]] = ["sdist"],
    force: bool = False,
):
    """
    Generate an installable Python package for a pipeline. Includes binary data,
@@ -44,8 +56,6 @@ def package_cli(

    DOCS: https://spacy.io/api/cli#package
    """
    create_sdist, create_wheel = get_build_formats(string_to_list(build))
    code_paths = [Path(p.strip()) for p in string_to_list(code_paths)]
    package(
        input_dir,
        output_dir,
@@ -54,8 +64,8 @@ def package_cli(
        name=name,
        version=version,
        create_meta=create_meta,
        create_sdist=create_sdist,
        create_wheel=create_wheel,
        create_sdist="sdist" in build and "none" not in build,
        create_wheel="wheel" in build and "none" not in build,
        force=force,
        silent=False,
    )
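Two signature idioms in the new `package` command are worth noting: a `Literal[...]` annotation constrains a value to fixed choices, and a `List[...]` annotation lets the flag be passed multiple times, replacing the old comma-separated string that had to go through `string_to_list`. A hedged sketch of both together (the `build` command below is hypothetical):

```python
from typing import List, Literal

from radicli import Radicli, Arg

cli = Radicli()


@cli.command(
    "build",
    formats=Arg("--format", "-f", help="Artifact to build. Can be set multiple times: 'sdist', 'wheel' or 'none'"),
)
def build(formats: List[Literal["sdist", "wheel", "none"]] = ["sdist"]):
    """Report which artifacts would be built."""
    make_sdist = "sdist" in formats and "none" not in formats
    make_wheel = "wheel" in formats and "none" not in formats
    print(f"sdist: {make_sdist}, wheel: {make_wheel}")


if __name__ == "__main__":
    cli.run()  # e.g. python demo.py build -f sdist -f wheel
```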
@@ -1,29 +1,34 @@
from typing import Optional
from typing import Optional, List
from pathlib import Path
from wasabi import msg
import typer
import re
from radicli import Arg, ExistingFilePathOrDash, ExistingFilePath

from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import cli, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu
from ..training.pretrain import pretrain
from ..util import load_config


@app.command(
@cli.command_with_extra(
    "pretrain",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
    # fmt: off
    config_path=Arg(help="Path to config file"),
    output_dir=Arg(help="Directory to write weights to on each epoch"),
    code_path=Arg("--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    resume_path=Arg("--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
    epoch_resume=Arg("--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
    use_gpu=Arg("--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    # fmt: on
)
def pretrain_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True),
    output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    # fmt: on
    config_path: ExistingFilePathOrDash,
    output_dir: Path,
    code_path: Optional[ExistingFilePath] = None,
    resume_path: Optional[ExistingFilePath] = None,
    epoch_resume: Optional[int] = None,
    use_gpu: int = -1,
    _extra: List[str] = [],
):
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
@@ -46,7 +51,7 @@ def pretrain_cli(

    DOCS: https://spacy.io/api/cli#pretrain
    """
    config_overrides = parse_config_overrides(ctx.args)
    config_overrides = parse_config_overrides(_extra)
    import_code(code_path)
    verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
    setup_gpu(use_gpu)
@@ -7,23 +7,25 @@ import pstats
import sys
import itertools
from wasabi import msg, Printer
import typer
from radicli import Arg, ExistingPathOrDash

from ._util import app, debug_cli, Arg, Opt, NAME
from ._util import cli
from ..language import Language
from ..util import load_model


@debug_cli.command("profile")
@app.command("profile", hidden=True)
def profile_cli(
@cli.subcommand(
    "debug",
    "profile",
    # fmt: off
    ctx: typer.Context,  # This is only used to read current calling context
    model: str = Arg(..., help="Trained pipeline to load"),
    inputs: Optional[Path] = Arg(None, help="Location of input file. '-' for stdin.", exists=True, allow_dash=True),
    n_texts: int = Opt(10000, "--n-texts", "-n", help="Maximum number of texts to use if available"),
    model=Arg(help="Trained pipeline to load"),
    inputs=Arg(help="Location of input file. '-' for stdin."),
    n_texts=Arg("--n-texts", "-n", help="Maximum number of texts to use if available"),
    # fmt: on
):
)
def profile(
    model: str, inputs: Optional[ExistingPathOrDash] = None, n_texts: int = 10000
) -> None:
    """
    Profile which functions take the most time in a spaCy pipeline.
    Input should be formatted as one JSON object per line with a key "text".
@@ -32,16 +34,6 @@ def profile_cli(

    DOCS: https://spacy.io/api/cli#debug-profile
    """
    if ctx.parent.command.name == NAME:  # type: ignore[union-attr] # called as top-level command
        msg.warn(
            "The profile command is now available via the 'debug profile' "
            "subcommand. You can run python -m spacy debug --help for an "
            "overview of the other available debugging commands."
        )
    profile(model, inputs=inputs, n_texts=n_texts)


def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None:
    if inputs is not None:
        texts = _read_inputs(inputs, msg)
        texts = list(itertools.islice(texts, n_texts))
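radicli also provides the path annotations imported across these files (`ExistingPath`, `ExistingFilePath`, `ExistingDirPath`, and the `...OrDash` variants that additionally accept `-` for stdin/stdout). They validate at parse time, replacing typer's per-argument `exists=`/`dir_okay=`/`allow_dash=` keywords. A small hypothetical sketch; the assumption that an `...OrDash` value arrives as the literal string `"-"` follows the `output_file == "-"` checks used elsewhere in this diff:

```python
import sys
from pathlib import Path
from typing import Optional

from radicli import Radicli, Arg, ExistingFilePath, ExistingPathOrDash

cli = Radicli()


@cli.command(
    "count-lines",
    source=Arg(help="Input file, or - to read from stdin"),
    log=Arg("--log", "-l", help="Existing file to append the count to"),
)
def count_lines(source: ExistingPathOrDash, log: Optional[ExistingFilePath] = None):
    """Count lines in a file (or stdin) and optionally append the result to a log."""
    if source == "-":  # assumed: the ...OrDash types pass "-" through unchanged
        n = sum(1 for _ in sys.stdin)
    else:
        n = sum(1 for _ in Path(source).open(encoding="utf8"))
    print(n)
    if log is not None:
        with log.open("a", encoding="utf8") as f:
            f.write(f"{n}\n")


if __name__ == "__main__":
    cli.run()
```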
@@ -1,14 +1,14 @@
from typing import Any, Dict, Optional
from typing import Any, Dict, Optional, List
from pathlib import Path
from wasabi import msg
import os
import re
import shutil
import requests
import typer
from radicli import Arg, ExistingDirPath

from ...util import ensure_path, working_dir
from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
from .._util import cli, PROJECT_FILE, load_project_config
from .._util import get_checksum, download_file, git_checkout, get_git_version
from .._util import SimpleFrozenDict, parse_config_overrides

@@ -16,17 +16,20 @@ from .._util import SimpleFrozenDict, parse_config_overrides
EXTRA_DEFAULT = False


@project_cli.command(
@cli.subcommand(
    "project",
    "assets",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
    # fmt: off
    project_dir=Arg(help="Path to cloned project. Defaults to current working directory"),
    sparse_checkout=Arg("--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+"),
    extra=Arg("--extra", "-e", help="Download all assets, including those marked as 'extra'"),
    # fmt: on
)
def project_assets_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+."),
    extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.")
    # fmt: on
    project_dir: ExistingDirPath = Path.cwd(),
    sparse_checkout: bool = False,
    extra: bool = False,
    _extra: List[str] = [],
):
    """Fetch project assets like datasets and pretrained weights. Assets are
    defined in the "assets" section of the project.yml. If a checksum is
@@ -35,7 +38,7 @@ def project_assets_cli(

    DOCS: https://spacy.io/api/cli#project-assets
    """
    overrides = parse_config_overrides(ctx.args)
    overrides = parse_config_overrides(_extra)
    project_assets(
        project_dir,
        overrides=overrides,
@@ -3,10 +3,11 @@ from pathlib import Path
from wasabi import msg
import subprocess
import re
from radicli import Arg

from ... import about
from ...util import ensure_path
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
from .._util import cli, COMMAND, PROJECT_FILE
from .._util import git_checkout, get_git_version, git_repo_branch_exists

DEFAULT_REPO = about.__projects__
@@ -14,15 +15,23 @@ DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
DEFAULT_BRANCHES = ["main", "master"]


@project_cli.command("clone")
def project_clone_cli(
@cli.subcommand(
    "project",
    "clone",
    # fmt: off
    name: str = Arg(..., help="The name of the template to clone"),
    dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
    repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"),
    branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"),
    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+.")
    name=Arg(help="The name of the template to clone"),
    dest=Arg(help="Where to clone the project. Defaults to current working directory"),
    repo=Arg("--repo", "-r", help="The repository to clone from"),
    branch=Arg("--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"),
    sparse_checkout=Arg("--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+"),
    # fmt: on
)
def project_clone_cli(
    name: str,
    dest: Optional[Path] = None,
    repo: str = DEFAULT_REPO,
    branch: Optional[str] = None,
    sparse_checkout: bool = False,
):
    """Clone a project template from a repository. Calls into "git" and will
    only download the files from the given subdirectory. The GitHub repo
@@ -1,8 +1,9 @@
from pathlib import Path
from wasabi import msg, MarkdownRenderer
from radicli import Arg, ExistingDirPath, PathOrDash

from ...util import working_dir
from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
from .._util import cli, PROJECT_FILE, load_project_config


DOCS_URL = "https://spacy.io"
@@ -27,14 +28,20 @@ MARKER_END = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS END (do not remove) -->"
MARKER_IGNORE = "<!-- SPACY PROJECT: IGNORE -->"


@project_cli.command("document")
def project_document_cli(
@cli.subcommand(
    "project",
    "document",
    # fmt: off
    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
    output_file: Path = Opt("-", "--output", "-o", help="Path to output Markdown file for output. Defaults to - for standard output"),
    no_emoji: bool = Opt(False, "--no-emoji", "-NE", help="Don't use emoji")
    project_dir=Arg(help="Path to cloned project. Defaults to current working directory."),
    output_file=Arg("--output", "-o", help="Path to output Markdown file for output. Defaults to - for standard output"),
    no_emoji=Arg("--no-emoji", "-NE", help="Don't use emoji"),
    # fmt: on
):
)
def project_document(
    project_dir: ExistingDirPath = Path.cwd(),
    output_file: PathOrDash = "-",
    no_emoji: bool = False,
) -> None:
    """
    Auto-generate a README.md for a project. If the content is saved to a file,
    hidden markers are added so you can add custom content before or after the
@@ -43,13 +50,7 @@ def project_document_cli(

    DOCS: https://spacy.io/api/cli#project-document
    """
    project_document(project_dir, output_file, no_emoji=no_emoji)


def project_document(
    project_dir: Path, output_file: Path, *, no_emoji: bool = False
) -> None:
    is_stdout = str(output_file) == "-"
    is_stdout = output_file == "-"
    config = load_project_config(project_dir)
    md = MarkdownRenderer(no_emoji=no_emoji)
    md.add(MARKER_START)
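`project document` keeps regeneration idempotent with the hidden marker comments defined above: the auto-generated block sits between start and end markers, and on the next run only the text between them is replaced, so hand-written content around it survives. A simplified, self-contained sketch of that regeneration logic (shortened marker strings; not the exact spaCy implementation):

```python
MARKER_START = "<!-- AUTO-GENERATED DOCS START (do not remove) -->"
MARKER_END = "<!-- AUTO-GENERATED DOCS END (do not remove) -->"


def upsert_generated_docs(existing: str, generated: str) -> str:
    """Replace the marked block in existing text, or append one if missing."""
    block = f"{MARKER_START}\n{generated}\n{MARKER_END}"
    if MARKER_START in existing and MARKER_END in existing:
        before = existing.split(MARKER_START, 1)[0]
        after = existing.split(MARKER_END, 1)[1]
        return before + block + after
    return existing + "\n\n" + block  # first run: append the marked block


readme = "# My project\n\nCustom intro.\n"
readme = upsert_generated_docs(readme, "## Commands\n...")
readme = upsert_generated_docs(readme, "## Commands\n(updated)")
assert "Custom intro." in readme and "(updated)" in readme
```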
@@ -1,207 +0,0 @@
"""This module contains helpers and subcommands for integrating spaCy projects
with Data Version Control (DVC). https://dvc.org"""
from typing import Dict, Any, List, Optional, Iterable
import subprocess
from pathlib import Path
from wasabi import msg

from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli
from .._util import Arg, Opt, NAME, COMMAND
from ...util import working_dir, split_command, join_command, run_command
from ...util import SimpleFrozenList


DVC_CONFIG = "dvc.yaml"
DVC_DIR = ".dvc"
UPDATE_COMMAND = "dvc"
DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
# edited your {PROJECT_FILE}, you can regenerate this file by running:
# {COMMAND} project {UPDATE_COMMAND}"""


@project_cli.command(UPDATE_COMMAND)
def project_update_dvc_cli(
    # fmt: off
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
    quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
    # fmt: on
):
    """Auto-generate Data Version Control (DVC) config. A DVC
    project can only define one pipeline, so you need to specify one workflow
    defined in the project.yml. If no workflow is specified, the first defined
    workflow is used. The DVC config will only be updated if the project.yml
    changed.

    DOCS: https://spacy.io/api/cli#project-dvc
    """
    project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)


def project_update_dvc(
    project_dir: Path,
    workflow: Optional[str] = None,
    *,
    verbose: bool = False,
    quiet: bool = False,
    force: bool = False,
) -> None:
    """Update the auto-generated Data Version Control (DVC) config file. A DVC
    project can only define one pipeline, so you need to specify one workflow
    defined in the project.yml. Will only update the file if the checksum changed.

    project_dir (Path): The project directory.
    workflow (Optional[str]): Optional name of workflow defined in project.yml.
        If not set, the first workflow will be used.
    verbose (bool): Print more info.
    quiet (bool): Print less info.
    force (bool): Force update DVC config.
    """
    config = load_project_config(project_dir)
    updated = update_dvc_config(
        project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
    )
    help_msg = "To execute the workflow with DVC, run: dvc repro"
    if updated:
        msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg)
    else:
        msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg)


def update_dvc_config(
    path: Path,
    config: Dict[str, Any],
    workflow: Optional[str] = None,
    verbose: bool = False,
    quiet: bool = False,
    force: bool = False,
) -> bool:
    """Re-run the DVC commands in dry mode and update dvc.yaml file in the
    project directory. The file is auto-generated based on the config. The
    first line of the auto-generated file specifies the hash of the config
    dict, so if any of the config values change, the DVC config is regenerated.

    path (Path): The path to the project directory.
    config (Dict[str, Any]): The loaded project.yml.
    verbose (bool): Whether to print additional info (via DVC).
    quiet (bool): Don't output anything (via DVC).
    force (bool): Force update, even if hashes match.
    RETURNS (bool): Whether the DVC config file was updated.
    """
    ensure_dvc(path)
    workflows = config.get("workflows", {})
    workflow_names = list(workflows.keys())
    check_workflows(workflow_names, workflow)
    if not workflow:
        workflow = workflow_names[0]
    config_hash = get_hash(config)
    path = path.resolve()
    dvc_config_path = path / DVC_CONFIG
    if dvc_config_path.exists():
        # Check if the file was generated using the current config, if not, redo
        with dvc_config_path.open("r", encoding="utf8") as f:
            ref_hash = f.readline().strip().replace("# ", "")
        if ref_hash == config_hash and not force:
            return False  # Nothing has changed in project.yml, don't need to update
        dvc_config_path.unlink()
    dvc_commands = []
    config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}

    # some flags that apply to every command
    flags = []
    if verbose:
        flags.append("--verbose")
    if quiet:
        flags.append("--quiet")

    for name in workflows[workflow]:
        command = config_commands[name]
        deps = command.get("deps", [])
        outputs = command.get("outputs", [])
        outputs_no_cache = command.get("outputs_no_cache", [])
        if not deps and not outputs and not outputs_no_cache:
            continue
        # Default to the working dir as the project path since dvc.yaml is auto-generated
        # and we don't want arbitrary paths in there
        project_cmd = ["python", "-m", NAME, "project", "run", name]
        deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
        outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
        outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]

        dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
        if command.get("no_skip"):
            dvc_cmd.append("--always-changed")
        full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
        dvc_commands.append(join_command(full_cmd))

    if not dvc_commands:
        # If we don't check for this, then there will be an error when reading the
        # config, since DVC wouldn't create it.
        msg.fail(
            "No usable commands for DVC found. This can happen if none of your "
            "commands have dependencies or outputs.",
            exits=1,
        )

    with working_dir(path):
        for c in dvc_commands:
            dvc_command = "dvc " + c
            run_command(dvc_command)
    with dvc_config_path.open("r+", encoding="utf8") as f:
        content = f.read()
        f.seek(0, 0)
        f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
    return True


def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
    """Validate workflows provided in project.yml and check that a given
    workflow can be used to generate a DVC config.

    workflows (List[str]): Names of the available workflows.
    workflow (Optional[str]): The name of the workflow to convert.
    """
    if not workflows:
        msg.fail(
            f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
            f"define at least one list of commands.",
            exits=1,
        )
    if workflow is not None and workflow not in workflows:
        msg.fail(
            f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
            f"Available workflows: {', '.join(workflows)}",
            exits=1,
        )
    if not workflow:
        msg.warn(
            f"No workflow specified for DVC pipeline. Using the first workflow "
            f"defined in {PROJECT_FILE}: '{workflows[0]}'"
        )


def ensure_dvc(project_dir: Path) -> None:
    """Ensure that the "dvc" command is available and that the current project
    directory is an initialized DVC project.
    """
    try:
        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
    except Exception:
        msg.fail(
            "To use spaCy projects with DVC (Data Version Control), DVC needs "
            "to be installed and the 'dvc' command needs to be available",
            "You can install the Python package from pip (pip install dvc) or "
            "conda (conda install -c conda-forge dvc). For more details, see the "
            "documentation: https://dvc.org/doc/install",
            exits=1,
        )
    if not (project_dir / ".dvc").exists():
        msg.fail(
            "Project not initialized as a DVC project",
            "To initialize a DVC project, you can run 'dvc init' in the project "
            "directory. For more details, see the documentation: "
            "https://dvc.org/doc/command-reference/init",
            exits=1,
        )
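The deleted DVC integration relied on a hash guard that is a useful pattern on its own: the first line of the generated dvc.yaml records a hash of the project.yml it was built from, and the file is only rewritten when that hash changes or --force is passed. A minimal sketch of the same guard, independent of DVC (the md5-of-sorted-JSON hash is an assumption standing in for spaCy's get_hash):

```python
import hashlib
import json
from pathlib import Path
from typing import Any, Callable, Dict


def config_hash(config: Dict[str, Any]) -> str:
    """Stable hash of a JSON-serializable config dict (stand-in for get_hash)."""
    return hashlib.md5(json.dumps(config, sort_keys=True).encode("utf8")).hexdigest()


def update_generated_file(
    path: Path,
    config: Dict[str, Any],
    render: Callable[[Dict[str, Any]], str],
    force: bool = False,
) -> bool:
    """Regenerate path from config unless the hash stored on line one still matches."""
    new_hash = config_hash(config)
    if path.exists():
        with path.open(encoding="utf8") as f:
            ref_hash = f.readline().strip().replace("# ", "")
        if ref_hash == new_hash and not force:
            return False  # config unchanged, nothing to regenerate
    path.write_text(f"# {new_hash}\n{render(config)}", encoding="utf8")
    return True
```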
@@ -1,18 +1,23 @@
from pathlib import Path
from wasabi import msg
from radicli import Arg, ExistingDirPath

from .remote_storage import RemoteStorage
from .remote_storage import get_command_hash
from .._util import project_cli, Arg, logger
from .._util import load_project_config
from .._util import cli, load_project_config, logger
from .run import update_lockfile


@project_cli.command("pull")
def project_pull_cli(
@cli.subcommand(
    "project",
    "pull",
    # fmt: off
    remote: str = Arg("default", help="Name or path of remote storage"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    remote=Arg(help="Name or path of remote storage"),
    project_dir=Arg(help="Location of project directory. Defaults to current working directory."),
    # fmt: on
)
def project_pull_cli(
    remote: str = "default", project_dir: ExistingDirPath = Path.cwd()
):
    """Retrieve available precomputed outputs from a remote storage.
    You can alias remotes in your project.yml by mapping them to storage paths.
@@ -1,19 +1,25 @@
from pathlib import Path
from wasabi import msg
from radicli import Arg, ExistingDirPath

from .remote_storage import RemoteStorage
from .remote_storage import get_content_hash, get_command_hash
from .._util import load_project_config
from .._util import project_cli, Arg, logger
from .._util import cli, load_project_config, logger


@project_cli.command("push")
def project_push_cli(
@cli.subcommand(
    "project",
    "push",
    # fmt: off
    remote: str = Arg("default", help="Name or path of remote storage"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    remote=Arg(help="Name or path of remote storage"),
    project_dir=Arg(help="Location of project directory. Defaults to current working directory."),
    # fmt: on
)
def project_push_cli(
    remote: str = "default", project_dir: ExistingDirPath = Path.cwd()
):
    """Persist outputs to a remote storage. You can alias remotes in your
    """
    Persist outputs to a remote storage. You can alias remotes in your
    project.yml by mapping them to storage paths. A storage can be anything that
    the smart-open library can upload to, e.g. AWS, Google Cloud Storage, SSH,
    local directories etc.
@@ -1,13 +1,12 @@
from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
import os.path
from pathlib import Path

import pkg_resources
from wasabi import msg
from wasabi.util import locale_escape
import sys
import srsly
import typer
from radicli import Arg, ExistingDirPath

from ... import about
from ...git_info import GIT_VERSION
@@ -15,21 +14,27 @@ from ...util import working_dir, run_command, split_command, is_cwd, join_command
from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
from ...util import check_bool_env_var, SimpleFrozenDict
from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides
from .._util import cli, get_checksum, COMMAND, parse_config_overrides


@project_cli.command(
    "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
@cli.subcommand_with_extra(
    "project",
    "run",
    # fmt: off
    subcommand=Arg(help=f"Name of command defined in the {PROJECT_FILE}"),
    project_dir=Arg(help="Location of project directory. Defaults to current working directory."),
    force=Arg("--force", "-F", help="Force re-running steps, even if nothing changed"),
    dry=Arg("--dry", "-D", help="Perform a dry run and don't execute scripts"),
    show_help=Arg("--help", help="Show help message and available subcommands"),
    # fmt: on
)
def project_run_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
    subcommand: Optional[str] = None,
    project_dir: ExistingDirPath = Path.cwd(),
    force: bool = False,
    dry: bool = False,
    show_help: bool = False,
    _extra: List[str] = [],
):
    """Run a named command or workflow defined in the project.yml. If a workflow
    name is specified, all commands in the workflow are run, in order. If
@@ -41,7 +46,7 @@ def project_run_cli(
    if show_help or not subcommand:
        print_run_help(project_dir, subcommand)
    else:
        overrides = parse_config_overrides(ctx.args)
        overrides = parse_config_overrides(_extra)
        project_run(project_dir, subcommand, overrides=overrides, force=force, dry=dry)
@@ -1,29 +1,34 @@
from typing import Optional, Dict, Any, Union
from typing import Optional, Dict, Any, Union, List
from pathlib import Path
from wasabi import msg
import typer
import logging
import sys
from radicli import Arg, ExistingFilePathOrDash

from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import cli, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu
from ..training.loop import train as train_nlp
from ..training.initialize import init_nlp
from .. import util


@app.command(
    "train", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
@cli.command_with_extra(
    "train",
    # fmt: off
    config_path=Arg(help="Path to config file"),
    output_path=Arg("--output", "-o", help="Output directory to store trained pipeline in"),
    code_path=Arg("--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose=Arg("--verbose", "-V", help="Display more information for debugging purposes"),
    use_gpu=Arg("--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    # fmt: on
)
def train_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
    config_path: ExistingFilePathOrDash,
    output_path: Optional[Path] = None,
    code_path: Optional[Path] = None,
    verbose: bool = False,
    use_gpu: int = -1,
    _extra: List[str] = [],
):
    """
    Train or update a spaCy pipeline. Requires data in spaCy's binary format. To
@@ -40,7 +45,7 @@ def train_cli(
    DOCS: https://spacy.io/api/cli#train
    """
    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
    overrides = parse_config_overrides(ctx.args)
    overrides = parse_config_overrides(_extra)
    import_code(code_path)
    train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
@@ -5,14 +5,14 @@ import requests
from wasabi import msg, Printer
import warnings

from ._util import app
from ._util import cli
from .. import about
from ..util import get_package_version, get_installed_models, get_minor_version
from ..util import get_package_path, get_model_meta, is_compatible_version


@app.command("validate")
def validate_cli():
@cli.command("validate")
def validate() -> None:
    """
    Validate the currently installed pipeline packages and spaCy version. Checks
    if the installed packages are compatible and shows upgrade instructions if
@@ -20,10 +20,6 @@ def validate_cli():

    DOCS: https://spacy.io/api/cli#validate
    """
    validate()


def validate() -> None:
    model_pkgs, compat = get_model_pkgs()
    spacy_version = get_minor_version(about.__version__)
    current_compat = compat.get(spacy_version, {})
@@ -1,20 +1,39 @@
import pytest
import os
from pathlib import Path
from typer.testing import CliRunner
from spacy.tokens import DocBin, Doc
from spacy.cli._util import cli

from spacy.cli._util import app
from .util import make_tempdir, normalize_whitespace


def test_convert_auto():
@pytest.fixture(scope="session")
def all_commands():
    result = [*cli.commands.values()]
    for subcommands in cli.subcommands.values():
        result.extend(subcommands.values())
    return result


def test_help_texts(all_commands):
    """Test that all commands provide docstrings and argument help texts."""
    for command in all_commands:
        assert command.description, f"no docstring for {command.display_name}"
        for arg in command.args:
            if arg.id == cli.extra_key:
                continue
            assert arg.arg.help, f"no help text for {command.display_name} -> {arg.id}"


def test_convert_auto(capsys):
    with make_tempdir() as d_in, make_tempdir() as d_out:
        for f in ["data1.iob", "data2.iob", "data3.iob"]:
            Path(d_in / f).touch()

        # ensure that "automatic" suffix detection works
        result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
        assert "Generated output file" in result.stdout
        cli.run(["spacy", "convert", str(d_in), str(d_out)])
        captured = capsys.readouterr()
        assert "Generated output file" in captured.out
        out_files = os.listdir(d_out)
        assert len(out_files) == 3
        assert "data1.spacy" in out_files
@@ -22,28 +41,36 @@ def test_convert_auto():
        assert "data3.spacy" in out_files


def test_convert_auto_conflict():
def test_convert_auto_conflict(capsys):
    with make_tempdir() as d_in, make_tempdir() as d_out:
        for f in ["data1.iob", "data2.iob", "data3.json"]:
            Path(d_in / f).touch()

        # ensure that "automatic" suffix detection warns when there are different file types
        result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
        assert "All input files must be same type" in result.stdout
        with pytest.raises(SystemExit):
            cli.run(["spacy", "convert", str(d_in), str(d_out)])
        captured = capsys.readouterr()
        assert "All input files must be same type" in captured.out
        out_files = os.listdir(d_out)
        assert len(out_files) == 0


def test_benchmark_accuracy_alias():
def test_benchmark_accuracy_alias(capsys):
    # Verify that the `evaluate` alias works correctly.
    result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])
    result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"])
    assert normalize_whitespace(result_benchmark.stdout) == normalize_whitespace(
        result_evaluate.stdout.replace("spacy evaluate", "spacy benchmark accuracy")
    with pytest.raises(SystemExit):
        cli.run(["spacy", "benchmark", "accuracy", "--help"])
    captured = capsys.readouterr()
    result_benchmark = normalize_whitespace(str(captured.out))
    with pytest.raises(SystemExit):
        cli.run(["spacy", "evaluate", "--help"])
    captured = capsys.readouterr()
    result_evaluate = normalize_whitespace(str(captured.out))
    assert result_benchmark == result_evaluate.replace(
        "spacy evaluate", "spacy benchmark accuracy"
    )


def test_debug_data_trainable_lemmatizer_cli(en_vocab):
def test_debug_data_trainable_lemmatizer_cli(en_vocab, capsys):
    train_docs = [
        Doc(en_vocab, words=["I", "like", "cats"], lemmas=["I", "like", "cat"]),
        Doc(
@@ -62,30 +89,30 @@ def test_debug_data_trainable_lemmatizer_cli(en_vocab):
        dev_bin = DocBin(docs=dev_docs)
        dev_bin.to_disk(d_in / "dev.spacy")
        # `debug data` requires an input pipeline config
        CliRunner().invoke(
            app,
            [
                "init",
                "config",
                f"{d_in}/config.cfg",
                "--lang",
                "en",
                "--pipeline",
                "trainable_lemmatizer",
            ],
        )
        result_debug_data = CliRunner().invoke(
            app,
            [
                "debug",
                "data",
                f"{d_in}/config.cfg",
                "--paths.train",
                f"{d_in}/train.spacy",
                "--paths.dev",
                f"{d_in}/dev.spacy",
            ],
        )
        args = [
            "spacy",
            "init",
            "config",
            f"{d_in}/config.cfg",
            "--lang",
            "en",
            "--pipeline",
            "trainable_lemmatizer",
        ]
        cli.run(args)
        args = [
            "spacy",
            "debug",
            "data",
            f"{d_in}/config.cfg",
            "--paths.train",
            f"{d_in}/train.spacy",
            "--paths.dev",
            f"{d_in}/dev.spacy",
        ]
        with pytest.raises(SystemExit):
            cli.run(args)
        captured = capsys.readouterr()
        # Instead of checking specific wording of the output, which may change,
        # we'll check that this section of the debug output is present.
        assert "= Trainable Lemmatizer =" in result_debug_data.stdout
        assert "= Trainable Lemmatizer =" in captured.out
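The pattern in the rewritten tests above generalizes: call `cli.run([...])` with a full argv whose first element is the program name, capture output with pytest's `capsys` fixture, and trap the `SystemExit` that `--help` (or a failing command) raises. A condensed sketch, assuming the hypothetical `greet` command from the first sketch lives in a module named `demo`:

```python
import pytest

from demo import cli  # hypothetical module containing the earlier greet sketch


def test_greet(capsys):
    cli.run(["demo", "greet", "Ada", "--times", "2"])  # argv[0] is the program name
    assert capsys.readouterr().out.count("Hello, Ada!") == 2


def test_greet_help(capsys):
    with pytest.raises(SystemExit):  # argparse-style --help exits after printing
        cli.run(["demo", "greet", "--help"])
    assert "Name to greet" in capsys.readouterr().out
```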