diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 1179c15dd..db409f431 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -15,6 +15,7 @@ from .evaluate import evaluate # noqa: F401 from .convert import convert # noqa: F401 from .init_model import init_model # noqa: F401 from .validate import validate # noqa: F401 +from .project import project_clone, project_assets, project_run # noqa: F401 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) diff --git a/spacy/cli/_app.py b/spacy/cli/_app.py index 6f64dcb59..2b3ad9524 100644 --- a/spacy/cli/_app.py +++ b/spacy/cli/_app.py @@ -1,4 +1,3 @@ -from typing import Optional import typer from typer.main import get_command diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index a18e51623..a9ddfe9be 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -1,7 +1,9 @@ -from typing import Optional, List +from typing import Optional, List, Dict from timeit import default_timer as timer from wasabi import Printer from pathlib import Path +import re +import srsly from ..gold import Corpus from ..tokens import Doc @@ -16,13 +18,12 @@ def evaluate_cli( # fmt: off model: str = Arg(..., help="Model name or path"), data_path: Path = Arg(..., help="Location of JSON-formatted evaluation data", exists=True), + output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False), gpu_id: int = Opt(-1, "--gpu-id", "-g", help="Use GPU"), gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"), displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False), displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"), - return_scores: bool = Opt(False, "--return-scores", "-R", help="Return dict containing model scores"), - - # fmt: on + # fmt: on ): """ Evaluate a model. To render a sample of parses in a HTML file, set an @@ -31,24 +32,24 @@ def evaluate_cli( evaluate( model, data_path, + output=output, gpu_id=gpu_id, gold_preproc=gold_preproc, displacy_path=displacy_path, displacy_limit=displacy_limit, silent=False, - return_scores=return_scores, ) def evaluate( model: str, data_path: Path, + output: Optional[Path], gpu_id: int = -1, gold_preproc: bool = False, displacy_path: Optional[Path] = None, displacy_limit: int = 25, silent: bool = True, - return_scores: bool = False, ) -> Scorer: msg = Printer(no_print=silent, pretty=not silent) util.fix_random_seed() @@ -56,21 +57,19 @@ def evaluate( util.use_gpu(gpu_id) util.set_env_log(False) data_path = util.ensure_path(data_path) + output_path = util.ensure_path(output) displacy_path = util.ensure_path(displacy_path) if not data_path.exists(): msg.fail("Evaluation data not found", data_path, exits=1) if displacy_path and not displacy_path.exists(): msg.fail("Visualization output directory not found", displacy_path, exits=1) corpus = Corpus(data_path, data_path) - if model.startswith("blank:"): - nlp = util.get_lang_class(model.replace("blank:", ""))() - else: - nlp = util.load_model(model) + nlp = util.load_model(model) dev_dataset = list(corpus.dev_dataset(nlp, gold_preproc=gold_preproc)) begin = timer() scorer = nlp.evaluate(dev_dataset, verbose=False) end = timer() - nwords = sum(len(ex.doc) for ex in dev_dataset) + nwords = sum(len(ex.predicted) for ex in dev_dataset) results = { "Time": f"{end - begin:.2f} s", "Words": nwords, @@ -90,10 +89,22 @@ def evaluate( "Sent R": f"{scorer.sent_r:.2f}", "Sent F": f"{scorer.sent_f:.2f}", } + data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()} + msg.table(results, title="Results") + if scorer.ents_per_type: + data["ents_per_type"] = scorer.ents_per_type + print_ents_per_type(msg, scorer.ents_per_type) + if scorer.textcats_f_per_cat: + data["textcats_f_per_cat"] = scorer.textcats_f_per_cat + print_textcats_f_per_cat(msg, scorer.textcats_f_per_cat) + if scorer.textcats_auc_per_cat: + data["textcats_auc_per_cat"] = scorer.textcats_auc_per_cat + print_textcats_auc_per_cat(msg, scorer.textcats_auc_per_cat) + if displacy_path: - docs = [ex.doc for ex in dev_dataset] + docs = [ex.predicted for ex in dev_dataset] render_deps = "parser" in nlp.meta.get("pipeline", []) render_ents = "ner" in nlp.meta.get("pipeline", []) render_parses( @@ -105,8 +116,11 @@ def evaluate( ents=render_ents, ) msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path) - if return_scores: - return scorer.scores + + if output_path is not None: + srsly.write_json(output_path, data) + msg.good(f"Saved results to {output_path}") + return data def render_parses( @@ -128,3 +142,40 @@ def render_parses( ) with (output_path / "parses.html").open("w", encoding="utf8") as file_: file_.write(html) + + +def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None: + data = [ + (k, f"{v['p']:.2f}", f"{v['r']:.2f}", f"{v['f']:.2f}") + for k, v in scores.items() + ] + msg.table( + data, + header=("", "P", "R", "F"), + aligns=("l", "r", "r", "r"), + title="NER (per type)", + ) + + +def print_textcats_f_per_cat(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None: + data = [ + (k, f"{v['p']:.2f}", f"{v['r']:.2f}", f"{v['f']:.2f}") + for k, v in scores.items() + ] + msg.table( + data, + header=("", "P", "R", "F"), + aligns=("l", "r", "r", "r"), + title="Textcat F (per type)", + ) + + +def print_textcats_auc_per_cat( + msg: Printer, scores: Dict[str, Dict[str, float]] +) -> None: + msg.table( + [(k, f"{v['roc_auc_score']:.2f}") for k, v in scores.items()], + header=("", "ROC AUC"), + aligns=("l", "r"), + title="Textcat ROC AUC (per label)", + ) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 24d9a0a08..dbc485848 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -16,8 +16,9 @@ def package_cli( # fmt: off input_dir: Path = Arg(..., help="Directory with model data", exists=True, file_okay=False), output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False), - meta_path: Optional[Path] = Opt(None, "--meta-path", "-m", help="Path to meta.json", exists=True, dir_okay=False), + meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False), create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"), + version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"), force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing model in output directory"), # fmt: on ): @@ -32,6 +33,7 @@ def package_cli( input_dir, output_dir, meta_path=meta_path, + version=version, create_meta=create_meta, force=force, silent=False, @@ -42,6 +44,7 @@ def package( input_dir: Path, output_dir: Path, meta_path: Optional[Path] = None, + version: Optional[str] = None, create_meta: bool = False, force: bool = False, silent: bool = True, @@ -61,10 +64,13 @@ def package( if not meta_path.exists() or not meta_path.is_file(): msg.fail("Can't load model meta.json", meta_path, exits=1) meta = srsly.read_json(meta_path) + meta = get_meta(input_dir, meta) + if version is not None: + meta["version"] = version if not create_meta: # only print if user doesn't want to overwrite msg.good("Loaded meta.json from file", meta_path) else: - meta = generate_meta(input_dir, meta, msg) + meta = generate_meta(meta, msg) errors = validate(ModelMetaSchema, meta) if errors: msg.fail("Invalid model meta.json", "\n".join(errors), exits=1) @@ -101,20 +107,20 @@ def create_file(file_path: Path, contents: str) -> None: file_path.open("w", encoding="utf-8").write(contents) -def generate_meta( - model_path: Union[str, Path], existing_meta: Dict[str, Any], msg: Printer +def get_meta( + model_path: Union[str, Path], existing_meta: Dict[str, Any] ) -> Dict[str, Any]: - meta = existing_meta or {} - settings = [ - ("lang", "Model language", meta.get("lang", "en")), - ("name", "Model name", meta.get("name", "model")), - ("version", "Model version", meta.get("version", "0.0.0")), - ("description", "Model description", meta.get("description", False)), - ("author", "Author", meta.get("author", False)), - ("email", "Author email", meta.get("email", False)), - ("url", "Author website", meta.get("url", False)), - ("license", "License", meta.get("license", "MIT")), - ] + meta = { + "lang": "en", + "name": "model", + "version": "0.0.0", + "description": None, + "author": None, + "email": None, + "url": None, + "license": "MIT", + } + meta.update(existing_meta) nlp = util.load_model_from_path(Path(model_path)) meta["spacy_version"] = util.get_model_version_range(about.__version__) meta["pipeline"] = nlp.pipe_names @@ -124,6 +130,23 @@ def generate_meta( "keys": nlp.vocab.vectors.n_keys, "name": nlp.vocab.vectors.name, } + if about.__title__ != "spacy": + meta["parent_package"] = about.__title__ + return meta + + +def generate_meta(existing_meta: Dict[str, Any], msg: Printer) -> Dict[str, Any]: + meta = existing_meta or {} + settings = [ + ("lang", "Model language", meta.get("lang", "en")), + ("name", "Model name", meta.get("name", "model")), + ("version", "Model version", meta.get("version", "0.0.0")), + ("description", "Model description", meta.get("description", None)), + ("author", "Author", meta.get("author", None)), + ("email", "Author email", meta.get("email", None)), + ("url", "Author website", meta.get("url", None)), + ("license", "License", meta.get("license", "MIT")), + ] msg.divider("Generating meta.json") msg.text( "Enter the package settings for your model. The following information " @@ -132,8 +155,6 @@ def generate_meta( for setting, desc, default in settings: response = get_raw_input(desc, default) meta[setting] = default if response == "" and default else response - if about.__title__ != "spacy": - meta["parent_package"] = about.__title__ return meta @@ -184,12 +205,12 @@ def setup_package(): setup( name=model_name, - description=meta['description'], - author=meta['author'], - author_email=meta['email'], - url=meta['url'], + description=meta.get('description'), + author=meta.get('author'), + author_email=meta.get('email'), + url=meta.get('url'), version=meta['version'], - license=meta['license'], + license=meta.get('license'), packages=[model_name], package_data={model_name: list_files(model_dir)}, install_requires=list_requirements(meta), diff --git a/spacy/cli/project.py b/spacy/cli/project.py new file mode 100644 index 000000000..5011a13f9 --- /dev/null +++ b/spacy/cli/project.py @@ -0,0 +1,657 @@ +from typing import List, Dict, Any, Optional +import typer +import srsly +from pathlib import Path +from wasabi import msg +import subprocess +import shlex +import os +import re +import shutil +import sys +import requests +import tqdm + +from ._app import app, Arg, Opt, COMMAND, NAME +from .. import about +from ..schemas import ProjectConfigSchema, validate +from ..util import ensure_path, run_command, make_tempdir, working_dir +from ..util import get_hash, get_checksum + + +CONFIG_FILE = "project.yml" +DVC_CONFIG = "dvc.yaml" +DIRS = [ + "assets", + "metas", + "configs", + "packages", + "metrics", + "scripts", + "notebooks", + "training", + "corpus", +] +CACHES = [ + Path.home() / ".torch", + Path.home() / ".caches" / "torch", + os.environ.get("TORCH_HOME"), + Path.home() / ".keras", +] +DVC_CONFIG_COMMENT = """# This file is auto-generated by spaCy based on your project.yml. Do not edit +# it directly and edit the project.yml instead and re-run the project.""" +CLI_HELP = f"""Command-line interface for spaCy projects and working with project +templates. You'd typically start by cloning a project template to a local +directory and fetching its assets like datasets etc. See the project's +{CONFIG_FILE} for the available commands. Under the hood, spaCy uses DVC (Data +Version Control) to manage input and output files and to ensure steps are only +re-run if their inputs change. +""" + +project_cli = typer.Typer(help=CLI_HELP) + + +@project_cli.callback(invoke_without_command=True) +def callback(ctx: typer.Context): + """This runs before every project command and ensures DVC is installed.""" + ensure_dvc() + + +################ +# CLI COMMANDS # +################ + + +@project_cli.command("clone") +def project_clone_cli( + # fmt: off + name: str = Arg(..., help="The name of the template to fetch"), + dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False), + repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."), + git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), + no_init: bool = Opt(False, "--no-init", "-NI", help="Don't initialize the project with DVC"), + # fmt: on +): + """Clone a project template from a repository. Calls into "git" and will + only download the files from the given subdirectory. The GitHub repo + defaults to the official spaCy template repo, but can be customized + (including using a private repo). Setting the --git flag will also + initialize the project directory as a Git repo. If the project is intended + to be a Git repo, it should be initialized with Git first, before + initializing DVC (Data Version Control). This allows DVC to integrate with + Git. + """ + project_clone(name, dest, repo=repo, git=git, no_init=no_init) + + +@project_cli.command("init") +def project_init_cli( + path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False), + git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), +): + """Initialize a project directory with DVC and optionally Git. This should + typically be taken care of automatically when you run the "project clone" + command, but you can also run it separately. If the project is intended to + be a Git repo, it should be initialized with Git first, before initializing + DVC. This allows DVC to integrate with Git. + """ + project_init(path, git=git, silent=True) + + +@project_cli.command("assets") +def project_assets_cli( + # fmt: off + project_dir: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False), + # fmt: on +): + """Use DVC (Data Version Control) to fetch project assets. Assets are + defined in the "assets" section of the project config. If possible, DVC + will try to track the files so you can pull changes from upstream. It will + also try and store the checksum so the assets are versioned. If th file + can't be tracked or checked, it will be downloaded without DVC. If a checksum + is provided in the project config, the file is only downloaded if no local + file with the same checksum exists. + """ + project_assets(project_dir) + + +@project_cli.command( + "run-all", + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) +def project_run_all_cli( + # fmt: off + ctx: typer.Context, + project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), + show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") + # fmt: on +): + """Run all commands defined in the project. This command will use DVC and + the defined outputs and dependencies in the project config to determine + which steps need to be re-run and where to start. This means you're only + re-generating data if the inputs have changed. + + This command calls into "dvc repro" and all additional arguments are passed + to the "dvc repro" command: https://dvc.org/doc/command-reference/repro + """ + if show_help: + print_run_help(project_dir) + else: + project_run_all(project_dir, *ctx.args) + + +@project_cli.command( + "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) +def project_run_cli( + # fmt: off + ctx: typer.Context, + project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), + subcommand: str = Arg(None, help="Name of command defined in project config"), + show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") + # fmt: on +): + """Run a named script defined in the project config. If the command is + part of the default pipeline defined in the "run" section, DVC is used to + determine whether the step should re-run if its inputs have changed, or + whether everything is up to date. If the script is not part of the default + pipeline, it will be called separately without DVC. + + If DVC is used, the command calls into "dvc repro" and all additional + arguments are passed to the "dvc repro" command: + https://dvc.org/doc/command-reference/repro + """ + if show_help or not subcommand: + print_run_help(project_dir, subcommand) + else: + project_run(project_dir, subcommand, *ctx.args) + + +@project_cli.command("exec", hidden=True) +def project_exec_cli( + # fmt: off + project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), + subcommand: str = Arg(..., help="Name of command defined in project config"), + # fmt: on +): + """Execute a command defined in the project config. This CLI command is + only called internally in auto-generated DVC pipelines, as a shortcut for + multi-step commands in the project config. You typically shouldn't have to + call it yourself. To run a command, call "run" or "run-all". + """ + project_exec(project_dir, subcommand) + + +@project_cli.command("update-dvc") +def project_update_dvc_cli( + # fmt: off + project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), + verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"), + force: bool = Opt(False, "--force", "-F", help="Force update DVC config"), + # fmt: on +): + """Update the auto-generated DVC config file. Uses the steps defined in the + "run" section of the project config. This typically happens automatically + when running a command, but can also be triggered manually if needed. + """ + config = load_project_config(project_dir) + updated = update_dvc_config(project_dir, config, verbose=verbose, force=force) + if updated: + msg.good(f"Updated DVC config from {CONFIG_FILE}") + else: + msg.info(f"No changes found in {CONFIG_FILE}, no update needed") + + +app.add_typer(project_cli, name="project") + + +################# +# CLI FUNCTIONS # +################# + + +def project_clone( + name: str, + dest: Path, + *, + repo: str = about.__projects__, + git: bool = False, + no_init: bool = False, +) -> None: + """Clone a project template from a repository. + + name (str): Name of subdirectory to clone. + dest (Path): Destination path of cloned project. + repo (str): URL of Git repo containing project templates. + git (bool): Initialize project as Git repo. Should be set to True if project + is intended as a repo, since it will allow DVC to integrate with Git. + no_init (bool): Don't initialize DVC and Git automatically. If True, the + "init" command or "git init" and "dvc init" need to be run manually. + """ + dest = ensure_path(dest) + check_clone(name, dest, repo) + project_dir = dest.resolve() + # We're using Git and sparse checkout to only clone the files we need + with make_tempdir() as tmp_dir: + cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true" + run_command(shlex.split(cmd)) + with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f: + f.write(name) + run_command(["git", "-C", tmp_dir, "fetch"]) + run_command(["git", "-C", tmp_dir, "checkout"]) + shutil.move(str(tmp_dir / Path(name).name), str(project_dir)) + msg.good(f"Cloned project '{name}' from {repo}") + for sub_dir in DIRS: + dir_path = project_dir / sub_dir + if not dir_path.exists(): + dir_path.mkdir(parents=True) + if not no_init: + project_init(project_dir, git=git, silent=True) + msg.good(f"Your project is now ready!", dest) + print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}") + + +def project_init( + project_dir: Path, + *, + git: bool = False, + silent: bool = False, + analytics: bool = False, +): + """Initialize a project as a DVC and (optionally) as a Git repo. + + project_dir (Path): Path to project directory. + git (bool): Also call "git init" to initialize directory as a Git repo. + silent (bool): Don't print any output (via DVC). + analytics (bool): Opt-in to DVC analytics (defaults to False). + """ + with working_dir(project_dir): + init_cmd = ["dvc", "init"] + if silent: + init_cmd.append("--quiet") + if not git: + init_cmd.append("--no-scm") + if git: + run_command(["git", "init"]) + run_command(init_cmd) + # We don't want to have analytics on by default – our users should + # opt-in explicitly. If they want it, they can always enable it. + if not analytics: + run_command(["dvc", "config", "core.analytics", "false"]) + config = load_project_config(project_dir) + setup_check_dvc(project_dir, config) + + +def project_assets(project_dir: Path) -> None: + """Fetch assets for a project using DVC if possible. + + project_dir (Path): Path to project directory. + """ + project_path = ensure_path(project_dir) + config = load_project_config(project_path) + setup_check_dvc(project_path, config) + assets = config.get("assets", {}) + if not assets: + msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0) + msg.info(f"Fetching {len(assets)} asset(s)") + variables = config.get("variables", {}) + for asset in assets: + url = asset["url"].format(**variables) + dest = asset["dest"].format(**variables) + fetch_asset(project_path, url, dest, asset.get("checksum")) + + +def fetch_asset( + project_path: Path, url: str, dest: Path, checksum: Optional[str] = None +) -> None: + """Fetch an asset from a given URL or path. Will try to import the file + using DVC's import-url if possible (fully tracked and versioned) and falls + back to get-url (versioned) and a non-DVC download if necessary. If a + checksum is provided and a local file exists, it's only re-downloaded if the + checksum doesn't match. + + project_path (Path): Path to project directory. + url (str): URL or path to asset. + checksum (Optional[str]): Optional expected checksum of local file. + """ + url = convert_asset_url(url) + dest_path = (project_path / dest).resolve() + if dest_path.exists() and checksum: + # If there's already a file, check for checksum + # TODO: add support for caches (dvc import-url with local path) + if checksum == get_checksum(dest_path): + msg.good(f"Skipping download with matching checksum: {dest}") + return + dvc_add_cmd = ["dvc", "add", str(dest_path), "--external"] + with working_dir(project_path): + try: + # If these fail, we don't want to output an error or info message. + # Try with tracking the source first, then just downloading with + # DVC, then a regular non-DVC download. + try: + dvc_cmd = ["dvc", "import-url", url, str(dest_path)] + print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL)) + except subprocess.CalledProcessError: + dvc_cmd = ["dvc", "get-url", url, str(dest_path)] + print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL)) + run_command(dvc_add_cmd) + except subprocess.CalledProcessError: + try: + download_file(url, dest_path) + except requests.exceptions.HTTPError as e: + msg.fail(f"Download failed: {dest}", e) + run_command(dvc_add_cmd) + if checksum and checksum != get_checksum(dest_path): + msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}") + msg.good(f"Fetched asset {dest}") + + +def project_run_all(project_dir: Path, *dvc_args) -> None: + """Run all commands defined in the project using DVC. + + project_dir (Path): Path to project directory. + *dvc_args: Other arguments passed to "dvc repro". + """ + config = load_project_config(project_dir) + setup_check_dvc(project_dir, config) + dvc_cmd = ["dvc", "repro", *dvc_args] + with working_dir(project_dir): + run_command(dvc_cmd) + + +def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: + """Simulate a CLI help prompt using the info available in the project config. + + project_dir (Path): The project directory. + subcommand (Optional[str]): The subcommand or None. If a subcommand is + provided, the subcommand help is shown. Otherwise, the top-level help + and a list of available commands is printed. + """ + config = load_project_config(project_dir) + setup_check_dvc(project_dir, config) + config_commands = config.get("commands", []) + commands = {cmd["name"]: cmd for cmd in config_commands} + if subcommand: + if subcommand not in commands: + msg.fail(f"Can't find command '{subcommand}' in project config", exits=1) + print(f"Usage: {COMMAND} project run {project_dir} {subcommand}") + help_text = commands[subcommand].get("help") + if help_text: + msg.text(f"\n{help_text}\n") + else: + print(f"\nAvailable commands in {CONFIG_FILE}") + print(f"Usage: {COMMAND} project run {project_dir} [COMMAND]") + msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) + msg.text("Run all commands defined in the 'run' block of the project config:") + print(f"{COMMAND} project run-all {project_dir}") + + +def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None: + """Run a named script defined in the project config. If the script is part + of the default pipeline (defined in the "run" section), DVC is used to + execute the command, so it can determine whether to rerun it. It then + calls into "exec" to execute it. + + project_dir (Path): Path to project directory. + subcommand (str): Name of command to run. + *dvc_args: Other arguments passed to "dvc repro". + """ + config = load_project_config(project_dir) + setup_check_dvc(project_dir, config) + config_commands = config.get("commands", []) + variables = config.get("variables", {}) + commands = {cmd["name"]: cmd for cmd in config_commands} + if subcommand not in commands: + msg.fail(f"Can't find command '{subcommand}' in project config", exits=1) + if subcommand in config.get("run", []): + # This is one of the pipeline commands tracked in DVC + dvc_cmd = ["dvc", "repro", subcommand, *dvc_args] + with working_dir(project_dir): + run_command(dvc_cmd) + else: + cmd = commands[subcommand] + # Deps in non-DVC commands aren't tracked, but if they're defined, + # make sure they exist before running the command + for dep in cmd.get("deps", []): + if not (project_dir / dep).exists(): + err = f"Missing dependency specified by command '{subcommand}': {dep}" + msg.fail(err, exits=1) + with working_dir(project_dir): + run_commands(cmd["script"], variables) + + +def project_exec(project_dir: Path, subcommand: str): + """Execute a command defined in the project config. + + project_dir (Path): Path to project directory. + subcommand (str): Name of command to run. + """ + config = load_project_config(project_dir) + config_commands = config.get("commands", []) + variables = config.get("variables", {}) + commands = {cmd["name"]: cmd for cmd in config_commands} + with working_dir(project_dir): + run_commands(commands[subcommand]["script"], variables) + + +########### +# HELPERS # +########### + + +def load_project_config(path: Path) -> Dict[str, Any]: + """Load the project config file from a directory and validate it. + + path (Path): The path to the project directory. + RETURNS (Dict[str, Any]): The loaded project config. + """ + config_path = path / CONFIG_FILE + if not config_path.exists(): + msg.fail("Can't find project config", config_path, exits=1) + config = srsly.read_yaml(config_path) + errors = validate(ProjectConfigSchema, config) + if errors: + msg.fail(f"Invalid project config in {CONFIG_FILE}", "\n".join(errors), exits=1) + return config + + +def update_dvc_config( + path: Path, + config: Dict[str, Any], + verbose: bool = False, + silent: bool = False, + force: bool = False, +) -> bool: + """Re-run the DVC commands in dry mode and update dvc.yaml file in the + project directory. The file is auto-generated based on the config. The + first line of the auto-generated file specifies the hash of the config + dict, so if any of the config values change, the DVC config is regenerated. + + path (Path): The path to the project directory. + config (Dict[str, Any]): The loaded project config. + verbose (bool): Whether to print additional info (via DVC). + silent (bool): Don't output anything (via DVC). + force (bool): Force update, even if hashes match. + RETURNS (bool): Whether the DVC config file was updated. + """ + config_hash = get_hash(config) + path = path.resolve() + dvc_config_path = path / DVC_CONFIG + if dvc_config_path.exists(): + # Cneck if the file was generated using the current config, if not, redo + with dvc_config_path.open("r", encoding="utf8") as f: + ref_hash = f.readline().strip().replace("# ", "") + if ref_hash == config_hash and not force: + return False # Nothing has changed in project config, don't need to update + dvc_config_path.unlink() + variables = config.get("variables", {}) + commands = [] + # We only want to include commands that are part of the main list of "run" + # commands in project.yml and should be run in sequence + config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} + for name in config.get("run", []): + if name not in config_commands: + msg.fail(f"Can't find command '{name}' in project config", exits=1) + command = config_commands[name] + deps = command.get("deps", []) + outputs = command.get("outputs", []) + outputs_no_cache = command.get("outputs_no_cache", []) + if not deps and not outputs and not outputs_no_cache: + continue + # Default to "." as the project path since dvc.yaml is auto-generated + # and we don't want arbitrary paths in there + project_cmd = ["python", "-m", NAME, "project", "exec", ".", name] + deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl] + outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl] + outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl] + dvc_cmd = ["dvc", "run", "-n", name, "-w", str(path), "--no-exec"] + if verbose: + dvc_cmd.append("--verbose") + if silent: + dvc_cmd.append("--quiet") + full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd] + commands.append(" ".join(full_cmd)) + with working_dir(path): + run_commands(commands, variables, silent=True) + with dvc_config_path.open("r+", encoding="utf8") as f: + content = f.read() + f.seek(0, 0) + f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}") + return True + + +def ensure_dvc() -> None: + """Ensure that the "dvc" command is available and show an error if not.""" + try: + subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL) + except Exception: + msg.fail( + "spaCy projects require DVC (Data Version Control) and the 'dvc' command", + "You can install the Python package from pip (pip install dvc) or " + "conda (conda install -c conda-forge dvc). For more details, see the " + "documentation: https://dvc.org/doc/install", + exits=1, + ) + + +def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None: + """Check that the project is set up correctly with DVC and update its + config if needed. Will raise an error if the project is not an initialized + DVC project. + + project_dir (Path): The path to the project directory. + config (Dict[str, Any]): The loaded project config. + """ + if not project_dir.exists(): + msg.fail(f"Can't find project directory: {project_dir}") + if not (project_dir / ".dvc").exists(): + msg.fail( + "Project not initialized as a DVC project.", + f"Make sure that the project template was cloned correctly. To " + f"initialize the project directory manually, you can run: " + f"{COMMAND} project init {project_dir}", + exits=1, + ) + with msg.loading("Updating DVC config..."): + updated = update_dvc_config(project_dir, config, silent=True) + if updated: + msg.good(f"Updated DVC config from changed {CONFIG_FILE}") + + +def run_commands( + commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False +) -> None: + """Run a sequence of commands in a subprocess, in order. + + commands (List[str]): The split commands. + variables (Dict[str, str]): Dictionary of variable names, mapped to their + values. Will be used to substitute format string variables in the + commands. + silent (boll): Don't print the commands. + """ + for command in commands: + # Substitute variables, e.g. "./{NAME}.json" + command = command.format(**variables) + command = shlex.split(command) + # TODO: is this needed / a good idea? + if len(command) and command[0] == "python": + command[0] = sys.executable + elif len(command) and command[0] == "pip": + command = [sys.executable, "-m", "pip", *command[1:]] + if not silent: + print(" ".join(command)) + run_command(command) + + +def convert_asset_url(url: str) -> str: + """Check and convert the asset URL if needed. + + url (str): The asset URL. + RETURNS (str): The converted URL. + """ + # If the asset URL is a regular GitHub URL it's likely a mistake + if re.match("(http(s?)):\/\/github.com", url): + converted = url.replace("github.com", "raw.githubusercontent.com") + converted = re.sub(r"/(tree|blob)/", "/", converted) + msg.warn( + "Downloading from a regular GitHub URL. This will only download " + "the source of the page, not the actual file. Converting the URL " + "to a raw URL.", + converted, + ) + return converted + return url + + +def check_clone(name: str, dest: Path, repo: str) -> None: + """Check and validate that the destination path can be used to clone. Will + check that Git is available and that the destination path is suitable. + + name (str): Name of the directory to clone from the repo. + dest (Path): Local destination of cloned directory. + repo (str): URL of the repo to clone from. + """ + try: + subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL) + except Exception: + msg.fail( + f"Cloning spaCy project templates requires Git and the 'git' command. ", + f"To clone a project without Git, copy the files from the '{name}' " + f"directory in the {repo} to {dest} manually and then run:", + f"{COMMAND} project init {dest}", + exits=1, + ) + if not dest: + msg.fail(f"Not a valid directory to clone project: {dest}", exits=1) + if dest.exists(): + # Directory already exists (not allowed, clone needs to create it) + msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1) + if not dest.parent.exists(): + # We're not creating parents, parent dir should exist + msg.fail( + f"Can't clone project, parent directory doesn't exist: {dest.parent}", + exits=1, + ) + + +def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None: + """Download a file using requests. + + url (str): The URL of the file. + dest (Path): The destination path. + chunk_size (int): The size of chunks to read/write. + """ + response = requests.get(url, stream=True) + response.raise_for_status() + total = int(response.headers.get("content-length", 0)) + progress_settings = { + "total": total, + "unit": "iB", + "unit_scale": True, + "unit_divisor": chunk_size, + "leave": False, + } + with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar: + for data in response.iter_content(chunk_size=chunk_size): + size = f.write(data) + bar.update(size) diff --git a/spacy/schemas.py b/spacy/schemas.py index 04f9bbffa..38e08b4cb 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -220,8 +220,11 @@ class TrainingSchema(BaseModel): class ProjectConfigAsset(BaseModel): + # fmt: off dest: StrictStr = Field(..., title="Destination of downloaded asset") url: StrictStr = Field(..., title="URL of asset") + checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})") + # fmt: on class ProjectConfigCommand(BaseModel): @@ -229,11 +232,15 @@ class ProjectConfigCommand(BaseModel): name: StrictStr = Field(..., title="Name of command") help: Optional[StrictStr] = Field(None, title="Command description") script: List[StrictStr] = Field([], title="List of CLI commands to run, in order") - dvc_deps: List[StrictStr] = Field([], title="Data Version Control dependencies") - dvc_outputs: List[StrictStr] = Field([], title="Data Version Control outputs") - dvc_outputs_no_cache: List[StrictStr] = Field([], title="Data Version Control outputs (no cache)") + deps: List[StrictStr] = Field([], title="Data Version Control dependencies") + outputs: List[StrictStr] = Field([], title="Data Version Control outputs") + outputs_no_cache: List[StrictStr] = Field([], title="Data Version Control outputs (no cache)") # fmt: on + class Config: + title = "A single named command specified in a project config" + extra = "forbid" + class ProjectConfigSchema(BaseModel): # fmt: off diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 7c3eaf8ad..741753c89 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -1,15 +1,14 @@ import numpy import tempfile -import shutil import contextlib import srsly -from pathlib import Path from spacy import Errors from spacy.tokens import Doc, Span from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA, MORPH from spacy.vocab import Vocab +from spacy.util import make_tempdir # noqa: F401 @contextlib.contextmanager @@ -19,13 +18,6 @@ def make_tempfile(mode="r"): f.close() -@contextlib.contextmanager -def make_tempdir(): - d = Path(tempfile.mkdtemp()) - yield d - shutil.rmtree(str(d)) - - def get_doc( vocab, words=[], diff --git a/spacy/util.py b/spacy/util.py index bbd9a73b6..3f0a1ec6f 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -19,6 +19,9 @@ from packaging.specifiers import SpecifierSet, InvalidSpecifier from packaging.version import Version, InvalidVersion import subprocess from contextlib import contextmanager +import tempfile +import shutil +import hashlib try: @@ -455,6 +458,37 @@ def working_dir(path: Union[str, Path]) -> None: os.chdir(prev_cwd) +@contextmanager +def make_tempdir(): + """Execute a block in a temporary directory and remove the directory and + its contents at the end of the with block. + + YIELDS (Path): The path of the temp directory. + """ + d = Path(tempfile.mkdtemp()) + yield d + shutil.rmtree(str(d)) + + +def get_hash(data) -> str: + """Get the hash for a JSON-serializable object. + + data: The data to hash. + RETURNS (str): The hash. + """ + data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8") + return hashlib.md5(data_str).hexdigest() + + +def get_checksum(path: Union[Path, str]) -> str: + """Get the checksum for a file given its file path. + + path (Union[Path, str]): The file path. + RETURNS (str): The checksum. + """ + return hashlib.md5(Path(path).read_bytes()).hexdigest() + + def is_in_jupyter(): """Check if user is running spaCy from a Jupyter notebook by detecting the IPython kernel. Mainly used for the displaCy visualizer.