From 8b305253d3746e94f2b0d0f70c94dbfd53e5d194 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 27 Jun 2020 13:02:10 +0200 Subject: [PATCH] Update with DVC WIP --- spacy/cli/__init__.py | 2 +- spacy/cli/_app.py | 1 - spacy/cli/project.py | 227 +++++++++++++++++++++++++++++++++++------- 3 files changed, 190 insertions(+), 40 deletions(-) diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 14623000a..9af1265d1 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -15,7 +15,7 @@ from .evaluate import evaluate # noqa: F401 from .convert import convert # noqa: F401 from .init_model import init_model # noqa: F401 from .validate import validate # noqa: F401 -from .project import project_clone, project_get_assets, project_run # noqa: F401 +from .project import project_clone, project_assets, project_run # noqa: F401 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) diff --git a/spacy/cli/_app.py b/spacy/cli/_app.py index 6f64dcb59..2b3ad9524 100644 --- a/spacy/cli/_app.py +++ b/spacy/cli/_app.py @@ -1,4 +1,3 @@ -from typing import Optional import typer from typer.main import get_command diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 3cced4057..12578d813 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -1,4 +1,4 @@ -from typing import List, Dict, Any +from typing import List, Dict, Any, Optional import typer import srsly from pathlib import Path @@ -9,14 +9,16 @@ import os import re import shutil import sys +import murmurhash -from ._app import app, Arg, Opt, COMMAND +from ._app import app, Arg, Opt, COMMAND, NAME from .. 
import about from ..schemas import ProjectConfigSchema, validate from ..util import ensure_path, run_command, make_tempdir, working_dir CONFIG_FILE = "project.yml" +DVC_CONFIG = "dvc.yaml" DIRS = [ "assets", "metas", @@ -34,13 +36,18 @@ CACHES = [ os.environ.get("TORCH_HOME"), Path.home() / ".keras", ] +DVC_CONFIG_COMMENT = """# This file is auto-generated by spaCy based on your project.yml. Do not edit +# it directly and edit the project.yml instead and re-run the project.""" + project_cli = typer.Typer(help="Command-line interface for spaCy projects") @project_cli.callback(invoke_without_command=True) -def callback(): - # This runs before every project command and ensures DVC is installed +def callback(ctx: typer.Context): + """This runs before every project command and ensures DVC is installed and + everything is up to date. + """ try: subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL) except Exception: @@ -59,15 +66,21 @@ def project_clone_cli( name: str = Arg(..., help="The name of the template to fetch"), dest: Path = Arg(Path.cwd(), help="Where to download and work. 
Defaults to current working directory.", exists=False), repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."), + git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), verbose: bool = Opt(False, "--verbose", "-V", help="Show detailed information") # fmt: on ): """Clone a project template from a repository.""" - project_clone(name, dest, repo=repo, verbose=verbose) + project_clone(name, dest, repo=repo, git=git, verbose=verbose) def project_clone( - name: str, dest: Path, *, repo: str = about.__projects__, verbose: bool = False + name: str, + dest: Path, + *, + repo: str = about.__projects__, + git: bool = False, + verbose: bool = False, ) -> None: dest = ensure_path(dest) check_clone_dest(dest) @@ -86,52 +99,97 @@ def project_clone( dir_path = dest / sub_dir if not dir_path.exists(): dir_path.mkdir(parents=True) + with working_dir(dest): + # TODO: check that .dvc exists in other commands? + init_cmd = ["dvc", "init"] + if not git: + init_cmd.append("--no-scm") + if git: + run_command(["git", "init"]) + run_command(init_cmd) msg.good(f"Your project is now ready!", dest.resolve()) - print(f"To get the assets, run:\npython -m spacy project get-assets {dest}") + print(f"To fetch the assets, run:\npython -m {NAME} project assets {dest}") -@project_cli.command("get-assets") -def project_get_assets_cli( - path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False) +@project_cli.command("assets") +def project_assets_cli( + # fmt: off + path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False), + dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't download anything"), + # fmt: on ): """Use Data Version Control to get the assets for the project.""" - project_get_assets(path) + project_assets(path, dry=dry) -def project_get_assets(project_path: Path) -> None: +def project_assets(project_path: Path, *, dry: bool = False) -> None: + if dry: + 
msg.warn("Performing a dry run and not downloading anything") project_path = ensure_path(project_path) config = load_project_config(project_path) assets = config.get("assets", {}) if not assets: msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0) - msg.info(f"Getting {len(assets)} asset(s)") + msg.info(f"Fetching {len(assets)} asset(s)") variables = config.get("variables", {}) for asset in assets: url = asset["url"].format(**variables) dest = asset["dest"].format(**variables) dest_path = project_path / dest check_asset(url) - cmd = ["dvc", "get-url", url, str(dest_path)] + if not dry: + cmd = ["dvc", "get-url", url, str(dest_path)] run_command(cmd) - msg.good(f"Got asset {dest}") + msg.good(f"Fetched asset {dest}") -@project_cli.command("run") +@project_cli.command( + "run-all", + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) +def project_run_all_cli( + # fmt: off + ctx: typer.Context, + project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), + show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") + # fmt: on +): + """Run all commands. 
Additional arguments are passed to dvc repro.""" + if show_help: + print_run_help(project_dir) + else: + project_run_all(project_dir, *ctx.args) + + +def project_run_all(project_dir: Path, *dvc_args) -> None: + config = load_project_config(project_dir) + with msg.loading("Updating DVC config..."): + updated = update_dvc_config(project_dir, config, silent=True) + if updated: + msg.good(f"Updated DVC config from changed {CONFIG_FILE}") + dvc_cmd = ["dvc", "repro", *dvc_args] + run_command(dvc_cmd) + + +@project_cli.command( + "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) def project_run_cli( # fmt: off + ctx: typer.Context, project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), subcommand: str = Arg(None, help="Name of command defined in project config"), show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") # fmt: on ): """Run scripts defined in the project.""" - if show_help: + if show_help or not subcommand: print_run_help(project_dir, subcommand) else: - project_run(project_dir, subcommand) + project_run(project_dir, subcommand, *ctx.args) -def print_run_help(project_dir: Path, subcommand: str) -> None: +def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: """Simulate a CLI help prompt using the info available in the project config.""" config = load_project_config(project_dir) config_commands = config.get("commands", []) @@ -149,28 +207,60 @@ def print_run_help(project_dir: Path, subcommand: str) -> None: msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) -def project_run(project_dir: Path, subcommand: str) -> None: +def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None: + config = load_project_config(project_dir) + with msg.loading("Updating DVC config..."): + updated = update_dvc_config(project_dir, config, silent=True) + if updated: + msg.good(f"Updated DVC config from 
changed {CONFIG_FILE}") + config_commands = config.get("commands", []) + variables = config.get("variables", {}) + commands = {cmd["name"]: cmd for cmd in config_commands} + if subcommand not in commands: + msg.fail(f"Can't find command '{subcommand}' in project config", exits=1) + if subcommand in config.get("run", []): + # This is one of the pipeline commands tracked in DVC + dvc_cmd = ["dvc", "repro", subcommand, *dvc_args] + run_command(dvc_cmd) + else: + with working_dir(project_dir): + run_commands(commands[subcommand]["script"], variables) + + +@project_cli.command("exec") +def project_exec_cli( + # fmt: off + project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), + subcommand: str = Arg(..., help="Name of command defined in project config"), + # fmt: on +): + """Internals""" + project_exec(project_dir, subcommand) + + +def project_exec(project_dir: Path, subcommand: str): config = load_project_config(project_dir) config_commands = config.get("commands", []) variables = config.get("variables", {}) commands = {cmd["name"]: cmd for cmd in config_commands} - if subcommand and subcommand not in commands: - msg.fail(f"Can't find command '{subcommand}' in project config", exits=1) with working_dir(project_dir): - if subcommand is None: - all_commands = config.get("run", []) - if not all_commands: - msg.warn("No run commands defined in project config", exits=0) - msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) - for command in all_commands: - if command not in commands: - msg.fail( - f"Can't find command '{command}' in project config", exits=1 - ) - msg.divider(command) - run_commands(commands[command]["script"], variables) - else: - run_commands(commands[subcommand]["script"], variables) + run_commands(commands[subcommand]["script"], variables) + + +@project_cli.command("update-dvc") +def project_update_dvc_cli( + # fmt: off + project_dir: Path = Arg(..., help="Location of project directory", 
exists=True, file_okay=False), + verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"), + force: bool = Opt(False, "--force", "-F", help="Force update DVC config"), + # fmt: on +): + config = load_project_config(project_dir) + updated = update_dvc_config(project_dir, config, verbose=verbose, force=force) + if updated: + msg.good(f"Updated DVC config from {CONFIG_FILE}") + else: + msg.info(f"No changes found in {CONFIG_FILE}, no update needed") app.add_typer(project_cli, name="project") @@ -187,7 +277,63 @@ def load_project_config(path: Path) -> Dict[str, Any]: return config -def run_commands(commands: List[str] = tuple(), variables: Dict[str, str] = {}) -> None: +def update_dvc_config( + path: Path, + config: Dict[str, Any], + verbose: bool = False, + silent: bool = False, + force: bool = False, +) -> bool: + """Re-run the DVC commands in dry mode and update dvc.yaml file in the + project directory. The file is auto-generated based on the config. + """ + config_hash = get_hash(config) + dvc_config_path = path / DVC_CONFIG + if dvc_config_path.exists(): + # Check if the file was generated using the current config, if not, redo + with dvc_config_path.open("r", encoding="utf8") as f: + ref_hash = f.readline().strip().replace("# ", "") + if ref_hash == config_hash and not force: + return False # Nothing has changed in project config, don't need to update + dvc_config_path.unlink() + variables = config.get("variables", {}) + commands = [] + # We only want to include commands that are part of the main list of "run" + # commands in project.yml and should be run in sequence + config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} + for name in config.get("run", []): + if name not in config_commands: + msg.fail(f"Can't find command '{name}' in project config", exits=1) + command = config_commands[name] + deps = command.get("deps", []) + outputs = command.get("outputs", []) + outputs_no_cache = command.get("outputs_no_cache", []) + if not
deps and not outputs and not outputs_no_cache: + continue + # Default to "." as the project path since dvc.yaml is auto-generated + # and we don't want arbitrary paths in there + project_cmd = ["python", "-m", NAME, "project", "exec", ".", name] + deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl] + outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl] + outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl] + dvc_cmd = ["dvc", "run", "-n", name, "-w", str(path), "--no-exec"] + if verbose: + dvc_cmd.append("--verbose") + if silent: + dvc_cmd.append("--quiet") + full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd] + commands.append(" ".join(full_cmd)) + run_commands(commands, variables, silent=True) + with dvc_config_path.open("r+", encoding="utf8") as f: + content = f.read() + f.seek(0, 0) + f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}") + return True + + +def run_commands( + commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False +) -> None: for command in commands: # Substitute variables, e.g. "./{NAME}.json" command = command.format(**variables) @@ -195,7 +341,8 @@ def run_commands(commands: List[str] = tuple(), variables: Dict[str, str] = {}) # TODO: is this needed / a good idea? if len(command) and command[0] == "python": command[0] = sys.executable - print(" ".join(command)) + if not silent: + print(" ".join(command)) run_command(command) @@ -225,3 +372,7 @@ def check_clone_dest(dest: Path) -> None: f"Can't clone project, parent directory doesn't exist: {dest.parent}", exits=1, ) + + +def get_hash(data) -> str: + return str(murmurhash.hash(srsly.json_dumps(data, sort_keys=True)))