mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
Update with DVC WIP
This commit is contained in:
parent
5d235fb767
commit
8b305253d3
|
@ -15,7 +15,7 @@ from .evaluate import evaluate # noqa: F401
|
||||||
from .convert import convert # noqa: F401
|
from .convert import convert # noqa: F401
|
||||||
from .init_model import init_model # noqa: F401
|
from .init_model import init_model # noqa: F401
|
||||||
from .validate import validate # noqa: F401
|
from .validate import validate # noqa: F401
|
||||||
from .project import project_clone, project_get_assets, project_run # noqa: F401
|
from .project import project_clone, project_assets, project_run # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
|
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
from typing import Optional
|
|
||||||
import typer
|
import typer
|
||||||
from typer.main import get_command
|
from typer.main import get_command
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import List, Dict, Any
|
from typing import List, Dict, Any, Optional
|
||||||
import typer
|
import typer
|
||||||
import srsly
|
import srsly
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -9,14 +9,16 @@ import os
|
||||||
import re
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import sys
|
import sys
|
||||||
|
import murmurhash
|
||||||
|
|
||||||
from ._app import app, Arg, Opt, COMMAND
|
from ._app import app, Arg, Opt, COMMAND, NAME
|
||||||
from .. import about
|
from .. import about
|
||||||
from ..schemas import ProjectConfigSchema, validate
|
from ..schemas import ProjectConfigSchema, validate
|
||||||
from ..util import ensure_path, run_command, make_tempdir, working_dir
|
from ..util import ensure_path, run_command, make_tempdir, working_dir
|
||||||
|
|
||||||
|
|
||||||
CONFIG_FILE = "project.yml"
|
CONFIG_FILE = "project.yml"
|
||||||
|
DVC_CONFIG = "dvc.yaml"
|
||||||
DIRS = [
|
DIRS = [
|
||||||
"assets",
|
"assets",
|
||||||
"metas",
|
"metas",
|
||||||
|
@ -34,13 +36,18 @@ CACHES = [
|
||||||
os.environ.get("TORCH_HOME"),
|
os.environ.get("TORCH_HOME"),
|
||||||
Path.home() / ".keras",
|
Path.home() / ".keras",
|
||||||
]
|
]
|
||||||
|
DVC_CONFIG_COMMENT = """# This file is auto-generated by spaCy based on your project.yml. Do not edit
|
||||||
|
# it directly and edit the project.yml instead and re-run the project."""
|
||||||
|
|
||||||
|
|
||||||
project_cli = typer.Typer(help="Command-line interface for spaCy projects")
|
project_cli = typer.Typer(help="Command-line interface for spaCy projects")
|
||||||
|
|
||||||
|
|
||||||
@project_cli.callback(invoke_without_command=True)
|
@project_cli.callback(invoke_without_command=True)
|
||||||
def callback():
|
def callback(ctx: typer.Context):
|
||||||
# This runs before every project command and ensures DVC is installed
|
"""This runs before every project command and ensures DVC is installed and
|
||||||
|
everything is up to date.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
|
subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
|
||||||
except Exception:
|
except Exception:
|
||||||
|
@ -59,15 +66,21 @@ def project_clone_cli(
|
||||||
name: str = Arg(..., help="The name of the template to fetch"),
|
name: str = Arg(..., help="The name of the template to fetch"),
|
||||||
dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False),
|
dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False),
|
||||||
repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
|
repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
|
||||||
|
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
|
||||||
verbose: bool = Opt(False, "--verbose", "-V", help="Show detailed information")
|
verbose: bool = Opt(False, "--verbose", "-V", help="Show detailed information")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""Clone a project template from a repository."""
|
"""Clone a project template from a repository."""
|
||||||
project_clone(name, dest, repo=repo, verbose=verbose)
|
project_clone(name, dest, repo=repo, git=git, verbose=verbose)
|
||||||
|
|
||||||
|
|
||||||
def project_clone(
|
def project_clone(
|
||||||
name: str, dest: Path, *, repo: str = about.__projects__, verbose: bool = False
|
name: str,
|
||||||
|
dest: Path,
|
||||||
|
*,
|
||||||
|
repo: str = about.__projects__,
|
||||||
|
git: bool = False,
|
||||||
|
verbose: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
dest = ensure_path(dest)
|
dest = ensure_path(dest)
|
||||||
check_clone_dest(dest)
|
check_clone_dest(dest)
|
||||||
|
@ -86,52 +99,97 @@ def project_clone(
|
||||||
dir_path = dest / sub_dir
|
dir_path = dest / sub_dir
|
||||||
if not dir_path.exists():
|
if not dir_path.exists():
|
||||||
dir_path.mkdir(parents=True)
|
dir_path.mkdir(parents=True)
|
||||||
|
with working_dir(dest):
|
||||||
|
# TODO: check that .dvc exists in other commands?
|
||||||
|
init_cmd = ["dvc", "init"]
|
||||||
|
if not git:
|
||||||
|
init_cmd.append("--no-scm")
|
||||||
|
if git:
|
||||||
|
run_command(["git", "init"])
|
||||||
|
run_command(init_cmd)
|
||||||
msg.good(f"Your project is now ready!", dest.resolve())
|
msg.good(f"Your project is now ready!", dest.resolve())
|
||||||
print(f"To get the assets, run:\npython -m spacy project get-assets {dest}")
|
print(f"To fetch the assets, run:\npython -m {NAME} project assets {dest}")
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("get-assets")
|
@project_cli.command("assets")
|
||||||
def project_get_assets_cli(
|
def project_assets_cli(
|
||||||
path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False)
|
# fmt: off
|
||||||
|
path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False),
|
||||||
|
dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't download anything"),
|
||||||
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""Use Data Version Control to get the assets for the project."""
|
"""Use Data Version Control to get the assets for the project."""
|
||||||
project_get_assets(path)
|
project_assets(path, dry=dry)
|
||||||
|
|
||||||
|
|
||||||
def project_get_assets(project_path: Path) -> None:
|
def project_assets(project_path: Path, *, dry: bool = False) -> None:
|
||||||
|
if dry:
|
||||||
|
msg.warn("Performing a dry run and not downloading anything")
|
||||||
project_path = ensure_path(project_path)
|
project_path = ensure_path(project_path)
|
||||||
config = load_project_config(project_path)
|
config = load_project_config(project_path)
|
||||||
assets = config.get("assets", {})
|
assets = config.get("assets", {})
|
||||||
if not assets:
|
if not assets:
|
||||||
msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0)
|
msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0)
|
||||||
msg.info(f"Getting {len(assets)} asset(s)")
|
msg.info(f"Fetching {len(assets)} asset(s)")
|
||||||
variables = config.get("variables", {})
|
variables = config.get("variables", {})
|
||||||
for asset in assets:
|
for asset in assets:
|
||||||
url = asset["url"].format(**variables)
|
url = asset["url"].format(**variables)
|
||||||
dest = asset["dest"].format(**variables)
|
dest = asset["dest"].format(**variables)
|
||||||
dest_path = project_path / dest
|
dest_path = project_path / dest
|
||||||
check_asset(url)
|
check_asset(url)
|
||||||
cmd = ["dvc", "get-url", url, str(dest_path)]
|
if not dry:
|
||||||
|
cmd = ["dvc", "get-url", url, str(dest_path)]
|
||||||
run_command(cmd)
|
run_command(cmd)
|
||||||
msg.good(f"Got asset {dest}")
|
msg.good(f"Fetched asset {dest}")
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("run")
|
@project_cli.command(
|
||||||
|
"run-all",
|
||||||
|
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||||
|
)
|
||||||
|
def project_run_all_cli(
|
||||||
|
# fmt: off
|
||||||
|
ctx: typer.Context,
|
||||||
|
project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
|
||||||
|
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
|
||||||
|
# fmt: on
|
||||||
|
):
|
||||||
|
"""Run all commands. Additional arguments are passed to dvc repro."""
|
||||||
|
if show_help:
|
||||||
|
print_run_help(project_dir)
|
||||||
|
else:
|
||||||
|
project_run_all(project_dir, *ctx.args)
|
||||||
|
|
||||||
|
|
||||||
|
def project_run_all(project_dir: Path, *dvc_args) -> None:
|
||||||
|
config = load_project_config(project_dir)
|
||||||
|
with msg.loading("Updating DVC config..."):
|
||||||
|
updated = update_dvc_config(project_dir, config, silent=True)
|
||||||
|
if updated:
|
||||||
|
msg.good(f"Updated DVC config from changed {CONFIG_FILE}")
|
||||||
|
dvc_cmd = ["dvc", "repro", *dvc_args]
|
||||||
|
run_command(dvc_cmd)
|
||||||
|
|
||||||
|
|
||||||
|
@project_cli.command(
|
||||||
|
"run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||||
|
)
|
||||||
def project_run_cli(
|
def project_run_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
|
ctx: typer.Context,
|
||||||
project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
|
project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
|
||||||
subcommand: str = Arg(None, help="Name of command defined in project config"),
|
subcommand: str = Arg(None, help="Name of command defined in project config"),
|
||||||
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
|
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""Run scripts defined in the project."""
|
"""Run scripts defined in the project."""
|
||||||
if show_help:
|
if show_help or not subcommand:
|
||||||
print_run_help(project_dir, subcommand)
|
print_run_help(project_dir, subcommand)
|
||||||
else:
|
else:
|
||||||
project_run(project_dir, subcommand)
|
project_run(project_dir, subcommand, *ctx.args)
|
||||||
|
|
||||||
|
|
||||||
def print_run_help(project_dir: Path, subcommand: str) -> None:
|
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
|
||||||
"""Simulate a CLI help prompt using the info available in the project config."""
|
"""Simulate a CLI help prompt using the info available in the project config."""
|
||||||
config = load_project_config(project_dir)
|
config = load_project_config(project_dir)
|
||||||
config_commands = config.get("commands", [])
|
config_commands = config.get("commands", [])
|
||||||
|
@ -149,28 +207,60 @@ def print_run_help(project_dir: Path, subcommand: str) -> None:
|
||||||
msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
|
msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
|
||||||
|
|
||||||
|
|
||||||
def project_run(project_dir: Path, subcommand: str) -> None:
|
def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
|
||||||
|
config = load_project_config(project_dir)
|
||||||
|
with msg.loading("Updating DVC config..."):
|
||||||
|
updated = update_dvc_config(project_dir, config, silent=True)
|
||||||
|
if updated:
|
||||||
|
msg.good(f"Updated DVC config from changed {CONFIG_FILE}")
|
||||||
|
config_commands = config.get("commands", [])
|
||||||
|
variables = config.get("variables", {})
|
||||||
|
commands = {cmd["name"]: cmd for cmd in config_commands}
|
||||||
|
if subcommand not in commands:
|
||||||
|
msg.fail(f"Can't find command '{subcommand}' in project config", exits=1)
|
||||||
|
if subcommand in config.get("run", []):
|
||||||
|
# This is one of the pipeline commands tracked in DVC
|
||||||
|
dvc_cmd = ["dvc", "repro", subcommand, *dvc_args]
|
||||||
|
run_command(dvc_cmd)
|
||||||
|
else:
|
||||||
|
with working_dir(project_dir):
|
||||||
|
run_commands(commands[subcommand]["script"], variables)
|
||||||
|
|
||||||
|
|
||||||
|
@project_cli.command("exec")
|
||||||
|
def project_exec_cli(
|
||||||
|
# fmt: off
|
||||||
|
project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
|
||||||
|
subcommand: str = Arg(..., help="Name of command defined in project config"),
|
||||||
|
# fmt: on
|
||||||
|
):
|
||||||
|
"""Internals"""
|
||||||
|
project_exec(project_dir, subcommand)
|
||||||
|
|
||||||
|
|
||||||
|
def project_exec(project_dir: Path, subcommand: str):
|
||||||
config = load_project_config(project_dir)
|
config = load_project_config(project_dir)
|
||||||
config_commands = config.get("commands", [])
|
config_commands = config.get("commands", [])
|
||||||
variables = config.get("variables", {})
|
variables = config.get("variables", {})
|
||||||
commands = {cmd["name"]: cmd for cmd in config_commands}
|
commands = {cmd["name"]: cmd for cmd in config_commands}
|
||||||
if subcommand and subcommand not in commands:
|
|
||||||
msg.fail(f"Can't find command '{subcommand}' in project config", exits=1)
|
|
||||||
with working_dir(project_dir):
|
with working_dir(project_dir):
|
||||||
if subcommand is None:
|
run_commands(commands[subcommand]["script"], variables)
|
||||||
all_commands = config.get("run", [])
|
|
||||||
if not all_commands:
|
|
||||||
msg.warn("No run commands defined in project config", exits=0)
|
@project_cli.command("update-dvc")
|
||||||
msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
|
def project_update_dvc_cli(
|
||||||
for command in all_commands:
|
# fmt: off
|
||||||
if command not in commands:
|
project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
|
||||||
msg.fail(
|
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
|
||||||
f"Can't find command '{command}' in project config", exits=1
|
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
|
||||||
)
|
# fmt: on
|
||||||
msg.divider(command)
|
):
|
||||||
run_commands(commands[command]["script"], variables)
|
config = load_project_config(project_dir)
|
||||||
else:
|
updated = update_dvc_config(project_dir, config, verbose=verbose, force=force)
|
||||||
run_commands(commands[subcommand]["script"], variables)
|
if updated:
|
||||||
|
msg.good(f"Updated DVC config from {CONFIG_FILE}")
|
||||||
|
else:
|
||||||
|
msg.info(f"No changes found in {CONFIG_FILE}, no update needed")
|
||||||
|
|
||||||
|
|
||||||
app.add_typer(project_cli, name="project")
|
app.add_typer(project_cli, name="project")
|
||||||
|
@ -187,7 +277,63 @@ def load_project_config(path: Path) -> Dict[str, Any]:
|
||||||
return config
|
return config
|
||||||
|
|
||||||
|
|
||||||
def run_commands(commands: List[str] = tuple(), variables: Dict[str, str] = {}) -> None:
|
def update_dvc_config(
|
||||||
|
path: Path,
|
||||||
|
config: Dict[str, Any],
|
||||||
|
verbose: bool = False,
|
||||||
|
silent: bool = False,
|
||||||
|
force: bool = False,
|
||||||
|
) -> bool:
|
||||||
|
"""Re-run the DVC commands in dry mode and update the dvc.yaml file in the
|
||||||
|
project directory. The file is auto-generated based on the config.
|
||||||
|
"""
|
||||||
|
config_hash = get_hash(config)
|
||||||
|
dvc_config_path = path / DVC_CONFIG
|
||||||
|
if dvc_config_path.exists():
|
||||||
|
# Check if the file was generated using the current config; if not, regenerate it
|
||||||
|
with dvc_config_path.open("r", encoding="utf8") as f:
|
||||||
|
ref_hash = f.readline().strip().replace("# ", "")
|
||||||
|
if ref_hash == config_hash and not force:
|
||||||
|
return False # Nothing has changed in project config, don't need to update
|
||||||
|
dvc_config_path.unlink()
|
||||||
|
variables = config.get("variables", {})
|
||||||
|
commands = []
|
||||||
|
# We only want to include commands that are part of the main list of "run"
|
||||||
|
# commands in project.yml and should be run in sequence
|
||||||
|
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
||||||
|
for name in config.get("run", []):
|
||||||
|
if name not in config_commands:
|
||||||
|
msg.fail(f"Can't find command '{name}' in project config", exits=1)
|
||||||
|
command = config_commands[name]
|
||||||
|
deps = command.get("deps", [])
|
||||||
|
outputs = command.get("outputs", [])
|
||||||
|
outputs_no_cache = command.get("outputs_no_cache", [])
|
||||||
|
if not deps and not outputs and not outputs_no_cache:
|
||||||
|
continue
|
||||||
|
# Default to "." as the project path since dvc.yaml is auto-generated
|
||||||
|
# and we don't want arbitrary paths in there
|
||||||
|
project_cmd = ["python", "-m", NAME, "project", "exec", ".", name]
|
||||||
|
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
|
||||||
|
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
|
||||||
|
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
|
||||||
|
dvc_cmd = ["dvc", "run", "-n", name, "-w", str(path), "--no-exec"]
|
||||||
|
if verbose:
|
||||||
|
dvc_cmd.append("--verbose")
|
||||||
|
if silent:
|
||||||
|
dvc_cmd.append("--quiet")
|
||||||
|
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
|
||||||
|
commands.append(" ".join(full_cmd))
|
||||||
|
run_commands(commands, variables, silent=True)
|
||||||
|
with dvc_config_path.open("r+", encoding="utf8") as f:
|
||||||
|
content = f.read()
|
||||||
|
f.seek(0, 0)
|
||||||
|
f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def run_commands(
|
||||||
|
commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False
|
||||||
|
) -> None:
|
||||||
for command in commands:
|
for command in commands:
|
||||||
# Substitute variables, e.g. "./{NAME}.json"
|
# Substitute variables, e.g. "./{NAME}.json"
|
||||||
command = command.format(**variables)
|
command = command.format(**variables)
|
||||||
|
@ -195,7 +341,8 @@ def run_commands(commands: List[str] = tuple(), variables: Dict[str, str] = {})
|
||||||
# TODO: is this needed / a good idea?
|
# TODO: is this needed / a good idea?
|
||||||
if len(command) and command[0] == "python":
|
if len(command) and command[0] == "python":
|
||||||
command[0] = sys.executable
|
command[0] = sys.executable
|
||||||
print(" ".join(command))
|
if not silent:
|
||||||
|
print(" ".join(command))
|
||||||
run_command(command)
|
run_command(command)
|
||||||
|
|
||||||
|
|
||||||
|
@ -225,3 +372,7 @@ def check_clone_dest(dest: Path) -> None:
|
||||||
f"Can't clone project, parent directory doesn't exist: {dest.parent}",
|
f"Can't clone project, parent directory doesn't exist: {dest.parent}",
|
||||||
exits=1,
|
exits=1,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_hash(data) -> str:
|
||||||
|
return str(murmurhash.hash(srsly.json_dumps(data, sort_keys=True)))
|
||||||
|
|
Loading…
Reference in New Issue
Block a user