Update DVC integration

This commit is contained in:
Ines Montani 2020-06-27 14:15:41 +02:00
parent 7a0fe50610
commit c96b4a37b6
2 changed files with 66 additions and 16 deletions

View File

@ -10,6 +10,7 @@ import re
import shutil import shutil
import sys import sys
import murmurhash import murmurhash
import hashlib
from ._app import app, Arg, Opt, COMMAND, NAME from ._app import app, Arg, Opt, COMMAND, NAME
from .. import about from .. import about
@ -67,11 +68,12 @@ def project_clone_cli(
dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False), dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False),
repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."), repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
no_init: bool = Opt(False, "--no-init", "-NI", help="Don't initialize the project with DVC"),
verbose: bool = Opt(False, "--verbose", "-V", help="Show detailed information") verbose: bool = Opt(False, "--verbose", "-V", help="Show detailed information")
# fmt: on # fmt: on
): ):
"""Clone a project template from a repository.""" """Clone a project template from a repository."""
project_clone(name, dest, repo=repo, git=git, verbose=verbose) project_clone(name, dest, repo=repo, git=git, no_init=no_init, verbose=verbose)
def project_clone( def project_clone(
@ -80,6 +82,7 @@ def project_clone(
*, *,
repo: str = about.__projects__, repo: str = about.__projects__,
git: bool = False, git: bool = False,
no_init: bool = False,
verbose: bool = False, verbose: bool = False,
) -> None: ) -> None:
dest = ensure_path(dest) dest = ensure_path(dest)
@ -99,6 +102,25 @@ def project_clone(
dir_path = dest / sub_dir dir_path = dest / sub_dir
if not dir_path.exists(): if not dir_path.exists():
dir_path.mkdir(parents=True) dir_path.mkdir(parents=True)
if not no_init:
project_init(dest, git=git)
msg.good(f"Your project is now ready!", dest.resolve())
print(f"To fetch the assets, run:\npython -m {NAME} project assets {dest}")
@project_cli.command("init")
def project_init_cli(
    # fmt: off
    path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False),
    git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
    # fmt: on
) -> None:
    """Initialize a project directory with DVC and Git (optional). This should
    typically be taken care of automatically when you run the "project clone"
    command.
    """
    # Thin CLI wrapper: the argument declarations above define the command-line
    # interface, and the actual work is delegated to project_init so it can
    # also be called programmatically (project_clone does so unless no_init
    # is set).
    project_init(path, git=git)
def project_init(dest: Path, *, git: bool = False):
with working_dir(dest): with working_dir(dest):
# TODO: check that .dvc exists in other commands? # TODO: check that .dvc exists in other commands?
init_cmd = ["dvc", "init"] init_cmd = ["dvc", "init"]
@ -107,26 +129,27 @@ def project_clone(
if git: if git:
run_command(["git", "init"]) run_command(["git", "init"])
run_command(init_cmd) run_command(init_cmd)
msg.good(f"Your project is now ready!", dest.resolve()) # TODO: find a better solution for this?
print(f"To fetch the assets, run:\npython -m {NAME} project assets {dest}") run_command(["dvc", "config", "core.analytics", "false"])
@project_cli.command("assets") @project_cli.command("assets")
def project_assets_cli( def project_assets_cli(
# fmt: off # fmt: off
path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False), path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False),
dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't download anything"),
# fmt: on # fmt: on
): ):
"""Use Data Version Control to get the assets for the project.""" """Use Data Version Control to get the assets for the project."""
project_assets(path, dry=dry) project_assets(path)
def project_assets(project_path: Path, *, dry: bool = False) -> None: def project_assets(project_path: Path) -> None:
if dry:
msg.warn("Performing a dry run and not downloading anything")
project_path = ensure_path(project_path) project_path = ensure_path(project_path)
config = load_project_config(project_path) config = load_project_config(project_path)
with msg.loading("Updating DVC config..."):
updated = update_dvc_config(project_path, config, silent=True)
if updated:
msg.good(f"Updated DVC config from changed {CONFIG_FILE}")
assets = config.get("assets", {}) assets = config.get("assets", {})
if not assets: if not assets:
msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0) msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0)
@ -135,12 +158,30 @@ def project_assets(project_path: Path, *, dry: bool = False) -> None:
for asset in assets: for asset in assets:
url = asset["url"].format(**variables) url = asset["url"].format(**variables)
dest = asset["dest"].format(**variables) dest = asset["dest"].format(**variables)
dest_path = project_path / dest fetch_asset(project_path, url, dest, asset.get("checksum"))
check_asset(url)
if not dry:
cmd = ["dvc", "get-url", url, str(dest_path)] def fetch_asset(project_path: Path, url: str, dest: Path, checksum: str = None):
run_command(cmd) check_asset(url)
msg.good(f"Fetched asset {dest}") dest_path = project_path / dest
if dest_path.exists() and checksum:
# If there's already a file, check for checksum
# TODO: add support for caches
if checksum == get_checksum(dest_path):
msg.good(f"Skipping download with matching checksum: {dest}")
return
with working_dir(project_path):
try:
dvc_cmd = ["dvc", "get-url", url, str(dest_path)]
# If this fails, we don't want to output an error or info message
out = subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL)
print(out)
except subprocess.CalledProcessError:
# TODO: Can we read out Weak ETags error?
# TODO: replace curl
run_command(["curl", url, "--output", str(dest_path)])
run_command(["dvc", "add", str(dest_path)])
msg.good(f"Fetched asset {dest}")
@project_cli.command( @project_cli.command(
@ -168,7 +209,8 @@ def project_run_all(project_dir: Path, *dvc_args) -> None:
if updated: if updated:
msg.good(f"Updated DVC config from changed {CONFIG_FILE}") msg.good(f"Updated DVC config from changed {CONFIG_FILE}")
dvc_cmd = ["dvc", "repro", *dvc_args] dvc_cmd = ["dvc", "repro", *dvc_args]
run_command(dvc_cmd) with working_dir(project_dir):
run_command(dvc_cmd)
@project_cli.command( @project_cli.command(
@ -323,7 +365,8 @@ def update_dvc_config(
dvc_cmd.append("--quiet") dvc_cmd.append("--quiet")
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd] full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
commands.append(" ".join(full_cmd)) commands.append(" ".join(full_cmd))
run_commands(commands, variables, silent=True) with working_dir(path):
run_commands(commands, variables, silent=True)
with dvc_config_path.open("r+", encoding="utf8") as f: with dvc_config_path.open("r+", encoding="utf8") as f:
content = f.read() content = f.read()
f.seek(0, 0) f.seek(0, 0)
@ -376,3 +419,7 @@ def check_clone_dest(dest: Path) -> None:
def get_hash(data) -> str: def get_hash(data) -> str:
return str(murmurhash.hash(srsly.json_dumps(data, sort_keys=True))) return str(murmurhash.hash(srsly.json_dumps(data, sort_keys=True)))
def get_checksum(path: Path) -> str:
return hashlib.md5(path.read_bytes()).hexdigest()

View File

@ -220,8 +220,11 @@ class TrainingSchema(BaseModel):
class ProjectConfigAsset(BaseModel): class ProjectConfigAsset(BaseModel):
# fmt: off
dest: StrictStr = Field(..., title="Destination of downloaded asset") dest: StrictStr = Field(..., title="Destination of downloaded asset")
url: StrictStr = Field(..., title="URL of asset") url: StrictStr = Field(..., title="URL of asset")
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
# fmt: on
class ProjectConfigCommand(BaseModel): class ProjectConfigCommand(BaseModel):