Improve asset fetching

Collect the paths of all fetched assets first and run `dvc add` once for all of them, so that DVC shows only one progress bar and prints one combined git command (if the project is a git repo).
This commit is contained in:
Ines Montani 2020-06-29 16:55:24 +02:00
parent 7c08713baa
commit 126050f259

View File

@ -295,15 +295,21 @@ def project_assets(project_dir: Path) -> None:
msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0) msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0)
msg.info(f"Fetching {len(assets)} asset(s)") msg.info(f"Fetching {len(assets)} asset(s)")
variables = config.get("variables", {}) variables = config.get("variables", {})
fetched_assets = []
for asset in assets: for asset in assets:
url = asset["url"].format(**variables) url = asset["url"].format(**variables)
dest = asset["dest"].format(**variables) dest = asset["dest"].format(**variables)
fetch_asset(project_path, url, dest, asset.get("checksum")) fetched_path = fetch_asset(project_path, url, dest, asset.get("checksum"))
if fetched_path:
fetched_assets.append(str(fetched_path))
if fetched_assets:
with working_dir(project_path):
run_command(["dvc", "add", *fetched_assets, "--external"])
def fetch_asset( def fetch_asset(
project_path: Path, url: str, dest: Path, checksum: Optional[str] = None project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
) -> None: ) -> Optional[Path]:
"""Fetch an asset from a given URL or path. Will try to import the file """Fetch an asset from a given URL or path. Will try to import the file
using DVC's import-url if possible (fully tracked and versioned) and falls using DVC's import-url if possible (fully tracked and versioned) and falls
back to get-url (versioned) and a non-DVC download if necessary. If a back to get-url (versioned) and a non-DVC download if necessary. If a
@ -313,6 +319,8 @@ def fetch_asset(
project_path (Path): Path to project directory. project_path (Path): Path to project directory.
url (str): URL or path to asset. url (str): URL or path to asset.
checksum (Optional[str]): Optional expected checksum of local file. checksum (Optional[str]): Optional expected checksum of local file.
RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
the asset failed.
""" """
url = convert_asset_url(url) url = convert_asset_url(url)
dest_path = (project_path / dest).resolve() dest_path = (project_path / dest).resolve()
@ -321,8 +329,7 @@ def fetch_asset(
# TODO: add support for caches (dvc import-url with local path) # TODO: add support for caches (dvc import-url with local path)
if checksum == get_checksum(dest_path): if checksum == get_checksum(dest_path):
msg.good(f"Skipping download with matching checksum: {dest}") msg.good(f"Skipping download with matching checksum: {dest}")
return return dest_path
dvc_add_cmd = ["dvc", "add", str(dest_path), "--external"]
with working_dir(project_path): with working_dir(project_path):
try: try:
# If these fail, we don't want to output an error or info message. # If these fail, we don't want to output an error or info message.
@ -334,16 +341,16 @@ def fetch_asset(
except subprocess.CalledProcessError: except subprocess.CalledProcessError:
dvc_cmd = ["dvc", "get-url", url, str(dest_path)] dvc_cmd = ["dvc", "get-url", url, str(dest_path)]
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL)) print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
run_command(dvc_add_cmd)
except subprocess.CalledProcessError: except subprocess.CalledProcessError:
try: try:
download_file(url, dest_path) download_file(url, dest_path)
except requests.exceptions.HTTPError as e: except requests.exceptions.HTTPError as e:
msg.fail(f"Download failed: {dest}", e) msg.fail(f"Download failed: {dest}", e)
run_command(dvc_add_cmd) return None
if checksum and checksum != get_checksum(dest_path): if checksum and checksum != get_checksum(dest_path):
msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}") msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}")
msg.good(f"Fetched asset {dest}") msg.good(f"Fetched asset {dest}")
return dest_path
def project_run_all(project_dir: Path, *dvc_args) -> None: def project_run_all(project_dir: Path, *dvc_args) -> None: