Fix sparse checkout and error handling

This commit is contained in:
Ines Montani 2020-09-14 14:12:58 +02:00
parent b854e0bef9
commit c052017025
4 changed files with 28 additions and 16 deletions

View File

@ -300,7 +300,9 @@ def ensure_pathy(path):
return Pathy(path)
def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "master"):
def git_checkout(
repo: str, subpath: str, dest: Path, *, branch: str = "master", sparse: bool = False
):
git_version = get_git_version()
if dest.exists():
msg.fail("Destination of checkout must not exist", exits=1)
@ -323,11 +325,14 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m
# We're using Git and sparse checkout to only clone the files we need
with make_tempdir() as tmp_dir:
supports_sparse = git_version >= (2, 22)
use_sparse = supports_sparse and sparse
# This is the "clone, but don't download anything" part.
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} "
if supports_sparse:
if use_sparse:
cmd += f"--filter=blob:none" # <-- The key bit
else:
# Only show warnings if the user explicitly wants sparse checkout but
# the Git version doesn't support it
elif sparse:
err_old = (
f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
f"that doesn't fully support sparse checkout yet."
@ -342,19 +347,19 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m
try_run_command(cmd)
# Now we need to find the missing filenames for the subpath we want.
# Looking for this 'rev-list' command in the git --help? Hah.
cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if supports_sparse else ''} -- {subpath}"
cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if use_sparse else ''} -- {subpath}"
ret = try_run_command(cmd)
git_repo = _from_http_to_git(repo)
# Now pass those missings into another bit of git internals
missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
if supports_sparse and not missings:
if use_sparse and not missings:
err = (
f"Could not find any relevant files for '{subpath}'. "
f"Did you specify a correct and complete path within repo '{repo}' "
f"and branch {branch}?"
)
msg.fail(err, exits=1)
if supports_sparse:
if use_sparse:
cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
try_run_command(cmd)
# And finally, we can checkout our subpath

View File

@ -6,14 +6,15 @@ import shutil
import requests
from ...util import ensure_path, working_dir
from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum
from .._util import download_file, git_sparse_checkout, get_git_version
from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
from .._util import get_checksum, download_file, git_checkout, get_git_version
@project_cli.command("assets")
def project_assets_cli(
# fmt: off
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+.")
# fmt: on
):
"""Fetch project assets like datasets and pretrained weights. Assets are
@ -23,10 +24,10 @@ def project_assets_cli(
DOCS: https://nightly.spacy.io/api/cli#project-assets
"""
project_assets(project_dir)
project_assets(project_dir, sparse_checkout=sparse_checkout)
def project_assets(project_dir: Path) -> None:
def project_assets(project_dir: Path, *, sparse_checkout: bool = False) -> None:
"""Fetch assets for a project using DVC if possible.
project_dir (Path): Path to project directory.
@ -58,11 +59,12 @@ def project_assets(project_dir: Path) -> None:
shutil.rmtree(dest)
else:
dest.unlink()
git_sparse_checkout(
git_checkout(
asset["git"]["repo"],
asset["git"]["path"],
dest,
branch=asset["git"].get("branch"),
sparse=sparse_checkout,
)
else:
url = asset.get("url")

View File

@ -7,7 +7,7 @@ import re
from ... import about
from ...util import ensure_path
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
from .._util import git_sparse_checkout, get_git_version
from .._util import git_checkout, get_git_version
@project_cli.command("clone")
@ -16,7 +16,8 @@ def project_clone_cli(
name: str = Arg(..., help="The name of the template to clone"),
dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to clone from"),
branch: str = Opt(about.__projects_branch__, "--branch", "-b", help="The branch to clone from")
branch: str = Opt(about.__projects_branch__, "--branch", "-b", help="The branch to clone from"),
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+.")
# fmt: on
):
"""Clone a project template from a repository. Calls into "git" and will
@ -28,7 +29,7 @@ def project_clone_cli(
"""
if dest is None:
dest = Path.cwd() / Path(name).parts[-1]
project_clone(name, dest, repo=repo, branch=branch)
project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout)
def project_clone(
@ -37,6 +38,7 @@ def project_clone(
*,
repo: str = about.__projects__,
branch: str = about.__projects_branch__,
sparse_checkout: bool = False,
) -> None:
"""Clone a project template from a repository.
@ -50,7 +52,7 @@ def project_clone(
project_dir = dest.resolve()
repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
try:
git_sparse_checkout(repo, name, dest, branch=branch)
git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
except subprocess.CalledProcessError:
err = f"Could not clone '{name}' from repo '{repo_name}'"
msg.fail(err, exits=1)

View File

@ -680,7 +680,10 @@ def run_command(
Errors.E970.format(str_command=" ".join(command), tool=command[0])
) from None
except subprocess.CalledProcessError as e:
# We don't want a duplicate traceback here
# We don't want a duplicate traceback here so we're making sure the
# CalledProcessError isn't re-raised. We also print both the string
# message and the stderr, in case the error only has one of them.
print(e.stderr)
print(e)
sys.exit(1)
if ret.returncode != 0: