From c0520170257e7eca660ab23707d08203b500482a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 14 Sep 2020 14:12:58 +0200 Subject: [PATCH] Fix sparse checkout and error handling --- spacy/cli/_util.py | 17 +++++++++++------ spacy/cli/project/assets.py | 12 +++++++----- spacy/cli/project/clone.py | 10 ++++++---- spacy/util.py | 5 ++++- 4 files changed, 28 insertions(+), 16 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 649c2b373..e8f3be995 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -300,7 +300,9 @@ def ensure_pathy(path): return Pathy(path) -def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "master"): +def git_checkout( + repo: str, subpath: str, dest: Path, *, branch: str = "master", sparse: bool = False +): git_version = get_git_version() if dest.exists(): msg.fail("Destination of checkout must not exist", exits=1) @@ -323,11 +325,14 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m # We're using Git and sparse checkout to only clone the files we need with make_tempdir() as tmp_dir: supports_sparse = git_version >= (2, 22) + use_sparse = supports_sparse and sparse # This is the "clone, but don't download anything" part. cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} " - if supports_sparse: + if use_sparse: cmd += f"--filter=blob:none" # <-- The key bit - else: + # Only show warnings if the user explicitly wants sparse checkout but + # the Git version doesn't support it + elif sparse: err_old = ( f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) " f"that doesn't fully support sparse checkout yet." @@ -342,19 +347,19 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m try_run_command(cmd) # Now we need to find the missing filenames for the subpath we want. # Looking for this 'rev-list' command in the git --help? Hah. - cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if supports_sparse else ''} -- {subpath}" + cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if use_sparse else ''} -- {subpath}" ret = try_run_command(cmd) git_repo = _from_http_to_git(repo) # Now pass those missings into another bit of git internals missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")]) - if supports_sparse and not missings: + if use_sparse and not missings: err = ( f"Could not find any relevant files for '{subpath}'. " f"Did you specify a correct and complete path within repo '{repo}' " f"and branch {branch}?" ) msg.fail(err, exits=1) - if supports_sparse: + if use_sparse: cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}" try_run_command(cmd) # And finally, we can checkout our subpath diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py index a8b607e05..8a3aaff25 100644 --- a/spacy/cli/project/assets.py +++ b/spacy/cli/project/assets.py @@ -6,14 +6,15 @@ import shutil import requests from ...util import ensure_path, working_dir -from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum -from .._util import download_file, git_sparse_checkout, get_git_version +from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config +from .._util import get_checksum, download_file, git_checkout, get_git_version @project_cli.command("assets") def project_assets_cli( # fmt: off project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), + sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+.") # fmt: on ): """Fetch project assets like datasets and pretrained weights. Assets are @@ -23,10 +24,10 @@ def project_assets_cli( DOCS: https://nightly.spacy.io/api/cli#project-assets """ - project_assets(project_dir) + project_assets(project_dir, sparse_checkout=sparse_checkout) -def project_assets(project_dir: Path) -> None: +def project_assets(project_dir: Path, *, sparse_checkout: bool = False) -> None: """Fetch assets for a project using DVC if possible. project_dir (Path): Path to project directory. @@ -58,11 +59,12 @@ def project_assets(project_dir: Path) -> None: shutil.rmtree(dest) else: dest.unlink() - git_sparse_checkout( + git_checkout( asset["git"]["repo"], asset["git"]["path"], dest, branch=asset["git"].get("branch"), + sparse=sparse_checkout, ) else: url = asset.get("url") diff --git a/spacy/cli/project/clone.py b/spacy/cli/project/clone.py index f691e523c..851fc444a 100644 --- a/spacy/cli/project/clone.py +++ b/spacy/cli/project/clone.py @@ -7,7 +7,7 @@ import re from ... import about from ...util import ensure_path from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE -from .._util import git_sparse_checkout, get_git_version +from .._util import git_checkout, get_git_version @project_cli.command("clone") @@ -16,7 +16,8 @@ def project_clone_cli( name: str = Arg(..., help="The name of the template to clone"), dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False), repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to clone from"), - branch: str = Opt(about.__projects_branch__, "--branch", "-b", help="The branch to clone from") + branch: str = Opt(about.__projects_branch__, "--branch", "-b", help="The branch to clone from"), + sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+.") # fmt: on ): """Clone a project template from a repository. Calls into "git" and will @@ -28,7 +29,7 @@ def project_clone_cli( """ if dest is None: dest = Path.cwd() / Path(name).parts[-1] - project_clone(name, dest, repo=repo, branch=branch) + project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout) def project_clone( @@ -37,6 +38,7 @@ def project_clone( *, repo: str = about.__projects__, branch: str = about.__projects_branch__, + sparse_checkout: bool = False, ) -> None: """Clone a project template from a repository. @@ -50,7 +52,7 @@ def project_clone( project_dir = dest.resolve() repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo) try: - git_sparse_checkout(repo, name, dest, branch=branch) + git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout) except subprocess.CalledProcessError: err = f"Could not clone '{name}' from repo '{repo_name}'" msg.fail(err, exits=1) diff --git a/spacy/util.py b/spacy/util.py index d1df1f92a..18b34e4d6 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -680,7 +680,10 @@ def run_command( Errors.E970.format(str_command=" ".join(command), tool=command[0]) ) from None except subprocess.CalledProcessError as e: - # We don't want a duplicate traceback here + # We don't want a duplicate traceback here so we're making sure the + # CalledProcessError isn't re-raised. We also print both the string + # message and the stderr, in case the error only has one of them. + print(e.stderr) print(e) sys.exit(1) if ret.returncode != 0: