Support git_sparse_checkout for assets

This commit is contained in:
Matthew Honnibal 2020-08-24 20:11:56 +02:00
parent 99d31e7662
commit e1457d5806
3 changed files with 42 additions and 28 deletions

View File

@ -1,5 +1,6 @@
from typing import Dict, Any, Union, List, Optional, TYPE_CHECKING
import sys
import shutil
from pathlib import Path
from wasabi import msg
import srsly
@ -11,7 +12,7 @@ from thinc.config import Config, ConfigValidationError
from configparser import InterpolationError
from ..schemas import ProjectConfigSchema, validate
from ..util import import_file
from ..util import import_file, run_command, make_tempdir
if TYPE_CHECKING:
from pathy import Pathy # noqa: F401
@ -311,3 +312,24 @@ def ensure_pathy(path):
from pathy import Pathy # noqa: F811
return Pathy(path)
def git_sparse_checkout(repo: str, subpath: str, dest: Path):
if dest.exists():
raise IOError("Destination of checkout must not exist")
if not dest.parent.exists():
raise IOError("Parent of destination of checkout must exist")
# We're using Git and sparse checkout to only clone the files we need
with make_tempdir() as tmp_dir:
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
run_command(cmd)
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
f.write(subpath)
run_command(["git", "-C", str(tmp_dir), "fetch"])
run_command(["git", "-C", str(tmp_dir), "checkout"])
# We need Path(name) to make sure we also support subdirectories
shutil.move(str(tmp_dir / Path(subpath)), str(dest))
print(dest)
print(list(dest.iterdir()))

View File

@ -7,7 +7,7 @@ import requests
from ...util import ensure_path, working_dir
from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum
from .._util import download_file
from .._util import download_file, git_sparse_checkout
# TODO: find a solution for caches
@ -45,14 +45,18 @@ def project_assets(project_dir: Path) -> None:
msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
msg.info(f"Fetching {len(assets)} asset(s)")
for asset in assets:
dest = asset["dest"]
url = asset.get("url")
dest = Path(asset["dest"])
checksum = asset.get("checksum")
if not url:
# project.yml defines asset without URL that the user has to place
check_private_asset(dest, checksum)
continue
fetch_asset(project_path, url, dest, checksum)
if "git" in asset:
print(dest)
git_sparse_checkout(asset["git"]["repo"], asset["git"]["path"], dest)
else:
url = asset.get("url")
if not url:
# project.yml defines asset without URL that the user has to place
check_private_asset(dest, checksum)
continue
fetch_asset(project_path, url, dest, checksum)
def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:

View File

@ -6,8 +6,9 @@ import shutil
import re
from ... import about
from ...util import ensure_path, run_command, make_tempdir
from ...util import ensure_path
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
from .._util import git_sparse_checkout
@project_cli.command("clone")
@ -39,24 +40,11 @@ def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> N
check_clone(name, dest, repo)
project_dir = dest.resolve()
repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
# We're using Git and sparse checkout to only clone the files we need
with make_tempdir() as tmp_dir:
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
try:
run_command(cmd)
except subprocess.CalledProcessError:
err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
msg.fail(err)
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
f.write(name)
try:
run_command(["git", "-C", str(tmp_dir), "fetch"])
run_command(["git", "-C", str(tmp_dir), "checkout"])
except subprocess.CalledProcessError:
err = f"Could not clone '{name}' from repo '{repo_name}'"
msg.fail(err)
# We need Path(name) to make sure we also support subdirectories
shutil.move(str(tmp_dir / Path(name)), str(project_dir))
try:
git_sparse_checkout(repo, name, dest)
except subprocess.CalledProcessError:
err = f"Could not clone '{name}' from repo '{repo_name}'"
msg.fail(err)
msg.good(f"Cloned '{name}' from {repo_name}", project_dir)
if not (project_dir / PROJECT_FILE).exists():
msg.warn(f"No {PROJECT_FILE} found in directory")