diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 77cf36852..e07e58503 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -1,5 +1,6 @@
 from typing import Dict, Any, Union, List, Optional, TYPE_CHECKING
 import sys
+import shutil
 from pathlib import Path
 from wasabi import msg
 import srsly
@@ -11,7 +12,7 @@ from thinc.config import Config, ConfigValidationError
 from configparser import InterpolationError
 
 from ..schemas import ProjectConfigSchema, validate
-from ..util import import_file
+from ..util import import_file, run_command, make_tempdir
 
 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
@@ -311,3 +312,30 @@ def ensure_pathy(path):
     from pathy import Pathy  # noqa: F811
 
     return Pathy(path)
+
+
+def git_sparse_checkout(repo: str, subpath: str, dest: Path):
+    """Copy one subdirectory of a Git repo to dest using a sparse checkout.
+
+    repo (str): The repository to clone (handed to `git clone` as-is).
+    subpath (str): Path inside the repository to fetch.
+    dest (Path): Where the subpath is moved to. Must not exist yet, but its
+        parent directory must. Raises IOError otherwise.
+    """
+    if dest.exists():
+        raise IOError("Destination of checkout must not exist")
+    if not dest.parent.exists():
+        raise IOError("Parent of destination of checkout must exist")
+    # We're using Git and sparse checkout to only clone the files we need
+    with make_tempdir() as tmp_dir:
+        # Build the command as a list (like the fetch/checkout calls below)
+        # so a repo or temp path containing spaces stays a single argument
+        cmd = ["git", "clone", repo, str(tmp_dir), "--no-checkout"]
+        cmd += ["--depth", "1", "--config", "core.sparseCheckout=true"]
+        run_command(cmd)
+        with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
+            f.write(subpath)
+        run_command(["git", "-C", str(tmp_dir), "fetch"])
+        run_command(["git", "-C", str(tmp_dir), "checkout"])
+        # We need Path(subpath) to make sure we also support subdirectories
+        shutil.move(str(tmp_dir / Path(subpath)), str(dest))
diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py
index 60cf95160..3905c0976 100644
--- a/spacy/cli/project/assets.py
+++ b/spacy/cli/project/assets.py
@@ -7,7 +7,7 @@ import requests
 
 from ...util import ensure_path, working_dir
 from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum
-from .._util import download_file
+from .._util import download_file, git_sparse_checkout
 
 
 # TODO: find a solution for caches
@@ -45,14 +45,17 @@ def project_assets(project_dir: Path) -> None:
         msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
     msg.info(f"Fetching {len(assets)} asset(s)")
     for asset in assets:
-        dest = asset["dest"]
-        url = asset.get("url")
+        dest = Path(asset["dest"])
         checksum = asset.get("checksum")
-        if not url:
-            # project.yml defines asset without URL that the user has to place
-            check_private_asset(dest, checksum)
-            continue
-        fetch_asset(project_path, url, dest, checksum)
+        if "git" in asset:
+            git_sparse_checkout(asset["git"]["repo"], asset["git"]["path"], dest)
+        else:
+            url = asset.get("url")
+            if not url:
+                # project.yml defines asset without URL that the user has to place
+                check_private_asset(dest, checksum)
+                continue
+            fetch_asset(project_path, url, dest, checksum)
 
 
 def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
diff --git a/spacy/cli/project/clone.py b/spacy/cli/project/clone.py
index bb9ba74cb..5317c3f78 100644
--- a/spacy/cli/project/clone.py
+++ b/spacy/cli/project/clone.py
@@ -6,8 +6,9 @@ import shutil
 import re
 
 from ... import about
-from ...util import ensure_path, run_command, make_tempdir
+from ...util import ensure_path
 from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
+from .._util import git_sparse_checkout
 
 
 @project_cli.command("clone")
@@ -39,24 +40,11 @@ def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> N
     check_clone(name, dest, repo)
     project_dir = dest.resolve()
     repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
-    # We're using Git and sparse checkout to only clone the files we need
-    with make_tempdir() as tmp_dir:
-        cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
-        try:
-            run_command(cmd)
-        except subprocess.CalledProcessError:
-            err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
-            msg.fail(err)
-        with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
-            f.write(name)
-        try:
-            run_command(["git", "-C", str(tmp_dir), "fetch"])
-            run_command(["git", "-C", str(tmp_dir), "checkout"])
-        except subprocess.CalledProcessError:
-            err = f"Could not clone '{name}' from repo '{repo_name}'"
-            msg.fail(err)
-        # We need Path(name) to make sure we also support subdirectories
-        shutil.move(str(tmp_dir / Path(name)), str(project_dir))
+    try:
+        git_sparse_checkout(repo, name, dest)
+    except subprocess.CalledProcessError:
+        err = f"Could not clone '{name}' from repo '{repo_name}'"
+        msg.fail(err, exits=1)
     msg.good(f"Cloned '{name}' from {repo_name}", project_dir)
     if not (project_dir / PROJECT_FILE).exists():
         msg.warn(f"No {PROJECT_FILE} found in directory")