Fix surprises when asking for the root of a git repo (#9074)

* Fix surprises when asking for the root of a git repo

In the case of the first asset I wanted to get from git, the data I
wanted was the entire repository. I tried leaving "path" blank, which
gave a less-than-helpful error, and then I tried `path: "/"`, which
started copying my entire filesystem into the project. The path I should
have used was "".

I've made two changes to make this smoother for others:

- The 'path' within a git clone defaults to ""
- If the path points outside of the tmpdir that the git clone goes
into, we fail with an error

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* use a descriptive error instead of a default

plus some minor fixes from PR review

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* check for None values in assets

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

Co-authored-by: Elia Robyn Speer <elia@explosion.ai>
This commit is contained in:
Robyn Speer 2021-09-01 16:52:08 -04:00 committed by GitHub
parent ba6a37d358
commit d60b748e3c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 47 additions and 4 deletions

View File

@ -397,7 +397,11 @@ def git_checkout(
run_command(cmd, capture=True) run_command(cmd, capture=True)
# We need Path(name) to make sure we also support subdirectories # We need Path(name) to make sure we also support subdirectories
try: try:
shutil.copytree(str(tmp_dir / Path(subpath)), str(dest)) source_path = tmp_dir / Path(subpath)
if not is_subpath_of(tmp_dir, source_path):
err = f"'{subpath}' is a path outside of the cloned repository."
msg.fail(err, repo, exits=1)
shutil.copytree(str(source_path), str(dest))
except FileNotFoundError: except FileNotFoundError:
err = f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')" err = f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')"
msg.fail(err, repo, exits=1) msg.fail(err, repo, exits=1)
@ -445,8 +449,14 @@ def git_sparse_checkout(repo, subpath, dest, branch):
# And finally, we can checkout our subpath # And finally, we can checkout our subpath
cmd = f"git -C {tmp_dir} checkout {branch} {subpath}" cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
run_command(cmd, capture=True) run_command(cmd, capture=True)
# We need Path(name) to make sure we also support subdirectories
shutil.move(str(tmp_dir / Path(subpath)), str(dest)) # Get a subdirectory of the cloned path, if appropriate
source_path = tmp_dir / Path(subpath)
if not is_subpath_of(tmp_dir, source_path):
err = f"'{subpath}' is a path outside of the cloned repository."
msg.fail(err, repo, exits=1)
shutil.move(str(source_path), str(dest))
def get_git_version( def get_git_version(
@ -477,6 +487,19 @@ def _http_to_git(repo: str) -> str:
return repo return repo
def is_subpath_of(parent, child):
"""
Check whether `child` is a path contained within `parent`.
"""
# Based on https://stackoverflow.com/a/37095733 .
# In Python 3.9, the `Path.is_relative_to()` method will supplant this, so
# we can stop using crusty old os.path functions.
parent_realpath = os.path.realpath(parent)
child_realpath = os.path.realpath(child)
return os.path.commonpath([parent_realpath, child_realpath]) == parent_realpath
def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]: def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
"""Parse a comma-separated string to a list and account for various """Parse a comma-separated string to a list and account for various
formatting options. Mostly used to handle CLI arguments that take a list of formatting options. Mostly used to handle CLI arguments that take a list of

View File

@ -59,6 +59,10 @@ def project_assets(project_dir: Path, *, sparse_checkout: bool = False) -> None:
shutil.rmtree(dest) shutil.rmtree(dest)
else: else:
dest.unlink() dest.unlink()
if "repo" not in asset["git"] or asset["git"]["repo"] is None:
msg.fail("A git asset must include 'repo', the repository address.", exits=1)
if "path" not in asset["git"] or asset["git"]["path"] is None:
msg.fail("A git asset must include 'path' - use \"\" to get the entire repository.", exits=1)
git_checkout( git_checkout(
asset["git"]["repo"], asset["git"]["repo"],
asset["git"]["path"], asset["git"]["path"],

View File

@ -9,6 +9,7 @@ from spacy.cli import info
from spacy.cli.init_config import init_config, RECOMMENDATIONS from spacy.cli.init_config import init_config, RECOMMENDATIONS
from spacy.cli._util import validate_project_commands, parse_config_overrides from spacy.cli._util import validate_project_commands, parse_config_overrides
from spacy.cli._util import load_project_config, substitute_project_variables from spacy.cli._util import load_project_config, substitute_project_variables
from spacy.cli._util import is_subpath_of
from spacy.cli._util import string_to_list from spacy.cli._util import string_to_list
from spacy import about from spacy import about
from spacy.util import get_minor_version from spacy.util import get_minor_version
@ -542,3 +543,18 @@ def test_get_third_party_dependencies_runs():
# Test with component factory based on Cython module # Test with component factory based on Cython module
nlp.add_pipe("tagger") nlp.add_pipe("tagger")
assert get_third_party_dependencies(nlp.config) == [] assert get_third_party_dependencies(nlp.config) == []
@pytest.mark.parametrize(
"parent,child,expected",
[
("/tmp", "/tmp", True),
("/tmp", "/", False),
("/tmp", "/tmp/subdir", True),
("/tmp", "/tmpdir", False),
("/tmp", "/tmp/subdir/..", True),
("/tmp", "/tmp/..", False)
],
)
def test_is_subpath_of(parent, child, expected):
assert is_subpath_of(parent, child) == expected

View File

@ -291,7 +291,7 @@ files you need and not the whole repo.
| Name | Description | | Name | Description |
| ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. | | `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
| `git` | `repo`: The URL of the repo to download from.<br />`path`: Path of the file or directory to download, relative to the repo root.<br />`branch`: The branch to download from. Defaults to `"master"`. | | `git` | `repo`: The URL of the repo to download from.<br />`path`: Path of the file or directory to download, relative to the repo root. "" specifies the root directory.<br />`branch`: The branch to download from. Defaults to `"master"`. |
| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. | | `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
| `description` | Optional asset description, used in [auto-generated docs](#custom-docs). | | `description` | Optional asset description, used in [auto-generated docs](#custom-docs). |