From d60b748e3c760c092ed9f8d45ab1a730a5329b73 Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Wed, 1 Sep 2021 16:52:08 -0400 Subject: [PATCH] Fix surprises when asking for the root of a git repo (#9074) * Fix surprises when asking for the root of a git repo In the case of the first asset I wanted to get from git, the data I wanted was the entire repository. I tried leaving "path" blank, which gave a less-than-helpful error, and then I tried `path: "/"`, which started copying my entire filesystem into the project. The path I should have used was "". I've made two changes to make this smoother for others: - The 'path' within a git clone defaults to "" - If the path points outside of the tmpdir that the git clone goes into, we fail with an error Signed-off-by: Elia Robyn Speer * use a descriptive error instead of a default plus some minor fixes from PR review Signed-off-by: Elia Robyn Speer * check for None values in assets Signed-off-by: Elia Robyn Speer Co-authored-by: Elia Robyn Speer --- spacy/cli/_util.py | 29 ++++++++++++++++++++++++++--- spacy/cli/project/assets.py | 4 ++++ spacy/tests/test_cli.py | 16 ++++++++++++++++ website/docs/usage/projects.md | 2 +- 4 files changed, 47 insertions(+), 4 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index ed1e840a5..127bba55a 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -397,7 +397,11 @@ def git_checkout( run_command(cmd, capture=True) # We need Path(name) to make sure we also support subdirectories try: - shutil.copytree(str(tmp_dir / Path(subpath)), str(dest)) + source_path = tmp_dir / Path(subpath) + if not is_subpath_of(tmp_dir, source_path): + err = f"'{subpath}' is a path outside of the cloned repository." + msg.fail(err, repo, exits=1) + shutil.copytree(str(source_path), str(dest)) except FileNotFoundError: err = f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')" msg.fail(err, repo, exits=1) @@ -445,8 +449,14 @@ def git_sparse_checkout(repo, subpath, dest, branch): # And finally, we can checkout our subpath cmd = f"git -C {tmp_dir} checkout {branch} {subpath}" run_command(cmd, capture=True) - # We need Path(name) to make sure we also support subdirectories - shutil.move(str(tmp_dir / Path(subpath)), str(dest)) + + # Get a subdirectory of the cloned path, if appropriate + source_path = tmp_dir / Path(subpath) + if not is_subpath_of(tmp_dir, source_path): + err = f"'{subpath}' is a path outside of the cloned repository." + msg.fail(err, repo, exits=1) + + shutil.move(str(source_path), str(dest)) def get_git_version( @@ -477,6 +487,19 @@ def _http_to_git(repo: str) -> str: return repo +def is_subpath_of(parent, child): + """ + Check whether `child` is a path contained within `parent`. + """ + # Based on https://stackoverflow.com/a/37095733 . + + # In Python 3.9, the `Path.is_relative_to()` method will supplant this, so + # we can stop using crusty old os.path functions. + parent_realpath = os.path.realpath(parent) + child_realpath = os.path.realpath(child) + return os.path.commonpath([parent_realpath, child_realpath]) == parent_realpath + + def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]: """Parse a comma-separated string to a list and account for various formatting options. Mostly used to handle CLI arguments that take a list of diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py index b49e18608..efa4d637a 100644 --- a/spacy/cli/project/assets.py +++ b/spacy/cli/project/assets.py @@ -59,6 +59,10 @@ def project_assets(project_dir: Path, *, sparse_checkout: bool = False) -> None: shutil.rmtree(dest) else: dest.unlink() + if "repo" not in asset["git"] or asset["git"]["repo"] is None: + msg.fail("A git asset must include 'repo', the repository address.", exits=1) + if "path" not in asset["git"] or asset["git"]["path"] is None: + msg.fail("A git asset must include 'path' - use \"\" to get the entire repository.", exits=1) git_checkout( asset["git"]["repo"], asset["git"]["path"], diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index e84159b64..ae4cec17d 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -9,6 +9,7 @@ from spacy.cli import info from spacy.cli.init_config import init_config, RECOMMENDATIONS from spacy.cli._util import validate_project_commands, parse_config_overrides from spacy.cli._util import load_project_config, substitute_project_variables +from spacy.cli._util import is_subpath_of from spacy.cli._util import string_to_list from spacy import about from spacy.util import get_minor_version @@ -542,3 +543,18 @@ def test_get_third_party_dependencies_runs(): # Test with component factory based on Cython module nlp.add_pipe("tagger") assert get_third_party_dependencies(nlp.config) == [] + + +@pytest.mark.parametrize( + "parent,child,expected", + [ + ("/tmp", "/tmp", True), + ("/tmp", "/", False), + ("/tmp", "/tmp/subdir", True), + ("/tmp", "/tmpdir", False), + ("/tmp", "/tmp/subdir/..", True), + ("/tmp", "/tmp/..", False) + ], +) +def test_is_subpath_of(parent, child, expected): + assert is_subpath_of(parent, child) == expected diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index a646989a5..6f6cef7c8 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -291,7 +291,7 @@ files you need and not the whole repo. | Name | Description | | ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. | -| `git` | `repo`: The URL of the repo to download from.
`path`: Path of the file or directory to download, relative to the repo root.
`branch`: The branch to download from. Defaults to `"master"`. | +| `git` | `repo`: The URL of the repo to download from.
`path`: Path of the file or directory to download, relative to the repo root. "" specifies the root directory.
`branch`: The branch to download from. Defaults to `"master"`. | | `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. | | `description` | Optional asset description, used in [auto-generated docs](#custom-docs). |