diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index ed1e840a5..127bba55a 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -397,7 +397,11 @@ def git_checkout(
run_command(cmd, capture=True)
# We need Path(name) to make sure we also support subdirectories
try:
- shutil.copytree(str(tmp_dir / Path(subpath)), str(dest))
+ source_path = tmp_dir / Path(subpath)
+ if not is_subpath_of(tmp_dir, source_path):
+ err = f"'{subpath}' is a path outside of the cloned repository."
+ msg.fail(err, repo, exits=1)
+ shutil.copytree(str(source_path), str(dest))
except FileNotFoundError:
err = f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')"
msg.fail(err, repo, exits=1)
@@ -445,8 +449,14 @@ def git_sparse_checkout(repo, subpath, dest, branch):
# And finally, we can checkout our subpath
cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
run_command(cmd, capture=True)
- # We need Path(name) to make sure we also support subdirectories
- shutil.move(str(tmp_dir / Path(subpath)), str(dest))
+
+ # Get a subdirectory of the cloned path, if appropriate
+ source_path = tmp_dir / Path(subpath)
+ if not is_subpath_of(tmp_dir, source_path):
+ err = f"'{subpath}' is a path outside of the cloned repository."
+ msg.fail(err, repo, exits=1)
+
+ shutil.move(str(source_path), str(dest))
def get_git_version(
@@ -477,6 +487,19 @@ def _http_to_git(repo: str) -> str:
return repo
+def is_subpath_of(parent, child):
+ """
+ Check whether `child` is a path contained within `parent`.
+ """
+ # Based on https://stackoverflow.com/a/37095733 .
+
+ # In Python 3.9, the `Path.is_relative_to()` method will supplant this, so
+ # we can stop using crusty old os.path functions.
+ parent_realpath = os.path.realpath(parent)
+ child_realpath = os.path.realpath(child)
+ return os.path.commonpath([parent_realpath, child_realpath]) == parent_realpath
+
+
def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
"""Parse a comma-separated string to a list and account for various
formatting options. Mostly used to handle CLI arguments that take a list of
diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py
index b49e18608..efa4d637a 100644
--- a/spacy/cli/project/assets.py
+++ b/spacy/cli/project/assets.py
@@ -59,6 +59,10 @@ def project_assets(project_dir: Path, *, sparse_checkout: bool = False) -> None:
shutil.rmtree(dest)
else:
dest.unlink()
+ if "repo" not in asset["git"] or asset["git"]["repo"] is None:
+ msg.fail("A git asset must include 'repo', the repository address.", exits=1)
+ if "path" not in asset["git"] or asset["git"]["path"] is None:
+ msg.fail("A git asset must include 'path' - use \"\" to get the entire repository.", exits=1)
git_checkout(
asset["git"]["repo"],
asset["git"]["path"],
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index e84159b64..ae4cec17d 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -9,6 +9,7 @@ from spacy.cli import info
from spacy.cli.init_config import init_config, RECOMMENDATIONS
from spacy.cli._util import validate_project_commands, parse_config_overrides
from spacy.cli._util import load_project_config, substitute_project_variables
+from spacy.cli._util import is_subpath_of
from spacy.cli._util import string_to_list
from spacy import about
from spacy.util import get_minor_version
@@ -542,3 +543,18 @@ def test_get_third_party_dependencies_runs():
# Test with component factory based on Cython module
nlp.add_pipe("tagger")
assert get_third_party_dependencies(nlp.config) == []
+
+
+@pytest.mark.parametrize(
+ "parent,child,expected",
+ [
+ ("/tmp", "/tmp", True),
+ ("/tmp", "/", False),
+ ("/tmp", "/tmp/subdir", True),
+ ("/tmp", "/tmpdir", False),
+ ("/tmp", "/tmp/subdir/..", True),
+ ("/tmp", "/tmp/..", False)
+ ],
+)
+def test_is_subpath_of(parent, child, expected):
+ assert is_subpath_of(parent, child) == expected
diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md
index a646989a5..6f6cef7c8 100644
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@@ -291,7 +291,7 @@ files you need and not the whole repo.
| Name | Description |
| ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
-| `git` | `repo`: The URL of the repo to download from.
`path`: Path of the file or directory to download, relative to the repo root.
`branch`: The branch to download from. Defaults to `"master"`. |
+| `git` | `repo`: The URL of the repo to download from.
`path`: Path of the file or directory to download, relative to the repo root. "" specifies the root directory.
`branch`: The branch to download from. Defaults to `"master"`. |
| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
| `description` | Optional asset description, used in [auto-generated docs](#custom-docs). |