mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Fix surprises when asking for the root of a git repo (#9074)
* Fix surprises when asking for the root of a git repo In the case of the first asset I wanted to get from git, the data I wanted was the entire repository. I tried leaving "path" blank, which gave a less-than-helpful error, and then I tried `path: "/"`, which started copying my entire filesystem into the project. The path I should have used was "". I've made two changes to make this smoother for others: - The 'path' within a git clone defaults to "" - If the path points outside of the tmpdir that the git clone goes into, we fail with an error Signed-off-by: Elia Robyn Speer <elia@explosion.ai> * use a descriptive error instead of a default plus some minor fixes from PR review Signed-off-by: Elia Robyn Speer <elia@explosion.ai> * check for None values in assets Signed-off-by: Elia Robyn Speer <elia@explosion.ai> Co-authored-by: Elia Robyn Speer <elia@explosion.ai>
This commit is contained in:
parent
ba6a37d358
commit
d60b748e3c
|
@ -397,7 +397,11 @@ def git_checkout(
|
|||
run_command(cmd, capture=True)
|
||||
# We need Path(name) to make sure we also support subdirectories
|
||||
try:
|
||||
shutil.copytree(str(tmp_dir / Path(subpath)), str(dest))
|
||||
source_path = tmp_dir / Path(subpath)
|
||||
if not is_subpath_of(tmp_dir, source_path):
|
||||
err = f"'{subpath}' is a path outside of the cloned repository."
|
||||
msg.fail(err, repo, exits=1)
|
||||
shutil.copytree(str(source_path), str(dest))
|
||||
except FileNotFoundError:
|
||||
err = f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')"
|
||||
msg.fail(err, repo, exits=1)
|
||||
|
@ -445,8 +449,14 @@ def git_sparse_checkout(repo, subpath, dest, branch):
|
|||
# And finally, we can checkout our subpath
|
||||
cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
|
||||
run_command(cmd, capture=True)
|
||||
# We need Path(name) to make sure we also support subdirectories
|
||||
shutil.move(str(tmp_dir / Path(subpath)), str(dest))
|
||||
|
||||
# Get a subdirectory of the cloned path, if appropriate
|
||||
source_path = tmp_dir / Path(subpath)
|
||||
if not is_subpath_of(tmp_dir, source_path):
|
||||
err = f"'{subpath}' is a path outside of the cloned repository."
|
||||
msg.fail(err, repo, exits=1)
|
||||
|
||||
shutil.move(str(source_path), str(dest))
|
||||
|
||||
|
||||
def get_git_version(
|
||||
|
@ -477,6 +487,19 @@ def _http_to_git(repo: str) -> str:
|
|||
return repo
|
||||
|
||||
|
||||
def is_subpath_of(parent, child):
|
||||
"""
|
||||
Check whether `child` is a path contained within `parent`.
|
||||
"""
|
||||
# Based on https://stackoverflow.com/a/37095733 .
|
||||
|
||||
# In Python 3.9, the `Path.is_relative_to()` method will supplant this, so
|
||||
# we can stop using crusty old os.path functions.
|
||||
parent_realpath = os.path.realpath(parent)
|
||||
child_realpath = os.path.realpath(child)
|
||||
return os.path.commonpath([parent_realpath, child_realpath]) == parent_realpath
|
||||
|
||||
|
||||
def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
|
||||
"""Parse a comma-separated string to a list and account for various
|
||||
formatting options. Mostly used to handle CLI arguments that take a list of
|
||||
|
|
|
@ -59,6 +59,10 @@ def project_assets(project_dir: Path, *, sparse_checkout: bool = False) -> None:
|
|||
shutil.rmtree(dest)
|
||||
else:
|
||||
dest.unlink()
|
||||
if "repo" not in asset["git"] or asset["git"]["repo"] is None:
|
||||
msg.fail("A git asset must include 'repo', the repository address.", exits=1)
|
||||
if "path" not in asset["git"] or asset["git"]["path"] is None:
|
||||
msg.fail("A git asset must include 'path' - use \"\" to get the entire repository.", exits=1)
|
||||
git_checkout(
|
||||
asset["git"]["repo"],
|
||||
asset["git"]["path"],
|
||||
|
|
|
@ -9,6 +9,7 @@ from spacy.cli import info
|
|||
from spacy.cli.init_config import init_config, RECOMMENDATIONS
|
||||
from spacy.cli._util import validate_project_commands, parse_config_overrides
|
||||
from spacy.cli._util import load_project_config, substitute_project_variables
|
||||
from spacy.cli._util import is_subpath_of
|
||||
from spacy.cli._util import string_to_list
|
||||
from spacy import about
|
||||
from spacy.util import get_minor_version
|
||||
|
@ -542,3 +543,18 @@ def test_get_third_party_dependencies_runs():
|
|||
# Test with component factory based on Cython module
|
||||
nlp.add_pipe("tagger")
|
||||
assert get_third_party_dependencies(nlp.config) == []
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"parent,child,expected",
|
||||
[
|
||||
("/tmp", "/tmp", True),
|
||||
("/tmp", "/", False),
|
||||
("/tmp", "/tmp/subdir", True),
|
||||
("/tmp", "/tmpdir", False),
|
||||
("/tmp", "/tmp/subdir/..", True),
|
||||
("/tmp", "/tmp/..", False)
|
||||
],
|
||||
)
|
||||
def test_is_subpath_of(parent, child, expected):
|
||||
assert is_subpath_of(parent, child) == expected
|
||||
|
|
|
@ -291,7 +291,7 @@ files you need and not the whole repo.
|
|||
| Name | Description |
|
||||
| ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
|
||||
| `git` | `repo`: The URL of the repo to download from.<br />`path`: Path of the file or directory to download, relative to the repo root.<br />`branch`: The branch to download from. Defaults to `"master"`. |
|
||||
| `git` | `repo`: The URL of the repo to download from.<br />`path`: Path of the file or directory to download, relative to the repo root. "" specifies the root directory.<br />`branch`: The branch to download from. Defaults to `"master"`. |
|
||||
| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
|
||||
| `description` | Optional asset description, used in [auto-generated docs](#custom-docs). |
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user