mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-24 16:24:16 +03:00
Fix the git "sparse checkout" functionality (#5973)
* Fix the git sparse checkout functionality * Format
This commit is contained in:
parent
fdcaf86c54
commit
2771e4f2b3
|
@ -322,21 +322,41 @@ def git_sparse_checkout(
|
|||
if dest.exists():
|
||||
msg.fail("Destination of checkout must not exist", exits=1)
|
||||
if not dest.parent.exists():
|
||||
msg.fail("Parent of destination of checkout must exist", exits=1)
|
||||
raise IOError("Parent of destination of checkout must exist")
|
||||
# We're using Git, partial clone and sparse checkout to
|
||||
# only clone the files we need
|
||||
# This ends up being RIDICULOUS. omg.
|
||||
# So, every tutorial and SO post talks about 'sparse checkout'...But they
|
||||
# go and *clone* the whole repo. Worthless. And cloning part of a repo
|
||||
# turns out to be completely broken. The only way to specify a "path" is..
|
||||
# a path *on the server*? The contents of which, specifies the paths. Wat.
|
||||
# Obviously this is hopelessly broken and insecure, because you can query
|
||||
# arbitrary paths on the server! So nobody enables this.
|
||||
# What we have to do is disable *all* files. We could then just checkout
|
||||
# the path, and it'd "work", but be hopelessly slow...Because it goes and
|
||||
# transfers every missing object one-by-one. So the final piece is that we
|
||||
# need to use some weird git internals to fetch the missings in bulk, and
|
||||
# *that* we can do by path.
|
||||
# We're using Git and sparse checkout to only clone the files we need
|
||||
with make_tempdir() as tmp_dir:
|
||||
# This is the "clone, but don't download anything" part.
|
||||
cmd = (
|
||||
f"git clone {repo} {tmp_dir} --no-checkout "
|
||||
"--depth 1 --config core.sparseCheckout=true"
|
||||
f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
|
||||
"--filter=blob:none" # <-- The key bit
|
||||
)
|
||||
if branch is not None:
|
||||
cmd = f"{cmd} -b {branch}"
|
||||
run_command(cmd)
|
||||
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
|
||||
f.write(subpath)
|
||||
run_command(["git", "-C", str(tmp_dir), "fetch"])
|
||||
run_command(["git", "-C", str(tmp_dir), "checkout"])
|
||||
run_command(cmd, capture=True)
|
||||
# Now we need to find the missing filenames for the subpath we want.
|
||||
# Looking for this 'rev-list' command in the git --help? Hah.
|
||||
cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
|
||||
ret = run_command(cmd, capture=True)
|
||||
missings = "\n".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
|
||||
# Now pass those missings into another bit of git internals
|
||||
run_command(
|
||||
f"git -C {tmp_dir} fetch-pack --stdin {repo}", capture=True, stdin=missings
|
||||
)
|
||||
# And finally, we can checkout our subpath
|
||||
run_command(f"git -C {tmp_dir} checkout {branch} {subpath}")
|
||||
# We need Path(name) to make sure we also support subdirectories
|
||||
shutil.move(str(tmp_dir / Path(subpath)), str(dest))
|
||||
print(dest)
|
||||
print(list(dest.iterdir()))
|
||||
|
|
|
@ -30,11 +30,11 @@ class PRFScore:
|
|||
|
||||
@property
|
||||
def precision(self) -> float:
|
||||
return (self.tp / (self.tp + self.fp + 1e-100))
|
||||
return self.tp / (self.tp + self.fp + 1e-100)
|
||||
|
||||
@property
|
||||
def recall(self) -> float:
|
||||
return (self.tp / (self.tp + self.fn + 1e-100))
|
||||
return self.tp / (self.tp + self.fn + 1e-100)
|
||||
|
||||
@property
|
||||
def fscore(self) -> float:
|
||||
|
|
|
@ -572,7 +572,7 @@ def join_command(command: List[str]) -> str:
|
|||
return " ".join(shlex.quote(cmd) for cmd in command)
|
||||
|
||||
|
||||
def run_command(command: Union[str, List[str]]) -> None:
|
||||
def run_command(command: Union[str, List[str]], *, capture=False, stdin=None) -> None:
|
||||
"""Run a command on the command line as a subprocess. If the subprocess
|
||||
returns a non-zero exit code, a system exit is performed.
|
||||
|
||||
|
@ -582,13 +582,21 @@ def run_command(command: Union[str, List[str]]) -> None:
|
|||
if isinstance(command, str):
|
||||
command = split_command(command)
|
||||
try:
|
||||
status = subprocess.call(command, env=os.environ.copy())
|
||||
ret = subprocess.run(
|
||||
command,
|
||||
env=os.environ.copy(),
|
||||
capture_output=capture,
|
||||
input=stdin,
|
||||
text=True,
|
||||
check=True,
|
||||
)
|
||||
except FileNotFoundError:
|
||||
raise FileNotFoundError(
|
||||
Errors.E970.format(str_command=" ".join(command), tool=command[0])
|
||||
) from None
|
||||
if status != 0:
|
||||
sys.exit(status)
|
||||
if ret.returncode != 0:
|
||||
sys.exit(ret.returncode)
|
||||
return ret
|
||||
|
||||
|
||||
@contextmanager
|
||||
|
|
Loading…
Reference in New Issue
Block a user