mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
Fix the git "sparse checkout" functionality (#5973)
* Fix the git sparse checkout functionality
* Format
This commit is contained in:
parent
fdcaf86c54
commit
2771e4f2b3
|
@ -322,21 +322,41 @@ def git_sparse_checkout(
|
||||||
if dest.exists():
|
if dest.exists():
|
||||||
msg.fail("Destination of checkout must not exist", exits=1)
|
msg.fail("Destination of checkout must not exist", exits=1)
|
||||||
if not dest.parent.exists():
|
if not dest.parent.exists():
|
||||||
msg.fail("Parent of destination of checkout must exist", exits=1)
|
raise IOError("Parent of destination of checkout must exist")
|
||||||
|
# We're using Git, partial clone and sparse checkout to
|
||||||
|
# only clone the files we need
|
||||||
|
# This ends up being RIDICULOUS. omg.
|
||||||
|
# So, every tutorial and SO post talks about 'sparse checkout'...But they
|
||||||
|
# go and *clone* the whole repo. Worthless. And cloning part of a repo
|
||||||
|
# turns out to be completely broken. The only way to specify a "path" is..
|
||||||
|
# a path *on the server*? The contents of which, specifies the paths. Wat.
|
||||||
|
# Obviously this is hopelessly broken and insecure, because you can query
|
||||||
|
# arbitrary paths on the server! So nobody enables this.
|
||||||
|
# What we have to do is disable *all* files. We could then just checkout
|
||||||
|
# the path, and it'd "work", but be hopelessly slow...Because it goes and
|
||||||
|
# transfers every missing object one-by-one. So the final piece is that we
|
||||||
|
# need to use some weird git internals to fetch the missings in bulk, and
|
||||||
|
# *that* we can do by path.
|
||||||
# We're using Git and sparse checkout to only clone the files we need
|
# We're using Git and sparse checkout to only clone the files we need
|
||||||
with make_tempdir() as tmp_dir:
|
with make_tempdir() as tmp_dir:
|
||||||
|
# This is the "clone, but don't download anything" part.
|
||||||
cmd = (
|
cmd = (
|
||||||
f"git clone {repo} {tmp_dir} --no-checkout "
|
f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
|
||||||
"--depth 1 --config core.sparseCheckout=true"
|
"--filter=blob:none" # <-- The key bit
|
||||||
)
|
)
|
||||||
if branch is not None:
|
if branch is not None:
|
||||||
cmd = f"{cmd} -b {branch}"
|
cmd = f"{cmd} -b {branch}"
|
||||||
run_command(cmd)
|
run_command(cmd, capture=True)
|
||||||
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
|
# Now we need to find the missing filenames for the subpath we want.
|
||||||
f.write(subpath)
|
# Looking for this 'rev-list' command in the git --help? Hah.
|
||||||
run_command(["git", "-C", str(tmp_dir), "fetch"])
|
cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
|
||||||
run_command(["git", "-C", str(tmp_dir), "checkout"])
|
ret = run_command(cmd, capture=True)
|
||||||
|
missings = "\n".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
|
||||||
|
# Now pass those missings into another bit of git internals
|
||||||
|
run_command(
|
||||||
|
f"git -C {tmp_dir} fetch-pack --stdin {repo}", capture=True, stdin=missings
|
||||||
|
)
|
||||||
|
# And finally, we can checkout our subpath
|
||||||
|
run_command(f"git -C {tmp_dir} checkout {branch} {subpath}")
|
||||||
# We need Path(name) to make sure we also support subdirectories
|
# We need Path(name) to make sure we also support subdirectories
|
||||||
shutil.move(str(tmp_dir / Path(subpath)), str(dest))
|
shutil.move(str(tmp_dir / Path(subpath)), str(dest))
|
||||||
print(dest)
|
|
||||||
print(list(dest.iterdir()))
|
|
||||||
|
|
|
@ -30,11 +30,11 @@ class PRFScore:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def precision(self) -> float:
|
def precision(self) -> float:
|
||||||
return (self.tp / (self.tp + self.fp + 1e-100))
|
return self.tp / (self.tp + self.fp + 1e-100)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def recall(self) -> float:
|
def recall(self) -> float:
|
||||||
return (self.tp / (self.tp + self.fn + 1e-100))
|
return self.tp / (self.tp + self.fn + 1e-100)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def fscore(self) -> float:
|
def fscore(self) -> float:
|
||||||
|
|
|
@ -572,7 +572,7 @@ def join_command(command: List[str]) -> str:
|
||||||
return " ".join(shlex.quote(cmd) for cmd in command)
|
return " ".join(shlex.quote(cmd) for cmd in command)
|
||||||
|
|
||||||
|
|
||||||
def run_command(command: Union[str, List[str]]) -> None:
|
def run_command(command: Union[str, List[str]], *, capture=False, stdin=None) -> None:
|
||||||
"""Run a command on the command line as a subprocess. If the subprocess
|
"""Run a command on the command line as a subprocess. If the subprocess
|
||||||
returns a non-zero exit code, a system exit is performed.
|
returns a non-zero exit code, a system exit is performed.
|
||||||
|
|
||||||
|
@ -582,13 +582,21 @@ def run_command(command: Union[str, List[str]]) -> None:
|
||||||
if isinstance(command, str):
|
if isinstance(command, str):
|
||||||
command = split_command(command)
|
command = split_command(command)
|
||||||
try:
|
try:
|
||||||
status = subprocess.call(command, env=os.environ.copy())
|
ret = subprocess.run(
|
||||||
|
command,
|
||||||
|
env=os.environ.copy(),
|
||||||
|
capture_output=capture,
|
||||||
|
input=stdin,
|
||||||
|
text=True,
|
||||||
|
check=True,
|
||||||
|
)
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
raise FileNotFoundError(
|
raise FileNotFoundError(
|
||||||
Errors.E970.format(str_command=" ".join(command), tool=command[0])
|
Errors.E970.format(str_command=" ".join(command), tool=command[0])
|
||||||
) from None
|
) from None
|
||||||
if status != 0:
|
if ret.returncode != 0:
|
||||||
sys.exit(status)
|
sys.exit(ret.returncode)
|
||||||
|
return ret
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
|
|
Loading…
Reference in New Issue
Block a user