From 2771e4f2b3bfcc19f6c11a6801d95f4bb595c029 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2020 04:00:14 +0200 Subject: [PATCH] Fix the git "sparse checkout" functionality (#5973) * Fix the git sparse checkout functionality * Format --- spacy/cli/_util.py | 40 ++++++++++++++++++++++++++++++---------- spacy/scorer.py | 4 ++-- spacy/util.py | 16 ++++++++++++---- 3 files changed, 44 insertions(+), 16 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 4ed316219..ebd3809fd 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -322,21 +322,41 @@ def git_sparse_checkout( if dest.exists(): msg.fail("Destination of checkout must not exist", exits=1) if not dest.parent.exists(): - msg.fail("Parent of destination of checkout must exist", exits=1) + raise IOError("Parent of destination of checkout must exist") + # We're using Git, partial clone and sparse checkout to + # only clone the files we need + # This ends up being RIDICULOUS. omg. + # So, every tutorial and SO post talks about 'sparse checkout'...But they + # go and *clone* the whole repo. Worthless. And cloning part of a repo + # turns out to be completely broken. The only way to specify a "path" is.. + # a path *on the server*? The contents of which, specifies the paths. Wat. + # Obviously this is hopelessly broken and insecure, because you can query + # arbitrary paths on the server! So nobody enables this. + # What we have to do is disable *all* files. We could then just checkout + # the path, and it'd "work", but be hopelessly slow...Because it goes and + # transfers every missing object one-by-one. So the final piece is that we + # need to use some weird git internals to fetch the missings in bulk, and + # *that* we can do by path. # We're using Git and sparse checkout to only clone the files we need with make_tempdir() as tmp_dir: + # This is the "clone, but don't download anything" part. cmd = ( - f"git clone {repo} {tmp_dir} --no-checkout " - "--depth 1 --config core.sparseCheckout=true" + f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " + "--filter=blob:none" # <-- The key bit ) if branch is not None: cmd = f"{cmd} -b {branch}" - run_command(cmd) - with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f: - f.write(subpath) - run_command(["git", "-C", str(tmp_dir), "fetch"]) - run_command(["git", "-C", str(tmp_dir), "checkout"]) + run_command(cmd, capture=True) + # Now we need to find the missing filenames for the subpath we want. + # Looking for this 'rev-list' command in the git --help? Hah. + cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}" + ret = run_command(cmd, capture=True) + missings = "\n".join([x[1:] for x in ret.stdout.split() if x.startswith("?")]) + # Now pass those missings into another bit of git internals + run_command( + f"git -C {tmp_dir} fetch-pack --stdin {repo}", capture=True, stdin=missings + ) + # And finally, we can checkout our subpath + run_command(f"git -C {tmp_dir} checkout {branch} {subpath}") # We need Path(name) to make sure we also support subdirectories shutil.move(str(tmp_dir / Path(subpath)), str(dest)) - print(dest) - print(list(dest.iterdir())) diff --git a/spacy/scorer.py b/spacy/scorer.py index 28975ad43..dc017f82f 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -30,11 +30,11 @@ class PRFScore: @property def precision(self) -> float: - return (self.tp / (self.tp + self.fp + 1e-100)) + return self.tp / (self.tp + self.fp + 1e-100) @property def recall(self) -> float: - return (self.tp / (self.tp + self.fn + 1e-100)) + return self.tp / (self.tp + self.fn + 1e-100) @property def fscore(self) -> float: diff --git a/spacy/util.py b/spacy/util.py index 736f4d805..eb40dfa21 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -572,7 +572,7 @@ def join_command(command: List[str]) -> str: return " ".join(shlex.quote(cmd) for cmd in command) -def run_command(command: Union[str, List[str]]) -> None: +def run_command(command: Union[str, List[str]], *, capture=False, stdin=None) -> None: """Run a command on the command line as a subprocess. If the subprocess returns a non-zero exit code, a system exit is performed. @@ -582,13 +582,21 @@ def run_command(command: Union[str, List[str]]) -> None: if isinstance(command, str): command = split_command(command) try: - status = subprocess.call(command, env=os.environ.copy()) + ret = subprocess.run( + command, + env=os.environ.copy(), + capture_output=capture, + input=stdin, + text=True, + check=True, + ) except FileNotFoundError: raise FileNotFoundError( Errors.E970.format(str_command=" ".join(command), tool=command[0]) ) from None - if status != 0: - sys.exit(status) + if ret.returncode != 0: + sys.exit(ret.returncode) + return ret @contextmanager