diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index e8f3be995..c67863ef1 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -308,6 +308,31 @@ def git_checkout( msg.fail("Destination of checkout must not exist", exits=1) if not dest.parent.exists(): raise IOError("Parent of destination of checkout must exist") + + if sparse and git_version >= (2, 22): + return git_sparse_checkout(repo, subpath, dest, branch) + elif sparse: + # Only show warnings if the user explicitly wants sparse checkout but + # the Git version doesn't support it + err_old = ( + f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) " + f"that doesn't fully support sparse checkout yet." + ) + err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled." + msg.warn( + f"{err_unk if git_version == (0, 0) else err_old} " + f"This means that more files than necessary may be downloaded " + f"temporarily. To only download the files needed, make sure " + f"you're using Git v2.22 or above." + ) + with make_tempdir() as tmp_dir: + cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}" + ret = run_command(cmd, capture=True) + # We need Path(name) to make sure we also support subdirectories + shutil.copytree(str(tmp_dir / Path(subpath)), str(dest)) + + +def git_sparse_checkout(repo, subpath, dest, branch): # We're using Git, partial clone and sparse checkout to # only clone the files we need # This ends up being RIDICULOUS. omg. @@ -324,47 +349,31 @@ def git_checkout( # *that* we can do by path. # We're using Git and sparse checkout to only clone the files we need with make_tempdir() as tmp_dir: - supports_sparse = git_version >= (2, 22) - use_sparse = supports_sparse and sparse # This is the "clone, but don't download anything" part. - cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} " - if use_sparse: - cmd += f"--filter=blob:none" # <-- The key bit - # Only show warnings if the user explicitly wants sparse checkout but - # the Git version doesn't support it - elif sparse: - err_old = ( - f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) " - f"that doesn't fully support sparse checkout yet." - ) - err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled." - msg.warn( - f"{err_unk if git_version == (0, 0) else err_old} " - f"This means that more files than necessary may be downloaded " - f"temporarily. To only download the files needed, make sure " - f"you're using Git v2.22 or above." - ) - try_run_command(cmd) + cmd = ( + f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " + f"-b {branch} --filter=blob:none" + ) + run_command(cmd) # Now we need to find the missing filenames for the subpath we want. # Looking for this 'rev-list' command in the git --help? Hah. - cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if use_sparse else ''} -- {subpath}" - ret = try_run_command(cmd) + cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}" + ret = run_command(cmd, capture=True) git_repo = _from_http_to_git(repo) # Now pass those missings into another bit of git internals missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")]) - if use_sparse and not missings: + if not missings: err = ( f"Could not find any relevant files for '{subpath}'. " f"Did you specify a correct and complete path within repo '{repo}' " f"and branch {branch}?" ) msg.fail(err, exits=1) - if use_sparse: - cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}" - try_run_command(cmd) + cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}" + run_command(cmd, capture=True) # And finally, we can checkout our subpath cmd = f"git -C {tmp_dir} checkout {branch} {subpath}" - try_run_command(cmd) + run_command(cmd, capture=True) # We need Path(name) to make sure we also support subdirectories shutil.move(str(tmp_dir / Path(subpath)), str(dest)) @@ -378,7 +387,7 @@ def get_git_version( RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns (0, 0) if the version couldn't be determined. """ - ret = try_run_command(["git", "--version"], error=error) + ret = run_command("git --version", capture=True) stdout = ret.stdout.strip() if not stdout or not stdout.startswith("git version"): return (0, 0) @@ -386,23 +395,6 @@ def get_git_version( return (int(version[0]), int(version[1])) -def try_run_command( - cmd: Union[str, List[str]], error: str = "Could not run command" -) -> subprocess.CompletedProcess: - """Try running a command and raise an error if it fails. - - cmd (Union[str, List[str]]): The command to run. - error (str): The error message. - RETURNS (CompletedProcess): The completed process if the command ran. - """ - try: - return run_command(cmd, capture=True) - except subprocess.CalledProcessError as e: - msg.fail(error) - print(cmd) - sys.exit(1) - - def _from_http_to_git(repo: str) -> str: if repo.startswith("http://"): repo = repo.replace(r"http://", r"https://") diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 8d6cd84c1..49a0ab75d 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -110,7 +110,7 @@ def package( msg.good(f"Successfully created package '{model_name_v}'", main_path) if create_sdist: with util.working_dir(main_path): - util.run_command([sys.executable, "setup.py", "sdist"]) + util.run_command([sys.executable, "setup.py", "sdist"], capture=False) zip_file = main_path / "dist" / f"{model_name_v}.tar.gz" msg.good(f"Successfully created zipped Python package", zip_file) diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index eb7b8cc5b..13c28f1da 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -144,7 +144,7 @@ def run_commands( if not silent: print(f"Running command: {join_command(command)}") if not dry: - run_command(command) + run_command(command, capture=False) def validate_subcommand( diff --git a/spacy/util.py b/spacy/util.py index 88162b23a..93000ea27 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -254,7 +254,7 @@ def load_vectors_into_model( def load_vocab_data_into_model( - nlp: "Language", *, lookups: Optional["Lookups"]=None + nlp: "Language", *, lookups: Optional["Lookups"] = None ) -> None: """Load vocab data.""" if lookups: @@ -659,8 +659,8 @@ def join_command(command: List[str]) -> str: def run_command( command: Union[str, List[str]], *, - capture: bool = False, stdin: Optional[Any] = None, + capture: bool = False, ) -> Optional[subprocess.CompletedProcess]: """Run a command on the command line as a subprocess. If the subprocess returns a non-zero exit code, a system exit is performed. @@ -668,33 +668,46 @@ def run_command( command (str / List[str]): The command. If provided as a string, the string will be split using shlex.split. stdin (Optional[Any]): stdin to read from or None. - capture (bool): Whether to capture the output. + capture (bool): Whether to capture the output and errors. If False, + the stdout and stderr will not be redirected, and if there's an error, + sys.exit will be called with the returncode. You should use capture=False + when you want to turn over execution to the command, and capture=True + when you want to run the command more like a function. RETURNS (Optional[CompletedProcess]): The process object. """ if isinstance(command, str): - command = split_command(command) + cmd_list = split_command(command) + cmd_str = command + else: + cmd_list = command + cmd_str = " ".join(command) try: ret = subprocess.run( - command, + cmd_list, env=os.environ.copy(), input=stdin, encoding="utf8", - check=True, + check=False, stdout=subprocess.PIPE if capture else None, - stderr=subprocess.PIPE if capture else None, + stderr=subprocess.STDOUT if capture else None, ) except FileNotFoundError: + # Indicates the *command* wasn't found, it's an error before the command + # is run. raise FileNotFoundError( - Errors.E970.format(str_command=" ".join(command), tool=command[0]) + Errors.E970.format(str_command=cmd_str, tool=cmd_list[0]) ) from None - except subprocess.CalledProcessError as e: - # We don't want a duplicate traceback here so we're making sure the - # CalledProcessError isn't re-raised. We also print both the string - # message and the stderr, in case the error only has one of them. - print(e.stderr) - print(e) - sys.exit(1) - if ret.returncode != 0: + if ret.returncode != 0 and capture: + message = f"Error running command:\n\n{cmd_str}\n\n" + message += f"Subprocess exited with status {ret.returncode}" + if ret.stdout is not None: + message += f"\n\nProcess log (stdout and stderr):\n\n" + message += ret.stdout + error = subprocess.SubprocessError(message) + error.ret = ret + error.command = cmd_str + raise error + elif ret.returncode != 0: sys.exit(ret.returncode) return ret