diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index f104feff9..33404b0b7 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -1,4 +1,4 @@ -from typing import Dict, Any, Union, List, Optional, Tuple, Iterable +from typing import Dict, Any, Union, List, Optional, Iterable from typing import TYPE_CHECKING, overload import sys import shutil @@ -11,16 +11,13 @@ from click import NoSuchOption from click.parser import split_arg_string from typer.main import get_command from contextlib import contextmanager -from thinc.api import Config, ConfigValidationError, require_gpu +from thinc.api import ConfigValidationError, require_gpu from thinc.util import gpu_is_available from configparser import InterpolationError import os from ..compat import Literal -from ..schemas import ProjectConfigSchema, validate -from ..util import import_file, run_command, make_tempdir, registry, logger -from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS -from .. import about +from ..util import import_file, registry, logger, ENV_VARS if TYPE_CHECKING: from pathy import FluidPath # noqa: F401 @@ -30,7 +27,6 @@ SDIST_SUFFIX = ".tar.gz" WHEEL_SUFFIX = "-py3-none-any.whl" PROJECT_FILE = "project.yml" -PROJECT_LOCK = "project.lock" COMMAND = "python -m spacy" NAME = "spacy" HELP = """spaCy Command-line Interface @@ -135,115 +131,6 @@ def _parse_override(value: Any) -> Any: return str(value) -def load_project_config( - path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict() -) -> Dict[str, Any]: - """Load the project.yml file from a directory and validate it. Also make - sure that all directories defined in the config exist. - - path (Path): The path to the project directory. - interpolate (bool): Whether to substitute project variables. - overrides (Dict[str, Any]): Optional config overrides. - RETURNS (Dict[str, Any]): The loaded project.yml. - """ - config_path = path / PROJECT_FILE - if not config_path.exists(): - msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1) - invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct." - try: - config = srsly.read_yaml(config_path) - except ValueError as e: - msg.fail(invalid_err, e, exits=1) - errors = validate(ProjectConfigSchema, config) - if errors: - msg.fail(invalid_err) - print("\n".join(errors)) - sys.exit(1) - validate_project_version(config) - validate_project_commands(config) - if interpolate: - err = f"{PROJECT_FILE} validation error" - with show_validation_error(title=err, hint_fill=False): - config = substitute_project_variables(config, overrides) - # Make sure directories defined in config exist - for subdir in config.get("directories", []): - dir_path = path / subdir - if not dir_path.exists(): - dir_path.mkdir(parents=True) - return config - - -def substitute_project_variables( - config: Dict[str, Any], - overrides: Dict[str, Any] = SimpleFrozenDict(), - key: str = "vars", - env_key: str = "env", -) -> Dict[str, Any]: - """Interpolate variables in the project file using the config system. - - config (Dict[str, Any]): The project config. - overrides (Dict[str, Any]): Optional config overrides. - key (str): Key containing variables in project config. - env_key (str): Key containing environment variable mapping in project config. - RETURNS (Dict[str, Any]): The interpolated project config. - """ - config.setdefault(key, {}) - config.setdefault(env_key, {}) - # Substitute references to env vars with their values - for config_var, env_var in config[env_key].items(): - config[env_key][config_var] = _parse_override(os.environ.get(env_var, "")) - # Need to put variables in the top scope again so we can have a top-level - # section "project" (otherwise, a list of commands in the top scope wouldn't) - # be allowed by Thinc's config system - cfg = Config({"project": config, key: config[key], env_key: config[env_key]}) - cfg = Config().from_str(cfg.to_str(), overrides=overrides) - interpolated = cfg.interpolate() - return dict(interpolated["project"]) - - -def validate_project_version(config: Dict[str, Any]) -> None: - """If the project defines a compatible spaCy version range, chec that it's - compatible with the current version of spaCy. - - config (Dict[str, Any]): The loaded config. - """ - spacy_version = config.get("spacy_version", None) - if spacy_version and not is_compatible_version(about.__version__, spacy_version): - err = ( - f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) " - f"that's not compatible with the version of spaCy you're running " - f"({about.__version__}). You can edit version requirement in the " - f"{PROJECT_FILE} to load it, but the project may not run as expected." - ) - msg.fail(err, exits=1) - - -def validate_project_commands(config: Dict[str, Any]) -> None: - """Check that project commands and workflows are valid, don't contain - duplicates, don't clash and only refer to commands that exist. - - config (Dict[str, Any]): The loaded config. - """ - command_names = [cmd["name"] for cmd in config.get("commands", [])] - workflows = config.get("workflows", {}) - duplicates = set([cmd for cmd in command_names if command_names.count(cmd) > 1]) - if duplicates: - err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}" - msg.fail(err, exits=1) - for workflow_name, workflow_steps in workflows.items(): - if workflow_name in command_names: - err = f"Can't use workflow name '{workflow_name}': name already exists as a command" - msg.fail(err, exits=1) - for step in workflow_steps: - if step not in command_names: - msg.fail( - f"Unknown command specified in workflow '{workflow_name}': {step}", - f"Workflows can only refer to commands defined in the 'commands' " - f"section of the {PROJECT_FILE}.", - exits=1, - ) - - def get_hash(data, exclude: Iterable[str] = tuple()) -> str: """Get the hash for a JSON-serializable object. @@ -381,158 +268,6 @@ def ensure_pathy(path): return Pathy.fluid(path) -def git_checkout( - repo: str, subpath: str, dest: Path, *, branch: str = "master", sparse: bool = False -): - git_version = get_git_version() - if dest.exists(): - msg.fail("Destination of checkout must not exist", exits=1) - if not dest.parent.exists(): - msg.fail("Parent of destination of checkout must exist", exits=1) - if sparse and git_version >= (2, 22): - return git_sparse_checkout(repo, subpath, dest, branch) - elif sparse: - # Only show warnings if the user explicitly wants sparse checkout but - # the Git version doesn't support it - err_old = ( - f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) " - f"that doesn't fully support sparse checkout yet." - ) - err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled." - msg.warn( - f"{err_unk if git_version == (0, 0) else err_old} " - f"This means that more files than necessary may be downloaded " - f"temporarily. To only download the files needed, make sure " - f"you're using Git v2.22 or above." - ) - with make_tempdir() as tmp_dir: - cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}" - run_command(cmd, capture=True) - # We need Path(name) to make sure we also support subdirectories - try: - source_path = tmp_dir / Path(subpath) - if not is_subpath_of(tmp_dir, source_path): - err = f"'{subpath}' is a path outside of the cloned repository." - msg.fail(err, repo, exits=1) - shutil.copytree(str(source_path), str(dest)) - except FileNotFoundError: - err = f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')" - msg.fail(err, repo, exits=1) - - -def git_sparse_checkout(repo, subpath, dest, branch): - # We're using Git, partial clone and sparse checkout to - # only clone the files we need - # This ends up being RIDICULOUS. omg. - # So, every tutorial and SO post talks about 'sparse checkout'...But they - # go and *clone* the whole repo. Worthless. And cloning part of a repo - # turns out to be completely broken. The only way to specify a "path" is.. - # a path *on the server*? The contents of which, specifies the paths. Wat. - # Obviously this is hopelessly broken and insecure, because you can query - # arbitrary paths on the server! So nobody enables this. - # What we have to do is disable *all* files. We could then just checkout - # the path, and it'd "work", but be hopelessly slow...Because it goes and - # transfers every missing object one-by-one. So the final piece is that we - # need to use some weird git internals to fetch the missings in bulk, and - # *that* we can do by path. - # We're using Git and sparse checkout to only clone the files we need - with make_tempdir() as tmp_dir: - # This is the "clone, but don't download anything" part. - cmd = ( - f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " - f"-b {branch} --filter=blob:none" - ) - run_command(cmd) - # Now we need to find the missing filenames for the subpath we want. - # Looking for this 'rev-list' command in the git --help? Hah. - cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}" - ret = run_command(cmd, capture=True) - git_repo = _http_to_git(repo) - # Now pass those missings into another bit of git internals - missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")]) - if not missings: - err = ( - f"Could not find any relevant files for '{subpath}'. " - f"Did you specify a correct and complete path within repo '{repo}' " - f"and branch {branch}?" - ) - msg.fail(err, exits=1) - cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}" - run_command(cmd, capture=True) - # And finally, we can checkout our subpath - cmd = f"git -C {tmp_dir} checkout {branch} {subpath}" - run_command(cmd, capture=True) - - # Get a subdirectory of the cloned path, if appropriate - source_path = tmp_dir / Path(subpath) - if not is_subpath_of(tmp_dir, source_path): - err = f"'{subpath}' is a path outside of the cloned repository." - msg.fail(err, repo, exits=1) - - shutil.move(str(source_path), str(dest)) - - -def git_repo_branch_exists(repo: str, branch: str) -> bool: - """Uses 'git ls-remote' to check if a repository and branch exists - - repo (str): URL to get repo. - branch (str): Branch on repo to check. - RETURNS (bool): True if repo:branch exists. - """ - get_git_version() - cmd = f"git ls-remote {repo} {branch}" - # We might be tempted to use `--exit-code` with `git ls-remote`, but - # `run_command` handles the `returncode` for us, so we'll rely on - # the fact that stdout returns '' if the requested branch doesn't exist - ret = run_command(cmd, capture=True) - exists = ret.stdout != "" - return exists - - -def get_git_version( - error: str = "Could not run 'git'. Make sure it's installed and the executable is available.", -) -> Tuple[int, int]: - """Get the version of git and raise an error if calling 'git --version' fails. - - error (str): The error message to show. - RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns - (0, 0) if the version couldn't be determined. - """ - try: - ret = run_command("git --version", capture=True) - except: - raise RuntimeError(error) - stdout = ret.stdout.strip() - if not stdout or not stdout.startswith("git version"): - return 0, 0 - version = stdout[11:].strip().split(".") - return int(version[0]), int(version[1]) - - -def _http_to_git(repo: str) -> str: - if repo.startswith("http://"): - repo = repo.replace(r"http://", r"https://") - if repo.startswith(r"https://"): - repo = repo.replace("https://", "git@").replace("/", ":", 1) - if repo.endswith("/"): - repo = repo[:-1] - repo = f"{repo}.git" - return repo - - -def is_subpath_of(parent, child): - """ - Check whether `child` is a path contained within `parent`. - """ - # Based on https://stackoverflow.com/a/37095733 . - - # In Python 3.9, the `Path.is_relative_to()` method will supplant this, so - # we can stop using crusty old os.path functions. - parent_realpath = os.path.realpath(parent) - child_realpath = os.path.realpath(child) - return os.path.commonpath([parent_realpath, child_realpath]) == parent_realpath - - @overload def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]: ...