From dc2f7fffd190e0cebe2afb2cc4eea5abceadb02e Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 28 Mar 2023 13:46:20 +0200 Subject: [PATCH] Setting up weasel branch (#12456) * remove project-specific functionality * remove project-specific tests * remove project-specific schemas * remove project-specific information in about * remove project-specific functions in util.py * remove project-specific error strings * remove project-specific CLI commands * black formatting * restore some functions that are used beyond projects * remove project imports * remove imports * remove remote_storage tests * remove one more project unit test * update for PR 12394 * remove get_hash and get_checksum * remove upload_ and download_file methods * remove ensure_pathy * revert clumsy fingers * reinstate E970 --- spacy/about.py | 2 - spacy/cli/__init__.py | 7 - spacy/cli/_util.py | 330 +------------------------ spacy/cli/project/__init__.py | 0 spacy/cli/project/assets.py | 206 ---------------- spacy/cli/project/clone.py | 115 --------- spacy/cli/project/document.py | 115 --------- spacy/cli/project/dvc.py | 207 ---------------- spacy/cli/project/pull.py | 67 ------ spacy/cli/project/push.py | 69 ------ spacy/cli/project/remote_storage.py | 205 ---------------- spacy/cli/project/run.py | 360 ---------------------------- spacy/errors.py | 2 - spacy/schemas.py | 60 ----- spacy/tests/test_cli.py | 286 +--------------------- spacy/util.py | 24 -- 16 files changed, 4 insertions(+), 2051 deletions(-) delete mode 100644 spacy/cli/project/__init__.py delete mode 100644 spacy/cli/project/assets.py delete mode 100644 spacy/cli/project/clone.py delete mode 100644 spacy/cli/project/document.py delete mode 100644 spacy/cli/project/dvc.py delete mode 100644 spacy/cli/project/pull.py delete mode 100644 spacy/cli/project/push.py delete mode 100644 spacy/cli/project/remote_storage.py delete mode 100644 spacy/cli/project/run.py diff --git a/spacy/about.py b/spacy/about.py index 640e9e93b..4748d655c 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -3,5 +3,3 @@ __title__ = "spacy" __version__ = "3.5.0" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" -__projects__ = "https://github.com/explosion/projects" -__projects_branch__ = "v3" diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 868526b42..efabcb9cf 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -22,13 +22,6 @@ from .convert import convert # noqa: F401 from .init_pipeline import init_pipeline_cli # noqa: F401 from .init_config import init_config, fill_config # noqa: F401 from .validate import validate # noqa: F401 -from .project.clone import project_clone # noqa: F401 -from .project.assets import project_assets # noqa: F401 -from .project.run import project_run # noqa: F401 -from .project.dvc import project_update_dvc # noqa: F401 -from .project.push import project_push # noqa: F401 -from .project.pull import project_pull # noqa: F401 -from .project.document import project_document # noqa: F401 from .find_threshold import find_threshold # noqa: F401 diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index f104feff9..68e3255a8 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -11,16 +11,13 @@ from click import NoSuchOption from click.parser import split_arg_string from typer.main import get_command from contextlib import contextmanager -from thinc.api import Config, ConfigValidationError, require_gpu +from thinc.api import ConfigValidationError, require_gpu from thinc.util import gpu_is_available from configparser import InterpolationError import os from ..compat import Literal -from ..schemas import ProjectConfigSchema, validate -from ..util import import_file, run_command, make_tempdir, registry, logger -from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS -from .. import about +from ..util import import_file, run_command, registry, logger, ENV_VARS if TYPE_CHECKING: from pathy import FluidPath # noqa: F401 @@ -30,7 +27,6 @@ SDIST_SUFFIX = ".tar.gz" WHEEL_SUFFIX = "-py3-none-any.whl" PROJECT_FILE = "project.yml" -PROJECT_LOCK = "project.lock" COMMAND = "python -m spacy" NAME = "spacy" HELP = """spaCy Command-line Interface @@ -135,148 +131,6 @@ def _parse_override(value: Any) -> Any: return str(value) -def load_project_config( - path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict() -) -> Dict[str, Any]: - """Load the project.yml file from a directory and validate it. Also make - sure that all directories defined in the config exist. - - path (Path): The path to the project directory. - interpolate (bool): Whether to substitute project variables. - overrides (Dict[str, Any]): Optional config overrides. - RETURNS (Dict[str, Any]): The loaded project.yml. - """ - config_path = path / PROJECT_FILE - if not config_path.exists(): - msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1) - invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct." - try: - config = srsly.read_yaml(config_path) - except ValueError as e: - msg.fail(invalid_err, e, exits=1) - errors = validate(ProjectConfigSchema, config) - if errors: - msg.fail(invalid_err) - print("\n".join(errors)) - sys.exit(1) - validate_project_version(config) - validate_project_commands(config) - if interpolate: - err = f"{PROJECT_FILE} validation error" - with show_validation_error(title=err, hint_fill=False): - config = substitute_project_variables(config, overrides) - # Make sure directories defined in config exist - for subdir in config.get("directories", []): - dir_path = path / subdir - if not dir_path.exists(): - dir_path.mkdir(parents=True) - return config - - -def substitute_project_variables( - config: Dict[str, Any], - overrides: Dict[str, Any] = SimpleFrozenDict(), - key: str = "vars", - env_key: str = "env", -) -> Dict[str, Any]: - """Interpolate variables in the project file using the config system. - - config (Dict[str, Any]): The project config. - overrides (Dict[str, Any]): Optional config overrides. - key (str): Key containing variables in project config. - env_key (str): Key containing environment variable mapping in project config. - RETURNS (Dict[str, Any]): The interpolated project config. - """ - config.setdefault(key, {}) - config.setdefault(env_key, {}) - # Substitute references to env vars with their values - for config_var, env_var in config[env_key].items(): - config[env_key][config_var] = _parse_override(os.environ.get(env_var, "")) - # Need to put variables in the top scope again so we can have a top-level - # section "project" (otherwise, a list of commands in the top scope wouldn't) - # be allowed by Thinc's config system - cfg = Config({"project": config, key: config[key], env_key: config[env_key]}) - cfg = Config().from_str(cfg.to_str(), overrides=overrides) - interpolated = cfg.interpolate() - return dict(interpolated["project"]) - - -def validate_project_version(config: Dict[str, Any]) -> None: - """If the project defines a compatible spaCy version range, chec that it's - compatible with the current version of spaCy. - - config (Dict[str, Any]): The loaded config. - """ - spacy_version = config.get("spacy_version", None) - if spacy_version and not is_compatible_version(about.__version__, spacy_version): - err = ( - f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) " - f"that's not compatible with the version of spaCy you're running " - f"({about.__version__}). You can edit version requirement in the " - f"{PROJECT_FILE} to load it, but the project may not run as expected." - ) - msg.fail(err, exits=1) - - -def validate_project_commands(config: Dict[str, Any]) -> None: - """Check that project commands and workflows are valid, don't contain - duplicates, don't clash and only refer to commands that exist. - - config (Dict[str, Any]): The loaded config. - """ - command_names = [cmd["name"] for cmd in config.get("commands", [])] - workflows = config.get("workflows", {}) - duplicates = set([cmd for cmd in command_names if command_names.count(cmd) > 1]) - if duplicates: - err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}" - msg.fail(err, exits=1) - for workflow_name, workflow_steps in workflows.items(): - if workflow_name in command_names: - err = f"Can't use workflow name '{workflow_name}': name already exists as a command" - msg.fail(err, exits=1) - for step in workflow_steps: - if step not in command_names: - msg.fail( - f"Unknown command specified in workflow '{workflow_name}': {step}", - f"Workflows can only refer to commands defined in the 'commands' " - f"section of the {PROJECT_FILE}.", - exits=1, - ) - - -def get_hash(data, exclude: Iterable[str] = tuple()) -> str: - """Get the hash for a JSON-serializable object. - - data: The data to hash. - exclude (Iterable[str]): Top-level keys to exclude if data is a dict. - RETURNS (str): The hash. - """ - if isinstance(data, dict): - data = {k: v for k, v in data.items() if k not in exclude} - data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8") - return hashlib.md5(data_str).hexdigest() - - -def get_checksum(path: Union[Path, str]) -> str: - """Get the checksum for a file or directory given its file path. If a - directory path is provided, this uses all files in that directory. - - path (Union[Path, str]): The file or directory path. - RETURNS (str): The checksum. - """ - path = Path(path) - if not (path.is_file() or path.is_dir()): - msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1) - if path.is_file(): - return hashlib.md5(Path(path).read_bytes()).hexdigest() - else: - # TODO: this is currently pretty slow - dir_checksum = hashlib.md5() - for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()): - dir_checksum.update(sub_file.read_bytes()) - return dir_checksum.hexdigest() - - @contextmanager def show_validation_error( file_path: Optional[Union[str, Path]] = None, @@ -334,166 +188,10 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None: msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1) -def upload_file(src: Path, dest: Union[str, "FluidPath"]) -> None: - """Upload a file. - - src (Path): The source path. - url (str): The destination URL to upload to. - """ - import smart_open - - # Create parent directories for local paths - if isinstance(dest, Path): - if not dest.parent.exists(): - dest.parent.mkdir(parents=True) - - dest = str(dest) - with smart_open.open(dest, mode="wb") as output_file: - with src.open(mode="rb") as input_file: - output_file.write(input_file.read()) - - -def download_file( - src: Union[str, "FluidPath"], dest: Path, *, force: bool = False -) -> None: - """Download a file using smart_open. - - url (str): The URL of the file. - dest (Path): The destination path. - force (bool): Whether to force download even if file exists. - If False, the download will be skipped. - """ - import smart_open - - if dest.exists() and not force: - return None - src = str(src) - with smart_open.open(src, mode="rb", compression="disable") as input_file: - with dest.open(mode="wb") as output_file: - shutil.copyfileobj(input_file, output_file) - - -def ensure_pathy(path): - """Temporary helper to prevent importing Pathy globally (which can cause - slow and annoying Google Cloud warning).""" - from pathy import Pathy # noqa: F811 - - return Pathy.fluid(path) - - -def git_checkout( - repo: str, subpath: str, dest: Path, *, branch: str = "master", sparse: bool = False -): - git_version = get_git_version() - if dest.exists(): - msg.fail("Destination of checkout must not exist", exits=1) - if not dest.parent.exists(): - msg.fail("Parent of destination of checkout must exist", exits=1) - if sparse and git_version >= (2, 22): - return git_sparse_checkout(repo, subpath, dest, branch) - elif sparse: - # Only show warnings if the user explicitly wants sparse checkout but - # the Git version doesn't support it - err_old = ( - f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) " - f"that doesn't fully support sparse checkout yet." - ) - err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled." - msg.warn( - f"{err_unk if git_version == (0, 0) else err_old} " - f"This means that more files than necessary may be downloaded " - f"temporarily. To only download the files needed, make sure " - f"you're using Git v2.22 or above." - ) - with make_tempdir() as tmp_dir: - cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}" - run_command(cmd, capture=True) - # We need Path(name) to make sure we also support subdirectories - try: - source_path = tmp_dir / Path(subpath) - if not is_subpath_of(tmp_dir, source_path): - err = f"'{subpath}' is a path outside of the cloned repository." - msg.fail(err, repo, exits=1) - shutil.copytree(str(source_path), str(dest)) - except FileNotFoundError: - err = f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')" - msg.fail(err, repo, exits=1) - - -def git_sparse_checkout(repo, subpath, dest, branch): - # We're using Git, partial clone and sparse checkout to - # only clone the files we need - # This ends up being RIDICULOUS. omg. - # So, every tutorial and SO post talks about 'sparse checkout'...But they - # go and *clone* the whole repo. Worthless. And cloning part of a repo - # turns out to be completely broken. The only way to specify a "path" is.. - # a path *on the server*? The contents of which, specifies the paths. Wat. - # Obviously this is hopelessly broken and insecure, because you can query - # arbitrary paths on the server! So nobody enables this. - # What we have to do is disable *all* files. We could then just checkout - # the path, and it'd "work", but be hopelessly slow...Because it goes and - # transfers every missing object one-by-one. So the final piece is that we - # need to use some weird git internals to fetch the missings in bulk, and - # *that* we can do by path. - # We're using Git and sparse checkout to only clone the files we need - with make_tempdir() as tmp_dir: - # This is the "clone, but don't download anything" part. - cmd = ( - f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " - f"-b {branch} --filter=blob:none" - ) - run_command(cmd) - # Now we need to find the missing filenames for the subpath we want. - # Looking for this 'rev-list' command in the git --help? Hah. - cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}" - ret = run_command(cmd, capture=True) - git_repo = _http_to_git(repo) - # Now pass those missings into another bit of git internals - missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")]) - if not missings: - err = ( - f"Could not find any relevant files for '{subpath}'. " - f"Did you specify a correct and complete path within repo '{repo}' " - f"and branch {branch}?" - ) - msg.fail(err, exits=1) - cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}" - run_command(cmd, capture=True) - # And finally, we can checkout our subpath - cmd = f"git -C {tmp_dir} checkout {branch} {subpath}" - run_command(cmd, capture=True) - - # Get a subdirectory of the cloned path, if appropriate - source_path = tmp_dir / Path(subpath) - if not is_subpath_of(tmp_dir, source_path): - err = f"'{subpath}' is a path outside of the cloned repository." - msg.fail(err, repo, exits=1) - - shutil.move(str(source_path), str(dest)) - - -def git_repo_branch_exists(repo: str, branch: str) -> bool: - """Uses 'git ls-remote' to check if a repository and branch exists - - repo (str): URL to get repo. - branch (str): Branch on repo to check. - RETURNS (bool): True if repo:branch exists. - """ - get_git_version() - cmd = f"git ls-remote {repo} {branch}" - # We might be tempted to use `--exit-code` with `git ls-remote`, but - # `run_command` handles the `returncode` for us, so we'll rely on - # the fact that stdout returns '' if the requested branch doesn't exist - ret = run_command(cmd, capture=True) - exists = ret.stdout != "" - return exists - - def get_git_version( error: str = "Could not run 'git'. Make sure it's installed and the executable is available.", ) -> Tuple[int, int]: """Get the version of git and raise an error if calling 'git --version' fails. - error (str): The error message to show. RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns (0, 0) if the version couldn't be determined. @@ -509,30 +207,6 @@ def get_git_version( return int(version[0]), int(version[1]) -def _http_to_git(repo: str) -> str: - if repo.startswith("http://"): - repo = repo.replace(r"http://", r"https://") - if repo.startswith(r"https://"): - repo = repo.replace("https://", "git@").replace("/", ":", 1) - if repo.endswith("/"): - repo = repo[:-1] - repo = f"{repo}.git" - return repo - - -def is_subpath_of(parent, child): - """ - Check whether `child` is a path contained within `parent`. - """ - # Based on https://stackoverflow.com/a/37095733 . - - # In Python 3.9, the `Path.is_relative_to()` method will supplant this, so - # we can stop using crusty old os.path functions. - parent_realpath = os.path.realpath(parent) - child_realpath = os.path.realpath(child) - return os.path.commonpath([parent_realpath, child_realpath]) == parent_realpath - - @overload def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]: ... diff --git a/spacy/cli/project/__init__.py b/spacy/cli/project/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py deleted file mode 100644 index 8f35b2d23..000000000 --- a/spacy/cli/project/assets.py +++ /dev/null @@ -1,206 +0,0 @@ -from typing import Any, Dict, Optional -from pathlib import Path -from wasabi import msg -import os -import re -import shutil -import requests -import typer - -from ...util import ensure_path, working_dir -from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config -from .._util import get_checksum, download_file, git_checkout, get_git_version -from .._util import SimpleFrozenDict, parse_config_overrides - -# Whether assets are extra if `extra` is not set. -EXTRA_DEFAULT = False - - -@project_cli.command( - "assets", - context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, -) -def project_assets_cli( - # fmt: off - ctx: typer.Context, # This is only used to read additional arguments - project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), - sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+."), - extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.") - # fmt: on -): - """Fetch project assets like datasets and pretrained weights. Assets are - defined in the "assets" section of the project.yml. If a checksum is - provided in the project.yml, the file is only downloaded if no local file - with the same checksum exists. - - DOCS: https://spacy.io/api/cli#project-assets - """ - overrides = parse_config_overrides(ctx.args) - project_assets( - project_dir, - overrides=overrides, - sparse_checkout=sparse_checkout, - extra=extra, - ) - - -def project_assets( - project_dir: Path, - *, - overrides: Dict[str, Any] = SimpleFrozenDict(), - sparse_checkout: bool = False, - extra: bool = False, -) -> None: - """Fetch assets for a project using DVC if possible. - - project_dir (Path): Path to project directory. - sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files - needed. - extra (bool): Whether to download all assets, including those marked as 'extra'. - """ - project_path = ensure_path(project_dir) - config = load_project_config(project_path, overrides=overrides) - assets = [ - asset - for asset in config.get("assets", []) - if extra or not asset.get("extra", EXTRA_DEFAULT) - ] - if not assets: - msg.warn( - f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)", - exits=0, - ) - msg.info(f"Fetching {len(assets)} asset(s)") - - for asset in assets: - dest = (project_dir / asset["dest"]).resolve() - checksum = asset.get("checksum") - if "git" in asset: - git_err = ( - f"Cloning spaCy project templates requires Git and the 'git' command. " - f"Make sure it's installed and that the executable is available." - ) - get_git_version(error=git_err) - if dest.exists(): - # If there's already a file, check for checksum - if checksum and checksum == get_checksum(dest): - msg.good( - f"Skipping download with matching checksum: {asset['dest']}" - ) - continue - else: - if dest.is_dir(): - shutil.rmtree(dest) - else: - dest.unlink() - if "repo" not in asset["git"] or asset["git"]["repo"] is None: - msg.fail( - "A git asset must include 'repo', the repository address.", exits=1 - ) - if "path" not in asset["git"] or asset["git"]["path"] is None: - msg.fail( - "A git asset must include 'path' - use \"\" to get the entire repository.", - exits=1, - ) - git_checkout( - asset["git"]["repo"], - asset["git"]["path"], - dest, - branch=asset["git"].get("branch"), - sparse=sparse_checkout, - ) - msg.good(f"Downloaded asset {dest}") - else: - url = asset.get("url") - if not url: - # project.yml defines asset without URL that the user has to place - check_private_asset(dest, checksum) - continue - fetch_asset(project_path, url, dest, checksum) - - -def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None: - """Check and validate assets without a URL (private assets that the user - has to provide themselves) and give feedback about the checksum. - - dest (Path): Destination path of the asset. - checksum (Optional[str]): Optional checksum of the expected file. - """ - if not Path(dest).exists(): - err = f"No URL provided for asset. You need to add this file yourself: {dest}" - msg.warn(err) - else: - if not checksum: - msg.good(f"Asset already exists: {dest}") - elif checksum == get_checksum(dest): - msg.good(f"Asset exists with matching checksum: {dest}") - else: - msg.fail(f"Asset available but with incorrect checksum: {dest}") - - -def fetch_asset( - project_path: Path, url: str, dest: Path, checksum: Optional[str] = None -) -> None: - """Fetch an asset from a given URL or path. If a checksum is provided and a - local file exists, it's only re-downloaded if the checksum doesn't match. - - project_path (Path): Path to project directory. - url (str): URL or path to asset. - checksum (Optional[str]): Optional expected checksum of local file. - RETURNS (Optional[Path]): The path to the fetched asset or None if fetching - the asset failed. - """ - dest_path = (project_path / dest).resolve() - if dest_path.exists(): - # If there's already a file, check for checksum - if checksum: - if checksum == get_checksum(dest_path): - msg.good(f"Skipping download with matching checksum: {dest}") - return - else: - # If there's not a checksum, make sure the file is a possibly valid size - if os.path.getsize(dest_path) == 0: - msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}") - os.remove(dest_path) - # We might as well support the user here and create parent directories in - # case the asset dir isn't listed as a dir to create in the project.yml - if not dest_path.parent.exists(): - dest_path.parent.mkdir(parents=True) - with working_dir(project_path): - url = convert_asset_url(url) - try: - download_file(url, dest_path) - msg.good(f"Downloaded asset {dest}") - except requests.exceptions.RequestException as e: - if Path(url).exists() and Path(url).is_file(): - # If it's a local file, copy to destination - shutil.copy(url, str(dest_path)) - msg.good(f"Copied local asset {dest}") - else: - msg.fail(f"Download failed: {dest}", e) - if checksum and checksum != get_checksum(dest_path): - msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}") - - -def convert_asset_url(url: str) -> str: - """Check and convert the asset URL if needed. - - url (str): The asset URL. - RETURNS (str): The converted URL. - """ - # If the asset URL is a regular GitHub URL it's likely a mistake - if ( - re.match(r"(http(s?)):\/\/github.com", url) - and "releases/download" not in url - and "/raw/" not in url - ): - converted = url.replace("github.com", "raw.githubusercontent.com") - converted = re.sub(r"/(tree|blob)/", "/", converted) - msg.warn( - "Downloading from a regular GitHub URL. This will only download " - "the source of the page, not the actual file. Converting the URL " - "to a raw URL.", - converted, - ) - return converted - return url diff --git a/spacy/cli/project/clone.py b/spacy/cli/project/clone.py deleted file mode 100644 index 14b4ed9b5..000000000 --- a/spacy/cli/project/clone.py +++ /dev/null @@ -1,115 +0,0 @@ -from typing import Optional -from pathlib import Path -from wasabi import msg -import subprocess -import re - -from ... import about -from ...util import ensure_path -from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE -from .._util import git_checkout, get_git_version, git_repo_branch_exists - -DEFAULT_REPO = about.__projects__ -DEFAULT_PROJECTS_BRANCH = about.__projects_branch__ -DEFAULT_BRANCHES = ["main", "master"] - - -@project_cli.command("clone") -def project_clone_cli( - # fmt: off - name: str = Arg(..., help="The name of the template to clone"), - dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False), - repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"), - branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"), - sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+.") - # fmt: on -): - """Clone a project template from a repository. Calls into "git" and will - only download the files from the given subdirectory. The GitHub repo - defaults to the official spaCy template repo, but can be customized - (including using a private repo). - - DOCS: https://spacy.io/api/cli#project-clone - """ - if dest is None: - dest = Path.cwd() / Path(name).parts[-1] - if repo == DEFAULT_REPO and branch is None: - branch = DEFAULT_PROJECTS_BRANCH - - if branch is None: - for default_branch in DEFAULT_BRANCHES: - if git_repo_branch_exists(repo, default_branch): - branch = default_branch - break - if branch is None: - default_branches_msg = ", ".join(f"'{b}'" for b in DEFAULT_BRANCHES) - msg.fail( - "No branch provided and attempted default " - f"branches {default_branches_msg} do not exist.", - exits=1, - ) - else: - if not git_repo_branch_exists(repo, branch): - msg.fail(f"repo: {repo} (branch: {branch}) does not exist.", exits=1) - assert isinstance(branch, str) - project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout) - - -def project_clone( - name: str, - dest: Path, - *, - repo: str = about.__projects__, - branch: str = about.__projects_branch__, - sparse_checkout: bool = False, -) -> None: - """Clone a project template from a repository. - - name (str): Name of subdirectory to clone. - dest (Path): Destination path of cloned project. - repo (str): URL of Git repo containing project templates. - branch (str): The branch to clone from - """ - dest = ensure_path(dest) - check_clone(name, dest, repo) - project_dir = dest.resolve() - repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo) - try: - git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout) - except subprocess.CalledProcessError: - err = f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')" - msg.fail(err, exits=1) - msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir) - if not (project_dir / PROJECT_FILE).exists(): - msg.warn(f"No {PROJECT_FILE} found in directory") - else: - msg.good(f"Your project is now ready!") - print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}") - - -def check_clone(name: str, dest: Path, repo: str) -> None: - """Check and validate that the destination path can be used to clone. Will - check that Git is available and that the destination path is suitable. - - name (str): Name of the directory to clone from the repo. - dest (Path): Local destination of cloned directory. - repo (str): URL of the repo to clone from. - """ - git_err = ( - f"Cloning spaCy project templates requires Git and the 'git' command. " - f"To clone a project without Git, copy the files from the '{name}' " - f"directory in the {repo} to {dest} manually." - ) - get_git_version(error=git_err) - if not dest: - msg.fail(f"Not a valid directory to clone project: {dest}", exits=1) - if dest.exists(): - # Directory already exists (not allowed, clone needs to create it) - msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1) - if not dest.parent.exists(): - # We're not creating parents, parent dir should exist - msg.fail( - f"Can't clone project, parent directory doesn't exist: {dest.parent}. " - f"Create the necessary folder(s) first before continuing.", - exits=1, - ) diff --git a/spacy/cli/project/document.py b/spacy/cli/project/document.py deleted file mode 100644 index 1ba43a958..000000000 --- a/spacy/cli/project/document.py +++ /dev/null @@ -1,115 +0,0 @@ -from pathlib import Path -from wasabi import msg, MarkdownRenderer - -from ...util import working_dir -from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config - - -DOCS_URL = "https://spacy.io" -INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the -project, as well as the available commands and workflows. For details, see the -[spaCy projects documentation]({DOCS_URL}/usage/projects).""" -INTRO_COMMANDS = f"""The following commands are defined by the project. They -can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run). -Commands are only re-run if their inputs have changed.""" -INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They -can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run) -and will run the specified commands in order. Commands are only re-run if their -inputs have changed.""" -INTRO_ASSETS = f"""The following assets are defined by the project. They can -be fetched by running [`spacy project assets`]({DOCS_URL}/api/cli#project-assets) -in the project directory.""" -# These markers are added to the Markdown and can be used to update the file in -# place if it already exists. Only the auto-generated part will be replaced. -MARKER_START = "" -MARKER_END = "" -# If this marker is used in an existing README, it's ignored and not replaced -MARKER_IGNORE = "" - - -@project_cli.command("document") -def project_document_cli( - # fmt: off - project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), - output_file: Path = Opt("-", "--output", "-o", help="Path to output Markdown file for output. Defaults to - for standard output"), - no_emoji: bool = Opt(False, "--no-emoji", "-NE", help="Don't use emoji") - # fmt: on -): - """ - Auto-generate a README.md for a project. If the content is saved to a file, - hidden markers are added so you can add custom content before or after the - auto-generated section and only the auto-generated docs will be replaced - when you re-run the command. - - DOCS: https://spacy.io/api/cli#project-document - """ - project_document(project_dir, output_file, no_emoji=no_emoji) - - -def project_document( - project_dir: Path, output_file: Path, *, no_emoji: bool = False -) -> None: - is_stdout = str(output_file) == "-" - config = load_project_config(project_dir) - md = MarkdownRenderer(no_emoji=no_emoji) - md.add(MARKER_START) - title = config.get("title") - description = config.get("description") - md.add(md.title(1, f"spaCy Project{f': {title}' if title else ''}", "🪐")) - if description: - md.add(description) - md.add(md.title(2, PROJECT_FILE, "📋")) - md.add(INTRO_PROJECT) - # Commands - cmds = config.get("commands", []) - data = [(md.code(cmd["name"]), cmd.get("help", "")) for cmd in cmds] - if data: - md.add(md.title(3, "Commands", "⏯")) - md.add(INTRO_COMMANDS) - md.add(md.table(data, ["Command", "Description"])) - # Workflows - wfs = config.get("workflows", {}).items() - data = [(md.code(n), " → ".join(md.code(w) for w in stp)) for n, stp in wfs] - if data: - md.add(md.title(3, "Workflows", "⏭")) - md.add(INTRO_WORKFLOWS) - md.add(md.table(data, ["Workflow", "Steps"])) - # Assets - assets = config.get("assets", []) - data = [] - for a in assets: - source = "Git" if a.get("git") else "URL" if a.get("url") else "Local" - dest_path = a["dest"] - dest = md.code(dest_path) - if source == "Local": - # Only link assets if they're in the repo - with working_dir(project_dir) as p: - if (p / dest_path).exists(): - dest = md.link(dest, dest_path) - data.append((dest, source, a.get("description", ""))) - if data: - md.add(md.title(3, "Assets", "🗂")) - md.add(INTRO_ASSETS) - md.add(md.table(data, ["File", "Source", "Description"])) - md.add(MARKER_END) - # Output result - if is_stdout: - print(md.text) - else: - content = md.text - if output_file.exists(): - with output_file.open("r", encoding="utf8") as f: - existing = f.read() - if MARKER_IGNORE in existing: - msg.warn("Found ignore marker in existing file: skipping", output_file) - return - if MARKER_START in existing and MARKER_END in existing: - msg.info("Found existing file: only replacing auto-generated docs") - before = existing.split(MARKER_START)[0] - after = existing.split(MARKER_END)[1] - content = f"{before}{content}{after}" - else: - msg.warn("Replacing existing file") - with output_file.open("w", encoding="utf8") as f: - f.write(content) - msg.good("Saved project documentation", output_file) diff --git a/spacy/cli/project/dvc.py b/spacy/cli/project/dvc.py deleted file mode 100644 index a15353855..000000000 --- a/spacy/cli/project/dvc.py +++ /dev/null @@ -1,207 +0,0 @@ -"""This module contains helpers and subcommands for integrating spaCy projects -with Data Version Controk (DVC). https://dvc.org""" -from typing import Dict, Any, List, Optional, Iterable -import subprocess -from pathlib import Path -from wasabi import msg - -from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli -from .._util import Arg, Opt, NAME, COMMAND -from ...util import working_dir, split_command, join_command, run_command -from ...util import SimpleFrozenList - - -DVC_CONFIG = "dvc.yaml" -DVC_DIR = ".dvc" -UPDATE_COMMAND = "dvc" -DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've -# edited your {PROJECT_FILE}, you can regenerate this file by running: -# {COMMAND} project {UPDATE_COMMAND}""" - - -@project_cli.command(UPDATE_COMMAND) -def project_update_dvc_cli( - # fmt: off - project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), - workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."), - verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"), - quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"), - force: bool = Opt(False, "--force", "-F", help="Force update DVC config"), - # fmt: on -): - """Auto-generate Data Version Control (DVC) config. A DVC - project can only define one pipeline, so you need to specify one workflow - defined in the project.yml. If no workflow is specified, the first defined - workflow is used. The DVC config will only be updated if the project.yml - changed. - - DOCS: https://spacy.io/api/cli#project-dvc - """ - project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force) - - -def project_update_dvc( - project_dir: Path, - workflow: Optional[str] = None, - *, - verbose: bool = False, - quiet: bool = False, - force: bool = False, -) -> None: - """Update the auto-generated Data Version Control (DVC) config file. A DVC - project can only define one pipeline, so you need to specify one workflow - defined in the project.yml. Will only update the file if the checksum changed. - - project_dir (Path): The project directory. - workflow (Optional[str]): Optional name of workflow defined in project.yml. - If not set, the first workflow will be used. - verbose (bool): Print more info. - quiet (bool): Print less info. - force (bool): Force update DVC config. - """ - config = load_project_config(project_dir) - updated = update_dvc_config( - project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force - ) - help_msg = "To execute the workflow with DVC, run: dvc repro" - if updated: - msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg) - else: - msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg) - - -def update_dvc_config( - path: Path, - config: Dict[str, Any], - workflow: Optional[str] = None, - verbose: bool = False, - quiet: bool = False, - force: bool = False, -) -> bool: - """Re-run the DVC commands in dry mode and update dvc.yaml file in the - project directory. The file is auto-generated based on the config. The - first line of the auto-generated file specifies the hash of the config - dict, so if any of the config values change, the DVC config is regenerated. - - path (Path): The path to the project directory. - config (Dict[str, Any]): The loaded project.yml. - verbose (bool): Whether to print additional info (via DVC). - quiet (bool): Don't output anything (via DVC). - force (bool): Force update, even if hashes match. - RETURNS (bool): Whether the DVC config file was updated. - """ - ensure_dvc(path) - workflows = config.get("workflows", {}) - workflow_names = list(workflows.keys()) - check_workflows(workflow_names, workflow) - if not workflow: - workflow = workflow_names[0] - config_hash = get_hash(config) - path = path.resolve() - dvc_config_path = path / DVC_CONFIG - if dvc_config_path.exists(): - # Check if the file was generated using the current config, if not, redo - with dvc_config_path.open("r", encoding="utf8") as f: - ref_hash = f.readline().strip().replace("# ", "") - if ref_hash == config_hash and not force: - return False # Nothing has changed in project.yml, don't need to update - dvc_config_path.unlink() - dvc_commands = [] - config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} - - # some flags that apply to every command - flags = [] - if verbose: - flags.append("--verbose") - if quiet: - flags.append("--quiet") - - for name in workflows[workflow]: - command = config_commands[name] - deps = command.get("deps", []) - outputs = command.get("outputs", []) - outputs_no_cache = command.get("outputs_no_cache", []) - if not deps and not outputs and not outputs_no_cache: - continue - # Default to the working dir as the project path since dvc.yaml is auto-generated - # and we don't want arbitrary paths in there - project_cmd = ["python", "-m", NAME, "project", "run", name] - deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl] - outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl] - outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl] - - dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"] - if command.get("no_skip"): - dvc_cmd.append("--always-changed") - full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd] - dvc_commands.append(join_command(full_cmd)) - - if not dvc_commands: - # If we don't check for this, then there will be an error when reading the - # config, since DVC wouldn't create it. - msg.fail( - "No usable commands for DVC found. This can happen if none of your " - "commands have dependencies or outputs.", - exits=1, - ) - - with working_dir(path): - for c in dvc_commands: - dvc_command = "dvc " + c - run_command(dvc_command) - with dvc_config_path.open("r+", encoding="utf8") as f: - content = f.read() - f.seek(0, 0) - f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}") - return True - - -def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None: - """Validate workflows provided in project.yml and check that a given - workflow can be used to generate a DVC config. - - workflows (List[str]): Names of the available workflows. - workflow (Optional[str]): The name of the workflow to convert. - """ - if not workflows: - msg.fail( - f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, " - f"define at least one list of commands.", - exits=1, - ) - if workflow is not None and workflow not in workflows: - msg.fail( - f"Workflow '{workflow}' not defined in {PROJECT_FILE}. " - f"Available workflows: {', '.join(workflows)}", - exits=1, - ) - if not workflow: - msg.warn( - f"No workflow specified for DVC pipeline. Using the first workflow " - f"defined in {PROJECT_FILE}: '{workflows[0]}'" - ) - - -def ensure_dvc(project_dir: Path) -> None: - """Ensure that the "dvc" command is available and that the current project - directory is an initialized DVC project. - """ - try: - subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL) - except Exception: - msg.fail( - "To use spaCy projects with DVC (Data Version Control), DVC needs " - "to be installed and the 'dvc' command needs to be available", - "You can install the Python package from pip (pip install dvc) or " - "conda (conda install -c conda-forge dvc). For more details, see the " - "documentation: https://dvc.org/doc/install", - exits=1, - ) - if not (project_dir / ".dvc").exists(): - msg.fail( - "Project not initialized as a DVC project", - "To initialize a DVC project, you can run 'dvc init' in the project " - "directory. For more details, see the documentation: " - "https://dvc.org/doc/command-reference/init", - exits=1, - ) diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py deleted file mode 100644 index 8894baa50..000000000 --- a/spacy/cli/project/pull.py +++ /dev/null @@ -1,67 +0,0 @@ -from pathlib import Path -from wasabi import msg -from .remote_storage import RemoteStorage -from .remote_storage import get_command_hash -from .._util import project_cli, Arg, logger -from .._util import load_project_config -from .run import update_lockfile - - -@project_cli.command("pull") -def project_pull_cli( - # fmt: off - remote: str = Arg("default", help="Name or path of remote storage"), - project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), - # fmt: on -): - """Retrieve available precomputed outputs from a remote storage. - You can alias remotes in your project.yml by mapping them to storage paths. - A storage can be anything that the smart-open library can upload to, e.g. - AWS, Google Cloud Storage, SSH, local directories etc. - - DOCS: https://spacy.io/api/cli#project-pull - """ - for url, output_path in project_pull(project_dir, remote): - if url is not None: - msg.good(f"Pulled {output_path} from {url}") - - -def project_pull(project_dir: Path, remote: str, *, verbose: bool = False): - # TODO: We don't have tests for this :(. It would take a bit of mockery to - # set up. I guess see if it breaks first? - config = load_project_config(project_dir) - if remote in config.get("remotes", {}): - remote = config["remotes"][remote] - storage = RemoteStorage(project_dir, remote) - commands = list(config.get("commands", [])) - # We use a while loop here because we don't know how the commands - # will be ordered. A command might need dependencies from one that's later - # in the list. - while commands: - for i, cmd in enumerate(list(commands)): - logger.debug("CMD: %s.", cmd["name"]) - deps = [project_dir / dep for dep in cmd.get("deps", [])] - if all(dep.exists() for dep in deps): - cmd_hash = get_command_hash("", "", deps, cmd["script"]) - for output_path in cmd.get("outputs", []): - url = storage.pull(output_path, command_hash=cmd_hash) - logger.debug( - "URL: %s for %s with command hash %s", - url, - output_path, - cmd_hash, - ) - yield url, output_path - - out_locs = [project_dir / out for out in cmd.get("outputs", [])] - if all(loc.exists() for loc in out_locs): - update_lockfile(project_dir, cmd) - # We remove the command from the list here, and break, so that - # we iterate over the loop again. - commands.pop(i) - break - else: - logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"]) - else: - # If we didn't break the for loop, break the while loop. - break diff --git a/spacy/cli/project/push.py b/spacy/cli/project/push.py deleted file mode 100644 index a8178de21..000000000 --- a/spacy/cli/project/push.py +++ /dev/null @@ -1,69 +0,0 @@ -from pathlib import Path -from wasabi import msg -from .remote_storage import RemoteStorage -from .remote_storage import get_content_hash, get_command_hash -from .._util import load_project_config -from .._util import project_cli, Arg, logger - - -@project_cli.command("push") -def project_push_cli( - # fmt: off - remote: str = Arg("default", help="Name or path of remote storage"), - project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), - # fmt: on -): - """Persist outputs to a remote storage. You can alias remotes in your - project.yml by mapping them to storage paths. A storage can be anything that - the smart-open library can upload to, e.g. AWS, Google Cloud Storage, SSH, - local directories etc. - - DOCS: https://spacy.io/api/cli#project-push - """ - for output_path, url in project_push(project_dir, remote): - if url is None: - msg.info(f"Skipping {output_path}") - else: - msg.good(f"Pushed {output_path} to {url}") - - -def project_push(project_dir: Path, remote: str): - """Persist outputs to a remote storage. You can alias remotes in your project.yml - by mapping them to storage paths. A storage can be anything that the smart-open - library can upload to, e.g. gcs, aws, ssh, local directories etc - """ - config = load_project_config(project_dir) - if remote in config.get("remotes", {}): - remote = config["remotes"][remote] - storage = RemoteStorage(project_dir, remote) - for cmd in config.get("commands", []): - logger.debug("CMD: %s", cmd["name"]) - deps = [project_dir / dep for dep in cmd.get("deps", [])] - if any(not dep.exists() for dep in deps): - logger.debug("Dependency missing. Skipping %s outputs", cmd["name"]) - continue - cmd_hash = get_command_hash( - "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"] - ) - logger.debug("CMD_HASH: %s", cmd_hash) - for output_path in cmd.get("outputs", []): - output_loc = project_dir / output_path - if output_loc.exists() and _is_not_empty_dir(output_loc): - url = storage.push( - output_path, - command_hash=cmd_hash, - content_hash=get_content_hash(output_loc), - ) - logger.debug( - "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash - ) - yield output_path, url - - -def _is_not_empty_dir(loc: Path): - if not loc.is_dir(): - return True - elif any(_is_not_empty_dir(child) for child in loc.iterdir()): - return True - else: - return False diff --git a/spacy/cli/project/remote_storage.py b/spacy/cli/project/remote_storage.py deleted file mode 100644 index 076541580..000000000 --- a/spacy/cli/project/remote_storage.py +++ /dev/null @@ -1,205 +0,0 @@ -from typing import Optional, List, Dict, TYPE_CHECKING -import os -import site -import hashlib -import urllib.parse -import tarfile -from pathlib import Path -from wasabi import msg - -from .._util import get_hash, get_checksum, upload_file, download_file -from .._util import ensure_pathy, make_tempdir -from ...util import get_minor_version, ENV_VARS, check_bool_env_var -from ...git_info import GIT_VERSION -from ... import about -from ...errors import Errors - -if TYPE_CHECKING: - from pathy import FluidPath # noqa: F401 - - -class RemoteStorage: - """Push and pull outputs to and from a remote file storage. - - Remotes can be anything that `smart-open` can support: AWS, GCS, file system, - ssh, etc. - """ - - def __init__(self, project_root: Path, url: str, *, compression="gz"): - self.root = project_root - self.url = ensure_pathy(url) - self.compression = compression - - def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath": - """Compress a file or directory within a project and upload it to a remote - storage. If an object exists at the full URL, nothing is done. - - Within the remote storage, files are addressed by their project path - (url encoded) and two user-supplied hashes, representing their creation - context and their file contents. If the URL already exists, the data is - not uploaded. Paths are archived and compressed prior to upload. - """ - loc = self.root / path - if not loc.exists(): - raise IOError(f"Cannot push {loc}: does not exist.") - url = self.make_url(path, command_hash, content_hash) - if url.exists(): - return url - tmp: Path - with make_tempdir() as tmp: - tar_loc = tmp / self.encode_name(str(path)) - mode_string = f"w:{self.compression}" if self.compression else "w" - with tarfile.open(tar_loc, mode=mode_string) as tar_file: - tar_file.add(str(loc), arcname=str(path)) - upload_file(tar_loc, url) - return url - - def pull( - self, - path: Path, - *, - command_hash: Optional[str] = None, - content_hash: Optional[str] = None, - ) -> Optional["FluidPath"]: - """Retrieve a file from the remote cache. If the file already exists, - nothing is done. - - If the command_hash and/or content_hash are specified, only matching - results are returned. If no results are available, an error is raised. - """ - dest = self.root / path - if dest.exists(): - return None - url = self.find(path, command_hash=command_hash, content_hash=content_hash) - if url is None: - return url - else: - # Make sure the destination exists - if not dest.parent.exists(): - dest.parent.mkdir(parents=True) - tmp: Path - with make_tempdir() as tmp: - tar_loc = tmp / url.parts[-1] - download_file(url, tar_loc) - mode_string = f"r:{self.compression}" if self.compression else "r" - with tarfile.open(tar_loc, mode=mode_string) as tar_file: - # This requires that the path is added correctly, relative - # to root. This is how we set things up in push() - - # Disallow paths outside the current directory for the tar - # file (CVE-2007-4559, directory traversal vulnerability) - def is_within_directory(directory, target): - abs_directory = os.path.abspath(directory) - abs_target = os.path.abspath(target) - prefix = os.path.commonprefix([abs_directory, abs_target]) - return prefix == abs_directory - - def safe_extract(tar, path): - for member in tar.getmembers(): - member_path = os.path.join(path, member.name) - if not is_within_directory(path, member_path): - raise ValueError(Errors.E852) - tar.extractall(path) - - safe_extract(tar_file, self.root) - return url - - def find( - self, - path: Path, - *, - command_hash: Optional[str] = None, - content_hash: Optional[str] = None, - ) -> Optional["FluidPath"]: - """Find the best matching version of a file within the storage, - or `None` if no match can be found. If both the creation and content hash - are specified, only exact matches will be returned. Otherwise, the most - recent matching file is preferred. - """ - name = self.encode_name(str(path)) - urls = [] - if command_hash is not None and content_hash is not None: - url = self.url / name / command_hash / content_hash - urls = [url] if url.exists() else [] - elif command_hash is not None: - if (self.url / name / command_hash).exists(): - urls = list((self.url / name / command_hash).iterdir()) - else: - if (self.url / name).exists(): - for sub_dir in (self.url / name).iterdir(): - urls.extend(sub_dir.iterdir()) - if content_hash is not None: - urls = [url for url in urls if url.parts[-1] == content_hash] - if len(urls) >= 2: - try: - urls.sort(key=lambda x: x.stat().last_modified) # type: ignore - except Exception: - msg.warn( - "Unable to sort remote files by last modified. The file(s) " - "pulled from the cache may not be the most recent." - ) - return urls[-1] if urls else None - - def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath": - """Construct a URL from a subpath, a creation hash and a content hash.""" - return self.url / self.encode_name(str(path)) / command_hash / content_hash - - def encode_name(self, name: str) -> str: - """Encode a subpath into a URL-safe name.""" - return urllib.parse.quote_plus(name) - - -def get_content_hash(loc: Path) -> str: - return get_checksum(loc) - - -def get_command_hash( - site_hash: str, env_hash: str, deps: List[Path], cmd: List[str] -) -> str: - """Create a hash representing the execution of a command. This includes the - currently installed packages, whatever environment variables have been marked - as relevant, and the command. - """ - if check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION): - spacy_v = GIT_VERSION - else: - spacy_v = str(get_minor_version(about.__version__) or "") - dep_checksums = [get_checksum(dep) for dep in sorted(deps)] - hashes = [spacy_v, site_hash, env_hash] + dep_checksums - hashes.extend(cmd) - creation_bytes = "".join(hashes).encode("utf8") - return hashlib.md5(creation_bytes).hexdigest() - - -def get_site_hash(): - """Hash the current Python environment's site-packages contents, including - the name and version of the libraries. The list we're hashing is what - `pip freeze` would output. - """ - site_dirs = site.getsitepackages() - if site.ENABLE_USER_SITE: - site_dirs.extend(site.getusersitepackages()) - packages = set() - for site_dir in site_dirs: - site_dir = Path(site_dir) - for subpath in site_dir.iterdir(): - if subpath.parts[-1].endswith("dist-info"): - packages.add(subpath.parts[-1].replace(".dist-info", "")) - package_bytes = "".join(sorted(packages)).encode("utf8") - return hashlib.md5sum(package_bytes).hexdigest() - - -def get_env_hash(env: Dict[str, str]) -> str: - """Construct a hash of the environment variables that will be passed into - the commands. - - Values in the env dict may be references to the current os.environ, using - the syntax $ENV_VAR to mean os.environ[ENV_VAR] - """ - env_vars = {} - for key, value in env.items(): - if value.startswith("$"): - env_vars[key] = os.environ.get(value[1:], "") - else: - env_vars[key] = value - return get_hash(env_vars) diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py deleted file mode 100644 index 0f4858a99..000000000 --- a/spacy/cli/project/run.py +++ /dev/null @@ -1,360 +0,0 @@ -from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple -import os.path -from pathlib import Path - -from wasabi import msg -from wasabi.util import locale_escape -import sys -import srsly -import typer - -from ... import about -from ...git_info import GIT_VERSION -from ...util import working_dir, run_command, split_command, is_cwd, join_command -from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS -from ...util import check_bool_env_var, SimpleFrozenDict -from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash -from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides - - -@project_cli.command( - "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True} -) -def project_run_cli( - # fmt: off - ctx: typer.Context, # This is only used to read additional arguments - subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"), - project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), - force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"), - dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"), - show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") - # fmt: on -): - """Run a named command or workflow defined in the project.yml. If a workflow - name is specified, all commands in the workflow are run, in order. If - commands define dependencies and/or outputs, they will only be re-run if - state has changed. - - DOCS: https://spacy.io/api/cli#project-run - """ - if show_help or not subcommand: - print_run_help(project_dir, subcommand) - else: - overrides = parse_config_overrides(ctx.args) - project_run(project_dir, subcommand, overrides=overrides, force=force, dry=dry) - - -def project_run( - project_dir: Path, - subcommand: str, - *, - overrides: Dict[str, Any] = SimpleFrozenDict(), - force: bool = False, - dry: bool = False, - capture: bool = False, - skip_requirements_check: bool = False, -) -> None: - """Run a named script defined in the project.yml. If the script is part - of the default pipeline (defined in the "run" section), DVC is used to - execute the command, so it can determine whether to rerun it. It then - calls into "exec" to execute it. - - project_dir (Path): Path to project directory. - subcommand (str): Name of command to run. - overrides (Dict[str, Any]): Optional config overrides. - force (bool): Force re-running, even if nothing changed. - dry (bool): Perform a dry run and don't execute commands. - capture (bool): Whether to capture the output and errors of individual commands. - If False, the stdout and stderr will not be redirected, and if there's an error, - sys.exit will be called with the return code. You should use capture=False - when you want to turn over execution to the command, and capture=True - when you want to run the command more like a function. - skip_requirements_check (bool): Whether to skip the requirements check. - """ - config = load_project_config(project_dir, overrides=overrides) - commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} - workflows = config.get("workflows", {}) - validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand) - - req_path = project_dir / "requirements.txt" - if not skip_requirements_check: - if config.get("check_requirements", True) and os.path.exists(req_path): - with req_path.open() as requirements_file: - _check_requirements([req.strip() for req in requirements_file]) - - if subcommand in workflows: - msg.info(f"Running workflow '{subcommand}'") - for cmd in workflows[subcommand]: - project_run( - project_dir, - cmd, - overrides=overrides, - force=force, - dry=dry, - capture=capture, - skip_requirements_check=True, - ) - else: - cmd = commands[subcommand] - for dep in cmd.get("deps", []): - if not (project_dir / dep).exists(): - err = f"Missing dependency specified by command '{subcommand}': {dep}" - err_help = "Maybe you forgot to run the 'project assets' command or a previous step?" - err_exits = 1 if not dry else None - msg.fail(err, err_help, exits=err_exits) - check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION) - with working_dir(project_dir) as current_dir: - msg.divider(subcommand) - rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit) - if not rerun and not force: - msg.info(f"Skipping '{cmd['name']}': nothing changed") - else: - run_commands(cmd["script"], dry=dry, capture=capture) - if not dry: - update_lockfile(current_dir, cmd) - - -def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: - """Simulate a CLI help prompt using the info available in the project.yml. - - project_dir (Path): The project directory. - subcommand (Optional[str]): The subcommand or None. If a subcommand is - provided, the subcommand help is shown. Otherwise, the top-level help - and a list of available commands is printed. - """ - config = load_project_config(project_dir) - config_commands = config.get("commands", []) - commands = {cmd["name"]: cmd for cmd in config_commands} - workflows = config.get("workflows", {}) - project_loc = "" if is_cwd(project_dir) else project_dir - if subcommand: - validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand) - print(f"Usage: {COMMAND} project run {subcommand} {project_loc}") - if subcommand in commands: - help_text = commands[subcommand].get("help") - if help_text: - print(f"\n{help_text}\n") - elif subcommand in workflows: - steps = workflows[subcommand] - print(f"\nWorkflow consisting of {len(steps)} commands:") - steps_data = [ - (f"{i + 1}. {step}", commands[step].get("help", "")) - for i, step in enumerate(steps) - ] - msg.table(steps_data) - help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help" - print(f"For command details, run: {help_cmd}") - else: - print("") - title = config.get("title") - if title: - print(f"{locale_escape(title)}\n") - if config_commands: - print(f"Available commands in {PROJECT_FILE}") - print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}") - msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) - if workflows: - print(f"Available workflows in {PROJECT_FILE}") - print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}") - msg.table([(name, " -> ".join(steps)) for name, steps in workflows.items()]) - - -def run_commands( - commands: Iterable[str] = SimpleFrozenList(), - silent: bool = False, - dry: bool = False, - capture: bool = False, -) -> None: - """Run a sequence of commands in a subprocess, in order. - - commands (List[str]): The string commands. - silent (bool): Don't print the commands. - dry (bool): Perform a dry run and don't execut anything. - capture (bool): Whether to capture the output and errors of individual commands. - If False, the stdout and stderr will not be redirected, and if there's an error, - sys.exit will be called with the return code. You should use capture=False - when you want to turn over execution to the command, and capture=True - when you want to run the command more like a function. - """ - for c in commands: - command = split_command(c) - # Not sure if this is needed or a good idea. Motivation: users may often - # use commands in their config that reference "python" and we want to - # make sure that it's always executing the same Python that spaCy is - # executed with and the pip in the same env, not some other Python/pip. - # Also ensures cross-compatibility if user 1 writes "python3" (because - # that's how it's set up on their system), and user 2 without the - # shortcut tries to re-run the command. - if len(command) and command[0] in ("python", "python3"): - command[0] = sys.executable - elif len(command) and command[0] in ("pip", "pip3"): - command = [sys.executable, "-m", "pip", *command[1:]] - if not silent: - print(f"Running command: {join_command(command)}") - if not dry: - run_command(command, capture=capture) - - -def validate_subcommand( - commands: Sequence[str], workflows: Sequence[str], subcommand: str -) -> None: - """Check that a subcommand is valid and defined. Raises an error otherwise. - - commands (Sequence[str]): The available commands. - subcommand (str): The subcommand. - """ - if not commands and not workflows: - msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1) - if subcommand not in commands and subcommand not in workflows: - help_msg = [] - if subcommand in ["assets", "asset"]: - help_msg.append("Did you mean to run: python -m spacy project assets?") - if commands: - help_msg.append(f"Available commands: {', '.join(commands)}") - if workflows: - help_msg.append(f"Available workflows: {', '.join(workflows)}") - msg.fail( - f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}", - ". ".join(help_msg), - exits=1, - ) - - -def check_rerun( - project_dir: Path, - command: Dict[str, Any], - *, - check_spacy_version: bool = True, - check_spacy_commit: bool = False, -) -> bool: - """Check if a command should be rerun because its settings or inputs/outputs - changed. - - project_dir (Path): The current project directory. - command (Dict[str, Any]): The command, as defined in the project.yml. - strict_version (bool): - RETURNS (bool): Whether to re-run the command. - """ - # Always rerun if no-skip is set - if command.get("no_skip", False): - return True - lock_path = project_dir / PROJECT_LOCK - if not lock_path.exists(): # We don't have a lockfile, run command - return True - data = srsly.read_yaml(lock_path) - if command["name"] not in data: # We don't have info about this command - return True - entry = data[command["name"]] - # Always run commands with no outputs (otherwise they'd always be skipped) - if not entry.get("outs", []): - return True - # Always rerun if spaCy version or commit hash changed - spacy_v = entry.get("spacy_version") - commit = entry.get("spacy_git_version") - if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__): - info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)" - msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}") - return True - if check_spacy_commit and commit != GIT_VERSION: - info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)" - msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}") - return True - # If the entry in the lockfile matches the lockfile entry that would be - # generated from the current command, we don't rerun because it means that - # all inputs/outputs, hashes and scripts are the same and nothing changed - lock_entry = get_lock_entry(project_dir, command) - exclude = ["spacy_version", "spacy_git_version"] - return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude) - - -def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None: - """Update the lockfile after running a command. Will create a lockfile if - it doesn't yet exist and will add an entry for the current command, its - script and dependencies/outputs. - - project_dir (Path): The current project directory. - command (Dict[str, Any]): The command, as defined in the project.yml. - """ - lock_path = project_dir / PROJECT_LOCK - if not lock_path.exists(): - srsly.write_yaml(lock_path, {}) - data = {} - else: - data = srsly.read_yaml(lock_path) - data[command["name"]] = get_lock_entry(project_dir, command) - srsly.write_yaml(lock_path, data) - - -def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]: - """Get a lockfile entry for a given command. An entry includes the command, - the script (command steps) and a list of dependencies and outputs with - their paths and file hashes, if available. The format is based on the - dvc.lock files, to keep things consistent. - - project_dir (Path): The current project directory. - command (Dict[str, Any]): The command, as defined in the project.yml. - RETURNS (Dict[str, Any]): The lockfile entry. - """ - deps = get_fileinfo(project_dir, command.get("deps", [])) - outs = get_fileinfo(project_dir, command.get("outputs", [])) - outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", [])) - return { - "cmd": f"{COMMAND} run {command['name']}", - "script": command["script"], - "deps": deps, - "outs": [*outs, *outs_nc], - "spacy_version": about.__version__, - "spacy_git_version": GIT_VERSION, - } - - -def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional[str]]]: - """Generate the file information for a list of paths (dependencies, outputs). - Includes the file path and the file's checksum. - - project_dir (Path): The current project directory. - paths (List[str]): The file paths. - RETURNS (List[Dict[str, str]]): The lockfile entry for a file. - """ - data = [] - for path in paths: - file_path = project_dir / path - md5 = get_checksum(file_path) if file_path.exists() else None - data.append({"path": path, "md5": md5}) - return data - - -def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]: - """Checks whether requirements are installed and free of version conflicts. - requirements (List[str]): List of requirements. - RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts - exist. - """ - import pkg_resources - - failed_pkgs_msgs: List[str] = [] - conflicting_pkgs_msgs: List[str] = [] - - for req in requirements: - try: - pkg_resources.require(req) - except pkg_resources.DistributionNotFound as dnf: - failed_pkgs_msgs.append(dnf.report()) - except pkg_resources.VersionConflict as vc: - conflicting_pkgs_msgs.append(vc.report()) - except Exception: - msg.warn( - f"Unable to check requirement: {req} " - "Checks are currently limited to requirement specifiers " - "(PEP 508)" - ) - - if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs): - msg.warn( - title="Missing requirements or requirement conflicts detected. Make sure your Python environment is set up " - "correctly and you installed all requirements specified in your project's requirements.txt: " - ) - for pgk_msg in failed_pkgs_msgs + conflicting_pkgs_msgs: - msg.text(pgk_msg) - - return len(failed_pkgs_msgs) > 0, len(conflicting_pkgs_msgs) > 0 diff --git a/spacy/errors.py b/spacy/errors.py index c897c29ff..526c4d0d6 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -553,8 +553,6 @@ class Errors(metaclass=ErrorsWithCodes): "vectors, not {mode} vectors.") E851 = ("The 'textcat' component labels should only have values of 0 or 1, " "but found value of '{val}'.") - E852 = ("The tar file pulled from the remote attempted an unsafe path " - "traversal.") E853 = ("Unsupported component factory name '{name}'. The character '.' is " "not permitted in factory names.") E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not " diff --git a/spacy/schemas.py b/spacy/schemas.py index 140592dcd..756d5ef3a 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -443,66 +443,6 @@ CONFIG_SCHEMAS = { "initialize": ConfigSchemaInit, } - -# Project config Schema - - -class ProjectConfigAssetGitItem(BaseModel): - # fmt: off - repo: StrictStr = Field(..., title="URL of Git repo to download from") - path: StrictStr = Field(..., title="File path or sub-directory to download (used for sparse checkout)") - branch: StrictStr = Field("master", title="Branch to clone from") - # fmt: on - - -class ProjectConfigAssetURL(BaseModel): - # fmt: off - dest: StrictStr = Field(..., title="Destination of downloaded asset") - url: Optional[StrictStr] = Field(None, title="URL of asset") - checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})") - description: StrictStr = Field("", title="Description of asset") - # fmt: on - - -class ProjectConfigAssetGit(BaseModel): - # fmt: off - git: ProjectConfigAssetGitItem = Field(..., title="Git repo information") - checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})") - description: Optional[StrictStr] = Field(None, title="Description of asset") - # fmt: on - - -class ProjectConfigCommand(BaseModel): - # fmt: off - name: StrictStr = Field(..., title="Name of command") - help: Optional[StrictStr] = Field(None, title="Command description") - script: List[StrictStr] = Field([], title="List of CLI commands to run, in order") - deps: List[StrictStr] = Field([], title="File dependencies required by this command") - outputs: List[StrictStr] = Field([], title="Outputs produced by this command") - outputs_no_cache: List[StrictStr] = Field([], title="Outputs not tracked by DVC (DVC only)") - no_skip: bool = Field(False, title="Never skip this command, even if nothing changed") - # fmt: on - - class Config: - title = "A single named command specified in a project config" - extra = "forbid" - - -class ProjectConfigSchema(BaseModel): - # fmt: off - vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands") - env: Dict[StrictStr, Any] = Field({}, title="Optional variable names to substitute in commands, mapped to environment variable names") - assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets") - workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order") - commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts") - title: Optional[str] = Field(None, title="Project title") - spacy_version: Optional[StrictStr] = Field(None, title="spaCy version range that the project is compatible with") - # fmt: on - - class Config: - title = "Schema for project configuration file" - - # Recommendations for init config workflows diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 1fdf059b3..14b82ffe2 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -2,11 +2,9 @@ import os import math from collections import Counter from typing import Tuple, List, Dict, Any -import time from pathlib import Path import spacy -import numpy import pytest import srsly from click import NoSuchOption @@ -15,11 +13,8 @@ from thinc.api import Config, ConfigValidationError from spacy import about from spacy.cli import info -from spacy.cli._util import is_subpath_of, load_project_config, walk_directory +from spacy.cli._util import walk_directory from spacy.cli._util import parse_config_overrides, string_to_list -from spacy.cli._util import substitute_project_variables -from spacy.cli._util import validate_project_commands -from spacy.cli._util import upload_file, download_file from spacy.cli.debug_data import _compile_gold, _get_labels_from_model from spacy.cli.debug_data import _get_labels_from_spancat from spacy.cli.debug_data import _get_distribution, _get_kl_divergence @@ -31,15 +26,13 @@ from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config from spacy.cli.init_pipeline import _init_labels from spacy.cli.package import get_third_party_dependencies from spacy.cli.package import _is_permitted_package_name -from spacy.cli.project.remote_storage import RemoteStorage -from spacy.cli.project.run import _check_requirements from spacy.cli.validate import get_model_pkgs from spacy.cli.apply import apply from spacy.cli.find_threshold import find_threshold from spacy.lang.en import English from spacy.lang.nl import Dutch from spacy.language import Language -from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate +from spacy.schemas import RecommendationSchema from spacy.tokens import Doc, DocBin from spacy.tokens.span import Span from spacy.training import Example, docs_to_json, offsets_to_biluo_tags @@ -125,25 +118,6 @@ def test_issue7055(): assert "model" in filled_cfg["components"]["ner"] -@pytest.mark.issue(11235) -def test_issue11235(): - """ - Test that the cli handles interpolation in the directory names correctly when loading project config. - """ - lang_var = "en" - variables = {"lang": lang_var} - commands = [{"name": "x", "script": ["hello ${vars.lang}"]}] - directories = ["cfg", "${vars.lang}_model"] - project = {"commands": commands, "vars": variables, "directories": directories} - with make_tempdir() as d: - srsly.write_yaml(d / "project.yml", project) - cfg = load_project_config(d) - # Check that the directories are interpolated and created correctly - assert os.path.exists(d / "cfg") - assert os.path.exists(d / f"{lang_var}_model") - assert cfg["commands"][0]["script"][0] == f"hello {lang_var}" - - def test_cli_info(): nlp = Dutch() nlp.add_pipe("textcat") @@ -370,136 +344,6 @@ def test_cli_converters_conll_ner_to_docs(): assert ent.text in ["New York City", "London"] -def test_project_config_validation_full(): - config = { - "vars": {"some_var": 20}, - "directories": ["assets", "configs", "corpus", "scripts", "training"], - "assets": [ - { - "dest": "x", - "extra": True, - "url": "https://example.com", - "checksum": "63373dd656daa1fd3043ce166a59474c", - }, - { - "dest": "y", - "git": { - "repo": "https://github.com/example/repo", - "branch": "develop", - "path": "y", - }, - }, - { - "dest": "z", - "extra": False, - "url": "https://example.com", - "checksum": "63373dd656daa1fd3043ce166a59474c", - }, - ], - "commands": [ - { - "name": "train", - "help": "Train a model", - "script": ["python -m spacy train config.cfg -o training"], - "deps": ["config.cfg", "corpus/training.spcy"], - "outputs": ["training/model-best"], - }, - {"name": "test", "script": ["pytest", "custom.py"], "no_skip": True}, - ], - "workflows": {"all": ["train", "test"], "train": ["train"]}, - } - errors = validate(ProjectConfigSchema, config) - assert not errors - - -@pytest.mark.parametrize( - "config", - [ - {"commands": [{"name": "a"}, {"name": "a"}]}, - {"commands": [{"name": "a"}], "workflows": {"a": []}}, - {"commands": [{"name": "a"}], "workflows": {"b": ["c"]}}, - ], -) -def test_project_config_validation1(config): - with pytest.raises(SystemExit): - validate_project_commands(config) - - -@pytest.mark.parametrize( - "config,n_errors", - [ - ({"commands": {"a": []}}, 1), - ({"commands": [{"help": "..."}]}, 1), - ({"commands": [{"name": "a", "extra": "b"}]}, 1), - ({"commands": [{"extra": "b"}]}, 2), - ({"commands": [{"name": "a", "deps": [123]}]}, 1), - ], -) -def test_project_config_validation2(config, n_errors): - errors = validate(ProjectConfigSchema, config) - assert len(errors) == n_errors - - -@pytest.mark.parametrize( - "int_value", - [10, pytest.param("10", marks=pytest.mark.xfail)], -) -def test_project_config_interpolation(int_value): - variables = {"a": int_value, "b": {"c": "foo", "d": True}} - commands = [ - {"name": "x", "script": ["hello ${vars.a} ${vars.b.c}"]}, - {"name": "y", "script": ["${vars.b.c} ${vars.b.d}"]}, - ] - project = {"commands": commands, "vars": variables} - with make_tempdir() as d: - srsly.write_yaml(d / "project.yml", project) - cfg = load_project_config(d) - assert type(cfg) == dict - assert type(cfg["commands"]) == list - assert cfg["commands"][0]["script"][0] == "hello 10 foo" - assert cfg["commands"][1]["script"][0] == "foo true" - commands = [{"name": "x", "script": ["hello ${vars.a} ${vars.b.e}"]}] - project = {"commands": commands, "vars": variables} - with pytest.raises(ConfigValidationError): - substitute_project_variables(project) - - -@pytest.mark.parametrize( - "greeting", - [342, "everyone", "tout le monde", pytest.param("42", marks=pytest.mark.xfail)], -) -def test_project_config_interpolation_override(greeting): - variables = {"a": "world"} - commands = [ - {"name": "x", "script": ["hello ${vars.a}"]}, - ] - overrides = {"vars.a": greeting} - project = {"commands": commands, "vars": variables} - with make_tempdir() as d: - srsly.write_yaml(d / "project.yml", project) - cfg = load_project_config(d, overrides=overrides) - assert type(cfg) == dict - assert type(cfg["commands"]) == list - assert cfg["commands"][0]["script"][0] == f"hello {greeting}" - - -def test_project_config_interpolation_env(): - variables = {"a": 10} - env_var = "SPACY_TEST_FOO" - env_vars = {"foo": env_var} - commands = [{"name": "x", "script": ["hello ${vars.a} ${env.foo}"]}] - project = {"commands": commands, "vars": variables, "env": env_vars} - with make_tempdir() as d: - srsly.write_yaml(d / "project.yml", project) - cfg = load_project_config(d) - assert cfg["commands"][0]["script"][0] == "hello 10 " - os.environ[env_var] = "123" - with make_tempdir() as d: - srsly.write_yaml(d / "project.yml", project) - cfg = load_project_config(d) - assert cfg["commands"][0]["script"][0] == "hello 10 123" - - @pytest.mark.parametrize( "args,expected", [ @@ -709,21 +553,6 @@ def test_get_third_party_dependencies(): get_third_party_dependencies(nlp.config) -@pytest.mark.parametrize( - "parent,child,expected", - [ - ("/tmp", "/tmp", True), - ("/tmp", "/", False), - ("/tmp", "/tmp/subdir", True), - ("/tmp", "/tmpdir", False), - ("/tmp", "/tmp/subdir/..", True), - ("/tmp", "/tmp/..", False), - ], -) -def test_is_subpath_of(parent, child, expected): - assert is_subpath_of(parent, child) == expected - - @pytest.mark.slow @pytest.mark.parametrize( "factory_name,pipe_name", @@ -968,60 +797,6 @@ def test_applycli_user_data(): assert result[0]._.ext == val -def test_local_remote_storage(): - with make_tempdir() as d: - filename = "a.txt" - - content_hashes = ("aaaa", "cccc", "bbbb") - for i, content_hash in enumerate(content_hashes): - # make sure that each subsequent file has a later timestamp - if i > 0: - time.sleep(1) - content = f"{content_hash} content" - loc_file = d / "root" / filename - if not loc_file.parent.exists(): - loc_file.parent.mkdir(parents=True) - with loc_file.open(mode="w") as file_: - file_.write(content) - - # push first version to remote storage - remote = RemoteStorage(d / "root", str(d / "remote")) - remote.push(filename, "aaaa", content_hash) - - # retrieve with full hashes - loc_file.unlink() - remote.pull(filename, command_hash="aaaa", content_hash=content_hash) - with loc_file.open(mode="r") as file_: - assert file_.read() == content - - # retrieve with command hash - loc_file.unlink() - remote.pull(filename, command_hash="aaaa") - with loc_file.open(mode="r") as file_: - assert file_.read() == content - - # retrieve with content hash - loc_file.unlink() - remote.pull(filename, content_hash=content_hash) - with loc_file.open(mode="r") as file_: - assert file_.read() == content - - # retrieve with no hashes - loc_file.unlink() - remote.pull(filename) - with loc_file.open(mode="r") as file_: - assert file_.read() == content - - -def test_local_remote_storage_pull_missing(): - # pulling from a non-existent remote pulls nothing gracefully - with make_tempdir() as d: - filename = "a.txt" - remote = RemoteStorage(d / "root", str(d / "remote")) - assert remote.pull(filename, command_hash="aaaa") is None - assert remote.pull(filename) is None - - def test_cli_find_threshold(capsys): def make_examples(nlp: Language) -> List[Example]: docs: List[Example] = [] @@ -1132,63 +907,6 @@ def test_cli_find_threshold(capsys): ) -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -@pytest.mark.parametrize( - "reqs,output", - [ - [ - """ - spacy - - # comment - - thinc""", - (False, False), - ], - [ - """# comment - --some-flag - spacy""", - (False, False), - ], - [ - """# comment - --some-flag - spacy; python_version >= '3.6'""", - (False, False), - ], - [ - """# comment - spacyunknowndoesnotexist12345""", - (True, False), - ], - ], -) -def test_project_check_requirements(reqs, output): - import pkg_resources - - # excessive guard against unlikely package name - try: - pkg_resources.require("spacyunknowndoesnotexist12345") - except pkg_resources.DistributionNotFound: - assert output == _check_requirements([req.strip() for req in reqs.split("\n")]) - - -def test_upload_download_local_file(): - with make_tempdir() as d1, make_tempdir() as d2: - filename = "f.txt" - content = "content" - local_file = d1 / filename - remote_file = d2 / filename - with local_file.open(mode="w") as file_: - file_.write(content) - upload_file(local_file, remote_file) - local_file.unlink() - download_file(remote_file, local_file) - with local_file.open(mode="r") as file_: - assert file_.read() == content - - def test_walk_directory(): with make_tempdir() as d: files = [ diff --git a/spacy/util.py b/spacy/util.py index 8cc89217d..73727ca62 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -78,7 +78,6 @@ logger.addHandler(logger_stream_handler) class ENV_VARS: CONFIG_OVERRIDES = "SPACY_CONFIG_OVERRIDES" - PROJECT_USE_GIT_VERSION = "SPACY_PROJECT_USE_GIT_VERSION" class registry(thinc.registry): @@ -951,23 +950,12 @@ def replace_model_node(model: Model, target: Model, replacement: Model) -> None: def split_command(command: str) -> List[str]: """Split a string command using shlex. Handles platform compatibility. - command (str) : The command to split RETURNS (List[str]): The split command. """ return shlex.split(command, posix=not is_windows) -def join_command(command: List[str]) -> str: - """Join a command using shlex. shlex.join is only available for Python 3.8+, - so we're using a workaround here. - - command (List[str]): The command to join. - RETURNS (str): The joined command - """ - return " ".join(shlex.quote(cmd) for cmd in command) - - def run_command( command: Union[str, List[str]], *, @@ -976,7 +964,6 @@ def run_command( ) -> subprocess.CompletedProcess: """Run a command on the command line as a subprocess. If the subprocess returns a non-zero exit code, a system exit is performed. - command (str / List[str]): The command. If provided as a string, the string will be split using shlex.split. stdin (Optional[Any]): stdin to read from or None. @@ -1027,7 +1014,6 @@ def run_command( @contextmanager def working_dir(path: Union[str, Path]) -> Iterator[Path]: """Change current working directory and returns to previous on exit. - path (str / Path): The directory to navigate to. YIELDS (Path): The absolute path to the current working directory. This should be used if the block needs to perform actions within the working @@ -1046,7 +1032,6 @@ def working_dir(path: Union[str, Path]) -> Iterator[Path]: def make_tempdir() -> Generator[Path, None, None]: """Execute a block in a temporary directory and remove the directory and its contents at the end of the with block. - YIELDS (Path): The path of the temp directory. """ d = Path(tempfile.mkdtemp()) @@ -1064,15 +1049,6 @@ def make_tempdir() -> Generator[Path, None, None]: warnings.warn(Warnings.W091.format(dir=d, msg=e)) -def is_cwd(path: Union[Path, str]) -> bool: - """Check whether a path is the current working directory. - - path (Union[Path, str]): The directory path. - RETURNS (bool): Whether the path is the current working directory. - """ - return str(Path(path).resolve()).lower() == str(Path.cwd().resolve()).lower() - - def is_in_jupyter() -> bool: """Check if user is running spaCy from a Jupyter notebook by detecting the IPython kernel. Mainly used for the displaCy visualizer.