mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-13 17:52:31 +03:00
remove project-specific functionality
This commit is contained in:
parent
3bf4539e31
commit
91fb3ef592
|
@ -1,4 +1,4 @@
|
||||||
from typing import Dict, Any, Union, List, Optional, Tuple, Iterable
|
from typing import Dict, Any, Union, List, Optional, Iterable
|
||||||
from typing import TYPE_CHECKING, overload
|
from typing import TYPE_CHECKING, overload
|
||||||
import sys
|
import sys
|
||||||
import shutil
|
import shutil
|
||||||
|
@ -11,16 +11,13 @@ from click import NoSuchOption
|
||||||
from click.parser import split_arg_string
|
from click.parser import split_arg_string
|
||||||
from typer.main import get_command
|
from typer.main import get_command
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from thinc.api import Config, ConfigValidationError, require_gpu
|
from thinc.api import ConfigValidationError, require_gpu
|
||||||
from thinc.util import gpu_is_available
|
from thinc.util import gpu_is_available
|
||||||
from configparser import InterpolationError
|
from configparser import InterpolationError
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from ..compat import Literal
|
from ..compat import Literal
|
||||||
from ..schemas import ProjectConfigSchema, validate
|
from ..util import import_file, registry, logger, ENV_VARS
|
||||||
from ..util import import_file, run_command, make_tempdir, registry, logger
|
|
||||||
from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
|
|
||||||
from .. import about
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from pathy import FluidPath # noqa: F401
|
from pathy import FluidPath # noqa: F401
|
||||||
|
@ -30,7 +27,6 @@ SDIST_SUFFIX = ".tar.gz"
|
||||||
WHEEL_SUFFIX = "-py3-none-any.whl"
|
WHEEL_SUFFIX = "-py3-none-any.whl"
|
||||||
|
|
||||||
PROJECT_FILE = "project.yml"
|
PROJECT_FILE = "project.yml"
|
||||||
PROJECT_LOCK = "project.lock"
|
|
||||||
COMMAND = "python -m spacy"
|
COMMAND = "python -m spacy"
|
||||||
NAME = "spacy"
|
NAME = "spacy"
|
||||||
HELP = """spaCy Command-line Interface
|
HELP = """spaCy Command-line Interface
|
||||||
|
@ -135,115 +131,6 @@ def _parse_override(value: Any) -> Any:
|
||||||
return str(value)
|
return str(value)
|
||||||
|
|
||||||
|
|
||||||
def load_project_config(
|
|
||||||
path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
|
|
||||||
) -> Dict[str, Any]:
|
|
||||||
"""Load the project.yml file from a directory and validate it. Also make
|
|
||||||
sure that all directories defined in the config exist.
|
|
||||||
|
|
||||||
path (Path): The path to the project directory.
|
|
||||||
interpolate (bool): Whether to substitute project variables.
|
|
||||||
overrides (Dict[str, Any]): Optional config overrides.
|
|
||||||
RETURNS (Dict[str, Any]): The loaded project.yml.
|
|
||||||
"""
|
|
||||||
config_path = path / PROJECT_FILE
|
|
||||||
if not config_path.exists():
|
|
||||||
msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
|
|
||||||
invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
|
|
||||||
try:
|
|
||||||
config = srsly.read_yaml(config_path)
|
|
||||||
except ValueError as e:
|
|
||||||
msg.fail(invalid_err, e, exits=1)
|
|
||||||
errors = validate(ProjectConfigSchema, config)
|
|
||||||
if errors:
|
|
||||||
msg.fail(invalid_err)
|
|
||||||
print("\n".join(errors))
|
|
||||||
sys.exit(1)
|
|
||||||
validate_project_version(config)
|
|
||||||
validate_project_commands(config)
|
|
||||||
if interpolate:
|
|
||||||
err = f"{PROJECT_FILE} validation error"
|
|
||||||
with show_validation_error(title=err, hint_fill=False):
|
|
||||||
config = substitute_project_variables(config, overrides)
|
|
||||||
# Make sure directories defined in config exist
|
|
||||||
for subdir in config.get("directories", []):
|
|
||||||
dir_path = path / subdir
|
|
||||||
if not dir_path.exists():
|
|
||||||
dir_path.mkdir(parents=True)
|
|
||||||
return config
|
|
||||||
|
|
||||||
|
|
||||||
def substitute_project_variables(
|
|
||||||
config: Dict[str, Any],
|
|
||||||
overrides: Dict[str, Any] = SimpleFrozenDict(),
|
|
||||||
key: str = "vars",
|
|
||||||
env_key: str = "env",
|
|
||||||
) -> Dict[str, Any]:
|
|
||||||
"""Interpolate variables in the project file using the config system.
|
|
||||||
|
|
||||||
config (Dict[str, Any]): The project config.
|
|
||||||
overrides (Dict[str, Any]): Optional config overrides.
|
|
||||||
key (str): Key containing variables in project config.
|
|
||||||
env_key (str): Key containing environment variable mapping in project config.
|
|
||||||
RETURNS (Dict[str, Any]): The interpolated project config.
|
|
||||||
"""
|
|
||||||
config.setdefault(key, {})
|
|
||||||
config.setdefault(env_key, {})
|
|
||||||
# Substitute references to env vars with their values
|
|
||||||
for config_var, env_var in config[env_key].items():
|
|
||||||
config[env_key][config_var] = _parse_override(os.environ.get(env_var, ""))
|
|
||||||
# Need to put variables in the top scope again so we can have a top-level
|
|
||||||
# section "project" (otherwise, a list of commands in the top scope wouldn't)
|
|
||||||
# be allowed by Thinc's config system
|
|
||||||
cfg = Config({"project": config, key: config[key], env_key: config[env_key]})
|
|
||||||
cfg = Config().from_str(cfg.to_str(), overrides=overrides)
|
|
||||||
interpolated = cfg.interpolate()
|
|
||||||
return dict(interpolated["project"])
|
|
||||||
|
|
||||||
|
|
||||||
def validate_project_version(config: Dict[str, Any]) -> None:
|
|
||||||
"""If the project defines a compatible spaCy version range, chec that it's
|
|
||||||
compatible with the current version of spaCy.
|
|
||||||
|
|
||||||
config (Dict[str, Any]): The loaded config.
|
|
||||||
"""
|
|
||||||
spacy_version = config.get("spacy_version", None)
|
|
||||||
if spacy_version and not is_compatible_version(about.__version__, spacy_version):
|
|
||||||
err = (
|
|
||||||
f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) "
|
|
||||||
f"that's not compatible with the version of spaCy you're running "
|
|
||||||
f"({about.__version__}). You can edit version requirement in the "
|
|
||||||
f"{PROJECT_FILE} to load it, but the project may not run as expected."
|
|
||||||
)
|
|
||||||
msg.fail(err, exits=1)
|
|
||||||
|
|
||||||
|
|
||||||
def validate_project_commands(config: Dict[str, Any]) -> None:
|
|
||||||
"""Check that project commands and workflows are valid, don't contain
|
|
||||||
duplicates, don't clash and only refer to commands that exist.
|
|
||||||
|
|
||||||
config (Dict[str, Any]): The loaded config.
|
|
||||||
"""
|
|
||||||
command_names = [cmd["name"] for cmd in config.get("commands", [])]
|
|
||||||
workflows = config.get("workflows", {})
|
|
||||||
duplicates = set([cmd for cmd in command_names if command_names.count(cmd) > 1])
|
|
||||||
if duplicates:
|
|
||||||
err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}"
|
|
||||||
msg.fail(err, exits=1)
|
|
||||||
for workflow_name, workflow_steps in workflows.items():
|
|
||||||
if workflow_name in command_names:
|
|
||||||
err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
|
|
||||||
msg.fail(err, exits=1)
|
|
||||||
for step in workflow_steps:
|
|
||||||
if step not in command_names:
|
|
||||||
msg.fail(
|
|
||||||
f"Unknown command specified in workflow '{workflow_name}': {step}",
|
|
||||||
f"Workflows can only refer to commands defined in the 'commands' "
|
|
||||||
f"section of the {PROJECT_FILE}.",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
|
def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
|
||||||
"""Get the hash for a JSON-serializable object.
|
"""Get the hash for a JSON-serializable object.
|
||||||
|
|
||||||
|
@ -381,158 +268,6 @@ def ensure_pathy(path):
|
||||||
return Pathy.fluid(path)
|
return Pathy.fluid(path)
|
||||||
|
|
||||||
|
|
||||||
def git_checkout(
|
|
||||||
repo: str, subpath: str, dest: Path, *, branch: str = "master", sparse: bool = False
|
|
||||||
):
|
|
||||||
git_version = get_git_version()
|
|
||||||
if dest.exists():
|
|
||||||
msg.fail("Destination of checkout must not exist", exits=1)
|
|
||||||
if not dest.parent.exists():
|
|
||||||
msg.fail("Parent of destination of checkout must exist", exits=1)
|
|
||||||
if sparse and git_version >= (2, 22):
|
|
||||||
return git_sparse_checkout(repo, subpath, dest, branch)
|
|
||||||
elif sparse:
|
|
||||||
# Only show warnings if the user explicitly wants sparse checkout but
|
|
||||||
# the Git version doesn't support it
|
|
||||||
err_old = (
|
|
||||||
f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
|
|
||||||
f"that doesn't fully support sparse checkout yet."
|
|
||||||
)
|
|
||||||
err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
|
|
||||||
msg.warn(
|
|
||||||
f"{err_unk if git_version == (0, 0) else err_old} "
|
|
||||||
f"This means that more files than necessary may be downloaded "
|
|
||||||
f"temporarily. To only download the files needed, make sure "
|
|
||||||
f"you're using Git v2.22 or above."
|
|
||||||
)
|
|
||||||
with make_tempdir() as tmp_dir:
|
|
||||||
cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}"
|
|
||||||
run_command(cmd, capture=True)
|
|
||||||
# We need Path(name) to make sure we also support subdirectories
|
|
||||||
try:
|
|
||||||
source_path = tmp_dir / Path(subpath)
|
|
||||||
if not is_subpath_of(tmp_dir, source_path):
|
|
||||||
err = f"'{subpath}' is a path outside of the cloned repository."
|
|
||||||
msg.fail(err, repo, exits=1)
|
|
||||||
shutil.copytree(str(source_path), str(dest))
|
|
||||||
except FileNotFoundError:
|
|
||||||
err = f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')"
|
|
||||||
msg.fail(err, repo, exits=1)
|
|
||||||
|
|
||||||
|
|
||||||
def git_sparse_checkout(repo, subpath, dest, branch):
|
|
||||||
# We're using Git, partial clone and sparse checkout to
|
|
||||||
# only clone the files we need
|
|
||||||
# This ends up being RIDICULOUS. omg.
|
|
||||||
# So, every tutorial and SO post talks about 'sparse checkout'...But they
|
|
||||||
# go and *clone* the whole repo. Worthless. And cloning part of a repo
|
|
||||||
# turns out to be completely broken. The only way to specify a "path" is..
|
|
||||||
# a path *on the server*? The contents of which, specifies the paths. Wat.
|
|
||||||
# Obviously this is hopelessly broken and insecure, because you can query
|
|
||||||
# arbitrary paths on the server! So nobody enables this.
|
|
||||||
# What we have to do is disable *all* files. We could then just checkout
|
|
||||||
# the path, and it'd "work", but be hopelessly slow...Because it goes and
|
|
||||||
# transfers every missing object one-by-one. So the final piece is that we
|
|
||||||
# need to use some weird git internals to fetch the missings in bulk, and
|
|
||||||
# *that* we can do by path.
|
|
||||||
# We're using Git and sparse checkout to only clone the files we need
|
|
||||||
with make_tempdir() as tmp_dir:
|
|
||||||
# This is the "clone, but don't download anything" part.
|
|
||||||
cmd = (
|
|
||||||
f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
|
|
||||||
f"-b {branch} --filter=blob:none"
|
|
||||||
)
|
|
||||||
run_command(cmd)
|
|
||||||
# Now we need to find the missing filenames for the subpath we want.
|
|
||||||
# Looking for this 'rev-list' command in the git --help? Hah.
|
|
||||||
cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
|
|
||||||
ret = run_command(cmd, capture=True)
|
|
||||||
git_repo = _http_to_git(repo)
|
|
||||||
# Now pass those missings into another bit of git internals
|
|
||||||
missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
|
|
||||||
if not missings:
|
|
||||||
err = (
|
|
||||||
f"Could not find any relevant files for '{subpath}'. "
|
|
||||||
f"Did you specify a correct and complete path within repo '{repo}' "
|
|
||||||
f"and branch {branch}?"
|
|
||||||
)
|
|
||||||
msg.fail(err, exits=1)
|
|
||||||
cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
|
|
||||||
run_command(cmd, capture=True)
|
|
||||||
# And finally, we can checkout our subpath
|
|
||||||
cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
|
|
||||||
run_command(cmd, capture=True)
|
|
||||||
|
|
||||||
# Get a subdirectory of the cloned path, if appropriate
|
|
||||||
source_path = tmp_dir / Path(subpath)
|
|
||||||
if not is_subpath_of(tmp_dir, source_path):
|
|
||||||
err = f"'{subpath}' is a path outside of the cloned repository."
|
|
||||||
msg.fail(err, repo, exits=1)
|
|
||||||
|
|
||||||
shutil.move(str(source_path), str(dest))
|
|
||||||
|
|
||||||
|
|
||||||
def git_repo_branch_exists(repo: str, branch: str) -> bool:
|
|
||||||
"""Uses 'git ls-remote' to check if a repository and branch exists
|
|
||||||
|
|
||||||
repo (str): URL to get repo.
|
|
||||||
branch (str): Branch on repo to check.
|
|
||||||
RETURNS (bool): True if repo:branch exists.
|
|
||||||
"""
|
|
||||||
get_git_version()
|
|
||||||
cmd = f"git ls-remote {repo} {branch}"
|
|
||||||
# We might be tempted to use `--exit-code` with `git ls-remote`, but
|
|
||||||
# `run_command` handles the `returncode` for us, so we'll rely on
|
|
||||||
# the fact that stdout returns '' if the requested branch doesn't exist
|
|
||||||
ret = run_command(cmd, capture=True)
|
|
||||||
exists = ret.stdout != ""
|
|
||||||
return exists
|
|
||||||
|
|
||||||
|
|
||||||
def get_git_version(
|
|
||||||
error: str = "Could not run 'git'. Make sure it's installed and the executable is available.",
|
|
||||||
) -> Tuple[int, int]:
|
|
||||||
"""Get the version of git and raise an error if calling 'git --version' fails.
|
|
||||||
|
|
||||||
error (str): The error message to show.
|
|
||||||
RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
|
|
||||||
(0, 0) if the version couldn't be determined.
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
ret = run_command("git --version", capture=True)
|
|
||||||
except:
|
|
||||||
raise RuntimeError(error)
|
|
||||||
stdout = ret.stdout.strip()
|
|
||||||
if not stdout or not stdout.startswith("git version"):
|
|
||||||
return 0, 0
|
|
||||||
version = stdout[11:].strip().split(".")
|
|
||||||
return int(version[0]), int(version[1])
|
|
||||||
|
|
||||||
|
|
||||||
def _http_to_git(repo: str) -> str:
|
|
||||||
if repo.startswith("http://"):
|
|
||||||
repo = repo.replace(r"http://", r"https://")
|
|
||||||
if repo.startswith(r"https://"):
|
|
||||||
repo = repo.replace("https://", "git@").replace("/", ":", 1)
|
|
||||||
if repo.endswith("/"):
|
|
||||||
repo = repo[:-1]
|
|
||||||
repo = f"{repo}.git"
|
|
||||||
return repo
|
|
||||||
|
|
||||||
|
|
||||||
def is_subpath_of(parent, child):
|
|
||||||
"""
|
|
||||||
Check whether `child` is a path contained within `parent`.
|
|
||||||
"""
|
|
||||||
# Based on https://stackoverflow.com/a/37095733 .
|
|
||||||
|
|
||||||
# In Python 3.9, the `Path.is_relative_to()` method will supplant this, so
|
|
||||||
# we can stop using crusty old os.path functions.
|
|
||||||
parent_realpath = os.path.realpath(parent)
|
|
||||||
child_realpath = os.path.realpath(child)
|
|
||||||
return os.path.commonpath([parent_realpath, child_realpath]) == parent_realpath
|
|
||||||
|
|
||||||
|
|
||||||
@overload
|
@overload
|
||||||
def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]:
|
def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]:
|
||||||
...
|
...
|
||||||
|
|
Loading…
Reference in New Issue
Block a user