mirror of
https://github.com/explosion/spaCy.git
synced 2025-04-22 18:12:00 +03:00
remove project-specific CLI commands
This commit is contained in:
parent
9a25f386f3
commit
926094ad59
|
@ -1,206 +0,0 @@
|
|||
from typing import Any, Dict, Optional
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import requests
|
||||
import typer
|
||||
|
||||
from ...util import ensure_path, working_dir
|
||||
from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
|
||||
from .._util import get_checksum, download_file, git_checkout, get_git_version
|
||||
from .._util import SimpleFrozenDict, parse_config_overrides
|
||||
|
||||
# Whether assets are extra if `extra` is not set.
|
||||
EXTRA_DEFAULT = False
|
||||
|
||||
|
||||
@project_cli.command(
|
||||
"assets",
|
||||
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||
)
|
||||
def project_assets_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+."),
|
||||
extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.")
|
||||
# fmt: on
|
||||
):
|
||||
"""Fetch project assets like datasets and pretrained weights. Assets are
|
||||
defined in the "assets" section of the project.yml. If a checksum is
|
||||
provided in the project.yml, the file is only downloaded if no local file
|
||||
with the same checksum exists.
|
||||
|
||||
DOCS: https://spacy.io/api/cli#project-assets
|
||||
"""
|
||||
overrides = parse_config_overrides(ctx.args)
|
||||
project_assets(
|
||||
project_dir,
|
||||
overrides=overrides,
|
||||
sparse_checkout=sparse_checkout,
|
||||
extra=extra,
|
||||
)
|
||||
|
||||
|
||||
def project_assets(
|
||||
project_dir: Path,
|
||||
*,
|
||||
overrides: Dict[str, Any] = SimpleFrozenDict(),
|
||||
sparse_checkout: bool = False,
|
||||
extra: bool = False,
|
||||
) -> None:
|
||||
"""Fetch assets for a project using DVC if possible.
|
||||
|
||||
project_dir (Path): Path to project directory.
|
||||
sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files
|
||||
needed.
|
||||
extra (bool): Whether to download all assets, including those marked as 'extra'.
|
||||
"""
|
||||
project_path = ensure_path(project_dir)
|
||||
config = load_project_config(project_path, overrides=overrides)
|
||||
assets = [
|
||||
asset
|
||||
for asset in config.get("assets", [])
|
||||
if extra or not asset.get("extra", EXTRA_DEFAULT)
|
||||
]
|
||||
if not assets:
|
||||
msg.warn(
|
||||
f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)",
|
||||
exits=0,
|
||||
)
|
||||
msg.info(f"Fetching {len(assets)} asset(s)")
|
||||
|
||||
for asset in assets:
|
||||
dest = (project_dir / asset["dest"]).resolve()
|
||||
checksum = asset.get("checksum")
|
||||
if "git" in asset:
|
||||
git_err = (
|
||||
f"Cloning spaCy project templates requires Git and the 'git' command. "
|
||||
f"Make sure it's installed and that the executable is available."
|
||||
)
|
||||
get_git_version(error=git_err)
|
||||
if dest.exists():
|
||||
# If there's already a file, check for checksum
|
||||
if checksum and checksum == get_checksum(dest):
|
||||
msg.good(
|
||||
f"Skipping download with matching checksum: {asset['dest']}"
|
||||
)
|
||||
continue
|
||||
else:
|
||||
if dest.is_dir():
|
||||
shutil.rmtree(dest)
|
||||
else:
|
||||
dest.unlink()
|
||||
if "repo" not in asset["git"] or asset["git"]["repo"] is None:
|
||||
msg.fail(
|
||||
"A git asset must include 'repo', the repository address.", exits=1
|
||||
)
|
||||
if "path" not in asset["git"] or asset["git"]["path"] is None:
|
||||
msg.fail(
|
||||
"A git asset must include 'path' - use \"\" to get the entire repository.",
|
||||
exits=1,
|
||||
)
|
||||
git_checkout(
|
||||
asset["git"]["repo"],
|
||||
asset["git"]["path"],
|
||||
dest,
|
||||
branch=asset["git"].get("branch"),
|
||||
sparse=sparse_checkout,
|
||||
)
|
||||
msg.good(f"Downloaded asset {dest}")
|
||||
else:
|
||||
url = asset.get("url")
|
||||
if not url:
|
||||
# project.yml defines asset without URL that the user has to place
|
||||
check_private_asset(dest, checksum)
|
||||
continue
|
||||
fetch_asset(project_path, url, dest, checksum)
|
||||
|
||||
|
||||
def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
|
||||
"""Check and validate assets without a URL (private assets that the user
|
||||
has to provide themselves) and give feedback about the checksum.
|
||||
|
||||
dest (Path): Destination path of the asset.
|
||||
checksum (Optional[str]): Optional checksum of the expected file.
|
||||
"""
|
||||
if not Path(dest).exists():
|
||||
err = f"No URL provided for asset. You need to add this file yourself: {dest}"
|
||||
msg.warn(err)
|
||||
else:
|
||||
if not checksum:
|
||||
msg.good(f"Asset already exists: {dest}")
|
||||
elif checksum == get_checksum(dest):
|
||||
msg.good(f"Asset exists with matching checksum: {dest}")
|
||||
else:
|
||||
msg.fail(f"Asset available but with incorrect checksum: {dest}")
|
||||
|
||||
|
||||
def fetch_asset(
|
||||
project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
|
||||
) -> None:
|
||||
"""Fetch an asset from a given URL or path. If a checksum is provided and a
|
||||
local file exists, it's only re-downloaded if the checksum doesn't match.
|
||||
|
||||
project_path (Path): Path to project directory.
|
||||
url (str): URL or path to asset.
|
||||
checksum (Optional[str]): Optional expected checksum of local file.
|
||||
RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
|
||||
the asset failed.
|
||||
"""
|
||||
dest_path = (project_path / dest).resolve()
|
||||
if dest_path.exists():
|
||||
# If there's already a file, check for checksum
|
||||
if checksum:
|
||||
if checksum == get_checksum(dest_path):
|
||||
msg.good(f"Skipping download with matching checksum: {dest}")
|
||||
return
|
||||
else:
|
||||
# If there's not a checksum, make sure the file is a possibly valid size
|
||||
if os.path.getsize(dest_path) == 0:
|
||||
msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}")
|
||||
os.remove(dest_path)
|
||||
# We might as well support the user here and create parent directories in
|
||||
# case the asset dir isn't listed as a dir to create in the project.yml
|
||||
if not dest_path.parent.exists():
|
||||
dest_path.parent.mkdir(parents=True)
|
||||
with working_dir(project_path):
|
||||
url = convert_asset_url(url)
|
||||
try:
|
||||
download_file(url, dest_path)
|
||||
msg.good(f"Downloaded asset {dest}")
|
||||
except requests.exceptions.RequestException as e:
|
||||
if Path(url).exists() and Path(url).is_file():
|
||||
# If it's a local file, copy to destination
|
||||
shutil.copy(url, str(dest_path))
|
||||
msg.good(f"Copied local asset {dest}")
|
||||
else:
|
||||
msg.fail(f"Download failed: {dest}", e)
|
||||
if checksum and checksum != get_checksum(dest_path):
|
||||
msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
|
||||
|
||||
|
||||
def convert_asset_url(url: str) -> str:
|
||||
"""Check and convert the asset URL if needed.
|
||||
|
||||
url (str): The asset URL.
|
||||
RETURNS (str): The converted URL.
|
||||
"""
|
||||
# If the asset URL is a regular GitHub URL it's likely a mistake
|
||||
if (
|
||||
re.match(r"(http(s?)):\/\/github.com", url)
|
||||
and "releases/download" not in url
|
||||
and "/raw/" not in url
|
||||
):
|
||||
converted = url.replace("github.com", "raw.githubusercontent.com")
|
||||
converted = re.sub(r"/(tree|blob)/", "/", converted)
|
||||
msg.warn(
|
||||
"Downloading from a regular GitHub URL. This will only download "
|
||||
"the source of the page, not the actual file. Converting the URL "
|
||||
"to a raw URL.",
|
||||
converted,
|
||||
)
|
||||
return converted
|
||||
return url
|
|
@ -1,115 +0,0 @@
|
|||
from typing import Optional
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
import subprocess
|
||||
import re
|
||||
|
||||
from ... import about
|
||||
from ...util import ensure_path
|
||||
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
|
||||
from .._util import git_checkout, get_git_version, git_repo_branch_exists
|
||||
|
||||
DEFAULT_REPO = about.__projects__
|
||||
DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
|
||||
DEFAULT_BRANCHES = ["main", "master"]
|
||||
|
||||
|
||||
@project_cli.command("clone")
|
||||
def project_clone_cli(
|
||||
# fmt: off
|
||||
name: str = Arg(..., help="The name of the template to clone"),
|
||||
dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
|
||||
repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"),
|
||||
branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"),
|
||||
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+.")
|
||||
# fmt: on
|
||||
):
|
||||
"""Clone a project template from a repository. Calls into "git" and will
|
||||
only download the files from the given subdirectory. The GitHub repo
|
||||
defaults to the official spaCy template repo, but can be customized
|
||||
(including using a private repo).
|
||||
|
||||
DOCS: https://spacy.io/api/cli#project-clone
|
||||
"""
|
||||
if dest is None:
|
||||
dest = Path.cwd() / Path(name).parts[-1]
|
||||
if repo == DEFAULT_REPO and branch is None:
|
||||
branch = DEFAULT_PROJECTS_BRANCH
|
||||
|
||||
if branch is None:
|
||||
for default_branch in DEFAULT_BRANCHES:
|
||||
if git_repo_branch_exists(repo, default_branch):
|
||||
branch = default_branch
|
||||
break
|
||||
if branch is None:
|
||||
default_branches_msg = ", ".join(f"'{b}'" for b in DEFAULT_BRANCHES)
|
||||
msg.fail(
|
||||
"No branch provided and attempted default "
|
||||
f"branches {default_branches_msg} do not exist.",
|
||||
exits=1,
|
||||
)
|
||||
else:
|
||||
if not git_repo_branch_exists(repo, branch):
|
||||
msg.fail(f"repo: {repo} (branch: {branch}) does not exist.", exits=1)
|
||||
assert isinstance(branch, str)
|
||||
project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout)
|
||||
|
||||
|
||||
def project_clone(
|
||||
name: str,
|
||||
dest: Path,
|
||||
*,
|
||||
repo: str = about.__projects__,
|
||||
branch: str = about.__projects_branch__,
|
||||
sparse_checkout: bool = False,
|
||||
) -> None:
|
||||
"""Clone a project template from a repository.
|
||||
|
||||
name (str): Name of subdirectory to clone.
|
||||
dest (Path): Destination path of cloned project.
|
||||
repo (str): URL of Git repo containing project templates.
|
||||
branch (str): The branch to clone from
|
||||
"""
|
||||
dest = ensure_path(dest)
|
||||
check_clone(name, dest, repo)
|
||||
project_dir = dest.resolve()
|
||||
repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
|
||||
try:
|
||||
git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
|
||||
except subprocess.CalledProcessError:
|
||||
err = f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')"
|
||||
msg.fail(err, exits=1)
|
||||
msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir)
|
||||
if not (project_dir / PROJECT_FILE).exists():
|
||||
msg.warn(f"No {PROJECT_FILE} found in directory")
|
||||
else:
|
||||
msg.good(f"Your project is now ready!")
|
||||
print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
|
||||
|
||||
|
||||
def check_clone(name: str, dest: Path, repo: str) -> None:
|
||||
"""Check and validate that the destination path can be used to clone. Will
|
||||
check that Git is available and that the destination path is suitable.
|
||||
|
||||
name (str): Name of the directory to clone from the repo.
|
||||
dest (Path): Local destination of cloned directory.
|
||||
repo (str): URL of the repo to clone from.
|
||||
"""
|
||||
git_err = (
|
||||
f"Cloning spaCy project templates requires Git and the 'git' command. "
|
||||
f"To clone a project without Git, copy the files from the '{name}' "
|
||||
f"directory in the {repo} to {dest} manually."
|
||||
)
|
||||
get_git_version(error=git_err)
|
||||
if not dest:
|
||||
msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
|
||||
if dest.exists():
|
||||
# Directory already exists (not allowed, clone needs to create it)
|
||||
msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
|
||||
if not dest.parent.exists():
|
||||
# We're not creating parents, parent dir should exist
|
||||
msg.fail(
|
||||
f"Can't clone project, parent directory doesn't exist: {dest.parent}. "
|
||||
f"Create the necessary folder(s) first before continuing.",
|
||||
exits=1,
|
||||
)
|
|
@ -1,115 +0,0 @@
|
|||
from pathlib import Path
|
||||
from wasabi import msg, MarkdownRenderer
|
||||
|
||||
from ...util import working_dir
|
||||
from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
|
||||
|
||||
|
||||
DOCS_URL = "https://spacy.io"
|
||||
INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the
|
||||
project, as well as the available commands and workflows. For details, see the
|
||||
[spaCy projects documentation]({DOCS_URL}/usage/projects)."""
|
||||
INTRO_COMMANDS = f"""The following commands are defined by the project. They
|
||||
can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run).
|
||||
Commands are only re-run if their inputs have changed."""
|
||||
INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They
|
||||
can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run)
|
||||
and will run the specified commands in order. Commands are only re-run if their
|
||||
inputs have changed."""
|
||||
INTRO_ASSETS = f"""The following assets are defined by the project. They can
|
||||
be fetched by running [`spacy project assets`]({DOCS_URL}/api/cli#project-assets)
|
||||
in the project directory."""
|
||||
# These markers are added to the Markdown and can be used to update the file in
|
||||
# place if it already exists. Only the auto-generated part will be replaced.
|
||||
MARKER_START = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS START (do not remove) -->"
|
||||
MARKER_END = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS END (do not remove) -->"
|
||||
# If this marker is used in an existing README, it's ignored and not replaced
|
||||
MARKER_IGNORE = "<!-- SPACY PROJECT: IGNORE -->"
|
||||
|
||||
|
||||
@project_cli.command("document")
|
||||
def project_document_cli(
|
||||
# fmt: off
|
||||
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
output_file: Path = Opt("-", "--output", "-o", help="Path to output Markdown file for output. Defaults to - for standard output"),
|
||||
no_emoji: bool = Opt(False, "--no-emoji", "-NE", help="Don't use emoji")
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Auto-generate a README.md for a project. If the content is saved to a file,
|
||||
hidden markers are added so you can add custom content before or after the
|
||||
auto-generated section and only the auto-generated docs will be replaced
|
||||
when you re-run the command.
|
||||
|
||||
DOCS: https://spacy.io/api/cli#project-document
|
||||
"""
|
||||
project_document(project_dir, output_file, no_emoji=no_emoji)
|
||||
|
||||
|
||||
def project_document(
|
||||
project_dir: Path, output_file: Path, *, no_emoji: bool = False
|
||||
) -> None:
|
||||
is_stdout = str(output_file) == "-"
|
||||
config = load_project_config(project_dir)
|
||||
md = MarkdownRenderer(no_emoji=no_emoji)
|
||||
md.add(MARKER_START)
|
||||
title = config.get("title")
|
||||
description = config.get("description")
|
||||
md.add(md.title(1, f"spaCy Project{f': {title}' if title else ''}", "🪐"))
|
||||
if description:
|
||||
md.add(description)
|
||||
md.add(md.title(2, PROJECT_FILE, "📋"))
|
||||
md.add(INTRO_PROJECT)
|
||||
# Commands
|
||||
cmds = config.get("commands", [])
|
||||
data = [(md.code(cmd["name"]), cmd.get("help", "")) for cmd in cmds]
|
||||
if data:
|
||||
md.add(md.title(3, "Commands", "⏯"))
|
||||
md.add(INTRO_COMMANDS)
|
||||
md.add(md.table(data, ["Command", "Description"]))
|
||||
# Workflows
|
||||
wfs = config.get("workflows", {}).items()
|
||||
data = [(md.code(n), " → ".join(md.code(w) for w in stp)) for n, stp in wfs]
|
||||
if data:
|
||||
md.add(md.title(3, "Workflows", "⏭"))
|
||||
md.add(INTRO_WORKFLOWS)
|
||||
md.add(md.table(data, ["Workflow", "Steps"]))
|
||||
# Assets
|
||||
assets = config.get("assets", [])
|
||||
data = []
|
||||
for a in assets:
|
||||
source = "Git" if a.get("git") else "URL" if a.get("url") else "Local"
|
||||
dest_path = a["dest"]
|
||||
dest = md.code(dest_path)
|
||||
if source == "Local":
|
||||
# Only link assets if they're in the repo
|
||||
with working_dir(project_dir) as p:
|
||||
if (p / dest_path).exists():
|
||||
dest = md.link(dest, dest_path)
|
||||
data.append((dest, source, a.get("description", "")))
|
||||
if data:
|
||||
md.add(md.title(3, "Assets", "🗂"))
|
||||
md.add(INTRO_ASSETS)
|
||||
md.add(md.table(data, ["File", "Source", "Description"]))
|
||||
md.add(MARKER_END)
|
||||
# Output result
|
||||
if is_stdout:
|
||||
print(md.text)
|
||||
else:
|
||||
content = md.text
|
||||
if output_file.exists():
|
||||
with output_file.open("r", encoding="utf8") as f:
|
||||
existing = f.read()
|
||||
if MARKER_IGNORE in existing:
|
||||
msg.warn("Found ignore marker in existing file: skipping", output_file)
|
||||
return
|
||||
if MARKER_START in existing and MARKER_END in existing:
|
||||
msg.info("Found existing file: only replacing auto-generated docs")
|
||||
before = existing.split(MARKER_START)[0]
|
||||
after = existing.split(MARKER_END)[1]
|
||||
content = f"{before}{content}{after}"
|
||||
else:
|
||||
msg.warn("Replacing existing file")
|
||||
with output_file.open("w", encoding="utf8") as f:
|
||||
f.write(content)
|
||||
msg.good("Saved project documentation", output_file)
|
|
@ -1,207 +0,0 @@
|
|||
"""This module contains helpers and subcommands for integrating spaCy projects
|
||||
with Data Version Controk (DVC). https://dvc.org"""
|
||||
from typing import Dict, Any, List, Optional, Iterable
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
|
||||
from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli
|
||||
from .._util import Arg, Opt, NAME, COMMAND
|
||||
from ...util import working_dir, split_command, join_command, run_command
|
||||
from ...util import SimpleFrozenList
|
||||
|
||||
|
||||
DVC_CONFIG = "dvc.yaml"
|
||||
DVC_DIR = ".dvc"
|
||||
UPDATE_COMMAND = "dvc"
|
||||
DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
|
||||
# edited your {PROJECT_FILE}, you can regenerate this file by running:
|
||||
# {COMMAND} project {UPDATE_COMMAND}"""
|
||||
|
||||
|
||||
@project_cli.command(UPDATE_COMMAND)
|
||||
def project_update_dvc_cli(
|
||||
# fmt: off
|
||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
|
||||
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
|
||||
quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
|
||||
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
|
||||
# fmt: on
|
||||
):
|
||||
"""Auto-generate Data Version Control (DVC) config. A DVC
|
||||
project can only define one pipeline, so you need to specify one workflow
|
||||
defined in the project.yml. If no workflow is specified, the first defined
|
||||
workflow is used. The DVC config will only be updated if the project.yml
|
||||
changed.
|
||||
|
||||
DOCS: https://spacy.io/api/cli#project-dvc
|
||||
"""
|
||||
project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)
|
||||
|
||||
|
||||
def project_update_dvc(
|
||||
project_dir: Path,
|
||||
workflow: Optional[str] = None,
|
||||
*,
|
||||
verbose: bool = False,
|
||||
quiet: bool = False,
|
||||
force: bool = False,
|
||||
) -> None:
|
||||
"""Update the auto-generated Data Version Control (DVC) config file. A DVC
|
||||
project can only define one pipeline, so you need to specify one workflow
|
||||
defined in the project.yml. Will only update the file if the checksum changed.
|
||||
|
||||
project_dir (Path): The project directory.
|
||||
workflow (Optional[str]): Optional name of workflow defined in project.yml.
|
||||
If not set, the first workflow will be used.
|
||||
verbose (bool): Print more info.
|
||||
quiet (bool): Print less info.
|
||||
force (bool): Force update DVC config.
|
||||
"""
|
||||
config = load_project_config(project_dir)
|
||||
updated = update_dvc_config(
|
||||
project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
|
||||
)
|
||||
help_msg = "To execute the workflow with DVC, run: dvc repro"
|
||||
if updated:
|
||||
msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg)
|
||||
else:
|
||||
msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg)
|
||||
|
||||
|
||||
def update_dvc_config(
|
||||
path: Path,
|
||||
config: Dict[str, Any],
|
||||
workflow: Optional[str] = None,
|
||||
verbose: bool = False,
|
||||
quiet: bool = False,
|
||||
force: bool = False,
|
||||
) -> bool:
|
||||
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
|
||||
project directory. The file is auto-generated based on the config. The
|
||||
first line of the auto-generated file specifies the hash of the config
|
||||
dict, so if any of the config values change, the DVC config is regenerated.
|
||||
|
||||
path (Path): The path to the project directory.
|
||||
config (Dict[str, Any]): The loaded project.yml.
|
||||
verbose (bool): Whether to print additional info (via DVC).
|
||||
quiet (bool): Don't output anything (via DVC).
|
||||
force (bool): Force update, even if hashes match.
|
||||
RETURNS (bool): Whether the DVC config file was updated.
|
||||
"""
|
||||
ensure_dvc(path)
|
||||
workflows = config.get("workflows", {})
|
||||
workflow_names = list(workflows.keys())
|
||||
check_workflows(workflow_names, workflow)
|
||||
if not workflow:
|
||||
workflow = workflow_names[0]
|
||||
config_hash = get_hash(config)
|
||||
path = path.resolve()
|
||||
dvc_config_path = path / DVC_CONFIG
|
||||
if dvc_config_path.exists():
|
||||
# Check if the file was generated using the current config, if not, redo
|
||||
with dvc_config_path.open("r", encoding="utf8") as f:
|
||||
ref_hash = f.readline().strip().replace("# ", "")
|
||||
if ref_hash == config_hash and not force:
|
||||
return False # Nothing has changed in project.yml, don't need to update
|
||||
dvc_config_path.unlink()
|
||||
dvc_commands = []
|
||||
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
||||
|
||||
# some flags that apply to every command
|
||||
flags = []
|
||||
if verbose:
|
||||
flags.append("--verbose")
|
||||
if quiet:
|
||||
flags.append("--quiet")
|
||||
|
||||
for name in workflows[workflow]:
|
||||
command = config_commands[name]
|
||||
deps = command.get("deps", [])
|
||||
outputs = command.get("outputs", [])
|
||||
outputs_no_cache = command.get("outputs_no_cache", [])
|
||||
if not deps and not outputs and not outputs_no_cache:
|
||||
continue
|
||||
# Default to the working dir as the project path since dvc.yaml is auto-generated
|
||||
# and we don't want arbitrary paths in there
|
||||
project_cmd = ["python", "-m", NAME, "project", "run", name]
|
||||
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
|
||||
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
|
||||
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
|
||||
|
||||
dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
|
||||
if command.get("no_skip"):
|
||||
dvc_cmd.append("--always-changed")
|
||||
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
|
||||
dvc_commands.append(join_command(full_cmd))
|
||||
|
||||
if not dvc_commands:
|
||||
# If we don't check for this, then there will be an error when reading the
|
||||
# config, since DVC wouldn't create it.
|
||||
msg.fail(
|
||||
"No usable commands for DVC found. This can happen if none of your "
|
||||
"commands have dependencies or outputs.",
|
||||
exits=1,
|
||||
)
|
||||
|
||||
with working_dir(path):
|
||||
for c in dvc_commands:
|
||||
dvc_command = "dvc " + c
|
||||
run_command(dvc_command)
|
||||
with dvc_config_path.open("r+", encoding="utf8") as f:
|
||||
content = f.read()
|
||||
f.seek(0, 0)
|
||||
f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
|
||||
return True
|
||||
|
||||
|
||||
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
|
||||
"""Validate workflows provided in project.yml and check that a given
|
||||
workflow can be used to generate a DVC config.
|
||||
|
||||
workflows (List[str]): Names of the available workflows.
|
||||
workflow (Optional[str]): The name of the workflow to convert.
|
||||
"""
|
||||
if not workflows:
|
||||
msg.fail(
|
||||
f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
|
||||
f"define at least one list of commands.",
|
||||
exits=1,
|
||||
)
|
||||
if workflow is not None and workflow not in workflows:
|
||||
msg.fail(
|
||||
f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
|
||||
f"Available workflows: {', '.join(workflows)}",
|
||||
exits=1,
|
||||
)
|
||||
if not workflow:
|
||||
msg.warn(
|
||||
f"No workflow specified for DVC pipeline. Using the first workflow "
|
||||
f"defined in {PROJECT_FILE}: '{workflows[0]}'"
|
||||
)
|
||||
|
||||
|
||||
def ensure_dvc(project_dir: Path) -> None:
|
||||
"""Ensure that the "dvc" command is available and that the current project
|
||||
directory is an initialized DVC project.
|
||||
"""
|
||||
try:
|
||||
subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
|
||||
except Exception:
|
||||
msg.fail(
|
||||
"To use spaCy projects with DVC (Data Version Control), DVC needs "
|
||||
"to be installed and the 'dvc' command needs to be available",
|
||||
"You can install the Python package from pip (pip install dvc) or "
|
||||
"conda (conda install -c conda-forge dvc). For more details, see the "
|
||||
"documentation: https://dvc.org/doc/install",
|
||||
exits=1,
|
||||
)
|
||||
if not (project_dir / ".dvc").exists():
|
||||
msg.fail(
|
||||
"Project not initialized as a DVC project",
|
||||
"To initialize a DVC project, you can run 'dvc init' in the project "
|
||||
"directory. For more details, see the documentation: "
|
||||
"https://dvc.org/doc/command-reference/init",
|
||||
exits=1,
|
||||
)
|
|
@ -1,67 +0,0 @@
|
|||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
from .remote_storage import RemoteStorage
|
||||
from .remote_storage import get_command_hash
|
||||
from .._util import project_cli, Arg, logger
|
||||
from .._util import load_project_config
|
||||
from .run import update_lockfile
|
||||
|
||||
|
||||
@project_cli.command("pull")
|
||||
def project_pull_cli(
|
||||
# fmt: off
|
||||
remote: str = Arg("default", help="Name or path of remote storage"),
|
||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
# fmt: on
|
||||
):
|
||||
"""Retrieve available precomputed outputs from a remote storage.
|
||||
You can alias remotes in your project.yml by mapping them to storage paths.
|
||||
A storage can be anything that the smart-open library can upload to, e.g.
|
||||
AWS, Google Cloud Storage, SSH, local directories etc.
|
||||
|
||||
DOCS: https://spacy.io/api/cli#project-pull
|
||||
"""
|
||||
for url, output_path in project_pull(project_dir, remote):
|
||||
if url is not None:
|
||||
msg.good(f"Pulled {output_path} from {url}")
|
||||
|
||||
|
||||
def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
|
||||
# TODO: We don't have tests for this :(. It would take a bit of mockery to
|
||||
# set up. I guess see if it breaks first?
|
||||
config = load_project_config(project_dir)
|
||||
if remote in config.get("remotes", {}):
|
||||
remote = config["remotes"][remote]
|
||||
storage = RemoteStorage(project_dir, remote)
|
||||
commands = list(config.get("commands", []))
|
||||
# We use a while loop here because we don't know how the commands
|
||||
# will be ordered. A command might need dependencies from one that's later
|
||||
# in the list.
|
||||
while commands:
|
||||
for i, cmd in enumerate(list(commands)):
|
||||
logger.debug("CMD: %s.", cmd["name"])
|
||||
deps = [project_dir / dep for dep in cmd.get("deps", [])]
|
||||
if all(dep.exists() for dep in deps):
|
||||
cmd_hash = get_command_hash("", "", deps, cmd["script"])
|
||||
for output_path in cmd.get("outputs", []):
|
||||
url = storage.pull(output_path, command_hash=cmd_hash)
|
||||
logger.debug(
|
||||
"URL: %s for %s with command hash %s",
|
||||
url,
|
||||
output_path,
|
||||
cmd_hash,
|
||||
)
|
||||
yield url, output_path
|
||||
|
||||
out_locs = [project_dir / out for out in cmd.get("outputs", [])]
|
||||
if all(loc.exists() for loc in out_locs):
|
||||
update_lockfile(project_dir, cmd)
|
||||
# We remove the command from the list here, and break, so that
|
||||
# we iterate over the loop again.
|
||||
commands.pop(i)
|
||||
break
|
||||
else:
|
||||
logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
|
||||
else:
|
||||
# If we didn't break the for loop, break the while loop.
|
||||
break
|
|
@ -1,69 +0,0 @@
|
|||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
from .remote_storage import RemoteStorage
|
||||
from .remote_storage import get_content_hash, get_command_hash
|
||||
from .._util import load_project_config
|
||||
from .._util import project_cli, Arg, logger
|
||||
|
||||
|
||||
@project_cli.command("push")
|
||||
def project_push_cli(
|
||||
# fmt: off
|
||||
remote: str = Arg("default", help="Name or path of remote storage"),
|
||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
# fmt: on
|
||||
):
|
||||
"""Persist outputs to a remote storage. You can alias remotes in your
|
||||
project.yml by mapping them to storage paths. A storage can be anything that
|
||||
the smart-open library can upload to, e.g. AWS, Google Cloud Storage, SSH,
|
||||
local directories etc.
|
||||
|
||||
DOCS: https://spacy.io/api/cli#project-push
|
||||
"""
|
||||
for output_path, url in project_push(project_dir, remote):
|
||||
if url is None:
|
||||
msg.info(f"Skipping {output_path}")
|
||||
else:
|
||||
msg.good(f"Pushed {output_path} to {url}")
|
||||
|
||||
|
||||
def project_push(project_dir: Path, remote: str):
|
||||
"""Persist outputs to a remote storage. You can alias remotes in your project.yml
|
||||
by mapping them to storage paths. A storage can be anything that the smart-open
|
||||
library can upload to, e.g. gcs, aws, ssh, local directories etc
|
||||
"""
|
||||
config = load_project_config(project_dir)
|
||||
if remote in config.get("remotes", {}):
|
||||
remote = config["remotes"][remote]
|
||||
storage = RemoteStorage(project_dir, remote)
|
||||
for cmd in config.get("commands", []):
|
||||
logger.debug("CMD: %s", cmd["name"])
|
||||
deps = [project_dir / dep for dep in cmd.get("deps", [])]
|
||||
if any(not dep.exists() for dep in deps):
|
||||
logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
|
||||
continue
|
||||
cmd_hash = get_command_hash(
|
||||
"", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
|
||||
)
|
||||
logger.debug("CMD_HASH: %s", cmd_hash)
|
||||
for output_path in cmd.get("outputs", []):
|
||||
output_loc = project_dir / output_path
|
||||
if output_loc.exists() and _is_not_empty_dir(output_loc):
|
||||
url = storage.push(
|
||||
output_path,
|
||||
command_hash=cmd_hash,
|
||||
content_hash=get_content_hash(output_loc),
|
||||
)
|
||||
logger.debug(
|
||||
"URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
|
||||
)
|
||||
yield output_path, url
|
||||
|
||||
|
||||
def _is_not_empty_dir(loc: Path):
|
||||
if not loc.is_dir():
|
||||
return True
|
||||
elif any(_is_not_empty_dir(child) for child in loc.iterdir()):
|
||||
return True
|
||||
else:
|
||||
return False
|
|
@ -1,205 +0,0 @@
|
|||
from typing import Optional, List, Dict, TYPE_CHECKING
|
||||
import os
|
||||
import site
|
||||
import hashlib
|
||||
import urllib.parse
|
||||
import tarfile
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
|
||||
from .._util import get_hash, get_checksum, upload_file, download_file
|
||||
from .._util import ensure_pathy, make_tempdir
|
||||
from ...util import get_minor_version, ENV_VARS, check_bool_env_var
|
||||
from ...git_info import GIT_VERSION
|
||||
from ... import about
|
||||
from ...errors import Errors
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathy import FluidPath # noqa: F401
|
||||
|
||||
|
||||
class RemoteStorage:
|
||||
"""Push and pull outputs to and from a remote file storage.
|
||||
|
||||
Remotes can be anything that `smart-open` can support: AWS, GCS, file system,
|
||||
ssh, etc.
|
||||
"""
|
||||
|
||||
def __init__(self, project_root: Path, url: str, *, compression="gz"):
|
||||
self.root = project_root
|
||||
self.url = ensure_pathy(url)
|
||||
self.compression = compression
|
||||
|
||||
def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
|
||||
"""Compress a file or directory within a project and upload it to a remote
|
||||
storage. If an object exists at the full URL, nothing is done.
|
||||
|
||||
Within the remote storage, files are addressed by their project path
|
||||
(url encoded) and two user-supplied hashes, representing their creation
|
||||
context and their file contents. If the URL already exists, the data is
|
||||
not uploaded. Paths are archived and compressed prior to upload.
|
||||
"""
|
||||
loc = self.root / path
|
||||
if not loc.exists():
|
||||
raise IOError(f"Cannot push {loc}: does not exist.")
|
||||
url = self.make_url(path, command_hash, content_hash)
|
||||
if url.exists():
|
||||
return url
|
||||
tmp: Path
|
||||
with make_tempdir() as tmp:
|
||||
tar_loc = tmp / self.encode_name(str(path))
|
||||
mode_string = f"w:{self.compression}" if self.compression else "w"
|
||||
with tarfile.open(tar_loc, mode=mode_string) as tar_file:
|
||||
tar_file.add(str(loc), arcname=str(path))
|
||||
upload_file(tar_loc, url)
|
||||
return url
|
||||
|
||||
def pull(
|
||||
self,
|
||||
path: Path,
|
||||
*,
|
||||
command_hash: Optional[str] = None,
|
||||
content_hash: Optional[str] = None,
|
||||
) -> Optional["FluidPath"]:
|
||||
"""Retrieve a file from the remote cache. If the file already exists,
|
||||
nothing is done.
|
||||
|
||||
If the command_hash and/or content_hash are specified, only matching
|
||||
results are returned. If no results are available, an error is raised.
|
||||
"""
|
||||
dest = self.root / path
|
||||
if dest.exists():
|
||||
return None
|
||||
url = self.find(path, command_hash=command_hash, content_hash=content_hash)
|
||||
if url is None:
|
||||
return url
|
||||
else:
|
||||
# Make sure the destination exists
|
||||
if not dest.parent.exists():
|
||||
dest.parent.mkdir(parents=True)
|
||||
tmp: Path
|
||||
with make_tempdir() as tmp:
|
||||
tar_loc = tmp / url.parts[-1]
|
||||
download_file(url, tar_loc)
|
||||
mode_string = f"r:{self.compression}" if self.compression else "r"
|
||||
with tarfile.open(tar_loc, mode=mode_string) as tar_file:
|
||||
# This requires that the path is added correctly, relative
|
||||
# to root. This is how we set things up in push()
|
||||
|
||||
# Disallow paths outside the current directory for the tar
|
||||
# file (CVE-2007-4559, directory traversal vulnerability)
|
||||
def is_within_directory(directory, target):
|
||||
abs_directory = os.path.abspath(directory)
|
||||
abs_target = os.path.abspath(target)
|
||||
prefix = os.path.commonprefix([abs_directory, abs_target])
|
||||
return prefix == abs_directory
|
||||
|
||||
def safe_extract(tar, path):
|
||||
for member in tar.getmembers():
|
||||
member_path = os.path.join(path, member.name)
|
||||
if not is_within_directory(path, member_path):
|
||||
raise ValueError(Errors.E852)
|
||||
tar.extractall(path)
|
||||
|
||||
safe_extract(tar_file, self.root)
|
||||
return url
|
||||
|
||||
def find(
|
||||
self,
|
||||
path: Path,
|
||||
*,
|
||||
command_hash: Optional[str] = None,
|
||||
content_hash: Optional[str] = None,
|
||||
) -> Optional["FluidPath"]:
|
||||
"""Find the best matching version of a file within the storage,
|
||||
or `None` if no match can be found. If both the creation and content hash
|
||||
are specified, only exact matches will be returned. Otherwise, the most
|
||||
recent matching file is preferred.
|
||||
"""
|
||||
name = self.encode_name(str(path))
|
||||
urls = []
|
||||
if command_hash is not None and content_hash is not None:
|
||||
url = self.url / name / command_hash / content_hash
|
||||
urls = [url] if url.exists() else []
|
||||
elif command_hash is not None:
|
||||
if (self.url / name / command_hash).exists():
|
||||
urls = list((self.url / name / command_hash).iterdir())
|
||||
else:
|
||||
if (self.url / name).exists():
|
||||
for sub_dir in (self.url / name).iterdir():
|
||||
urls.extend(sub_dir.iterdir())
|
||||
if content_hash is not None:
|
||||
urls = [url for url in urls if url.parts[-1] == content_hash]
|
||||
if len(urls) >= 2:
|
||||
try:
|
||||
urls.sort(key=lambda x: x.stat().last_modified) # type: ignore
|
||||
except Exception:
|
||||
msg.warn(
|
||||
"Unable to sort remote files by last modified. The file(s) "
|
||||
"pulled from the cache may not be the most recent."
|
||||
)
|
||||
return urls[-1] if urls else None
|
||||
|
||||
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
|
||||
"""Construct a URL from a subpath, a creation hash and a content hash."""
|
||||
return self.url / self.encode_name(str(path)) / command_hash / content_hash
|
||||
|
||||
def encode_name(self, name: str) -> str:
|
||||
"""Encode a subpath into a URL-safe name."""
|
||||
return urllib.parse.quote_plus(name)
|
||||
|
||||
|
||||
def get_content_hash(loc: Path) -> str:
|
||||
return get_checksum(loc)
|
||||
|
||||
|
||||
def get_command_hash(
|
||||
site_hash: str, env_hash: str, deps: List[Path], cmd: List[str]
|
||||
) -> str:
|
||||
"""Create a hash representing the execution of a command. This includes the
|
||||
currently installed packages, whatever environment variables have been marked
|
||||
as relevant, and the command.
|
||||
"""
|
||||
if check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION):
|
||||
spacy_v = GIT_VERSION
|
||||
else:
|
||||
spacy_v = str(get_minor_version(about.__version__) or "")
|
||||
dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
|
||||
hashes = [spacy_v, site_hash, env_hash] + dep_checksums
|
||||
hashes.extend(cmd)
|
||||
creation_bytes = "".join(hashes).encode("utf8")
|
||||
return hashlib.md5(creation_bytes).hexdigest()
|
||||
|
||||
|
||||
def get_site_hash():
|
||||
"""Hash the current Python environment's site-packages contents, including
|
||||
the name and version of the libraries. The list we're hashing is what
|
||||
`pip freeze` would output.
|
||||
"""
|
||||
site_dirs = site.getsitepackages()
|
||||
if site.ENABLE_USER_SITE:
|
||||
site_dirs.extend(site.getusersitepackages())
|
||||
packages = set()
|
||||
for site_dir in site_dirs:
|
||||
site_dir = Path(site_dir)
|
||||
for subpath in site_dir.iterdir():
|
||||
if subpath.parts[-1].endswith("dist-info"):
|
||||
packages.add(subpath.parts[-1].replace(".dist-info", ""))
|
||||
package_bytes = "".join(sorted(packages)).encode("utf8")
|
||||
return hashlib.md5sum(package_bytes).hexdigest()
|
||||
|
||||
|
||||
def get_env_hash(env: Dict[str, str]) -> str:
|
||||
"""Construct a hash of the environment variables that will be passed into
|
||||
the commands.
|
||||
|
||||
Values in the env dict may be references to the current os.environ, using
|
||||
the syntax $ENV_VAR to mean os.environ[ENV_VAR]
|
||||
"""
|
||||
env_vars = {}
|
||||
for key, value in env.items():
|
||||
if value.startswith("$"):
|
||||
env_vars[key] = os.environ.get(value[1:], "")
|
||||
else:
|
||||
env_vars[key] = value
|
||||
return get_hash(env_vars)
|
|
@ -1,360 +0,0 @@
|
|||
from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
|
||||
import os.path
|
||||
from pathlib import Path
|
||||
|
||||
from wasabi import msg
|
||||
from wasabi.util import locale_escape
|
||||
import sys
|
||||
import srsly
|
||||
import typer
|
||||
|
||||
from ... import about
|
||||
from ...git_info import GIT_VERSION
|
||||
from ...util import working_dir, run_command, split_command, is_cwd, join_command
|
||||
from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
|
||||
from ...util import check_bool_env_var, SimpleFrozenDict
|
||||
from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
|
||||
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides
|
||||
|
||||
|
||||
@project_cli.command(
|
||||
"run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
|
||||
)
|
||||
def project_run_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
|
||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
|
||||
dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
|
||||
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
|
||||
# fmt: on
|
||||
):
|
||||
"""Run a named command or workflow defined in the project.yml. If a workflow
|
||||
name is specified, all commands in the workflow are run, in order. If
|
||||
commands define dependencies and/or outputs, they will only be re-run if
|
||||
state has changed.
|
||||
|
||||
DOCS: https://spacy.io/api/cli#project-run
|
||||
"""
|
||||
if show_help or not subcommand:
|
||||
print_run_help(project_dir, subcommand)
|
||||
else:
|
||||
overrides = parse_config_overrides(ctx.args)
|
||||
project_run(project_dir, subcommand, overrides=overrides, force=force, dry=dry)
|
||||
|
||||
|
||||
def project_run(
|
||||
project_dir: Path,
|
||||
subcommand: str,
|
||||
*,
|
||||
overrides: Dict[str, Any] = SimpleFrozenDict(),
|
||||
force: bool = False,
|
||||
dry: bool = False,
|
||||
capture: bool = False,
|
||||
skip_requirements_check: bool = False,
|
||||
) -> None:
|
||||
"""Run a named script defined in the project.yml. If the script is part
|
||||
of the default pipeline (defined in the "run" section), DVC is used to
|
||||
execute the command, so it can determine whether to rerun it. It then
|
||||
calls into "exec" to execute it.
|
||||
|
||||
project_dir (Path): Path to project directory.
|
||||
subcommand (str): Name of command to run.
|
||||
overrides (Dict[str, Any]): Optional config overrides.
|
||||
force (bool): Force re-running, even if nothing changed.
|
||||
dry (bool): Perform a dry run and don't execute commands.
|
||||
capture (bool): Whether to capture the output and errors of individual commands.
|
||||
If False, the stdout and stderr will not be redirected, and if there's an error,
|
||||
sys.exit will be called with the return code. You should use capture=False
|
||||
when you want to turn over execution to the command, and capture=True
|
||||
when you want to run the command more like a function.
|
||||
skip_requirements_check (bool): Whether to skip the requirements check.
|
||||
"""
|
||||
config = load_project_config(project_dir, overrides=overrides)
|
||||
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
||||
workflows = config.get("workflows", {})
|
||||
validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
|
||||
|
||||
req_path = project_dir / "requirements.txt"
|
||||
if not skip_requirements_check:
|
||||
if config.get("check_requirements", True) and os.path.exists(req_path):
|
||||
with req_path.open() as requirements_file:
|
||||
_check_requirements([req.strip() for req in requirements_file])
|
||||
|
||||
if subcommand in workflows:
|
||||
msg.info(f"Running workflow '{subcommand}'")
|
||||
for cmd in workflows[subcommand]:
|
||||
project_run(
|
||||
project_dir,
|
||||
cmd,
|
||||
overrides=overrides,
|
||||
force=force,
|
||||
dry=dry,
|
||||
capture=capture,
|
||||
skip_requirements_check=True,
|
||||
)
|
||||
else:
|
||||
cmd = commands[subcommand]
|
||||
for dep in cmd.get("deps", []):
|
||||
if not (project_dir / dep).exists():
|
||||
err = f"Missing dependency specified by command '{subcommand}': {dep}"
|
||||
err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
|
||||
err_exits = 1 if not dry else None
|
||||
msg.fail(err, err_help, exits=err_exits)
|
||||
check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
|
||||
with working_dir(project_dir) as current_dir:
|
||||
msg.divider(subcommand)
|
||||
rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit)
|
||||
if not rerun and not force:
|
||||
msg.info(f"Skipping '{cmd['name']}': nothing changed")
|
||||
else:
|
||||
run_commands(cmd["script"], dry=dry, capture=capture)
|
||||
if not dry:
|
||||
update_lockfile(current_dir, cmd)
|
||||
|
||||
|
||||
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
|
||||
"""Simulate a CLI help prompt using the info available in the project.yml.
|
||||
|
||||
project_dir (Path): The project directory.
|
||||
subcommand (Optional[str]): The subcommand or None. If a subcommand is
|
||||
provided, the subcommand help is shown. Otherwise, the top-level help
|
||||
and a list of available commands is printed.
|
||||
"""
|
||||
config = load_project_config(project_dir)
|
||||
config_commands = config.get("commands", [])
|
||||
commands = {cmd["name"]: cmd for cmd in config_commands}
|
||||
workflows = config.get("workflows", {})
|
||||
project_loc = "" if is_cwd(project_dir) else project_dir
|
||||
if subcommand:
|
||||
validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
|
||||
print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
|
||||
if subcommand in commands:
|
||||
help_text = commands[subcommand].get("help")
|
||||
if help_text:
|
||||
print(f"\n{help_text}\n")
|
||||
elif subcommand in workflows:
|
||||
steps = workflows[subcommand]
|
||||
print(f"\nWorkflow consisting of {len(steps)} commands:")
|
||||
steps_data = [
|
||||
(f"{i + 1}. {step}", commands[step].get("help", ""))
|
||||
for i, step in enumerate(steps)
|
||||
]
|
||||
msg.table(steps_data)
|
||||
help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help"
|
||||
print(f"For command details, run: {help_cmd}")
|
||||
else:
|
||||
print("")
|
||||
title = config.get("title")
|
||||
if title:
|
||||
print(f"{locale_escape(title)}\n")
|
||||
if config_commands:
|
||||
print(f"Available commands in {PROJECT_FILE}")
|
||||
print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
|
||||
msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
|
||||
if workflows:
|
||||
print(f"Available workflows in {PROJECT_FILE}")
|
||||
print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}")
|
||||
msg.table([(name, " -> ".join(steps)) for name, steps in workflows.items()])
|
||||
|
||||
|
||||
def run_commands(
|
||||
commands: Iterable[str] = SimpleFrozenList(),
|
||||
silent: bool = False,
|
||||
dry: bool = False,
|
||||
capture: bool = False,
|
||||
) -> None:
|
||||
"""Run a sequence of commands in a subprocess, in order.
|
||||
|
||||
commands (List[str]): The string commands.
|
||||
silent (bool): Don't print the commands.
|
||||
dry (bool): Perform a dry run and don't execut anything.
|
||||
capture (bool): Whether to capture the output and errors of individual commands.
|
||||
If False, the stdout and stderr will not be redirected, and if there's an error,
|
||||
sys.exit will be called with the return code. You should use capture=False
|
||||
when you want to turn over execution to the command, and capture=True
|
||||
when you want to run the command more like a function.
|
||||
"""
|
||||
for c in commands:
|
||||
command = split_command(c)
|
||||
# Not sure if this is needed or a good idea. Motivation: users may often
|
||||
# use commands in their config that reference "python" and we want to
|
||||
# make sure that it's always executing the same Python that spaCy is
|
||||
# executed with and the pip in the same env, not some other Python/pip.
|
||||
# Also ensures cross-compatibility if user 1 writes "python3" (because
|
||||
# that's how it's set up on their system), and user 2 without the
|
||||
# shortcut tries to re-run the command.
|
||||
if len(command) and command[0] in ("python", "python3"):
|
||||
command[0] = sys.executable
|
||||
elif len(command) and command[0] in ("pip", "pip3"):
|
||||
command = [sys.executable, "-m", "pip", *command[1:]]
|
||||
if not silent:
|
||||
print(f"Running command: {join_command(command)}")
|
||||
if not dry:
|
||||
run_command(command, capture=capture)
|
||||
|
||||
|
||||
def validate_subcommand(
|
||||
commands: Sequence[str], workflows: Sequence[str], subcommand: str
|
||||
) -> None:
|
||||
"""Check that a subcommand is valid and defined. Raises an error otherwise.
|
||||
|
||||
commands (Sequence[str]): The available commands.
|
||||
subcommand (str): The subcommand.
|
||||
"""
|
||||
if not commands and not workflows:
|
||||
msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
|
||||
if subcommand not in commands and subcommand not in workflows:
|
||||
help_msg = []
|
||||
if subcommand in ["assets", "asset"]:
|
||||
help_msg.append("Did you mean to run: python -m spacy project assets?")
|
||||
if commands:
|
||||
help_msg.append(f"Available commands: {', '.join(commands)}")
|
||||
if workflows:
|
||||
help_msg.append(f"Available workflows: {', '.join(workflows)}")
|
||||
msg.fail(
|
||||
f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}",
|
||||
". ".join(help_msg),
|
||||
exits=1,
|
||||
)
|
||||
|
||||
|
||||
def check_rerun(
|
||||
project_dir: Path,
|
||||
command: Dict[str, Any],
|
||||
*,
|
||||
check_spacy_version: bool = True,
|
||||
check_spacy_commit: bool = False,
|
||||
) -> bool:
|
||||
"""Check if a command should be rerun because its settings or inputs/outputs
|
||||
changed.
|
||||
|
||||
project_dir (Path): The current project directory.
|
||||
command (Dict[str, Any]): The command, as defined in the project.yml.
|
||||
strict_version (bool):
|
||||
RETURNS (bool): Whether to re-run the command.
|
||||
"""
|
||||
# Always rerun if no-skip is set
|
||||
if command.get("no_skip", False):
|
||||
return True
|
||||
lock_path = project_dir / PROJECT_LOCK
|
||||
if not lock_path.exists(): # We don't have a lockfile, run command
|
||||
return True
|
||||
data = srsly.read_yaml(lock_path)
|
||||
if command["name"] not in data: # We don't have info about this command
|
||||
return True
|
||||
entry = data[command["name"]]
|
||||
# Always run commands with no outputs (otherwise they'd always be skipped)
|
||||
if not entry.get("outs", []):
|
||||
return True
|
||||
# Always rerun if spaCy version or commit hash changed
|
||||
spacy_v = entry.get("spacy_version")
|
||||
commit = entry.get("spacy_git_version")
|
||||
if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__):
|
||||
info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)"
|
||||
msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}")
|
||||
return True
|
||||
if check_spacy_commit and commit != GIT_VERSION:
|
||||
info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)"
|
||||
msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}")
|
||||
return True
|
||||
# If the entry in the lockfile matches the lockfile entry that would be
|
||||
# generated from the current command, we don't rerun because it means that
|
||||
# all inputs/outputs, hashes and scripts are the same and nothing changed
|
||||
lock_entry = get_lock_entry(project_dir, command)
|
||||
exclude = ["spacy_version", "spacy_git_version"]
|
||||
return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude)
|
||||
|
||||
|
||||
def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
|
||||
"""Update the lockfile after running a command. Will create a lockfile if
|
||||
it doesn't yet exist and will add an entry for the current command, its
|
||||
script and dependencies/outputs.
|
||||
|
||||
project_dir (Path): The current project directory.
|
||||
command (Dict[str, Any]): The command, as defined in the project.yml.
|
||||
"""
|
||||
lock_path = project_dir / PROJECT_LOCK
|
||||
if not lock_path.exists():
|
||||
srsly.write_yaml(lock_path, {})
|
||||
data = {}
|
||||
else:
|
||||
data = srsly.read_yaml(lock_path)
|
||||
data[command["name"]] = get_lock_entry(project_dir, command)
|
||||
srsly.write_yaml(lock_path, data)
|
||||
|
||||
|
||||
def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Get a lockfile entry for a given command. An entry includes the command,
|
||||
the script (command steps) and a list of dependencies and outputs with
|
||||
their paths and file hashes, if available. The format is based on the
|
||||
dvc.lock files, to keep things consistent.
|
||||
|
||||
project_dir (Path): The current project directory.
|
||||
command (Dict[str, Any]): The command, as defined in the project.yml.
|
||||
RETURNS (Dict[str, Any]): The lockfile entry.
|
||||
"""
|
||||
deps = get_fileinfo(project_dir, command.get("deps", []))
|
||||
outs = get_fileinfo(project_dir, command.get("outputs", []))
|
||||
outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []))
|
||||
return {
|
||||
"cmd": f"{COMMAND} run {command['name']}",
|
||||
"script": command["script"],
|
||||
"deps": deps,
|
||||
"outs": [*outs, *outs_nc],
|
||||
"spacy_version": about.__version__,
|
||||
"spacy_git_version": GIT_VERSION,
|
||||
}
|
||||
|
||||
|
||||
def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional[str]]]:
|
||||
"""Generate the file information for a list of paths (dependencies, outputs).
|
||||
Includes the file path and the file's checksum.
|
||||
|
||||
project_dir (Path): The current project directory.
|
||||
paths (List[str]): The file paths.
|
||||
RETURNS (List[Dict[str, str]]): The lockfile entry for a file.
|
||||
"""
|
||||
data = []
|
||||
for path in paths:
|
||||
file_path = project_dir / path
|
||||
md5 = get_checksum(file_path) if file_path.exists() else None
|
||||
data.append({"path": path, "md5": md5})
|
||||
return data
|
||||
|
||||
|
||||
def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
|
||||
"""Checks whether requirements are installed and free of version conflicts.
|
||||
requirements (List[str]): List of requirements.
|
||||
RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts
|
||||
exist.
|
||||
"""
|
||||
import pkg_resources
|
||||
|
||||
failed_pkgs_msgs: List[str] = []
|
||||
conflicting_pkgs_msgs: List[str] = []
|
||||
|
||||
for req in requirements:
|
||||
try:
|
||||
pkg_resources.require(req)
|
||||
except pkg_resources.DistributionNotFound as dnf:
|
||||
failed_pkgs_msgs.append(dnf.report())
|
||||
except pkg_resources.VersionConflict as vc:
|
||||
conflicting_pkgs_msgs.append(vc.report())
|
||||
except Exception:
|
||||
msg.warn(
|
||||
f"Unable to check requirement: {req} "
|
||||
"Checks are currently limited to requirement specifiers "
|
||||
"(PEP 508)"
|
||||
)
|
||||
|
||||
if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs):
|
||||
msg.warn(
|
||||
title="Missing requirements or requirement conflicts detected. Make sure your Python environment is set up "
|
||||
"correctly and you installed all requirements specified in your project's requirements.txt: "
|
||||
)
|
||||
for pgk_msg in failed_pkgs_msgs + conflicting_pkgs_msgs:
|
||||
msg.text(pgk_msg)
|
||||
|
||||
return len(failed_pkgs_msgs) > 0, len(conflicting_pkgs_msgs) > 0
|
Loading…
Reference in New Issue
Block a user