Allow spacy project to push and pull to/from remote storage (#5949)

* Add utils for working with remote storage

* WIP add remote_cache for project

* WIP add push and pull commands

* Use pathy in remote_cache

* Update util

* Update remote_cache

* Update util

* Update project assets

* Update pull script

* Update push script

* Fix type annotation in util

* Work on remote storage

* Remove site and env hash

* Fix imports

* Fix type annotation

* Require pathy

* Require pathy

* Fix import

* Add a util to handle project variable substitution

* Import push and pull commands

* Fix pull command

* Fix push command

* Fix tarfile in remote_storage

* Improve printing

* Fiddle with status messages

* Set version to v3.0.0a9

* Draft docs for spacy project remote storages

* Update docs [ci skip]

* Use Thinc config to simplify and unify template variables

* Auto-format

* Don't import Pathy globally for now

Causes slow and annoying Google Cloud warning

* Tidy up test

* Tidy up and update tests

* Update to latest Thinc

* Update docs

* variables -> vars

* Update docs [ci skip]

* Update docs [ci skip]

Co-authored-by: Ines Montani <ines@ines.io>
Matthew Honnibal 2020-08-23 18:32:09 +02:00 committed by GitHub
parent 9bdc9e81f5
commit e559867605
20 changed files with 712 additions and 107 deletions


@ -6,9 +6,10 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.0a28,<8.0.0a30",
"thinc>=8.0.0a29,<8.0.0a40",
"blis>=0.4.0,<0.5.0",
"pytokenizations",
"smart_open>=2.0.0,<3.0.0"
"smart_open>=2.0.0,<3.0.0",
"pathy"
]
build-backend = "setuptools.build_meta"


@ -1,7 +1,7 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a28,<8.0.0a30
thinc>=8.0.0a29,<8.0.0a40
blis>=0.4.0,<0.5.0
ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0
@ -9,6 +9,7 @@ wasabi>=0.7.1,<1.1.0
srsly>=2.1.0,<3.0.0
catalogue>=0.0.7,<1.1.0
typer>=0.3.0,<0.4.0
pathy
# Third party dependencies
numpy>=1.15.0
requests>=2.13.0,<3.0.0


@ -34,18 +34,19 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc>=8.0.0a28,<8.0.0a30
thinc>=8.0.0a29,<8.0.0a40
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a28,<8.0.0a30
thinc>=8.0.0a29,<8.0.0a40
blis>=0.4.0,<0.5.0
wasabi>=0.7.1,<1.1.0
srsly>=2.1.0,<3.0.0
catalogue>=0.0.7,<1.1.0
typer>=0.3.0,<0.4.0
pathy
# Third-party dependencies
tqdm>=4.38.0,<5.0.0
numpy>=1.15.0


@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy-nightly"
__version__ = "3.0.0a8"
__version__ = "3.0.0a9"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"


@ -21,6 +21,8 @@ from .project.clone import project_clone # noqa: F401
from .project.assets import project_assets # noqa: F401
from .project.run import project_run # noqa: F401
from .project.dvc import project_update_dvc # noqa: F401
from .project.push import project_push # noqa: F401
from .project.pull import project_pull # noqa: F401
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)


@ -1,4 +1,5 @@
from typing import Dict, Any, Union, List, Optional
from typing import Dict, Any, Union, List, Optional, TYPE_CHECKING
import sys
from pathlib import Path
from wasabi import msg
import srsly
@ -8,11 +9,13 @@ from typer.main import get_command
from contextlib import contextmanager
from thinc.config import Config, ConfigValidationError
from configparser import InterpolationError
import sys
from ..schemas import ProjectConfigSchema, validate
from ..util import import_file
if TYPE_CHECKING:
from pathy import Pathy # noqa: F401
PROJECT_FILE = "project.yml"
PROJECT_LOCK = "project.lock"
@ -93,11 +96,12 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
return result
def load_project_config(path: Path) -> Dict[str, Any]:
def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]:
"""Load the project.yml file from a directory and validate it. Also make
sure that all directories defined in the config exist.
path (Path): The path to the project directory.
interpolate (bool): Whether to substitute project variables.
RETURNS (Dict[str, Any]): The loaded project.yml.
"""
config_path = path / PROJECT_FILE
@ -119,9 +123,25 @@ def load_project_config(path: Path) -> Dict[str, Any]:
dir_path = path / subdir
if not dir_path.exists():
dir_path.mkdir(parents=True)
if interpolate:
err = "project.yml validation error"
with show_validation_error(title=err, hint_fill=False):
config = substitute_project_variables(config)
return config
def substitute_project_variables(config: Dict[str, Any], overrides: Dict = {}) -> Dict[str, Any]:
key = "vars"
config.setdefault(key, {})
config[key].update(overrides)
# Need to put variables in the top scope again so we can have a top-level
# section "project" (otherwise, a list of commands in the top scope wouldn't
# be allowed by Thinc's config system)
cfg = Config({"project": config, key: config[key]})
interpolated = cfg.interpolate()
return dict(interpolated["project"])
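A minimal sketch of the interpolation this enables, assuming Thinc v8's `Config` (the project dict below is hypothetical):

```python
from thinc.config import Config

# "vars" is duplicated into the top scope so that ${vars.name} references
# resolve, while the real content lives under the "project" section.
project = {"vars": {"name": "ner"}, "commands": [{"script": ["train ${vars.name}"]}]}
cfg = Config({"project": project, "vars": project["vars"]})
assert cfg.interpolate()["project"]["commands"][0]["script"] == ["train ner"]
```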
def validate_project_commands(config: Dict[str, Any]) -> None:
"""Check that project commands and workflows are valid, don't contain
duplicates, don't clash and only refer to commands that exist.
@ -232,3 +252,39 @@ def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
for name, cfg in config.get("components", {}).items()
if "factory" not in cfg and "source" in cfg
]
def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
"""Upload a file.
src (Path): The source path.
dest (Union[str, Pathy]): The destination URL or path to upload to.
"""
dest = ensure_pathy(dest)
with dest.open(mode="wb") as output_file:
with src.open(mode="rb") as input_file:
output_file.write(input_file.read())
def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) -> None:
"""Download a file using smart_open.
url (str): The URL of the file.
dest (Path): The destination path.
force (bool): Whether to force download even if file exists.
If False, the download will be skipped.
"""
if dest.exists() and not force:
return None
src = ensure_pathy(src)
with src.open(mode="rb") as input_file:
with dest.open(mode="wb") as output_file:
output_file.write(input_file.read())
def ensure_pathy(path):
"""Temporary helper to prevent importing Pathy globally (which can cause
a slow and annoying Google Cloud warning)."""
from pathy import Pathy # noqa: F811
return Pathy(path)
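A minimal usage sketch of these helpers (the bucket and paths are hypothetical; any URL that `smart-open`/Pathy can handle should work the same way):

```python
from pathlib import Path
from spacy.cli._util import upload_file, download_file

# Push a local artifact to a bucket, then fetch it back to a local path.
upload_file(Path("training/model-best/meta.json"), "s3://my-spacy-bucket/cache/meta.json")
download_file("s3://my-spacy-bucket/cache/meta.json", Path("meta.json"), force=True)
```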


@ -4,10 +4,10 @@ from wasabi import msg
import re
import shutil
import requests
import smart_open
from ...util import ensure_path, working_dir
from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum
from .._util import download_file
# TODO: find a solution for caches
@ -44,16 +44,14 @@ def project_assets(project_dir: Path) -> None:
if not assets:
msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
msg.info(f"Fetching {len(assets)} asset(s)")
variables = config.get("variables", {})
for asset in assets:
dest = asset["dest"].format(**variables)
dest = asset["dest"]
url = asset.get("url")
checksum = asset.get("checksum")
if not url:
# project.yml defines asset without URL that the user has to place
check_private_asset(dest, checksum)
continue
url = url.format(**variables)
fetch_asset(project_path, url, dest, checksum)
@ -132,15 +130,3 @@ def convert_asset_url(url: str) -> str:
)
return converted
return url
def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
"""Download a file using smart_open.
url (str): The URL of the file.
dest (Path): The destination path.
chunk_size (int): The size of chunks to read/write.
"""
with smart_open.open(url, mode="rb") as input_file:
with dest.open(mode="wb") as output_file:
output_file.write(input_file.read())


@ -99,7 +99,6 @@ def update_dvc_config(
if ref_hash == config_hash and not force:
return False # Nothing has changed in project.yml, don't need to update
dvc_config_path.unlink()
variables = config.get("variables", {})
dvc_commands = []
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
for name in workflows[workflow]:
@ -122,7 +121,7 @@ def update_dvc_config(
dvc_commands.append(join_command(full_cmd))
with working_dir(path):
dvc_flags = {"--verbose": verbose, "--quiet": silent}
run_dvc_commands(dvc_commands, variables, flags=dvc_flags)
run_dvc_commands(dvc_commands, flags=dvc_flags)
with dvc_config_path.open("r+", encoding="utf8") as f:
content = f.read()
f.seek(0, 0)
@ -131,23 +130,16 @@ def update_dvc_config(
def run_dvc_commands(
commands: List[str] = tuple(),
variables: Dict[str, str] = {},
flags: Dict[str, bool] = {},
commands: List[str] = tuple(), flags: Dict[str, bool] = {},
) -> None:
"""Run a sequence of DVC commands in a subprocess, in order.
commands (List[str]): The string commands without the leading "dvc".
variables (Dict[str, str]): Dictionary of variable names, mapped to their
values. Will be used to substitute format string variables in the
commands.
flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
easier to pass flags like --quiet that depend on a variable or
command-line setting while avoiding lots of nested conditionals.
"""
for command in commands:
# Substitute variables, e.g. "./{NAME}.json"
command = command.format(**variables)
command = split_command(command)
dvc_command = ["dvc", *command]
# Add the flags if they are set to True
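For instance (hypothetical values), `run_dvc_commands(["repro -f"], flags={"--quiet": True, "--verbose": False})` would execute `dvc repro -f --quiet`.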

spacy/cli/project/pull.py (new file)

@ -0,0 +1,36 @@
from pathlib import Path
from wasabi import msg
from .remote_storage import RemoteStorage
from .remote_storage import get_command_hash
from .._util import project_cli, Arg
from .._util import load_project_config
@project_cli.command("pull")
def project_pull_cli(
# fmt: off
remote: str = Arg("default", help="Name or path of remote storage"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
# fmt: on
):
"""Retrieve any precomputed outputs from a remote storage that are available.
You can alias remotes in your project.yml by mapping them to storage paths.
A storage can be anything that the smart-open library can upload to, e.g.
GCS, S3, SSH, local directories etc.
"""
for url, output_path in project_pull(project_dir, remote):
if url is not None:
msg.good(f"Pulled {output_path} from {url}")
def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
config = load_project_config(project_dir)
if remote in config.get("remotes", {}):
remote = config["remotes"][remote]
storage = RemoteStorage(project_dir, remote)
for cmd in config.get("commands", []):
deps = [project_dir / dep for dep in cmd.get("deps", [])]
cmd_hash = get_command_hash("", "", deps, cmd["script"])
for output_path in cmd.get("outputs", []):
url = storage.pull(output_path, command_hash=cmd_hash)
yield url, output_path

spacy/cli/project/push.py (new file)

@ -0,0 +1,48 @@
from pathlib import Path
from wasabi import msg
from .remote_storage import RemoteStorage
from .remote_storage import get_content_hash, get_command_hash
from .._util import load_project_config
from .._util import project_cli, Arg
@project_cli.command("push")
def project_push_cli(
# fmt: off
remote: str = Arg("default", help="Name or path of remote storage"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
# fmt: on
):
"""Persist outputs to a remote storage. You can alias remotes in your project.yml
by mapping them to storage paths. A storage can be anything that the smart-open
library can upload to, e.g. GCS, S3, SSH, local directories etc.
"""
for output_path, url in project_push(project_dir, remote):
if url is None:
msg.info(f"Skipping {output_path}")
else:
msg.good(f"Pushed {output_path} to {url}")
def project_push(project_dir: Path, remote: str):
"""Persist outputs to a remote storage. You can alias remotes in your project.yml
by mapping them to storage paths. A storage can be anything that the smart-open
library can upload to, e.g. GCS, S3, SSH, local directories etc.
"""
config = load_project_config(project_dir)
if remote in config.get("remotes", {}):
remote = config["remotes"][remote]
storage = RemoteStorage(project_dir, remote)
for cmd in config.get("commands", []):
cmd_hash = get_command_hash(
"", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
)
for output_path in cmd.get("outputs", []):
output_loc = project_dir / output_path
if output_loc.exists():
url = storage.push(
output_path,
command_hash=cmd_hash,
content_hash=get_content_hash(output_loc),
)
yield output_path, url


@ -0,0 +1,169 @@
from typing import Optional, List, Dict, TYPE_CHECKING
import os
import site
import hashlib
import urllib.parse
import tarfile
from pathlib import Path
from .._util import get_hash, get_checksum, download_file, ensure_pathy
from ...util import make_tempdir
if TYPE_CHECKING:
from pathy import Pathy # noqa: F401
class RemoteStorage:
"""Push and pull outputs to and from a remote file storage.
Remotes can be anything that `smart-open` can support: AWS, GCS, file system,
ssh, etc.
"""
def __init__(self, project_root: Path, url: str, *, compression="gz"):
self.root = project_root
self.url = ensure_pathy(url)
self.compression = compression
def push(self, path: Path, command_hash: str, content_hash: str) -> Optional["Pathy"]:
"""Compress a file or directory within a project and upload it to a remote
storage. If an object exists at the full URL, nothing is done.
Within the remote storage, files are addressed by their project path
(url encoded) and two user-supplied hashes, representing their creation
context and their file contents. If the URL already exists, the data is
not uploaded. Paths are archived and compressed prior to upload.
"""
loc = self.root / path
if not loc.exists():
raise IOError(f"Cannot push {loc}: does not exist.")
url = self.make_url(path, command_hash, content_hash)
if url.exists():
return None
tmp: Path
with make_tempdir() as tmp:
tar_loc = tmp / self.encode_name(str(path))
mode_string = f"w:{self.compression}" if self.compression else "w"
with tarfile.open(tar_loc, mode=mode_string) as tar_file:
tar_file.add(str(loc), arcname=str(path))
with tar_loc.open(mode="rb") as input_file:
with url.open(mode="wb") as output_file:
output_file.write(input_file.read())
return url
def pull(
self,
path: Path,
*,
command_hash: Optional[str] = None,
content_hash: Optional[str] = None,
) -> Optional["Pathy"]:
"""Retrieve a file from the remote cache. If the file already exists,
nothing is done.
If the command_hash and/or content_hash are specified, only matching
results are returned. If no results are available, an error is raised.
"""
dest = self.root / path
if dest.exists():
return None
url = self.find(path, command_hash=command_hash, content_hash=content_hash)
if url is None:
return url
else:
# Make sure the destination exists
if not dest.parent.exists():
dest.parent.mkdir(parents=True)
tmp: Path
with make_tempdir() as tmp:
tar_loc = tmp / url.parts[-1]
download_file(url, tar_loc)
mode_string = f"r:{self.compression}" if self.compression else "r"
with tarfile.open(tar_loc, mode=mode_string) as tar_file:
# This requires that the path is added correctly, relative
# to root. This is how we set things up in push()
tar_file.extractall(self.root)
return url
def find(
self,
path: Path,
*,
command_hash: Optional[str] = None,
content_hash: Optional[str] = None,
) -> Optional["Pathy"]:
"""Find the best matching version of a file within the storage,
or `None` if no match can be found. If both the creation and content hash
are specified, only exact matches will be returned. Otherwise, the most
recent matching file is preferred.
"""
name = self.encode_name(str(path))
if command_hash is not None and content_hash is not None:
url = self.make_url(path, command_hash, content_hash)
urls = [url] if url.exists() else []
elif command_hash is not None:
urls = list((self.url / name / command_hash).iterdir())
else:
urls = list((self.url / name).iterdir())
if content_hash is not None:
urls = [url for url in urls if url.parts[-1] == content_hash]
return urls[-1] if urls else None
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
"""Construct a URL from a subpath, a creation hash and a content hash."""
return self.url / self.encode_name(str(path)) / command_hash / content_hash
def encode_name(self, name: str) -> str:
"""Encode a subpath into a URL-safe name."""
return urllib.parse.quote_plus(name)
def get_content_hash(loc: Path) -> str:
return get_checksum(loc)
def get_command_hash(
site_hash: str, env_hash: str, deps: List[Path], cmd: List[str]
) -> str:
"""Create a hash representing the execution of a command. This includes the
currently installed packages, whatever environment variables have been marked
as relevant, and the command.
"""
hashes = [site_hash, env_hash] + [get_checksum(dep) for dep in sorted(deps)]
hashes.extend(cmd)
creation_bytes = "".join(hashes).encode("utf8")
return hashlib.md5(creation_bytes).hexdigest()
def get_site_hash() -> str:
"""Hash the current Python environment's site-packages contents, including
the name and version of the libraries. The list we're hashing is what
`pip freeze` would output.
"""
site_dirs = site.getsitepackages()
if site.ENABLE_USER_SITE:
site_dirs.append(site.getusersitepackages())
packages = set()
for site_dir in site_dirs:
site_dir = Path(site_dir)
for subpath in site_dir.iterdir():
if subpath.parts[-1].endswith("dist-info"):
packages.add(subpath.parts[-1].replace(".dist-info", ""))
package_bytes = "".join(sorted(packages)).encode("utf8")
return hashlib.md5(package_bytes).hexdigest()
def get_env_hash(env: Dict[str, str]) -> str:
"""Construct a hash of the environment variables that will be passed into
the commands.
Values in the env dict may be references to the current os.environ, using
the syntax $ENV_VAR to mean os.environ[ENV_VAR]
"""
env_vars = {}
for key, value in env.items():
if value.startswith("$"):
env_vars[key] = os.environ.get(value[1:], "")
else:
env_vars[key] = value
return get_hash(env_vars)
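A sketch of how these hashes fit together (hypothetical paths; note that the push and pull commands above currently pass empty site and env hashes):

```python
from pathlib import Path
from spacy.cli.project.remote_storage import get_command_hash, get_env_hash

# Values prefixed with "$" are resolved from os.environ before hashing.
env_hash = get_env_hash({"seed": "42", "token": "$MY_TOKEN"})

# With empty site/env hashes, the command hash depends only on the checksums
# of the dependencies and the script, as in project_push/project_pull.
deps = [Path("corpus/train"), Path("corpus/dev"), Path("config.cfg")]
cmd_hash = get_command_hash("", "", deps, ["spacy train ./config.cfg --output training/"])
```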


@ -44,7 +44,6 @@ def project_run(
dry (bool): Perform a dry run and don't execute commands.
"""
config = load_project_config(project_dir)
variables = config.get("variables", {})
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
workflows = config.get("workflows", {})
validate_subcommand(commands.keys(), workflows.keys(), subcommand)
@ -54,22 +53,20 @@ def project_run(
project_run(project_dir, cmd, force=force, dry=dry)
else:
cmd = commands[subcommand]
variables = config.get("variables", {})
for dep in cmd.get("deps", []):
dep = dep.format(**variables)
if not (project_dir / dep).exists():
err = f"Missing dependency specified by command '{subcommand}': {dep}"
err_kwargs = {"exits": 1} if not dry else {}
msg.fail(err, **err_kwargs)
with working_dir(project_dir) as current_dir:
rerun = check_rerun(current_dir, cmd, variables)
rerun = check_rerun(current_dir, cmd)
if not rerun and not force:
msg.info(f"Skipping '{cmd['name']}': nothing changed")
else:
msg.divider(subcommand)
run_commands(cmd["script"], variables, dry=dry)
run_commands(cmd["script"], dry=dry)
if not dry:
update_lockfile(current_dir, cmd, variables)
update_lockfile(current_dir, cmd)
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
@ -115,23 +112,15 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
def run_commands(
commands: List[str] = tuple(),
variables: Dict[str, Any] = {},
silent: bool = False,
dry: bool = False,
commands: List[str] = tuple(), silent: bool = False, dry: bool = False,
) -> None:
"""Run a sequence of commands in a subprocess, in order.
commands (List[str]): The string commands.
variables (Dict[str, Any]): Dictionary of variable names, mapped to their
values. Will be used to substitute format string variables in the
commands.
silent (bool): Don't print the commands.
dry (bool): Perform a dry run and don't execute anything.
"""
for command in commands:
# Substitute variables, e.g. "./{NAME}.json"
command = command.format(**variables)
command = split_command(command)
# Not sure if this is needed or a good idea. Motivation: users may often
# use commands in their config that reference "python" and we want to
@ -173,15 +162,12 @@ def validate_subcommand(
)
def check_rerun(
project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> bool:
def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool:
"""Check if a command should be rerun because its settings or inputs/outputs
changed.
project_dir (Path): The current project directory.
command (Dict[str, Any]): The command, as defined in the project.yml.
variables (Dict[str, Any]): The variables defined in the project.yml.
RETURNS (bool): Whether to re-run the command.
"""
lock_path = project_dir / PROJECT_LOCK
@ -197,19 +183,16 @@ def check_rerun(
# If the entry in the lockfile matches the lockfile entry that would be
# generated from the current command, we don't rerun because it means that
# all inputs/outputs, hashes and scripts are the same and nothing changed
return get_hash(get_lock_entry(project_dir, command, variables)) != get_hash(entry)
return get_hash(get_lock_entry(project_dir, command)) != get_hash(entry)
def update_lockfile(
project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> None:
def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
"""Update the lockfile after running a command. Will create a lockfile if
it doesn't yet exist and will add an entry for the current command, its
script and dependencies/outputs.
project_dir (Path): The current project directory.
command (Dict[str, Any]): The command, as defined in the project.yml.
variables (Dict[str, Any]): The variables defined in the project.yml.
"""
lock_path = project_dir / PROJECT_LOCK
if not lock_path.exists():
@ -217,13 +200,11 @@ def update_lockfile(
data = {}
else:
data = srsly.read_yaml(lock_path)
data[command["name"]] = get_lock_entry(project_dir, command, variables)
data[command["name"]] = get_lock_entry(project_dir, command)
srsly.write_yaml(lock_path, data)
def get_lock_entry(
project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> Dict[str, Any]:
def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]:
"""Get a lockfile entry for a given command. An entry includes the command,
the script (command steps) and a list of dependencies and outputs with
their paths and file hashes, if available. The format is based on the
@ -231,12 +212,11 @@ def get_lock_entry(
project_dir (Path): The current project directory.
command (Dict[str, Any]): The command, as defined in the project.yml.
variables (Dict[str, Any]): The variables defined in the project.yml.
RETURNS (Dict[str, Any]): The lockfile entry.
"""
deps = get_fileinfo(project_dir, command.get("deps", []), variables)
outs = get_fileinfo(project_dir, command.get("outputs", []), variables)
outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []), variables)
deps = get_fileinfo(project_dir, command.get("deps", []))
outs = get_fileinfo(project_dir, command.get("outputs", []))
outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []))
return {
"cmd": f"{COMMAND} run {command['name']}",
"script": command["script"],
@ -245,20 +225,16 @@ def get_lock_entry(
}
def get_fileinfo(
project_dir: Path, paths: List[str], variables: Dict[str, Any]
) -> List[Dict[str, str]]:
def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, str]]:
"""Generate the file information for a list of paths (dependencies, outputs).
Includes the file path and the file's checksum.
project_dir (Path): The current project directory.
paths (List[str]): The file paths.
variables (Dict[str, Any]): The variables defined in the project.yml.
RETURNS (List[Dict[str, str]]): The lockfile entry for a file.
"""
data = []
for path in paths:
path = path.format(**variables)
file_path = project_dir / path
md5 = get_checksum(file_path) if file_path.exists() else None
data.append({"path": path, "md5": md5})


@ -303,7 +303,7 @@ class ProjectConfigCommand(BaseModel):
class ProjectConfigSchema(BaseModel):
# fmt: off
variables: Dict[StrictStr, Union[str, int, float, bool]] = Field({}, title="Optional variables to substitute in commands")
vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands")
assets: List[ProjectConfigAsset] = Field([], title="Data assets")
workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
commands: List[ProjectConfigCommand] = Field([], title="Project command shortcuts")


@ -6,9 +6,12 @@ from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
from spacy.cli.pretrain import make_docs
from spacy.cli.init_config import init_config, RECOMMENDATIONS
from spacy.cli._util import validate_project_commands, parse_config_overrides
from spacy.util import get_lang_class
from spacy.cli._util import load_project_config, substitute_project_variables
from thinc.config import ConfigValidationError
import srsly
from .util import make_tempdir
def test_cli_converters_conllu2json():
# from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
@ -295,6 +298,24 @@ def test_project_config_validation2(config, n_errors):
assert len(errors) == n_errors
def test_project_config_interpolation():
variables = {"a": 10, "b": {"c": "foo", "d": True}}
commands = [
{"name": "x", "script": ["hello ${vars.a} ${vars.b.c}"]},
{"name": "y", "script": ["${vars.b.c} ${vars.b.d}"]},
]
project = {"commands": commands, "vars": variables}
with make_tempdir() as d:
srsly.write_yaml(d / "project.yml", project)
cfg = load_project_config(d)
assert cfg["commands"][0]["script"][0] == "hello 10 foo"
assert cfg["commands"][1]["script"][0] == "foo true"
commands = [{"name": "x", "script": ["hello ${vars.a} ${vars.b.e}"]}]
project = {"commands": commands, "vars": variables}
with pytest.raises(ConfigValidationError):
substitute_project_variables(project)
@pytest.mark.parametrize(
"args,expected",
[


@ -1,5 +1,5 @@
from typing import List, Union, Dict, Any, Optional, Iterable, Callable, Tuple
from typing import Iterator, Type, Pattern, TYPE_CHECKING
from typing import Iterator, Type, Pattern, Generator, TYPE_CHECKING
from types import ModuleType
import os
import importlib
@ -610,7 +610,7 @@ def working_dir(path: Union[str, Path]) -> None:
@contextmanager
def make_tempdir() -> None:
def make_tempdir() -> Generator[Path, None, None]:
"""Execute a block in a temporary directory and remove the directory and
its contents at the end of the with block.


@ -847,6 +847,92 @@ $ python -m spacy project run [subcommand] [project_dir] [--force] [--dry]
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **EXECUTES** | The command defined in the `project.yml`. |
### project push {#project-push tag="command"}
Upload all available files or directories listed in the `outputs` sections of
commands to a remote storage. Outputs are archived and compressed prior to
upload, and addressed in the remote storage using the output's relative path
(URL encoded), a hash of its command string and dependencies, and a hash of its
file contents. This means `push` should **never overwrite** a file in your
remote. If all the hashes match, the contents are the same and nothing happens.
If the contents are different, the new version of the file is uploaded. Deleting
obsolete files is left up to you.
Remotes can be defined in the `remotes` section of the
[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses the
[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to
communicate with the remote storages, so you can use any protocol that
`smart-open` supports, including [S3](https://aws.amazon.com/s3/),
[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although
you may need to install extra dependencies to use certain protocols.
```cli
$ python -m spacy project push [remote] [project_dir]
```
> #### Example
>
> ```cli
> $ python -m spacy project push my_bucket
> ```
>
> ```yaml
> ### project.yml
> remotes:
>   my_bucket: 's3://my-spacy-bucket'
> ```
| Name | Description |
| -------------- | --------------------------------------------------------------------------------------- |
| `remote` | The name of the remote to upload to. Defaults to `"default"`. ~~str (positional)~~ |
| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **UPLOADS** | All project outputs that exist and are not already stored in the remote. |
### project pull {#project-pull tag="command"}
Download all files or directories listed as `outputs` for commands, unless they
are already present locally. When searching for files in the remote, `pull`
won't just look at the output path, but will also consider the **command
string** and the **hashes of the dependencies**. For instance, let's say you've
previously pushed a model checkpoint to the remote, but now you've changed some
hyper-parameters. Because you've changed the inputs to the command, if you run
`pull`, you won't retrieve the stale result. If you train your model and push
the outputs to the remote, the outputs will be saved alongside the prior
outputs, so if you change the config back, you'll be able to fetch back the
result.
Remotes can be defined in the `remotes` section of the
[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses the
[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to
communicate with the remote storages, so you can use any protocol that
`smart-open` supports, including [S3](https://aws.amazon.com/s3/),
[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although
you may need to install extra dependencies to use certain protocols.
```cli
$ python -m spacy project pull [remote] [project_dir]
```
> #### Example
>
> ```cli
> $ python -m spacy project pull my_bucket
> ```
>
> ```yaml
> ### project.yml
> remotes:
>   my_bucket: 's3://my-spacy-bucket'
> ```
| Name | Description |
| -------------- | --------------------------------------------------------------------------------------- |
| `remote` | The name of the remote to download from. Defaults to `"default"`. ~~str (positional)~~ |
| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **DOWNLOADS** | All project outputs that do not exist locally and can be found in the remote. |
### project dvc {#project-dvc tag="command"}
Auto-generate [Data Version Control](https://dvc.org) (DVC) config file. Calls

(Image diff suppressed: ../images/projects.svg illustration updated — new version 40 KiB)


@ -5,9 +5,12 @@ menu:
- ['Intro & Workflow', 'intro']
- ['Directory & Assets', 'directory']
- ['Custom Projects', 'custom']
- ['Remote Storage', 'remote']
- ['Integrations', 'integrations']
---
## Introduction and workflow {#intro hidden="true"}
> #### 🪐 Project templates
>
> Our [`projects`](https://github.com/explosion/projects) repo includes various
@ -19,20 +22,17 @@ spaCy projects let you manage and share **end-to-end spaCy workflows** for
different **use cases and domains**, and orchestrate training, packaging and
serving your custom models. You can start off by cloning a pre-defined project
template, adjust it to fit your needs, load in your data, train a model, export
it as a Python package and share the project templates with your team. spaCy
projects can be used via the new [`spacy project`](/api/cli#project) command.
For an overview of the available project templates, check out the
[`projects`](https://github.com/explosion/projects) repo. spaCy projects also
[integrate](#integrations) with many other cool machine learning and data
science tools to track and manage your data and experiments, iterate on demos
and prototypes and ship your models into production.
it as a Python package, upload your outputs to a remote storage and share your
results with your team. spaCy projects can be used via the new
[`spacy project`](/api/cli#project) command and we provide templates in our
[`projects`](https://github.com/explosion/projects) repo.
<!-- TODO: mention integrations -->
## Introduction and workflow {#intro}
<!-- TODO: decide how to introduce concept -->
![Illustration of project workflow and commands](../images/projects.svg)
<!-- TODO:
<Project id="some_example_project">
@ -155,8 +155,8 @@ other. For instance, to generate a packaged model, you might start by converting
your data, then run [`spacy train`](/api/cli#train) to train your model on the
converted data and if that's successful, run [`spacy package`](/api/cli#package)
to turn the best model artifact into an installable Python package. The
following command runs the workflow named `all` defined in the `project.yml`, and
executes the commands it specifies, in order:
following command runs the workflow named `all` defined in the `project.yml`,
and executes the commands it specifies, in order:
```cli
$ python -m spacy project run all
@ -171,6 +171,31 @@ advanced data pipelines and track your changes in Git, check out the
from a workflow defined in your `project.yml` so you can manage your spaCy
project as a DVC repo.
### 5. Optional: Push to remote storage {#push}
> ```yaml
> ### project.yml
> remotes:
>   default: 's3://my-spacy-bucket'
>   local: '/mnt/scratch/cache'
> ```
After training a model, you can optionally use the
[`spacy project push`](/api/cli#project-push) command to upload your outputs to
a remote storage, using protocols like [S3](https://aws.amazon.com/s3/),
[Google Cloud Storage](https://cloud.google.com/storage) or SSH. This can help
you **export** your model packages, **share** work with your team, or **cache
results** to avoid repeating work.
```cli
$ python -m spacy project push
```
The `remotes` section in your `project.yml` lets you assign names to the
different storages. To download state from a remote storage, you can use the
[`spacy project pull`](/api/cli#project-pull) command. For more details, see the
docs on [remote storage](#remote).
## Project directory and assets {#directory}
### project.yml {#project-yml}
@ -190,7 +215,7 @@ https://github.com/explosion/spacy-boilerplates/blob/master/ner_fashion/project.
| Section | Description |
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `variables` | A dictionary of variables that can be referenced in paths, URLs and scripts. For example, `{NAME}` will use the value of the variable `NAME`. |
| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. |
| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
@ -349,9 +374,9 @@ if __name__ == "__main__":
In your `project.yml`, you can then run the script by calling
`python scripts/custom_evaluation.py` with the function arguments. You can also
use the `variables` section to define reusable variables that will be
substituted in commands, paths and URLs. In this example, the `BATCH_SIZE` is
defined as a variable will be added in place of `{BATCH_SIZE}` in the script.
use the `vars` section to define reusable variables that will be substituted in
commands, paths and URLs. In this example, the batch size is defined as a
variable and will be substituted in place of `${vars.batch_size}` in the script.
> #### Calling into Python
>
@ -363,13 +388,13 @@ defined as a variable will be added in place of `{BATCH_SIZE}` in the script.
<!-- prettier-ignore -->
```yaml
### project.yml
variables:
BATCH_SIZE: 128
vars:
batch_size: 128
commands:
- name: evaluate
script:
- 'python scripts/custom_evaluation.py {BATCH_SIZE} ./training/model-best ./corpus/eval.json'
- 'python scripts/custom_evaluation.py ${vars.batch_size} ./training/model-best ./corpus/eval.json'
deps:
- 'training/model-best'
- 'corpus/eval.json'
@ -421,6 +446,114 @@ assets:
checksum: '5113dc04e03f079525edd8df3f4f39e3'
```
## Remote Storage {#remote}
You can persist your project outputs to a remote storage using the
[`project push`](/api/cli#project-push) command. This can help you **export**
your model packages, **share** work with your team, or **cache results** to
avoid repeating work. The [`project pull`](/api/cli#project-pull) command will
download any outputs that are in the remote storage and aren't available
locally.
You can list one or more remotes in the `remotes` section of your
[`project.yml`](#project-yml) by mapping a string name to the URL of the
storage. Under the hood, spaCy uses the
[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to
communicate with the remote storages, so you can use any protocol that
`smart-open` supports, including [S3](https://aws.amazon.com/s3/),
[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although
you may need to install extra dependencies to use certain protocols.
> #### Example
>
> ```cli
> $ python -m spacy project pull local
> ```
```yaml
### project.yml
remotes:
    default: 's3://my-spacy-bucket'
    local: '/mnt/scratch/cache'
    stuff: 'ssh://myserver.example.com/whatever'
```
<Infobox title="How it works" emoji="💡">
Inside the remote storage, spaCy uses a clever **directory structure** to avoid
overwriting files. The top level of the directory structure is a URL-encoded
version of the output's path. Within this directory are subdirectories named
according to a hash of the command string and the command's dependencies.
Finally, within those directories are files, named according to an MD5 hash of
their contents.
<!-- TODO: update with actual real example? -->
<!-- prettier-ignore -->
```yaml
└── urlencoded_file_path            # Path of original file
    ├── some_command_hash           # Hash of command you ran
    │   ├── some_content_hash       # Hash of file content
    │   └── another_content_hash
    └── another_command_hash
        └── third_content_hash
```
</Infobox>
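The URL-safe top-level names come from `urllib.parse.quote_plus`, as in `encode_name` above. For instance:

```python
import urllib.parse

urllib.parse.quote_plus("training/model-best")  # -> 'training%2Fmodel-best'
```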
For instance, let's say you had the following command in your `project.yml`:
```yaml
### project.yml
- name: train
  help: 'Train a spaCy model using the specified corpus and config'
  script:
    - 'spacy train ./config.cfg --output training/'
  deps:
    - 'corpus/train'
    - 'corpus/dev'
    - 'config.cfg'
  outputs:
    - 'training/model-best'
```
> #### Example
>
> ```
> └── s3://my-spacy-bucket/training%2Fmodel-best
>     └── 1d8cb33a06cc345ad3761c6050934a1b
>         └── d8e20c3537a084c5c10d95899fe0b1ff
> ```
After you finish training, you run [`project push`](/api/cli#project-push) to
make sure the `training/model-best` output is saved to remote storage. spaCy
will then construct a hash from your command script and the listed dependencies,
`corpus/train`, `corpus/dev` and `config.cfg`, in order to identify the
execution context of your output. It will then compute an MD5 hash of the
`training/model-best` directory, and use those three pieces of information to
construct the storage URL.
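A sketch of that construction using the `RemoteStorage` helper (the bucket and hashes are the hypothetical values from the example):

```python
from pathlib import Path
from spacy.cli.project.remote_storage import RemoteStorage

storage = RemoteStorage(Path.cwd(), "s3://my-spacy-bucket")
url = storage.make_url(
    Path("training/model-best"),
    "1d8cb33a06cc345ad3761c6050934a1b",  # command hash
    "d8e20c3537a084c5c10d95899fe0b1ff",  # content hash
)
# -> s3://my-spacy-bucket/training%2Fmodel-best/1d8cb33a.../d8e20c35...
```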
```cli
$ python -m spacy project run train
$ python -m spacy project push
```
If you change the command or one of its dependencies (for instance, by editing
the [`config.cfg`](/usage/training#config) file to tune the hyperparameters), a
different creation hash will be calculated, so when you use
[`project push`](/api/cli#project-push) you won't be overwriting your previous
file. The system even supports multiple outputs for the same file and the same
context, which can happen if your training process is not deterministic, or if
you have dependencies that aren't represented in the command.
In summary, the [`spacy project`](/api/cli#project) remote storages are designed
to make a particular set of trade-offs. Priority is placed on **convenience**,
**correctness** and **avoiding data loss**. You can use
[`project push`](/api/cli#project-push) freely, as you'll never overwrite remote
state, and you don't have to come up with names or version numbers. However,
it's up to you to manage the size of your remote storage, and to remove files
that are no longer relevant to you.
## Integrations {#integrations}
### Data Version Control (DVC) {#dvc} <IntegrationLogo name="dvc" title="DVC" width={70} height="auto" align="right" />
@ -517,16 +650,17 @@ and evaluation set.
<!-- prettier-ignore -->
```yaml
### project.yml
variables:
PRODIGY_DATASET: 'ner_articles'
PRODIGY_LABELS: 'PERSON,ORG,PRODUCT'
PRODIGY_MODEL: 'en_core_web_md'
vars:
prodigy:
dataset: 'ner_articles'
labels: 'PERSON,ORG,PRODUCT'
model: 'en_core_web_md'
commands:
- name: annotate
- script:
- 'python -m prodigy ner.correct {PRODIGY_DATASET} ./assets/raw_data.jsonl {PRODIGY_MODEL} --labels {PRODIGY_LABELS}'
- 'python -m prodigy data-to-spacy ./corpus/train.json ./corpus/eval.json --ner {PRODIGY_DATASET}'
- 'python -m prodigy ner.correct ${vars.prodigy.dataset} ./assets/raw_data.jsonl ${vars.prodigy.model} --labels ${vars.prodigy.labels}'
- 'python -m prodigy data-to-spacy ./corpus/train.json ./corpus/eval.json --ner ${vars.prodigy.dataset}'
- 'python -m spacy convert ./corpus/train.json ./corpus/train.spacy'
- 'python -m spacy convert ./corpus/eval.json ./corpus/eval.spacy'
- deps:


@ -104,11 +104,15 @@ spaCy projects let you manage and share **end-to-end spaCy workflows** for
different **use cases and domains**, and orchestrate training, packaging and
serving your custom models. You can start off by cloning a pre-defined project
template, adjust it to fit your needs, load in your data, train a model, export
it as a Python package and share the project templates with your team. spaCy
projects also make it easy to **integrate with other tools** in the data science
and machine learning ecosystem, including [DVC](/usage/projects#dvc) for data
version control, [Prodigy](/usage/projects#prodigy) for creating labelled data,
[Streamlit](/usage/projects#streamlit) for building interactive apps,
it as a Python package, upload your outputs to a remote storage and share your
results with your team.
![Illustration of project workflow and commands](../images/projects.svg)
spaCy projects also make it easy to **integrate with other tools** in the data
science and machine learning ecosystem, including [DVC](/usage/projects#dvc) for
data version control, [Prodigy](/usage/projects#prodigy) for creating labelled
data, [Streamlit](/usage/projects#streamlit) for building interactive apps,
[FastAPI](/usage/projects#fastapi) for serving models in production,
[Ray](/usage/projects#ray) for parallel training,
[Weights & Biases](/usage/projects#wandb) for experiment tracking, and more!


@ -5,6 +5,8 @@ import Icon from './icon'
import { isString } from './util'
import classes from '../styles/table.module.sass'
const FOOT_ROW_REGEX = /^(RETURNS|YIELDS|CREATES|PRINTS|EXECUTES|UPLOADS|DOWNLOADS)/
function isNum(children) {
return isString(children) && /^\d+[.,]?[\dx]+?(|x|ms|mb|gb|k|m)?$/i.test(children)
}
@ -43,7 +45,6 @@ function isDividerRow(children) {
}
function isFootRow(children) {
const rowRegex = /^(RETURNS|YIELDS|CREATES|PRINTS|EXECUTES)/
if (children.length && children[0].props.name === 'td') {
const cellChildren = children[0].props.children
if (
@ -52,7 +53,7 @@ function isFootRow(children) {
cellChildren.props.children &&
isString(cellChildren.props.children)
) {
return rowRegex.test(cellChildren.props.children)
return FOOT_ROW_REGEX.test(cellChildren.props.children)
}
}
return false