Allow spacy project to push and pull to/from remote storage (#5949)
* Add utils for working with remote storage
* WIP add remote_cache for project
* WIP add push and pull commands
* Use pathy in remote_cache
* Update util
* Update remote_cache
* Update util
* Update project assets
* Update pull script
* Update push script
* Fix type annotation in util
* Work on remote storage
* Remove site and env hash
* Fix imports
* Fix type annotation
* Require pathy
* Require pathy
* Fix import
* Add a util to handle project variable substitution
* Import push and pull commands
* Fix pull command
* Fix push command
* Fix tarfile in remote_storage
* Improve printing
* Fiddle with status messages
* Set version to v3.0.0a9
* Draft docs for spacy project remote storages
* Update docs [ci skip]
* Use Thinc config to simplify and unify template variables
* Auto-format
* Don't import Pathy globally for now (causes slow and annoying Google Cloud warning)
* Tidy up test
* Tidy up and update tests
* Update to latest Thinc
* Update docs
* variables -> vars
* Update docs [ci skip]
* Update docs [ci skip]

Co-authored-by: Ines Montani <ines@ines.io>
This commit is contained in:
  parent 9bdc9e81f5
  commit e559867605

@@ -6,9 +6,10 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a28,<8.0.0a30",
+    "thinc>=8.0.0a29,<8.0.0a40",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
-    "smart_open>=2.0.0,<3.0.0"
+    "smart_open>=2.0.0,<3.0.0",
+    "pathy"
 ]
 build-backend = "setuptools.build_meta"

@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a28,<8.0.0a30
+thinc>=8.0.0a29,<8.0.0a40
 blis>=0.4.0,<0.5.0
 ml_datasets>=0.1.1
 murmurhash>=0.28.0,<1.1.0
@@ -9,6 +9,7 @@ wasabi>=0.7.1,<1.1.0
 srsly>=2.1.0,<3.0.0
 catalogue>=0.0.7,<1.1.0
 typer>=0.3.0,<0.4.0
+pathy
 # Third party dependencies
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0

@@ -34,18 +34,19 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a28,<8.0.0a30
+    thinc>=8.0.0a29,<8.0.0a40
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a28,<8.0.0a30
+    thinc>=8.0.0a29,<8.0.0a40
     blis>=0.4.0,<0.5.0
     wasabi>=0.7.1,<1.1.0
     srsly>=2.1.0,<3.0.0
     catalogue>=0.0.7,<1.1.0
     typer>=0.3.0,<0.4.0
+    pathy
     # Third-party dependencies
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a8"
+__version__ = "3.0.0a9"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

@@ -21,6 +21,8 @@ from .project.clone import project_clone  # noqa: F401
 from .project.assets import project_assets  # noqa: F401
 from .project.run import project_run  # noqa: F401
 from .project.dvc import project_update_dvc  # noqa: F401
+from .project.push import project_push  # noqa: F401
+from .project.pull import project_pull  # noqa: F401


 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)

@@ -1,4 +1,5 @@
-from typing import Dict, Any, Union, List, Optional
+from typing import Dict, Any, Union, List, Optional, TYPE_CHECKING
 import sys
 from pathlib import Path
 from wasabi import msg
+import srsly
@@ -8,11 +9,13 @@ from typer.main import get_command
 from contextlib import contextmanager
 from thinc.config import Config, ConfigValidationError
 from configparser import InterpolationError
-import sys

 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file

+if TYPE_CHECKING:
+    from pathy import Pathy  # noqa: F401
+
+
 PROJECT_FILE = "project.yml"
 PROJECT_LOCK = "project.lock"

@@ -93,11 +96,12 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
     return result


-def load_project_config(path: Path) -> Dict[str, Any]:
+def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]:
     """Load the project.yml file from a directory and validate it. Also make
     sure that all directories defined in the config exist.

     path (Path): The path to the project directory.
+    interpolate (bool): Whether to substitute project variables.
     RETURNS (Dict[str, Any]): The loaded project.yml.
     """
     config_path = path / PROJECT_FILE
@@ -119,9 +123,25 @@ def load_project_config(path: Path) -> Dict[str, Any]:
         dir_path = path / subdir
         if not dir_path.exists():
             dir_path.mkdir(parents=True)
+    if interpolate:
+        err = "project.yml validation error"
+        with show_validation_error(title=err, hint_fill=False):
+            config = substitute_project_variables(config)
     return config


+def substitute_project_variables(config: Dict[str, Any], overrides: Dict = {}):
+    key = "vars"
+    config.setdefault(key, {})
+    config[key].update(overrides)
+    # Need to put variables in the top scope again so we can have a top-level
+    # section "project" (otherwise, a list of commands in the top scope wouldn't
+    # be allowed by Thinc's config system)
+    cfg = Config({"project": config, key: config[key]})
+    interpolated = cfg.interpolate()
+    return dict(interpolated["project"])
+
+
 def validate_project_commands(config: Dict[str, Any]) -> None:
     """Check that project commands and workflows are valid, don't contain
     duplicates, don't clash and only refer to commands that exist.
@@ -232,3 +252,39 @@ def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
         for name, cfg in config.get("components", {}).items()
         if "factory" not in cfg and "source" in cfg
     ]
+
+
+def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
+    """Upload a file.
+
+    src (Path): The source path.
+    dest (str / Pathy): The destination URL to upload to.
+    """
+    dest = ensure_pathy(dest)
+    with dest.open(mode="wb") as output_file:
+        with src.open(mode="rb") as input_file:
+            output_file.write(input_file.read())
+
+
+def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) -> None:
+    """Download a file using smart_open.
+
+    src (str / Pathy): The URL of the file.
+    dest (Path): The destination path.
+    force (bool): Whether to force download even if file exists.
+        If False, the download will be skipped.
+    """
+    if dest.exists() and not force:
+        return None
+    src = ensure_pathy(src)
+    with src.open(mode="rb") as input_file:
+        with dest.open(mode="wb") as output_file:
+            output_file.write(input_file.read())
+
+
+def ensure_pathy(path):
+    """Temporary helper to prevent importing Pathy globally (which can cause
+    slow and annoying Google Cloud warning)."""
+    from pathy import Pathy  # noqa: F811

+    return Pathy(path)

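The `substitute_project_variables` helper above leans entirely on Thinc's config interpolation. As a minimal standalone sketch of the same trick (the toy `project` dict and its values here are made up for illustration, but the `Config`/`interpolate` calls mirror the helper exactly):

```python
from thinc.config import Config

# A toy parsed project.yml: "vars" is lifted into the top scope so that
# ${vars.*} references elsewhere in the config can be resolved.
project = {
    "commands": [{"name": "train", "script": ["spacy train ${vars.config}"]}],
    "vars": {"config": "config.cfg"},
}
cfg = Config({"project": project, "vars": project["vars"]})
interpolated = cfg.interpolate()
print(dict(interpolated["project"])["commands"][0]["script"])
# ['spacy train config.cfg']
```
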
@@ -4,10 +4,10 @@ from wasabi import msg
 import re
 import shutil
 import requests
-import smart_open

 from ...util import ensure_path, working_dir
 from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum
+from .._util import download_file


 # TODO: find a solution for caches
@@ -44,16 +44,14 @@ def project_assets(project_dir: Path) -> None:
     if not assets:
         msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
     msg.info(f"Fetching {len(assets)} asset(s)")
-    variables = config.get("variables", {})
     for asset in assets:
-        dest = asset["dest"].format(**variables)
+        dest = asset["dest"]
         url = asset.get("url")
         checksum = asset.get("checksum")
         if not url:
             # project.yml defines asset without URL that the user has to place
             check_private_asset(dest, checksum)
             continue
-        url = url.format(**variables)
         fetch_asset(project_path, url, dest, checksum)
@@ -132,15 +130,3 @@ def convert_asset_url(url: str) -> str:
         )
         return converted
     return url
-
-
-def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
-    """Download a file using smart_open.
-
-    url (str): The URL of the file.
-    dest (Path): The destination path.
-    chunk_size (int): The size of chunks to read/write.
-    """
-    with smart_open.open(url, mode="rb") as input_file:
-        with dest.open(mode="wb") as output_file:
-            output_file.write(input_file.read())

@@ -99,7 +99,6 @@ def update_dvc_config(
     if ref_hash == config_hash and not force:
         return False  # Nothing has changed in project.yml, don't need to update
     dvc_config_path.unlink()
-    variables = config.get("variables", {})
     dvc_commands = []
     config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
     for name in workflows[workflow]:
@@ -122,7 +121,7 @@ def update_dvc_config(
         dvc_commands.append(join_command(full_cmd))
     with working_dir(path):
         dvc_flags = {"--verbose": verbose, "--quiet": silent}
-        run_dvc_commands(dvc_commands, variables, flags=dvc_flags)
+        run_dvc_commands(dvc_commands, flags=dvc_flags)
     with dvc_config_path.open("r+", encoding="utf8") as f:
         content = f.read()
         f.seek(0, 0)
@@ -131,23 +130,16 @@ def update_dvc_config(


 def run_dvc_commands(
-    commands: List[str] = tuple(),
-    variables: Dict[str, str] = {},
-    flags: Dict[str, bool] = {},
+    commands: List[str] = tuple(), flags: Dict[str, bool] = {},
 ) -> None:
     """Run a sequence of DVC commands in a subprocess, in order.

     commands (List[str]): The string commands without the leading "dvc".
-    variables (Dict[str, str]): Dictionary of variable names, mapped to their
-        values. Will be used to substitute format string variables in the
-        commands.
     flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
         easier to pass flags like --quiet that depend on a variable or
         command-line setting while avoiding lots of nested conditionals.
     """
     for command in commands:
-        # Substitute variables, e.g. "./{NAME}.json"
-        command = command.format(**variables)
         command = split_command(command)
         dvc_command = ["dvc", *command]
         # Add the flags if they are set to True

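The hunk cuts off just before the flag-handling lines, but the docstring describes the convention: each flag name maps to a bool, and only flags set to `True` are appended to the command. A sketch of that convention (`apply_flags` is a hypothetical helper for illustration, not spaCy API):

```python
def apply_flags(dvc_command, flags):
    # Append only the flags whose condition evaluated to True
    return dvc_command + [flag for flag, is_set in flags.items() if is_set]

cmd = apply_flags(["dvc", "repro"], {"--verbose": True, "--quiet": False})
print(cmd)  # ['dvc', 'repro', '--verbose']
```
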
spacy/cli/project/pull.py (new file, 36 lines)
@@ -0,0 +1,36 @@
+from pathlib import Path
+from wasabi import msg
+from .remote_storage import RemoteStorage
+from .remote_storage import get_command_hash
+from .._util import project_cli, Arg
+from .._util import load_project_config
+
+
+@project_cli.command("pull")
+def project_pull_cli(
+    # fmt: off
+    remote: str = Arg("default", help="Name or path of remote storage"),
+    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
+    # fmt: on
+):
+    """Retrieve any precomputed outputs from a remote storage that are available.
+    You can alias remotes in your project.yml by mapping them to storage paths.
+    A storage can be anything that the smart-open library can upload to, e.g.
+    gcs, aws, ssh, local directories etc.
+    """
+    for url, output_path in project_pull(project_dir, remote):
+        if url is not None:
+            msg.good(f"Pulled {output_path} from {url}")
+
+
+def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
+    config = load_project_config(project_dir)
+    if remote in config.get("remotes", {}):
+        remote = config["remotes"][remote]
+    storage = RemoteStorage(project_dir, remote)
+    for cmd in config.get("commands", []):
+        deps = [project_dir / dep for dep in cmd.get("deps", [])]
+        cmd_hash = get_command_hash("", "", deps, cmd["script"])
+        for output_path in cmd.get("outputs", []):
+            url = storage.pull(output_path, command_hash=cmd_hash)
+            yield url, output_path

spacy/cli/project/push.py (new file, 48 lines)
@@ -0,0 +1,48 @@
+from pathlib import Path
+from wasabi import msg
+from .remote_storage import RemoteStorage
+from .remote_storage import get_content_hash, get_command_hash
+from .._util import load_project_config
+from .._util import project_cli, Arg
+
+
+@project_cli.command("push")
+def project_push_cli(
+    # fmt: off
+    remote: str = Arg("default", help="Name or path of remote storage"),
+    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
+    # fmt: on
+):
+    """Persist outputs to a remote storage. You can alias remotes in your project.yml
+    by mapping them to storage paths. A storage can be anything that the smart-open
+    library can upload to, e.g. gcs, aws, ssh, local directories etc.
+    """
+    for output_path, url in project_push(project_dir, remote):
+        if url is None:
+            msg.info(f"Skipping {output_path}")
+        else:
+            msg.good(f"Pushed {output_path} to {url}")
+
+
+def project_push(project_dir: Path, remote: str):
+    """Persist outputs to a remote storage. You can alias remotes in your project.yml
+    by mapping them to storage paths. A storage can be anything that the smart-open
+    library can upload to, e.g. gcs, aws, ssh, local directories etc.
+    """
+    config = load_project_config(project_dir)
+    if remote in config.get("remotes", {}):
+        remote = config["remotes"][remote]
+    storage = RemoteStorage(project_dir, remote)
+    for cmd in config.get("commands", []):
+        cmd_hash = get_command_hash(
+            "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
+        )
+        for output_path in cmd.get("outputs", []):
+            output_loc = project_dir / output_path
+            if output_loc.exists():
+                url = storage.push(
+                    output_path,
+                    command_hash=cmd_hash,
+                    content_hash=get_content_hash(output_loc),
+                )
+                yield output_path, url

spacy/cli/project/remote_storage.py (new file, 169 lines)
@@ -0,0 +1,169 @@
+from typing import Optional, List, Dict, TYPE_CHECKING
+import os
+import site
+import hashlib
+import urllib.parse
+import tarfile
+from pathlib import Path
+
+from .._util import get_hash, get_checksum, download_file, ensure_pathy
+from ...util import make_tempdir
+
+if TYPE_CHECKING:
+    from pathy import Pathy  # noqa: F401
+
+
+class RemoteStorage:
+    """Push and pull outputs to and from a remote file storage.
+
+    Remotes can be anything that `smart-open` can support: AWS, GCS, file system,
+    ssh, etc.
+    """
+
+    def __init__(self, project_root: Path, url: str, *, compression="gz"):
+        self.root = project_root
+        self.url = ensure_pathy(url)
+        self.compression = compression
+
+    def push(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
+        """Compress a file or directory within a project and upload it to a remote
+        storage. If an object exists at the full URL, nothing is done.
+
+        Within the remote storage, files are addressed by their project path
+        (url encoded) and two user-supplied hashes, representing their creation
+        context and their file contents. If the URL already exists, the data is
+        not uploaded. Paths are archived and compressed prior to upload.
+        """
+        loc = self.root / path
+        if not loc.exists():
+            raise IOError(f"Cannot push {loc}: does not exist.")
+        url = self.make_url(path, command_hash, content_hash)
+        if url.exists():
+            return None
+        tmp: Path
+        with make_tempdir() as tmp:
+            tar_loc = tmp / self.encode_name(str(path))
+            mode_string = f"w:{self.compression}" if self.compression else "w"
+            with tarfile.open(tar_loc, mode=mode_string) as tar_file:
+                tar_file.add(str(loc), arcname=str(path))
+            with tar_loc.open(mode="rb") as input_file:
+                with url.open(mode="wb") as output_file:
+                    output_file.write(input_file.read())
+        return url
+
+    def pull(
+        self,
+        path: Path,
+        *,
+        command_hash: Optional[str] = None,
+        content_hash: Optional[str] = None,
+    ) -> Optional["Pathy"]:
+        """Retrieve a file from the remote cache. If the file already exists,
+        nothing is done.
+
+        If the command_hash and/or content_hash are specified, only matching
+        results are returned. If no results are available, an error is raised.
+        """
+        dest = self.root / path
+        if dest.exists():
+            return None
+        url = self.find(path, command_hash=command_hash, content_hash=content_hash)
+        if url is None:
+            return url
+        else:
+            # Make sure the destination exists
+            if not dest.parent.exists():
+                dest.parent.mkdir(parents=True)
+            tmp: Path
+            with make_tempdir() as tmp:
+                tar_loc = tmp / url.parts[-1]
+                download_file(url, tar_loc)
+                mode_string = f"r:{self.compression}" if self.compression else "r"
+                with tarfile.open(tar_loc, mode=mode_string) as tar_file:
+                    # This requires that the path is added correctly, relative
+                    # to root. This is how we set things up in push()
+                    tar_file.extractall(self.root)
+        return url
+
+    def find(
+        self,
+        path: Path,
+        *,
+        command_hash: Optional[str] = None,
+        content_hash: Optional[str] = None,
+    ) -> Optional["Pathy"]:
+        """Find the best matching version of a file within the storage,
+        or `None` if no match can be found. If both the creation and content hash
+        are specified, only exact matches will be returned. Otherwise, the most
+        recent matching file is preferred.
+        """
+        name = self.encode_name(str(path))
+        if command_hash is not None and content_hash is not None:
+            url = self.make_url(path, command_hash, content_hash)
+            urls = [url] if url.exists() else []
+        elif command_hash is not None:
+            urls = list((self.url / name / command_hash).iterdir())
+        else:
+            urls = list((self.url / name).iterdir())
+            if content_hash is not None:
+                urls = [url for url in urls if url.parts[-1] == content_hash]
+        return urls[-1] if urls else None
+
+    def make_url(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
+        """Construct a URL from a subpath, a creation hash and a content hash."""
+        return self.url / self.encode_name(str(path)) / command_hash / content_hash
+
+    def encode_name(self, name: str) -> str:
+        """Encode a subpath into a URL-safe name."""
+        return urllib.parse.quote_plus(name)
+
+
+def get_content_hash(loc: Path) -> str:
+    return get_checksum(loc)
+
+
+def get_command_hash(
+    site_hash: str, env_hash: str, deps: List[Path], cmd: List[str]
+) -> str:
+    """Create a hash representing the execution of a command. This includes the
+    currently installed packages, whatever environment variables have been marked
+    as relevant, and the command.
+    """
+    hashes = [site_hash, env_hash] + [get_checksum(dep) for dep in sorted(deps)]
+    hashes.extend(cmd)
+    creation_bytes = "".join(hashes).encode("utf8")
+    return hashlib.md5(creation_bytes).hexdigest()
+
+
+def get_site_hash():
+    """Hash the current Python environment's site-packages contents, including
+    the name and version of the libraries. The list we're hashing is what
+    `pip freeze` would output.
+    """
+    site_dirs = site.getsitepackages()
+    if site.ENABLE_USER_SITE:
+        site_dirs.append(site.getusersitepackages())
+    packages = set()
+    for site_dir in site_dirs:
+        site_dir = Path(site_dir)
+        for subpath in site_dir.iterdir():
+            if subpath.parts[-1].endswith("dist-info"):
+                packages.add(subpath.parts[-1].replace(".dist-info", ""))
+    package_bytes = "".join(sorted(packages)).encode("utf8")
+    return hashlib.md5(package_bytes).hexdigest()
+
+
+def get_env_hash(env: Dict[str, str]) -> str:
+    """Construct a hash of the environment variables that will be passed into
+    the commands.
+
+    Values in the env dict may be references to the current os.environ, using
+    the syntax $ENV_VAR to mean os.environ[ENV_VAR].
+    """
+    env_vars = {}
+    for key, value in env.items():
+        if value.startswith("$"):
+            env_vars[key] = os.environ.get(value[1:], "")
+        else:
+            env_vars[key] = value
+    return get_hash(env_vars)

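Putting the pieces together, here is a hedged sketch of what `project_push` does for a single command, using the classes and helpers defined above (the command dict and the remote URL are made up for illustration; actually running this requires `pathy` plus credentials for the remote):

```python
from pathlib import Path
from spacy.cli.project.remote_storage import (
    RemoteStorage,
    get_command_hash,
    get_content_hash,
)

project_dir = Path.cwd()  # assumes you run this from a project directory
# One command as it might appear, parsed, from project.yml:
cmd = {
    "name": "train",
    "script": ["spacy train ./config.cfg --output training/"],
    "deps": ["config.cfg"],
    "outputs": ["training/model-best"],
}
storage = RemoteStorage(project_dir, "s3://my-spacy-bucket")  # made-up remote
deps = [project_dir / dep for dep in cmd["deps"]]
cmd_hash = get_command_hash("", "", deps, cmd["script"])
for output_path in cmd["outputs"]:
    output_loc = project_dir / output_path
    if output_loc.exists():
        # Returns None if an object already exists at the full URL
        url = storage.push(
            output_path,
            command_hash=cmd_hash,
            content_hash=get_content_hash(output_loc),
        )
```
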
@@ -44,7 +44,6 @@ def project_run(
     dry (bool): Perform a dry run and don't execute commands.
     """
     config = load_project_config(project_dir)
-    variables = config.get("variables", {})
     commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
     workflows = config.get("workflows", {})
     validate_subcommand(commands.keys(), workflows.keys(), subcommand)
@@ -54,22 +53,20 @@ def project_run(
             project_run(project_dir, cmd, force=force, dry=dry)
     else:
         cmd = commands[subcommand]
-        variables = config.get("variables", {})
         for dep in cmd.get("deps", []):
-            dep = dep.format(**variables)
             if not (project_dir / dep).exists():
                 err = f"Missing dependency specified by command '{subcommand}': {dep}"
                 err_kwargs = {"exits": 1} if not dry else {}
                 msg.fail(err, **err_kwargs)
         with working_dir(project_dir) as current_dir:
-            rerun = check_rerun(current_dir, cmd, variables)
+            rerun = check_rerun(current_dir, cmd)
             if not rerun and not force:
                 msg.info(f"Skipping '{cmd['name']}': nothing changed")
             else:
                 msg.divider(subcommand)
-                run_commands(cmd["script"], variables, dry=dry)
+                run_commands(cmd["script"], dry=dry)
                 if not dry:
-                    update_lockfile(current_dir, cmd, variables)
+                    update_lockfile(current_dir, cmd)


 def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
@@ -115,23 +112,15 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:


 def run_commands(
-    commands: List[str] = tuple(),
-    variables: Dict[str, Any] = {},
-    silent: bool = False,
-    dry: bool = False,
+    commands: List[str] = tuple(), silent: bool = False, dry: bool = False,
 ) -> None:
     """Run a sequence of commands in a subprocess, in order.

     commands (List[str]): The string commands.
-    variables (Dict[str, Any]): Dictionary of variable names, mapped to their
-        values. Will be used to substitute format string variables in the
-        commands.
     silent (bool): Don't print the commands.
     dry (bool): Perform a dry run and don't execute anything.
     """
     for command in commands:
-        # Substitute variables, e.g. "./{NAME}.json"
-        command = command.format(**variables)
         command = split_command(command)
         # Not sure if this is needed or a good idea. Motivation: users may often
         # use commands in their config that reference "python" and we want to
@@ -173,15 +162,12 @@ def validate_subcommand(
     )


-def check_rerun(
-    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
-) -> bool:
+def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool:
     """Check if a command should be rerun because its settings or inputs/outputs
     changed.

     project_dir (Path): The current project directory.
     command (Dict[str, Any]): The command, as defined in the project.yml.
-    variables (Dict[str, Any]): The variables defined in the project.yml.
     RETURNS (bool): Whether to re-run the command.
     """
     lock_path = project_dir / PROJECT_LOCK
@@ -197,19 +183,16 @@ def check_rerun(
     # If the entry in the lockfile matches the lockfile entry that would be
     # generated from the current command, we don't rerun because it means that
     # all inputs/outputs, hashes and scripts are the same and nothing changed
-    return get_hash(get_lock_entry(project_dir, command, variables)) != get_hash(entry)
+    return get_hash(get_lock_entry(project_dir, command)) != get_hash(entry)


-def update_lockfile(
-    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
-) -> None:
+def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
     """Update the lockfile after running a command. Will create a lockfile if
     it doesn't yet exist and will add an entry for the current command, its
     script and dependencies/outputs.

     project_dir (Path): The current project directory.
     command (Dict[str, Any]): The command, as defined in the project.yml.
-    variables (Dict[str, Any]): The variables defined in the project.yml.
     """
     lock_path = project_dir / PROJECT_LOCK
     if not lock_path.exists():
@@ -217,13 +200,11 @@ def update_lockfile(
         data = {}
     else:
         data = srsly.read_yaml(lock_path)
-    data[command["name"]] = get_lock_entry(project_dir, command, variables)
+    data[command["name"]] = get_lock_entry(project_dir, command)
     srsly.write_yaml(lock_path, data)


-def get_lock_entry(
-    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
-) -> Dict[str, Any]:
+def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]:
     """Get a lockfile entry for a given command. An entry includes the command,
     the script (command steps) and a list of dependencies and outputs with
     their paths and file hashes, if available. The format is based on the
@@ -231,12 +212,11 @@ def get_lock_entry(

     project_dir (Path): The current project directory.
     command (Dict[str, Any]): The command, as defined in the project.yml.
-    variables (Dict[str, Any]): The variables defined in the project.yml.
     RETURNS (Dict[str, Any]): The lockfile entry.
     """
-    deps = get_fileinfo(project_dir, command.get("deps", []), variables)
-    outs = get_fileinfo(project_dir, command.get("outputs", []), variables)
-    outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []), variables)
+    deps = get_fileinfo(project_dir, command.get("deps", []))
+    outs = get_fileinfo(project_dir, command.get("outputs", []))
+    outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []))
     return {
         "cmd": f"{COMMAND} run {command['name']}",
         "script": command["script"],
@@ -245,20 +225,16 @@ def get_lock_entry(
     }


-def get_fileinfo(
-    project_dir: Path, paths: List[str], variables: Dict[str, Any]
-) -> List[Dict[str, str]]:
+def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, str]]:
     """Generate the file information for a list of paths (dependencies, outputs).
     Includes the file path and the file's checksum.

     project_dir (Path): The current project directory.
     paths (List[str]): The file paths.
-    variables (Dict[str, Any]): The variables defined in the project.yml.
     RETURNS (List[Dict[str, str]]): The lockfile entry for a file.
     """
     data = []
     for path in paths:
-        path = path.format(**variables)
         file_path = project_dir / path
         md5 = get_checksum(file_path) if file_path.exists() else None
         data.append({"path": path, "md5": md5})

@@ -303,7 +303,7 @@ class ProjectConfigCommand(BaseModel):

 class ProjectConfigSchema(BaseModel):
     # fmt: off
-    variables: Dict[StrictStr, Union[str, int, float, bool]] = Field({}, title="Optional variables to substitute in commands")
+    vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands")
     assets: List[ProjectConfigAsset] = Field([], title="Data assets")
     workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
     commands: List[ProjectConfigCommand] = Field([], title="Project command shortcuts")

@@ -6,9 +6,12 @@ from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
 from spacy.cli.pretrain import make_docs
 from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
 from spacy.util import get_lang_class
+from spacy.cli._util import load_project_config, substitute_project_variables
+from thinc.config import ConfigValidationError
+import srsly

 from .util import make_tempdir


 def test_cli_converters_conllu2json():
     # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
@@ -295,6 +298,24 @@ def test_project_config_validation2(config, n_errors):
     assert len(errors) == n_errors


+def test_project_config_interpolation():
+    variables = {"a": 10, "b": {"c": "foo", "d": True}}
+    commands = [
+        {"name": "x", "script": ["hello ${vars.a} ${vars.b.c}"]},
+        {"name": "y", "script": ["${vars.b.c} ${vars.b.d}"]},
+    ]
+    project = {"commands": commands, "vars": variables}
+    with make_tempdir() as d:
+        srsly.write_yaml(d / "project.yml", project)
+        cfg = load_project_config(d)
+    assert cfg["commands"][0]["script"][0] == "hello 10 foo"
+    assert cfg["commands"][1]["script"][0] == "foo true"
+    commands = [{"name": "x", "script": ["hello ${vars.a} ${vars.b.e}"]}]
+    project = {"commands": commands, "vars": variables}
+    with pytest.raises(ConfigValidationError):
+        substitute_project_variables(project)
+
+
 @pytest.mark.parametrize(
     "args,expected",
     [

@@ -1,5 +1,5 @@
 from typing import List, Union, Dict, Any, Optional, Iterable, Callable, Tuple
-from typing import Iterator, Type, Pattern, TYPE_CHECKING
+from typing import Iterator, Type, Pattern, Generator, TYPE_CHECKING
 from types import ModuleType
 import os
 import importlib
@@ -610,7 +610,7 @@ def working_dir(path: Union[str, Path]) -> None:


 @contextmanager
-def make_tempdir() -> None:
+def make_tempdir() -> Generator[Path, None, None]:
     """Execute a block in a temporary directory and remove the directory and
     its contents at the end of the with block.

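The new `Generator[Path, None, None]` annotation matches how the helper is actually consumed as a context manager. A minimal usage sketch (the file name here is made up):

```python
from spacy.util import make_tempdir

with make_tempdir() as tmp:  # yields a Path; the directory is removed on exit
    (tmp / "example.txt").write_text("scratch data", encoding="utf8")
```
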
@@ -847,6 +847,92 @@ $ python -m spacy project run [subcommand] [project_dir] [--force] [--dry]
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
 | **EXECUTES** | The command defined in the `project.yml`. |

+### project push {#project-push tag="command"}
+
+Upload all available files or directories listed in the `outputs` section of
+commands to a remote storage. Outputs are archived and compressed prior to
+upload, and addressed in the remote storage using the output's relative path
+(URL encoded), a hash of its command string and dependencies, and a hash of its
+file contents. This means `push` should **never overwrite** a file in your
+remote. If all the hashes match, the contents are the same and nothing happens.
+If the contents are different, the new version of the file is uploaded. Deleting
+obsolete files is left up to you.
+
+Remotes can be defined in the `remotes` section of the
+[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses the
+[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to
+communicate with the remote storages, so you can use any protocol that
+`smart-open` supports, including [S3](https://aws.amazon.com/s3/),
+[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although
+you may need to install extra dependencies to use certain protocols.
+
+```cli
+$ python -m spacy project push [remote] [project_dir]
+```
+
+> #### Example
+>
+> ```cli
+> $ python -m spacy project push my_bucket
+> ```
+>
+> ```yaml
+> ### project.yml
+> remotes:
+>   my_bucket: 's3://my-spacy-bucket'
+> ```
+
+| Name | Description |
+| -------------- | --------------------------------------------------------------------------------------- |
+| `remote` | The name of the remote to upload to. Defaults to `"default"`. ~~str (positional)~~ |
+| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
+| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
+| **UPLOADS** | All project outputs that exist and are not already stored in the remote. |
+
+### project pull {#project-pull tag="command"}
+
+Download all files or directories listed as `outputs` for commands, if they are
+not already present locally. When searching for files in the remote, `pull`
+won't just look at the output path, but will also consider the **command
+string** and the **hashes of the dependencies**. For instance, let's say you've
+previously pushed a model checkpoint to the remote, but now you've changed some
+hyper-parameters. Because you've changed the inputs to the command, if you run
+`pull`, you won't retrieve the stale result. If you train your model and push
+the outputs to the remote, the outputs will be saved alongside the prior
+outputs, so if you change the config back, you'll be able to fetch back the
+result.
+
+Remotes can be defined in the `remotes` section of the
+[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses the
+[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to
+communicate with the remote storages, so you can use any protocol that
+`smart-open` supports, including [S3](https://aws.amazon.com/s3/),
+[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although
+you may need to install extra dependencies to use certain protocols.
+
+```cli
+$ python -m spacy project pull [remote] [project_dir]
+```
+
+> #### Example
+>
+> ```cli
+> $ python -m spacy project pull my_bucket
+> ```
+>
+> ```yaml
+> ### project.yml
+> remotes:
+>   my_bucket: 's3://my-spacy-bucket'
+> ```
+
+| Name | Description |
+| -------------- | --------------------------------------------------------------------------------------- |
+| `remote` | The name of the remote to download from. Defaults to `"default"`. ~~str (positional)~~ |
+| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
+| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
+| **DOWNLOADS** | All project outputs that do not exist locally and can be found in the remote. |
+
 ### project dvc {#project-dvc tag="command"}

 Auto-generate [Data Version Control](https://dvc.org) (DVC) config file. Calls

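Since remotes resolve through `smart-open`/`pathy`, the low-level helpers added to `spacy/cli/_util.py` in this commit work the same for local paths and bucket URLs. A hedged sketch (the bucket and file names are made up, and S3 access assumes the relevant `smart_open` extras and credentials are set up):

```python
from pathlib import Path
from spacy.cli._util import upload_file, download_file

# Push a single file to a remote location, then fetch it back
upload_file(Path("training/model-best/meta.json"), "s3://my-spacy-bucket/meta.json")
download_file("s3://my-spacy-bucket/meta.json", Path("/tmp/meta.json"), force=True)
```
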
website/docs/images/projects.svg (new file, 91 lines)
File diff suppressed because one or more lines are too long. After: 40 KiB.

@@ -5,9 +5,12 @@ menu:
   - ['Intro & Workflow', 'intro']
   - ['Directory & Assets', 'directory']
   - ['Custom Projects', 'custom']
+  - ['Remote Storage', 'remote']
   - ['Integrations', 'integrations']
 ---

+## Introduction and workflow {#intro hidden="true"}
+
 > #### 🪐 Project templates
 >
 > Our [`projects`](https://github.com/explosion/projects) repo includes various
@@ -19,20 +22,17 @@ spaCy projects let you manage and share **end-to-end spaCy workflows** for
 different **use cases and domains**, and orchestrate training, packaging and
 serving your custom models. You can start off by cloning a pre-defined project
 template, adjust it to fit your needs, load in your data, train a model, export
-it as a Python package and share the project templates with your team. spaCy
-projects can be used via the new [`spacy project`](/api/cli#project) command.
-For an overview of the available project templates, check out the
-[`projects`](https://github.com/explosion/projects) repo. spaCy projects also
-[integrate](#integrations) with many other cool machine learning and data
-science tools to track and manage your data and experiments, iterate on demos
-and prototypes and ship your models into production.
+it as a Python package, upload your outputs to a remote storage and share your
+results with your team. spaCy projects can be used via the new
+[`spacy project`](/api/cli#project) command and we provide templates in our
+[`projects`](https://github.com/explosion/projects) repo.

+<!-- TODO: mention integrations -->

-## Introduction and workflow {#intro}
-
-<!-- TODO: decide how to introduce concept -->
+![Illustration of project workflow and commands](../images/projects.svg)

 <!-- TODO:
 <Project id="some_example_project">
@@ -155,8 +155,8 @@ other. For instance, to generate a packaged model, you might start by converting
 your data, then run [`spacy train`](/api/cli#train) to train your model on the
 converted data and if that's successful, run [`spacy package`](/api/cli#package)
 to turn the best model artifact into an installable Python package. The
-following command runs the workflow named `all` defined in the `project.yml`, and
-executes the commands it specifies, in order:
+following command runs the workflow named `all` defined in the `project.yml`,
+and executes the commands it specifies, in order:

 ```cli
 $ python -m spacy project run all
@@ -171,6 +171,31 @@ advanced data pipelines and track your changes in Git, check out the
 from a workflow defined in your `project.yml` so you can manage your spaCy
 project as a DVC repo.

+### 5. Optional: Push to remote storage {#push}
+
+> ```yaml
+> ### project.yml
+> remotes:
+>   default: 's3://my-spacy-bucket'
+>   local: '/mnt/scratch/cache'
+> ```
+
+After training a model, you can optionally use the
+[`spacy project push`](/api/cli#project-push) command to upload your outputs to
+a remote storage, using protocols like [S3](https://aws.amazon.com/s3/),
+[Google Cloud Storage](https://cloud.google.com/storage) or SSH. This can help
+you **export** your model packages, **share** work with your team, or **cache
+results** to avoid repeating work.
+
+```cli
+$ python -m spacy project push
+```
+
+The `remotes` section in your `project.yml` lets you assign names to the
+different storages. To download state from a remote storage, you can use the
+[`spacy project pull`](/api/cli#project-pull) command. For more details, see the
+docs on [remote storage](#remote).
+
 ## Project directory and assets {#directory}

 ### project.yml {#project-yml}
@@ -190,7 +215,7 @@ https://github.com/explosion/spacy-boilerplates/blob/master/ner_fashion/project.

 | Section | Description |
 | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `variables` | A dictionary of variables that can be referenced in paths, URLs and scripts. For example, `{NAME}` will use the value of the variable `NAME`. |
+| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
 | `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
 | `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. |
 | `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
@@ -349,9 +374,9 @@ if __name__ == "__main__":

 In your `project.yml`, you can then run the script by calling
 `python scripts/custom_evaluation.py` with the function arguments. You can also
-use the `variables` section to define reusable variables that will be
-substituted in commands, paths and URLs. In this example, the `BATCH_SIZE` is
-defined as a variable will be added in place of `{BATCH_SIZE}` in the script.
+use the `vars` section to define reusable variables that will be substituted in
+commands, paths and URLs. In this example, the batch size is defined as a
+variable and will be added in place of `${vars.batch_size}` in the script.

 > #### Calling into Python
 >
@@ -363,13 +388,13 @@ defined as a variable will be added in place of `{BATCH_SIZE}` in the script.
 <!-- prettier-ignore -->
 ```yaml
 ### project.yml
-variables:
-  BATCH_SIZE: 128
+vars:
+  batch_size: 128

 commands:
   - name: evaluate
     script:
-    - 'python scripts/custom_evaluation.py {BATCH_SIZE} ./training/model-best ./corpus/eval.json'
+    - 'python scripts/custom_evaluation.py ${batch_size} ./training/model-best ./corpus/eval.json'
     deps:
     - 'training/model-best'
     - 'corpus/eval.json'
@@ -421,6 +446,114 @@ assets:
       checksum: '5113dc04e03f079525edd8df3f4f39e3'
 ```

+## Remote Storage {#remote}
+
+You can persist your project outputs to a remote storage using the
+[`project push`](/api/cli#project-push) command. This can help you **export**
+your model packages, **share** work with your team, or **cache results** to
+avoid repeating work. The [`project pull`](/api/cli#project-pull) command will
+download any outputs that are in the remote storage and aren't available
+locally.
+
+You can list one or more remotes in the `remotes` section of your
+[`project.yml`](#project-yml) by mapping a string name to the URL of the
+storage. Under the hood, spaCy uses the
+[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to
+communicate with the remote storages, so you can use any protocol that
+`smart-open` supports, including [S3](https://aws.amazon.com/s3/),
+[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although
+you may need to install extra dependencies to use certain protocols.
+
+> #### Example
+>
+> ```cli
+> $ python -m spacy project pull local
+> ```
+
+```yaml
+### project.yml
+remotes:
+  default: 's3://my-spacy-bucket'
+  local: '/mnt/scratch/cache'
+  stuff: 'ssh://myserver.example.com/whatever'
+```
+
+<Infobox title="How it works" emoji="💡">
+
+Inside the remote storage, spaCy uses a clever **directory structure** to avoid
+overwriting files. The top level of the directory structure is a URL-encoded
+version of the output's path. Within this directory are subdirectories named
+according to a hash of the command string and the command's dependencies.
+Finally, within those directories are files, named according to an MD5 hash of
+their contents.
+
+<!-- TODO: update with actual real example? -->
+
+<!-- prettier-ignore -->
+```yaml
+└── urlencoded_file_path            # Path of original file
+    ├── some_command_hash           # Hash of command you ran
+    │   ├── some_content_hash       # Hash of file content
+    │   └── another_content_hash
+    └── another_command_hash
+        └── third_content_hash
+```
+
+</Infobox>
+
+For instance, let's say you had the following command in your `project.yml`:
+
+```yaml
+### project.yml
+- name: train
+  help: 'Train a spaCy model using the specified corpus and config'
+  script:
+    - 'spacy train ./config.cfg --output training/'
+  deps:
+    - 'corpus/train'
+    - 'corpus/dev'
+    - 'config.cfg'
+  outputs:
+    - 'training/model-best'
+```

+> #### Example
+>
+> ```
+> └── s3://my-spacy-bucket/training%2Fmodel-best
+>     └── 1d8cb33a06cc345ad3761c6050934a1b
+>         └── d8e20c3537a084c5c10d95899fe0b1ff
+> ```
+
+After you finish training, you run [`project push`](/api/cli#project-push) to
+make sure the `training/model-best` output is saved to remote storage. spaCy
+will then construct a hash from your command script and the listed dependencies,
+`corpus/train`, `corpus/dev` and `config.cfg`, in order to identify the
+execution context of your output. It would then compute an MD5 hash of the
+`training/model-best` directory, and use those three pieces of information to
+construct the storage URL.
+
+```cli
+$ python -m spacy project run train
+$ python -m spacy project push
+```
+
+If you change the command or one of its dependencies (for instance, by editing
+the [`config.cfg`](/usage/training#config) file to tune the hyperparameters), a
+different creation hash will be calculated, so when you use
+[`project push`](/api/cli#project-push) you won't be overwriting your previous
+file. The system even supports multiple outputs for the same file and the same
+context, which can happen if your training process is not deterministic, or if
+you have dependencies that aren't represented in the command.
+
+In summary, the [`spacy project`](/api/cli#project) remote storages are designed
+to make a particular set of trade-offs. Priority is placed on **convenience**,
+**correctness** and **avoiding data loss**. You can use
+[`project push`](/api/cli#project-push) freely, as you'll never overwrite remote
+state, and you don't have to come up with names or version numbers. However,
+it's up to you to manage the size of your remote storage, and to remove files
+that are no longer relevant to you.
+
 ## Integrations {#integrations}

 ### Data Version Control (DVC) {#dvc} <IntegrationLogo name="dvc" title="DVC" width={70} height="auto" align="right" />
@@ -517,16 +650,17 @@ and evaluation set.
 <!-- prettier-ignore -->
 ```yaml
 ### project.yml
-variables:
-  PRODIGY_DATASET: 'ner_articles'
-  PRODIGY_LABELS: 'PERSON,ORG,PRODUCT'
-  PRODIGY_MODEL: 'en_core_web_md'
+vars:
+  prodigy:
+    dataset: 'ner_articles'
+    labels: 'PERSON,ORG,PRODUCT'
+    model: 'en_core_web_md'

 commands:
   - name: annotate
   - script:
-    - 'python -m prodigy ner.correct {PRODIGY_DATASET} ./assets/raw_data.jsonl {PRODIGY_MODEL} --labels {PRODIGY_LABELS}'
-    - 'python -m prodigy data-to-spacy ./corpus/train.json ./corpus/eval.json --ner {PRODIGY_DATASET}'
+    - 'python -m prodigy ner.correct ${vars.prodigy.dataset} ./assets/raw_data.jsonl ${vars.prodigy.model} --labels ${vars.prodigy.labels}'
+    - 'python -m prodigy data-to-spacy ./corpus/train.json ./corpus/eval.json --ner ${vars.prodigy.dataset}'
     - 'python -m spacy convert ./corpus/train.json ./corpus/train.spacy'
     - 'python -m spacy convert ./corpus/eval.json ./corpus/eval.spacy'
   - deps:

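To make the addressing scheme in the docs above concrete, here is a hedged sketch of how the storage URL in the example tree could be derived, mirroring `RemoteStorage.make_url`, `encode_name` and `get_command_hash` from this commit (the command-hash input is shortened for illustration — the real hash also folds in dependency checksums — and the content hash is the illustrative value from the docs example):

```python
import hashlib
import urllib.parse

remote = "s3://my-spacy-bucket"
output_path = "training/model-best"

# Top level: URL-encoded project path, as in RemoteStorage.encode_name
name = urllib.parse.quote_plus(output_path)  # 'training%2Fmodel-best'

# Second level: MD5 over the command string (plus dependency checksums in
# the real get_command_hash; omitted here for brevity)
command_hash = hashlib.md5(
    "spacy train ./config.cfg --output training/".encode("utf8")
).hexdigest()

# Third level: MD5 checksum of the output's contents (get_content_hash)
content_hash = "d8e20c3537a084c5c10d95899fe0b1ff"  # illustrative value

url = f"{remote}/{name}/{command_hash}/{content_hash}"
```
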
@@ -104,11 +104,15 @@ spaCy projects let you manage and share **end-to-end spaCy workflows** for
 different **use cases and domains**, and orchestrate training, packaging and
 serving your custom models. You can start off by cloning a pre-defined project
 template, adjust it to fit your needs, load in your data, train a model, export
-it as a Python package and share the project templates with your team. spaCy
-projects also make it easy to **integrate with other tools** in the data science
-and machine learning ecosystem, including [DVC](/usage/projects#dvc) for data
-version control, [Prodigy](/usage/projects#prodigy) for creating labelled data,
-[Streamlit](/usage/projects#streamlit) for building interactive apps,
+it as a Python package, upload your outputs to a remote storage and share your
+results with your team.
+
+![Illustration of project workflow and commands](../images/projects.svg)
+
+spaCy projects also make it easy to **integrate with other tools** in the data
+science and machine learning ecosystem, including [DVC](/usage/projects#dvc) for
+data version control, [Prodigy](/usage/projects#prodigy) for creating labelled
+data, [Streamlit](/usage/projects#streamlit) for building interactive apps,
 [FastAPI](/usage/projects#fastapi) for serving models in production,
 [Ray](/usage/projects#ray) for parallel training,
 [Weights & Biases](/usage/projects#wandb) for experiment tracking, and more!

|
|||
import { isString } from './util'
|
||||
import classes from '../styles/table.module.sass'
|
||||
|
||||
const FOOT_ROW_REGEX = /^(RETURNS|YIELDS|CREATES|PRINTS|EXECUTES|UPLOADS|DOWNLOADS)/
|
||||
|
||||
function isNum(children) {
|
||||
return isString(children) && /^\d+[.,]?[\dx]+?(|x|ms|mb|gb|k|m)?$/i.test(children)
|
||||
}
|
||||
|
@ -43,7 +45,6 @@ function isDividerRow(children) {
|
|||
}
|
||||
|
||||
function isFootRow(children) {
|
||||
const rowRegex = /^(RETURNS|YIELDS|CREATES|PRINTS|EXECUTES)/
|
||||
if (children.length && children[0].props.name === 'td') {
|
||||
const cellChildren = children[0].props.children
|
||||
if (
|
||||
|
@ -52,7 +53,7 @@ function isFootRow(children) {
|
|||
cellChildren.props.children &&
|
||||
isString(cellChildren.props.children)
|
||||
) {
|
||||
return rowRegex.test(cellChildren.props.children)
|
||||
return FOOT_ROW_REGEX.test(cellChildren.props.children)
|
||||
}
|
||||
}
|
||||
return false
|
||||
|
|