Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 17:06:29 +03:00)
Allow spacy project to push and pull to/from remote storage (#5949)
* Add utils for working with remote storage
* WIP add remote_cache for project
* WIP add push and pull commands
* Use pathy in remote_cache
* Update util
* Update remote_cache
* Update util
* Update project assets
* Update pull script
* Update push script
* Fix type annotation in util
* Work on remote storage
* Remove site and env hash
* Fix imports
* Fix type annotation
* Require pathy
* Require pathy
* Fix import
* Add a util to handle project variable substitution
* Import push and pull commands
* Fix pull command
* Fix push command
* Fix tarfile in remote_storage
* Improve printing
* Fiddle with status messages
* Set version to v3.0.0a9
* Draft docs for spacy project remote storages
* Update docs [ci skip]
* Use Thinc config to simplify and unify template variables
* Auto-format
* Don't import Pathy globally for now: causes a slow and annoying Google Cloud warning
* Tidy up test
* Tidy up and update tests
* Update to latest Thinc
* Update docs
* variables -> vars
* Update docs [ci skip]
* Update docs [ci skip]

Co-authored-by: Ines Montani <ines@ines.io>
This commit is contained in: parent 9bdc9e81f5, commit e559867605
pyproject.toml
@@ -6,9 +6,10 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a28,<8.0.0a30",
+    "thinc>=8.0.0a29,<8.0.0a40",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
-    "smart_open>=2.0.0,<3.0.0"
+    "smart_open>=2.0.0,<3.0.0",
+    "pathy"
 ]
 build-backend = "setuptools.build_meta"
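The new `pathy` dependency is what the remote-storage code later in this commit builds on: it exposes cloud bucket URLs through a pathlib-style API. A minimal sketch of the calls this commit relies on (bucket and key are hypothetical):

```python
from pathy import Pathy

# Pathlib-style operations on a cloud URL; the bucket and key are hypothetical
url = Pathy("s3://my-spacy-bucket/outputs/model.tar.gz")
if not url.exists():
    with url.open(mode="wb") as output_file:
        output_file.write(b"...archive bytes...")
print(url.parts[-1])  # -> "model.tar.gz"
```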
requirements.txt
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a28,<8.0.0a30
+thinc>=8.0.0a29,<8.0.0a40
 blis>=0.4.0,<0.5.0
 ml_datasets>=0.1.1
 murmurhash>=0.28.0,<1.1.0
@@ -9,6 +9,7 @@ wasabi>=0.7.1,<1.1.0
 srsly>=2.1.0,<3.0.0
 catalogue>=0.0.7,<1.1.0
 typer>=0.3.0,<0.4.0
+pathy
 # Third party dependencies
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
setup.cfg
@@ -34,18 +34,19 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a28,<8.0.0a30
+    thinc>=8.0.0a29,<8.0.0a40
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a28,<8.0.0a30
+    thinc>=8.0.0a29,<8.0.0a40
     blis>=0.4.0,<0.5.0
     wasabi>=0.7.1,<1.1.0
     srsly>=2.1.0,<3.0.0
     catalogue>=0.0.7,<1.1.0
     typer>=0.3.0,<0.4.0
+    pathy
     # Third-party dependencies
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a8"
+__version__ = "3.0.0a9"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
spacy/cli/__init__.py
@@ -21,6 +21,8 @@ from .project.clone import project_clone  # noqa: F401
 from .project.assets import project_assets  # noqa: F401
 from .project.run import project_run  # noqa: F401
 from .project.dvc import project_update_dvc  # noqa: F401
+from .project.push import project_push  # noqa: F401
+from .project.pull import project_pull  # noqa: F401


 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
spacy/cli/_util.py
@@ -1,4 +1,5 @@
-from typing import Dict, Any, Union, List, Optional
+from typing import Dict, Any, Union, List, Optional, TYPE_CHECKING
+import sys
 from pathlib import Path
 from wasabi import msg
 import srsly
@@ -8,11 +9,13 @@ from typer.main import get_command
 from contextlib import contextmanager
 from thinc.config import Config, ConfigValidationError
 from configparser import InterpolationError
-import sys

 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file

+if TYPE_CHECKING:
+    from pathy import Pathy  # noqa: F401
+

 PROJECT_FILE = "project.yml"
 PROJECT_LOCK = "project.lock"
@@ -93,11 +96,12 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
     return result


-def load_project_config(path: Path) -> Dict[str, Any]:
+def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]:
     """Load the project.yml file from a directory and validate it. Also make
     sure that all directories defined in the config exist.

     path (Path): The path to the project directory.
+    interpolate (bool): Whether to substitute project variables.
     RETURNS (Dict[str, Any]): The loaded project.yml.
     """
     config_path = path / PROJECT_FILE
@@ -119,9 +123,25 @@ def load_project_config(path: Path) -> Dict[str, Any]:
         dir_path = path / subdir
         if not dir_path.exists():
             dir_path.mkdir(parents=True)
+    if interpolate:
+        err = "project.yml validation error"
+        with show_validation_error(title=err, hint_fill=False):
+            config = substitute_project_variables(config)
     return config


+def substitute_project_variables(config: Dict[str, Any], overrides: Dict = {}):
+    key = "vars"
+    config.setdefault(key, {})
+    config[key].update(overrides)
+    # Need to put variables in the top scope again so we can have a top-level
+    # section "project" (otherwise, a list of commands in the top scope wouldn't
+    # be allowed by Thinc's config system)
+    cfg = Config({"project": config, key: config[key]})
+    interpolated = cfg.interpolate()
+    return dict(interpolated["project"])
+
+
 def validate_project_commands(config: Dict[str, Any]) -> None:
     """Check that project commands and workflows are valid, don't contain
     duplicates, don't clash and only refer to commands that exist.
@@ -232,3 +252,39 @@ def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
         for name, cfg in config.get("components", {}).items()
         if "factory" not in cfg and "source" in cfg
     ]
+
+
+def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
+    """Upload a file.
+
+    src (Path): The source path.
+    dest (str / Pathy): The destination URL to upload to.
+    """
+    dest = ensure_pathy(dest)
+    with dest.open(mode="wb") as output_file:
+        with src.open(mode="rb") as input_file:
+            output_file.write(input_file.read())
+
+
+def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) -> None:
+    """Download a file using smart_open.
+
+    src (str / Pathy): The URL of the file.
+    dest (Path): The destination path.
+    force (bool): Whether to force download even if file exists.
+        If False, the download will be skipped.
+    """
+    if dest.exists() and not force:
+        return None
+    src = ensure_pathy(src)
+    with src.open(mode="rb") as input_file:
+        with dest.open(mode="wb") as output_file:
+            output_file.write(input_file.read())
+
+
+def ensure_pathy(path):
+    """Temporary helper to prevent importing Pathy globally (which can cause
+    slow and annoying Google Cloud warning)."""
+    from pathy import Pathy  # noqa: F811
+
+    return Pathy(path)
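The new `substitute_project_variables` helper delegates all the work to Thinc's config interpolation, so its behavior is easiest to see in isolation. A minimal standalone sketch of the mechanism it relies on (values are illustrative):

```python
from thinc.config import Config

# What substitute_project_variables does internally: nest the project under a
# "project" section and expose the variables as a top-level "vars" section, so
# ${vars.x} references can be resolved by Thinc's interpolation.
project = {
    "vars": {"name": "ner_demo", "version": 1},
    "commands": [{"name": "train", "script": ["python train.py ${vars.name}-${vars.version}"]}],
}
cfg = Config({"project": project, "vars": project["vars"]})
resolved = dict(cfg.interpolate()["project"])
print(resolved["commands"][0]["script"][0])  # -> "python train.py ner_demo-1"
```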
spacy/cli/project/assets.py
@@ -4,10 +4,10 @@ from wasabi import msg
 import re
 import shutil
 import requests
-import smart_open

 from ...util import ensure_path, working_dir
 from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum
+from .._util import download_file


 # TODO: find a solution for caches
@@ -44,16 +44,14 @@ def project_assets(project_dir: Path) -> None:
     if not assets:
         msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
     msg.info(f"Fetching {len(assets)} asset(s)")
-    variables = config.get("variables", {})
     for asset in assets:
-        dest = asset["dest"].format(**variables)
+        dest = asset["dest"]
         url = asset.get("url")
         checksum = asset.get("checksum")
         if not url:
             # project.yml defines asset without URL that the user has to place
             check_private_asset(dest, checksum)
             continue
-        url = url.format(**variables)
         fetch_asset(project_path, url, dest, checksum)
@@ -132,15 +130,3 @@ def convert_asset_url(url: str) -> str:
         )
         return converted
     return url
-
-
-def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
-    """Download a file using smart_open.
-
-    url (str): The URL of the file.
-    dest (Path): The destination path.
-    chunk_size (int): The size of chunks to read/write.
-    """
-    with smart_open.open(url, mode="rb") as input_file:
-        with dest.open(mode="wb") as output_file:
-            output_file.write(input_file.read())
spacy/cli/project/dvc.py
@@ -99,7 +99,6 @@ def update_dvc_config(
     if ref_hash == config_hash and not force:
         return False  # Nothing has changed in project.yml, don't need to update
     dvc_config_path.unlink()
-    variables = config.get("variables", {})
     dvc_commands = []
     config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
     for name in workflows[workflow]:
@@ -122,7 +121,7 @@ def update_dvc_config(
         dvc_commands.append(join_command(full_cmd))
     with working_dir(path):
         dvc_flags = {"--verbose": verbose, "--quiet": silent}
-        run_dvc_commands(dvc_commands, variables, flags=dvc_flags)
+        run_dvc_commands(dvc_commands, flags=dvc_flags)
     with dvc_config_path.open("r+", encoding="utf8") as f:
         content = f.read()
         f.seek(0, 0)
@@ -131,23 +130,16 @@ def update_dvc_config(


 def run_dvc_commands(
-    commands: List[str] = tuple(),
-    variables: Dict[str, str] = {},
-    flags: Dict[str, bool] = {},
+    commands: List[str] = tuple(), flags: Dict[str, bool] = {},
 ) -> None:
     """Run a sequence of DVC commands in a subprocess, in order.

     commands (List[str]): The string commands without the leading "dvc".
-    variables (Dict[str, str]): Dictionary of variable names, mapped to their
-        values. Will be used to substitute format string variables in the
-        commands.
     flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
         easier to pass flags like --quiet that depend on a variable or
         command-line setting while avoiding lots of nested conditionals.
     """
     for command in commands:
-        # Substitute variables, e.g. "./{NAME}.json"
-        command = command.format(**variables)
         command = split_command(command)
         dvc_command = ["dvc", *command]
         # Add the flags if they are set to True
new file: spacy/cli/project/pull.py (36 lines)
@@ -0,0 +1,36 @@
+from pathlib import Path
+from wasabi import msg
+from .remote_storage import RemoteStorage
+from .remote_storage import get_command_hash
+from .._util import project_cli, Arg
+from .._util import load_project_config
+
+
+@project_cli.command("pull")
+def project_pull_cli(
+    # fmt: off
+    remote: str = Arg("default", help="Name or path of remote storage"),
+    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
+    # fmt: on
+):
+    """Retrieve any precomputed outputs from a remote storage that are available.
+    You can alias remotes in your project.yml by mapping them to storage paths.
+    A storage can be anything that the smart-open library can upload to, e.g.
+    GCS, AWS, SSH, local directories etc.
+    """
+    for url, output_path in project_pull(project_dir, remote):
+        if url is not None:
+            msg.good(f"Pulled {output_path} from {url}")
+
+
+def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
+    config = load_project_config(project_dir)
+    if remote in config.get("remotes", {}):
+        remote = config["remotes"][remote]
+    storage = RemoteStorage(project_dir, remote)
+    for cmd in config.get("commands", []):
+        deps = [project_dir / dep for dep in cmd.get("deps", [])]
+        cmd_hash = get_command_hash("", "", deps, cmd["script"])
+        for output_path in cmd.get("outputs", []):
+            url = storage.pull(output_path, command_hash=cmd_hash)
+            yield url, output_path
new file: spacy/cli/project/push.py (48 lines)
@@ -0,0 +1,48 @@
+from pathlib import Path
+from wasabi import msg
+from .remote_storage import RemoteStorage
+from .remote_storage import get_content_hash, get_command_hash
+from .._util import load_project_config
+from .._util import project_cli, Arg
+
+
+@project_cli.command("push")
+def project_push_cli(
+    # fmt: off
+    remote: str = Arg("default", help="Name or path of remote storage"),
+    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
+    # fmt: on
+):
+    """Persist outputs to a remote storage. You can alias remotes in your project.yml
+    by mapping them to storage paths. A storage can be anything that the smart-open
+    library can upload to, e.g. GCS, AWS, SSH, local directories etc.
+    """
+    for output_path, url in project_push(project_dir, remote):
+        if url is None:
+            msg.info(f"Skipping {output_path}")
+        else:
+            msg.good(f"Pushed {output_path} to {url}")
+
+
+def project_push(project_dir: Path, remote: str):
+    """Persist outputs to a remote storage. You can alias remotes in your project.yml
+    by mapping them to storage paths. A storage can be anything that the smart-open
+    library can upload to, e.g. GCS, AWS, SSH, local directories etc.
+    """
+    config = load_project_config(project_dir)
+    if remote in config.get("remotes", {}):
+        remote = config["remotes"][remote]
+    storage = RemoteStorage(project_dir, remote)
+    for cmd in config.get("commands", []):
+        cmd_hash = get_command_hash(
+            "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
+        )
+        for output_path in cmd.get("outputs", []):
+            output_loc = project_dir / output_path
+            if output_loc.exists():
+                url = storage.push(
+                    output_path,
+                    command_hash=cmd_hash,
+                    content_hash=get_content_hash(output_loc),
+                )
+                yield output_path, url
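Both new CLI commands are thin wrappers around generator functions, so the same logic can be driven from Python. A hedged usage sketch (the project directory and remote name are hypothetical):

```python
from pathlib import Path
from spacy.cli import project_push, project_pull

project_dir = Path("my_project")  # hypothetical project directory
# Upload outputs that aren't in the remote yet; url is None for skipped files
for output_path, url in project_push(project_dir, "default"):
    print("skipped" if url is None else f"pushed to {url}", output_path)
# Download matching outputs that aren't present locally yet
for url, output_path in project_pull(project_dir, "default"):
    if url is not None:
        print(f"pulled {output_path} from {url}")
```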
new file: spacy/cli/project/remote_storage.py (169 lines)
@@ -0,0 +1,169 @@
+from typing import Optional, List, Dict, TYPE_CHECKING
+import os
+import site
+import hashlib
+import urllib.parse
+import tarfile
+from pathlib import Path
+
+from .._util import get_hash, get_checksum, download_file, ensure_pathy
+from ...util import make_tempdir
+
+if TYPE_CHECKING:
+    from pathy import Pathy  # noqa: F401
+
+
+class RemoteStorage:
+    """Push and pull outputs to and from a remote file storage.
+
+    Remotes can be anything that `smart-open` can support: AWS, GCS, file system,
+    SSH, etc.
+    """
+
+    def __init__(self, project_root: Path, url: str, *, compression="gz"):
+        self.root = project_root
+        self.url = ensure_pathy(url)
+        self.compression = compression
+
+    def push(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
+        """Compress a file or directory within a project and upload it to a remote
+        storage. If an object exists at the full URL, nothing is done.
+
+        Within the remote storage, files are addressed by their project path
+        (url encoded) and two user-supplied hashes, representing their creation
+        context and their file contents. If the URL already exists, the data is
+        not uploaded. Paths are archived and compressed prior to upload.
+        """
+        loc = self.root / path
+        if not loc.exists():
+            raise IOError(f"Cannot push {loc}: does not exist.")
+        url = self.make_url(path, command_hash, content_hash)
+        if url.exists():
+            return None
+        tmp: Path
+        with make_tempdir() as tmp:
+            tar_loc = tmp / self.encode_name(str(path))
+            mode_string = f"w:{self.compression}" if self.compression else "w"
+            with tarfile.open(tar_loc, mode=mode_string) as tar_file:
+                tar_file.add(str(loc), arcname=str(path))
+            with tar_loc.open(mode="rb") as input_file:
+                with url.open(mode="wb") as output_file:
+                    output_file.write(input_file.read())
+        return url
+
+    def pull(
+        self,
+        path: Path,
+        *,
+        command_hash: Optional[str] = None,
+        content_hash: Optional[str] = None,
+    ) -> Optional["Pathy"]:
+        """Retrieve a file from the remote cache. If the file already exists,
+        nothing is done.
+
+        If the command_hash and/or content_hash are specified, only matching
+        results are returned. If no results are available, an error is raised.
+        """
+        dest = self.root / path
+        if dest.exists():
+            return None
+        url = self.find(path, command_hash=command_hash, content_hash=content_hash)
+        if url is None:
+            return url
+        else:
+            # Make sure the destination exists
+            if not dest.parent.exists():
+                dest.parent.mkdir(parents=True)
+            tmp: Path
+            with make_tempdir() as tmp:
+                tar_loc = tmp / url.parts[-1]
+                download_file(url, tar_loc)
+                mode_string = f"r:{self.compression}" if self.compression else "r"
+                with tarfile.open(tar_loc, mode=mode_string) as tar_file:
+                    # This requires that the path is added correctly, relative
+                    # to root. This is how we set things up in push()
+                    tar_file.extractall(self.root)
+        return url
+
+    def find(
+        self,
+        path: Path,
+        *,
+        command_hash: Optional[str] = None,
+        content_hash: Optional[str] = None,
+    ) -> Optional["Pathy"]:
+        """Find the best matching version of a file within the storage,
+        or `None` if no match can be found. If both the creation and content hash
+        are specified, only exact matches will be returned. Otherwise, the most
+        recent matching file is preferred.
+        """
+        name = self.encode_name(str(path))
+        if command_hash is not None and content_hash is not None:
+            url = self.make_url(path, command_hash, content_hash)
+            urls = [url] if url.exists() else []
+        elif command_hash is not None:
+            urls = list((self.url / name / command_hash).iterdir())
+        else:
+            urls = list((self.url / name).iterdir())
+            if content_hash is not None:
+                urls = [url for url in urls if url.parts[-1] == content_hash]
+        return urls[-1] if urls else None
+
+    def make_url(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
+        """Construct a URL from a subpath, a creation hash and a content hash."""
+        return self.url / self.encode_name(str(path)) / command_hash / content_hash
+
+    def encode_name(self, name: str) -> str:
+        """Encode a subpath into a URL-safe name."""
+        return urllib.parse.quote_plus(name)
+
+
+def get_content_hash(loc: Path) -> str:
+    return get_checksum(loc)
+
+
+def get_command_hash(
+    site_hash: str, env_hash: str, deps: List[Path], cmd: List[str]
+) -> str:
+    """Create a hash representing the execution of a command. This includes the
+    currently installed packages, whatever environment variables have been marked
+    as relevant, and the command.
+    """
+    hashes = [site_hash, env_hash] + [get_checksum(dep) for dep in sorted(deps)]
+    hashes.extend(cmd)
+    creation_bytes = "".join(hashes).encode("utf8")
+    return hashlib.md5(creation_bytes).hexdigest()
+
+
+def get_site_hash():
+    """Hash the current Python environment's site-packages contents, including
+    the name and version of the libraries. The list we're hashing is what
+    `pip freeze` would output.
+    """
+    site_dirs = site.getsitepackages()
+    if site.ENABLE_USER_SITE:
+        # getusersitepackages() returns a single string, so append, not extend
+        site_dirs.append(site.getusersitepackages())
+    packages = set()
+    for site_dir in site_dirs:
+        site_dir = Path(site_dir)
+        for subpath in site_dir.iterdir():
+            if subpath.parts[-1].endswith("dist-info"):
+                packages.add(subpath.parts[-1].replace(".dist-info", ""))
+    package_bytes = "".join(sorted(packages)).encode("utf8")
+    return hashlib.md5(package_bytes).hexdigest()
+
+
+def get_env_hash(env: Dict[str, str]) -> str:
+    """Construct a hash of the environment variables that will be passed into
+    the commands.
+
+    Values in the env dict may be references to the current os.environ, using
+    the syntax $ENV_VAR to mean os.environ[ENV_VAR]
+    """
+    env_vars = {}
+    for key, value in env.items():
+        if value.startswith("$"):
+            env_vars[key] = os.environ.get(value[1:], "")
+        else:
+            env_vars[key] = value
+    return get_hash(env_vars)
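The addressing scheme built by `make_url` and `get_command_hash` can be reproduced in a few lines. A sketch with illustrative values (note that `push.py` currently passes empty strings for the site and env hashes):

```python
import hashlib
import urllib.parse

output_path = "training/model-best"          # path relative to the project root
script = ["python train.py"]                 # the command's script lines
dep_checksums = ["d41d8cd98f00b204e9800998ecf8427e"]  # md5 per dependency

# Mirrors get_command_hash("", "", deps, script): site/env hashes are "" for now
hashes = ["", ""] + dep_checksums + script
command_hash = hashlib.md5("".join(hashes).encode("utf8")).hexdigest()
content_hash = "9ab2..."                     # checksum of the archived output (illustrative)

url = "/".join(["s3://my-spacy-bucket", urllib.parse.quote_plus(output_path),
                command_hash, content_hash])
print(url)  # s3://my-spacy-bucket/training%2Fmodel-best/<command-hash>/<content-hash>
```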
spacy/cli/project/run.py
@@ -44,7 +44,6 @@ def project_run(
     dry (bool): Perform a dry run and don't execute commands.
     """
     config = load_project_config(project_dir)
-    variables = config.get("variables", {})
     commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
     workflows = config.get("workflows", {})
     validate_subcommand(commands.keys(), workflows.keys(), subcommand)
@@ -54,22 +53,20 @@ def project_run(
             project_run(project_dir, cmd, force=force, dry=dry)
     else:
         cmd = commands[subcommand]
-        variables = config.get("variables", {})
         for dep in cmd.get("deps", []):
-            dep = dep.format(**variables)
             if not (project_dir / dep).exists():
                 err = f"Missing dependency specified by command '{subcommand}': {dep}"
                 err_kwargs = {"exits": 1} if not dry else {}
                 msg.fail(err, **err_kwargs)
         with working_dir(project_dir) as current_dir:
-            rerun = check_rerun(current_dir, cmd, variables)
+            rerun = check_rerun(current_dir, cmd)
             if not rerun and not force:
                 msg.info(f"Skipping '{cmd['name']}': nothing changed")
             else:
                 msg.divider(subcommand)
-                run_commands(cmd["script"], variables, dry=dry)
+                run_commands(cmd["script"], dry=dry)
                 if not dry:
-                    update_lockfile(current_dir, cmd, variables)
+                    update_lockfile(current_dir, cmd)


 def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
@@ -115,23 +112,15 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:


 def run_commands(
-    commands: List[str] = tuple(),
-    variables: Dict[str, Any] = {},
-    silent: bool = False,
-    dry: bool = False,
+    commands: List[str] = tuple(), silent: bool = False, dry: bool = False,
 ) -> None:
     """Run a sequence of commands in a subprocess, in order.

     commands (List[str]): The string commands.
-    variables (Dict[str, Any]): Dictionary of variable names, mapped to their
-        values. Will be used to substitute format string variables in the
-        commands.
     silent (bool): Don't print the commands.
     dry (bool): Perform a dry run and don't execute anything.
     """
     for command in commands:
-        # Substitute variables, e.g. "./{NAME}.json"
-        command = command.format(**variables)
         command = split_command(command)
         # Not sure if this is needed or a good idea. Motivation: users may often
         # use commands in their config that reference "python" and we want to
@@ -173,15 +162,12 @@ def validate_subcommand(
     )


-def check_rerun(
-    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
-) -> bool:
+def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool:
     """Check if a command should be rerun because its settings or inputs/outputs
     changed.

     project_dir (Path): The current project directory.
     command (Dict[str, Any]): The command, as defined in the project.yml.
-    variables (Dict[str, Any]): The variables defined in the project.yml.
     RETURNS (bool): Whether to re-run the command.
     """
     lock_path = project_dir / PROJECT_LOCK
@@ -197,19 +183,16 @@ def check_rerun(
     # If the entry in the lockfile matches the lockfile entry that would be
     # generated from the current command, we don't rerun because it means that
     # all inputs/outputs, hashes and scripts are the same and nothing changed
-    return get_hash(get_lock_entry(project_dir, command, variables)) != get_hash(entry)
+    return get_hash(get_lock_entry(project_dir, command)) != get_hash(entry)


-def update_lockfile(
-    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
-) -> None:
+def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
     """Update the lockfile after running a command. Will create a lockfile if
     it doesn't yet exist and will add an entry for the current command, its
     script and dependencies/outputs.

     project_dir (Path): The current project directory.
     command (Dict[str, Any]): The command, as defined in the project.yml.
-    variables (Dict[str, Any]): The variables defined in the project.yml.
     """
     lock_path = project_dir / PROJECT_LOCK
     if not lock_path.exists():
@@ -217,13 +200,11 @@ def update_lockfile(
         data = {}
     else:
         data = srsly.read_yaml(lock_path)
-    data[command["name"]] = get_lock_entry(project_dir, command, variables)
+    data[command["name"]] = get_lock_entry(project_dir, command)
     srsly.write_yaml(lock_path, data)


-def get_lock_entry(
-    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
-) -> Dict[str, Any]:
+def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]:
     """Get a lockfile entry for a given command. An entry includes the command,
     the script (command steps) and a list of dependencies and outputs with
     their paths and file hashes, if available. The format is based on the
@@ -231,12 +212,11 @@ def get_lock_entry(

     project_dir (Path): The current project directory.
     command (Dict[str, Any]): The command, as defined in the project.yml.
-    variables (Dict[str, Any]): The variables defined in the project.yml.
     RETURNS (Dict[str, Any]): The lockfile entry.
     """
-    deps = get_fileinfo(project_dir, command.get("deps", []), variables)
-    outs = get_fileinfo(project_dir, command.get("outputs", []), variables)
-    outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []), variables)
+    deps = get_fileinfo(project_dir, command.get("deps", []))
+    outs = get_fileinfo(project_dir, command.get("outputs", []))
+    outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []))
     return {
         "cmd": f"{COMMAND} run {command['name']}",
         "script": command["script"],
@@ -245,20 +225,16 @@ def get_lock_entry(
     }


-def get_fileinfo(
-    project_dir: Path, paths: List[str], variables: Dict[str, Any]
-) -> List[Dict[str, str]]:
+def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, str]]:
     """Generate the file information for a list of paths (dependencies, outputs).
     Includes the file path and the file's checksum.

     project_dir (Path): The current project directory.
     paths (List[str]): The file paths.
-    variables (Dict[str, Any]): The variables defined in the project.yml.
     RETURNS (List[Dict[str, str]]): The lockfile entry for a file.
     """
     data = []
     for path in paths:
-        path = path.format(**variables)
         file_path = project_dir / path
         md5 = get_checksum(file_path) if file_path.exists() else None
         data.append({"path": path, "md5": md5})
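The net effect of dropping `variables` throughout this file is that a command's lockfile entry is fully determined by the command itself, so the rerun check reduces to comparing two hashes. A sketch of that comparison (the digest helper stands in for `spacy.util.get_hash`, assumed here to be a stable digest of the JSON-serialized entry):

```python
import hashlib
import srsly

def entry_digest(entry: dict) -> str:
    # Stand-in for spacy.util.get_hash: stable digest of a JSON-serializable dict
    return hashlib.md5(srsly.json_dumps(entry, sort_keys=True).encode("utf8")).hexdigest()

old_entry = {"cmd": "spacy project run train", "script": ["python train.py"],
             "deps": [{"path": "corpus.json", "md5": "abc123"}]}
new_entry = {"cmd": "spacy project run train", "script": ["python train.py"],
             "deps": [{"path": "corpus.json", "md5": "def456"}]}  # input changed
print(entry_digest(old_entry) != entry_digest(new_entry))  # True -> rerun the command
```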
spacy/schemas.py
@@ -303,7 +303,7 @@ class ProjectConfigCommand(BaseModel):

 class ProjectConfigSchema(BaseModel):
     # fmt: off
-    variables: Dict[StrictStr, Union[str, int, float, bool]] = Field({}, title="Optional variables to substitute in commands")
+    vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands")
     assets: List[ProjectConfigAsset] = Field([], title="Data assets")
     workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
     commands: List[ProjectConfigCommand] = Field([], title="Project command shortcuts")
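With the field loosened from `Union[str, int, float, bool]` to `Any`, nested variable dicts now pass schema validation. A quick sketch using the project's own validation helper (the config values are illustrative):

```python
from spacy.schemas import ProjectConfigSchema, validate

config = {
    "vars": {"model": {"name": "ner_demo", "version": 1}},  # nested dict now allowed
    "commands": [{"name": "train", "script": ["python train.py ${vars.model.name}"]}],
}
errors = validate(ProjectConfigSchema, config)  # returns a list of error strings
assert errors == []
```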
spacy/tests/test_cli.py
@@ -6,9 +6,12 @@ from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
 from spacy.cli.pretrain import make_docs
 from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
-from spacy.util import get_lang_class
+from spacy.cli._util import load_project_config, substitute_project_variables
+from thinc.config import ConfigValidationError
 import srsly

+from .util import make_tempdir
+

 def test_cli_converters_conllu2json():
     # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
@@ -295,6 +298,24 @@ def test_project_config_validation2(config, n_errors):
     assert len(errors) == n_errors


+def test_project_config_interpolation():
+    variables = {"a": 10, "b": {"c": "foo", "d": True}}
+    commands = [
+        {"name": "x", "script": ["hello ${vars.a} ${vars.b.c}"]},
+        {"name": "y", "script": ["${vars.b.c} ${vars.b.d}"]},
+    ]
+    project = {"commands": commands, "vars": variables}
+    with make_tempdir() as d:
+        srsly.write_yaml(d / "project.yml", project)
+        cfg = load_project_config(d)
+    assert cfg["commands"][0]["script"][0] == "hello 10 foo"
+    assert cfg["commands"][1]["script"][0] == "foo true"
+    commands = [{"name": "x", "script": ["hello ${vars.a} ${vars.b.e}"]}]
+    project = {"commands": commands, "vars": variables}
+    with pytest.raises(ConfigValidationError):
+        substitute_project_variables(project)
+
+
 @pytest.mark.parametrize(
     "args,expected",
     [
spacy/util.py
@@ -1,5 +1,5 @@
 from typing import List, Union, Dict, Any, Optional, Iterable, Callable, Tuple
-from typing import Iterator, Type, Pattern, TYPE_CHECKING
+from typing import Iterator, Type, Pattern, Generator, TYPE_CHECKING
 from types import ModuleType
 import os
 import importlib
@@ -610,7 +610,7 @@ def working_dir(path: Union[str, Path]) -> None:


 @contextmanager
-def make_tempdir() -> None:
+def make_tempdir() -> Generator[Path, None, None]:
     """Execute a block in a temporary directory and remove the directory and
     its contents at the end of the with block.
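The corrected return annotation describes the standard `@contextmanager` generator pattern. A minimal self-contained sketch of that pattern (illustrative, not spaCy's exact implementation):

```python
from contextlib import contextmanager
from pathlib import Path
from typing import Generator
import shutil
import tempfile

@contextmanager
def make_tempdir() -> Generator[Path, None, None]:
    """Yield a temporary directory and remove it afterwards."""
    path = Path(tempfile.mkdtemp())
    try:
        yield path
    finally:
        shutil.rmtree(str(path))

with make_tempdir() as tmp:
    (tmp / "example.txt").write_text("hello", encoding="utf8")
```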
website/docs/api/cli.md
@@ -847,6 +847,92 @@ $ python -m spacy project run [subcommand] [project_dir] [--force] [--dry]
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
 | **EXECUTES**   | The command defined in the `project.yml`.                   |

+### project push {#project-push tag="command"}
+
+Upload all available files or directories listed in the `outputs` section of
+commands to a remote storage. Outputs are archived and compressed prior to
+upload, and addressed in the remote storage using the output's relative path
+(URL encoded), a hash of its command string and dependencies, and a hash of its
+file contents. This means `push` should **never overwrite** a file in your
+remote. If all the hashes match, the contents are the same and nothing happens.
+If the contents are different, the new version of the file is uploaded.
+Deleting obsolete files is left up to you.
+
+Remotes can be defined in the `remotes` section of the
+[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses the
+[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to
+communicate with the remote storages, so you can use any protocol that
+`smart-open` supports, including [S3](https://aws.amazon.com/s3/),
+[Google Cloud Storage](https://cloud.google.com/storage), SSH and more,
+although you may need to install extra dependencies to use certain protocols.
+
+```cli
+$ python -m spacy project push [remote] [project_dir]
+```
+
+> #### Example
+>
+> ```cli
+> $ python -m spacy project push my_bucket
+> ```
+>
+> ```yaml
+> ### project.yml
+> remotes:
+>   my_bucket: 's3://my-spacy-bucket'
+> ```
+
+| Name           | Description                                                                              |
+| -------------- | ---------------------------------------------------------------------------------------- |
+| `remote`       | The name of the remote to upload to. Defaults to `"default"`. ~~str (positional)~~       |
+| `project_dir`  | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
+| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~                               |
+| **UPLOADS**    | All project outputs that exist and are not already stored in the remote.                 |
+
+### project pull {#project-pull tag="command"}
+
+Download all files or directories listed as `outputs` for commands, unless they
+are already present locally. When searching for files in the remote, `pull`
+won't just look at the output path, but will also consider the **command
+string** and the **hashes of the dependencies**. For instance, let's say you've
+previously pushed a model checkpoint to the remote, but now you've changed some
+hyper-parameters. Because you've changed the inputs to the command, if you run
+`pull`, you won't retrieve the stale result. If you train your model and push
+the outputs to the remote, the outputs will be saved alongside the prior
+outputs, so if you change the config back, you'll be able to fetch back the
+result.
+
+Remotes can be defined in the `remotes` section of the
+[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses the
+[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to
+communicate with the remote storages, so you can use any protocol that
+`smart-open` supports, including [S3](https://aws.amazon.com/s3/),
+[Google Cloud Storage](https://cloud.google.com/storage), SSH and more,
+although you may need to install extra dependencies to use certain protocols.
+
+```cli
+$ python -m spacy project pull [remote] [project_dir]
+```
+
+> #### Example
+>
+> ```cli
+> $ python -m spacy project pull my_bucket
+> ```
+>
+> ```yaml
+> ### project.yml
+> remotes:
+>   my_bucket: 's3://my-spacy-bucket'
+> ```
+
+| Name           | Description                                                                              |
+| -------------- | ---------------------------------------------------------------------------------------- |
+| `remote`       | The name of the remote to download from. Defaults to `"default"`. ~~str (positional)~~   |
+| `project_dir`  | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
+| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~                               |
+| **DOWNLOADS**  | All project outputs that do not exist locally and can be found in the remote.            |
+
 ### project dvc {#project-dvc tag="command"}

 Auto-generate [Data Version Control](https://dvc.org) (DVC) config file. Calls
new file: website/docs/images/projects.svg (91 lines)
(File diff suppressed because one or more lines are too long. Image, 40 KiB.)
website/docs/usage/projects.md
@@ -5,9 +5,12 @@ menu:
   - ['Intro & Workflow', 'intro']
   - ['Directory & Assets', 'directory']
   - ['Custom Projects', 'custom']
+  - ['Remote Storage', 'remote']
   - ['Integrations', 'integrations']
 ---

+## Introduction and workflow {#intro hidden="true"}
+
 > #### 🪐 Project templates
 >
 > Our [`projects`](https://github.com/explosion/projects) repo includes various
@@ -19,20 +22,17 @@ spaCy projects let you manage and share **end-to-end spaCy workflows** for
 different **use cases and domains**, and orchestrate training, packaging and
 serving your custom models. You can start off by cloning a pre-defined project
 template, adjust it to fit your needs, load in your data, train a model, export
-it as a Python package and share the project templates with your team. spaCy
-projects can be used via the new [`spacy project`](/api/cli#project) command.
-For an overview of the available project templates, check out the
-[`projects`](https://github.com/explosion/projects) repo. spaCy projects also
-[integrate](#integrations) with many other cool machine learning and data
-science tools to track and manage your data and experiments, iterate on demos
-and prototypes and ship your models into production.
+it as a Python package, upload your outputs to a remote storage and share your
+results with your team. spaCy projects can be used via the new
+[`spacy project`](/api/cli#project) command and we provide templates in our
+[`projects`](https://github.com/explosion/projects) repo.

 <!-- TODO: mention integrations -->

-## Introduction and workflow {#intro}
-
 <!-- TODO: decide how to introduce concept -->

+![Illustration of project workflow and commands](../images/projects.svg)
+
 <!-- TODO:
 <Project id="some_example_project">
@@ -155,8 +155,8 @@ other. For instance, to generate a packaged model, you might start by converting
 your data, then run [`spacy train`](/api/cli#train) to train your model on the
 converted data and if that's successful, run [`spacy package`](/api/cli#package)
 to turn the best model artifact into an installable Python package. The
-following command runs the workflow named `all` defined in the `project.yml`, and
-executes the commands it specifies, in order:
+following command runs the workflow named `all` defined in the `project.yml`,
+and executes the commands it specifies, in order:

 ```cli
 $ python -m spacy project run all
@@ -171,6 +171,31 @@ advanced data pipelines and track your changes in Git, check out the
 from a workflow defined in your `project.yml` so you can manage your spaCy
 project as a DVC repo.

+### 5. Optional: Push to remote storage {#push}
+
+> ```yaml
+> ### project.yml
+> remotes:
+>   default: 's3://my-spacy-bucket'
+>   local: '/mnt/scratch/cache'
+> ```
+
+After training a model, you can optionally use the
+[`spacy project push`](/api/cli#project-push) command to upload your outputs to
+a remote storage, using protocols like [S3](https://aws.amazon.com/s3/),
+[Google Cloud Storage](https://cloud.google.com/storage) or SSH. This can help
+you **export** your model packages, **share** work with your team, or **cache
+results** to avoid repeating work.
+
+```cli
+$ python -m spacy project push
+```
+
+The `remotes` section in your `project.yml` lets you assign names to the
+different storages. To download state from a remote storage, you can use the
+[`spacy project pull`](/api/cli#project-pull) command. For more details, see the
+docs on [remote storage](#remote).
+
 ## Project directory and assets {#directory}

 ### project.yml {#project-yml}
@@ -190,7 +215,7 @@ https://github.com/explosion/spacy-boilerplates/blob/master/ner_fashion/project.

 | Section       | Description |
 | ------------- | ----------- |
-| `variables`   | A dictionary of variables that can be referenced in paths, URLs and scripts. For example, `{NAME}` will use the value of the variable `NAME`. |
+| `vars`        | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
 | `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
 | `assets`      | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. |
 | `workflows`   | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
@@ -349,9 +374,9 @@ if __name__ == "__main__":

 In your `project.yml`, you can then run the script by calling
 `python scripts/custom_evaluation.py` with the function arguments. You can also
-use the `variables` section to define reusable variables that will be
-substituted in commands, paths and URLs. In this example, the `BATCH_SIZE` is
-defined as a variable will be added in place of `{BATCH_SIZE}` in the script.
+use the `vars` section to define reusable variables that will be substituted in
+commands, paths and URLs. In this example, the batch size is defined as a
+variable and will be added in place of `${vars.batch_size}` in the script.

 > #### Calling into Python
 >

@@ -363,13 +388,13 @@ defined as a variable will be added in place of `{BATCH_SIZE}` in the script.
 <!-- prettier-ignore -->
 ```yaml
 ### project.yml
-variables:
-  BATCH_SIZE: 128
+vars:
+  batch_size: 128

 commands:
   - name: evaluate
     script:
-      - 'python scripts/custom_evaluation.py {BATCH_SIZE} ./training/model-best ./corpus/eval.json'
+      - 'python scripts/custom_evaluation.py ${vars.batch_size} ./training/model-best ./corpus/eval.json'
     deps:
       - 'training/model-best'
       - 'corpus/eval.json'

@@ -421,6 +446,114 @@ assets:
     checksum: '5113dc04e03f079525edd8df3f4f39e3'
 ```

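An aside on the `checksum` field above: the 32-character value is consistent with an MD5 digest. A minimal sketch of how such a verification could work — the helper and the asset path are hypothetical, not spaCy's actual code:

```python
import hashlib
from pathlib import Path

def md5_checksum(path: Path) -> str:
    """Compute the MD5 hex digest of a file, reading it in chunks."""
    md5 = hashlib.md5()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            md5.update(chunk)
    return md5.hexdigest()

expected = "5113dc04e03f079525edd8df3f4f39e3"  # checksum from the project.yml above
asset = Path("assets/training.json")  # hypothetical asset destination
if md5_checksum(asset) != expected:
    raise ValueError(f"Checksum mismatch for {asset}")
```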
+## Remote Storage {#remote}
+
+You can persist your project outputs to a remote storage using the
+[`project push`](/api/cli#project-push) command. This can help you **export**
+your model packages, **share** work with your team, or **cache results** to
+avoid repeating work. The [`project pull`](/api/cli#project-pull) command will
+download any outputs that are in the remote storage and aren't available
+locally.
+
+You can list one or more remotes in the `remotes` section of your
+[`project.yml`](#project-yml) by mapping a string name to the URL of the
+storage. Under the hood, spaCy uses the
+[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to
+communicate with the remote storages, so you can use any protocol that
+`smart-open` supports, including [S3](https://aws.amazon.com/s3/),
+[Google Cloud Storage](https://cloud.google.com/storage), SSH and more,
+although you may need to install extra dependencies to use certain protocols
+(for instance, `boto3` for S3).

+> #### Example
+>
+> ```cli
+> $ python -m spacy project pull local
+> ```

+```yaml
+### project.yml
+remotes:
+  default: 's3://my-spacy-bucket'
+  local: '/mnt/scratch/cache'
+  stuff: 'ssh://myserver.example.com/whatever'
+```

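Since transport is delegated to `smart-open`, reading anything back from one of these remotes reduces to that library's `open()` call. A minimal sketch, assuming `smart_open` (and `boto3` for S3) is installed — the object key is made up, since spaCy manages the real paths itself:

```python
from smart_open import open  # deliberately shadows the builtin open

# Hypothetical key inside the "default" remote from the example above
url = "s3://my-spacy-bucket/training%2Fmodel-best/some_hash/another_hash"
with open(url, "rb") as f:
    payload = f.read()
print(f"Fetched {len(payload)} bytes")
```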
+<Infobox title="How it works" emoji="💡">
+
+Inside the remote storage, spaCy uses a clever **directory structure** to avoid
+overwriting files. The top level of the directory structure is a URL-encoded
+version of the output's path. Within this directory are subdirectories named
+according to a hash of the command string and the command's dependencies.
+Finally, within those directories are files, named according to an MD5 hash of
+their contents.
+
+<!-- TODO: update with actual real example? -->
+
+<!-- prettier-ignore -->
+```yaml
+└── urlencoded_file_path            # Path of original file
+    ├── some_command_hash           # Hash of command you ran
+    │   ├── some_content_hash       # Hash of file content
+    │   └── another_content_hash
+    └── another_command_hash
+        └── third_content_hash
+```
+
+</Infobox>

+For instance, let's say you had the following command in your `project.yml`:
+
+```yaml
+### project.yml
+- name: train
+  help: 'Train a spaCy model using the specified corpus and config'
+  script:
+    - 'spacy train ./config.cfg --output training/'
+  deps:
+    - 'corpus/train'
+    - 'corpus/dev'
+    - 'config.cfg'
+  outputs:
+    - 'training/model-best'
+```

+> #### Example
+>
+> ```
+> └── s3://my-spacy-bucket/training%2Fmodel-best
+>     └── 1d8cb33a06cc345ad3761c6050934a1b
+>         └── d8e20c3537a084c5c10d95899fe0b1ff
+> ```

+After you finish training, you run [`project push`](/api/cli#project-push) to
+make sure the `training/model-best` output is saved to remote storage. spaCy
+will then construct a hash from your command script and the listed dependencies,
+`corpus/train`, `corpus/dev` and `config.cfg`, in order to identify the
+execution context of your output. It would then compute an MD5 hash of the
+`training/model-best` directory, and use those three pieces of information to
+construct the storage URL.

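To make the scheme concrete, here is a rough sketch of how such a storage URL could be assembled. This is an illustration under assumptions, not spaCy's actual implementation — in particular, how the command string and its dependencies are combined into the context hash is simplified:

```python
import hashlib
from pathlib import Path
from urllib.parse import quote

def content_hash(path: Path) -> str:
    """MD5 over the contents of a file, or of all files under a directory."""
    md5 = hashlib.md5()
    paths = [path] if path.is_file() else sorted(path.rglob("*"))
    for p in paths:
        if p.is_file():
            md5.update(p.read_bytes())
    return md5.hexdigest()

def storage_url(remote: str, output: str, command: str, deps: list) -> str:
    # Level 1: URL-encoded output path, e.g. "training/model-best" -> "training%2Fmodel-best"
    encoded = quote(output, safe="")
    # Level 2: hash identifying the execution context (command string + dependencies)
    ctx = hashlib.md5(command.encode("utf8"))
    for dep in deps:
        ctx.update(content_hash(Path(dep)).encode("utf8"))
    # Level 3: hash of the output's own contents
    return f"{remote}/{encoded}/{ctx.hexdigest()}/{content_hash(Path(output))}"

print(storage_url(
    "s3://my-spacy-bucket",
    "training/model-best",
    "spacy train ./config.cfg --output training/",
    ["corpus/train", "corpus/dev", "config.cfg"],
))
```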
+```cli
+$ python -m spacy project run train
+$ python -m spacy project push
+```

+If you change the command or one of its dependencies (for instance, by editing
+the [`config.cfg`](/usage/training#config) file to tune the hyperparameters), a
+different creation hash will be calculated, so when you use
+[`project push`](/api/cli#project-push) you won't be overwriting your previous
+file. The system even supports multiple outputs for the same file and the same
+context, which can happen if your training process is not deterministic, or if
+you have dependencies that aren't represented in the command.

+In summary, the [`spacy project`](/api/cli#project) remote storages are designed
+to make a particular set of trade-offs. Priority is placed on **convenience**,
+**correctness** and **avoiding data loss**. You can use
+[`project push`](/api/cli#project-push) freely, as you'll never overwrite remote
+state, and you don't have to come up with names or version numbers. However,
+it's up to you to manage the size of your remote storage, and to remove files
+that are no longer relevant to you.

 ## Integrations {#integrations}

 ### Data Version Control (DVC) {#dvc} <IntegrationLogo name="dvc" title="DVC" width={70} height="auto" align="right" />

@@ -517,16 +650,17 @@ and evaluation set.
 <!-- prettier-ignore -->
 ```yaml
 ### project.yml
-variables:
-  PRODIGY_DATASET: 'ner_articles'
-  PRODIGY_LABELS: 'PERSON,ORG,PRODUCT'
-  PRODIGY_MODEL: 'en_core_web_md'
+vars:
+  prodigy:
+    dataset: 'ner_articles'
+    labels: 'PERSON,ORG,PRODUCT'
+    model: 'en_core_web_md'

 commands:
   - name: annotate
   - script:
-      - 'python -m prodigy ner.correct {PRODIGY_DATASET} ./assets/raw_data.jsonl {PRODIGY_MODEL} --labels {PRODIGY_LABELS}'
-      - 'python -m prodigy data-to-spacy ./corpus/train.json ./corpus/eval.json --ner {PRODIGY_DATASET}'
+      - 'python -m prodigy ner.correct ${vars.prodigy.dataset} ./assets/raw_data.jsonl ${vars.prodigy.model} --labels ${vars.prodigy.labels}'
+      - 'python -m prodigy data-to-spacy ./corpus/train.json ./corpus/eval.json --ner ${vars.prodigy.dataset}'
       - 'python -m spacy convert ./corpus/train.json ./corpus/train.spacy'
       - 'python -m spacy convert ./corpus/eval.json ./corpus/eval.spacy'
   - deps:

@@ -104,11 +104,15 @@ spaCy projects let you manage and share **end-to-end spaCy workflows** for
 different **use cases and domains**, and orchestrate training, packaging and
 serving your custom models. You can start off by cloning a pre-defined project
 template, adjust it to fit your needs, load in your data, train a model, export
-it as a Python package and share the project templates with your team. spaCy
-projects also make it easy to **integrate with other tools** in the data science
-and machine learning ecosystem, including [DVC](/usage/projects#dvc) for data
-version control, [Prodigy](/usage/projects#prodigy) for creating labelled data,
-[Streamlit](/usage/projects#streamlit) for building interactive apps,
+it as a Python package, upload your outputs to a remote storage and share your
+results with your team.
+
+![Illustration of project workflow and commands](../images/projects.svg)
+
+spaCy projects also make it easy to **integrate with other tools** in the data
+science and machine learning ecosystem, including [DVC](/usage/projects#dvc) for
+data version control, [Prodigy](/usage/projects#prodigy) for creating labelled
+data, [Streamlit](/usage/projects#streamlit) for building interactive apps,
 [FastAPI](/usage/projects#fastapi) for serving models in production,
 [Ray](/usage/projects#ray) for parallel training,
 [Weights & Biases](/usage/projects#wandb) for experiment tracking, and more!

@@ -5,6 +5,8 @@ import Icon from './icon'
 import { isString } from './util'
 import classes from '../styles/table.module.sass'

+const FOOT_ROW_REGEX = /^(RETURNS|YIELDS|CREATES|PRINTS|EXECUTES|UPLOADS|DOWNLOADS)/
+
 function isNum(children) {
   return isString(children) && /^\d+[.,]?[\dx]+?(|x|ms|mb|gb|k|m)?$/i.test(children)
 }

@@ -43,7 +45,6 @@ function isDividerRow(children) {
 }

 function isFootRow(children) {
-  const rowRegex = /^(RETURNS|YIELDS|CREATES|PRINTS|EXECUTES)/
   if (children.length && children[0].props.name === 'td') {
     const cellChildren = children[0].props.children
     if (
@@ -52,7 +53,7 @@ function isFootRow(children) {
       cellChildren.props.children &&
       isString(cellChildren.props.children)
     ) {
-      return rowRegex.test(cellChildren.props.children)
+      return FOOT_ROW_REGEX.test(cellChildren.props.children)
     }
   }
   return false