1
1
mirror of https://github.com/explosion/spaCy.git synced 2025-01-17 04:54:14 +03:00
spaCy/spacy/cli/project/push.py

64 lines
2.4 KiB
Python
Raw Normal View History

from pathlib import Path
from wasabi import msg
from .remote_storage import RemoteStorage
from .remote_storage import get_content_hash, get_command_hash
from .._util import load_project_config
from .._util import project_cli, Arg
@project_cli.command("push")
def project_push_cli(
# fmt: off
remote: str = Arg("default", help="Name or path of remote storage"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
# fmt: on
):
2020-09-04 13:58:50 +03:00
"""Persist outputs to a remote storage. You can alias remotes in your
project.yml by mapping them to storage paths. A storage can be anything that
the smart-open library can upload to, e.g. AWS, Google Cloud Storage, SSH,
local directories etc.
DOCS: https://spacy.io/api/cli#project-push
"""
for output_path, url in project_push(project_dir, remote):
if url is None:
msg.info(f"Skipping {output_path}")
else:
msg.good(f"Pushed {output_path} to {url}")
def project_push(project_dir: Path, remote: str):
"""Persist outputs to a remote storage. You can alias remotes in your project.yml
by mapping them to storage paths. A storage can be anything that the smart-open
library can upload to, e.g. gcs, aws, ssh, local directories etc
"""
config = load_project_config(project_dir)
if remote in config.get("remotes", {}):
remote = config["remotes"][remote]
storage = RemoteStorage(project_dir, remote)
for cmd in config.get("commands", []):
2020-08-23 22:14:44 +03:00
deps = [project_dir / dep for dep in cmd.get("deps", [])]
if any(not dep.exists() for dep in deps):
continue
cmd_hash = get_command_hash(
"", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
)
for output_path in cmd.get("outputs", []):
output_loc = project_dir / output_path
2020-09-04 22:15:55 +03:00
if output_loc.exists() and _is_not_empty_dir(output_loc):
url = storage.push(
output_path,
command_hash=cmd_hash,
content_hash=get_content_hash(output_loc),
)
yield output_path, url
2020-09-04 22:15:55 +03:00
def _is_not_empty_dir(loc: Path):
if not loc.is_dir():
return True
elif any(_is_not_empty_dir(child) for child in loc.iterdir()):
return True
else:
return False