spaCy/spacy/cli/project/pull.py
Matthew Honnibal e559867605
Allow spacy project to push and pull to/from remote storage (#5949)
* Add utils for working with remote storage

* WIP add remote_cache for project

* WIP add push and pull commands

* Use pathy in remote_cache

* Updarte util

* Update remote_cache

* Update util

* Update project assets

* Update pull script

* Update push script

* Fix type annotation in util

* Work on remote storage

* Remove site and env hash

* Fix imports

* Fix type annotation

* Require pathy

* Require pathy

* Fix import

* Add a util to handle project variable substitution

* Import push and pull commands

* Fix pull command

* Fix push command

* Fix tarfile in remote_storage

* Improve printing

* Fiddle with status messages

* Set version to v3.0.0a9

* Draft docs for spacy project remote storages

* Update docs [ci skip]

* Use Thinc config to simplify and unify template variables

* Auto-format

* Don't import Pathy globally for now

Causes slow and annoying Google Cloud warning

* Tidy up test

* Tidy up and update tests

* Update to latest Thinc

* Update docs

* variables -> vars

* Update docs [ci skip]

* Update docs [ci skip]

Co-authored-by: Ines Montani <ines@ines.io>
2020-08-23 18:32:09 +02:00

37 lines
1.5 KiB
Python

from pathlib import Path
from wasabi import msg
from .remote_storage import RemoteStorage
from .remote_storage import get_command_hash
from .._util import project_cli, Arg
from .._util import load_project_config
@project_cli.command("pull")
def project_pull_cli(
# fmt: off
remote: str = Arg("default", help="Name or path of remote storage"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
# fmt: on
):
"""Retrieve any precomputed outputs from a remote storage that are available.
You can alias remotes in your project.yml by mapping them to storage paths.
A storage can be anything that the smart-open library can upload to, e.g.
gcs, aws, ssh, local directories etc
"""
for url, output_path in project_pull(project_dir, remote):
if url is not None:
msg.good(f"Pulled {output_path} from {url}")
def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
config = load_project_config(project_dir)
if remote in config.get("remotes", {}):
remote = config["remotes"][remote]
storage = RemoteStorage(project_dir, remote)
for cmd in config.get("commands", []):
deps = [project_dir / dep for dep in cmd.get("deps", [])]
cmd_hash = get_command_hash("", "", deps, cmd["script"])
for output_path in cmd.get("outputs", []):
url = storage.pull(output_path, command_hash=cmd_hash)
yield url, output_path