Support local filesystem remotes for projects

* Fix support for local filesystem remotes for projects
  * Use `FluidPath` instead of `Pathy` to support both filesystem and
    remote paths
  * Create missing parent directories if required for local filesystem
  * Add a more general `_file_exists` helper that supports `Pathy`,
    `Path`, and `smart_open`-compatible URLs
* Add an explicit `smart_open` dependency, pinned to start at the first
  version that supports the `compression` flag
* Update `pathy` dependency to exclude older versions that aren't
  compatible with required `smart_open` version
* Update docs to refer to `Pathy` instead of `smart_open` for project
  remotes (technically you can still push to any `smart_open`-compatible
  path but you can't pull from them)
* Add tests for local filesystem remotes
This commit is contained in:
Adriane Boyd 2022-11-07 10:32:52 +01:00
parent b76222e56a
commit e9cd7e3032
7 changed files with 122 additions and 44 deletions

View File

@ -10,7 +10,8 @@ wasabi>=0.9.1,<1.1.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.8.0
pathy>=0.3.5
pathy>=0.6.0
smart-open>=5.2.1,<7.0.0
# Third party dependencies
numpy>=1.15.0
requests>=2.13.0,<3.0.0

View File

@ -52,7 +52,8 @@ install_requires =
catalogue>=2.0.6,<2.1.0
# Third-party dependencies
typer>=0.3.0,<0.8.0
pathy>=0.3.5
pathy>=0.6.0
smart-open>=5.2.1,<7.0.0
tqdm>=4.38.0,<5.0.0
numpy>=1.15.0
requests>=2.13.0,<3.0.0

View File

@ -23,7 +23,7 @@ from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
from .. import about
if TYPE_CHECKING:
from pathy import Pathy # noqa: F401
from pathy import FluidPath # noqa: F401
SDIST_SUFFIX = ".tar.gz"
@ -331,7 +331,33 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
def _file_exists(src: Union[str, "FluidPath"]) -> bool:
    """Check if a (remote) file exists, if possible.

    The check is best-effort: any error raised while probing the location is
    treated as "could not confirm" rather than propagated.

    src (str / FluidPath): The path or URL to check.
    RETURNS (bool): True if the file could be confirmed to exist, False if it
        does not exist or its existence could not be determined.
    """
    # Local paths can always be checked through pathlib; Pathy is only added
    # to the accepted types if it can actually be imported, so an import
    # problem never prevents the plain filesystem check.
    path_types = [Path]
    try:
        from pathy import Pathy  # noqa: F811

        path_types.append(Pathy)
    except Exception:
        pass
    if isinstance(src, tuple(path_types)):
        try:
            if src.exists():
                return True
        except Exception:
            # Fall through to the smart_open probe below.
            pass
    # Fallback for plain URLs / strings: try to open the location for reading.
    try:
        import smart_open

        with smart_open.open(src, mode="rb", compression="disable"):
            pass
        return True
    except Exception:
        # Deliberately broad: backends raise many different exception types,
        # and a failed probe simply means the file can't be confirmed.
        return False
def upload_file(src: Path, dest: Union[str, "FluidPath"]) -> None:
"""Upload a file.
src (Path): The source path.
@ -339,13 +365,20 @@ def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
"""
import smart_open
# Create parent directories for local paths
if isinstance(dest, Path):
if not dest.parent.exists():
dest.parent.mkdir(parents=True)
dest = str(dest)
with smart_open.open(dest, mode="wb") as output_file:
with src.open(mode="rb") as input_file:
output_file.write(input_file.read())
def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) -> None:
def download_file(
src: Union[str, "FluidPath"], dest: Path, *, force: bool = False
) -> None:
"""Download a file using smart_open.
url (str): The URL of the file.
@ -358,7 +391,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
if dest.exists() and not force:
return None
src = str(src)
with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
with smart_open.open(src, mode="rb", compression="disable") as input_file:
with dest.open(mode="wb") as output_file:
shutil.copyfileobj(input_file, output_file)
@ -368,7 +401,7 @@ def ensure_pathy(path):
slow and annoying Google Cloud warning)."""
from pathy import Pathy # noqa: F811
return Pathy(path)
return Pathy.fluid(path)
def git_checkout(

View File

@ -6,13 +6,14 @@ import urllib.parse
import tarfile
from pathlib import Path
from .._util import get_hash, get_checksum, download_file, ensure_pathy
from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var
from .._util import get_hash, get_checksum, upload_file, download_file
from .._util import ensure_pathy, make_tempdir, _file_exists
from ...util import get_minor_version, ENV_VARS, check_bool_env_var
from ...git_info import GIT_VERSION
from ... import about
if TYPE_CHECKING:
from pathy import Pathy # noqa: F401
from pathy import FluidPath # noqa: F401
class RemoteStorage:
@ -27,7 +28,7 @@ class RemoteStorage:
self.url = ensure_pathy(url)
self.compression = compression
def push(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
"""Compress a file or directory within a project and upload it to a remote
storage. If an object exists at the full URL, nothing is done.
@ -40,7 +41,7 @@ class RemoteStorage:
if not loc.exists():
raise IOError(f"Cannot push {loc}: does not exist.")
url = self.make_url(path, command_hash, content_hash)
if url.exists():
if _file_exists(url):
return url
tmp: Path
with make_tempdir() as tmp:
@ -48,9 +49,7 @@ class RemoteStorage:
mode_string = f"w:{self.compression}" if self.compression else "w"
with tarfile.open(tar_loc, mode=mode_string) as tar_file:
tar_file.add(str(loc), arcname=str(path))
with tar_loc.open(mode="rb") as input_file:
with url.open(mode="wb") as output_file:
output_file.write(input_file.read())
upload_file(tar_loc, url)
return url
def pull(
@ -59,7 +58,7 @@ class RemoteStorage:
*,
command_hash: Optional[str] = None,
content_hash: Optional[str] = None,
) -> Optional["Pathy"]:
) -> Optional["FluidPath"]:
"""Retrieve a file from the remote cache. If the file already exists,
nothing is done.
@ -93,7 +92,7 @@ class RemoteStorage:
*,
command_hash: Optional[str] = None,
content_hash: Optional[str] = None,
) -> Optional["Pathy"]:
) -> Optional["FluidPath"]:
"""Find the best matching version of a file within the storage,
or `None` if no match can be found. If both the creation and content hash
are specified, only exact matches will be returned. Otherwise, the most
@ -102,16 +101,19 @@ class RemoteStorage:
name = self.encode_name(str(path))
if command_hash is not None and content_hash is not None:
url = self.make_url(path, command_hash, content_hash)
urls = [url] if url.exists() else []
urls = [url] if _file_exists(url) else []
elif command_hash is not None:
urls = list((self.url / name / command_hash).iterdir())
else:
urls = list((self.url / name).iterdir())
urls = []
for command_hash_dir in (self.url / name).iterdir():
urls.extend(command_hash_dir.iterdir())
if content_hash is not None:
urls = [url for url in urls if url.parts[-1] == content_hash]
# TODO: URLs should be sorted by last modified
return urls[-1] if urls else None
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
"""Construct a URL from a subpath, a creation hash and a content hash."""
return self.url / self.encode_name(str(path)) / command_hash / content_hash

View File

@ -25,6 +25,7 @@ from spacy.cli.download import get_compatibility, get_version
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
from spacy.cli.package import get_third_party_dependencies
from spacy.cli.package import _is_permitted_package_name
from spacy.cli.project.remote_storage import RemoteStorage
from spacy.cli.validate import get_model_pkgs
from spacy.lang.en import English
from spacy.lang.nl import Dutch
@ -855,3 +856,42 @@ def test_span_length_freq_dist_output_must_be_correct():
span_freqs = _get_spans_length_freq_dist(sample_span_lengths, threshold)
assert sum(span_freqs.values()) >= threshold
assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]
def test_local_remote_storage():
    """Push a file to a local-filesystem remote and pull it back with every
    combination of command hash and content hash."""
    with make_tempdir() as tmp_dir:
        filename = "a.txt"
        content = "a"
        project_root = tmp_dir / "root"
        loc_file = project_root / filename
        project_root.mkdir(parents=True)
        loc_file.write_text(content)

        # push to remote storage
        remote = RemoteStorage(project_root, str(tmp_dir / "remote"))
        remote.push(filename, "aaaa", "bbbb")

        # retrieve with full hashes, command hash only, content hash only
        # and finally with no hashes at all
        pull_variants = [
            {"command_hash": "aaaa", "content_hash": "bbbb"},
            {"command_hash": "aaaa"},
            {"content_hash": "bbbb"},
            {},
        ]
        for hashes in pull_variants:
            # remove the local copy so the pull has to restore it
            loc_file.unlink()
            remote.pull(filename, **hashes)
            assert loc_file.read_text() == content

View File

@ -474,8 +474,7 @@ report span characteristics such as the average span length and the span (or
span boundary) distinctiveness. The distinctiveness measure shows how different
the tokens are with respect to the rest of the corpus using the KL-divergence of
the token distributions. To learn more, you can check out Papay et al.'s work on
[*Dissecting Span Identification Tasks with Performance Prediction* (EMNLP
2020)](https://aclanthology.org/2020.emnlp-main.396/).
[_Dissecting Span Identification Tasks with Performance Prediction_ (EMNLP 2020)](https://aclanthology.org/2020.emnlp-main.396/).
</Infobox>
@ -1352,12 +1351,13 @@ If the contents are different, the new version of the file is uploaded. Deleting
obsolete files is left up to you.
Remotes can be defined in the `remotes` section of the
[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses the
[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to
communicate with the remote storages, so you can use any protocol that
`smart-open` supports, including [S3](https://aws.amazon.com/s3/),
[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although
you may need to install extra dependencies to use certain protocols.
[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses
[`Pathy`](https://github.com/justindujardin/pathy) to communicate with the
remote storages, so you can use any protocol that `Pathy` supports, including
[S3](https://aws.amazon.com/s3/),
[Google Cloud Storage](https://cloud.google.com/storage), and the local
filesystem, although you may need to install extra dependencies to use certain
protocols.
```cli
$ python -m spacy project push [remote] [project_dir]
@ -1396,12 +1396,13 @@ outputs, so if you change the config back, you'll be able to fetch back the
result.
Remotes can be defined in the `remotes` section of the
[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses the
[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to
communicate with the remote storages, so you can use any protocol that
`smart-open` supports, including [S3](https://aws.amazon.com/s3/),
[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although
you may need to install extra dependencies to use certain protocols.
[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses
[`Pathy`](https://github.com/justindujardin/pathy) to communicate with the
remote storages, so you can use any protocol that `Pathy` supports, including
[S3](https://aws.amazon.com/s3/),
[Google Cloud Storage](https://cloud.google.com/storage), and the local
filesystem, although you may need to install extra dependencies to use certain
protocols.
```cli
$ python -m spacy project pull [remote] [project_dir]

View File

@ -259,9 +259,9 @@ pipelines.
> This can be used in a project command like so:
>
> ```yaml
> - name: "echo-path"
> script:
> - "echo ${env.ENV_PATH}"
> - name: 'echo-path'
> script:
> - 'echo ${env.ENV_PATH}'
> ```
| Section | Description |
@ -643,12 +643,13 @@ locally.
You can list one or more remotes in the `remotes` section of your
[`project.yml`](#project-yml) by mapping a string name to the URL of the
storage. Under the hood, spaCy uses the
[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to
communicate with the remote storages, so you can use any protocol that
`smart-open` supports, including [S3](https://aws.amazon.com/s3/),
[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although
you may need to install extra dependencies to use certain protocols.
storage. Under the hood, spaCy uses
[`Pathy`](https://github.com/justindujardin/pathy) to communicate with the
remote storages, so you can use any protocol that `Pathy` supports, including
[S3](https://aws.amazon.com/s3/),
[Google Cloud Storage](https://cloud.google.com/storage), and the local
filesystem, although you may need to install extra dependencies to use certain
protocols.
> #### Example
>
@ -661,7 +662,6 @@ you may need to install extra dependencies to use certain protocols.
remotes:
default: 's3://my-spacy-bucket'
local: '/mnt/scratch/cache'
stuff: 'ssh://myserver.example.com/whatever'
```
<Infobox title="How it works" emoji="💡">