Support local filesystem remotes for projects

* Fix support for local filesystem remotes for projects
  * Use `FluidPath` instead of `Pathy` to support both filesystem and
    remote paths
  * Create missing parent directories if required for local filesystem
  * Add a more general `_file_exists` method to support `Pathy`, `Path`,
    and `smart_open`-compatible URLs
* Add an explicit `smart_open` dependency, starting with a version that
  supports the `compression` flag
* Update `pathy` dependency to exclude older versions that aren't
  compatible with required `smart_open` version
* Update docs to refer to `Pathy` instead of `smart_open` for project
  remotes (technically you can still push to any `smart_open`-compatible
  path, but you can't pull from it)
* Add tests for local filesystem remotes
This commit is contained in:
Adriane Boyd 2022-11-07 10:32:52 +01:00
parent b76222e56a
commit e9cd7e3032
7 changed files with 122 additions and 44 deletions

View File

@ -10,7 +10,8 @@ wasabi>=0.9.1,<1.1.0
srsly>=2.4.3,<3.0.0 srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0 catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.8.0 typer>=0.3.0,<0.8.0
pathy>=0.3.5 pathy>=0.6.0
smart-open>=5.2.1,<7.0.0
# Third party dependencies # Third party dependencies
numpy>=1.15.0 numpy>=1.15.0
requests>=2.13.0,<3.0.0 requests>=2.13.0,<3.0.0

View File

@ -52,7 +52,8 @@ install_requires =
catalogue>=2.0.6,<2.1.0 catalogue>=2.0.6,<2.1.0
# Third-party dependencies # Third-party dependencies
typer>=0.3.0,<0.8.0 typer>=0.3.0,<0.8.0
pathy>=0.3.5 pathy>=0.6.0
smart-open>=5.2.1,<7.0.0
tqdm>=4.38.0,<5.0.0 tqdm>=4.38.0,<5.0.0
numpy>=1.15.0 numpy>=1.15.0
requests>=2.13.0,<3.0.0 requests>=2.13.0,<3.0.0

View File

@ -23,7 +23,7 @@ from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
from .. import about from .. import about
if TYPE_CHECKING: if TYPE_CHECKING:
from pathy import Pathy # noqa: F401 from pathy import FluidPath # noqa: F401
SDIST_SUFFIX = ".tar.gz" SDIST_SUFFIX = ".tar.gz"
@ -331,7 +331,33 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1) msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None: def _file_exists(src: Union[str, "FluidPath"]) -> bool:
"""Check if a (remote) file exists, if possible.
src (str / FluidPath): The URL to check.
"""
try:
from pathy import Pathy
if isinstance(src, (Pathy, Path)):
if src.exists():
return True
except Exception:
pass
try:
import smart_open
with smart_open.open(src, mode="rb", compression="disable") as input_file:
pass
return True
except Exception:
pass
return False
def upload_file(src: Path, dest: Union[str, "FluidPath"]) -> None:
"""Upload a file. """Upload a file.
src (Path): The source path. src (Path): The source path.
@ -339,13 +365,20 @@ def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
""" """
import smart_open import smart_open
# Create parent directories for local paths
if isinstance(dest, Path):
if not dest.parent.exists():
dest.parent.mkdir(parents=True)
dest = str(dest) dest = str(dest)
with smart_open.open(dest, mode="wb") as output_file: with smart_open.open(dest, mode="wb") as output_file:
with src.open(mode="rb") as input_file: with src.open(mode="rb") as input_file:
output_file.write(input_file.read()) output_file.write(input_file.read())
def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) -> None: def download_file(
src: Union[str, "FluidPath"], dest: Path, *, force: bool = False
) -> None:
"""Download a file using smart_open. """Download a file using smart_open.
url (str): The URL of the file. url (str): The URL of the file.
@ -358,7 +391,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
if dest.exists() and not force: if dest.exists() and not force:
return None return None
src = str(src) src = str(src)
with smart_open.open(src, mode="rb", ignore_ext=True) as input_file: with smart_open.open(src, mode="rb", compression="disable") as input_file:
with dest.open(mode="wb") as output_file: with dest.open(mode="wb") as output_file:
shutil.copyfileobj(input_file, output_file) shutil.copyfileobj(input_file, output_file)
@ -368,7 +401,7 @@ def ensure_pathy(path):
slow and annoying Google Cloud warning).""" slow and annoying Google Cloud warning)."""
from pathy import Pathy # noqa: F811 from pathy import Pathy # noqa: F811
return Pathy(path) return Pathy.fluid(path)
def git_checkout( def git_checkout(

View File

@ -6,13 +6,14 @@ import urllib.parse
import tarfile import tarfile
from pathlib import Path from pathlib import Path
from .._util import get_hash, get_checksum, download_file, ensure_pathy from .._util import get_hash, get_checksum, upload_file, download_file
from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var from .._util import ensure_pathy, make_tempdir, _file_exists
from ...util import get_minor_version, ENV_VARS, check_bool_env_var
from ...git_info import GIT_VERSION from ...git_info import GIT_VERSION
from ... import about from ... import about
if TYPE_CHECKING: if TYPE_CHECKING:
from pathy import Pathy # noqa: F401 from pathy import FluidPath # noqa: F401
class RemoteStorage: class RemoteStorage:
@ -27,7 +28,7 @@ class RemoteStorage:
self.url = ensure_pathy(url) self.url = ensure_pathy(url)
self.compression = compression self.compression = compression
def push(self, path: Path, command_hash: str, content_hash: str) -> "Pathy": def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
"""Compress a file or directory within a project and upload it to a remote """Compress a file or directory within a project and upload it to a remote
storage. If an object exists at the full URL, nothing is done. storage. If an object exists at the full URL, nothing is done.
@ -40,7 +41,7 @@ class RemoteStorage:
if not loc.exists(): if not loc.exists():
raise IOError(f"Cannot push {loc}: does not exist.") raise IOError(f"Cannot push {loc}: does not exist.")
url = self.make_url(path, command_hash, content_hash) url = self.make_url(path, command_hash, content_hash)
if url.exists(): if _file_exists(url):
return url return url
tmp: Path tmp: Path
with make_tempdir() as tmp: with make_tempdir() as tmp:
@ -48,9 +49,7 @@ class RemoteStorage:
mode_string = f"w:{self.compression}" if self.compression else "w" mode_string = f"w:{self.compression}" if self.compression else "w"
with tarfile.open(tar_loc, mode=mode_string) as tar_file: with tarfile.open(tar_loc, mode=mode_string) as tar_file:
tar_file.add(str(loc), arcname=str(path)) tar_file.add(str(loc), arcname=str(path))
with tar_loc.open(mode="rb") as input_file: upload_file(tar_loc, url)
with url.open(mode="wb") as output_file:
output_file.write(input_file.read())
return url return url
def pull( def pull(
@ -59,7 +58,7 @@ class RemoteStorage:
*, *,
command_hash: Optional[str] = None, command_hash: Optional[str] = None,
content_hash: Optional[str] = None, content_hash: Optional[str] = None,
) -> Optional["Pathy"]: ) -> Optional["FluidPath"]:
"""Retrieve a file from the remote cache. If the file already exists, """Retrieve a file from the remote cache. If the file already exists,
nothing is done. nothing is done.
@ -93,7 +92,7 @@ class RemoteStorage:
*, *,
command_hash: Optional[str] = None, command_hash: Optional[str] = None,
content_hash: Optional[str] = None, content_hash: Optional[str] = None,
) -> Optional["Pathy"]: ) -> Optional["FluidPath"]:
"""Find the best matching version of a file within the storage, """Find the best matching version of a file within the storage,
or `None` if no match can be found. If both the creation and content hash or `None` if no match can be found. If both the creation and content hash
are specified, only exact matches will be returned. Otherwise, the most are specified, only exact matches will be returned. Otherwise, the most
@ -102,16 +101,19 @@ class RemoteStorage:
name = self.encode_name(str(path)) name = self.encode_name(str(path))
if command_hash is not None and content_hash is not None: if command_hash is not None and content_hash is not None:
url = self.make_url(path, command_hash, content_hash) url = self.make_url(path, command_hash, content_hash)
urls = [url] if url.exists() else [] urls = [url] if _file_exists(url) else []
elif command_hash is not None: elif command_hash is not None:
urls = list((self.url / name / command_hash).iterdir()) urls = list((self.url / name / command_hash).iterdir())
else: else:
urls = list((self.url / name).iterdir()) urls = []
for command_hash_dir in (self.url / name).iterdir():
urls.extend(command_hash_dir.iterdir())
if content_hash is not None: if content_hash is not None:
urls = [url for url in urls if url.parts[-1] == content_hash] urls = [url for url in urls if url.parts[-1] == content_hash]
# TODO: URLs should be sorted by last modified
return urls[-1] if urls else None return urls[-1] if urls else None
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "Pathy": def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
"""Construct a URL from a subpath, a creation hash and a content hash.""" """Construct a URL from a subpath, a creation hash and a content hash."""
return self.url / self.encode_name(str(path)) / command_hash / content_hash return self.url / self.encode_name(str(path)) / command_hash / content_hash

View File

@ -25,6 +25,7 @@ from spacy.cli.download import get_compatibility, get_version
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
from spacy.cli.package import get_third_party_dependencies from spacy.cli.package import get_third_party_dependencies
from spacy.cli.package import _is_permitted_package_name from spacy.cli.package import _is_permitted_package_name
from spacy.cli.project.remote_storage import RemoteStorage
from spacy.cli.validate import get_model_pkgs from spacy.cli.validate import get_model_pkgs
from spacy.lang.en import English from spacy.lang.en import English
from spacy.lang.nl import Dutch from spacy.lang.nl import Dutch
@ -855,3 +856,42 @@ def test_span_length_freq_dist_output_must_be_correct():
span_freqs = _get_spans_length_freq_dist(sample_span_lengths, threshold) span_freqs = _get_spans_length_freq_dist(sample_span_lengths, threshold)
assert sum(span_freqs.values()) >= threshold assert sum(span_freqs.values()) >= threshold
assert list(span_freqs.keys()) == [3, 1, 4, 5, 2] assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]
def test_local_remote_storage():
    """Round-trip a file through a `RemoteStorage` on the local filesystem.

    A small text file is pushed to a remote located in a temporary
    directory. It is then deleted locally and pulled back once for each way
    of addressing it (both hashes, command hash only, content hash only, no
    hashes); every pull must restore the original content.
    """
    name = "a.txt"
    expected = "a"
    # Keyword arguments for each successive `pull` call.
    pull_variants = (
        {"command_hash": "aaaa", "content_hash": "bbbb"},  # full hashes
        {"command_hash": "aaaa"},  # command hash only
        {"content_hash": "bbbb"},  # content hash only
        {},  # no hashes
    )
    with make_tempdir() as tmp_dir:
        root = tmp_dir / "root"
        local_path = root / name
        root.mkdir(parents=True)
        with local_path.open(mode="w") as handle:
            handle.write(expected)

        # push to remote storage
        storage = RemoteStorage(root, str(tmp_dir / "remote"))
        storage.push(name, "aaaa", "bbbb")

        for hashes in pull_variants:
            # Remove the local copy so that a successful read below can only
            # come from the pull.
            local_path.unlink()
            storage.pull(name, **hashes)
            with local_path.open(mode="r") as handle:
                assert handle.read() == expected

View File

@ -474,8 +474,7 @@ report span characteristics such as the average span length and the span (or
span boundary) distinctiveness. The distinctiveness measure shows how different span boundary) distinctiveness. The distinctiveness measure shows how different
the tokens are with respect to the rest of the corpus using the KL-divergence of the tokens are with respect to the rest of the corpus using the KL-divergence of
the token distributions. To learn more, you can check out Papay et al.'s work on the token distributions. To learn more, you can check out Papay et al.'s work on
[*Dissecting Span Identification Tasks with Performance Prediction* (EMNLP [_Dissecting Span Identification Tasks with Performance Prediction_ (EMNLP 2020)](https://aclanthology.org/2020.emnlp-main.396/).
2020)](https://aclanthology.org/2020.emnlp-main.396/).
</Infobox> </Infobox>
@ -1352,12 +1351,13 @@ If the contents are different, the new version of the file is uploaded. Deleting
obsolete files is left up to you. obsolete files is left up to you.
Remotes can be defined in the `remotes` section of the Remotes can be defined in the `remotes` section of the
[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses the [`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses
[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to [`Pathy`](https://github.com/justindujardin/pathy) to communicate with the
communicate with the remote storages, so you can use any protocol that remote storages, so you can use any protocol that `Pathy` supports, including
`smart-open` supports, including [S3](https://aws.amazon.com/s3/), [S3](https://aws.amazon.com/s3/),
[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although [Google Cloud Storage](https://cloud.google.com/storage), and the local
you may need to install extra dependencies to use certain protocols. filesystem, although you may need to install extra dependencies to use certain
protocols.
```cli ```cli
$ python -m spacy project push [remote] [project_dir] $ python -m spacy project push [remote] [project_dir]
@ -1396,12 +1396,13 @@ outputs, so if you change the config back, you'll be able to fetch back the
result. result.
Remotes can be defined in the `remotes` section of the Remotes can be defined in the `remotes` section of the
[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses the [`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses
[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to [`Pathy`](https://github.com/justindujardin/pathy) to communicate with the
communicate with the remote storages, so you can use any protocol that remote storages, so you can use any protocol that `Pathy` supports, including
`smart-open` supports, including [S3](https://aws.amazon.com/s3/), [S3](https://aws.amazon.com/s3/),
[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although [Google Cloud Storage](https://cloud.google.com/storage), and the local
you may need to install extra dependencies to use certain protocols. filesystem, although you may need to install extra dependencies to use certain
protocols.
```cli ```cli
$ python -m spacy project pull [remote] [project_dir] $ python -m spacy project pull [remote] [project_dir]

View File

@ -259,9 +259,9 @@ pipelines.
> This can be used in a project command like so: > This can be used in a project command like so:
> >
> ```yaml > ```yaml
> - name: "echo-path" > - name: 'echo-path'
> script: > script:
> - "echo ${env.ENV_PATH}" > - 'echo ${env.ENV_PATH}'
> ``` > ```
| Section | Description | | Section | Description |
@ -643,12 +643,13 @@ locally.
You can list one or more remotes in the `remotes` section of your You can list one or more remotes in the `remotes` section of your
[`project.yml`](#project-yml) by mapping a string name to the URL of the [`project.yml`](#project-yml) by mapping a string name to the URL of the
storage. Under the hood, spaCy uses the storage. Under the hood, spaCy uses
[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to [`Pathy`](https://github.com/justindujardin/pathy) to communicate with the
communicate with the remote storages, so you can use any protocol that remote storages, so you can use any protocol that `Pathy` supports, including
`smart-open` supports, including [S3](https://aws.amazon.com/s3/), [S3](https://aws.amazon.com/s3/),
[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although [Google Cloud Storage](https://cloud.google.com/storage), and the local
you may need to install extra dependencies to use certain protocols. filesystem, although you may need to install extra dependencies to use certain
protocols.
> #### Example > #### Example
> >
@ -661,7 +662,6 @@ you may need to install extra dependencies to use certain protocols.
remotes: remotes:
default: 's3://my-spacy-bucket' default: 's3://my-spacy-bucket'
local: '/mnt/scratch/cache' local: '/mnt/scratch/cache'
stuff: 'ssh://myserver.example.com/whatever'
``` ```
<Infobox title="How it works" emoji="💡"> <Infobox title="How it works" emoji="💡">