Demo: Switch from pathy to cloudpathlib

This commit is contained in:
Adriane Boyd 2022-11-04 15:44:45 +01:00
parent 40e1000db0
commit 4b64c1bbcc
5 changed files with 78 additions and 27 deletions

View File

@ -10,7 +10,7 @@ wasabi>=0.9.1,<1.1.0
srsly>=2.4.3,<3.0.0 srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0 catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.5.0 typer>=0.3.0,<0.5.0
pathy>=0.3.5 cloudpathlib>=0.7.0,<0.11.0
# Third party dependencies # Third party dependencies
numpy>=1.15.0 numpy>=1.15.0
requests>=2.13.0,<3.0.0 requests>=2.13.0,<3.0.0

View File

@ -52,7 +52,7 @@ install_requires =
catalogue>=2.0.6,<2.1.0 catalogue>=2.0.6,<2.1.0
# Third-party dependencies # Third-party dependencies
typer>=0.3.0,<0.5.0 typer>=0.3.0,<0.5.0
pathy>=0.3.5 cloudpathlib>=0.7.0,<0.11.0
tqdm>=4.38.0,<5.0.0 tqdm>=4.38.0,<5.0.0
numpy>=1.15.0 numpy>=1.15.0
requests>=2.13.0,<3.0.0 requests>=2.13.0,<3.0.0

View File

@ -23,7 +23,7 @@ from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
from .. import about from .. import about
if TYPE_CHECKING: if TYPE_CHECKING:
from pathy import Pathy # noqa: F401 from cloudpathlib import CloudPath # noqa: F401
SDIST_SUFFIX = ".tar.gz" SDIST_SUFFIX = ".tar.gz"
@ -331,21 +331,25 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1) msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None: def upload_file(src: Path, dest: Union[str, Path, "CloudPath"]) -> None:
"""Upload a file. """Upload a file.
src (Path): The source path. src (Path): The source path.
url (str): The destination URL to upload to. url (str): The destination URL to upload to.
""" """
import smart_open # Create parent directories for local paths
if isinstance(dest, Path):
if not dest.parent.exists():
dest.parent.mkdir(parents=True)
dest = str(dest) with dest.open(mode="wb") as output_file:
with smart_open.open(dest, mode="wb") as output_file:
with src.open(mode="rb") as input_file: with src.open(mode="rb") as input_file:
output_file.write(input_file.read()) output_file.write(input_file.read())
def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) -> None: def download_file(
src: Union[str, Path, "CloudPath"], dest: Path, *, force: bool = False
) -> None:
"""Download a file using smart_open. """Download a file using smart_open.
url (str): The URL of the file. url (str): The URL of the file.
@ -353,22 +357,19 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
force (bool): Whether to force download even if file exists. force (bool): Whether to force download even if file exists.
If False, the download will be skipped. If False, the download will be skipped.
""" """
import smart_open
if dest.exists() and not force: if dest.exists() and not force:
return None return None
src = str(src) with src.open(mode="rb") as input_file:
with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
with dest.open(mode="wb") as output_file: with dest.open(mode="wb") as output_file:
shutil.copyfileobj(input_file, output_file) shutil.copyfileobj(input_file, output_file)
def ensure_pathy(path): def ensure_pathy(path):
"""Temporary helper to prevent importing Pathy globally (which can cause """Temporary helper to prevent importing globally (which can cause
slow and annoying Google Cloud warning).""" slow and annoying Google Cloud warning)."""
from pathy import Pathy # noqa: F811 from cloudpathlib import AnyPath # noqa: F811
return Pathy(path) return AnyPath(path)
def git_checkout( def git_checkout(

View File

@ -1,18 +1,20 @@
from typing import Optional, List, Dict, TYPE_CHECKING from typing import Optional, List, Dict, TYPE_CHECKING, Union
import os import os
import site import site
import hashlib import hashlib
import urllib.parse import urllib.parse
import tarfile import tarfile
import warnings
from pathlib import Path from pathlib import Path
from .._util import get_hash, get_checksum, download_file, ensure_pathy from .._util import get_hash, get_checksum, upload_file, download_file
from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var from .._util import ensure_pathy, make_tempdir
from ...util import get_minor_version, ENV_VARS, check_bool_env_var
from ...git_info import GIT_VERSION from ...git_info import GIT_VERSION
from ... import about from ... import about
if TYPE_CHECKING: if TYPE_CHECKING:
from pathy import Pathy # noqa: F401 from cloudpathlib import CloudPath # noqa: F401
class RemoteStorage: class RemoteStorage:
@ -27,7 +29,7 @@ class RemoteStorage:
self.url = ensure_pathy(url) self.url = ensure_pathy(url)
self.compression = compression self.compression = compression
def push(self, path: Path, command_hash: str, content_hash: str) -> "Pathy": def push(self, path: Path, command_hash: str, content_hash: str) -> Union[Path, "CloudPath"]:
"""Compress a file or directory within a project and upload it to a remote """Compress a file or directory within a project and upload it to a remote
storage. If an object exists at the full URL, nothing is done. storage. If an object exists at the full URL, nothing is done.
@ -48,9 +50,7 @@ class RemoteStorage:
mode_string = f"w:{self.compression}" if self.compression else "w" mode_string = f"w:{self.compression}" if self.compression else "w"
with tarfile.open(tar_loc, mode=mode_string) as tar_file: with tarfile.open(tar_loc, mode=mode_string) as tar_file:
tar_file.add(str(loc), arcname=str(path)) tar_file.add(str(loc), arcname=str(path))
with tar_loc.open(mode="rb") as input_file: upload_file(tar_loc, url)
with url.open(mode="wb") as output_file:
output_file.write(input_file.read())
return url return url
def pull( def pull(
@ -59,7 +59,7 @@ class RemoteStorage:
*, *,
command_hash: Optional[str] = None, command_hash: Optional[str] = None,
content_hash: Optional[str] = None, content_hash: Optional[str] = None,
) -> Optional["Pathy"]: ) -> Optional[Union[Path, "CloudPath"]]:
"""Retrieve a file from the remote cache. If the file already exists, """Retrieve a file from the remote cache. If the file already exists,
nothing is done. nothing is done.
@ -93,7 +93,7 @@ class RemoteStorage:
*, *,
command_hash: Optional[str] = None, command_hash: Optional[str] = None,
content_hash: Optional[str] = None, content_hash: Optional[str] = None,
) -> Optional["Pathy"]: ) -> Optional[Union[Path, "CloudPath"]]:
"""Find the best matching version of a file within the storage, """Find the best matching version of a file within the storage,
or `None` if no match can be found. If both the creation and content hash or `None` if no match can be found. If both the creation and content hash
are specified, only exact matches will be returned. Otherwise, the most are specified, only exact matches will be returned. Otherwise, the most
@ -106,12 +106,22 @@ class RemoteStorage:
elif command_hash is not None: elif command_hash is not None:
urls = list((self.url / name / command_hash).iterdir()) urls = list((self.url / name / command_hash).iterdir())
else: else:
urls = list((self.url / name).iterdir()) urls = []
for sub_dir in (self.url / name).iterdir():
urls.extend(sub_dir.iterdir())
if content_hash is not None: if content_hash is not None:
urls = [url for url in urls if url.parts[-1] == content_hash] urls = [url for url in urls if url.parts[-1] == content_hash]
if len(urls) > 1:
try:
urls.sort(key=lambda x: x.stat().st_mtime) # type: ignore
except Exception:
warnings.warn(
"Unable to sort remote files by last modified. The file(s) "
"pulled from the cache may not be the most recent."
)
return urls[-1] if urls else None return urls[-1] if urls else None
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "Pathy": def make_url(self, path: Path, command_hash: str, content_hash: str) -> Union[Path, "CloudPath"]:
"""Construct a URL from a subpath, a creation hash and a content hash.""" """Construct a URL from a subpath, a creation hash and a content hash."""
return self.url / self.encode_name(str(path)) / command_hash / content_hash return self.url / self.encode_name(str(path)) / command_hash / content_hash

View File

@ -25,6 +25,7 @@ from spacy.cli.download import get_compatibility, get_version
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
from spacy.cli.package import get_third_party_dependencies from spacy.cli.package import get_third_party_dependencies
from spacy.cli.package import _is_permitted_package_name from spacy.cli.package import _is_permitted_package_name
from spacy.cli.project.remote_storage import RemoteStorage
from spacy.cli.validate import get_model_pkgs from spacy.cli.validate import get_model_pkgs
from spacy.lang.en import English from spacy.lang.en import English
from spacy.lang.nl import Dutch from spacy.lang.nl import Dutch
@ -855,3 +856,42 @@ def test_span_length_freq_dist_output_must_be_correct():
span_freqs = _get_spans_length_freq_dist(sample_span_lengths, threshold) span_freqs = _get_spans_length_freq_dist(sample_span_lengths, threshold)
assert sum(span_freqs.values()) >= threshold assert sum(span_freqs.values()) >= threshold
assert list(span_freqs.keys()) == [3, 1, 4, 5, 2] assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]
def test_local_remote_storage():
    """Round-trip a file through a RemoteStorage that points at a local directory.

    The file is pushed once and then pulled back four times, each time using a
    different combination of command hash and content hash.
    """
    name = "a.txt"
    expected = "a"
    # Keyword arguments for each pull, in the order they are exercised.
    pull_kwargs = [
        {"command_hash": "aaaa", "content_hash": "bbbb"},  # full hashes
        {"command_hash": "aaaa"},  # command hash only
        {"content_hash": "bbbb"},  # content hash only
        {},  # no hashes
    ]
    with make_tempdir() as tmp_dir:
        root = tmp_dir / "root"
        local_path = root / name
        root.mkdir(parents=True)
        with local_path.open(mode="w") as handle:
            handle.write(expected)
        # push to remote storage
        storage = RemoteStorage(root, str(tmp_dir / "remote"))
        storage.push(name, "aaaa", "bbbb")
        for kwargs in pull_kwargs:
            # Remove the local copy before each pull; the assertion below then
            # checks that the file is back with its original content.
            local_path.unlink()
            storage.pull(name, **kwargs)
            with local_path.open(mode="r") as handle:
                assert handle.read() == expected