diff --git a/requirements.txt b/requirements.txt index 9d6bbb2c4..bf96cbc54 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ wasabi>=0.9.1,<1.1.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 typer>=0.3.0,<0.5.0 -pathy>=0.3.5 +cloudpathlib>=0.7.0,<0.11.0 # Third party dependencies numpy>=1.15.0 requests>=2.13.0,<3.0.0 diff --git a/setup.cfg b/setup.cfg index c2653feba..282aeec59 100644 --- a/setup.cfg +++ b/setup.cfg @@ -52,7 +52,7 @@ install_requires = catalogue>=2.0.6,<2.1.0 # Third-party dependencies typer>=0.3.0,<0.5.0 - pathy>=0.3.5 + cloudpathlib>=0.7.0,<0.11.0 tqdm>=4.38.0,<5.0.0 numpy>=1.15.0 requests>=2.13.0,<3.0.0 diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 897964a88..d0e118b36 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -23,7 +23,7 @@ from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS from .. import about if TYPE_CHECKING: - from pathy import Pathy # noqa: F401 + from cloudpathlib import CloudPath # noqa: F401 SDIST_SUFFIX = ".tar.gz" @@ -331,21 +331,25 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None: msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1) -def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None: +def upload_file(src: Path, dest: Union[str, Path, "CloudPath"]) -> None: """Upload a file. src (Path): The source path. url (str): The destination URL to upload to. """ - import smart_open + # Create parent directories for local paths + if isinstance(dest, Path): + if not dest.parent.exists(): + dest.parent.mkdir(parents=True) - dest = str(dest) - with smart_open.open(dest, mode="wb") as output_file: + with dest.open(mode="wb") as output_file: with src.open(mode="rb") as input_file: output_file.write(input_file.read()) -def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) -> None: +def download_file( + src: Union[str, Path, "CloudPath"], dest: Path, *, force: bool = False +) -> None: """Download a file using smart_open. url (str): The URL of the file. @@ -353,22 +357,19 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) force (bool): Whether to force download even if file exists. If False, the download will be skipped. """ - import smart_open - if dest.exists() and not force: return None - src = str(src) - with smart_open.open(src, mode="rb", ignore_ext=True) as input_file: + with src.open(mode="rb") as input_file: with dest.open(mode="wb") as output_file: shutil.copyfileobj(input_file, output_file) def ensure_pathy(path): - """Temporary helper to prevent importing Pathy globally (which can cause + """Temporary helper to prevent importing globally (which can cause slow and annoying Google Cloud warning).""" - from pathy import Pathy # noqa: F811 + from cloudpathlib import AnyPath # noqa: F811 - return Pathy(path) + return AnyPath(path) def git_checkout( diff --git a/spacy/cli/project/remote_storage.py b/spacy/cli/project/remote_storage.py index 336a4bcb3..a76461b83 100644 --- a/spacy/cli/project/remote_storage.py +++ b/spacy/cli/project/remote_storage.py @@ -1,18 +1,20 @@ -from typing import Optional, List, Dict, TYPE_CHECKING +from typing import Optional, List, Dict, TYPE_CHECKING, Union import os import site import hashlib import urllib.parse import tarfile +import warnings from pathlib import Path -from .._util import get_hash, get_checksum, download_file, ensure_pathy -from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var +from .._util import get_hash, get_checksum, upload_file, download_file +from .._util import ensure_pathy, make_tempdir +from ...util import get_minor_version, ENV_VARS, check_bool_env_var from ...git_info import GIT_VERSION from ... import about if TYPE_CHECKING: - from pathy import Pathy # noqa: F401 + from cloudpathlib import CloudPath # noqa: F401 class RemoteStorage: @@ -27,7 +29,7 @@ class RemoteStorage: self.url = ensure_pathy(url) self.compression = compression - def push(self, path: Path, command_hash: str, content_hash: str) -> "Pathy": + def push(self, path: Path, command_hash: str, content_hash: str) -> Union[Path, "CloudPath"]: """Compress a file or directory within a project and upload it to a remote storage. If an object exists at the full URL, nothing is done. @@ -48,9 +50,7 @@ class RemoteStorage: mode_string = f"w:{self.compression}" if self.compression else "w" with tarfile.open(tar_loc, mode=mode_string) as tar_file: tar_file.add(str(loc), arcname=str(path)) - with tar_loc.open(mode="rb") as input_file: - with url.open(mode="wb") as output_file: - output_file.write(input_file.read()) + upload_file(tar_loc, url) return url def pull( @@ -59,7 +59,7 @@ class RemoteStorage: *, command_hash: Optional[str] = None, content_hash: Optional[str] = None, - ) -> Optional["Pathy"]: + ) -> Optional[Union[Path, "CloudPath"]]: """Retrieve a file from the remote cache. If the file already exists, nothing is done. @@ -93,7 +93,7 @@ class RemoteStorage: *, command_hash: Optional[str] = None, content_hash: Optional[str] = None, - ) -> Optional["Pathy"]: + ) -> Optional[Union[Path, "CloudPath"]]: """Find the best matching version of a file within the storage, or `None` if no match can be found. If both the creation and content hash are specified, only exact matches will be returned. Otherwise, the most @@ -106,12 +106,22 @@ class RemoteStorage: elif command_hash is not None: urls = list((self.url / name / command_hash).iterdir()) else: - urls = list((self.url / name).iterdir()) + urls = [] + for sub_dir in (self.url / name).iterdir(): + urls.extend(sub_dir.iterdir()) if content_hash is not None: urls = [url for url in urls if url.parts[-1] == content_hash] + if len(urls) > 1: + try: + urls.sort(key=lambda x: x.stat().st_mtime) # type: ignore + except Exception: + warnings.warn( + "Unable to sort remote files by last modified. The file(s) " + "pulled from the cache may not be the most recent." + ) return urls[-1] if urls else None - def make_url(self, path: Path, command_hash: str, content_hash: str) -> "Pathy": + def make_url(self, path: Path, command_hash: str, content_hash: str) -> Union[Path, "CloudPath"]: """Construct a URL from a subpath, a creation hash and a content hash.""" return self.url / self.encode_name(str(path)) / command_hash / content_hash diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 838e00369..3aa4b23fc 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -25,6 +25,7 @@ from spacy.cli.download import get_compatibility, get_version from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config from spacy.cli.package import get_third_party_dependencies from spacy.cli.package import _is_permitted_package_name +from spacy.cli.project.remote_storage import RemoteStorage from spacy.cli.validate import get_model_pkgs from spacy.lang.en import English from spacy.lang.nl import Dutch @@ -855,3 +856,42 @@ def test_span_length_freq_dist_output_must_be_correct(): span_freqs = _get_spans_length_freq_dist(sample_span_lengths, threshold) assert sum(span_freqs.values()) >= threshold assert list(span_freqs.keys()) == [3, 1, 4, 5, 2] + + +def test_local_remote_storage(): + with make_tempdir() as d: + filename = "a.txt" + content = "a" + + loc_file = d / "root" / filename + loc_file.parent.mkdir(parents=True) + with loc_file.open(mode="w") as file_: + file_.write(content) + + # push to remote storage + remote = RemoteStorage(d / "root", str(d / "remote")) + remote.push(filename, "aaaa", "bbbb") + + # retrieve with full hashes + loc_file.unlink() + remote.pull(filename, command_hash="aaaa", content_hash="bbbb") + with loc_file.open(mode="r") as file_: + assert file_.read() == content + + # retrieve with command hash + loc_file.unlink() + remote.pull(filename, command_hash="aaaa") + with loc_file.open(mode="r") as file_: + assert file_.read() == content + + # retrieve with content hash + loc_file.unlink() + remote.pull(filename, content_hash="bbbb") + with loc_file.open(mode="r") as file_: + assert file_.read() == content + + # retrieve with no hashes + loc_file.unlink() + remote.pull(filename) + with loc_file.open(mode="r") as file_: + assert file_.read() == content