Patch summary (informational text ahead of the first "diff --git" header; it
is not part of any hunk):
  * requirements.txt, setup.cfg: raise the minimum pathy version from 0.6.0
    to 0.10.0.
  * spacy/cli/project/remote_storage.py (class RemoteStorage, hunk at line
    116): check that remote paths exist before iterating them, and sort the
    candidate URLs by `stat().last_modified` when there are two or more,
    emitting a warning if the sort fails.
  * spacy/tests/test_cli.py: import `time`; test_local_remote_storage now
    pushes three content hashes with a one-second sleep between them; add
    test_local_remote_storage_pull_missing; add the missing final newline.
NOTE(review): line breaks, indentation and blank context lines were
reconstructed from a whitespace-collapsed copy so that every hunk matches its
"@@" line counts. The blank context line after `-content = "a"` is an assumed
position - confirm against the upstream commit before applying.

diff --git a/requirements.txt b/requirements.txt
index 3cbd3bd9c..778c05e21 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,7 +10,7 @@ wasabi>=0.9.1,<1.1.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
 typer>=0.3.0,<0.8.0
-pathy>=0.6.0
+pathy>=0.10.0
 smart-open>=5.2.1,<7.0.0
 # Third party dependencies
 numpy>=1.15.0
diff --git a/setup.cfg b/setup.cfg
index fba923a06..5768c9d3e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -52,7 +52,7 @@ install_requires =
     catalogue>=2.0.6,<2.1.0
     # Third-party dependencies
     typer>=0.3.0,<0.8.0
-    pathy>=0.6.0
+    pathy>=0.10.0
     smart-open>=5.2.1,<7.0.0
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
diff --git a/spacy/cli/project/remote_storage.py b/spacy/cli/project/remote_storage.py
index 14b7ad868..d08ea97f2 100644
--- a/spacy/cli/project/remote_storage.py
+++ b/spacy/cli/project/remote_storage.py
@@ -116,18 +116,27 @@ class RemoteStorage:
         recent matching file is preferred.
         """
         name = self.encode_name(str(path))
+        urls = []
         if command_hash is not None and content_hash is not None:
-            url = self.make_url(path, command_hash, content_hash)
-            urls = [url] if _file_exists(url) else []
+            url = self.url / name / command_hash / content_hash
+            urls = [url] if url.exists() else []
         elif command_hash is not None:
-            urls = list((self.url / name / command_hash).iterdir())
+            if (self.url / name / command_hash).exists():
+                urls = list((self.url / name / command_hash).iterdir())
         else:
-            urls = []
-            for command_hash_dir in (self.url / name).iterdir():
-                urls.extend(command_hash_dir.iterdir())
-            if content_hash is not None:
-                urls = [url for url in urls if url.parts[-1] == content_hash]
-        # TODO: URLs should be sorted by last modified
+            if (self.url / name).exists():
+                for sub_dir in (self.url / name).iterdir():
+                    urls.extend(sub_dir.iterdir())
+                if content_hash is not None:
+                    urls = [url for url in urls if url.parts[-1] == content_hash]
+        if len(urls) >= 2:
+            try:
+                urls.sort(key=lambda x: x.stat().last_modified)  # type: ignore
+            except Exception:
+                msg.warn(
+                    "Unable to sort remote files by last modified. The file(s) "
+                    "pulled from the cache may not be the most recent."
+                )
         return urls[-1] if urls else None
 
     def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 698e03b2f..23fca59d2 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -3,6 +3,7 @@ import math
 import pkg_resources
 from random import sample
 from typing import Counter
+import time
 
 import pytest
 import srsly
@@ -863,40 +864,55 @@ def test_span_length_freq_dist_output_must_be_correct():
 def test_local_remote_storage():
     with make_tempdir() as d:
         filename = "a.txt"
-        content = "a"
 
-        loc_file = d / "root" / filename
-        loc_file.parent.mkdir(parents=True)
-        with loc_file.open(mode="w") as file_:
-            file_.write(content)
+        content_hashes = ("aaaa", "cccc", "bbbb")
+        for i, content_hash in enumerate(content_hashes):
+            # make sure that each subsequent file has a later timestamp
+            if i > 0:
+                time.sleep(1)
+            content = f"{content_hash} content"
+            loc_file = d / "root" / filename
+            if not loc_file.parent.exists():
+                loc_file.parent.mkdir(parents=True)
+            with loc_file.open(mode="w") as file_:
+                file_.write(content)
 
-        # push to remote storage
+            # push first version to remote storage
+            remote = RemoteStorage(d / "root", str(d / "remote"))
+            remote.push(filename, "aaaa", content_hash)
+
+            # retrieve with full hashes
+            loc_file.unlink()
+            remote.pull(filename, command_hash="aaaa", content_hash=content_hash)
+            with loc_file.open(mode="r") as file_:
+                assert file_.read() == content
+
+            # retrieve with command hash
+            loc_file.unlink()
+            remote.pull(filename, command_hash="aaaa")
+            with loc_file.open(mode="r") as file_:
+                assert file_.read() == content
+
+            # retrieve with content hash
+            loc_file.unlink()
+            remote.pull(filename, content_hash=content_hash)
+            with loc_file.open(mode="r") as file_:
+                assert file_.read() == content
+
+            # retrieve with no hashes
+            loc_file.unlink()
+            remote.pull(filename)
+            with loc_file.open(mode="r") as file_:
+                assert file_.read() == content
+
+
+def test_local_remote_storage_pull_missing():
+    # pulling from a non-existent remote pulls nothing gracefully
+    with make_tempdir() as d:
+        filename = "a.txt"
         remote = RemoteStorage(d / "root", str(d / "remote"))
-        remote.push(filename, "aaaa", "bbbb")
-
-        # retrieve with full hashes
-        loc_file.unlink()
-        remote.pull(filename, command_hash="aaaa", content_hash="bbbb")
-        with loc_file.open(mode="r") as file_:
-            assert file_.read() == content
-
-        # retrieve with command hash
-        loc_file.unlink()
-        remote.pull(filename, command_hash="aaaa")
-        with loc_file.open(mode="r") as file_:
-            assert file_.read() == content
-
-        # retrieve with content hash
-        loc_file.unlink()
-        remote.pull(filename, content_hash="bbbb")
-        with loc_file.open(mode="r") as file_:
-            assert file_.read() == content
-
-        # retrieve with no hashes
-        loc_file.unlink()
-        remote.pull(filename)
-        with loc_file.open(mode="r") as file_:
-            assert file_.read() == content
+        assert remote.pull(filename, command_hash="aaaa") is None
+        assert remote.pull(filename) is None
 
 
 @pytest.mark.parametrize(
@@ -935,4 +951,4 @@ def test_project_check_requirements(reqs, output):
     try:
         pkg_resources.require("spacyunknowndoesnotexist12345")
     except pkg_resources.DistributionNotFound:
-        assert output == _check_requirements([req.strip() for req in reqs.split("\n")])
\ No newline at end of file
+        assert output == _check_requirements([req.strip() for req in reqs.split("\n")])