Update pathy for general BlobStat sorting

Adriane Boyd 2022-11-24 10:59:59 +01:00
parent a4f2b16a1b
commit 2d7ef2f012
4 changed files with 68 additions and 43 deletions
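Context for the dependency bump in the two files below: the commit relies on pathy>=0.10, where .stat() on a path returned by Pathy.fluid() is expected to expose a BlobStat-style last_modified value for local directories as well as cloud buckets, so candidates from either backend can share one sort key. A minimal sketch of that idea, under that assumption (the helper name newest_blob is made up for illustration, not part of the commit):

# Illustrative sketch only, not from the commit. Assumes pathy>=0.10, where
# .stat() on a Pathy.fluid() path exposes .last_modified for both local
# directories and cloud buckets.
from pathy import Pathy


def newest_blob(urls):
    # Hypothetical helper: keep candidates that exist, return the most
    # recently modified one (or None if nothing matches).
    candidates = [p for p in (Pathy.fluid(u) for u in urls) if p.exists()]
    if not candidates:
        return None
    candidates.sort(key=lambda p: p.stat().last_modified)
    return candidates[-1]

If .stat() does not provide last_modified for some backend, the commit's own code in RemoteStorage.find() catches the failure and warns instead of erroring, as shown further down.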


@@ -10,7 +10,7 @@ wasabi>=0.9.1,<1.1.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
 typer>=0.3.0,<0.8.0
-pathy>=0.6.0
+pathy>=0.10.0
 smart-open>=5.2.1,<7.0.0
 # Third party dependencies
 numpy>=1.15.0


@@ -52,7 +52,7 @@ install_requires =
     catalogue>=2.0.6,<2.1.0
     # Third-party dependencies
     typer>=0.3.0,<0.8.0
-    pathy>=0.6.0
+    pathy>=0.10.0
     smart-open>=5.2.1,<7.0.0
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0


@@ -116,18 +116,27 @@ class RemoteStorage:
         recent matching file is preferred.
         """
         name = self.encode_name(str(path))
+        urls = []
         if command_hash is not None and content_hash is not None:
-            url = self.make_url(path, command_hash, content_hash)
-            urls = [url] if _file_exists(url) else []
+            url = self.url / name / command_hash / content_hash
+            urls = [url] if url.exists() else []
         elif command_hash is not None:
-            urls = list((self.url / name / command_hash).iterdir())
+            if (self.url / name / command_hash).exists():
+                urls = list((self.url / name / command_hash).iterdir())
         else:
-            urls = []
-            for command_hash_dir in (self.url / name).iterdir():
-                urls.extend(command_hash_dir.iterdir())
+            if (self.url / name).exists():
+                for sub_dir in (self.url / name).iterdir():
+                    urls.extend(sub_dir.iterdir())
             if content_hash is not None:
                 urls = [url for url in urls if url.parts[-1] == content_hash]
-        # TODO: URLs should be sorted by last modified
+        if len(urls) >= 2:
+            try:
+                urls.sort(key=lambda x: x.stat().last_modified)  # type: ignore
+            except Exception:
+                msg.warn(
+                    "Unable to sort remote files by last modified. The file(s) "
+                    "pulled from the cache may not be the most recent."
+                )
         return urls[-1] if urls else None

     def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
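Beyond the sort itself, the find() diff above adds .exists() guards before every .iterdir() call, so querying a remote that has never been pushed to yields None instead of raising, and it only sorts when there are at least two candidates, warning and keeping the unsorted order if the stat call fails. As a rough local-filesystem analogue of that selection logic (plain pathlib and st_mtime standing in for pathy's BlobStat; all names here are hypothetical):

# Hypothetical sketch, not from the commit: the same guard-then-sort pattern
# on a plain local directory tree, using os.stat_result.st_mtime as the key.
from pathlib import Path
from typing import Optional


def find_newest(root: Path, name: str, command_hash: Optional[str] = None) -> Optional[Path]:
    candidates = []
    if command_hash is not None:
        subdir = root / name / command_hash
        if subdir.exists():  # guard before iterating, as in find()
            candidates = list(subdir.iterdir())
    elif (root / name).exists():
        for command_dir in (root / name).iterdir():
            candidates.extend(command_dir.iterdir())
    if len(candidates) >= 2:
        try:
            candidates.sort(key=lambda p: p.stat().st_mtime)
        except OSError:
            pass  # fall back to the unsorted order instead of failing
    return candidates[-1] if candidates else None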


@@ -3,6 +3,7 @@ import math
 import pkg_resources
 from random import sample
 from typing import Counter
+import time

 import pytest
 import srsly
@@ -863,40 +864,55 @@ def test_span_length_freq_dist_output_must_be_correct():
 def test_local_remote_storage():
     with make_tempdir() as d:
         filename = "a.txt"

-        content = "a"
-        loc_file = d / "root" / filename
-        loc_file.parent.mkdir(parents=True)
-        with loc_file.open(mode="w") as file_:
-            file_.write(content)
-
-        # push to remote storage
-        remote = RemoteStorage(d / "root", str(d / "remote"))
-        remote.push(filename, "aaaa", "bbbb")
-
-        # retrieve with full hashes
-        loc_file.unlink()
-        remote.pull(filename, command_hash="aaaa", content_hash="bbbb")
-        with loc_file.open(mode="r") as file_:
-            assert file_.read() == content
-
-        # retrieve with command hash
-        loc_file.unlink()
-        remote.pull(filename, command_hash="aaaa")
-        with loc_file.open(mode="r") as file_:
-            assert file_.read() == content
-
-        # retrieve with content hash
-        loc_file.unlink()
-        remote.pull(filename, content_hash="bbbb")
-        with loc_file.open(mode="r") as file_:
-            assert file_.read() == content
-
-        # retrieve with no hashes
-        loc_file.unlink()
-        remote.pull(filename)
-        with loc_file.open(mode="r") as file_:
-            assert file_.read() == content
+        content_hashes = ("aaaa", "cccc", "bbbb")
+        for i, content_hash in enumerate(content_hashes):
+            # make sure that each subsequent file has a later timestamp
+            if i > 0:
+                time.sleep(1)
+            content = f"{content_hash} content"
+            loc_file = d / "root" / filename
+            if not loc_file.parent.exists():
+                loc_file.parent.mkdir(parents=True)
+            with loc_file.open(mode="w") as file_:
+                file_.write(content)
+
+            # push first version to remote storage
+            remote = RemoteStorage(d / "root", str(d / "remote"))
+            remote.push(filename, "aaaa", content_hash)
+
+            # retrieve with full hashes
+            loc_file.unlink()
+            remote.pull(filename, command_hash="aaaa", content_hash=content_hash)
+            with loc_file.open(mode="r") as file_:
+                assert file_.read() == content
+
+            # retrieve with command hash
+            loc_file.unlink()
+            remote.pull(filename, command_hash="aaaa")
+            with loc_file.open(mode="r") as file_:
+                assert file_.read() == content
+
+            # retrieve with content hash
+            loc_file.unlink()
+            remote.pull(filename, content_hash=content_hash)
+            with loc_file.open(mode="r") as file_:
+                assert file_.read() == content
+
+            # retrieve with no hashes
+            loc_file.unlink()
+            remote.pull(filename)
+            with loc_file.open(mode="r") as file_:
+                assert file_.read() == content
+
+
+def test_local_remote_storage_pull_missing():
+    # pulling from a non-existent remote pulls nothing gracefully
+    with make_tempdir() as d:
+        filename = "a.txt"
+        remote = RemoteStorage(d / "root", str(d / "remote"))
+        assert remote.pull(filename, command_hash="aaaa") is None
+        assert remote.pull(filename) is None


 @pytest.mark.parametrize(