Update pathy for general BlobStat sorting

This commit is contained in:
Adriane Boyd 2022-11-24 10:59:59 +01:00
parent a4f2b16a1b
commit 2d7ef2f012
4 changed files with 68 additions and 43 deletions

View File

@ -10,7 +10,7 @@ wasabi>=0.9.1,<1.1.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.8.0
pathy>=0.6.0
pathy>=0.10.0
smart-open>=5.2.1,<7.0.0
# Third party dependencies
numpy>=1.15.0

View File

@ -52,7 +52,7 @@ install_requires =
catalogue>=2.0.6,<2.1.0
# Third-party dependencies
typer>=0.3.0,<0.8.0
pathy>=0.6.0
pathy>=0.10.0
smart-open>=5.2.1,<7.0.0
tqdm>=4.38.0,<5.0.0
numpy>=1.15.0

View File

@ -116,18 +116,27 @@ class RemoteStorage:
recent matching file is preferred.
"""
name = self.encode_name(str(path))
urls = []
if command_hash is not None and content_hash is not None:
url = self.make_url(path, command_hash, content_hash)
urls = [url] if _file_exists(url) else []
url = self.url / name / command_hash / content_hash
urls = [url] if url.exists() else []
elif command_hash is not None:
urls = list((self.url / name / command_hash).iterdir())
if (self.url / name / command_hash).exists():
urls = list((self.url / name / command_hash).iterdir())
else:
urls = []
for command_hash_dir in (self.url / name).iterdir():
urls.extend(command_hash_dir.iterdir())
if content_hash is not None:
urls = [url for url in urls if url.parts[-1] == content_hash]
# TODO: URLs should be sorted by last modified
if (self.url / name).exists():
for sub_dir in (self.url / name).iterdir():
urls.extend(sub_dir.iterdir())
if content_hash is not None:
urls = [url for url in urls if url.parts[-1] == content_hash]
if len(urls) >= 2:
try:
urls.sort(key=lambda x: x.stat().last_modified) # type: ignore
except Exception:
msg.warn(
"Unable to sort remote files by last modified. The file(s) "
"pulled from the cache may not be the most recent."
)
return urls[-1] if urls else None
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":

View File

@ -3,6 +3,7 @@ import math
import pkg_resources
from random import sample
from typing import Counter
import time
import pytest
import srsly
@ -863,40 +864,55 @@ def test_span_length_freq_dist_output_must_be_correct():
def test_local_remote_storage():
with make_tempdir() as d:
filename = "a.txt"
content = "a"
loc_file = d / "root" / filename
loc_file.parent.mkdir(parents=True)
with loc_file.open(mode="w") as file_:
file_.write(content)
content_hashes = ("aaaa", "cccc", "bbbb")
for i, content_hash in enumerate(content_hashes):
# make sure that each subsequent file has a later timestamp
if i > 0:
time.sleep(1)
content = f"{content_hash} content"
loc_file = d / "root" / filename
if not loc_file.parent.exists():
loc_file.parent.mkdir(parents=True)
with loc_file.open(mode="w") as file_:
file_.write(content)
# push to remote storage
# push first version to remote storage
remote = RemoteStorage(d / "root", str(d / "remote"))
remote.push(filename, "aaaa", content_hash)
# retrieve with full hashes
loc_file.unlink()
remote.pull(filename, command_hash="aaaa", content_hash=content_hash)
with loc_file.open(mode="r") as file_:
assert file_.read() == content
# retrieve with command hash
loc_file.unlink()
remote.pull(filename, command_hash="aaaa")
with loc_file.open(mode="r") as file_:
assert file_.read() == content
# retrieve with content hash
loc_file.unlink()
remote.pull(filename, content_hash=content_hash)
with loc_file.open(mode="r") as file_:
assert file_.read() == content
# retrieve with no hashes
loc_file.unlink()
remote.pull(filename)
with loc_file.open(mode="r") as file_:
assert file_.read() == content
def test_local_remote_storage_pull_missing():
# pulling from a non-existent remote pulls nothing gracefully
with make_tempdir() as d:
filename = "a.txt"
remote = RemoteStorage(d / "root", str(d / "remote"))
remote.push(filename, "aaaa", "bbbb")
# retrieve with full hashes
loc_file.unlink()
remote.pull(filename, command_hash="aaaa", content_hash="bbbb")
with loc_file.open(mode="r") as file_:
assert file_.read() == content
# retrieve with command hash
loc_file.unlink()
remote.pull(filename, command_hash="aaaa")
with loc_file.open(mode="r") as file_:
assert file_.read() == content
# retrieve with content hash
loc_file.unlink()
remote.pull(filename, content_hash="bbbb")
with loc_file.open(mode="r") as file_:
assert file_.read() == content
# retrieve with no hashes
loc_file.unlink()
remote.pull(filename)
with loc_file.open(mode="r") as file_:
assert file_.read() == content
assert remote.pull(filename, command_hash="aaaa") is None
assert remote.pull(filename) is None
@pytest.mark.parametrize(
@ -935,4 +951,4 @@ def test_project_check_requirements(reqs, output):
try:
pkg_resources.require("spacyunknowndoesnotexist12345")
except pkg_resources.DistributionNotFound:
assert output == _check_requirements([req.strip() for req in reqs.split("\n")])
assert output == _check_requirements([req.strip() for req in reqs.split("\n")])