mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-06 05:10:21 +03:00
Update pathy for general BlobStat sorting
This commit is contained in:
parent
a4f2b16a1b
commit
2d7ef2f012
|
@ -10,7 +10,7 @@ wasabi>=0.9.1,<1.1.0
|
|||
srsly>=2.4.3,<3.0.0
|
||||
catalogue>=2.0.6,<2.1.0
|
||||
typer>=0.3.0,<0.8.0
|
||||
pathy>=0.6.0
|
||||
pathy>=0.10.0
|
||||
smart-open>=5.2.1,<7.0.0
|
||||
# Third party dependencies
|
||||
numpy>=1.15.0
|
||||
|
|
|
@ -52,7 +52,7 @@ install_requires =
|
|||
catalogue>=2.0.6,<2.1.0
|
||||
# Third-party dependencies
|
||||
typer>=0.3.0,<0.8.0
|
||||
pathy>=0.6.0
|
||||
pathy>=0.10.0
|
||||
smart-open>=5.2.1,<7.0.0
|
||||
tqdm>=4.38.0,<5.0.0
|
||||
numpy>=1.15.0
|
||||
|
|
|
@ -116,18 +116,27 @@ class RemoteStorage:
|
|||
recent matching file is preferred.
|
||||
"""
|
||||
name = self.encode_name(str(path))
|
||||
urls = []
|
||||
if command_hash is not None and content_hash is not None:
|
||||
url = self.make_url(path, command_hash, content_hash)
|
||||
urls = [url] if _file_exists(url) else []
|
||||
url = self.url / name / command_hash / content_hash
|
||||
urls = [url] if url.exists() else []
|
||||
elif command_hash is not None:
|
||||
urls = list((self.url / name / command_hash).iterdir())
|
||||
if (self.url / name / command_hash).exists():
|
||||
urls = list((self.url / name / command_hash).iterdir())
|
||||
else:
|
||||
urls = []
|
||||
for command_hash_dir in (self.url / name).iterdir():
|
||||
urls.extend(command_hash_dir.iterdir())
|
||||
if content_hash is not None:
|
||||
urls = [url for url in urls if url.parts[-1] == content_hash]
|
||||
# TODO: URLs should be sorted by last modified
|
||||
if (self.url / name).exists():
|
||||
for sub_dir in (self.url / name).iterdir():
|
||||
urls.extend(sub_dir.iterdir())
|
||||
if content_hash is not None:
|
||||
urls = [url for url in urls if url.parts[-1] == content_hash]
|
||||
if len(urls) >= 2:
|
||||
try:
|
||||
urls.sort(key=lambda x: x.stat().last_modified) # type: ignore
|
||||
except Exception:
|
||||
msg.warn(
|
||||
"Unable to sort remote files by last modified. The file(s) "
|
||||
"pulled from the cache may not be the most recent."
|
||||
)
|
||||
return urls[-1] if urls else None
|
||||
|
||||
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
|
||||
|
|
|
@ -3,6 +3,7 @@ import math
|
|||
import pkg_resources
|
||||
from random import sample
|
||||
from typing import Counter
|
||||
import time
|
||||
|
||||
import pytest
|
||||
import srsly
|
||||
|
@ -863,40 +864,55 @@ def test_span_length_freq_dist_output_must_be_correct():
|
|||
def test_local_remote_storage():
|
||||
with make_tempdir() as d:
|
||||
filename = "a.txt"
|
||||
content = "a"
|
||||
|
||||
loc_file = d / "root" / filename
|
||||
loc_file.parent.mkdir(parents=True)
|
||||
with loc_file.open(mode="w") as file_:
|
||||
file_.write(content)
|
||||
content_hashes = ("aaaa", "cccc", "bbbb")
|
||||
for i, content_hash in enumerate(content_hashes):
|
||||
# make sure that each subsequent file has a later timestamp
|
||||
if i > 0:
|
||||
time.sleep(1)
|
||||
content = f"{content_hash} content"
|
||||
loc_file = d / "root" / filename
|
||||
if not loc_file.parent.exists():
|
||||
loc_file.parent.mkdir(parents=True)
|
||||
with loc_file.open(mode="w") as file_:
|
||||
file_.write(content)
|
||||
|
||||
# push to remote storage
|
||||
# push first version to remote storage
|
||||
remote = RemoteStorage(d / "root", str(d / "remote"))
|
||||
remote.push(filename, "aaaa", content_hash)
|
||||
|
||||
# retrieve with full hashes
|
||||
loc_file.unlink()
|
||||
remote.pull(filename, command_hash="aaaa", content_hash=content_hash)
|
||||
with loc_file.open(mode="r") as file_:
|
||||
assert file_.read() == content
|
||||
|
||||
# retrieve with command hash
|
||||
loc_file.unlink()
|
||||
remote.pull(filename, command_hash="aaaa")
|
||||
with loc_file.open(mode="r") as file_:
|
||||
assert file_.read() == content
|
||||
|
||||
# retrieve with content hash
|
||||
loc_file.unlink()
|
||||
remote.pull(filename, content_hash=content_hash)
|
||||
with loc_file.open(mode="r") as file_:
|
||||
assert file_.read() == content
|
||||
|
||||
# retrieve with no hashes
|
||||
loc_file.unlink()
|
||||
remote.pull(filename)
|
||||
with loc_file.open(mode="r") as file_:
|
||||
assert file_.read() == content
|
||||
|
||||
|
||||
def test_local_remote_storage_pull_missing():
|
||||
# pulling from a non-existent remote pulls nothing gracefully
|
||||
with make_tempdir() as d:
|
||||
filename = "a.txt"
|
||||
remote = RemoteStorage(d / "root", str(d / "remote"))
|
||||
remote.push(filename, "aaaa", "bbbb")
|
||||
|
||||
# retrieve with full hashes
|
||||
loc_file.unlink()
|
||||
remote.pull(filename, command_hash="aaaa", content_hash="bbbb")
|
||||
with loc_file.open(mode="r") as file_:
|
||||
assert file_.read() == content
|
||||
|
||||
# retrieve with command hash
|
||||
loc_file.unlink()
|
||||
remote.pull(filename, command_hash="aaaa")
|
||||
with loc_file.open(mode="r") as file_:
|
||||
assert file_.read() == content
|
||||
|
||||
# retrieve with content hash
|
||||
loc_file.unlink()
|
||||
remote.pull(filename, content_hash="bbbb")
|
||||
with loc_file.open(mode="r") as file_:
|
||||
assert file_.read() == content
|
||||
|
||||
# retrieve with no hashes
|
||||
loc_file.unlink()
|
||||
remote.pull(filename)
|
||||
with loc_file.open(mode="r") as file_:
|
||||
assert file_.read() == content
|
||||
assert remote.pull(filename, command_hash="aaaa") is None
|
||||
assert remote.pull(filename) is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
|
Loading…
Reference in New Issue
Block a user