mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-06 05:10:21 +03:00
Update pathy for general BlobStat sorting
This commit is contained in:
parent
a4f2b16a1b
commit
2d7ef2f012
|
@ -10,7 +10,7 @@ wasabi>=0.9.1,<1.1.0
|
||||||
srsly>=2.4.3,<3.0.0
|
srsly>=2.4.3,<3.0.0
|
||||||
catalogue>=2.0.6,<2.1.0
|
catalogue>=2.0.6,<2.1.0
|
||||||
typer>=0.3.0,<0.8.0
|
typer>=0.3.0,<0.8.0
|
||||||
pathy>=0.6.0
|
pathy>=0.10.0
|
||||||
smart-open>=5.2.1,<7.0.0
|
smart-open>=5.2.1,<7.0.0
|
||||||
# Third party dependencies
|
# Third party dependencies
|
||||||
numpy>=1.15.0
|
numpy>=1.15.0
|
||||||
|
|
|
@ -52,7 +52,7 @@ install_requires =
|
||||||
catalogue>=2.0.6,<2.1.0
|
catalogue>=2.0.6,<2.1.0
|
||||||
# Third-party dependencies
|
# Third-party dependencies
|
||||||
typer>=0.3.0,<0.8.0
|
typer>=0.3.0,<0.8.0
|
||||||
pathy>=0.6.0
|
pathy>=0.10.0
|
||||||
smart-open>=5.2.1,<7.0.0
|
smart-open>=5.2.1,<7.0.0
|
||||||
tqdm>=4.38.0,<5.0.0
|
tqdm>=4.38.0,<5.0.0
|
||||||
numpy>=1.15.0
|
numpy>=1.15.0
|
||||||
|
|
|
@ -116,18 +116,27 @@ class RemoteStorage:
|
||||||
recent matching file is preferred.
|
recent matching file is preferred.
|
||||||
"""
|
"""
|
||||||
name = self.encode_name(str(path))
|
name = self.encode_name(str(path))
|
||||||
|
urls = []
|
||||||
if command_hash is not None and content_hash is not None:
|
if command_hash is not None and content_hash is not None:
|
||||||
url = self.make_url(path, command_hash, content_hash)
|
url = self.url / name / command_hash / content_hash
|
||||||
urls = [url] if _file_exists(url) else []
|
urls = [url] if url.exists() else []
|
||||||
elif command_hash is not None:
|
elif command_hash is not None:
|
||||||
urls = list((self.url / name / command_hash).iterdir())
|
if (self.url / name / command_hash).exists():
|
||||||
|
urls = list((self.url / name / command_hash).iterdir())
|
||||||
else:
|
else:
|
||||||
urls = []
|
if (self.url / name).exists():
|
||||||
for command_hash_dir in (self.url / name).iterdir():
|
for sub_dir in (self.url / name).iterdir():
|
||||||
urls.extend(command_hash_dir.iterdir())
|
urls.extend(sub_dir.iterdir())
|
||||||
if content_hash is not None:
|
if content_hash is not None:
|
||||||
urls = [url for url in urls if url.parts[-1] == content_hash]
|
urls = [url for url in urls if url.parts[-1] == content_hash]
|
||||||
# TODO: URLs should be sorted by last modified
|
if len(urls) >= 2:
|
||||||
|
try:
|
||||||
|
urls.sort(key=lambda x: x.stat().last_modified) # type: ignore
|
||||||
|
except Exception:
|
||||||
|
msg.warn(
|
||||||
|
"Unable to sort remote files by last modified. The file(s) "
|
||||||
|
"pulled from the cache may not be the most recent."
|
||||||
|
)
|
||||||
return urls[-1] if urls else None
|
return urls[-1] if urls else None
|
||||||
|
|
||||||
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
|
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
|
||||||
|
|
|
@ -3,6 +3,7 @@ import math
|
||||||
import pkg_resources
|
import pkg_resources
|
||||||
from random import sample
|
from random import sample
|
||||||
from typing import Counter
|
from typing import Counter
|
||||||
|
import time
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import srsly
|
import srsly
|
||||||
|
@ -863,40 +864,55 @@ def test_span_length_freq_dist_output_must_be_correct():
|
||||||
def test_local_remote_storage():
|
def test_local_remote_storage():
|
||||||
with make_tempdir() as d:
|
with make_tempdir() as d:
|
||||||
filename = "a.txt"
|
filename = "a.txt"
|
||||||
content = "a"
|
|
||||||
|
|
||||||
loc_file = d / "root" / filename
|
content_hashes = ("aaaa", "cccc", "bbbb")
|
||||||
loc_file.parent.mkdir(parents=True)
|
for i, content_hash in enumerate(content_hashes):
|
||||||
with loc_file.open(mode="w") as file_:
|
# make sure that each subsequent file has a later timestamp
|
||||||
file_.write(content)
|
if i > 0:
|
||||||
|
time.sleep(1)
|
||||||
|
content = f"{content_hash} content"
|
||||||
|
loc_file = d / "root" / filename
|
||||||
|
if not loc_file.parent.exists():
|
||||||
|
loc_file.parent.mkdir(parents=True)
|
||||||
|
with loc_file.open(mode="w") as file_:
|
||||||
|
file_.write(content)
|
||||||
|
|
||||||
# push to remote storage
|
# push first version to remote storage
|
||||||
|
remote = RemoteStorage(d / "root", str(d / "remote"))
|
||||||
|
remote.push(filename, "aaaa", content_hash)
|
||||||
|
|
||||||
|
# retrieve with full hashes
|
||||||
|
loc_file.unlink()
|
||||||
|
remote.pull(filename, command_hash="aaaa", content_hash=content_hash)
|
||||||
|
with loc_file.open(mode="r") as file_:
|
||||||
|
assert file_.read() == content
|
||||||
|
|
||||||
|
# retrieve with command hash
|
||||||
|
loc_file.unlink()
|
||||||
|
remote.pull(filename, command_hash="aaaa")
|
||||||
|
with loc_file.open(mode="r") as file_:
|
||||||
|
assert file_.read() == content
|
||||||
|
|
||||||
|
# retrieve with content hash
|
||||||
|
loc_file.unlink()
|
||||||
|
remote.pull(filename, content_hash=content_hash)
|
||||||
|
with loc_file.open(mode="r") as file_:
|
||||||
|
assert file_.read() == content
|
||||||
|
|
||||||
|
# retrieve with no hashes
|
||||||
|
loc_file.unlink()
|
||||||
|
remote.pull(filename)
|
||||||
|
with loc_file.open(mode="r") as file_:
|
||||||
|
assert file_.read() == content
|
||||||
|
|
||||||
|
|
||||||
|
def test_local_remote_storage_pull_missing():
|
||||||
|
# pulling from a non-existent remote pulls nothing gracefully
|
||||||
|
with make_tempdir() as d:
|
||||||
|
filename = "a.txt"
|
||||||
remote = RemoteStorage(d / "root", str(d / "remote"))
|
remote = RemoteStorage(d / "root", str(d / "remote"))
|
||||||
remote.push(filename, "aaaa", "bbbb")
|
assert remote.pull(filename, command_hash="aaaa") is None
|
||||||
|
assert remote.pull(filename) is None
|
||||||
# retrieve with full hashes
|
|
||||||
loc_file.unlink()
|
|
||||||
remote.pull(filename, command_hash="aaaa", content_hash="bbbb")
|
|
||||||
with loc_file.open(mode="r") as file_:
|
|
||||||
assert file_.read() == content
|
|
||||||
|
|
||||||
# retrieve with command hash
|
|
||||||
loc_file.unlink()
|
|
||||||
remote.pull(filename, command_hash="aaaa")
|
|
||||||
with loc_file.open(mode="r") as file_:
|
|
||||||
assert file_.read() == content
|
|
||||||
|
|
||||||
# retrieve with content hash
|
|
||||||
loc_file.unlink()
|
|
||||||
remote.pull(filename, content_hash="bbbb")
|
|
||||||
with loc_file.open(mode="r") as file_:
|
|
||||||
assert file_.read() == content
|
|
||||||
|
|
||||||
# retrieve with no hashes
|
|
||||||
loc_file.unlink()
|
|
||||||
remote.pull(filename)
|
|
||||||
with loc_file.open(mode="r") as file_:
|
|
||||||
assert file_.read() == content
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
|
|
Loading…
Reference in New Issue
Block a user