mirror of https://github.com/explosion/spaCy.git
synced 2025-08-07 13:44:55 +03:00

Demo: Switch from pathy to cloudpathlib

parent 40e1000db0
commit 4b64c1bbcc

@@ -10,7 +10,7 @@ wasabi>=0.9.1,<1.1.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
 typer>=0.3.0,<0.5.0
-pathy>=0.3.5
+cloudpathlib>=0.7.0,<0.11.0
 # Third party dependencies
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0

@@ -52,7 +52,7 @@ install_requires =
     catalogue>=2.0.6,<2.1.0
     # Third-party dependencies
     typer>=0.3.0,<0.5.0
-    pathy>=0.3.5
+    cloudpathlib>=0.7.0,<0.11.0
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0
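
The two dependency hunks above swap the pathy pin for cloudpathlib>=0.7.0,<0.11.0. As a rough sketch of what the new dependency provides (not part of the commit; the bucket name is made up): cloudpathlib.AnyPath dispatches on the value it is given, returning a plain pathlib.Path for local paths and a CloudPath subclass for cloud URLs.

# Sketch only, not from the commit.
from pathlib import Path
from cloudpathlib import AnyPath, CloudPath

local = AnyPath("training/model.tar.gz")              # plain pathlib.Path
remote = AnyPath("s3://example-bucket/model.tar.gz")  # S3Path; needs cloudpathlib[s3] (boto3) installed

assert isinstance(local, Path)
assert isinstance(remote, CloudPath)

This covers roughly what pathy.Pathy was used for, with one class hierarchy exposing the same pathlib-style interface over S3, GCS and Azure backends.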

@@ -23,7 +23,7 @@ from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
 from .. import about
 
 if TYPE_CHECKING:
-    from pathy import Pathy  # noqa: F401
+    from cloudpathlib import CloudPath  # noqa: F401
 
 
 SDIST_SUFFIX = ".tar.gz"

@@ -331,21 +331,25 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
         msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
 
 
-def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
+def upload_file(src: Path, dest: Union[str, Path, "CloudPath"]) -> None:
     """Upload a file.
 
     src (Path): The source path.
     url (str): The destination URL to upload to.
     """
-    import smart_open
+    # Create parent directories for local paths
+    if isinstance(dest, Path):
+        if not dest.parent.exists():
+            dest.parent.mkdir(parents=True)
 
-    dest = str(dest)
-    with smart_open.open(dest, mode="wb") as output_file:
+    with dest.open(mode="wb") as output_file:
         with src.open(mode="rb") as input_file:
             output_file.write(input_file.read())
 
 
-def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) -> None:
+def download_file(
+    src: Union[str, Path, "CloudPath"], dest: Path, *, force: bool = False
+) -> None:
     """Download a file using smart_open.
 
     url (str): The URL of the file.

@@ -353,22 +357,19 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
     force (bool): Whether to force download even if file exists.
         If False, the download will be skipped.
     """
-    import smart_open
-
     if dest.exists() and not force:
         return None
-    src = str(src)
-    with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
+    with src.open(mode="rb") as input_file:
         with dest.open(mode="wb") as output_file:
             shutil.copyfileobj(input_file, output_file)
 
 
 def ensure_pathy(path):
-    """Temporary helper to prevent importing Pathy globally (which can cause
+    """Temporary helper to prevent importing globally (which can cause
     slow and annoying Google Cloud warning)."""
-    from pathy import Pathy  # noqa: F811
+    from cloudpathlib import AnyPath  # noqa: F811
 
-    return Pathy(path)
+    return AnyPath(path)
 
 
 def git_checkout(
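
The hunks above rewrite the helpers in terms of the path objects themselves: upload_file() and download_file() no longer go through smart_open on a stringified URL but call .open() on the destination or source, and ensure_pathy() now returns cloudpathlib.AnyPath(path). A minimal sketch of the interface this relies on (not from the commit; the function name is made up): pathlib.Path and cloudpathlib.CloudPath expose the same pathlib-style API, so a single code path serves local and cloud destinations.

# Sketch only, not from the commit.
from pathlib import Path
from typing import Union
from cloudpathlib import CloudPath

def copy_file(src: Path, dest: Union[Path, CloudPath]) -> None:
    # Parent directories only need creating on a real filesystem;
    # object stores have nothing to create.
    if isinstance(dest, Path) and not dest.parent.exists():
        dest.parent.mkdir(parents=True)
    with dest.open(mode="wb") as output_file:
        with src.open(mode="rb") as input_file:
            output_file.write(input_file.read())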

@@ -1,18 +1,20 @@
-from typing import Optional, List, Dict, TYPE_CHECKING
+from typing import Optional, List, Dict, TYPE_CHECKING, Union
 import os
 import site
 import hashlib
 import urllib.parse
 import tarfile
+import warnings
 from pathlib import Path
 
-from .._util import get_hash, get_checksum, download_file, ensure_pathy
-from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var
+from .._util import get_hash, get_checksum, upload_file, download_file
+from .._util import ensure_pathy, make_tempdir
+from ...util import get_minor_version, ENV_VARS, check_bool_env_var
 from ...git_info import GIT_VERSION
 from ... import about
 
 if TYPE_CHECKING:
-    from pathy import Pathy  # noqa: F401
+    from cloudpathlib import CloudPath  # noqa: F401
 
 
 class RemoteStorage:

@@ -27,7 +29,7 @@ class RemoteStorage:
         self.url = ensure_pathy(url)
         self.compression = compression
 
-    def push(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
+    def push(self, path: Path, command_hash: str, content_hash: str) -> Union[Path, "CloudPath"]:
         """Compress a file or directory within a project and upload it to a remote
         storage. If an object exists at the full URL, nothing is done.
 

@@ -48,9 +50,7 @@ class RemoteStorage:
             mode_string = f"w:{self.compression}" if self.compression else "w"
             with tarfile.open(tar_loc, mode=mode_string) as tar_file:
                 tar_file.add(str(loc), arcname=str(path))
-            with tar_loc.open(mode="rb") as input_file:
-                with url.open(mode="wb") as output_file:
-                    output_file.write(input_file.read())
+            upload_file(tar_loc, url)
         return url
 
     def pull(

@@ -59,7 +59,7 @@ class RemoteStorage:
         *,
         command_hash: Optional[str] = None,
         content_hash: Optional[str] = None,
-    ) -> Optional["Pathy"]:
+    ) -> Optional[Union[Path, "CloudPath"]]:
         """Retrieve a file from the remote cache. If the file already exists,
         nothing is done.
 

@@ -93,7 +93,7 @@ class RemoteStorage:
         *,
         command_hash: Optional[str] = None,
         content_hash: Optional[str] = None,
-    ) -> Optional["Pathy"]:
+    ) -> Optional[Union[Path, "CloudPath"]]:
         """Find the best matching version of a file within the storage,
         or `None` if no match can be found. If both the creation and content hash
         are specified, only exact matches will be returned. Otherwise, the most
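
The signature changes above (push(), pull(), find(), and make_url() in the next hunk) all spell out Union[Path, "CloudPath"]. Not part of the commit, but a small type alias is a common way to keep such annotations short; the alias name below is hypothetical.

# Hypothetical alias, not from the commit.
from pathlib import Path
from typing import Union
from cloudpathlib import CloudPath

FluidPath = Union[Path, CloudPath]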

@@ -106,12 +106,22 @@ class RemoteStorage:
         elif command_hash is not None:
             urls = list((self.url / name / command_hash).iterdir())
         else:
-            urls = list((self.url / name).iterdir())
+            urls = []
+            for sub_dir in (self.url / name).iterdir():
+                urls.extend(sub_dir.iterdir())
             if content_hash is not None:
                 urls = [url for url in urls if url.parts[-1] == content_hash]
+        if len(urls) > 1:
+            try:
+                urls.sort(key=lambda x: x.stat().st_mtime)  # type: ignore
+            except Exception:
+                warnings.warn(
+                    "Unable to sort remote files by last modified. The file(s) "
+                    "pulled from the cache may not be the most recent."
+                )
         return urls[-1] if urls else None
 
-    def make_url(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
+    def make_url(self, path: Path, command_hash: str, content_hash: str) -> Union[Path, "CloudPath"]:
         """Construct a URL from a subpath, a creation hash and a content hash."""
         return self.url / self.encode_name(str(path)) / command_hash / content_hash
 
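
A note on the find() change above: make_url() lays the cache out as <base URL>/<encoded name>/<command hash>/<content hash>. The old branch listed <base URL>/<name> directly, which under pathlib-style semantics returns the command-hash directories rather than the stored archives, presumably why the rewritten branch walks one level deeper. When several candidates remain, it now prefers the most recently modified one and warns if the backend cannot report st_mtime. A standalone sketch of that traversal against a local cache (not from the commit; the function and variable names are illustrative):

# Sketch only, not from the commit.
from pathlib import Path
from typing import List, Optional

def find_latest(base: Path, name: str, content_hash: Optional[str] = None) -> Optional[Path]:
    urls: List[Path] = []
    for command_dir in (base / name).iterdir():  # one directory per command hash
        urls.extend(command_dir.iterdir())       # one entry per content hash
    if content_hash is not None:
        urls = [url for url in urls if url.parts[-1] == content_hash]
    if len(urls) > 1:
        urls.sort(key=lambda p: p.stat().st_mtime)  # prefer the newest candidate
    return urls[-1] if urls else None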

@@ -25,6 +25,7 @@ from spacy.cli.download import get_compatibility, get_version
 from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
 from spacy.cli.package import get_third_party_dependencies
 from spacy.cli.package import _is_permitted_package_name
+from spacy.cli.project.remote_storage import RemoteStorage
 from spacy.cli.validate import get_model_pkgs
 from spacy.lang.en import English
 from spacy.lang.nl import Dutch

@@ -855,3 +856,42 @@ def test_span_length_freq_dist_output_must_be_correct():
     span_freqs = _get_spans_length_freq_dist(sample_span_lengths, threshold)
     assert sum(span_freqs.values()) >= threshold
     assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]
+
+
+def test_local_remote_storage():
+    with make_tempdir() as d:
+        filename = "a.txt"
+        content = "a"
+
+        loc_file = d / "root" / filename
+        loc_file.parent.mkdir(parents=True)
+        with loc_file.open(mode="w") as file_:
+            file_.write(content)
+
+        # push to remote storage
+        remote = RemoteStorage(d / "root", str(d / "remote"))
+        remote.push(filename, "aaaa", "bbbb")
+
+        # retrieve with full hashes
+        loc_file.unlink()
+        remote.pull(filename, command_hash="aaaa", content_hash="bbbb")
+        with loc_file.open(mode="r") as file_:
+            assert file_.read() == content
+
+        # retrieve with command hash
+        loc_file.unlink()
+        remote.pull(filename, command_hash="aaaa")
+        with loc_file.open(mode="r") as file_:
+            assert file_.read() == content
+
+        # retrieve with content hash
+        loc_file.unlink()
+        remote.pull(filename, content_hash="bbbb")
+        with loc_file.open(mode="r") as file_:
+            assert file_.read() == content
+
+        # retrieve with no hashes
+        loc_file.unlink()
+        remote.pull(filename)
+        with loc_file.open(mode="r") as file_:
+            assert file_.read() == content