diff --git a/requirements.txt b/requirements.txt
index d91a3b3d4..1fe92c26e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,7 +10,8 @@ wasabi>=0.9.1,<1.1.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
 typer>=0.3.0,<0.8.0
-pathy>=0.3.5
+pathy>=0.6.0
+smart-open>=5.2.1,<7.0.0
 # Third party dependencies
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
diff --git a/setup.cfg b/setup.cfg
index 82d4d2758..fba923a06 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -52,7 +52,8 @@ install_requires =
     catalogue>=2.0.6,<2.1.0
     # Third-party dependencies
     typer>=0.3.0,<0.8.0
-    pathy>=0.3.5
+    pathy>=0.6.0
+    smart-open>=5.2.1,<7.0.0
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 897964a88..62aa56525 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -23,7 +23,7 @@ from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
 from .. import about
 
 if TYPE_CHECKING:
-    from pathy import Pathy  # noqa: F401
+    from pathy import FluidPath  # noqa: F401
 
 
 SDIST_SUFFIX = ".tar.gz"
@@ -331,7 +331,33 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
             msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
 
 
-def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
+def _file_exists(src: Union[str, "FluidPath"]) -> bool:
+    """Best-effort check for whether a (possibly remote) file exists.
+
+    src (str / FluidPath): The path or URL to check.
+    RETURNS (bool): True if the file was found or could be opened for reading.
+    """
+    # First ask the path object itself; any error falls through to the probe
+    try:
+        from pathy import Pathy
+
+        if isinstance(src, (Pathy, Path)) and src.exists():
+            return True
+    except Exception:
+        pass
+
+    # Otherwise the file counts as existing if smart_open can open it
+    try:
+        import smart_open
+
+        with smart_open.open(src, mode="rb", compression="disable"):
+            pass
+    except Exception:
+        return False
+    return True
+
+
+def upload_file(src: Path, dest: Union[str, "FluidPath"]) -> None:
     """Upload a file.
 
     src (Path): The source path.
@@ -339,13 +365,20 @@ def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
     """
     import smart_open
 
+    # Create parent directories for local paths (exist_ok avoids a mkdir race)
+    if isinstance(dest, Path):
+        dest.parent.mkdir(parents=True, exist_ok=True)
+
     dest = str(dest)
     with smart_open.open(dest, mode="wb") as output_file:
         with src.open(mode="rb") as input_file:
-            output_file.write(input_file.read())
+            # Stream in chunks so large archives are never fully held in memory
+            shutil.copyfileobj(input_file, output_file)
 
 
-def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) -> None:
+def download_file(
+    src: Union[str, "FluidPath"], dest: Path, *, force: bool = False
+) -> None:
     """Download a file using smart_open.
 
     url (str): The URL of the file.
@@ -358,7 +391,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
     if dest.exists() and not force:
         return None
     src = str(src)
-    with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
+    with smart_open.open(src, mode="rb", compression="disable") as input_file:
         with dest.open(mode="wb") as output_file:
             shutil.copyfileobj(input_file, output_file)
 
@@ -368,7 +401,7 @@ def ensure_pathy(path):
     slow and annoying Google Cloud warning)."""
     from pathy import Pathy  # noqa: F811
 
-    return Pathy(path)
+    return Pathy.fluid(path)
 
 
 def git_checkout(
diff --git a/spacy/cli/project/remote_storage.py b/spacy/cli/project/remote_storage.py
index 336a4bcb3..745380ada 100644
--- a/spacy/cli/project/remote_storage.py
+++ b/spacy/cli/project/remote_storage.py
@@ -6,13 +6,14 @@ import urllib.parse
 import tarfile
 from pathlib import Path
 
-from .._util import get_hash, get_checksum, download_file, ensure_pathy
-from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var
+from .._util import get_hash, get_checksum, upload_file, download_file
+from .._util import ensure_pathy, make_tempdir, _file_exists
+from ...util import get_minor_version, ENV_VARS, check_bool_env_var
 from ...git_info import GIT_VERSION
 from ...
import about if TYPE_CHECKING: - from pathy import Pathy # noqa: F401 + from pathy import FluidPath # noqa: F401 class RemoteStorage: @@ -27,7 +28,7 @@ class RemoteStorage: self.url = ensure_pathy(url) self.compression = compression - def push(self, path: Path, command_hash: str, content_hash: str) -> "Pathy": + def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath": """Compress a file or directory within a project and upload it to a remote storage. If an object exists at the full URL, nothing is done. @@ -40,7 +41,7 @@ class RemoteStorage: if not loc.exists(): raise IOError(f"Cannot push {loc}: does not exist.") url = self.make_url(path, command_hash, content_hash) - if url.exists(): + if _file_exists(url): return url tmp: Path with make_tempdir() as tmp: @@ -48,9 +49,7 @@ class RemoteStorage: mode_string = f"w:{self.compression}" if self.compression else "w" with tarfile.open(tar_loc, mode=mode_string) as tar_file: tar_file.add(str(loc), arcname=str(path)) - with tar_loc.open(mode="rb") as input_file: - with url.open(mode="wb") as output_file: - output_file.write(input_file.read()) + upload_file(tar_loc, url) return url def pull( @@ -59,7 +58,7 @@ class RemoteStorage: *, command_hash: Optional[str] = None, content_hash: Optional[str] = None, - ) -> Optional["Pathy"]: + ) -> Optional["FluidPath"]: """Retrieve a file from the remote cache. If the file already exists, nothing is done. @@ -93,7 +92,7 @@ class RemoteStorage: *, command_hash: Optional[str] = None, content_hash: Optional[str] = None, - ) -> Optional["Pathy"]: + ) -> Optional["FluidPath"]: """Find the best matching version of a file within the storage, or `None` if no match can be found. If both the creation and content hash are specified, only exact matches will be returned. 
Otherwise, the most @@ -102,16 +101,19 @@ class RemoteStorage: name = self.encode_name(str(path)) if command_hash is not None and content_hash is not None: url = self.make_url(path, command_hash, content_hash) - urls = [url] if url.exists() else [] + urls = [url] if _file_exists(url) else [] elif command_hash is not None: urls = list((self.url / name / command_hash).iterdir()) else: - urls = list((self.url / name).iterdir()) + urls = [] + for command_hash_dir in (self.url / name).iterdir(): + urls.extend(command_hash_dir.iterdir()) if content_hash is not None: urls = [url for url in urls if url.parts[-1] == content_hash] + # TODO: URLs should be sorted by last modified return urls[-1] if urls else None - def make_url(self, path: Path, command_hash: str, content_hash: str) -> "Pathy": + def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath": """Construct a URL from a subpath, a creation hash and a content hash.""" return self.url / self.encode_name(str(path)) / command_hash / content_hash diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 838e00369..3aa4b23fc 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -25,6 +25,7 @@ from spacy.cli.download import get_compatibility, get_version from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config from spacy.cli.package import get_third_party_dependencies from spacy.cli.package import _is_permitted_package_name +from spacy.cli.project.remote_storage import RemoteStorage from spacy.cli.validate import get_model_pkgs from spacy.lang.en import English from spacy.lang.nl import Dutch @@ -855,3 +856,42 @@ def test_span_length_freq_dist_output_must_be_correct(): span_freqs = _get_spans_length_freq_dist(sample_span_lengths, threshold) assert sum(span_freqs.values()) >= threshold assert list(span_freqs.keys()) == [3, 1, 4, 5, 2] + + +def test_local_remote_storage(): + with make_tempdir() as d: + filename = "a.txt" + content = "a" + + loc_file = d 
/ "root" / filename + loc_file.parent.mkdir(parents=True) + with loc_file.open(mode="w") as file_: + file_.write(content) + + # push to remote storage + remote = RemoteStorage(d / "root", str(d / "remote")) + remote.push(filename, "aaaa", "bbbb") + + # retrieve with full hashes + loc_file.unlink() + remote.pull(filename, command_hash="aaaa", content_hash="bbbb") + with loc_file.open(mode="r") as file_: + assert file_.read() == content + + # retrieve with command hash + loc_file.unlink() + remote.pull(filename, command_hash="aaaa") + with loc_file.open(mode="r") as file_: + assert file_.read() == content + + # retrieve with content hash + loc_file.unlink() + remote.pull(filename, content_hash="bbbb") + with loc_file.open(mode="r") as file_: + assert file_.read() == content + + # retrieve with no hashes + loc_file.unlink() + remote.pull(filename) + with loc_file.open(mode="r") as file_: + assert file_.read() == content diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index fc2c46022..882cfb602 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -474,8 +474,7 @@ report span characteristics such as the average span length and the span (or span boundary) distinctiveness. The distinctiveness measure shows how different the tokens are with respect to the rest of the corpus using the KL-divergence of the token distributions. To learn more, you can check out Papay et al.'s work on -[*Dissecting Span Identification Tasks with Performance Prediction* (EMNLP -2020)](https://aclanthology.org/2020.emnlp-main.396/). +[_Dissecting Span Identification Tasks with Performance Prediction_ (EMNLP 2020)](https://aclanthology.org/2020.emnlp-main.396/). @@ -1352,12 +1351,13 @@ If the contents are different, the new version of the file is uploaded. Deleting obsolete files is left up to you. Remotes can be defined in the `remotes` section of the -[`project.yml`](/usage/projects#project-yml). 
Under the hood, spaCy uses the -[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to -communicate with the remote storages, so you can use any protocol that -`smart-open` supports, including [S3](https://aws.amazon.com/s3/), -[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although -you may need to install extra dependencies to use certain protocols. +[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses +[`Pathy`](https://github.com/justindujardin/pathy) to communicate with the +remote storages, so you can use any protocol that `Pathy` supports, including +[S3](https://aws.amazon.com/s3/), +[Google Cloud Storage](https://cloud.google.com/storage), and the local +filesystem, although you may need to install extra dependencies to use certain +protocols. ```cli $ python -m spacy project push [remote] [project_dir] @@ -1396,12 +1396,13 @@ outputs, so if you change the config back, you'll be able to fetch back the result. Remotes can be defined in the `remotes` section of the -[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses the -[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to -communicate with the remote storages, so you can use any protocol that -`smart-open` supports, including [S3](https://aws.amazon.com/s3/), -[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although -you may need to install extra dependencies to use certain protocols. +[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses +[`Pathy`](https://github.com/justindujardin/pathy) to communicate with the +remote storages, so you can use any protocol that `Pathy` supports, including +[S3](https://aws.amazon.com/s3/), +[Google Cloud Storage](https://cloud.google.com/storage), and the local +filesystem, although you may need to install extra dependencies to use certain +protocols. 
```cli $ python -m spacy project pull [remote] [project_dir] diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 90b612358..bf005abef 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -259,9 +259,9 @@ pipelines. > This can be used in a project command like so: > > ```yaml -> - name: "echo-path" -> script: -> - "echo ${env.ENV_PATH}" +> - name: 'echo-path' +> script: +> - 'echo ${env.ENV_PATH}' > ``` | Section | Description | @@ -643,12 +643,13 @@ locally. You can list one or more remotes in the `remotes` section of your [`project.yml`](#project-yml) by mapping a string name to the URL of the -storage. Under the hood, spaCy uses the -[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to -communicate with the remote storages, so you can use any protocol that -`smart-open` supports, including [S3](https://aws.amazon.com/s3/), -[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although -you may need to install extra dependencies to use certain protocols. +storage. Under the hood, spaCy uses +[`Pathy`](https://github.com/justindujardin/pathy) to communicate with the +remote storages, so you can use any protocol that `Pathy` supports, including +[S3](https://aws.amazon.com/s3/), +[Google Cloud Storage](https://cloud.google.com/storage), and the local +filesystem, although you may need to install extra dependencies to use certain +protocols. > #### Example > @@ -661,7 +662,6 @@ you may need to install extra dependencies to use certain protocols. remotes: default: 's3://my-spacy-bucket' local: '/mnt/scratch/cache' - stuff: 'ssh://myserver.example.com/whatever' ```