From 14bd9d89a3fea6a36bd0fe651ef43035f0a90d88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Sun, 11 Feb 2024 19:46:43 +0100 Subject: [PATCH 1/4] Update example that shows model in requirments (#13302) See #13293. --- website/docs/usage/models.mdx | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/website/docs/usage/models.mdx b/website/docs/usage/models.mdx index 3b8a5fa3f..7fed9f407 100644 --- a/website/docs/usage/models.mdx +++ b/website/docs/usage/models.mdx @@ -526,13 +526,17 @@ application's `requirements.txt`. If you're running your own internal PyPi installation, you can upload the pipeline packages there. pip's [requirements file format](https://pip.pypa.io/en/latest/reference/requirements-file-format/) supports both package names to download via a PyPi server, as well as -[direct URLs](#pipeline-urls). +[direct URLs](#pipeline-urls). For instance, you can specify the +`en_core_web_sm` model for spaCy 3.7.x as follows: ```text {title="requirements.txt"} spacy>=3.0.0,<4.0.0 -en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl +en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl ``` +See the [list of models](https://spacy.io/models) for model download links for +the current spaCy version. + All pipeline packages are versioned and specify their spaCy dependency. This ensures cross-compatibility and lets you specify exact version requirements for each pipeline. If you've [trained](/usage/training) your own pipeline, you can From fdfdbcd9f40c73eefe106f9ebf26767809d69a83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Mon, 12 Feb 2024 14:39:38 +0100 Subject: [PATCH 2/4] Make `Language.pipe` workers exit cleanly (#13321) Also warn when any worker exited with a non-zero exit code and modify test to ensure that workers exit cleanly by default. --- spacy/errors.py | 1 + spacy/language.py | 5 +++++ spacy/tests/test_language.py | 11 ++++++++--- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index b6108dd0f..cf9a7b708 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -220,6 +220,7 @@ class Warnings(metaclass=ErrorsWithCodes): "key attribute for vectors, configure it through Vectors(attr=) or " "'spacy init vectors --attr'") W126 = ("These keys are unsupported: {unsupported}") + W127 = ("Not all `Language.pipe` worker processes completed successfully") class Errors(metaclass=ErrorsWithCodes): diff --git a/spacy/language.py b/spacy/language.py index 568d2d4fa..18d20c939 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1730,6 +1730,9 @@ class Language: for proc in procs: proc.join() + if not all(proc.exitcode == 0 for proc in procs): + warnings.warn(Warnings.W127) + def _link_components(self) -> None: """Register 'listeners' within pipeline components, to allow them to effectively share weights. @@ -2350,6 +2353,7 @@ def _apply_pipes( if isinstance(texts_with_ctx, _WorkDoneSentinel): sender.close() receiver.close() + return docs = ( ensure_doc(doc_like, context) for doc_like, context in texts_with_ctx @@ -2375,6 +2379,7 @@ def _apply_pipes( # stop processing. sender.close() receiver.close() + return class _Sender: diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 51eec3239..d229739e1 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -1,5 +1,6 @@ import itertools import logging +import warnings from unittest import mock import pytest @@ -738,9 +739,13 @@ def test_pass_doc_to_pipeline(nlp, n_process): assert doc.text == texts[0] assert len(doc.cats) > 0 if isinstance(get_current_ops(), NumpyOps) or n_process < 2: - docs = nlp.pipe(docs, n_process=n_process) - assert [doc.text for doc in docs] == texts - assert all(len(doc.cats) for doc in docs) + # Catch warnings to ensure that all worker processes exited + # succesfully. + with warnings.catch_warnings(): + warnings.simplefilter("error") + docs = nlp.pipe(docs, n_process=n_process) + assert [doc.text for doc in docs] == texts + assert all(len(doc.cats) for doc in docs) def test_invalid_arg_to_pipeline(nlp): From bff8725f4b4b93033bdeba6ad306e7ea79f7a402 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 14 Feb 2024 14:46:28 +0100 Subject: [PATCH 3/4] Set version to 3.7.4 (#13327) --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 239527aff..f5ee66dae 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,5 +1,5 @@ # fmt: off __title__ = "spacy" -__version__ = "3.7.3" +__version__ = "3.7.4" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 0518c36f04864a588905394b2aeefd078a87784a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 20 Feb 2024 13:17:51 +0100 Subject: [PATCH 4/4] Sanitize direct download (#13313) The 'direct' option in 'spacy download' is supposed to only download from our model releases repository. However, users were able to pass in a relative path, allowing download from arbitrary repositories. This meant that a service that sourced strings from user input and which used the direct option would allow users to install arbitrary packages. --- spacy/cli/__init__.py | 2 ++ spacy/cli/download.py | 19 ++++++++++++++++++- spacy/tests/test_cli.py | 14 +++++++++++++- 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 1d402ff0c..3095778fe 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -1,5 +1,7 @@ from wasabi import msg +# Needed for testing +from . import download as download_module # noqa: F401 from ._util import app, setup_cli # noqa: F401 from .apply import apply # noqa: F401 from .assemble import assemble_cli # noqa: F401 diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 21c777f81..4261fb830 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -1,5 +1,6 @@ import sys from typing import Optional, Sequence +from urllib.parse import urljoin import requests import typer @@ -63,6 +64,13 @@ def download( ) pip_args = pip_args + ("--no-deps",) if direct: + # Reject model names with '/', in order to prevent shenanigans. + if "/" in model: + msg.fail( + title="Model download rejected", + text=f"Cannot download model '{model}'. Models are expected to be file names, not URLs or fragments", + exits=True, + ) components = model.split("-") model_name = "".join(components[:-1]) version = components[-1] @@ -153,7 +161,16 @@ def get_latest_version(model: str) -> str: def download_model( filename: str, user_pip_args: Optional[Sequence[str]] = None ) -> None: - download_url = about.__download_url__ + "/" + filename + # Construct the download URL carefully. We need to make sure we don't + # allow relative paths or other shenanigans to trick us into download + # from outside our own repo. + base_url = about.__download_url__ + # urljoin requires that the path ends with /, or the last path part will be dropped + if not base_url.endswith("/"): + base_url = about.__download_url__ + "/" + download_url = urljoin(base_url, filename) + if not download_url.startswith(about.__download_url__): + raise ValueError(f"Download from {filename} rejected. Was it a relative path?") pip_args = list(user_pip_args) if user_pip_args is not None else [] cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url] run_command(cmd) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index ff53ed1e1..7b729d78f 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -12,7 +12,7 @@ from thinc.api import Config import spacy from spacy import about -from spacy.cli import info +from spacy.cli import download_module, info from spacy.cli._util import parse_config_overrides, string_to_list, walk_directory from spacy.cli.apply import apply from spacy.cli.debug_data import ( @@ -1066,3 +1066,15 @@ def test_debug_data_trainable_lemmatizer_not_annotated(): def test_project_api_imports(): from spacy.cli import project_run from spacy.cli.project.run import project_run # noqa: F401, F811 + + +def test_download_rejects_relative_urls(monkeypatch): + """Test that we can't tell spacy download to get an arbitrary model by using a + relative path in the filename""" + + monkeypatch.setattr(download_module, "run_command", lambda cmd: None) + + # Check that normal download works + download_module.download("en_core_web_sm-3.7.1", direct=True) + with pytest.raises(SystemExit): + download_module.download("../en_core_web_sm-3.7.1", direct=True)