Merge branch 'explosion:master' into feature/extend_fo_tokenizer_exceptions

This commit is contained in:
Lise 2024-03-05 15:01:22 +01:00 committed by GitHub
commit a92540b9a3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 54 additions and 8 deletions

View File

@ -1,5 +1,5 @@
# fmt: off # fmt: off
__title__ = "spacy" __title__ = "spacy"
__version__ = "3.7.3" __version__ = "3.7.4"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download" __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@ -1,5 +1,7 @@
from wasabi import msg from wasabi import msg
# Needed for testing
from . import download as download_module # noqa: F401
from ._util import app, setup_cli # noqa: F401 from ._util import app, setup_cli # noqa: F401
from .apply import apply # noqa: F401 from .apply import apply # noqa: F401
from .assemble import assemble_cli # noqa: F401 from .assemble import assemble_cli # noqa: F401

View File

@ -1,5 +1,6 @@
import sys import sys
from typing import Optional, Sequence from typing import Optional, Sequence
from urllib.parse import urljoin
import requests import requests
import typer import typer
@ -63,6 +64,13 @@ def download(
) )
pip_args = pip_args + ("--no-deps",) pip_args = pip_args + ("--no-deps",)
if direct: if direct:
# Reject model names with '/', in order to prevent shenanigans.
if "/" in model:
msg.fail(
title="Model download rejected",
text=f"Cannot download model '{model}'. Models are expected to be file names, not URLs or fragments",
exits=True,
)
components = model.split("-") components = model.split("-")
model_name = "".join(components[:-1]) model_name = "".join(components[:-1])
version = components[-1] version = components[-1]
@ -153,7 +161,16 @@ def get_latest_version(model: str) -> str:
def download_model( def download_model(
filename: str, user_pip_args: Optional[Sequence[str]] = None filename: str, user_pip_args: Optional[Sequence[str]] = None
) -> None: ) -> None:
download_url = about.__download_url__ + "/" + filename # Construct the download URL carefully. We need to make sure we don't
# allow relative paths or other shenanigans to trick us into downloading
# from outside our own repo.
base_url = about.__download_url__
# urljoin requires that the path ends with /, or the last path part will be dropped
if not base_url.endswith("/"):
base_url = about.__download_url__ + "/"
download_url = urljoin(base_url, filename)
if not download_url.startswith(about.__download_url__):
raise ValueError(f"Download from {filename} rejected. Was it a relative path?")
pip_args = list(user_pip_args) if user_pip_args is not None else [] pip_args = list(user_pip_args) if user_pip_args is not None else []
cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url] cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
run_command(cmd) run_command(cmd)

View File

@ -220,6 +220,7 @@ class Warnings(metaclass=ErrorsWithCodes):
"key attribute for vectors, configure it through Vectors(attr=) or " "key attribute for vectors, configure it through Vectors(attr=) or "
"'spacy init vectors --attr'") "'spacy init vectors --attr'")
W126 = ("These keys are unsupported: {unsupported}") W126 = ("These keys are unsupported: {unsupported}")
W127 = ("Not all `Language.pipe` worker processes completed successfully")
class Errors(metaclass=ErrorsWithCodes): class Errors(metaclass=ErrorsWithCodes):

View File

@ -1730,6 +1730,9 @@ class Language:
for proc in procs: for proc in procs:
proc.join() proc.join()
if not all(proc.exitcode == 0 for proc in procs):
warnings.warn(Warnings.W127)
def _link_components(self) -> None: def _link_components(self) -> None:
"""Register 'listeners' within pipeline components, to allow them to """Register 'listeners' within pipeline components, to allow them to
effectively share weights. effectively share weights.
@ -2350,6 +2353,7 @@ def _apply_pipes(
if isinstance(texts_with_ctx, _WorkDoneSentinel): if isinstance(texts_with_ctx, _WorkDoneSentinel):
sender.close() sender.close()
receiver.close() receiver.close()
return
docs = ( docs = (
ensure_doc(doc_like, context) for doc_like, context in texts_with_ctx ensure_doc(doc_like, context) for doc_like, context in texts_with_ctx
@ -2375,6 +2379,7 @@ def _apply_pipes(
# stop processing. # stop processing.
sender.close() sender.close()
receiver.close() receiver.close()
return
class _Sender: class _Sender:

View File

@ -12,7 +12,7 @@ from thinc.api import Config
import spacy import spacy
from spacy import about from spacy import about
from spacy.cli import info from spacy.cli import download_module, info
from spacy.cli._util import parse_config_overrides, string_to_list, walk_directory from spacy.cli._util import parse_config_overrides, string_to_list, walk_directory
from spacy.cli.apply import apply from spacy.cli.apply import apply
from spacy.cli.debug_data import ( from spacy.cli.debug_data import (
@ -1066,3 +1066,15 @@ def test_debug_data_trainable_lemmatizer_not_annotated():
def test_project_api_imports(): def test_project_api_imports():
from spacy.cli import project_run from spacy.cli import project_run
from spacy.cli.project.run import project_run # noqa: F401, F811 from spacy.cli.project.run import project_run # noqa: F401, F811
def test_download_rejects_relative_urls(monkeypatch):
    """Ensure `spacy download` refuses a model name that contains a relative
    path, so it cannot be used to fetch an arbitrary package."""
    valid_name = "en_core_web_sm-3.7.1"
    relative_name = "../en_core_web_sm-3.7.1"

    # Replace the command runner with a no-op so nothing is actually installed.
    monkeypatch.setattr(download_module, "run_command", lambda cmd: None)

    # A plain model file name must still be accepted.
    download_module.download(valid_name, direct=True)

    # A name containing a path separator must make the CLI exit.
    with pytest.raises(SystemExit):
        download_module.download(relative_name, direct=True)

View File

@ -1,5 +1,6 @@
import itertools import itertools
import logging import logging
import warnings
from unittest import mock from unittest import mock
import pytest import pytest
@ -738,9 +739,13 @@ def test_pass_doc_to_pipeline(nlp, n_process):
assert doc.text == texts[0] assert doc.text == texts[0]
assert len(doc.cats) > 0 assert len(doc.cats) > 0
if isinstance(get_current_ops(), NumpyOps) or n_process < 2: if isinstance(get_current_ops(), NumpyOps) or n_process < 2:
docs = nlp.pipe(docs, n_process=n_process) # Catch warnings to ensure that all worker processes exited
assert [doc.text for doc in docs] == texts # successfully.
assert all(len(doc.cats) for doc in docs) with warnings.catch_warnings():
warnings.simplefilter("error")
docs = nlp.pipe(docs, n_process=n_process)
assert [doc.text for doc in docs] == texts
assert all(len(doc.cats) for doc in docs)
def test_invalid_arg_to_pipeline(nlp): def test_invalid_arg_to_pipeline(nlp):

View File

@ -526,13 +526,17 @@ application's `requirements.txt`. If you're running your own internal PyPi
installation, you can upload the pipeline packages there. pip's installation, you can upload the pipeline packages there. pip's
[requirements file format](https://pip.pypa.io/en/latest/reference/requirements-file-format/) [requirements file format](https://pip.pypa.io/en/latest/reference/requirements-file-format/)
supports both package names to download via a PyPi server, as well as supports both package names to download via a PyPi server, as well as
[direct URLs](#pipeline-urls). [direct URLs](#pipeline-urls). For instance, you can specify the
`en_core_web_sm` model for spaCy 3.7.x as follows:
```text {title="requirements.txt"} ```text {title="requirements.txt"}
spacy>=3.0.0,<4.0.0 spacy>=3.0.0,<4.0.0
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
``` ```
See the [list of models](https://spacy.io/models) for model download links for
the current spaCy version.
All pipeline packages are versioned and specify their spaCy dependency. This All pipeline packages are versioned and specify their spaCy dependency. This
ensures cross-compatibility and lets you specify exact version requirements for ensures cross-compatibility and lets you specify exact version requirements for
each pipeline. If you've [trained](/usage/training) your own pipeline, you can each pipeline. If you've [trained](/usage/training) your own pipeline, you can