Merge pull request #11965 from adrianeboyd/backport/v3.0.9

Backport bug fixes to v3.0.x
Adriane Boyd 2022-12-15 08:04:20 +01:00 committed by GitHub
commit a65379dede
24 changed files with 130 additions and 87 deletions

View File

@@ -1,9 +1,6 @@
parameters:
python_version: ''
architecture: ''
prefix: ''
gpu: false
num_build_jobs: 1
architecture: 'x64'
steps:
- task: UsePythonVersion@0
@@ -16,52 +13,76 @@ steps:
displayName: 'Set variables'
- script: |
${{ parameters.prefix }} python -m pip install -U pip setuptools
${{ parameters.prefix }} python -m pip install -U -r requirements.txt
python -m pip install -U build pip setuptools
python -m pip install -U -r requirements.txt
displayName: "Install dependencies"
- script: |
${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
${{ parameters.prefix }} python setup.py sdist --formats=gztar
displayName: "Compile and build sdist"
python -m build --sdist
displayName: "Build sdist"
- task: DeleteFiles@1
inputs:
contents: "spacy"
displayName: "Delete source directory"
- task: DeleteFiles@1
inputs:
contents: "*.egg-info"
displayName: "Delete egg-info directory"
- script: |
${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
python -m pip freeze > installed.txt
python -m pip uninstall -y -r installed.txt
displayName: "Uninstall all packages"
- bash: |
${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
${{ parameters.prefix }} python -m pip install dist/$SDIST
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
python -m pip install dist/$SDIST
displayName: "Install from sdist"
- script: |
${{ parameters.prefix }} python -m pip install -U -r requirements.txt
python -W error -c "import spacy"
displayName: "Test import"
- script: |
python -m spacy download es_core_news_sm
python -c "import spacy; nlp=spacy.load('es_core_news_sm'); doc=nlp('test')"
displayName: 'Test download CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
displayName: 'Test convert CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -m spacy init config -p ner -l es ner.cfg
python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
displayName: 'Test debug config CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
# will have errors due to sparse data, check for summary in output
python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
displayName: 'Test debug data CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
displayName: 'Test train CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'es_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
displayName: 'Test assemble CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -m pip install -U -r requirements.txt
displayName: "Install test requirements"
- script: |
${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
displayName: "Install GPU requirements"
condition: eq(${{ parameters.gpu }}, true)
- script: |
${{ parameters.prefix }} python -m pytest --pyargs spacy
python -m pytest --pyargs spacy -W error
displayName: "Run CPU tests"
condition: eq(${{ parameters.gpu }}, false)
- script: |
${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu
displayName: "Run GPU tests"
condition: eq(${{ parameters.gpu }}, true)
- script: |
python -m spacy download en_core_web_sm
python -c "import spacy; nlp=spacy.load('en_core_web_sm'); doc=nlp('test')"
displayName: 'Test download CLI'
condition: eq(variables['python_version'], '3.8')
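
Note: the updated steps drop the `${{ parameters.prefix }}` indirection and switch from `setup.py sdist` to the PEP 517 front end (`python -m build --sdist`), then delete the source tree so the tests run against the installed sdist rather than the checkout. A minimal sketch of the same check the "Test import" step runs, assuming spaCy is installed in the current environment:

import subprocess
import sys

# Mirror `python -W error -c "import spacy"`: escalate all warnings to
# errors so a noisy import fails the check.
subprocess.run(
    [sys.executable, "-W", "error", "-c", "import spacy"],
    check=True,  # raises CalledProcessError if the import warns or fails
)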

View File

@@ -28,7 +28,7 @@ jobs:
inputs:
versionSpec: "3.7"
- script: |
pip install flake8==3.5.0
pip install flake8==5.0.4
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
displayName: "flake8"
@@ -38,7 +38,7 @@ jobs:
matrix:
# We're only running one platform per Python version to speed up builds
Python36Linux:
imageName: "ubuntu-latest"
imageName: "ubuntu-20.04"
python.version: "3.6"
# Python36Windows:
# imageName: "windows-latest"
@@ -47,7 +47,7 @@ jobs:
# imageName: "macos-latest"
# python.version: "3.6"
# Python37Linux:
# imageName: "ubuntu-latest"
# imageName: "ubuntu-20.04"
# python.version: "3.7"
Python37Windows:
imageName: "windows-latest"
@@ -89,20 +89,3 @@ jobs:
- template: .github/azure-steps.yml
parameters:
python_version: '$(python.version)'
architecture: 'x64'
# - job: "TestGPU"
# dependsOn: "Validate"
# strategy:
# matrix:
# Python38LinuxX64_GPU:
# python.version: '3.8'
# pool:
# name: "LinuxX64_GPU"
# steps:
# - template: .github/azure-steps.yml
# parameters:
# python_version: '$(python.version)'
# architecture: 'x64'
# gpu: true
# num_build_jobs: 24

View File

@@ -11,6 +11,7 @@ srsly>=2.4.1,<3.0.0
catalogue>=2.0.4,<2.1.0
typer>=0.3.0,<0.4.0
pathy>=0.3.5
smart-open>=5.2.1,<7.0.0
# Third party dependencies
numpy>=1.15.0
requests>=2.13.0,<3.0.0
@@ -26,6 +27,6 @@ cython>=0.25,<3.0
pytest>=5.2.0
pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.5.0,<3.6.0
flake8>=3.8.0,<6.0.0
hypothesis>=3.27.0,<7.0.0
mypy==0.910

View File

@@ -49,9 +49,10 @@ install_requires =
wasabi>=0.8.1,<1.1.0
srsly>=2.4.1,<3.0.0
catalogue>=2.0.4,<2.1.0
# Third-party dependencies
typer>=0.3.0,<0.4.0
pathy>=0.3.5
# Third-party dependencies
smart-open>=5.2.1,<7.0.0
tqdm>=4.38.0,<5.0.0
numpy>=1.15.0
requests>=2.13.0,<3.0.0

View File

@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
__version__ = "3.0.8"
__version__ = "3.0.9"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"

View File

@@ -355,7 +355,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
if dest.exists() and not force:
return None
src = str(src)
with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
with smart_open.open(src, mode="rb", compression="disable") as input_file:
with dest.open(mode="wb") as output_file:
output_file.write(input_file.read())
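
Note: smart_open 6.x removed the `ignore_ext` keyword in favor of `compression`, and `compression="disable"` reproduces the old behavior of streaming the raw bytes without transparent decompression. The dependency pin added in this PR (`smart-open>=5.2.1,<7.0.0`) ensures the new keyword is available. A minimal sketch, with an illustrative URL:

import smart_open

# Read the file as-is; without compression="disable", smart_open would
# transparently decompress based on the extension (e.g. .gz).
with smart_open.open("https://example.com/vectors.txt.gz", mode="rb",
                     compression="disable") as f:
    raw_bytes = f.read()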

View File

@@ -1,7 +1,7 @@
{# This is a template for training configs used for the quickstart widget in
the docs and the init config command. It encodes various best practices and
can help generate the best possible configuration, given a user's requirements. #}
{%- set use_transformer = hardware != "cpu" -%}
{%- set use_transformer = hardware != "cpu" and transformer_data -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
[paths]
train = null
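
Note: the extra `and transformer_data` guard makes the template fall back to a CPU pipeline when no transformer recommendations exist for the requested language, instead of failing on the `transformer_data[optimize]` lookup. A Python equivalent of the Jinja logic, with illustrative values:

hardware = "gpu"
transformer_data = {}  # no transformer recommendations available
optimize = "efficiency"

# An empty mapping is falsy, so this now degrades gracefully to CPU.
use_transformer = hardware != "cpu" and bool(transformer_data)
transformer = transformer_data[optimize] if use_transformer else {}
assert not use_transformer and transformer == {}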

View File

@@ -320,6 +320,11 @@ class Errors:
"clear the existing vectors and resize the table.")
E074 = ("Error interpreting compiled match pattern: patterns are expected "
"to end with the attribute {attr}. Got: {bad_attr}.")
E079 = ("Error computing states in beam: number of predicted beams "
"({pbeams}) does not equal number of gold beams ({gbeams}).")
E080 = ("Duplicate state found in beam: {key}.")
E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
"does not equal number of losses ({losses}).")
E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
"projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
"match.")

View File

@@ -6,6 +6,7 @@ from thinc.api import Model, Maxout, Linear
from ...util import registry
from ...kb import KnowledgeBase, Candidate, get_candidates
from ...vocab import Vocab
from ...tokens import Span
@registry.architectures("spacy.EntityLinker.v1")
@@ -44,5 +45,5 @@ def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
@registry.misc("spacy.CandidateGenerator.v1")
def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
return get_candidates
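
Note: this file and the ones below replace quoted annotations like "Span" with the imported classes. A quoted name in a type hint is only a forward reference that tools must resolve later; once the class is imported (under `TYPE_CHECKING` where needed to avoid circular imports), the annotation can name it directly. A small sketch of the fixed form:

from typing import Callable, Iterable
from spacy.kb import Candidate, KnowledgeBase
from spacy.tokens import Span

# With Span imported, no string forward reference is needed and static
# checkers can resolve the callable signature without extra context.
CandidateGenerator = Callable[[KnowledgeBase, Span], Iterable[Candidate]]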

View File

@@ -9,7 +9,7 @@ import warnings
from ..kb import KnowledgeBase, Candidate
from ..ml import empty_kb
from ..tokens import Doc
from ..tokens import Doc, Span
from .pipe import deserialize_config
from .trainable_pipe import TrainablePipe
from ..language import Language
@@ -67,7 +67,7 @@ def make_entity_linker(
incl_prior: bool,
incl_context: bool,
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]],
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
):
"""Construct an EntityLinker component.
@@ -114,7 +114,7 @@ class EntityLinker(TrainablePipe):
incl_prior: bool,
incl_context: bool,
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]],
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
) -> None:
"""Initialize an entity linker.
@@ -127,7 +127,7 @@ class EntityLinker(TrainablePipe):
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
incl_context (bool): Whether or not to include the local context in the model.
entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
DOCS: https://spacy.io/api/entitylinker#init

View File

@@ -4,7 +4,7 @@ from enum import Enum
from pydantic import BaseModel, Field, ValidationError, validator, create_model
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
from pydantic.main import ModelMetaclass
from thinc.api import Optimizer, ConfigValidationError
from thinc.api import Optimizer, ConfigValidationError, Model
from thinc.config import Promise
from collections import defaultdict
import inspect
@@ -17,6 +17,7 @@ if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
from .language import Language # noqa: F401
from .training import Example # noqa: F401
from .vocab import Vocab # noqa: F401
# fmt: off
@@ -353,7 +354,7 @@ class ConfigSchemaPretrain(BaseModel):
batcher: Batcher = Field(..., title="Batcher for the training data")
component: str = Field(..., title="Component to find the layer to pretrain")
layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
objective: Callable[["Vocab", "Model"], "Model"] = Field(..., title="A function that creates the pretraining objective.")
objective: Callable[["Vocab", Model], Model] = Field(..., title="A function that creates the pretraining objective.")
# fmt: on
class Config:

View File

@ -1,4 +1,5 @@
import pytest
import numpy
from spacy.tokens import Doc
from spacy.attrs import ORTH, SHAPE, POS, DEP, MORPH
@@ -100,14 +101,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
# head before start
arr = doc.to_array(["HEAD"])
arr[0] = -1
arr[0] = numpy.int32(-1).astype(numpy.uint64)
doc_from_array = Doc(en_vocab, words=words)
with pytest.raises(ValueError):
doc_from_array.from_array(["HEAD"], arr)
# head after end
arr = doc.to_array(["HEAD"])
arr[0] = 5
arr[0] = numpy.int32(5).astype(numpy.uint64)
doc_from_array = Doc(en_vocab, words=words)
with pytest.raises(ValueError):
doc_from_array.from_array(["HEAD"], arr)

View File

@@ -4,6 +4,7 @@ import pytest
import numpy
import logging
import mock
import warnings
from spacy.lang.xx import MultiLanguage
from spacy.tokens import Doc, Span, Token
@@ -316,9 +317,9 @@ def test_doc_from_array_sent_starts(en_vocab):
# no warning using default attrs
attrs = doc._get_array_attrs()
arr = doc.to_array(attrs)
with pytest.warns(None) as record:
with warnings.catch_warnings():
warnings.simplefilter("error")
new_doc.from_array(attrs, arr)
assert len(record) == 0
# only SENT_START uses SENT_START
attrs = [SENT_START]
arr = doc.to_array(attrs)
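
Note: pytest 7 removed `pytest.warns(None)` as a way to assert that no warning is emitted. The stdlib replacement used here escalates any warning raised inside the block to an exception, so the test fails immediately instead of inspecting a record list afterwards. A minimal sketch with a hypothetical function under test:

import warnings

def quiet_operation():  # hypothetical stand-in for the call under test
    return 42

# Equivalent of the old `with pytest.warns(None) as record:` plus
# `assert len(record) == 0`: any warning becomes an error.
with warnings.catch_warnings():
    warnings.simplefilter("error")
    quiet_operation()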

View File

@@ -2,6 +2,9 @@ import pytest
from spacy.tokens import Doc
pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_ru_doc_lemmatization(ru_lemmatizer):
words = ["мама", "мыла", "раму"]
pos = ["NOUN", "VERB", "NOUN"]

View File

@@ -2,6 +2,9 @@ import pytest
from spacy.tokens import Doc
pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_uk_lemmatizer(uk_lemmatizer):
"""Check that the default uk lemmatizer runs."""
doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])

View File

@@ -1,4 +1,5 @@
import pytest
import warnings
import srsly
from mock import Mock
from spacy.matcher import PhraseMatcher
@@ -197,13 +198,13 @@ def test_phrase_matcher_validation(en_vocab):
matcher.add("TEST1", [doc1])
with pytest.warns(UserWarning):
matcher.add("TEST2", [doc2])
with pytest.warns(None) as record:
with warnings.catch_warnings():
warnings.simplefilter("error")
matcher.add("TEST3", [doc3])
assert not record.list
matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
with pytest.warns(None) as record:
with warnings.catch_warnings():
warnings.simplefilter("error")
matcher.add("TEST4", [doc2])
assert not record.list
def test_attr_validation(en_vocab):

View File

@@ -49,8 +49,8 @@ def test_issue5551(textcat_config):
# All results should be the same because of the fixed seed
assert len(results) == 3
ops = get_current_ops()
assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]))
assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]))
assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]), decimal=5)
assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]), decimal=5)
def test_issue5838():

View File

@@ -3,6 +3,7 @@ from typing import Callable
from spacy import util
from spacy.util import ensure_path, registry, load_model_from_config
from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab
from thinc.api import Config
from ..util import make_tempdir
@@ -111,7 +112,7 @@ def test_serialize_subclassed_kb():
@registry.misc("spacy.CustomKB.v1")
def custom_kb(
entity_vector_length: int, custom_field: int
) -> Callable[["Vocab"], KnowledgeBase]:
) -> Callable[[Vocab], KnowledgeBase]:
def custom_kb_factory(vocab):
kb = SubKnowledgeBase(
vocab=vocab,

View File

@@ -10,6 +10,7 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS
from spacy.cli._util import validate_project_commands, parse_config_overrides
from spacy.cli._util import load_project_config, substitute_project_variables
from spacy.cli._util import string_to_list
from spacy.cli._util import upload_file, download_file
from thinc.api import ConfigValidationError, Config
import srsly
import os
@@ -474,3 +475,18 @@ def test_string_to_list(value):
def test_string_to_list_intify(value):
assert string_to_list(value, intify=False) == ["1", "2", "3"]
assert string_to_list(value, intify=True) == [1, 2, 3]
def test_upload_download_local_file():
with make_tempdir() as d1, make_tempdir() as d2:
filename = "f.txt"
content = "content"
local_file = d1 / filename
remote_file = d2 / filename
with local_file.open(mode="w") as file_:
file_.write(content)
upload_file(local_file, remote_file)
local_file.unlink()
download_file(remote_file, local_file)
with local_file.open(mode="r") as file_:
assert file_.read() == content

View File

@@ -20,7 +20,7 @@ def get_textcat_bow_kwargs():
def get_textcat_cnn_kwargs():
return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13}
def get_all_params(model):
@@ -62,7 +62,7 @@ def get_tok2vec_kwargs():
}
def test_tok2vec():
def make_test_tok2vec():
return build_Tok2Vec_model(**get_tok2vec_kwargs())
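
Note: pytest collects any module-level function in a test file whose name starts with `test_`, so the old `test_tok2vec()` factory was itself executed as a test, and returning a model from a "test" triggers a pytest warning. Renaming it to `make_test_tok2vec` keeps it out of collection. The distinction, sketched:

def test_addition():       # collected and executed by pytest
    assert 1 + 1 == 2

def make_sample_kwargs():  # plain helper, ignored by collection
    return {"nO": 13}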

View File

@@ -347,6 +347,7 @@ cdef class Doc:
for annot in annotations:
if annot:
if annot is heads or annot is sent_starts or annot is ent_iobs:
annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
for i in range(len(words)):
if attrs.ndim == 1:
attrs[i] = annot[i]

View File

@@ -297,7 +297,7 @@ cdef class Span:
for ancestor in ancestors:
ancestor_i = ancestor.i - self.c.start
if ancestor_i in range(length):
array[i, head_col] = ancestor_i - i
array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)
# if there is no appropriate ancestor, define a new artificial root
value = array[i, head_col]
@@ -305,7 +305,7 @@ cdef class Span:
new_root = old_to_new_root.get(ancestor_i, None)
if new_root is not None:
# take the same artificial root as a previous token from the same sentence
array[i, head_col] = new_root - i
array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
else:
# set this token as the new artificial root
array[i, head_col] = 0

View File

@@ -329,26 +329,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
if key not in IDS:
raise ValueError(Errors.E974.format(obj="token", key=key))
elif key in ["ORTH", "SPACY"]:
pass
continue
elif key == "HEAD":
attrs.append(key)
values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
row = [h-i if h is not None else 0 for i, h in enumerate(value)]
elif key == "DEP":
attrs.append(key)
values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
elif key == "SENT_START":
attrs.append(key)
values.append([to_ternary_int(v) for v in value])
row = [to_ternary_int(v) for v in value]
elif key == "MORPH":
attrs.append(key)
values.append([vocab.morphology.add(v) for v in value])
row = [vocab.morphology.add(v) for v in value]
else:
attrs.append(key)
if not all(isinstance(v, str) for v in value):
types = set([type(v) for v in value])
raise TypeError(Errors.E969.format(field=key, types=types)) from None
values.append([vocab.strings.add(v) for v in value])
array = numpy.asarray(values, dtype="uint64")
row = [vocab.strings.add(v) for v in value]
values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
array = numpy.array(values, dtype=numpy.uint64)
return attrs, array.T
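
Note: the refactor funnels every attribute row through a single `row` list so negative entries — relative heads that point backwards and ternary SENT_START values of -1 — can be wrapped element-wise before the final uint64 array is built, matching the int32-to-uint64 casts added elsewhere in this commit. A quick check of the round trip:

import numpy

# A head offset of -3 is stored as its 64-bit two's-complement pattern,
# which the C layer reinterprets as -3 when reading the array back.
assert numpy.int32(-3).astype(numpy.uint64) == numpy.uint64(2**64 - 3)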

View File

@@ -274,3 +274,5 @@ def ensure_shape(vectors_loc):
# store all the results in a list in memory
lines2 = open_file(vectors_loc)
yield from lines2
lines2.close()
lines.close()
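
Note: in a generator, statements after `yield from` only run once the delegated iterator is exhausted, so the two `close()` calls execute exactly when the caller has consumed every line, closing the handles that `open_file` leaked before. A minimal sketch of the pattern, with an illustrative path:

with open("vectors.txt", "w") as f:  # set up an example file
    f.write("0.1 0.2\n")

def read_lines(path):
    f = open(path)
    yield from f
    f.close()  # reached only after the last line has been yielded

for line in read_lines("vectors.txt"):  # file closed when the loop ends
    pass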