Merge pull request #11966 from adrianeboyd/backport/v3.1.7

Backport bug fixes to v3.1.x
commit a898c7e9eb by Adriane Boyd, 2022-12-14 20:44:36 +01:00 (committed by GitHub)
23 changed files with 140 additions and 101 deletions

View File

@@ -1,9 +1,6 @@
 parameters:
   python_version: ''
-  architecture: ''
-  prefix: ''
-  gpu: false
-  num_build_jobs: 1
+  architecture: 'x64'

 steps:
   - task: UsePythonVersion@0
@@ -16,16 +13,16 @@ steps:
     displayName: 'Set variables'

   - script: |
-      ${{ parameters.prefix }} python -m pip install -U pip setuptools
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+      python -m pip install -U build pip setuptools
+      python -m pip install -U -r requirements.txt
     displayName: "Install dependencies"

   - script: |
-      ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
-      ${{ parameters.prefix }} python setup.py sdist --formats=gztar
-    displayName: "Compile and build sdist"
+      python -m build --sdist
+    displayName: "Build sdist"

-  - script: python -m mypy spacy
+  - script: |
+      python -m mypy spacy
     displayName: 'Run mypy'
     condition: ne(variables['python_version'], '3.10')
@@ -34,35 +31,24 @@ steps:
       contents: "spacy"
     displayName: "Delete source directory"

+  - task: DeleteFiles@1
+    inputs:
+      contents: "*.egg-info"
+    displayName: "Delete egg-info directory"
+
   - script: |
-      ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
-      ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
+      python -m pip freeze > installed.txt
+      python -m pip uninstall -y -r installed.txt
     displayName: "Uninstall all packages"

   - bash: |
-      ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
-      ${{ parameters.prefix }} python -m pip install dist/$SDIST
+      SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+      python -m pip install dist/$SDIST
     displayName: "Install from sdist"

   - script: |
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
-    displayName: "Install test requirements"
-
-  - script: |
-      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
-      ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
-    displayName: "Install GPU requirements"
-    condition: eq(${{ parameters.gpu }}, true)
-
-  - script: |
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy
-    displayName: "Run CPU tests"
-    condition: eq(${{ parameters.gpu }}, false)
-
-  - script: |
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu
-    displayName: "Run GPU tests"
-    condition: eq(${{ parameters.gpu }}, true)
+      python -W error -c "import spacy"
+    displayName: "Test import"

   - script: |
       python -m spacy download ca_core_news_sm
@@ -105,13 +91,21 @@ steps:
     displayName: 'Test assemble CLI vectors warning'
     condition: eq(variables['python_version'], '3.8')

+  - script: |
+      python -m pip install -U -r requirements.txt
+    displayName: "Install test requirements"
+
+  - script: |
+      python -m pytest --pyargs spacy -W error
+    displayName: "Run CPU tests"
+
+  - script: |
+      python -m pip install 'spacy[apple]'
+      python -m pytest --pyargs spacy
+    displayName: "Run CPU tests with thinc-apple-ops"
+    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10'))
+
   - script: |
       python .github/validate_universe_json.py website/meta/universe.json
     displayName: 'Test website/meta/universe.json'
     condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      ${{ parameters.prefix }} python -m pip install thinc-apple-ops
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy
-    displayName: "Run CPU tests with thinc-apple-ops"
-    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.9'))
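
Note: the template now builds an sdist with python -m build and smoke-tests the installed package by importing it with warnings escalated to errors, before the pytest runs later in the pipeline. A rough Python equivalent of that import check, as a sketch (the real CI step runs the same command directly from a shell script, and it assumes spaCy is already installed from the sdist):

    import subprocess
    import sys

    # Escalate all warnings to errors while importing spacy, mirroring the
    # "Test import" step above.
    subprocess.run([sys.executable, "-W", "error", "-c", "import spacy"], check=True)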

View File

@@ -5,7 +5,7 @@ repos:
       - id: black
         language_version: python3.7
   - repo: https://gitlab.com/pycqa/flake8
-    rev: 3.9.2
+    rev: 5.0.4
     hooks:
       - id: flake8
         args:

View File

@@ -29,7 +29,7 @@ jobs:
         inputs:
           versionSpec: "3.7"
       - script: |
-          pip install flake8==3.9.2
+          pip install flake8==5.0.4
           python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
         displayName: "flake8"
@@ -39,7 +39,7 @@ jobs:
       matrix:
         # We're only running one platform per Python version to speed up builds
         Python36Linux:
-          imageName: "ubuntu-latest"
+          imageName: "ubuntu-20.04"
           python.version: "3.6"
         # Python36Windows:
         #   imageName: "windows-latest"
@@ -48,7 +48,7 @@ jobs:
        #   imageName: "macos-latest"
        #   python.version: "3.6"
        # Python37Linux:
-       #   imageName: "ubuntu-latest"
+       #   imageName: "ubuntu-20.04"
        #   python.version: "3.7"
        Python37Windows:
          imageName: "windows-latest"
@@ -90,20 +90,3 @@ jobs:
       - template: .github/azure-steps.yml
         parameters:
           python_version: '$(python.version)'
-          architecture: 'x64'
-
-  # - job: "TestGPU"
-  #   dependsOn: "Validate"
-  #   strategy:
-  #     matrix:
-  #       Python38LinuxX64_GPU:
-  #         python.version: '3.8'
-  #   pool:
-  #     name: "LinuxX64_GPU"
-  #   steps:
-  #     - template: .github/azure-steps.yml
-  #       parameters:
-  #         python_version: '$(python.version)'
-  #         architecture: 'x64'
-  #         gpu: true
-  #         num_build_jobs: 24

View File

@@ -10,8 +10,8 @@ wasabi>=0.8.1,<1.1.0
 srsly>=2.4.1,<3.0.0
 catalogue>=2.0.6,<2.1.0
 typer>=0.3.0,<0.5.0
-click<8.1.0
 pathy>=0.3.5
+smart-open>=5.2.1,<7.0.0
 # Third party dependencies
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
@@ -28,7 +28,7 @@ cython>=0.25,<3.0
 pytest>=5.2.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
-flake8>=3.8.0,<3.10.0
+flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
 mypy==0.910
 types-dataclasses>=0.1.3; python_version < "3.7"

View File

@@ -50,10 +50,10 @@ install_requires =
     wasabi>=0.8.1,<1.1.0
     srsly>=2.4.1,<3.0.0
     catalogue>=2.0.6,<2.1.0
-    typer>=0.3.0,<0.5.0
-    click<8.1.0
-    pathy>=0.3.5
     # Third-party dependencies
+    typer>=0.3.0,<0.5.0
+    pathy>=0.3.5
+    smart-open>=5.2.1,<7.0.0
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0

View File

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.1.6"
+__version__ = "3.1.7"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

View File

@@ -358,7 +358,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
     if dest.exists() and not force:
         return None
     src = str(src)
-    with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
+    with smart_open.open(src, mode="rb", compression="disable") as input_file:
         with dest.open(mode="wb") as output_file:
             output_file.write(input_file.read())
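
The keyword change tracks the smart_open API: newer smart_open releases dropped the ignore_ext argument, and compression="disable" is the equivalent way to read the source as raw bytes without transparent decompression, which matches the new smart-open>=5.2.1 pin added to requirements.txt and setup.cfg above. A minimal, self-contained sketch of the call (the file path is a placeholder, not taken from the diff):

    import smart_open

    # Write a small local file, then read it back as raw bytes with
    # extension-based decompression disabled.
    with open("example.txt", "w", encoding="utf8") as f:
        f.write("hello")
    with smart_open.open("example.txt", mode="rb", compression="disable") as f:
        data = f.read()
    assert data == b"hello"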

View File

@@ -1,7 +1,7 @@
 {# This is a template for training configs used for the quickstart widget in
 the docs and the init config command. It encodes various best practices and
 can help generate the best possible configuration, given a user's requirements. #}
-{%- set use_transformer = hardware != "cpu" -%}
+{%- set use_transformer = hardware != "cpu" and transformer_data -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
 [paths]
 train = null

View File

@@ -316,6 +316,11 @@ class Errors(metaclass=ErrorsWithCodes):
             "clear the existing vectors and resize the table.")
     E074 = ("Error interpreting compiled match pattern: patterns are expected "
             "to end with the attribute {attr}. Got: {bad_attr}.")
+    E079 = ("Error computing states in beam: number of predicted beams "
+            "({pbeams}) does not equal number of gold beams ({gbeams}).")
+    E080 = ("Duplicate state found in beam: {key}.")
+    E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
+            "does not equal number of losses ({losses}).")
     E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
             "projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
             "match.")

View File

@@ -229,6 +229,9 @@ class SpanCategorizer(TrainablePipe):
         DOCS: https://spacy.io/api/spancategorizer#predict
         """
         indices = self.suggester(docs, ops=self.model.ops)
-        scores = self.model.predict((docs, indices))  # type: ignore
+        if indices.lengths.sum() == 0:
+            scores = self.model.ops.alloc2f(0, 0)
+        else:
+            scores = self.model.predict((docs, indices))  # type: ignore
         return indices, scores
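
This guard avoids calling the span categorization model when the suggester produced no candidate spans for the entire batch, which would otherwise fail on zero-length input; an empty score matrix is returned instead. A small standalone sketch of what that fallback value looks like, using Thinc's current ops (illustrative only, not spaCy internals):

    from thinc.api import get_current_ops

    ops = get_current_ops()
    scores = ops.alloc2f(0, 0)  # empty float matrix, shape (0, 0)
    assert scores.shape == (0, 0)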

View File

@@ -1,4 +1,6 @@
 import pytest
+import numpy

 from spacy.tokens import Doc
 from spacy.attrs import ORTH, SHAPE, POS, DEP, MORPH
@@ -100,14 +102,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
     # head before start
     arr = doc.to_array(["HEAD"])
-    arr[0] = -1
+    arr[0] = numpy.int32(-1).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)

     # head after end
     arr = doc.to_array(["HEAD"])
-    arr[0] = 5
+    arr[0] = numpy.int32(5).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)
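
The test now writes the out-of-bounds head as an explicit int32-to-uint64 cast because HEAD values live in an unsigned 64-bit array, and recent numpy versions reject (or deprecate) assigning a bare negative Python int to a uint64 array. A short sketch of the wraparound, for illustration; the same encoding is applied in the doc.pyx, span.pyx, and example.pyx changes further down:

    import numpy

    # -1 reinterpreted as uint64 becomes 2**64 - 1, matching how negative
    # relative head offsets are stored in the uint64 attribute array.
    value = numpy.int32(-1).astype(numpy.uint64)
    print(int(value))  # 18446744073709551615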

View File

@@ -2,6 +2,7 @@ import weakref
 import pytest
 import numpy
+import warnings

 from spacy.lang.xx import MultiLanguage
 from spacy.tokens import Doc, Span, Token
@@ -311,9 +312,9 @@ def test_doc_from_array_sent_starts(en_vocab):
     # no warning using default attrs
     attrs = doc._get_array_attrs()
     arr = doc.to_array(attrs)
-    with pytest.warns(None) as record:
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
         new_doc.from_array(attrs, arr)
-    assert len(record) == 0
     # only SENT_START uses SENT_START
     attrs = [SENT_START]
     arr = doc.to_array(attrs)
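
pytest.warns(None) is deprecated in pytest 7, so this test (and the phrase matcher test below) switches to the standard-library idiom for asserting that no warning is raised: inside warnings.catch_warnings() with simplefilter("error"), any warning becomes an exception and fails the test. A generic sketch of the pattern, with a harmless stand-in for the call under test:

    import warnings

    with warnings.catch_warnings():
        warnings.simplefilter("error")
        value = int("42")  # stand-in for the call that must not emit a warning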

View File

@@ -2,6 +2,9 @@ import pytest
 from spacy.tokens import Doc

+pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
+

 def test_ru_doc_lemmatization(ru_lemmatizer):
     words = ["мама", "мыла", "раму"]
     pos = ["NOUN", "VERB", "NOUN"]

View File

@@ -1,6 +1,10 @@
+import pytest
+
 from spacy.tokens import Doc

+pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
+

 def test_uk_lemmatizer(uk_lemmatizer):
     """Check that the default uk lemmatizer runs."""
     doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
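
The module-level pytestmark applies the filterwarnings mark to every test in the file, silencing DeprecationWarning (presumably emitted by the pymorphy2-based ru/uk lemmatizer backend) now that the CI run treats warnings as errors. For comparison, the same filter can also be scoped to a single test; a sketch, not part of the diff:

    import pytest

    @pytest.mark.filterwarnings("ignore::DeprecationWarning")
    def test_only_this_case():
        ...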

View File

@@ -1,4 +1,5 @@
 import pytest
+import warnings
 import srsly
 from mock import Mock
 from spacy.matcher import PhraseMatcher
@@ -197,13 +198,13 @@ def test_phrase_matcher_validation(en_vocab):
     matcher.add("TEST1", [doc1])
     with pytest.warns(UserWarning):
         matcher.add("TEST2", [doc2])
-    with pytest.warns(None) as record:
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
         matcher.add("TEST3", [doc3])
-        assert not record.list
     matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
-    with pytest.warns(None) as record:
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
         matcher.add("TEST4", [doc2])
-        assert not record.list


 def test_attr_validation(en_vocab):

View File

@@ -369,24 +369,39 @@ def test_overfitting_IO_overlapping():
 def test_zero_suggestions():
-    # Test with a suggester that returns 0 suggestions
-    @registry.misc("test_zero_suggester")
-    def make_zero_suggester():
-        def zero_suggester(docs, *, ops=None):
+    # Test with a suggester that can return 0 suggestions
+    @registry.misc("test_mixed_zero_suggester")
+    def make_mixed_zero_suggester():
+        def mixed_zero_suggester(docs, *, ops=None):
             if ops is None:
                 ops = get_current_ops()
-            return Ragged(
-                ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i")
-            )
+            spans = []
+            lengths = []
+            for doc in docs:
+                if len(doc) > 0 and len(doc) % 2 == 0:
+                    spans.append((0, 1))
+                    lengths.append(1)
+                else:
+                    lengths.append(0)
+            spans = ops.asarray2i(spans)
+            lengths_array = ops.asarray1i(lengths)
+            if len(spans) > 0:
+                output = Ragged(ops.xp.vstack(spans), lengths_array)
+            else:
+                output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
+            return output

-        return zero_suggester
+        return mixed_zero_suggester

     fix_random_seed(0)
     nlp = English()
     spancat = nlp.add_pipe(
         "spancat",
-        config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY},
+        config={
+            "suggester": {"@misc": "test_mixed_zero_suggester"},
+            "spans_key": SPAN_KEY,
+        },
     )
     train_examples = make_examples(nlp)
     optimizer = nlp.initialize(get_examples=lambda: train_examples)
@@ -394,3 +409,13 @@ def test_zero_suggestions():
     assert set(spancat.labels) == {"LOC", "PERSON"}
     nlp.update(train_examples, sgd=optimizer)
+
+    # empty doc
+    nlp("")
+    # single doc with zero suggestions
+    nlp("one")
+    # single doc with one suggestion
+    nlp("two two")
+    # batch with mixed zero/one suggestions
+    list(nlp.pipe(["one", "two two", "three three three", "", "four four four four"]))
+    # batch with no suggestions
+    list(nlp.pipe(["", "one", "three three three"]))
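
The reworked suggester returns one (0, 1) span for docs with an even, non-zero token count and no spans otherwise, so a single batch can mix docs with and without suggestions; the calls added at the end exercise empty docs, zero-suggestion docs, and mixed batches end to end. A standalone sketch of how such a mixed Ragged is assembled with Thinc, mirroring the helper above (illustrative only):

    from thinc.api import get_current_ops
    from thinc.types import Ragged

    ops = get_current_ops()
    spans = ops.asarray2i([(0, 1)])  # one suggested span for the first doc
    lengths = ops.asarray1i([1, 0])  # second doc gets zero suggestions
    ragged = Ragged(spans, lengths)
    assert ragged.lengths.sum() == 1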

View File

@@ -11,6 +11,7 @@ from spacy.cli._util import validate_project_commands, parse_config_overrides
 from spacy.cli._util import load_project_config, substitute_project_variables
 from spacy.cli._util import is_subpath_of
 from spacy.cli._util import string_to_list
+from spacy.cli._util import upload_file, download_file
 from spacy import about
 from spacy.util import get_minor_version
 from spacy.cli.validate import get_model_pkgs
@@ -574,3 +575,18 @@ def test_get_third_party_dependencies():
 )
 def test_is_subpath_of(parent, child, expected):
     assert is_subpath_of(parent, child) == expected
+
+
+def test_upload_download_local_file():
+    with make_tempdir() as d1, make_tempdir() as d2:
+        filename = "f.txt"
+        content = "content"
+        local_file = d1 / filename
+        remote_file = d2 / filename
+        with local_file.open(mode="w") as file_:
+            file_.write(content)
+        upload_file(local_file, remote_file)
+        local_file.unlink()
+        download_file(remote_file, local_file)
+        with local_file.open(mode="r") as file_:
+            assert file_.read() == content

View File

@@ -23,7 +23,7 @@ def get_textcat_bow_kwargs():
 def get_textcat_cnn_kwargs():
-    return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
+    return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13}


 def get_all_params(model):
@@ -65,7 +65,7 @@ def get_tok2vec_kwargs():
     }


-def test_tok2vec():
+def make_test_tok2vec():
     return build_Tok2Vec_model(**get_tok2vec_kwargs())

View File

@@ -7,7 +7,7 @@ from ..util import get_cosine, add_vecs_to_vocab
 @pytest.fixture
 def vectors():
-    return [("apple", [1, 2, 3]), ("orange", [-1, -2, -3])]
+    return [("apple", [1, 2, 3]), ("orange", [-1, -2, -5])]


 @pytest.fixture()
@@ -44,7 +44,6 @@ def test_vectors_similarity_TT(vocab, vectors):
 def test_vectors_similarity_TD(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = Doc(vocab, words=[word1, word2])
-    with pytest.warns(UserWarning):
-        assert doc.similarity(doc[0]) == doc[0].similarity(doc)
+    assert doc.similarity(doc[0]) == doc[0].similarity(doc)
@@ -57,5 +56,4 @@ def test_vectors_similarity_DS(vocab, vectors):
 def test_vectors_similarity_TS(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = Doc(vocab, words=[word1, word2])
-    with pytest.warns(UserWarning):
-        assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])
+    assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])

View File

@@ -356,6 +356,7 @@ cdef class Doc:
             for annot in annotations:
                 if annot:
                     if annot is heads or annot is sent_starts or annot is ent_iobs:
+                        annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                         for i in range(len(words)):
                             if attrs.ndim == 1:
                                 attrs[i] = annot[i]

View File

@@ -307,7 +307,7 @@ cdef class Span:
             for ancestor in ancestors:
                 ancestor_i = ancestor.i - self.c.start
                 if ancestor_i in range(length):
-                    array[i, head_col] = ancestor_i - i
+                    array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)

         # if there is no appropriate ancestor, define a new artificial root
         value = array[i, head_col]
@@ -315,7 +315,7 @@ cdef class Span:
             new_root = old_to_new_root.get(ancestor_i, None)
             if new_root is not None:
                 # take the same artificial root as a previous token from the same sentence
-                array[i, head_col] = new_root - i
+                array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
             else:
                 # set this token as the new artificial root
                 array[i, head_col] = 0

View File

@@ -333,26 +333,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
         if key not in IDS:
             raise ValueError(Errors.E974.format(obj="token", key=key))
         elif key in ["ORTH", "SPACY"]:
-            pass
+            continue
         elif key == "HEAD":
             attrs.append(key)
-            values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
+            row = [h-i if h is not None else 0 for i, h in enumerate(value)]
         elif key == "DEP":
             attrs.append(key)
-            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
+            row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
         elif key == "SENT_START":
             attrs.append(key)
-            values.append([to_ternary_int(v) for v in value])
+            row = [to_ternary_int(v) for v in value]
         elif key == "MORPH":
             attrs.append(key)
-            values.append([vocab.morphology.add(v) for v in value])
+            row = [vocab.morphology.add(v) for v in value]
         else:
             attrs.append(key)
             if not all(isinstance(v, str) for v in value):
                 types = set([type(v) for v in value])
                 raise TypeError(Errors.E969.format(field=key, types=types)) from None
-            values.append([vocab.strings.add(v) for v in value])
-    array = numpy.asarray(values, dtype="uint64")
+            row = [vocab.strings.add(v) for v in value]
+        values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
+    array = numpy.array(values, dtype=numpy.uint64)
     return attrs, array.T

View File

@@ -290,3 +290,5 @@ def ensure_shape(vectors_loc):
         # store all the results in a list in memory
         lines2 = open_file(vectors_loc)
         yield from lines2
+        lines2.close()
+        lines.close()