Merge pull request #11964 from adrianeboyd/backport/v3.2.5

Backport bug fixes to v3.2.x
Set version to v3.2.5
2025-08-04 20:30:24 +03:00 · 2022-12-14 18:33:05 +01:00 · 2022-12-13 13:21:53 +01:00 · 2022-12-13 13:21:41 +01:00 · 2022-12-13 13:21:41 +01:00 · 2022-12-13 13:21:41 +01:00
27 changed files with 173 additions and 106 deletions
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@ -1,9 +1,7 @@
 parameters:
  python_version: ''
-  architecture: ''
-  prefix: ''
-  gpu: false
-  num_build_jobs: 1
+  architecture: 'x64'
+  num_build_jobs: 2

 steps:
  - task: UsePythonVersion@0
@ -16,16 +14,16 @@ steps:
    displayName: 'Set variables'

  - script: |
-      ${{ parameters.prefix }} python -m pip install -U pip setuptools
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+      python -m pip install -U build pip setuptools
+      python -m pip install -U -r requirements.txt
    displayName: "Install dependencies"

  - script: |
-      ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
-      ${{ parameters.prefix }} python setup.py sdist --formats=gztar
-    displayName: "Compile and build sdist"
+      python -m build --sdist
+    displayName: "Build sdist"

-  - script: python -m mypy spacy
+  - script: |
+      python -m mypy spacy
    displayName: 'Run mypy'
    condition: ne(variables['python_version'], '3.10')

@ -34,35 +32,24 @@ steps:
      contents: "spacy"
    displayName: "Delete source directory"

+  - task: DeleteFiles@1
+    inputs:
+      contents: "*.egg-info"
+    displayName: "Delete egg-info directory"
+
  - script: |
-      ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
-      ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
+      python -m pip freeze > installed.txt
+      python -m pip uninstall -y -r installed.txt
    displayName: "Uninstall all packages"

  - bash: |
-      ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
-      ${{ parameters.prefix }} python -m pip install dist/$SDIST
+      SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+      SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
    displayName: "Install from sdist"

  - script: |
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
-    displayName: "Install test requirements"
-
-  - script: |
-      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
-      ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
-    displayName: "Install GPU requirements"
-    condition: eq(${{ parameters.gpu }}, true)
-
-  - script: |
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy
-    displayName: "Run CPU tests"
-    condition: eq(${{ parameters.gpu }}, false)
-
-  - script: |
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu
-    displayName: "Run GPU tests"
-    condition: eq(${{ parameters.gpu }}, true)
+      python -W error -c "import spacy"
+    displayName: "Test import"

  - script: |
      python -m spacy download ca_core_news_sm
@ -105,13 +92,21 @@ steps:
    displayName: 'Test assemble CLI vectors warning'
    condition: eq(variables['python_version'], '3.8')

+  - script: |
+      python -m pip install -U -r requirements.txt
+    displayName: "Install test requirements"
+
+  - script: |
+      python -m pytest --pyargs spacy -W error
+    displayName: "Run CPU tests"
+
+  - script: |
+      python -m pip install 'spacy[apple]'
+      python -m pytest --pyargs spacy
+    displayName: "Run CPU tests with thinc-apple-ops"
+    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10'))
+
  - script: |
      python .github/validate_universe_json.py website/meta/universe.json
    displayName: 'Test website/meta/universe.json'
    condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      ${{ parameters.prefix }} python -m pip install thinc-apple-ops
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy
-    displayName: "Run CPU tests with thinc-apple-ops"
-    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.9'))
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -5,7 +5,7 @@ repos:
    - id: black
      language_version: python3.7
 -   repo: https://gitlab.com/pycqa/flake8
-    rev: 3.9.2
+    rev: 5.0.4
    hooks:
    - id: flake8
      args:
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@ -29,7 +29,7 @@ jobs:
        inputs:
          versionSpec: "3.7"
      - script: |
-          pip install flake8==3.9.2
+          pip install flake8==5.0.4
          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
        displayName: "flake8"

@ -39,7 +39,7 @@ jobs:
      matrix:
        # We're only running one platform per Python version to speed up builds
        Python36Linux:
-          imageName: "ubuntu-latest"
+          imageName: "ubuntu-20.04"
          python.version: "3.6"
        #        Python36Windows:
        #          imageName: "windows-latest"
@ -48,7 +48,7 @@ jobs:
        #          imageName: "macos-latest"
        #          python.version: "3.6"
        #        Python37Linux:
-        #          imageName: "ubuntu-latest"
+        #          imageName: "ubuntu-20.04"
        #          python.version: "3.7"
        Python37Windows:
          imageName: "windows-latest"
@ -90,20 +90,3 @@ jobs:
      - template: .github/azure-steps.yml
        parameters:
          python_version: '$(python.version)'
-          architecture: 'x64'
-
-#  - job: "TestGPU"
-#    dependsOn: "Validate"
-#    strategy:
-#      matrix:
-#        Python38LinuxX64_GPU:
-#          python.version: '3.8'
-#    pool:
-#      name: "LinuxX64_GPU"
-#    steps:
-#      - template: .github/azure-steps.yml
-#        parameters:
-#          python_version: '$(python.version)'
-#          architecture: 'x64'
-#          gpu: true
-#          num_build_jobs: 24
--- a/build-constraints.txt
+++ b/build-constraints.txt
@ -1,6 +1,8 @@
 # build version constraints for use with wheelwright + multibuild
-numpy==1.15.0; python_version<='3.7'
-numpy==1.17.3; python_version=='3.8'
+numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
+numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
+numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
+numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
 numpy==1.19.3; python_version=='3.9'
 numpy==1.21.3; python_version=='3.10'
 numpy; python_version>='3.11'
--- a/requirements.txt
+++ b/requirements.txt
@ -12,6 +12,7 @@ srsly>=2.4.1,<3.0.0
 catalogue>=2.0.6,<2.1.0
 typer>=0.3.0,<0.5.0
 pathy>=0.3.5
+smart-open>=5.2.1,<7.0.0
 # Third party dependencies
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
--- a/setup.cfg
+++ b/setup.cfg
@ -51,9 +51,10 @@ install_requires =
    wasabi>=0.8.1,<1.1.0
    srsly>=2.4.1,<3.0.0
    catalogue>=2.0.6,<2.1.0
+    # Third-party dependencies
    typer>=0.3.0,<0.5.0
    pathy>=0.3.5
-    # Third-party dependencies
+    smart-open>=5.2.1,<7.0.0
    tqdm>=4.38.0,<5.0.0
    numpy>=1.15.0
    requests>=2.13.0,<3.0.0
--- a/setup.py
+++ b/setup.py
@ -125,6 +125,8 @@ class build_ext_options:

 class build_ext_subclass(build_ext, build_ext_options):
    def build_extensions(self):
+        if not self.parallel:
+            self.parallel = int(os.environ.get("SPACY_NUM_BUILD_JOBS", 1))
        build_ext_options.build_options(self)
        build_ext.build_extensions(self)

--- a/spacy/about.py
+++ b/spacy/about.py
@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.2.2"
+__version__ = "3.2.5"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@ -358,7 +358,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
    if dest.exists() and not force:
        return None
    src = str(src)
-    with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
+    with smart_open.open(src, mode="rb", compression="disable") as input_file:
        with dest.open(mode="wb") as output_file:
            output_file.write(input_file.read())

--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@ -1,7 +1,7 @@
 {# This is a template for training configs used for the quickstart widget in
 the docs and the init config command. It encodes various best practices and
 can help generate the best possible configuration, given a user's requirements. #}
-{%- set use_transformer = hardware != "cpu" -%}
+{%- set use_transformer = hardware != "cpu" and transformer_data -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
 [paths]
 train = null
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -322,6 +322,11 @@ class Errors(metaclass=ErrorsWithCodes):
            "clear the existing vectors and resize the table.")
    E074 = ("Error interpreting compiled match pattern: patterns are expected "
            "to end with the attribute {attr}. Got: {bad_attr}.")
+    E079 = ("Error computing states in beam: number of predicted beams "
+            "({pbeams}) does not equal number of gold beams ({gbeams}).")
+    E080 = ("Duplicate state found in beam: {key}.")
+    E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
+            "does not equal number of losses ({losses}).")
    E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
            "projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
            "match.")
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@ -268,7 +268,10 @@ class SpanCategorizer(TrainablePipe):
        DOCS: https://spacy.io/api/spancategorizer#predict
        """
        indices = self.suggester(docs, ops=self.model.ops)
-        scores = self.model.predict((docs, indices))  # type: ignore
+        if indices.lengths.sum() == 0:
+            scores = self.model.ops.alloc2f(0, 0)
+        else:
+            scores = self.model.predict((docs, indices))  # type: ignore
        return indices, scores

    def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None:
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@ -118,6 +118,10 @@ class Tok2Vec(TrainablePipe):

        DOCS: https://spacy.io/api/tok2vec#predict
        """
+        if not any(len(doc) for doc in docs):
+            # Handle cases where there are no tokens in any docs.
+            width = self.model.get_dim("nO")
+            return [self.model.ops.alloc((0, width)) for doc in docs]
        tokvecs = self.model.predict(docs)
        batch_id = Tok2VecListener.get_batch_id(docs)
        for listener in self.listeners:
--- a/spacy/tests/doc/test_array.py
+++ b/spacy/tests/doc/test_array.py
@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):

    # head before start
    arr = doc.to_array(["HEAD"])
-    arr[0] = -1
+    arr[0] = numpy.int32(-1).astype(numpy.uint64)
    doc_from_array = Doc(en_vocab, words=words)
    with pytest.raises(ValueError):
        doc_from_array.from_array(["HEAD"], arr)

    # head after end
    arr = doc.to_array(["HEAD"])
-    arr[0] = 5
+    arr[0] = numpy.int32(5).astype(numpy.uint64)
    doc_from_array = Doc(en_vocab, words=words)
    with pytest.raises(ValueError):
        doc_from_array.from_array(["HEAD"], arr)
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@ -2,6 +2,7 @@ import weakref

 import numpy
 import pytest
+import warnings
 from thinc.api import NumpyOps, get_current_ops

 from spacy.attrs import DEP, ENT_IOB, ENT_TYPE, HEAD, IS_ALPHA, MORPH, POS
@ -528,9 +529,9 @@ def test_doc_from_array_sent_starts(en_vocab):
    # no warning using default attrs
    attrs = doc._get_array_attrs()
    arr = doc.to_array(attrs)
-    with pytest.warns(None) as record:
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
        new_doc.from_array(attrs, arr)
-        assert len(record) == 0
    # only SENT_START uses SENT_START
    attrs = [SENT_START]
    arr = doc.to_array(attrs)
--- a/spacy/tests/lang/ru/test_lemmatizer.py
+++ b/spacy/tests/lang/ru/test_lemmatizer.py
@ -2,6 +2,9 @@ import pytest
 from spacy.tokens import Doc


+pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
+
+
 def test_ru_doc_lemmatization(ru_lemmatizer):
    words = ["мама", "мыла", "раму"]
    pos = ["NOUN", "VERB", "NOUN"]
--- a/spacy/tests/lang/uk/test_lemmatizer.py
+++ b/spacy/tests/lang/uk/test_lemmatizer.py
@ -1,6 +1,10 @@
+import pytest
 from spacy.tokens import Doc


+pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
+
+
 def test_uk_lemmatizer(uk_lemmatizer):
    """Check that the default uk lemmatizer runs."""
    doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
--- a/spacy/tests/matcher/test_phrase_matcher.py
+++ b/spacy/tests/matcher/test_phrase_matcher.py
@ -1,4 +1,5 @@
 import pytest
+import warnings
 import srsly
 from mock import Mock

@ -314,13 +315,13 @@ def test_phrase_matcher_validation(en_vocab):
        matcher.add("TEST1", [doc1])
    with pytest.warns(UserWarning):
        matcher.add("TEST2", [doc2])
-    with pytest.warns(None) as record:
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
        matcher.add("TEST3", [doc3])
-        assert not record.list
    matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
-    with pytest.warns(None) as record:
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
        matcher.add("TEST4", [doc2])
-        assert not record.list


 def test_attr_validation(en_vocab):
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@ -372,24 +372,39 @@ def test_overfitting_IO_overlapping():


 def test_zero_suggestions():
-    # Test with a suggester that returns 0 suggestions
+    # Test with a suggester that can return 0 suggestions

-    @registry.misc("test_zero_suggester")
-    def make_zero_suggester():
-        def zero_suggester(docs, *, ops=None):
+    @registry.misc("test_mixed_zero_suggester")
+    def make_mixed_zero_suggester():
+        def mixed_zero_suggester(docs, *, ops=None):
            if ops is None:
                ops = get_current_ops()
-            return Ragged(
-                ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i")
-            )
+            spans = []
+            lengths = []
+            for doc in docs:
+                if len(doc) > 0 and len(doc) % 2 == 0:
+                    spans.append((0, 1))
+                    lengths.append(1)
+                else:
+                    lengths.append(0)
+            spans = ops.asarray2i(spans)
+            lengths_array = ops.asarray1i(lengths)
+            if len(spans) > 0:
+                output = Ragged(ops.xp.vstack(spans), lengths_array)
+            else:
+                output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
+            return output

-        return zero_suggester
+        return mixed_zero_suggester

    fix_random_seed(0)
    nlp = English()
    spancat = nlp.add_pipe(
        "spancat",
-        config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY},
+        config={
+            "suggester": {"@misc": "test_mixed_zero_suggester"},
+            "spans_key": SPAN_KEY,
+        },
    )
    train_examples = make_examples(nlp)
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
@ -397,3 +412,13 @@ def test_zero_suggestions():
    assert set(spancat.labels) == {"LOC", "PERSON"}

    nlp.update(train_examples, sgd=optimizer)
+    # empty doc
+    nlp("")
+    # single doc with zero suggestions
+    nlp("one")
+    # single doc with one suggestion
+    nlp("two two")
+    # batch with mixed zero/one suggestions
+    list(nlp.pipe(["one", "two two", "three three three", "", "four four four four"]))
+    # batch with no suggestions
+    list(nlp.pipe(["", "one", "three three three"]))
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@ -11,7 +11,7 @@ from spacy.lang.en import English
 from thinc.api import Config, get_current_ops
 from numpy.testing import assert_array_equal

-from ..util import get_batch, make_tempdir
+from ..util import get_batch, make_tempdir, add_vecs_to_vocab


 def test_empty_doc():
@ -140,9 +140,25 @@ TRAIN_DATA = [
 ]


-def test_tok2vec_listener():
+@pytest.mark.parametrize("with_vectors", (False, True))
+def test_tok2vec_listener(with_vectors):
    orig_config = Config().from_str(cfg_string)
+    orig_config["components"]["tok2vec"]["model"]["embed"][
+        "include_static_vectors"
+    ] = with_vectors
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+
+    if with_vectors:
+        ops = get_current_ops()
+        vectors = [
+            ("apple", ops.asarray([1, 2, 3])),
+            ("orange", ops.asarray([-1, -2, -3])),
+            ("and", ops.asarray([-1, -1, -1])),
+            ("juice", ops.asarray([5, 5, 10])),
+            ("pie", ops.asarray([7, 6.3, 8.9])),
+        ]
+        add_vecs_to_vocab(nlp.vocab, vectors)
+
    assert nlp.pipe_names == ["tok2vec", "tagger"]
    tagger = nlp.get_pipe("tagger")
    tok2vec = nlp.get_pipe("tok2vec")
@ -169,6 +185,9 @@ def test_tok2vec_listener():
    ops = get_current_ops()
    assert_array_equal(ops.to_numpy(doc.tensor), ops.to_numpy(doc_tensor))

+    # test with empty doc
+    doc = nlp("")
+
    # TODO: should this warn or error?
    nlp.select_pipes(disable="tok2vec")
    assert nlp.pipe_names == ["tagger"]
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@ -12,6 +12,7 @@ from spacy.cli._util import is_subpath_of, load_project_config
 from spacy.cli._util import parse_config_overrides, string_to_list
 from spacy.cli._util import substitute_project_variables
 from spacy.cli._util import validate_project_commands
+from spacy.cli._util import upload_file, download_file
 from spacy.cli.debug_data import _compile_gold, _get_labels_from_model
 from spacy.cli.debug_data import _get_labels_from_spancat
 from spacy.cli.download import get_compatibility, get_version
@ -719,4 +720,19 @@ def test_debug_data_compile_gold():
    ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "B-ENT", "I-ENT"])
    eg = Example(pred, ref)
    data = _compile_gold([eg], ["ner"], nlp, True)
-    assert data["boundary_cross_ents"] == 1
+    assert data["boundary_cross_ents"] == 1
+
+
+def test_upload_download_local_file():
+    with make_tempdir() as d1, make_tempdir() as d2:
+        filename = "f.txt"
+        content = "content"
+        local_file = d1 / filename
+        remote_file = d2 / filename
+        with local_file.open(mode="w") as file_:
+            file_.write(content)
+        upload_file(local_file, remote_file)
+        local_file.unlink()
+        download_file(remote_file, local_file)
+        with local_file.open(mode="r") as file_:
+            assert file_.read() == content
--- a/spacy/tests/test_models.py
+++ b/spacy/tests/test_models.py
@ -23,7 +23,7 @@ def get_textcat_bow_kwargs():


 def get_textcat_cnn_kwargs():
-    return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
+    return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13}


 def get_all_params(model):
@ -65,7 +65,7 @@ def get_tok2vec_kwargs():
    }


-def test_tok2vec():
+def make_test_tok2vec():
    return build_Tok2Vec_model(**get_tok2vec_kwargs())


--- a/spacy/tests/vocab_vectors/test_similarity.py
+++ b/spacy/tests/vocab_vectors/test_similarity.py
@ -7,7 +7,7 @@ from ..util import get_cosine, add_vecs_to_vocab

@pytest.fixture
 def vectors():
-    return [("apple", [1, 2, 3]), ("orange", [-1, -2, -3])]
+    return [("apple", [1, 2, 3]), ("orange", [-1, -2, -5])]


@pytest.fixture()
@ -71,19 +71,17 @@ def test_vectors_similarity_DD(vocab, vectors):
 def test_vectors_similarity_TD(vocab, vectors):
    [(word1, vec1), (word2, vec2)] = vectors
    doc = Doc(vocab, words=[word1, word2])
-    with pytest.warns(UserWarning):
-        assert isinstance(doc.similarity(doc[0]), float)
-        assert isinstance(doc[0].similarity(doc), float)
-        assert doc.similarity(doc[0]) == doc[0].similarity(doc)
+    assert isinstance(doc.similarity(doc[0]), float)
+    assert isinstance(doc[0].similarity(doc), float)
+    assert doc.similarity(doc[0]) == doc[0].similarity(doc)


 def test_vectors_similarity_TS(vocab, vectors):
    [(word1, vec1), (word2, vec2)] = vectors
    doc = Doc(vocab, words=[word1, word2])
-    with pytest.warns(UserWarning):
-        assert isinstance(doc[:2].similarity(doc[0]), float)
-        assert isinstance(doc[0].similarity(doc[-2]), float)
-        assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])
+    assert isinstance(doc[:2].similarity(doc[0]), float)
+    assert isinstance(doc[0].similarity(doc[-2]), float)
+    assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])


 def test_vectors_similarity_DS(vocab, vectors):
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -356,6 +356,7 @@ cdef class Doc:
            for annot in annotations:
                if annot:
                    if annot is heads or annot is sent_starts or annot is ent_iobs:
+                        annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                        for i in range(len(words)):
                            if attrs.ndim == 1:
                                attrs[i] = annot[i]
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -305,7 +305,7 @@ cdef class Span:
                    for ancestor in ancestors:
                        ancestor_i = ancestor.i - self.c.start
                        if ancestor_i in range(length):
-                            array[i, head_col] = ancestor_i - i
+                            array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)

                # if there is no appropriate ancestor, define a new artificial root
                value = array[i, head_col]
@ -313,7 +313,7 @@ cdef class Span:
                    new_root = old_to_new_root.get(ancestor_i, None)
                    if new_root is not None:
                        # take the same artificial root as a previous token from the same sentence
-                        array[i, head_col] = new_root - i
+                        array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
                    else:
                        # set this token as the new artificial root
                        array[i, head_col] = 0
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@ -333,26 +333,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
        if key not in IDS:
            raise ValueError(Errors.E974.format(obj="token", key=key))
        elif key in ["ORTH", "SPACY"]:
-            pass
+            continue
        elif key == "HEAD":
            attrs.append(key)
-            values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
+            row = [h-i if h is not None else 0 for i, h in enumerate(value)]
        elif key == "DEP":
            attrs.append(key)
-            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
+            row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
        elif key == "SENT_START":
            attrs.append(key)
-            values.append([to_ternary_int(v) for v in value])
+            row = [to_ternary_int(v) for v in value]
        elif key == "MORPH":
            attrs.append(key)
-            values.append([vocab.morphology.add(v) for v in value])
+            row = [vocab.morphology.add(v) for v in value]
        else:
            attrs.append(key)
            if not all(isinstance(v, str) for v in value):
                types = set([type(v) for v in value])
                raise TypeError(Errors.E969.format(field=key, types=types)) from None
-            values.append([vocab.strings.add(v) for v in value])
-    array = numpy.asarray(values, dtype="uint64")
+            row = [vocab.strings.add(v) for v in value]
+        values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
+    array = numpy.array(values, dtype=numpy.uint64)
    return attrs, array.T


--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@ -335,3 +335,5 @@ def ensure_shape(vectors_loc):
        # store all the results in a list in memory
        lines2 = open_file(vectors_loc)
        yield from lines2
+        lines2.close()
+    lines.close()
Author	SHA1	Message	Date
Adriane Boyd	6e8ab15445	Merge pull request #11964 from adrianeboyd/backport/v3.2.5 Backport bug fixes to v3.2.x	2022-12-14 18:33:05 +01:00
Adriane Boyd	427de63f0a	Set version to v3.2.5	2022-12-13 13:21:53 +01:00
Adriane Boyd	386a3e69da	CI and precommit hooks: switch to flake8==5.0.4	2022-12-13 13:21:41 +01:00
Adriane Boyd	b449d355d5	CI: Install thinc-apple-ops through extra (#11963 )	2022-12-13 13:21:41 +01:00
Paul O'Leary McCann	e73755e49f	Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928 ) * Switch ubuntu-latest to ubuntu-20.04 in main tests * Only use 20.04 for 3.6	2022-12-13 13:21:41 +01:00
Adriane Boyd	41afbb2f89	Modernize and simplify CI steps (#11738 ) * Use `build` instead of `python setup.py sdist` * Remove in-place build with `setup.py` * Remove `gpu` parameter and GPU tests * Keep `architecture` and `num_build_jobs` in azure steps with CI defaults * Fix use of `num_build_jobs` parameters * Remove now-unused `prefix` parameter * Test imports and CLI before installing test requirements * Remove `*.egg-info` directory in addition to source directory for a warning-free `import spacy`	2022-12-13 13:21:41 +01:00
Adriane Boyd	571ef56fa9	Modify similarity tests to avoid spurious warnings	2022-12-13 13:21:41 +01:00
Adriane Boyd	1a5352e423	Clean up warnings in the test suite (#11331 )	2022-12-13 13:21:41 +01:00
Adriane Boyd	e3ef798e03	Rename test helper method with non-test_ name (#11701 )	2022-12-12 11:09:14 +01:00
Adriane Boyd	8cfc4c7325	Cast to uint64 for all array-based doc representations (#11933 ) * Convert all individual values explicitly to uint64 for array-based doc representations * Temporarily test with latest numpy v1.24.0rc * Remove unnecessary conversion from attr_t * Reduce number of individual casts * Convert specifically from int32 to uint64 * Revert "Temporarily test with latest numpy v1.24.0rc" This reverts commit `eb0e3c5006`. * Also use int32 in tests	2022-12-12 11:09:14 +01:00
Paul O'Leary McCann	3ac7230abd	Config generation fails for GPU without transformers (#11899 ) If you don't have spacy-transformers installed, but try to use `init config` with the GPU flag, you'll get an error. The issue is that the `use_transformer` flag in the config is conflated with the GPU flag, and then there's an attempt to access transformers config info that may not exist. There may be a better way to do this, but this stops the error.	2022-12-12 11:09:14 +01:00
Paul O'Leary McCann	0de7892033	Add in errors used in the beam code that were removed at some point (#11935 ) I don't think there's any way to use the beam code at the moment, but as long as it's around the errors it refers to should also be present.	2022-12-12 11:09:14 +01:00
Adriane Boyd	21204f17c7	Add smart_open requirement, update deprecated options (#11864 ) * Switch from deprecated `ignore_ext` to `compression` * Add upload/download test for local files	2022-12-12 11:09:14 +01:00
Adriane Boyd	a8b883fead	Fix spancat for zero suggestions (#11860 ) * Add test for spancat predict with zero suggestions * Fix spancat for zero suggestions * Undo changes to extract_spans * Use .sum() as in update	2022-12-12 11:09:14 +01:00
Adriane Boyd	cca1e21ad6	Revert "Add click pin to avoid typer issues (#10573 )" This reverts commit `9966e08f32`.	2022-12-12 11:09:14 +01:00
Adriane Boyd	346a25f587	Support env var for num build jobs (#11073 )	2022-07-04 20:51:02 +02:00
Adriane Boyd	9a566e7d2b	Extend build constraints for aarch64	2022-07-04 13:31:48 +02:00
Adriane Boyd	b50fe5ec68	Merge pull request #10577 from adrianeboyd/chore/backport-click-pin-v3.2.x Backport click pin, set version to v3.2.4	2022-03-29 17:46:35 +02:00
Adriane Boyd	259ad994e2	Set version to v3.2.4	2022-03-29 14:59:29 +02:00
Adriane Boyd	03bee62568	Add click pin to avoid typer issues (#10573 )	2022-03-29 14:58:57 +02:00
Adriane Boyd	b2f34b1507	Merge pull request #10399 from adrianeboyd/chore/undo-blis-test Revert temporary blis test	2022-03-01 16:14:01 +01:00
Adriane Boyd	19b16f047f	Revert "Test spacy v3.2.3 with blis v0.7.6" This reverts commit `bee99548e0`.	2022-03-01 13:38:03 +01:00
Adriane Boyd	b6fa6ef94d	Revert "Fix requirements in setup.cfg" This reverts commit `9de43ab0a8`.	2022-03-01 13:37:52 +01:00
Adriane Boyd	9de43ab0a8	Fix requirements in setup.cfg	2022-03-01 13:25:05 +01:00
Adriane Boyd	bee99548e0	Test spacy v3.2.3 with blis v0.7.6	2022-03-01 13:19:12 +01:00
Adriane Boyd	99425de369	Set version to v3.2.3 (#10392 )	2022-02-28 12:54:33 +01:00
Adriane Boyd	b31993e03c	Merge pull request #10389 from adrianeboyd/chore/v3.2-backport-10324-2 Fix Tok2Vec for empty batches (#10324)	2022-02-28 11:18:25 +01:00
Adriane Boyd	f606e1d044	Fix Tok2Vec for empty batches (#10324 ) * Add test for tok2vec with vectors and empty docs * Add shortcut for empty batch in Tok2Vec.predict * Avoid types	2022-02-28 09:08:05 +01:00