Merge pull request #11965 from adrianeboyd/backport/v3.0.9

Backport bug fixes to v3.0.x
Adriane Boyd 2022-12-15 08:04:20 +01:00 committed by GitHub
commit a65379dede
24 changed files with 130 additions and 87 deletions

View File

@@ -1,9 +1,6 @@
parameters:
python_version: ''
architecture: ''
prefix: ''
gpu: false
num_build_jobs: 1
architecture: 'x64'
steps:
- task: UsePythonVersion@0
@@ -16,52 +13,76 @@ steps:
displayName: 'Set variables'
- script: |
${{ parameters.prefix }} python -m pip install -U pip setuptools
${{ parameters.prefix }} python -m pip install -U -r requirements.txt
python -m pip install -U build pip setuptools
python -m pip install -U -r requirements.txt
displayName: "Install dependencies"
- script: |
${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
${{ parameters.prefix }} python setup.py sdist --formats=gztar
displayName: "Compile and build sdist"
python -m build --sdist
displayName: "Build sdist"
- task: DeleteFiles@1
inputs:
contents: "spacy"
displayName: "Delete source directory"
- task: DeleteFiles@1
inputs:
contents: "*.egg-info"
displayName: "Delete egg-info directory"
- script: |
${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
python -m pip freeze > installed.txt
python -m pip uninstall -y -r installed.txt
displayName: "Uninstall all packages"
- bash: |
${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
${{ parameters.prefix }} python -m pip install dist/$SDIST
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
python -m pip install dist/$SDIST
displayName: "Install from sdist"
- script: |
${{ parameters.prefix }} python -m pip install -U -r requirements.txt
python -W error -c "import spacy"
displayName: "Test import"
- script: |
python -m spacy download es_core_news_sm
python -c "import spacy; nlp=spacy.load('es_core_news_sm'); doc=nlp('test')"
displayName: 'Test download CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
displayName: 'Test convert CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -m spacy init config -p ner -l es ner.cfg
python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
displayName: 'Test debug config CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
# will have errors due to sparse data, check for summary in output
python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
displayName: 'Test debug data CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
displayName: 'Test train CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'es_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
displayName: 'Test assemble CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -m pip install -U -r requirements.txt
displayName: "Install test requirements"
- script: |
${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
displayName: "Install GPU requirements"
condition: eq(${{ parameters.gpu }}, true)
- script: |
${{ parameters.prefix }} python -m pytest --pyargs spacy
python -m pytest --pyargs spacy -W error
displayName: "Run CPU tests"
condition: eq(${{ parameters.gpu }}, false)
- script: |
${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu
displayName: "Run GPU tests"
condition: eq(${{ parameters.gpu }}, true)
- script: |
python -m spacy download en_core_web_sm
python -c "import spacy; nlp=spacy.load('en_core_web_sm'); doc=nlp('test')"
displayName: 'Test download CLI'
condition: eq(variables['python_version'], '3.8')
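
Note: the updated steps drop the `${{ parameters.prefix }}` indirection and switch from `setup.py sdist` to the PEP 517 front end (`python -m build --sdist`), then delete the source tree so the tests run against the installed sdist rather than the checkout. A minimal sketch of the same check the "Test import" step runs, assuming spaCy is installed in the current environment:

import subprocess
import sys

# Mirror `python -W error -c "import spacy"`: escalate all warnings to
# errors so a noisy import fails the check.
subprocess.run(
    [sys.executable, "-W", "error", "-c", "import spacy"],
    check=True,  # raises CalledProcessError if the import warns or fails
)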

View File

@@ -28,7 +28,7 @@ jobs:
inputs:
versionSpec: "3.7"
- script: |
pip install flake8==3.5.0
pip install flake8==5.0.4
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
displayName: "flake8"
@@ -38,7 +38,7 @@ jobs:
matrix:
# We're only running one platform per Python version to speed up builds
Python36Linux:
imageName: "ubuntu-latest"
imageName: "ubuntu-20.04"
python.version: "3.6"
# Python36Windows:
# imageName: "windows-latest"
@@ -47,7 +47,7 @@ jobs:
# imageName: "macos-latest"
# python.version: "3.6"
# Python37Linux:
# imageName: "ubuntu-latest"
# imageName: "ubuntu-20.04"
# python.version: "3.7"
Python37Windows:
imageName: "windows-latest"
@@ -89,20 +89,3 @@ jobs:
- template: .github/azure-steps.yml
parameters:
python_version: '$(python.version)'
architecture: 'x64'
# - job: "TestGPU"
# dependsOn: "Validate"
# strategy:
# matrix:
# Python38LinuxX64_GPU:
# python.version: '3.8'
# pool:
# name: "LinuxX64_GPU"
# steps:
# - template: .github/azure-steps.yml
# parameters:
# python_version: '$(python.version)'
# architecture: 'x64'
# gpu: true
# num_build_jobs: 24

View File

@@ -11,6 +11,7 @@ srsly>=2.4.1,<3.0.0
catalogue>=2.0.4,<2.1.0
typer>=0.3.0,<0.4.0
pathy>=0.3.5
smart-open>=5.2.1,<7.0.0
# Third party dependencies
numpy>=1.15.0
requests>=2.13.0,<3.0.0
@@ -26,6 +27,6 @@ cython>=0.25,<3.0
pytest>=5.2.0
pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.5.0,<3.6.0
flake8>=3.8.0,<6.0.0
hypothesis>=3.27.0,<7.0.0
mypy==0.910

View File

@@ -49,9 +49,10 @@ install_requires =
wasabi>=0.8.1,<1.1.0
srsly>=2.4.1,<3.0.0
catalogue>=2.0.4,<2.1.0
# Third-party dependencies
typer>=0.3.0,<0.4.0
pathy>=0.3.5
# Third-party dependencies
smart-open>=5.2.1,<7.0.0
tqdm>=4.38.0,<5.0.0
numpy>=1.15.0
requests>=2.13.0,<3.0.0

View File

@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
__version__ = "3.0.8"
__version__ = "3.0.9"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"

View File

@@ -355,7 +355,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
if dest.exists() and not force:
return None
src = str(src)
with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
with smart_open.open(src, mode="rb", compression="disable") as input_file:
with dest.open(mode="wb") as output_file:
output_file.write(input_file.read())
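
Note: smart_open 6.x removed the `ignore_ext` keyword in favor of `compression`, and `compression="disable"` reproduces the old behavior of streaming the raw bytes without transparent decompression. The dependency pin added in this PR (`smart-open>=5.2.1,<7.0.0`) ensures the new keyword is available. A minimal sketch, with an illustrative URL:

import smart_open

# Read the file as-is; without compression="disable", smart_open would
# transparently decompress based on the extension (e.g. .gz).
with smart_open.open("https://example.com/vectors.txt.gz", mode="rb",
                     compression="disable") as f:
    raw_bytes = f.read()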

View File

@@ -1,7 +1,7 @@
{# This is a template for training configs used for the quickstart widget in
the docs and the init config command. It encodes various best practices and
can help generate the best possible configuration, given a user's requirements. #}
{%- set use_transformer = hardware != "cpu" -%}
{%- set use_transformer = hardware != "cpu" and transformer_data -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
[paths]
train = null
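
Note: the extra `and transformer_data` guard makes the template fall back to a CPU pipeline when no transformer recommendations exist for the requested language, instead of failing on the `transformer_data[optimize]` lookup. A Python equivalent of the Jinja logic, with illustrative values:

hardware = "gpu"
transformer_data = {}  # no transformer recommendations available
optimize = "efficiency"

# An empty mapping is falsy, so this now degrades gracefully to CPU.
use_transformer = hardware != "cpu" and bool(transformer_data)
transformer = transformer_data[optimize] if use_transformer else {}
assert not use_transformer and transformer == {}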

View File

@@ -320,6 +320,11 @@ class Errors:
"clear the existing vectors and resize the table.")
E074 = ("Error interpreting compiled match pattern: patterns are expected "
"to end with the attribute {attr}. Got: {bad_attr}.")
E079 = ("Error computing states in beam: number of predicted beams "
"({pbeams}) does not equal number of gold beams ({gbeams}).")
E080 = ("Duplicate state found in beam: {key}.")
E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
"does not equal number of losses ({losses}).")
E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
"projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
"match.")

View File

@@ -6,6 +6,7 @@ from thinc.api import Model, Maxout, Linear
from ...util import registry
from ...kb import KnowledgeBase, Candidate, get_candidates
from ...vocab import Vocab
from ...tokens import Span
@registry.architectures("spacy.EntityLinker.v1")
@@ -44,5 +45,5 @@ def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
@registry.misc("spacy.CandidateGenerator.v1")
def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
return get_candidates
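
Note: this file and the ones below replace quoted annotations like "Span" with the imported classes. A quoted name in a type hint is only a forward reference that tools must resolve later; once the class is imported (under `TYPE_CHECKING` where needed to avoid circular imports), the annotation can name it directly. A small sketch of the fixed form:

from typing import Callable, Iterable
from spacy.kb import Candidate, KnowledgeBase
from spacy.tokens import Span

# With Span imported, no string forward reference is needed and static
# checkers can resolve the callable signature without extra context.
CandidateGenerator = Callable[[KnowledgeBase, Span], Iterable[Candidate]]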

View File

@@ -9,7 +9,7 @@ import warnings
from ..kb import KnowledgeBase, Candidate
from ..ml import empty_kb
from ..tokens import Doc
from ..tokens import Doc, Span
from .pipe import deserialize_config
from .trainable_pipe import TrainablePipe
from ..language import Language
@@ -67,7 +67,7 @@ def make_entity_linker(
incl_prior: bool,
incl_context: bool,
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]],
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
):
"""Construct an EntityLinker component.
@@ -114,7 +114,7 @@ class EntityLinker(TrainablePipe):
incl_prior: bool,
incl_context: bool,
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]],
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
) -> None:
"""Initialize an entity linker.
@@ -127,7 +127,7 @@ class EntityLinker(TrainablePipe):
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
incl_context (bool): Whether or not to include the local context in the model.
entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
DOCS: https://spacy.io/api/entitylinker#init

View File

@@ -4,7 +4,7 @@ from enum import Enum
from pydantic import BaseModel, Field, ValidationError, validator, create_model
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
from pydantic.main import ModelMetaclass
from thinc.api import Optimizer, ConfigValidationError
from thinc.api import Optimizer, ConfigValidationError, Model
from thinc.config import Promise
from collections import defaultdict
import inspect
@@ -17,6 +17,7 @@ if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
from .language import Language # noqa: F401
from .training import Example # noqa: F401
from .vocab import Vocab # noqa: F401
# fmt: off
@@ -353,7 +354,7 @@ class ConfigSchemaPretrain(BaseModel):
batcher: Batcher = Field(..., title="Batcher for the training data")
component: str = Field(..., title="Component to find the layer to pretrain")
layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
objective: Callable[["Vocab", "Model"], "Model"] = Field(..., title="A function that creates the pretraining objective.")
objective: Callable[["Vocab", Model], Model] = Field(..., title="A function that creates the pretraining objective.")
# fmt: on
class Config:

View File

@ -1,4 +1,5 @@
import pytest
import numpy
from spacy.tokens import Doc
from spacy.attrs import ORTH, SHAPE, POS, DEP, MORPH
@@ -100,14 +101,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
# head before start
arr = doc.to_array(["HEAD"])
arr[0] = -1
arr[0] = numpy.int32(-1).astype(numpy.uint64)
doc_from_array = Doc(en_vocab, words=words)
with pytest.raises(ValueError):
doc_from_array.from_array(["HEAD"], arr)
# head after end
arr = doc.to_array(["HEAD"])
arr[0] = 5
arr[0] = numpy.int32(5).astype(numpy.uint64)
doc_from_array = Doc(en_vocab, words=words)
with pytest.raises(ValueError):
doc_from_array.from_array(["HEAD"], arr)

View File

@@ -4,6 +4,7 @@ import pytest
import numpy
import logging
import mock
import warnings
from spacy.lang.xx import MultiLanguage
from spacy.tokens import Doc, Span, Token
@@ -316,9 +317,9 @@ def test_doc_from_array_sent_starts(en_vocab):
# no warning using default attrs
attrs = doc._get_array_attrs()
arr = doc.to_array(attrs)
with pytest.warns(None) as record:
with warnings.catch_warnings():
warnings.simplefilter("error")
new_doc.from_array(attrs, arr)
assert len(record) == 0
# only SENT_START uses SENT_START
attrs = [SENT_START]
arr = doc.to_array(attrs)
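
Note: pytest 7 removed `pytest.warns(None)` as a way to assert that no warning is emitted. The stdlib replacement used here escalates any warning raised inside the block to an exception, so the test fails immediately instead of inspecting a record list afterwards. A minimal sketch with a hypothetical function under test:

import warnings

def quiet_operation():  # hypothetical stand-in for the call under test
    return 42

# Equivalent of the old `with pytest.warns(None) as record:` plus
# `assert len(record) == 0`: any warning becomes an error.
with warnings.catch_warnings():
    warnings.simplefilter("error")
    quiet_operation()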

View File

@@ -2,6 +2,9 @@ import pytest
from spacy.tokens import Doc
pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_ru_doc_lemmatization(ru_lemmatizer):
words = ["мама", "мыла", "раму"]
pos = ["NOUN", "VERB", "NOUN"]

View File

@@ -2,6 +2,9 @@ import pytest
from spacy.tokens import Doc
pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_uk_lemmatizer(uk_lemmatizer):
"""Check that the default uk lemmatizer runs."""
doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])

View File

@@ -1,4 +1,5 @@
import pytest
import warnings
import srsly
from mock import Mock
from spacy.matcher import PhraseMatcher
@@ -197,13 +198,13 @@ def test_phrase_matcher_validation(en_vocab):
matcher.add("TEST1", [doc1])
with pytest.warns(UserWarning):
matcher.add("TEST2", [doc2])
with pytest.warns(None) as record:
with warnings.catch_warnings():
warnings.simplefilter("error")
matcher.add("TEST3", [doc3])
assert not record.list
matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
with pytest.warns(None) as record:
with warnings.catch_warnings():
warnings.simplefilter("error")
matcher.add("TEST4", [doc2])
assert not record.list
def test_attr_validation(en_vocab):

View File

@@ -49,8 +49,8 @@ def test_issue5551(textcat_config):
# All results should be the same because of the fixed seed
assert len(results) == 3
ops = get_current_ops()
assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]))
assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]))
assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]), decimal=5)
assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]), decimal=5)
def test_issue5838():

View File

@@ -3,6 +3,7 @@ from typing import Callable
from spacy import util
from spacy.util import ensure_path, registry, load_model_from_config
from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab
from thinc.api import Config
from ..util import make_tempdir
@@ -111,7 +112,7 @@ def test_serialize_subclassed_kb():
@registry.misc("spacy.CustomKB.v1")
def custom_kb(
entity_vector_length: int, custom_field: int
) -> Callable[["Vocab"], KnowledgeBase]:
) -> Callable[[Vocab], KnowledgeBase]:
def custom_kb_factory(vocab):
kb = SubKnowledgeBase(
vocab=vocab,

View File

@@ -10,6 +10,7 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS
from spacy.cli._util import validate_project_commands, parse_config_overrides
from spacy.cli._util import load_project_config, substitute_project_variables
from spacy.cli._util import string_to_list
from spacy.cli._util import upload_file, download_file
from thinc.api import ConfigValidationError, Config
import srsly
import os
@@ -474,3 +475,18 @@ def test_string_to_list(value):
def test_string_to_list_intify(value):
assert string_to_list(value, intify=False) == ["1", "2", "3"]
assert string_to_list(value, intify=True) == [1, 2, 3]
def test_upload_download_local_file():
with make_tempdir() as d1, make_tempdir() as d2:
filename = "f.txt"
content = "content"
local_file = d1 / filename
remote_file = d2 / filename
with local_file.open(mode="w") as file_:
file_.write(content)
upload_file(local_file, remote_file)
local_file.unlink()
download_file(remote_file, local_file)
with local_file.open(mode="r") as file_:
assert file_.read() == content

View File

@@ -20,7 +20,7 @@ def get_textcat_bow_kwargs():
def get_textcat_cnn_kwargs():
return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13}
def get_all_params(model):
@@ -62,7 +62,7 @@ def get_tok2vec_kwargs():
}
def test_tok2vec():
def make_test_tok2vec():
return build_Tok2Vec_model(**get_tok2vec_kwargs())
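
Note: pytest collects any module-level function in a test file whose name starts with `test_`, so the old `test_tok2vec()` factory was itself executed as a test, and returning a model from a "test" triggers a pytest warning. Renaming it to `make_test_tok2vec` keeps it out of collection. The distinction, sketched:

def test_addition():       # collected and executed by pytest
    assert 1 + 1 == 2

def make_sample_kwargs():  # plain helper, ignored by collection
    return {"nO": 13}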

View File

@@ -347,6 +347,7 @@ cdef class Doc:
for annot in annotations:
if annot:
if annot is heads or annot is sent_starts or annot is ent_iobs:
annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
for i in range(len(words)):
if attrs.ndim == 1:
attrs[i] = annot[i]

View File

@@ -297,7 +297,7 @@ cdef class Span:
for ancestor in ancestors:
ancestor_i = ancestor.i - self.c.start
if ancestor_i in range(length):
array[i, head_col] = ancestor_i - i
array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)
# if there is no appropriate ancestor, define a new artificial root
value = array[i, head_col]
@@ -305,7 +305,7 @@ cdef class Span:
new_root = old_to_new_root.get(ancestor_i, None)
if new_root is not None:
# take the same artificial root as a previous token from the same sentence
array[i, head_col] = new_root - i
array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
else:
# set this token as the new artificial root
array[i, head_col] = 0

View File

@@ -329,26 +329,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
if key not in IDS:
raise ValueError(Errors.E974.format(obj="token", key=key))
elif key in ["ORTH", "SPACY"]:
pass
continue
elif key == "HEAD":
attrs.append(key)
values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
row = [h-i if h is not None else 0 for i, h in enumerate(value)]
elif key == "DEP":
attrs.append(key)
values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
elif key == "SENT_START":
attrs.append(key)
values.append([to_ternary_int(v) for v in value])
row = [to_ternary_int(v) for v in value]
elif key == "MORPH":
attrs.append(key)
values.append([vocab.morphology.add(v) for v in value])
row = [vocab.morphology.add(v) for v in value]
else:
attrs.append(key)
if not all(isinstance(v, str) for v in value):
types = set([type(v) for v in value])
raise TypeError(Errors.E969.format(field=key, types=types)) from None
values.append([vocab.strings.add(v) for v in value])
array = numpy.asarray(values, dtype="uint64")
row = [vocab.strings.add(v) for v in value]
values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
array = numpy.array(values, dtype=numpy.uint64)
return attrs, array.T
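
Note: the refactor funnels every attribute row through a single `row` list so negative entries — relative heads that point backwards and ternary SENT_START values of -1 — can be wrapped element-wise before the final uint64 array is built, matching the int32-to-uint64 casts added elsewhere in this commit. A quick check of the round trip:

import numpy

# A head offset of -3 is stored as its 64-bit two's-complement pattern,
# which the C layer reinterprets as -3 when reading the array back.
assert numpy.int32(-3).astype(numpy.uint64) == numpy.uint64(2**64 - 3)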

View File

@@ -274,3 +274,5 @@ def ensure_shape(vectors_loc):
# store all the results in a list in memory
lines2 = open_file(vectors_loc)
yield from lines2
lines2.close()
lines.close()
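
Note: in a generator, statements after `yield from` only run once the delegated iterator is exhausted, so the two `close()` calls execute exactly when the caller has consumed every line, closing the handles that `open_file` leaked before. A minimal sketch of the pattern, with an illustrative path:

with open("vectors.txt", "w") as f:  # set up an example file
    f.write("0.1 0.2\n")

def read_lines(path):
    f = open(path)
    yield from f
    f.close()  # reached only after the last line has been yielded

for line in read_lines("vectors.txt"):  # file closed when the loop ends
    pass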