Compare commits

35 Commits

Author SHA1 Message Date
Adriane Boyd
0fc87f64bd
Merge pull request #12674 from adrianeboyd/backport/v3.2.6
Backports and other fixes for v3.2.6
2023-05-25 11:29:56 +02:00
Adriane Boyd
834cc20278 Set version to v3.2.6 2023-05-25 10:41:11 +02:00
Adriane Boyd
f37be33c61 Format 2023-05-25 09:04:48 +02:00
Adriane Boyd
5a04e05f95 Add typing_extensions requirement for pydantic 2023-05-25 09:04:48 +02:00
Adriane Boyd
de96c6888e Remove #egg from download URLs
The current URLs will become invalid in pip 25.0. According to the pip
docs, the egg= URLs are currently only needed for editable VCS installs.
2023-05-25 09:04:48 +02:00
kadarakos
647d1e188e Spancat speed improvement (#12577)
* avoid nesting then flattening

* mypy fix

* Apply suggestions from code review

* Add type for indices

* Run full matrix for mypy

* Add back modified type: ignore

* Revert "Run full matrix for mypy"

This reverts commit e218873d04.

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
2023-05-25 09:04:48 +02:00
Adriane Boyd
d2464d7bc9 Switch from azure to GHA 2023-05-25 08:52:02 +02:00
Adriane Boyd
6e8ab15445
Merge pull request #11964 from adrianeboyd/backport/v3.2.5
Backport bug fixes to v3.2.x
2022-12-14 18:33:05 +01:00
Adriane Boyd
427de63f0a Set version to v3.2.5 2022-12-13 13:21:53 +01:00
Adriane Boyd
386a3e69da CI and precommit hooks: switch to flake8==5.0.4 2022-12-13 13:21:41 +01:00
Adriane Boyd
b449d355d5 CI: Install thinc-apple-ops through extra (#11963) 2022-12-13 13:21:41 +01:00
Paul O'Leary McCann
e73755e49f Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928)
* Switch ubuntu-latest to ubuntu-20.04 in main tests

* Only use 20.04 for 3.6
2022-12-13 13:21:41 +01:00
Adriane Boyd
41afbb2f89 Modernize and simplify CI steps (#11738)
* Use `build` instead of `python setup.py sdist`
* Remove in-place build with `setup.py`
* Remove `gpu` parameter and GPU tests
* Keep `architecture` and `num_build_jobs` in azure steps with CI
  defaults
* Fix use of `num_build_jobs` parameters
* Remove now-unused `prefix` parameter
* Test imports and CLI before installing test requirements
  * Remove `*.egg-info` directory in addition to source directory for a
    warning-free `import spacy`
2022-12-13 13:21:41 +01:00
Adriane Boyd
571ef56fa9 Modify similarity tests to avoid spurious warnings 2022-12-13 13:21:41 +01:00
Adriane Boyd
1a5352e423 Clean up warnings in the test suite (#11331) 2022-12-13 13:21:41 +01:00
Adriane Boyd
e3ef798e03 Rename test helper method with non-test_ name (#11701) 2022-12-12 11:09:14 +01:00
Adriane Boyd
8cfc4c7325 Cast to uint64 for all array-based doc representations (#11933)
* Convert all individual values explicitly to uint64 for array-based doc representations

* Temporarily test with latest numpy v1.24.0rc

* Remove unnecessary conversion from attr_t

* Reduce number of individual casts

* Convert specifically from int32 to uint64

* Revert "Temporarily test with latest numpy v1.24.0rc"

This reverts commit eb0e3c5006.

* Also use int32 in tests
2022-12-12 11:09:14 +01:00
Paul O'Leary McCann
3ac7230abd Config generation fails for GPU without transformers (#11899)
If you don't have spacy-transformers installed, but try to use `init
config` with the GPU flag, you'll get an error. The issue is that the
`use_transformers` flag in the config is conflated with the GPU flag,
and then there's an attempt to access transformers config info that may
not exist.

There may be a better way to do this, but this stops the error.
2022-12-12 11:09:14 +01:00
Paul O'Leary McCann
0de7892033 Add in errors used in the beam code that were removed at some point (#11935)
I don't think there's any way to use the beam code at the moment, but as
long as it's around, the errors it refers to should also be present.
2022-12-12 11:09:14 +01:00
Adriane Boyd
21204f17c7 Add smart_open requirement, update deprecated options (#11864)
* Switch from deprecated `ignore_ext` to `compression`
* Add upload/download test for local files
2022-12-12 11:09:14 +01:00
Adriane Boyd
a8b883fead Fix spancat for zero suggestions (#11860)
* Add test for spancat predict with zero suggestions

* Fix spancat for zero suggestions

* Undo changes to extract_spans

* Use .sum() as in update
2022-12-12 11:09:14 +01:00
Adriane Boyd
cca1e21ad6 Revert "Add click pin to avoid typer issues (#10573)"
This reverts commit 9966e08f32.
2022-12-12 11:09:14 +01:00
Adriane Boyd
346a25f587 Support env var for num build jobs (#11073) 2022-07-04 20:51:02 +02:00
Adriane Boyd
9a566e7d2b Extend build constraints for aarch64 2022-07-04 13:31:48 +02:00
Adriane Boyd
b50fe5ec68
Merge pull request #10577 from adrianeboyd/chore/backport-click-pin-v3.2.x
Backport click pin, set version to v3.2.4
2022-03-29 17:46:35 +02:00
Adriane Boyd
259ad994e2 Set version to v3.2.4 2022-03-29 14:59:29 +02:00
Adriane Boyd
03bee62568 Add click pin to avoid typer issues (#10573) 2022-03-29 14:58:57 +02:00
Adriane Boyd
b2f34b1507
Merge pull request #10399 from adrianeboyd/chore/undo-blis-test
Revert temporary blis test
2022-03-01 16:14:01 +01:00
Adriane Boyd
19b16f047f Revert "Test spacy v3.2.3 with blis v0.7.6"
This reverts commit bee99548e0.
2022-03-01 13:38:03 +01:00
Adriane Boyd
b6fa6ef94d Revert "Fix requirements in setup.cfg"
This reverts commit 9de43ab0a8.
2022-03-01 13:37:52 +01:00
Adriane Boyd
9de43ab0a8 Fix requirements in setup.cfg 2022-03-01 13:25:05 +01:00
Adriane Boyd
bee99548e0 Test spacy v3.2.3 with blis v0.7.6 2022-03-01 13:19:12 +01:00
Adriane Boyd
99425de369
Set version to v3.2.3 (#10392) 2022-02-28 12:54:33 +01:00
Adriane Boyd
b31993e03c
Merge pull request #10389 from adrianeboyd/chore/v3.2-backport-10324-2
Fix Tok2Vec for empty batches (#10324)
2022-02-28 11:18:25 +01:00
Adriane Boyd
f606e1d044 Fix Tok2Vec for empty batches (#10324)
* Add test for tok2vec with vectors and empty docs

* Add shortcut for empty batch in Tok2Vec.predict

* Avoid types
2022-02-28 09:08:05 +01:00
30 changed files with 331 additions and 284 deletions

@@ -1,117 +0,0 @@
parameters:
  python_version: ''
  architecture: ''
  prefix: ''
  gpu: false
  num_build_jobs: 1

steps:
  - task: UsePythonVersion@0
    inputs:
      versionSpec: ${{ parameters.python_version }}
      architecture: ${{ parameters.architecture }}
  - bash: |
      echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
    displayName: 'Set variables'
  - script: |
      ${{ parameters.prefix }} python -m pip install -U pip setuptools
      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
    displayName: "Install dependencies"
  - script: |
      ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
      ${{ parameters.prefix }} python setup.py sdist --formats=gztar
    displayName: "Compile and build sdist"
  - script: python -m mypy spacy
    displayName: 'Run mypy'
    condition: ne(variables['python_version'], '3.10')
  - task: DeleteFiles@1
    inputs:
      contents: "spacy"
    displayName: "Delete source directory"
  - script: |
      ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
      ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
    displayName: "Uninstall all packages"
  - bash: |
      ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
      ${{ parameters.prefix }} python -m pip install dist/$SDIST
    displayName: "Install from sdist"
  - script: |
      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
    displayName: "Install test requirements"
  - script: |
      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
      ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
    displayName: "Install GPU requirements"
    condition: eq(${{ parameters.gpu }}, true)
  - script: |
      ${{ parameters.prefix }} python -m pytest --pyargs spacy
    displayName: "Run CPU tests"
    condition: eq(${{ parameters.gpu }}, false)
  - script: |
      ${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu
    displayName: "Run GPU tests"
    condition: eq(${{ parameters.gpu }}, true)
  - script: |
      python -m spacy download ca_core_news_sm
      python -m spacy download ca_core_news_md
      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
    displayName: 'Test download CLI'
    condition: eq(variables['python_version'], '3.8')
  - script: |
      python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
    displayName: 'Test convert CLI'
    condition: eq(variables['python_version'], '3.8')
  - script: |
      python -m spacy init config -p ner -l ca ner.cfg
      python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
    displayName: 'Test debug config CLI'
    condition: eq(variables['python_version'], '3.8')
  - script: |
      # will have errors due to sparse data, check for summary in output
      python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
    displayName: 'Test debug data CLI'
    condition: eq(variables['python_version'], '3.8')
  - script: |
      python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
    displayName: 'Test train CLI'
    condition: eq(variables['python_version'], '3.8')
  - script: |
      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
    displayName: 'Test assemble CLI'
    condition: eq(variables['python_version'], '3.8')
  - script: |
      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
    displayName: 'Test assemble CLI vectors warning'
    condition: eq(variables['python_version'], '3.8')
  - script: |
      python .github/validate_universe_json.py website/meta/universe.json
    displayName: 'Test website/meta/universe.json'
    condition: eq(variables['python_version'], '3.8')
  - script: |
      ${{ parameters.prefix }} python -m pip install thinc-apple-ops
      ${{ parameters.prefix }} python -m pytest --pyargs spacy
    displayName: "Run CPU tests with thinc-apple-ops"
    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.9'))

.github/workflows/tests.yml

@@ -0,0 +1,170 @@
name: tests

on:
  push:
    branches-ignore:
      - "spacy.io"
      - "nightly.spacy.io"
      - "v2.spacy.io"
    paths-ignore:
      - "*.md"
      - "*.mdx"
      - "website/**"
      - ".github/workflows/**"
  pull_request:
    types: [opened, synchronize, reopened, edited]
    paths-ignore:
      - "*.md"
      - "*.mdx"
      - "website/**"

jobs:
  validate:
    name: Validate
    if: github.repository_owner == 'explosion'
    runs-on: ubuntu-latest
    steps:
      - name: Check out repo
        uses: actions/checkout@v3
      - name: Configure Python version
        uses: actions/setup-python@v4
        with:
          python-version: "3.7"
          architecture: x64
      - name: black
        run: |
          python -m pip install black -c requirements.txt
          python -m black spacy --check
      - name: flake8
        run: |
          python -m pip install flake8==5.0.4
          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics

  tests:
    name: Test
    needs: Validate
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, windows-latest, macos-latest]
        python_version: ["3.10"]
        include:
          - os: ubuntu-20.04
            python_version: "3.6"
          - os: windows-latest
            python_version: "3.7"
          - os: macos-latest
            python_version: "3.8"
          - os: ubuntu-latest
            python_version: "3.9"
    runs-on: ${{ matrix.os }}
    steps:
      - name: Check out repo
        uses: actions/checkout@v3
      - name: Configure Python version
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python_version }}
          architecture: x64
      - name: Install dependencies
        run: |
          python -m pip install -U build pip setuptools
          python -m pip install -U -r requirements.txt
      - name: Build sdist
        run: |
          python -m build --sdist
      - name: Run mypy
        run: |
          # Install older numpy for mypy (bug with newer numpy+mypy not fixed
          # until mypy 0.981)
          python -m pip install "numpy<1.22"
          python -m mypy spacy
        if: matrix.python_version != '3.6'
      - name: Delete source directory and .egg-info
        run: |
          rm -rf spacy *.egg-info
        shell: bash
      - name: Uninstall all packages
        run: |
          python -m pip freeze
          python -m pip freeze --exclude pywin32 > installed.txt
          python -m pip uninstall -y -r installed.txt
      - name: Install from sdist
        run: |
          SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
          SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
        shell: bash
      - name: Test import
        run: python -W error -c "import spacy"
      - name: "Test download CLI"
        run: |
          python -m spacy download ca_core_news_sm
          python -m spacy download ca_core_news_md
          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
        if: matrix.python_version == '3.9'
      - name: "Test no warnings on load (#11713)"
        run: |
          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
        if: matrix.python_version == '3.9'
      - name: "Test convert CLI"
        run: |
          python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
        if: matrix.python_version == '3.9'
      - name: "Test debug config CLI"
        run: |
          python -m spacy init config -p ner -l ca ner.cfg
          python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
        if: matrix.python_version == '3.9'
      - name: "Test debug data CLI"
        run: |
          # will have errors due to sparse data, check for summary in output
          python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
        if: matrix.python_version == '3.9'
      - name: "Test train CLI"
        run: |
          python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
        if: matrix.python_version == '3.9'
      - name: "Test assemble CLI"
        run: |
          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
        if: matrix.python_version == '3.9'
      - name: "Test assemble CLI vectors warning"
        run: |
          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
        if: matrix.python_version == '3.9'
      - name: "Install test requirements"
        run: |
          python -m pip install -U -r requirements.txt
      - name: "Run CPU tests"
        run: |
          python -m pytest --pyargs spacy -W error
        if: "!(startsWith(matrix.os, 'macos') && matrix.python_version == '3.10')"
      - name: "Run CPU tests with thinc-apple-ops"
        run: |
          python -m pip install 'spacy[apple]'
          python -m pytest --pyargs spacy
        if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.10'

@@ -5,7 +5,7 @@ repos:
      - id: black
        language_version: python3.7
  - repo: https://gitlab.com/pycqa/flake8
    rev: 3.9.2
    rev: 5.0.4
    hooks:
      - id: flake8
        args:

@@ -1,109 +0,0 @@
trigger:
  batch: true
  branches:
    include:
      - "*"
    exclude:
      - "spacy.io"
      - "nightly.spacy.io"
      - "v2.spacy.io"
  paths:
    exclude:
      - "website/*"
      - "*.md"
pr:
  paths:
    exclude:
      - "*.md"
      - "website/docs/*"
      - "website/src/*"

jobs:
  # Perform basic checks for most important errors (syntax etc.) Uses the config
  # defined in .flake8 and overwrites the selected codes.
  - job: "Validate"
    pool:
      vmImage: "ubuntu-latest"
    steps:
      - task: UsePythonVersion@0
        inputs:
          versionSpec: "3.7"
      - script: |
          pip install flake8==3.9.2
          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
        displayName: "flake8"

  - job: "Test"
    dependsOn: "Validate"
    strategy:
      matrix:
        # We're only running one platform per Python version to speed up builds
        Python36Linux:
          imageName: "ubuntu-latest"
          python.version: "3.6"
        # Python36Windows:
        #   imageName: "windows-latest"
        #   python.version: "3.6"
        # Python36Mac:
        #   imageName: "macos-latest"
        #   python.version: "3.6"
        # Python37Linux:
        #   imageName: "ubuntu-latest"
        #   python.version: "3.7"
        Python37Windows:
          imageName: "windows-latest"
          python.version: "3.7"
        # Python37Mac:
        #   imageName: "macos-latest"
        #   python.version: "3.7"
        # Python38Linux:
        #   imageName: "ubuntu-latest"
        #   python.version: "3.8"
        # Python38Windows:
        #   imageName: "windows-latest"
        #   python.version: "3.8"
        Python38Mac:
          imageName: "macos-latest"
          python.version: "3.8"
        Python39Linux:
          imageName: "ubuntu-latest"
          python.version: "3.9"
        # Python39Windows:
        #   imageName: "windows-latest"
        #   python.version: "3.9"
        # Python39Mac:
        #   imageName: "macos-latest"
        #   python.version: "3.9"
        Python310Linux:
          imageName: "ubuntu-latest"
          python.version: "3.10"
        Python310Windows:
          imageName: "windows-latest"
          python.version: "3.10"
        Python310Mac:
          imageName: "macos-latest"
          python.version: "3.10"
      maxParallel: 4
    pool:
      vmImage: $(imageName)
    steps:
      - template: .github/azure-steps.yml
        parameters:
          python_version: '$(python.version)'
          architecture: 'x64'

  # - job: "TestGPU"
  #   dependsOn: "Validate"
  #   strategy:
  #     matrix:
  #       Python38LinuxX64_GPU:
  #         python.version: '3.8'
  #   pool:
  #     name: "LinuxX64_GPU"
  #   steps:
  #     - template: .github/azure-steps.yml
  #       parameters:
  #         python_version: '$(python.version)'
  #         architecture: 'x64'
  #         gpu: true
  #         num_build_jobs: 24

@@ -1,6 +1,8 @@
# build version constraints for use with wheelwright + multibuild
numpy==1.15.0; python_version<='3.7'
numpy==1.17.3; python_version=='3.8'
numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
numpy==1.19.3; python_version=='3.9'
numpy==1.21.3; python_version=='3.10'
numpy; python_version>='3.11'

@@ -12,6 +12,7 @@ srsly>=2.4.1,<3.0.0
catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.5.0
pathy>=0.3.5
smart-open>=5.2.1,<7.0.0
# Third party dependencies
numpy>=1.15.0
requests>=2.13.0,<3.0.0
@@ -22,7 +23,9 @@ langcodes>=3.2.0,<4.0.0
# Official Python utilities
setuptools
packaging>=20.0
typing_extensions>=3.7.4.1,<4.0.0.0; python_version < "3.8"
# Require and pin typing_extensions for all python versions as a workaround
# for pydantic incompatibility with typing_extensions>=4.6.0
typing_extensions>=3.7.4.1,<4.6.0
# Development dependencies
pre-commit>=2.13.0
cython>=0.25,<3.0

@@ -51,9 +51,10 @@ install_requires =
    wasabi>=0.8.1,<1.1.0
    srsly>=2.4.1,<3.0.0
    catalogue>=2.0.6,<2.1.0
    # Third-party dependencies
    typer>=0.3.0,<0.5.0
    pathy>=0.3.5
    # Third-party dependencies
    smart-open>=5.2.1,<7.0.0
    tqdm>=4.38.0,<5.0.0
    numpy>=1.15.0
    requests>=2.13.0,<3.0.0

@@ -62,7 +63,9 @@ install_requires =
    # Official Python utilities
    setuptools
    packaging>=20.0
    typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8"
    # Require and pin typing_extensions for all python versions as a workaround
    # for pydantic incompatibility with typing_extensions>=4.6.0
    typing_extensions>=3.7.4.1,<4.6.0
    langcodes>=3.2.0,<4.0.0

[options.entry_points]

@@ -125,6 +125,8 @@ class build_ext_options:
class build_ext_subclass(build_ext, build_ext_options):
    def build_extensions(self):
        if not self.parallel:
            self.parallel = int(os.environ.get("SPACY_NUM_BUILD_JOBS", 1))
        build_ext_options.build_options(self)
        build_ext.build_extensions(self)

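For context, a minimal runnable sketch of the pattern this hunk adds; the variable name and default of 1 come from the diff above, while the stand-in `parallel` variable mimics what distutils sets when `-j` is not passed:

```python
import os

# Mirror the build_ext fallback above: honor an explicit -j/--parallel
# value if one was set, otherwise read SPACY_NUM_BUILD_JOBS (default 1).
parallel = None  # stand-in for self.parallel when -j is not passed
if not parallel:
    parallel = int(os.environ.get("SPACY_NUM_BUILD_JOBS", 1))
print(f"building with {parallel} job(s)")
```

The GHA workflow above exercises exactly this path with `SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST`.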
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
__version__ = "3.2.2"
__version__ = "3.2.6"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"

@@ -358,7 +358,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
    if dest.exists() and not force:
        return None
    src = str(src)
    with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
    with smart_open.open(src, mode="rb", compression="disable") as input_file:
        with dest.open(mode="wb") as output_file:
            output_file.write(input_file.read())

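A short sketch of the API change in this hunk: per the commit message, smart_open's deprecated `ignore_ext=True` flag is replaced by `compression="disable"`, which turns off extension-based transparent decompression. The file path here is a hypothetical stand-in:

```python
import smart_open

# With compression="disable", a .gz path is read as raw bytes instead of
# being transparently decompressed ("example.txt.gz" is illustrative).
with smart_open.open("example.txt.gz", mode="rb", compression="disable") as f:
    raw_bytes = f.read()
```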
@@ -50,7 +50,7 @@ def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -
        )
        pip_args = pip_args + ("--no-deps",)
    suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
    dl_tpl = "{m}-{v}/{m}-{v}{s}#egg={m}=={v}"
    dl_tpl = "{m}-{v}/{m}-{v}{s}"
    if direct:
        components = model.split("-")
        model_name = "".join(components[:-1])

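To illustrate the template change, a sketch of how the new `dl_tpl` renders; the model name, version, and `.tar.gz` suffix are illustrative stand-ins, not spaCy's exact constants:

```python
# New template from the hunk above: no "#egg=" fragment.
dl_tpl = "{m}-{v}/{m}-{v}{s}"
path = dl_tpl.format(m="ca_core_news_sm", v="3.2.0", s=".tar.gz")
assert path == "ca_core_news_sm-3.2.0/ca_core_news_sm-3.2.0.tar.gz"
# The old template appended "#egg=ca_core_news_sm==3.2.0", which pip
# only needs for editable VCS installs and rejects from pip 25.0 on.
```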
@@ -1,7 +1,7 @@
{# This is a template for training configs used for the quickstart widget in
the docs and the init config command. It encodes various best practices and
can help generate the best possible configuration, given a user's requirements. #}
{%- set use_transformer = hardware != "cpu" -%}
{%- set use_transformer = hardware != "cpu" and transformer_data -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
[paths]
train = null

@@ -322,6 +322,11 @@ class Errors(metaclass=ErrorsWithCodes):
            "clear the existing vectors and resize the table.")
    E074 = ("Error interpreting compiled match pattern: patterns are expected "
            "to end with the attribute {attr}. Got: {bad_attr}.")
    E079 = ("Error computing states in beam: number of predicted beams "
            "({pbeams}) does not equal number of gold beams ({gbeams}).")
    E080 = ("Duplicate state found in beam: {key}.")
    E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
            "does not equal number of losses ({losses}).")
    E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
            "projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
            "match.")

@@ -1,4 +1,4 @@
from typing import Tuple, Callable
from typing import List, Tuple, Callable
from thinc.api import Model, to_numpy
from thinc.types import Ragged, Ints1d

@@ -52,14 +52,14 @@ def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d:
    indices will be [5, 6, 7, 8, 8, 9].
    """
    spans, lengths = _ensure_cpu(spans, lengths)
    indices = []
    indices: List[int] = []
    offset = 0
    for i, length in enumerate(lengths):
        spans_i = spans[i].dataXd + offset
        for j in range(spans_i.shape[0]):
            indices.append(ops.xp.arange(spans_i[j, 0], spans_i[j, 1]))  # type: ignore[call-overload, index]
            indices.extend(range(spans_i[j, 0], spans_i[j, 1]))  # type: ignore
        offset += length
    return ops.flatten(indices, dtype="i", ndim_if_empty=1)
    return ops.asarray1i(indices)

def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]:

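The speedup in this hunk boils down to collecting plain ints instead of allocating one small array per span and flattening at the end. A self-contained sketch using the span bounds from the docstring above:

```python
import numpy

# Spans (5, 9) and (8, 10), as in the docstring example above.
spans = [(5, 9), (8, 10)]

# Old approach: one ops.xp.arange per span, flattened at the end.
# New approach: extend a Python list and convert to an array once.
indices = []
for start, end in spans:
    indices.extend(range(start, end))
result = numpy.asarray(indices, dtype="i")
print(result)  # [5 6 7 8 8 9]
```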
@@ -268,7 +268,10 @@ class SpanCategorizer(TrainablePipe):
        DOCS: https://spacy.io/api/spancategorizer#predict
        """
        indices = self.suggester(docs, ops=self.model.ops)
        scores = self.model.predict((docs, indices))  # type: ignore
        if indices.lengths.sum() == 0:
            scores = self.model.ops.alloc2f(0, 0)
        else:
            scores = self.model.predict((docs, indices))  # type: ignore
        return indices, scores

    def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None:

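The guard above short-circuits `model.predict` when the suggester returns no candidate spans across the whole batch. A sketch of the check with plain numpy stand-ins for the `Ragged` lengths:

```python
import numpy

# Stand-in for indices.lengths: number of suggested spans per doc.
lengths = numpy.asarray([0, 0, 0], dtype="i")

if lengths.sum() == 0:
    # Nothing to score: allocate an empty (0, 0) float array, matching
    # self.model.ops.alloc2f(0, 0) in the hunk above.
    scores = numpy.zeros((0, 0), dtype="f")
else:
    scores = None  # the real code would call the model here
assert scores.shape == (0, 0)
```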
@@ -118,6 +118,10 @@ class Tok2Vec(TrainablePipe):
        DOCS: https://spacy.io/api/tok2vec#predict
        """
        if not any(len(doc) for doc in docs):
            # Handle cases where there are no tokens in any docs.
            width = self.model.get_dim("nO")
            return [self.model.ops.alloc((0, width)) for doc in docs]
        tokvecs = self.model.predict(docs)
        batch_id = Tok2VecListener.get_batch_id(docs)
        for listener in self.listeners:

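A sketch of the shape contract the shortcut above preserves; `width = 96` is an assumed stand-in for the model's real `nO` dimension:

```python
import numpy

width = 96  # stand-in for self.model.get_dim("nO")
docs = ["", ""]  # stand-ins for Doc objects with zero tokens
if not any(len(doc) for doc in docs):
    tokvecs = [numpy.zeros((0, width), dtype="f") for _ in docs]
# Each doc still gets a (0, width) array, so downstream listeners
# receive tensors of a consistent width even for empty input.
assert all(t.shape == (0, width) for t in tokvecs)
```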
@@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
    # head before start
    arr = doc.to_array(["HEAD"])
    arr[0] = -1
    arr[0] = numpy.int32(-1).astype(numpy.uint64)
    doc_from_array = Doc(en_vocab, words=words)
    with pytest.raises(ValueError):
        doc_from_array.from_array(["HEAD"], arr)

    # head after end
    arr = doc.to_array(["HEAD"])
    arr[0] = 5
    arr[0] = numpy.int32(5).astype(numpy.uint64)
    doc_from_array = Doc(en_vocab, words=words)
    with pytest.raises(ValueError):
        doc_from_array.from_array(["HEAD"], arr)

@@ -2,6 +2,7 @@ import weakref
import numpy
import pytest
import warnings
from thinc.api import NumpyOps, get_current_ops

from spacy.attrs import DEP, ENT_IOB, ENT_TYPE, HEAD, IS_ALPHA, MORPH, POS

@@ -528,9 +529,9 @@ def test_doc_from_array_sent_starts(en_vocab):
    # no warning using default attrs
    attrs = doc._get_array_attrs()
    arr = doc.to_array(attrs)
    with pytest.warns(None) as record:
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        new_doc.from_array(attrs, arr)
    assert len(record) == 0
    # only SENT_START uses SENT_START
    attrs = [SENT_START]
    arr = doc.to_array(attrs)

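This hunk (and the phrase-matcher one below) swap the deprecated `pytest.warns(None)` recorder idiom for plain warnings filters. A minimal sketch of the replacement pattern, with `quiet_operation` as a hypothetical stand-in for the code under test:

```python
import warnings

def quiet_operation():
    # Hypothetical stand-in for code that must not emit warnings.
    return "ok"

# Inside the block, simplefilter("error") promotes any warning to an
# exception, so a stray warning fails the test immediately instead of
# needing a recorder object to inspect afterwards.
with warnings.catch_warnings():
    warnings.simplefilter("error")
    assert quiet_operation() == "ok"
```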
@@ -2,6 +2,9 @@ import pytest
from spacy.tokens import Doc

pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")

def test_ru_doc_lemmatization(ru_lemmatizer):
    words = ["мама", "мыла", "раму"]
    pos = ["NOUN", "VERB", "NOUN"]

@@ -1,6 +1,10 @@
import pytest
from spacy.tokens import Doc

pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")

def test_uk_lemmatizer(uk_lemmatizer):
    """Check that the default uk lemmatizer runs."""
    doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])

@@ -1,4 +1,5 @@
import pytest
import warnings
import srsly
from mock import Mock

@@ -314,13 +315,13 @@ def test_phrase_matcher_validation(en_vocab):
    matcher.add("TEST1", [doc1])
    with pytest.warns(UserWarning):
        matcher.add("TEST2", [doc2])
    with pytest.warns(None) as record:
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        matcher.add("TEST3", [doc3])
    assert not record.list
    matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
    with pytest.warns(None) as record:
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        matcher.add("TEST4", [doc2])
    assert not record.list

def test_attr_validation(en_vocab):

@@ -372,24 +372,39 @@ def test_overfitting_IO_overlapping():

def test_zero_suggestions():
    # Test with a suggester that returns 0 suggestions
    # Test with a suggester that can return 0 suggestions

    @registry.misc("test_zero_suggester")
    def make_zero_suggester():
        def zero_suggester(docs, *, ops=None):
    @registry.misc("test_mixed_zero_suggester")
    def make_mixed_zero_suggester():
        def mixed_zero_suggester(docs, *, ops=None):
            if ops is None:
                ops = get_current_ops()
            return Ragged(
                ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i")
            )
            spans = []
            lengths = []
            for doc in docs:
                if len(doc) > 0 and len(doc) % 2 == 0:
                    spans.append((0, 1))
                    lengths.append(1)
                else:
                    lengths.append(0)
            spans = ops.asarray2i(spans)
            lengths_array = ops.asarray1i(lengths)
            if len(spans) > 0:
                output = Ragged(ops.xp.vstack(spans), lengths_array)
            else:
                output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
            return output

        return zero_suggester
        return mixed_zero_suggester

    fix_random_seed(0)
    nlp = English()
    spancat = nlp.add_pipe(
        "spancat",
        config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY},
        config={
            "suggester": {"@misc": "test_mixed_zero_suggester"},
            "spans_key": SPAN_KEY,
        },
    )
    train_examples = make_examples(nlp)
    optimizer = nlp.initialize(get_examples=lambda: train_examples)

@@ -397,3 +412,13 @@ def test_zero_suggestions():
    assert set(spancat.labels) == {"LOC", "PERSON"}
    nlp.update(train_examples, sgd=optimizer)
    # empty doc
    nlp("")
    # single doc with zero suggestions
    nlp("one")
    # single doc with one suggestion
    nlp("two two")
    # batch with mixed zero/one suggestions
    list(nlp.pipe(["one", "two two", "three three three", "", "four four four four"]))
    # batch with no suggestions
    list(nlp.pipe(["", "one", "three three three"]))

@@ -11,7 +11,7 @@ from spacy.lang.en import English
from thinc.api import Config, get_current_ops
from numpy.testing import assert_array_equal

from ..util import get_batch, make_tempdir
from ..util import get_batch, make_tempdir, add_vecs_to_vocab

def test_empty_doc():

@@ -140,9 +140,25 @@ TRAIN_DATA = [
]

def test_tok2vec_listener():
@pytest.mark.parametrize("with_vectors", (False, True))
def test_tok2vec_listener(with_vectors):
    orig_config = Config().from_str(cfg_string)
    orig_config["components"]["tok2vec"]["model"]["embed"][
        "include_static_vectors"
    ] = with_vectors
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    if with_vectors:
        ops = get_current_ops()
        vectors = [
            ("apple", ops.asarray([1, 2, 3])),
            ("orange", ops.asarray([-1, -2, -3])),
            ("and", ops.asarray([-1, -1, -1])),
            ("juice", ops.asarray([5, 5, 10])),
            ("pie", ops.asarray([7, 6.3, 8.9])),
        ]
        add_vecs_to_vocab(nlp.vocab, vectors)

    assert nlp.pipe_names == ["tok2vec", "tagger"]
    tagger = nlp.get_pipe("tagger")
    tok2vec = nlp.get_pipe("tok2vec")

@@ -169,6 +185,9 @@ def test_tok2vec_listener():
    ops = get_current_ops()
    assert_array_equal(ops.to_numpy(doc.tensor), ops.to_numpy(doc_tensor))

    # test with empty doc
    doc = nlp("")

    # TODO: should this warn or error?
    nlp.select_pipes(disable="tok2vec")
    assert nlp.pipe_names == ["tagger"]

@@ -12,6 +12,7 @@ from spacy.cli._util import is_subpath_of, load_project_config
from spacy.cli._util import parse_config_overrides, string_to_list
from spacy.cli._util import substitute_project_variables
from spacy.cli._util import validate_project_commands
from spacy.cli._util import upload_file, download_file
from spacy.cli.debug_data import _compile_gold, _get_labels_from_model
from spacy.cli.debug_data import _get_labels_from_spancat
from spacy.cli.download import get_compatibility, get_version

@@ -706,17 +707,42 @@ def test_permitted_package_names():
    assert _is_permitted_package_name("-package") == False
    assert _is_permitted_package_name("package-") == False

def test_debug_data_compile_gold():
    nlp = English()
    pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"])
    ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "O", "B-ENT"])
    ref = Doc(
        nlp.vocab,
        words=["Token", ".", "New York City"],
        sent_starts=[True, False, True],
        ents=["O", "O", "B-ENT"],
    )
    eg = Example(pred, ref)
    data = _compile_gold([eg], ["ner"], nlp, True)
    assert data["boundary_cross_ents"] == 0

    pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"])
    ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "B-ENT", "I-ENT"])
    ref = Doc(
        nlp.vocab,
        words=["Token", ".", "New York City"],
        sent_starts=[True, False, True],
        ents=["O", "B-ENT", "I-ENT"],
    )
    eg = Example(pred, ref)
    data = _compile_gold([eg], ["ner"], nlp, True)
    assert data["boundary_cross_ents"] == 1
    assert data["boundary_cross_ents"] == 1

def test_upload_download_local_file():
    with make_tempdir() as d1, make_tempdir() as d2:
        filename = "f.txt"
        content = "content"
        local_file = d1 / filename
        remote_file = d2 / filename
        with local_file.open(mode="w") as file_:
            file_.write(content)
        upload_file(local_file, remote_file)
        local_file.unlink()
        download_file(remote_file, local_file)
        with local_file.open(mode="r") as file_:
            assert file_.read() == content

@@ -23,7 +23,7 @@ def get_textcat_bow_kwargs():

def get_textcat_cnn_kwargs():
    return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
    return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13}

def get_all_params(model):

@@ -65,7 +65,7 @@ def get_tok2vec_kwargs():
    }

def test_tok2vec():
def make_test_tok2vec():
    return build_Tok2Vec_model(**get_tok2vec_kwargs())

@@ -7,7 +7,7 @@ from ..util import get_cosine, add_vecs_to_vocab

@pytest.fixture
def vectors():
    return [("apple", [1, 2, 3]), ("orange", [-1, -2, -3])]
    return [("apple", [1, 2, 3]), ("orange", [-1, -2, -5])]

@pytest.fixture()

@@ -71,19 +71,17 @@ def test_vectors_similarity_DD(vocab, vectors):

def test_vectors_similarity_TD(vocab, vectors):
    [(word1, vec1), (word2, vec2)] = vectors
    doc = Doc(vocab, words=[word1, word2])
    with pytest.warns(UserWarning):
        assert isinstance(doc.similarity(doc[0]), float)
        assert isinstance(doc[0].similarity(doc), float)
        assert doc.similarity(doc[0]) == doc[0].similarity(doc)
    assert isinstance(doc.similarity(doc[0]), float)
    assert isinstance(doc[0].similarity(doc), float)
    assert doc.similarity(doc[0]) == doc[0].similarity(doc)

def test_vectors_similarity_TS(vocab, vectors):
    [(word1, vec1), (word2, vec2)] = vectors
    doc = Doc(vocab, words=[word1, word2])
    with pytest.warns(UserWarning):
        assert isinstance(doc[:2].similarity(doc[0]), float)
        assert isinstance(doc[0].similarity(doc[-2]), float)
        assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])
    assert isinstance(doc[:2].similarity(doc[0]), float)
    assert isinstance(doc[0].similarity(doc[-2]), float)
    assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])

def test_vectors_similarity_DS(vocab, vectors):

@@ -356,6 +356,7 @@ cdef class Doc:
        for annot in annotations:
            if annot:
                if annot is heads or annot is sent_starts or annot is ent_iobs:
                    annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                for i in range(len(words)):
                    if attrs.ndim == 1:
                        attrs[i] = annot[i]

@@ -305,7 +305,7 @@ cdef class Span:
            for ancestor in ancestors:
                ancestor_i = ancestor.i - self.c.start
                if ancestor_i in range(length):
                    array[i, head_col] = ancestor_i - i
                    array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)

            # if there is no appropriate ancestor, define a new artificial root
            value = array[i, head_col]

@@ -313,7 +313,7 @@
                new_root = old_to_new_root.get(ancestor_i, None)
                if new_root is not None:
                    # take the same artificial root as a previous token from the same sentence
                    array[i, head_col] = new_root - i
                    array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
                else:
                    # set this token as the new artificial root
                    array[i, head_col] = 0

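Why the explicit two-step cast in these hunks: relative head offsets can be negative, but the doc arrays are uint64, and recent numpy refuses to coerce a negative Python int into an unsigned array (the commit above tested against the numpy v1.24.0rc where this became an error). A small sketch:

```python
import numpy

arr = numpy.zeros(1, dtype=numpy.uint64)

# Explicit int32 -> uint64 cast preserves the two's-complement bit pattern:
arr[0] = numpy.int32(-1).astype(numpy.uint64)
print(arr[0])  # 18446744073709551615

# Direct assignment of a negative Python int raises on numpy >= 1.24:
# arr[0] = -1  # OverflowError
```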
@@ -333,26 +333,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
        if key not in IDS:
            raise ValueError(Errors.E974.format(obj="token", key=key))
        elif key in ["ORTH", "SPACY"]:
            pass
            continue
        elif key == "HEAD":
            attrs.append(key)
            values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
            row = [h-i if h is not None else 0 for i, h in enumerate(value)]
        elif key == "DEP":
            attrs.append(key)
            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
            row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
        elif key == "SENT_START":
            attrs.append(key)
            values.append([to_ternary_int(v) for v in value])
            row = [to_ternary_int(v) for v in value]
        elif key == "MORPH":
            attrs.append(key)
            values.append([vocab.morphology.add(v) for v in value])
            row = [vocab.morphology.add(v) for v in value]
        else:
            attrs.append(key)
            if not all(isinstance(v, str) for v in value):
                types = set([type(v) for v in value])
                raise TypeError(Errors.E969.format(field=key, types=types)) from None
            values.append([vocab.strings.add(v) for v in value])
            row = [vocab.strings.add(v) for v in value]
        values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
    array = numpy.asarray(values, dtype="uint64")
    array = numpy.array(values, dtype=numpy.uint64)
    return attrs, array.T

@@ -335,3 +335,5 @@ def ensure_shape(vectors_loc):
        # store all the results in a list in memory
        lines2 = open_file(vectors_loc)
        yield from lines2
        lines2.close()
        lines.close()