Merge pull request #12494 from adrianeboyd/backport/v3.5.2-1

Backports for v3.5.2
Adriane Boyd authored on 2023-04-06 16:18:59 +02:00, committed by GitHub
commit e4bbdf7b50
GPG Key ID: 4AEE18F83AFDEB23
40 changed files with 604 additions and 240 deletions

View File

@ -57,51 +57,51 @@ steps:
      python -m spacy download ca_core_news_md
      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
    displayName: 'Test download CLI'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')
  - script: |
      python -W error -m spacy info ca_core_news_sm | grep -q download_url
    displayName: 'Test download_url in info CLI'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')
  - script: |
      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
    displayName: 'Test no warnings on load (#11713)'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')
  - script: |
      python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
    displayName: 'Test convert CLI'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')
  - script: |
      python -m spacy init config -p ner -l ca ner.cfg
      python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
    displayName: 'Test debug config CLI'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')
  - script: |
      # will have errors due to sparse data, check for summary in output
      python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
    displayName: 'Test debug data CLI'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')
  - script: |
      python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
    displayName: 'Test train CLI'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')
  - script: |
      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
    displayName: 'Test assemble CLI'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')
  - script: |
      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
    displayName: 'Test assemble CLI vectors warning'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')
  - script: |
      python -m pip install -U -r requirements.txt
@ -116,9 +116,3 @@ steps:
      python -m pytest --pyargs spacy
    displayName: "Run CPU tests with thinc-apple-ops"
    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
-  - script: |
-      python .github/validate_universe_json.py website/meta/universe.json
-    displayName: 'Test website/meta/universe.json'
-    condition: eq(variables['python_version'], '3.8')

View File

@ -1,45 +0,0 @@
# GitHub Action that uses Black to reformat all Python code and submits a PR
# in regular intervals. Inspired by: https://github.com/cclauss/autoblack
name: autoblack
on:
workflow_dispatch: # allow manual trigger
schedule:
- cron: '0 8 * * 5' # every Friday at 8am UTC
jobs:
autoblack:
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.head_ref }}
- uses: actions/setup-python@v4
- run: pip install black -c requirements.txt
- name: Auto-format code if needed
run: black spacy
# We can't run black --check here because that returns a non-zero exit
# code and makes GitHub think the action failed
- name: Check for modified files
id: git-check
run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
- name: Create Pull Request
if: steps.git-check.outputs.modified == 'true'
uses: peter-evans/create-pull-request@v4
with:
title: Auto-format code with black
labels: meta
commit-message: Auto-format code with black
committer: GitHub <noreply@github.com>
author: explosion-bot <explosion-bot@users.noreply.github.com>
body: _This PR is auto-generated._
branch: autoblack
delete-branch: true
draft: false
- name: Check outputs
if: steps.git-check.outputs.modified == 'true'
run: |
echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}"
echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}"

View File

@ -8,6 +8,7 @@ on:
jobs:
  explosion-bot:
+    if: github.repository_owner == 'explosion'
    runs-on: ubuntu-latest
    steps:
      - name: Dump GitHub context

View File

@ -13,6 +13,7 @@ on:
jobs:
  issue-manager:
+    if: github.repository_owner == 'explosion'
    runs-on: ubuntu-latest
    steps:
      - uses: tiangolo/issue-manager@0.4.0

View File

@ -13,6 +13,7 @@ concurrency:
jobs:
  action:
+    if: github.repository_owner == 'explosion'
    runs-on: ubuntu-latest
    steps:
      - uses: dessant/lock-threads@v4

View File

@ -7,6 +7,7 @@ on:
jobs:
  build:
+    if: github.repository_owner == 'explosion'
    runs-on: ubuntu-latest
    steps:

.github/workflows/tests.yml (new file, 173 lines)
View File

@ -0,0 +1,173 @@
name: tests
on:
push:
branches-ignore:
- "spacy.io"
- "nightly.spacy.io"
- "v2.spacy.io"
paths-ignore:
- "*.md"
- "*.mdx"
- "website/**"
- ".github/workflows/**"
pull_request:
types: [opened, synchronize, reopened, edited]
paths-ignore:
- "*.md"
- "*.mdx"
- "website/**"
jobs:
validate:
name: Validate
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- name: Check out repo
uses: actions/checkout@v3
- name: Configure Python version
uses: actions/setup-python@v4
with:
python-version: "3.7"
architecture: x64
- name: black
run: |
python -m pip install black -c requirements.txt
python -m black spacy --check
- name: flake8
run: |
python -m pip install flake8==5.0.4
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
tests:
name: Test
needs: Validate
strategy:
fail-fast: true
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
python_version: ["3.11"]
include:
- os: ubuntu-20.04
python_version: "3.6"
- os: windows-latest
python_version: "3.7"
- os: macos-latest
python_version: "3.8"
- os: ubuntu-latest
python_version: "3.9"
- os: windows-latest
python_version: "3.10"
runs-on: ${{ matrix.os }}
steps:
- name: Check out repo
uses: actions/checkout@v3
- name: Configure Python version
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python_version }}
architecture: x64
- name: Install dependencies
run: |
python -m pip install -U build pip setuptools
python -m pip install -U -r requirements.txt
- name: Build sdist
run: |
python -m build --sdist
- name: Run mypy
run: |
python -m mypy spacy
if: matrix.python_version != '3.6'
- name: Delete source directory and .egg-info
run: |
rm -rf spacy *.egg-info
shell: bash
- name: Uninstall all packages
run: |
python -m pip freeze
python -m pip freeze --exclude pywin32 > installed.txt
python -m pip uninstall -y -r installed.txt
- name: Install from sdist
run: |
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
shell: bash
- name: Test import
run: python -W error -c "import spacy"
- name: "Test download CLI"
run: |
python -m spacy download ca_core_news_sm
python -m spacy download ca_core_news_md
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
if: matrix.python_version == '3.9'
- name: "Test download_url in info CLI"
run: |
python -W error -m spacy info ca_core_news_sm | grep -q download_url
if: matrix.python_version == '3.9'
- name: "Test no warnings on load (#11713)"
run: |
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
if: matrix.python_version == '3.9'
- name: "Test convert CLI"
run: |
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
if: matrix.python_version == '3.9'
- name: "Test debug config CLI"
run: |
python -m spacy init config -p ner -l ca ner.cfg
python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
if: matrix.python_version == '3.9'
- name: "Test debug data CLI"
run: |
# will have errors due to sparse data, check for summary in output
python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
if: matrix.python_version == '3.9'
- name: "Test train CLI"
run: |
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
if: matrix.python_version == '3.9'
- name: "Test assemble CLI"
run: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
if: matrix.python_version == '3.9'
- name: "Test assemble CLI vectors warning"
run: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
if: matrix.python_version == '3.9'
- name: "Install test requirements"
run: |
python -m pip install -U -r requirements.txt
- name: "Run CPU tests"
run: |
python -m pytest --pyargs spacy -W error
- name: "Run CPU tests with thinc-apple-ops"
run: |
python -m pip install 'spacy[apple]'
python -m pytest --pyargs spacy
if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11'

View File

@ -0,0 +1,33 @@
name: universe validation
on:
push:
branches-ignore:
- "spacy.io"
- "nightly.spacy.io"
- "v2.spacy.io"
paths:
- "website/meta/universe.json"
pull_request:
types: [opened, synchronize, reopened, edited]
paths:
- "website/meta/universe.json"
jobs:
validate:
name: Validate
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- name: Check out repo
uses: actions/checkout@v3
- name: Configure Python version
uses: actions/setup-python@v4
with:
python-version: "3.7"
architecture: x64
- name: Validate website/meta/universe.json
run: |
python .github/validate_universe_json.py website/meta/universe.json

View File

@ -48,6 +48,9 @@ jobs:
      pip install flake8==5.0.4
      python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
    displayName: "flake8"
+  - script: |
+      python .github/validate_universe_json.py website/meta/universe.json
+    displayName: 'Validate website/meta/universe.json'
  - job: "Test"
    dependsOn: "Validate"

View File

@ -78,41 +78,41 @@ transformers =
ray =
    spacy_ray>=0.1.0,<1.0.0
cuda =
-    cupy>=5.0.0b4,<12.0.0
+    cupy>=5.0.0b4,<13.0.0
cuda80 =
-    cupy-cuda80>=5.0.0b4,<12.0.0
+    cupy-cuda80>=5.0.0b4,<13.0.0
cuda90 =
-    cupy-cuda90>=5.0.0b4,<12.0.0
+    cupy-cuda90>=5.0.0b4,<13.0.0
cuda91 =
-    cupy-cuda91>=5.0.0b4,<12.0.0
+    cupy-cuda91>=5.0.0b4,<13.0.0
cuda92 =
-    cupy-cuda92>=5.0.0b4,<12.0.0
+    cupy-cuda92>=5.0.0b4,<13.0.0
cuda100 =
-    cupy-cuda100>=5.0.0b4,<12.0.0
+    cupy-cuda100>=5.0.0b4,<13.0.0
cuda101 =
-    cupy-cuda101>=5.0.0b4,<12.0.0
+    cupy-cuda101>=5.0.0b4,<13.0.0
cuda102 =
-    cupy-cuda102>=5.0.0b4,<12.0.0
+    cupy-cuda102>=5.0.0b4,<13.0.0
cuda110 =
-    cupy-cuda110>=5.0.0b4,<12.0.0
+    cupy-cuda110>=5.0.0b4,<13.0.0
cuda111 =
-    cupy-cuda111>=5.0.0b4,<12.0.0
+    cupy-cuda111>=5.0.0b4,<13.0.0
cuda112 =
-    cupy-cuda112>=5.0.0b4,<12.0.0
+    cupy-cuda112>=5.0.0b4,<13.0.0
cuda113 =
-    cupy-cuda113>=5.0.0b4,<12.0.0
+    cupy-cuda113>=5.0.0b4,<13.0.0
cuda114 =
-    cupy-cuda114>=5.0.0b4,<12.0.0
+    cupy-cuda114>=5.0.0b4,<13.0.0
cuda115 =
-    cupy-cuda115>=5.0.0b4,<12.0.0
+    cupy-cuda115>=5.0.0b4,<13.0.0
cuda116 =
-    cupy-cuda116>=5.0.0b4,<12.0.0
+    cupy-cuda116>=5.0.0b4,<13.0.0
cuda117 =
-    cupy-cuda117>=5.0.0b4,<12.0.0
+    cupy-cuda117>=5.0.0b4,<13.0.0
cuda11x =
-    cupy-cuda11x>=11.0.0,<12.0.0
+    cupy-cuda11x>=11.0.0,<13.0.0
cuda-autodetect =
-    cupy-wheel>=11.0.0,<12.0.0
+    cupy-wheel>=11.0.0,<13.0.0
apple =
    thinc-apple-ops>=0.1.0.dev0,<1.0.0
# Language tokenizers with external dependencies

View File

@ -35,7 +35,7 @@ def find_threshold_cli(
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"),
-    verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    # fmt: on
):
    """

View File

@ -23,6 +23,7 @@ def pretrain_cli(
    resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+    skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"),
    # fmt: on
):
    """
@ -74,6 +75,7 @@ def pretrain_cli(
        epoch_resume=epoch_resume,
        use_gpu=use_gpu,
        silent=False,
+        skip_last=skip_last,
    )
    msg.good("Successfully finished pretrain")

View File

@ -125,13 +125,17 @@ def app(environ, start_response):
    return [res]


-def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
+def parse_deps(
+    orig_doc: Union[Doc, Span], options: Dict[str, Any] = {}
+) -> Dict[str, Any]:
    """Generate dependency parse in {'words': [], 'arcs': []} format.

-    orig_doc (Doc): Document to parse.
+    orig_doc (Union[Doc, Span]): Document to parse.
    options (Dict[str, Any]): Dependency parse specific visualisation options.
    RETURNS (dict): Generated dependency parse keyed by words and arcs.
    """
+    if isinstance(orig_doc, Span):
+        orig_doc = orig_doc.as_doc()
    doc = Doc(orig_doc.vocab).from_bytes(
        orig_doc.to_bytes(exclude=["user_data", "user_hooks"])
    )
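To illustrate the change above, here is a small self-contained sketch (not part of the commit) showing that `displacy.parse_deps` now accepts a `Span` as well as a `Doc`; the hand-built annotations mirror the updated test later in this diff.

```python
import spacy
from spacy import displacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
# Build a small parsed Doc by hand so no trained pipeline is required.
doc = Doc(
    nlp.vocab,
    words=["This", "is", "a", "sentence"],
    heads=[1, 1, 3, 1],
    deps=["nsubj", "ROOT", "det", "attr"],
    pos=["PRON", "AUX", "DET", "NOUN"],
)
# Passing a Span (here: the whole doc as a slice) now works like passing a Doc.
deps = displacy.parse_deps(doc[:])
print(deps["words"])
print(deps["arcs"])
```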

View File

@ -549,8 +549,8 @@ class Errors(metaclass=ErrorsWithCodes):
            "during training, make sure to include it in 'annotating components'")

    # New errors added in v3.x
-    E850 = ("The PretrainVectors objective currently only supports default "
-            "vectors, not {mode} vectors.")
+    E850 = ("The PretrainVectors objective currently only supports default or "
+            "floret vectors, not {mode} vectors.")
    E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
            "but found value of '{val}'.")
    E852 = ("The tar file pulled from the remote attempted an unsafe path "

View File

@ -1,5 +1,5 @@
from typing import Any, Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING, cast
-from thinc.types import Floats2d
+from thinc.types import Floats2d, Ints1d
from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
from thinc.api import MultiSoftmax, list2array
from thinc.api import to_categorical, CosineDistance, L2Distance
@ -7,7 +7,7 @@ from thinc.loss import Loss
from ...util import registry, OOV_RANK
from ...errors import Errors
-from ...attrs import ID
+from ...attrs import ID, ORTH
from ...vectors import Mode as VectorsMode
import numpy
@ -24,8 +24,6 @@ def create_pretrain_vectors(
    maxout_pieces: int, hidden_size: int, loss: str
) -> Callable[["Vocab", Model], Model]:
    def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
-        if vocab.vectors.mode != VectorsMode.default:
-            raise ValueError(Errors.E850.format(mode=vocab.vectors.mode))
        if vocab.vectors.shape[1] == 0:
            raise ValueError(Errors.E875)
        model = build_cloze_multi_task_model(
@ -70,14 +68,23 @@ def get_vectors_loss(ops, docs, prediction, distance):
    """Compute a loss based on a distance between the documents' vectors and
    the prediction.
    """
+    vocab = docs[0].vocab
+    if vocab.vectors.mode == VectorsMode.default:
        # The simplest way to implement this would be to vstack the
        # token.vector values, but that's a bit inefficient, especially on GPU.
-        # Instead we fetch the index into the vectors table for each of our tokens,
-        # and look them up all at once. This prevents data copying.
+        # Instead we fetch the index into the vectors table for each of our
+        # tokens, and look them up all at once. This prevents data copying.
        ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
        target = docs[0].vocab.vectors.data[ids]
        target[ids == OOV_RANK] = 0
        d_target, loss = distance(prediction, target)
+    elif vocab.vectors.mode == VectorsMode.floret:
+        keys = ops.flatten([cast(Ints1d, doc.to_array(ORTH)) for doc in docs])
+        target = vocab.vectors.get_batch(keys)
+        target = ops.as_contig(target)
+        d_target, loss = distance(prediction, target)
+    else:
+        raise ValueError(Errors.E850.format(mode=vocab.vectors.mode))
    return loss, d_target
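As a quick illustration of what the `get_vectors_loss` change enables, here is a hedged sketch (mirroring the updated test further down in this diff) in which floret-mode vectors no longer trigger `E850`:

```python
from thinc.api import get_current_ops

from spacy.lang.en import English
from spacy.ml.models.multi_task import create_pretrain_vectors
from spacy.vectors import Vectors

nlp = English()
nlp.add_pipe("tok2vec")
nlp.initialize()

# Floret-mode vectors (hash-based, no fixed key table) are now accepted by the
# PretrainVectors objective; previously this raised E850.
ops = get_current_ops()
nlp.vocab.vectors = Vectors(data=ops.xp.zeros((10, 10)), mode="floret", hash_count=1)
create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model)
```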

View File

@ -474,18 +474,24 @@ class EntityLinker(TrainablePipe):
            # Looping through each entity in batch (TODO: rewrite)
            for j, ent in enumerate(ent_batch):
-                sent_index = sentences.index(ent.sent)
-                assert sent_index >= 0
+                assert hasattr(ent, "sents")
+                sents = list(ent.sents)
+                sent_indices = (
+                    sentences.index(sents[0]),
+                    sentences.index(sents[-1]),
+                )
+                assert sent_indices[1] >= sent_indices[0] >= 0

                if self.incl_context:
                    # get n_neighbour sentences, clipped to the length of the document
-                    start_sentence = max(0, sent_index - self.n_sents)
+                    start_sentence = max(0, sent_indices[0] - self.n_sents)
                    end_sentence = min(
-                        len(sentences) - 1, sent_index + self.n_sents
+                        len(sentences) - 1, sent_indices[1] + self.n_sents
                    )
                    start_token = sentences[start_sentence].start
                    end_token = sentences[end_sentence].end
                    sent_doc = doc[start_token:end_token].as_doc()
                    # currently, the context is the same for each entity in a sentence (should be refined)
                    sentence_encoding = self.model.predict([sent_doc])[0]
                    sentence_encoding_t = sentence_encoding.T

View File

@ -1,5 +1,6 @@
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast, Union
from dataclasses import dataclass
+from functools import partial
from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
from thinc.api import Optimizer
from thinc.types import Ragged, Ints2d, Floats2d
@ -82,13 +83,9 @@ class Suggester(Protocol):
    ...


-@registry.misc("spacy.ngram_suggester.v1")
-def build_ngram_suggester(sizes: List[int]) -> Suggester:
-    """Suggest all spans of the given lengths. Spans are returned as a ragged
-    array of integers. The array has two columns, indicating the start and end
-    position."""
-    def ngram_suggester(docs: Iterable[Doc], *, ops: Optional[Ops] = None) -> Ragged:
+def ngram_suggester(
+    docs: Iterable[Doc], sizes: List[int], *, ops: Optional[Ops] = None
+) -> Ragged:
    if ops is None:
        ops = get_current_ops()
    spans = []
@ -114,7 +111,14 @@ def build_ngram_suggester(sizes: List[int]) -> Suggester:
    assert output.dataXd.ndim == 2
    return output

-    return ngram_suggester

+@registry.misc("spacy.ngram_suggester.v1")
+def build_ngram_suggester(sizes: List[int]) -> Suggester:
+    """Suggest all spans of the given lengths. Spans are returned as a ragged
+    array of integers. The array has two columns, indicating the start and end
+    position."""
+    return partial(ngram_suggester, sizes=sizes)


@registry.misc("spacy.ngram_range_suggester.v1")
@ -726,6 +730,7 @@ class SpanCategorizer(TrainablePipe):
        if not allow_overlap:
            # Get the probabilities
            sort_idx = (argmax_scores.squeeze() * -1).argsort()
+            argmax_scores = argmax_scores[sort_idx]
            predicted = predicted[sort_idx]
            indices = indices[sort_idx]
            keeps = keeps[sort_idx]
@ -748,4 +753,5 @@ class SpanCategorizer(TrainablePipe):
                attrs_scores.append(argmax_scores[i])
                spans.append(Span(doc, start, end, label=self.labels[label]))
+        spans.attrs["scores"] = numpy.array(attrs_scores)
        return spans
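The suggester refactor above keeps the registered factory's behavior identical; below is a small sketch (not from the commit) of how the registered function can be resolved and called on its own:

```python
import spacy
from spacy import registry

nlp = spacy.blank("en")
doc = nlp("Just a short sentence")

# The factory still returns a suggester callable; internally it is now a
# functools.partial around the module-level ngram_suggester.
suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2])
ragged = suggester([doc])
# Two columns per candidate span: start and end token offsets.
print(ragged.dataXd.shape, ragged.lengths)
```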

View File

@ -700,3 +700,34 @@ def test_span_group_copy(doc):
    assert len(doc.spans["test"]) == 3
    # check that the copy spans were not modified and this is an isolated doc
    assert len(doc_copy.spans["test"]) == 2
def test_for_partial_ent_sents():
"""Spans may be associated with multiple sentences. These .sents should always be complete, not partial, sentences,
which this tests for.
"""
doc = Doc(
English().vocab,
words=["Mahler's", "Symphony", "No.", "8", "was", "beautiful."],
sent_starts=[1, 0, 0, 1, 0, 0],
)
doc.set_ents([Span(doc, 1, 4, "WORK")])
# The specified entity is associated with both sentences in this doc, so we expect all sentences in the doc to be
# equal to the sentences referenced in ent.sents.
for doc_sent, ent_sent in zip(doc.sents, doc.ents[0].sents):
assert doc_sent == ent_sent
def test_for_no_ent_sents():
"""Span.sents() should set .sents correctly, even if Span in question is trailing and doesn't form a full
sentence.
"""
doc = Doc(
English().vocab,
words=["This", "is", "a", "test.", "ENTITY"],
sent_starts=[1, 0, 0, 0, 1],
)
doc.set_ents([Span(doc, 4, 5, "WORK")])
sents = list(doc.ents[0].sents)
assert len(sents) == 1
assert str(sents[0]) == str(doc.ents[0].sent) == "ENTITY"

View File

@ -1,9 +1,9 @@
-from typing import Callable, Iterable, Dict, Any
+from typing import Callable, Iterable, Dict, Any, Tuple

import pytest
from numpy.testing import assert_equal

-from spacy import registry, util
+from spacy import registry, util, Language
from spacy.attrs import ENT_KB_ID
from spacy.compat import pickle
from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
@ -108,18 +108,23 @@ def test_issue7065():

@pytest.mark.issue(7065)
-def test_issue7065_b():
+@pytest.mark.parametrize("entity_in_first_sentence", [True, False])
+def test_sentence_crossing_ents(entity_in_first_sentence: bool):
+    """Tests if NEL crashes if entities cross sentence boundaries and the first associated sentence doesn't have an
+    entity.
+    entity_in_prior_sentence (bool): Whether to include an entity in the first sentence associated with the
+    sentence-crossing entity.
+    """
    # Test that the NEL doesn't crash when an entity crosses a sentence boundary
    nlp = English()
    vector_length = 3
+    nlp.add_pipe("sentencizer")
    text = "Mahler 's Symphony No. 8 was beautiful."
-    entities = [(0, 6, "PERSON"), (10, 24, "WORK")]
-    links = {
-        (0, 6): {"Q7304": 1.0, "Q270853": 0.0},
-        (10, 24): {"Q7304": 0.0, "Q270853": 1.0},
-    }
-    sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0]
+    entities = [(10, 24, "WORK")]
+    links = {(10, 24): {"Q7304": 0.0, "Q270853": 1.0}}
+    if entity_in_first_sentence:
+        entities.append((0, 6, "PERSON"))
+        links[(0, 6)] = {"Q7304": 1.0, "Q270853": 0.0}
+    sent_starts = [1, -1, 0, 0, 0, 1, 0, 0, 0]
    doc = nlp(text)
    example = Example.from_dict(
        doc, {"entities": entities, "links": links, "sent_starts": sent_starts}
@ -145,31 +150,14 @@ def test_issue7065_b():
    # Create the Entity Linker component and add it to the pipeline
    entity_linker = nlp.add_pipe("entity_linker", last=True)
-    entity_linker.set_kb(create_kb)
+    entity_linker.set_kb(create_kb)  # type: ignore
    # train the NEL pipe
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for i in range(2):
-        losses = {}
-        nlp.update(train_examples, sgd=optimizer, losses=losses)
+        nlp.update(train_examples, sgd=optimizer)

-    # Add a custom rule-based component to mimick NER
-    patterns = [
-        {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]},
-        {
-            "label": "WORK",
-            "pattern": [
-                {"LOWER": "symphony"},
-                {"LOWER": "no"},
-                {"LOWER": "."},
-                {"LOWER": "8"},
-            ],
-        },
-    ]
-    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
-    ruler.add_patterns(patterns)
-    # test the trained model - this should not throw E148
-    doc = nlp(text)
-    assert doc
+    # This shouldn't crash.
+    entity_linker.predict([example.reference])  # type: ignore


def test_no_entities():

View File

@ -1,7 +1,7 @@
import pytest
import numpy
from numpy.testing import assert_array_equal, assert_almost_equal
-from thinc.api import get_current_ops, Ragged
+from thinc.api import get_current_ops, NumpyOps, Ragged

from spacy import util
from spacy.lang.en import English
@ -190,17 +190,19 @@ def test_make_spangroup_singlelabel(threshold, allow_overlap, nr_results):
    spangroup = spancat._make_span_group_singlelabel(
        doc, indices, scores, allow_overlap
    )
+    assert len(spangroup) == nr_results
    if threshold > 0.4:
        if allow_overlap:
            assert spangroup[0].text == "London"
            assert spangroup[0].label_ == "City"
+            assert_almost_equal(0.6, spangroup.attrs["scores"][0], 5)
            assert spangroup[1].text == "Greater London"
            assert spangroup[1].label_ == "GreatCity"
-            assert spangroup.attrs["scores"][1] == 0.9
+            assert_almost_equal(0.9, spangroup.attrs["scores"][1], 5)
        else:
            assert spangroup[0].text == "Greater London"
            assert spangroup[0].label_ == "GreatCity"
            assert spangroup.attrs["scores"][0] == 0.9
    else:
        if allow_overlap:
            assert spangroup[0].text == "Greater"
@ -256,22 +258,32 @@ def test_make_spangroup_negative_label():
    assert len(spangroup_single) == 2
    assert spangroup_single[0].text == "Greater"
    assert spangroup_single[0].label_ == "City"
+    assert_almost_equal(0.4, spangroup_single.attrs["scores"][0], 5)
    assert spangroup_single[1].text == "Greater London"
    assert spangroup_single[1].label_ == "GreatCity"
    assert spangroup_single.attrs["scores"][1] == 0.9
+    assert_almost_equal(0.9, spangroup_single.attrs["scores"][1], 5)

    assert len(spangroup_multi) == 6
    assert spangroup_multi[0].text == "Greater"
    assert spangroup_multi[0].label_ == "City"
+    assert_almost_equal(0.4, spangroup_multi.attrs["scores"][0], 5)
    assert spangroup_multi[1].text == "Greater"
    assert spangroup_multi[1].label_ == "Person"
+    assert_almost_equal(0.3, spangroup_multi.attrs["scores"][1], 5)
    assert spangroup_multi[2].text == "London"
    assert spangroup_multi[2].label_ == "City"
+    assert_almost_equal(0.6, spangroup_multi.attrs["scores"][2], 5)
    assert spangroup_multi[3].text == "London"
    assert spangroup_multi[3].label_ == "GreatCity"
+    assert_almost_equal(0.4, spangroup_multi.attrs["scores"][3], 5)
    assert spangroup_multi[4].text == "Greater London"
    assert spangroup_multi[4].label_ == "Thing"
    assert spangroup_multi[4].text == "Greater London"
+    assert_almost_equal(0.8, spangroup_multi.attrs["scores"][4], 5)
    assert spangroup_multi[5].text == "Greater London"
    assert spangroup_multi[5].label_ == "GreatCity"
+    assert_almost_equal(0.9, spangroup_multi.attrs["scores"][5], 5)


def test_ngram_suggester(en_tokenizer):
@ -565,3 +577,21 @@ def test_set_candidates(name):
    assert len(docs[0].spans["candidates"]) == 9
    assert docs[0].spans["candidates"][0].text == "Just"
    assert docs[0].spans["candidates"][4].text == "Just a"
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
@pytest.mark.parametrize("n_process", [1, 2])
def test_spancat_multiprocessing(name, n_process):
if isinstance(get_current_ops, NumpyOps) or n_process < 2:
nlp = Language()
spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
train_examples = make_examples(nlp)
nlp.initialize(get_examples=lambda: train_examples)
texts = [
"Just a sentence.",
"I like London and Berlin",
"I like Berlin",
"I eat ham.",
]
docs = list(nlp.pipe(texts, n_process=n_process))
assert len(docs) == len(texts)

View File

@ -213,6 +213,13 @@ def test_serialize_doc_exclude(en_vocab):
def test_serialize_doc_span_groups(en_vocab):
    doc = Doc(en_vocab, words=["hello", "world", "!"])
-    doc.spans["content"] = [doc[0:2]]
+    span = doc[0:2]
+    span.label_ = "test_serialize_doc_span_groups_label"
+    span.id_ = "test_serialize_doc_span_groups_id"
+    span.kb_id_ = "test_serialize_doc_span_groups_kb_id"
+    doc.spans["content"] = [span]
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert len(new_doc.spans["content"]) == 1
+    assert new_doc.spans["content"][0].label_ == "test_serialize_doc_span_groups_label"
+    assert new_doc.spans["content"][0].id_ == "test_serialize_doc_span_groups_id"
+    assert new_doc.spans["content"][0].kb_id_ == "test_serialize_doc_span_groups_kb_id"

View File

@ -49,7 +49,11 @@ def test_serialize_doc_bin():
    nlp = English()
    for doc in nlp.pipe(texts):
        doc.cats = cats
-        doc.spans["start"] = [doc[0:2]]
+        span = doc[0:2]
+        span.label_ = "UNUSUAL_SPAN_LABEL"
+        span.id_ = "UNUSUAL_SPAN_ID"
+        span.kb_id_ = "UNUSUAL_SPAN_KB_ID"
+        doc.spans["start"] = [span]
        doc[0].norm_ = "UNUSUAL_TOKEN_NORM"
        doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID"
        doc_bin.add(doc)
@ -63,6 +67,9 @@ def test_serialize_doc_bin():
        assert doc.text == texts[i]
        assert doc.cats == cats
        assert len(doc.spans) == 1
+        assert doc.spans["start"][0].label_ == "UNUSUAL_SPAN_LABEL"
+        assert doc.spans["start"][0].id_ == "UNUSUAL_SPAN_ID"
+        assert doc.spans["start"][0].kb_id_ == "UNUSUAL_SPAN_KB_ID"
        assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM"
        assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID"

View File

@ -275,6 +275,20 @@ def test_displacy_parse_deps(en_vocab):
        {"start": 2, "end": 3, "label": "det", "dir": "left"},
        {"start": 1, "end": 3, "label": "attr", "dir": "right"},
    ]
# Test that displacy.parse_deps converts Span to Doc
deps = displacy.parse_deps(doc[:])
assert isinstance(deps, dict)
assert deps["words"] == [
{"lemma": None, "text": words[0], "tag": pos[0]},
{"lemma": None, "text": words[1], "tag": pos[1]},
{"lemma": None, "text": words[2], "tag": pos[2]},
{"lemma": None, "text": words[3], "tag": pos[3]},
]
assert deps["arcs"] == [
{"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
{"start": 2, "end": 3, "label": "det", "dir": "left"},
{"start": 1, "end": 3, "label": "attr", "dir": "right"},
]
def test_displacy_invalid_arcs(): def test_displacy_invalid_arcs():

View File

@ -165,7 +165,8 @@ def test_pretraining_default():
@pytest.mark.parametrize("objective", CHAR_OBJECTIVES)
-def test_pretraining_tok2vec_characters(objective):
+@pytest.mark.parametrize("skip_last", (True, False))
+def test_pretraining_tok2vec_characters(objective, skip_last):
    """Test that pretraining works with the character objective"""
    config = Config().from_str(pretrain_string_listener)
    config["pretraining"]["objective"] = objective
@ -178,10 +179,14 @@ def test_pretraining_tok2vec_characters(objective):
        filled["paths"]["raw_text"] = file_path
        filled = filled.interpolate()
        assert filled["pretraining"]["component"] == "tok2vec"
-        pretrain(filled, tmp_dir)
+        pretrain(filled, tmp_dir, skip_last=skip_last)
        assert Path(tmp_dir / "model0.bin").exists()
        assert Path(tmp_dir / "model4.bin").exists()
        assert not Path(tmp_dir / "model5.bin").exists()
+        if skip_last:
+            assert not Path(tmp_dir / "model-last.bin").exists()
+        else:
+            assert Path(tmp_dir / "model-last.bin").exists()


@pytest.mark.parametrize("objective", VECTOR_OBJECTIVES)
@ -237,6 +242,7 @@ def test_pretraining_tagger_tok2vec(config):
        pretrain(filled, tmp_dir)
        assert Path(tmp_dir / "model0.bin").exists()
        assert Path(tmp_dir / "model4.bin").exists()
+        assert Path(tmp_dir / "model-last.bin").exists()
        assert not Path(tmp_dir / "model5.bin").exists()
@ -359,19 +365,15 @@ def test_pretrain_default_vectors():
    nlp.vocab.vectors = Vectors(shape=(10, 10))
    create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model)
+    # floret vectors are supported
+    nlp.vocab.vectors = Vectors(
+        data=get_current_ops().xp.zeros((10, 10)), mode="floret", hash_count=1
+    )
+    create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model)
    # error for no vectors
    with pytest.raises(ValueError, match="E875"):
        nlp.vocab.vectors = Vectors()
        create_pretrain_vectors(1, 1, "cosine")(
            nlp.vocab, nlp.get_pipe("tok2vec").model
        )
-    # error for floret vectors
-    with pytest.raises(ValueError, match="E850"):
-        ops = get_current_ops()
-        nlp.vocab.vectors = Vectors(
-            data=ops.xp.zeros((10, 10)), mode="floret", hash_count=1
-        )
-        create_pretrain_vectors(1, 1, "cosine")(
-            nlp.vocab, nlp.get_pipe("tok2vec").model
-        )

View File

@ -124,6 +124,10 @@ class DocBin:
        for key, group in doc.spans.items():
            for span in group:
                self.strings.add(span.label_)
+                if span.kb_id in span.doc.vocab.strings:
+                    self.strings.add(span.kb_id_)
+                if span.id in span.doc.vocab.strings:
+                    self.strings.add(span.id_)

    def get_docs(self, vocab: Vocab) -> Iterator[Doc]:
        """Recover Doc objects from the annotations, using the given vocab.

View File

@ -544,10 +544,6 @@ cdef class Doc:
        DOCS: https://spacy.io/api/doc#char_span
        """
-        if not isinstance(label, int):
-            label = self.vocab.strings.add(label)
-        if not isinstance(kb_id, int):
-            kb_id = self.vocab.strings.add(kb_id)
        alignment_modes = ("strict", "contract", "expand")
        if alignment_mode not in alignment_modes:
            raise ValueError(
@ -1350,6 +1346,10 @@ cdef class Doc:
        for group in self.spans.values():
            for span in group:
                strings.add(span.label_)
+                if span.kb_id in span.doc.vocab.strings:
+                    strings.add(span.kb_id_)
+                if span.id in span.doc.vocab.strings:
+                    strings.add(span.id_)
        # Msgpack doesn't distinguish between lists and tuples, which is
        # vexing for user data. As a best guess, we *know* that within
        # keys, we must have tuples. In values we just have to hope
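A minimal round-trip sketch (not part of the commit, mirroring the updated serialization tests) of the user-visible effect of the `Doc` string changes above: span label and kb_id strings survive deserialization into a fresh vocab.

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc = nlp("hello world !")
span = doc[0:2]
span.label_ = "GREETING"
span.kb_id_ = "GREETING_KB_ID"
doc.spans["content"] = [span]

# Round-trip through bytes into a completely new vocab.
new_doc = Doc(spacy.blank("en").vocab).from_bytes(doc.to_bytes())
assert new_doc.spans["content"][0].label_ == "GREETING"
assert new_doc.spans["content"][0].kb_id_ == "GREETING_KB_ID"
```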

View File

@ -460,9 +460,12 @@ cdef class Span:
                start = i
            if start >= self.end:
                break
-        if start < self.end:
-            yield Span(self.doc, start, self.end)
+            elif i == self.doc.length - 1:
+                yield Span(self.doc, start, self.doc.length)
+        # Ensure that trailing parts of the Span instance are included in last element of .sents.
+        if start == self.doc.length - 1:
+            yield Span(self.doc, start, self.doc.length)

    @property
    def ents(self):
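The `Span.sents` change above is easiest to see from the user side; here is a small self-contained check (mirroring the new tests earlier in this diff):

```python
from spacy.lang.en import English
from spacy.tokens import Doc, Span

doc = Doc(
    English().vocab,
    words=["This", "is", "a", "test.", "ENTITY"],
    sent_starts=[1, 0, 0, 0, 1],
)
doc.set_ents([Span(doc, 4, 5, "WORK")])

# The trailing entity does not end a full sentence, but .sents now still
# yields that final sentence instead of dropping it.
sents = list(doc.ents[0].sents)
assert len(sents) == 1
assert sents[0].text == "ENTITY"
```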

View File

@ -24,6 +24,7 @@ def pretrain(
    epoch_resume: Optional[int] = None,
    use_gpu: int = -1,
    silent: bool = True,
+    skip_last: bool = False,
):
    msg = Printer(no_print=silent)
    if config["training"]["seed"] is not None:
@ -60,10 +61,14 @@ def pretrain(
    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

-    def _save_model(epoch, is_temp=False):
+    def _save_model(epoch, is_temp=False, is_last=False):
        is_temp_str = ".temp" if is_temp else ""
        with model.use_params(optimizer.averages):
-            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
+            if is_last:
+                save_path = output_dir / f"model-last.bin"
+            else:
+                save_path = output_dir / f"model{epoch}{is_temp_str}.bin"
+            with (save_path).open("wb") as file_:
                file_.write(model.get_ref("tok2vec").to_bytes())
                log = {
                    "nr_word": tracker.nr_word,
@ -76,6 +81,7 @@ def pretrain(
    # TODO: I think we probably want this to look more like the
    # 'create_train_batches' function?
+    try:
        for epoch in range(epoch_resume, P["max_epochs"]):
            for batch_id, batch in enumerate(batcher(corpus(nlp))):
                docs = ensure_docs(batch)
@ -92,6 +98,9 @@ def pretrain(
            else:
                _save_model(epoch)
            tracker.epoch_loss = 0.0
+    finally:
+        if not skip_last:
+            _save_model(P["max_epochs"], is_last=True)


def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
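For context on the `skip_last` plumbing above, a hedged sketch of calling `pretrain` directly; the config path below is hypothetical and must point at a filled config with a `[pretraining]` block:

```python
from pathlib import Path

from spacy.training.pretrain import pretrain
from spacy.util import load_config

# Hypothetical config file with [paths], [training] and [pretraining] sections.
config = load_config("pretrain_config.cfg", interpolate=True)
output_dir = Path("pretrain_output")

# Writes model0.bin ... modelN.bin per epoch; with skip_last=False (the default)
# it now also writes model-last.bin when pretraining finishes or is interrupted.
pretrain(config, output_dir, use_gpu=-1, skip_last=False)
```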

View File

@ -1123,13 +1123,14 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [
```

| Name | Description |
| -------------------------------------------------- | ----------- |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
| `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ |
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
+| `--skip-last`, `-L` <Tag variant="new">3.5.2</Tag> | Skip saving `model-last.bin`. Defaults to `False`. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
| **CREATES** | The pretrained weights that can be used to initialize `spacy train`. |
@ -1255,7 +1256,7 @@ be provided.
> ```

| Name | Description |
| ------------------------ | ----------- |
| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
| `data_path` | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~ |
| `pipe_name` | Name of pipe to examine thresholds for. ~~str (positional)~~ |
@ -1265,7 +1266,7 @@ be provided.
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
-| `--silent`, `-V`, `-VV` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
+| `--verbose`, `-V`, `-VV` | Display more information for debugging purposes. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |

## assemble {id="assemble",tag="command"}

View File

@ -64,7 +64,7 @@ details on the architectures and their arguments and hyperparameters.
> config={
>     "model": DEFAULT_COREF_MODEL,
>     "span_cluster_prefix": DEFAULT_CLUSTER_PREFIX,
-> },
+> }
> nlp.add_pipe("experimental_coref", config=config)
> ```

View File

@ -20,8 +20,9 @@ output class probabilities are independent for each class. However, if you need
to predict at most one true class for a span, then use `spancat_singlelabel`. It
uses a `Softmax` layer and treats the task as a multi-class problem.

-Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the doc.
-Individual span scores can be found in `spangroup.attrs["scores"]`.
+Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the doc
+under `doc.spans[spans_key]`, where `spans_key` is a component config setting.
+Individual span scores are stored in `doc.spans[spans_key].attrs["scores"]`.

## Assigned Attributes {id="assigned-attributes"}
@ -29,7 +30,9 @@ Predictions will be saved to `Doc.spans[spans_key]` as a
[`SpanGroup`](/api/spangroup). The scores for the spans in the `SpanGroup` will
be saved in `SpanGroup.attrs["scores"]`.

-`spans_key` defaults to `"sc"`, but can be passed as a parameter.
+`spans_key` defaults to `"sc"`, but can be passed as a parameter. The `spancat`
+component will overwrite any existing spans under the spans key
+`doc.spans[spans_key]`.

| Location | Value |
| -------------------------------------- | -------------------------------------------------------- |

View File

@ -8,6 +8,13 @@ Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values instead of
integer IDs. This ensures that strings always map to the same ID, even from
different `StringStores`.
<Infobox variant ="warning">
Note that a `StringStore` instance is not static. It increases in size as texts
with new tokens are processed.
</Infobox>
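A tiny illustration (not from the commit) of the note in the infobox above: processing text with previously unseen tokens grows the `StringStore`.

```python
import spacy

nlp = spacy.blank("en")
n_before = len(nlp.vocab.strings)
nlp("a sentence with some previously unseen tokens")
# New token texts are interned as they are encountered.
assert len(nlp.vocab.strings) > n_before
```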
## StringStore.\_\_init\_\_ {id="init",tag="method"}

Create the `StringStore`.

View File

@ -25,7 +25,10 @@ and call the package's own `load()` method. If a pipeline is loaded from a path,
spaCy will assume it's a data directory, load its
[`config.cfg`](/api/data-formats#config) and use the language and pipeline
information to construct the `Language` class. The data will be loaded in via
-[`Language.from_disk`](/api/language#from_disk).
+[`Language.from_disk`](/api/language#from_disk). Loading a pipeline from a
+package will also import any custom code, if present, whereas loading from a
+directory does not. For these cases, you need to manually import your custom
+code.

<Infobox variant="warning" title="Changed in v3.0">
@ -291,7 +294,7 @@ the `manual=True` argument in `displacy.render`.

| Name        | Description                                                          |
| ----------- | -------------------------------------------------------------------- |
-| `orig_doc`  | Doc to parse dependencies. ~~Doc~~                                    |
+| `orig_doc`  | Doc or span to parse dependencies. ~~Union[Doc, Span]~~               |
| `options`   | Dependency parse specific visualisation options. ~~Dict[str, Any]~~  |
| **RETURNS** | Generated dependency parse keyed by words and arcs. ~~dict~~         |
@ -577,7 +580,7 @@ start decreasing across epochs.

> ```ini
> [training.logger]
> @loggers = "spacy.ConsoleLogger.v3"
-> progress_bar = "all_steps"
+> progress_bar = "eval"
> console_output = true
> output_file = "training_log.jsonl"
> ```

View File

@ -10,6 +10,13 @@ The `Vocab` object provides a lookup table that allows you to access
[`StringStore`](/api/stringstore). It also owns underlying C-data that is shared
between `Doc` objects.
<Infobox variant ="warning">
Note that a `Vocab` instance is not static. It increases in size as texts with
new tokens are processed.
</Infobox>
## Vocab.\_\_init\_\_ {id="init",tag="method"}

Create the vocabulary.

View File

@ -746,13 +746,16 @@ this by setting `initialize.init_tok2vec` to the filename of the `.bin` file
that you want to use from pretraining.

A pretraining step that runs for 5 epochs with an output path of `pretrain/`, as
-an example, produces `pretrain/model0.bin` through `pretrain/model4.bin`. To
-make use of the final output, you could fill in this value in your config file:
+an example, produces `pretrain/model0.bin` through `pretrain/model4.bin` plus a
+copy of the last iteration as `pretrain/model-last.bin`. Additionally, you can
+configure `n_save_epoch` to tell pretraining in which epoch interval it should
+save the current training progress. To use the final output to initialize your
+`tok2vec` layer, you could fill in this value in your config file:

```ini {title="config.cfg"}
[paths]
-init_tok2vec = "pretrain/model4.bin"
+init_tok2vec = "pretrain/model-last.bin"

[initialize]
init_tok2vec = ${paths.init_tok2vec}

View File

@ -1684,6 +1684,8 @@ def expand_person_entities(doc):
                new_ents.append(new_ent)
            else:
                new_ents.append(ent)
+        else:
+            new_ents.append(ent)
    doc.ents = new_ents
    return doc

View File

@ -758,6 +758,15 @@ any custom architectures, functions or
your pipeline and registered when it's loaded. See the documentation on
[saving and loading pipelines](/usage/saving-loading#models-custom) for details.
<Infobox variant="warning">
Note that the unpackaged models produced by `spacy train` are data directories
that **do not include custom code**. You need to import the code in your script
before loading in unpackaged models. For more details, see
[`spacy.load`](/api/top-level#spacy.load).
</Infobox>
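To make the note above concrete, here is a hedged sketch (the module name and path are hypothetical) of loading an unpackaged pipeline that relies on custom components:

```python
import spacy

# The unpackaged model directory does not bundle code, so the module that
# registers the custom factories has to be imported first.
import my_custom_components  # noqa: F401  (hypothetical module with @Language.factory registrations)

nlp = spacy.load("training/model-best")  # hypothetical output directory of `spacy train`
doc = nlp("Custom components are available because their factories were registered on import.")
```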
#### Example: Modifying the nlp object {id="custom-code-nlp-callbacks"}

For many use cases, you don't necessarily want to implement the whole `Language`

View File

@ -3215,6 +3215,51 @@
    "category": ["pipeline"],
    "tags": ["syllables", "multilingual"]
  },
{
"id": "sentimental-onix",
"title": "Sentimental Onix",
"slogan": "Use onnx for sentiment models",
"description": "spaCy pipeline component for sentiment analysis using onnx",
"github": "sloev/sentimental-onix",
"pip": "sentimental-onix",
"code_example": [
"# Download model:",
"# python -m sentimental_onix download en",
"import spacy",
"from sentimental_onix import pipeline",
"",
"nlp = spacy.load(\"en_core_web_sm\")",
"nlp.add_pipe(\"sentencizer\")",
"nlp.add_pipe(\"sentimental_onix\", after=\"sentencizer\")",
"",
"sentences = [",
" (sent.text, sent._.sentiment)",
" for doc in nlp.pipe(",
" [",
" \"i hate pasta on tuesdays\",",
" \"i like movies on wednesdays\",",
" \"i find your argument ridiculous\",",
" \"soda with straws are my favorite\",",
" ]",
" )",
" for sent in doc.sents",
"]",
"",
"assert sentences == [",
" (\"i hate pasta on tuesdays\", \"Negative\"),",
" (\"i like movies on wednesdays\", \"Positive\"),",
" (\"i find your argument ridiculous\", \"Negative\"),",
" (\"soda with straws are my favorite\", \"Positive\"),",
"]"
],
"thumb": "https://raw.githubusercontent.com/sloev/sentimental-onix/master/.github/onix.webp",
"author": "Johannes Valbjørn",
"author_links": {
"github": "sloev"
},
"category": ["pipeline"],
"tags": ["sentiment", "english"]
},
  {
    "id": "gobbli",
    "title": "gobbli",

View File

@ -111,11 +111,12 @@
    line-height: var(--line-height-xs)
    text-align: center

-    @include breakpoint(max, xs)
-        .list
-            display: none
-    .alert
-        display: none
+    @include breakpoint(max, md)
+        .alert
+            display: none
+    @include breakpoint(max, xs)
+        .list
+            display: none

.has-alert

View File

@ -25,11 +25,6 @@ const AlertSpace = ({ nightly, legacy }) => {
  const isOnline = useOnlineStatus()
  return (
    <>
-      {isOnline && (
-        <Alert title="💥 We'd love to learn more about your experience with spaCy!">
-          <Link to="https://form.typeform.com/to/aMel9q9f">Take our survey here.</Link>
-        </Alert>
-      )}
      {nightly && (
        <Alert
          title="You're viewing the pre-release docs."
@ -62,9 +57,15 @@ const AlertSpace = ({ nightly, legacy }) => {
  )
}

+// const navAlert = (
+//   <Link to="/usage/v3-5" noLinkLayout>
+//     <strong>💥 Out now:</strong> spaCy v3.5
+//   </Link>
+// )

const navAlert = (
-  <Link to="/usage/v3-5" noLinkLayout>
-    <strong>💥 Out now:</strong> spaCy v3.5
+  <Link to="https://form.typeform.com/to/aMel9q9f" noLinkLayout>
+    <strong>💥 Take the user survey!</strong>
  </Link>
)