From 9da333cbfa613fa49b9fab764bdb8f96105d059e Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 13 Mar 2023 13:13:47 +0100 Subject: [PATCH 01/29] Add GHA for CI tests (#12403) * Add GHA for CI tests * Reorder paths --- .github/workflows/tests.yml | 195 ++++++++++++++++++++++++++++++++++++ 1 file changed, 195 insertions(+) create mode 100644 .github/workflows/tests.yml diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 000000000..ad380d39a --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,195 @@ +name: tests + +on: + push: + branches-ignore: + - "spacy.io" + - "nightly.spacy.io" + - "v2.spacy.io" + paths-ignore: + - "*.md" + - "*.mdx" + - "website/docs/**" + - "website/src/**" + - "website/meta/*.tsx" + - "website/meta/*.mjs" + - "website/meta/languages.json" + - "website/meta/site.json" + - "website/meta/sidebars.json" + - "website/meta/type-annotations.json" + - "website/pages/**" + - ".github/workflows/**" + pull_request: + types: [opened, synchronize, reopened, edited] + paths: + - "!*.md" + - "!*.mdx" + - "!website/docs/**" + - "!website/src/**" + - "!website/meta/*.tsx" + - "!website/meta/*.mjs" + - "!website/meta/languages.json" + - "!website/meta/site.json" + - "!website/meta/sidebars.json" + - "!website/meta/type-annotations.json" + - "!website/pages/**" + - "!.github/workflows/**" + - ".github/workflows/tests.yml" + +jobs: + validate: + name: Validate + runs-on: ubuntu-latest + steps: + - name: Check out repo + uses: actions/checkout@v3 + + - name: Configure Python version + uses: actions/setup-python@v4 + with: + python-version: "3.7" + architecture: x64 + + - name: black + run: | + python -m pip install black -c requirements.txt + python -m black spacy --check + - name: flake8 + run: | + python -m pip install flake8==5.0.4 + python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics + tests: + name: Test + needs: Validate + strategy: + fail-fast: true + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python_version: ["3.11"] + include: + - os: ubuntu-20.04 + python_version: "3.6" + - os: windows-latest + python_version: "3.7" + - os: macos-latest + python_version: "3.8" + - os: ubuntu-latest + python_version: "3.9" + - os: windows-latest + python_version: "3.10" + + runs-on: ${{ matrix.os }} + + steps: + - name: Check out repo + uses: actions/checkout@v3 + + - name: Configure Python version + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python_version }} + architecture: x64 + + - name: Install dependencies + run: | + python -m pip install -U build pip setuptools + python -m pip install -U -r requirements.txt + + - name: Build sdist + run: | + python -m build --sdist + + - name: Run mypy + run: | + python -m mypy spacy + if: matrix.python_version != '3.6' + + - name: Delete source directory and .egg-info + run: | + rm -rf spacy *.egg-info + shell: bash + + - name: Uninstall all packages + run: | + python -m pip freeze + python -m pip freeze --exclude pywin32 > installed.txt + python -m pip uninstall -y -r installed.txt + + - name: Install from sdist + run: | + SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) + SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST + shell: bash + + - name: Test import + run: python -W error -c "import spacy" + + - name: "Test download CLI" + run: | + python -m spacy download ca_core_news_sm + python -m spacy download ca_core_news_md + python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); 
doc=nlp('test')" + if: matrix.python_version == '3.8' + + - name: "Test download_url in info CLI" + run: | + python -W error -m spacy info ca_core_news_sm | grep -q download_url + if: matrix.python_version == '3.8' + + - name: "Test no warnings on load (#11713)" + run: | + python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" + if: matrix.python_version == '3.8' + + - name: "Test convert CLI" + run: | + python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . + if: matrix.python_version == '3.8' + + - name: "Test debug config CLI" + run: | + python -m spacy init config -p ner -l ca ner.cfg + python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy + if: matrix.python_version == '3.8' + + - name: "Test debug data CLI" + run: | + # will have errors due to sparse data, check for summary in output + python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary + if: matrix.python_version == '3.8' + + - name: "Test train CLI" + run: | + python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1 + if: matrix.python_version == '3.8' + + - name: "Test assemble CLI" + run: | + python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" + PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir + if: matrix.python_version == '3.8' + + - name: "Test assemble CLI vectors warning" + run: | + python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" + python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 + if: matrix.python_version == '3.8' + + - name: "Install test requirements" + run: | + python -m pip install -U -r requirements.txt + + - name: "Run CPU tests" + run: | + python -m pytest --pyargs spacy -W error + + - name: "Run CPU tests with thinc-apple-ops" + run: | + python -m pip install 'spacy[apple]' + python -m pytest --pyargs spacy + if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.8' + + - run: | + python .github/validate_universe_json.py website/meta/universe.json + name: "Test website/meta/universe.json" + if: matrix.python_version == '3.8' From ed83cafe46d973ca42d3798348d750a1156feab9 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 13 Mar 2023 14:21:17 +0100 Subject: [PATCH 02/29] CI: Move universe validation to validate job (#12406) * CI: Move universe validation to validate job * Fix indentation * Update step name --- .github/azure-steps.yml | 6 ------ .github/workflows/tests.yml | 8 +++----- azure-pipelines.yml | 3 +++ 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index b2ccf3d81..1b8d81521 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -116,9 +116,3 @@ steps: python -m pytest --pyargs spacy displayName: "Run CPU tests with thinc-apple-ops" condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11')) - - - script: | - python .github/validate_universe_json.py website/meta/universe.json - displayName: 'Test 
website/meta/universe.json' - condition: eq(variables['python_version'], '3.8') - diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ad380d39a..b04e2a8c0 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -58,6 +58,9 @@ jobs: run: | python -m pip install flake8==5.0.4 python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics + - name: Validate website/meta/universe.json + run: | + python .github/validate_universe_json.py website/meta/universe.json tests: name: Test needs: Validate @@ -188,8 +191,3 @@ jobs: python -m pip install 'spacy[apple]' python -m pytest --pyargs spacy if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.8' - - - run: | - python .github/validate_universe_json.py website/meta/universe.json - name: "Test website/meta/universe.json" - if: matrix.python_version == '3.8' diff --git a/azure-pipelines.yml b/azure-pipelines.yml index dba11bd1a..83c57a164 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -48,6 +48,9 @@ jobs: pip install flake8==5.0.4 python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics displayName: "flake8" + - script: | + python .github/validate_universe_json.py website/meta/universe.json + displayName: 'Validate website/meta/universe.json' - job: "Test" dependsOn: "Validate" From 9ca67dc5394a9401fe293b60ddce23372116a270 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 13 Mar 2023 15:10:04 +0100 Subject: [PATCH 03/29] Fix thinc-apple-ops test to run for python 3.11 (#12408) --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b04e2a8c0..880c09128 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -190,4 +190,4 @@ jobs: run: | python -m pip install 'spacy[apple]' python -m pytest --pyargs spacy - if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.8' + if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11' \ No newline at end of file From d00e58d1ac7507c15d5524bb273f2b537baba1b6 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 13 Mar 2023 15:14:46 +0100 Subject: [PATCH 04/29] CI: Move CLI tests to ubuntu for speed (#12409) --- .github/azure-steps.yml | 18 +++++++++--------- .github/workflows/tests.yml | 18 +++++++++--------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index 1b8d81521..20d4582cb 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -57,51 +57,51 @@ steps: python -m spacy download ca_core_news_md python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" displayName: 'Test download CLI' - condition: eq(variables['python_version'], '3.8') + condition: eq(variables['python_version'], '3.9') - script: | python -W error -m spacy info ca_core_news_sm | grep -q download_url displayName: 'Test download_url in info CLI' - condition: eq(variables['python_version'], '3.8') + condition: eq(variables['python_version'], '3.9') - script: | python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" displayName: 'Test no warnings on load (#11713)' - condition: eq(variables['python_version'], '3.8') + condition: eq(variables['python_version'], '3.9') - script: | python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . 
displayName: 'Test convert CLI' - condition: eq(variables['python_version'], '3.8') + condition: eq(variables['python_version'], '3.9') - script: | python -m spacy init config -p ner -l ca ner.cfg python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy displayName: 'Test debug config CLI' - condition: eq(variables['python_version'], '3.8') + condition: eq(variables['python_version'], '3.9') - script: | # will have errors due to sparse data, check for summary in output python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary displayName: 'Test debug data CLI' - condition: eq(variables['python_version'], '3.8') + condition: eq(variables['python_version'], '3.9') - script: | python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1 displayName: 'Test train CLI' - condition: eq(variables['python_version'], '3.8') + condition: eq(variables['python_version'], '3.9') - script: | python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir displayName: 'Test assemble CLI' - condition: eq(variables['python_version'], '3.8') + condition: eq(variables['python_version'], '3.9') - script: | python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 displayName: 'Test assemble CLI vectors warning' - condition: eq(variables['python_version'], '3.8') + condition: eq(variables['python_version'], '3.9') - script: | python -m pip install -U -r requirements.txt diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 880c09128..e51bb6c17 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -132,51 +132,51 @@ jobs: python -m spacy download ca_core_news_sm python -m spacy download ca_core_news_md python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" - if: matrix.python_version == '3.8' + if: matrix.python_version == '3.9' - name: "Test download_url in info CLI" run: | python -W error -m spacy info ca_core_news_sm | grep -q download_url - if: matrix.python_version == '3.8' + if: matrix.python_version == '3.9' - name: "Test no warnings on load (#11713)" run: | python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" - if: matrix.python_version == '3.8' + if: matrix.python_version == '3.9' - name: "Test convert CLI" run: | python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . 
- if: matrix.python_version == '3.8' + if: matrix.python_version == '3.9' - name: "Test debug config CLI" run: | python -m spacy init config -p ner -l ca ner.cfg python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy - if: matrix.python_version == '3.8' + if: matrix.python_version == '3.9' - name: "Test debug data CLI" run: | # will have errors due to sparse data, check for summary in output python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary - if: matrix.python_version == '3.8' + if: matrix.python_version == '3.9' - name: "Test train CLI" run: | python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1 - if: matrix.python_version == '3.8' + if: matrix.python_version == '3.9' - name: "Test assemble CLI" run: | python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir - if: matrix.python_version == '3.8' + if: matrix.python_version == '3.9' - name: "Test assemble CLI vectors warning" run: | python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 - if: matrix.python_version == '3.8' + if: matrix.python_version == '3.9' - name: "Install test requirements" run: | From 545218a7d9763df60e300e16a489a4169242cf9c Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 14 Mar 2023 10:21:53 +0100 Subject: [PATCH 05/29] Fix sentence indexing bug in `Span.sents` (#12405) * Add test for partial sentences in ent.sents. * Removed unneeded import. * Format. Simplify code. --- spacy/tests/doc/test_span.py | 16 ++++++++++++++++ spacy/tokens/span.pyx | 5 ++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index b4631037a..adef5922f 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -700,3 +700,19 @@ def test_span_group_copy(doc): assert len(doc.spans["test"]) == 3 # check that the copy spans were not modified and this is an isolated doc assert len(doc_copy.spans["test"]) == 2 + + +def test_for_partial_ent_sents(): + """Spans may be associated with multiple sentences. These .sents should always be complete, not partial, sentences, + which this tests for. + """ + doc = Doc( + English().vocab, + words=["Mahler's", "Symphony", "No.", "8", "was", "beautiful."], + sent_starts=[1, 0, 0, 1, 0, 0], + ) + doc.set_ents([Span(doc, 1, 4, "WORK")]) + # The specified entity is associated with both sentences in this doc, so we expect all sentences in the doc to be + # equal to the sentences referenced in ent.sents. 
+ for doc_sent, ent_sent in zip(doc.sents, doc.ents[0].sents): + assert doc_sent == ent_sent diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index cfe1236df..7750b16ed 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -460,9 +460,8 @@ cdef class Span: start = i if start >= self.end: break - if start < self.end: - yield Span(self.doc, start, self.end) - + elif i == self.doc.length - 1: + yield Span(self.doc, start, self.doc.length) @property def ents(self): From 7880da952bbe459a2fdcecd74fd899dd05da2fe3 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 14 Mar 2023 16:06:08 +0100 Subject: [PATCH 06/29] CI: Add all paths before excluding patterns (#12419) --- .github/workflows/tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index e51bb6c17..c18f9cd23 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -22,6 +22,7 @@ on: pull_request: types: [opened, synchronize, reopened, edited] paths: + - "**" - "!*.md" - "!*.mdx" - "!website/docs/**" @@ -190,4 +191,4 @@ jobs: run: | python -m pip install 'spacy[apple]' python -m pytest --pyargs spacy - if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11' \ No newline at end of file + if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11' From be644caa135c49a19a41431305545ac4e4decb3d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 14 Mar 2023 17:16:49 +0100 Subject: [PATCH 07/29] Fix --verbose for spacy find-threshold (#12418) --- spacy/cli/find_threshold.py | 2 +- website/docs/api/cli.mdx | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index efa664832..6d591053d 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -35,7 +35,7 @@ def find_threshold_cli( code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"), gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"), - verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"), + verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), # fmt: on ): """ diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 3f31bef95..2bb0199fc 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -1254,19 +1254,19 @@ be provided. > $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f > ``` -| Name | Description | -| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ | -| `data_path` | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~ | -| `pipe_name` | Name of pipe to examine thresholds for. ~~str (positional)~~ | -| `threshold_key` | Key of threshold attribute in component's configuration. ~~str (positional)~~ | -| `scores_key` | Name of score to metric to optimize. 
~~str (positional)~~ | -| `--n_trials`, `-n` | Number of trials to determine optimal thresholds. ~~int (option)~~ | -| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | -| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | -| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ | -| `--silent`, `-V`, `-VV` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| Name | Description | +| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ | +| `data_path` | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~ | +| `pipe_name` | Name of pipe to examine thresholds for. ~~str (positional)~~ | +| `threshold_key` | Key of threshold attribute in component's configuration. ~~str (positional)~~ | +| `scores_key` | Name of score to metric to optimize. ~~str (positional)~~ | +| `--n_trials`, `-n` | Number of trials to determine optimal thresholds. ~~int (option)~~ | +| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | +| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ | +| `--verbose`, `-V`, `-VV` | Display more information for debugging purposes. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | ## assemble {id="assemble",tag="command"} From bd0768c05c3b91b82b596eab4b46155e37944516 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 14 Mar 2023 22:02:49 +0100 Subject: [PATCH 08/29] Fix EL failure with sentence-crossing entities (#12398) * Add test reproducing EL failure in sentence-crossing entities. * Format. * Draft fix. * Format. * Fix case for len(ent.sents) == 1. * Format. * Format. * Format. * Fix mypy error. * Merge EL sentence crossing tests. * Remove unneeded sentencizer component. * Fix or ignore mypy issues in test. * Simplify ent.sents handling. * Format. Update assert in ent.sents handling. 
* Small rewrite --------- Co-authored-by: Sofie Van Landeghem --- spacy/pipeline/entity_linker.py | 14 ++++-- spacy/tests/pipeline/test_entity_linker.py | 50 ++++++++-------------- 2 files changed, 29 insertions(+), 35 deletions(-) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index f2dae0529..76ccc3247 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -474,18 +474,24 @@ class EntityLinker(TrainablePipe): # Looping through each entity in batch (TODO: rewrite) for j, ent in enumerate(ent_batch): - sent_index = sentences.index(ent.sent) - assert sent_index >= 0 + assert hasattr(ent, "sents") + sents = list(ent.sents) + sent_indices = ( + sentences.index(sents[0]), + sentences.index(sents[-1]), + ) + assert sent_indices[1] >= sent_indices[0] >= 0 if self.incl_context: # get n_neighbour sentences, clipped to the length of the document - start_sentence = max(0, sent_index - self.n_sents) + start_sentence = max(0, sent_indices[0] - self.n_sents) end_sentence = min( - len(sentences) - 1, sent_index + self.n_sents + len(sentences) - 1, sent_indices[1] + self.n_sents ) start_token = sentences[start_sentence].start end_token = sentences[end_sentence].end sent_doc = doc[start_token:end_token].as_doc() + # currently, the context is the same for each entity in a sentence (should be refined) sentence_encoding = self.model.predict([sent_doc])[0] sentence_encoding_t = sentence_encoding.T diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 2a6258386..fc960cb01 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1,9 +1,9 @@ -from typing import Callable, Iterable, Dict, Any +from typing import Callable, Iterable, Dict, Any, Tuple import pytest from numpy.testing import assert_equal -from spacy import registry, util +from spacy import registry, util, Language from spacy.attrs import ENT_KB_ID from spacy.compat import pickle from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase @@ -108,18 +108,23 @@ def test_issue7065(): @pytest.mark.issue(7065) -def test_issue7065_b(): +@pytest.mark.parametrize("entity_in_first_sentence", [True, False]) +def test_sentence_crossing_ents(entity_in_first_sentence: bool): + """Tests if NEL crashes if entities cross sentence boundaries and the first associated sentence doesn't have an + entity. + entity_in_prior_sentence (bool): Whether to include an entity in the first sentence associated with the + sentence-crossing entity. + """ # Test that the NEL doesn't crash when an entity crosses a sentence boundary nlp = English() vector_length = 3 - nlp.add_pipe("sentencizer") text = "Mahler 's Symphony No. 8 was beautiful." 
- entities = [(0, 6, "PERSON"), (10, 24, "WORK")] - links = { - (0, 6): {"Q7304": 1.0, "Q270853": 0.0}, - (10, 24): {"Q7304": 0.0, "Q270853": 1.0}, - } - sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0] + entities = [(10, 24, "WORK")] + links = {(10, 24): {"Q7304": 0.0, "Q270853": 1.0}} + if entity_in_first_sentence: + entities.append((0, 6, "PERSON")) + links[(0, 6)] = {"Q7304": 1.0, "Q270853": 0.0} + sent_starts = [1, -1, 0, 0, 0, 1, 0, 0, 0] doc = nlp(text) example = Example.from_dict( doc, {"entities": entities, "links": links, "sent_starts": sent_starts} @@ -145,31 +150,14 @@ def test_issue7065_b(): # Create the Entity Linker component and add it to the pipeline entity_linker = nlp.add_pipe("entity_linker", last=True) - entity_linker.set_kb(create_kb) + entity_linker.set_kb(create_kb) # type: ignore # train the NEL pipe optimizer = nlp.initialize(get_examples=lambda: train_examples) for i in range(2): - losses = {} - nlp.update(train_examples, sgd=optimizer, losses=losses) + nlp.update(train_examples, sgd=optimizer) - # Add a custom rule-based component to mimick NER - patterns = [ - {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]}, - { - "label": "WORK", - "pattern": [ - {"LOWER": "symphony"}, - {"LOWER": "no"}, - {"LOWER": "."}, - {"LOWER": "8"}, - ], - }, - ] - ruler = nlp.add_pipe("entity_ruler", before="entity_linker") - ruler.add_patterns(patterns) - # test the trained model - this should not throw E148 - doc = nlp(text) - assert doc + # This shouldn't crash. + entity_linker.predict([example.reference]) # type: ignore def test_no_entities(): From 6183906a0bfc07852c33a1e1928c6491f8e4e462 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 17 Mar 2023 09:35:00 +0100 Subject: [PATCH 09/29] Remove autoblack workflow (#12437) Now that all PRs have `black` formatting validation, we no longer need the autoblack workflow. --- .github/workflows/autoblack.yml | 45 --------------------------------- 1 file changed, 45 deletions(-) delete mode 100644 .github/workflows/autoblack.yml diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml deleted file mode 100644 index 555322782..000000000 --- a/.github/workflows/autoblack.yml +++ /dev/null @@ -1,45 +0,0 @@ -# GitHub Action that uses Black to reformat all Python code and submits a PR -# in regular intervals. 
Inspired by: https://github.com/cclauss/autoblack - -name: autoblack -on: - workflow_dispatch: # allow manual trigger - schedule: - - cron: '0 8 * * 5' # every Friday at 8am UTC - -jobs: - autoblack: - if: github.repository_owner == 'explosion' - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - with: - ref: ${{ github.head_ref }} - - uses: actions/setup-python@v4 - - run: pip install black -c requirements.txt - - name: Auto-format code if needed - run: black spacy - # We can't run black --check here because that returns a non-zero excit - # code and makes GitHub think the action failed - - name: Check for modified files - id: git-check - run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT - - - name: Create Pull Request - if: steps.git-check.outputs.modified == 'true' - uses: peter-evans/create-pull-request@v4 - with: - title: Auto-format code with black - labels: meta - commit-message: Auto-format code with black - committer: GitHub - author: explosion-bot - body: _This PR is auto-generated._ - branch: autoblack - delete-branch: true - draft: false - - name: Check outputs - if: steps.git-check.outputs.modified == 'true' - run: | - echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}" - echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}" From f9c0220ea567b4f4415a71deefc467f21bb0d9dd Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 17 Mar 2023 10:01:49 +0100 Subject: [PATCH 10/29] CI: Switch PR back to paths-ignore (#12438) Switch PR tests back to paths-ignore but include changes to `.github` for all PRs rather than trying to figure out complicated includes+excludes. Changes to `.github` are relatively rare and should not be a huge burden for the CI. 
--- .github/workflows/tests.yml | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index c18f9cd23..eef24ff33 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -21,21 +21,18 @@ on: - ".github/workflows/**" pull_request: types: [opened, synchronize, reopened, edited] - paths: - - "**" - - "!*.md" - - "!*.mdx" - - "!website/docs/**" - - "!website/src/**" - - "!website/meta/*.tsx" - - "!website/meta/*.mjs" - - "!website/meta/languages.json" - - "!website/meta/site.json" - - "!website/meta/sidebars.json" - - "!website/meta/type-annotations.json" - - "!website/pages/**" - - "!.github/workflows/**" - - ".github/workflows/tests.yml" + paths-ignore: + - "*.md" + - "*.mdx" + - "website/docs/**" + - "website/src/**" + - "website/meta/*.tsx" + - "website/meta/*.mjs" + - "website/meta/languages.json" + - "website/meta/site.json" + - "website/meta/sidebars.json" + - "website/meta/type-annotations.json" + - "website/pages/**" jobs: validate: From f1a42b6fcc3b331a09140633194eacd537e2b458 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 17 Mar 2023 10:59:53 +0100 Subject: [PATCH 11/29] CI: Separate spacy universe validation into a separate workflow (#12440) * Separate spacy universe validation into a separate workflow * Fix new workflow name --- .github/workflows/tests.yml | 23 ++-------------- .github/workflows/universe_validation.yml | 32 +++++++++++++++++++++++ 2 files changed, 34 insertions(+), 21 deletions(-) create mode 100644 .github/workflows/universe_validation.yml diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index eef24ff33..41ea6ce50 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -9,30 +9,14 @@ on: paths-ignore: - "*.md" - "*.mdx" - - "website/docs/**" - - "website/src/**" - - "website/meta/*.tsx" - - "website/meta/*.mjs" - - "website/meta/languages.json" - - "website/meta/site.json" - - "website/meta/sidebars.json" - - "website/meta/type-annotations.json" - - "website/pages/**" + - "website/**" - ".github/workflows/**" pull_request: types: [opened, synchronize, reopened, edited] paths-ignore: - "*.md" - "*.mdx" - - "website/docs/**" - - "website/src/**" - - "website/meta/*.tsx" - - "website/meta/*.mjs" - - "website/meta/languages.json" - - "website/meta/site.json" - - "website/meta/sidebars.json" - - "website/meta/type-annotations.json" - - "website/pages/**" + - "website/**" jobs: validate: @@ -56,9 +40,6 @@ jobs: run: | python -m pip install flake8==5.0.4 python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics - - name: Validate website/meta/universe.json - run: | - python .github/validate_universe_json.py website/meta/universe.json tests: name: Test needs: Validate diff --git a/.github/workflows/universe_validation.yml b/.github/workflows/universe_validation.yml new file mode 100644 index 000000000..f9e317aaa --- /dev/null +++ b/.github/workflows/universe_validation.yml @@ -0,0 +1,32 @@ +name: universe validation + +on: + push: + branches-ignore: + - "spacy.io" + - "nightly.spacy.io" + - "v2.spacy.io" + paths: + - "website/meta/universe.json" + pull_request: + types: [opened, synchronize, reopened, edited] + paths: + - "website/meta/universe.json" + +jobs: + validate: + name: Validate + runs-on: ubuntu-latest + steps: + - name: Check out repo + uses: actions/checkout@v3 + + - name: Configure Python version + uses: actions/setup-python@v4 + with: + 
python-version: "3.7" + architecture: x64 + + - name: Validate website/meta/universe.json + run: | + python .github/validate_universe_json.py website/meta/universe.json From d2d9e9e139cca82edb07685d60c02e9c1cf728bf Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 22 Mar 2023 11:09:37 +0100 Subject: [PATCH 12/29] Add user survey alert to the top (#12452) * Add user survey alert to the top * Shorter --------- Co-authored-by: Sofie Van Landeghem --- website/src/styles/navigation.module.sass | 7 ++++--- website/src/templates/index.js | 15 ++++++++------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/website/src/styles/navigation.module.sass b/website/src/styles/navigation.module.sass index da5c18b6f..3adc5cd03 100644 --- a/website/src/styles/navigation.module.sass +++ b/website/src/styles/navigation.module.sass @@ -111,11 +111,12 @@ line-height: var(--line-height-xs) text-align: center -@include breakpoint(max, xs) - .list +@include breakpoint(max, md) + .alert display: none - .alert +@include breakpoint(max, xs) + .list display: none .has-alert diff --git a/website/src/templates/index.js b/website/src/templates/index.js index 2ee29a9e9..4c10e09c5 100644 --- a/website/src/templates/index.js +++ b/website/src/templates/index.js @@ -25,11 +25,6 @@ const AlertSpace = ({ nightly, legacy }) => { const isOnline = useOnlineStatus() return ( <> - {isOnline && ( - - Take our survey here. - - )} {nightly && ( { ) } +// const navAlert = ( +// +// 💥 Out now: spaCy v3.5 +// +// ) + const navAlert = ( - - 💥 Out now: spaCy v3.5 + + 💥 Take the user survey! ) From 2953e7b7ce74b3451f099eb918eb12459976cb27 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 24 Mar 2023 16:28:51 +0100 Subject: [PATCH 13/29] Support floret for PretrainVectors (#12435) * Support floret for PretrainVectors * Format --- spacy/errors.py | 4 +-- spacy/ml/models/multi_task.py | 31 +++++++++++++++--------- spacy/tests/training/test_pretraining.py | 16 +++++------- 3 files changed, 27 insertions(+), 24 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index c897c29ff..40cfa8d92 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -549,8 +549,8 @@ class Errors(metaclass=ErrorsWithCodes): "during training, make sure to include it in 'annotating components'") # New errors added in v3.x - E850 = ("The PretrainVectors objective currently only supports default " - "vectors, not {mode} vectors.") + E850 = ("The PretrainVectors objective currently only supports default or " + "floret vectors, not {mode} vectors.") E851 = ("The 'textcat' component labels should only have values of 0 or 1, " "but found value of '{val}'.") E852 = ("The tar file pulled from the remote attempted an unsafe path " diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index 826fddd4f..7eb13b608 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -1,5 +1,5 @@ from typing import Any, Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING, cast -from thinc.types import Floats2d +from thinc.types import Floats2d, Ints1d from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model from thinc.api import MultiSoftmax, list2array from thinc.api import to_categorical, CosineDistance, L2Distance @@ -7,7 +7,7 @@ from thinc.loss import Loss from ...util import registry, OOV_RANK from ...errors import Errors -from ...attrs import ID +from ...attrs import ID, ORTH from ...vectors import Mode as VectorsMode import numpy @@ -24,8 +24,6 @@ def create_pretrain_vectors( 
maxout_pieces: int, hidden_size: int, loss: str ) -> Callable[["Vocab", Model], Model]: def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model: - if vocab.vectors.mode != VectorsMode.default: - raise ValueError(Errors.E850.format(mode=vocab.vectors.mode)) if vocab.vectors.shape[1] == 0: raise ValueError(Errors.E875) model = build_cloze_multi_task_model( @@ -70,14 +68,23 @@ def get_vectors_loss(ops, docs, prediction, distance): """Compute a loss based on a distance between the documents' vectors and the prediction. """ - # The simplest way to implement this would be to vstack the - # token.vector values, but that's a bit inefficient, especially on GPU. - # Instead we fetch the index into the vectors table for each of our tokens, - # and look them up all at once. This prevents data copying. - ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) - target = docs[0].vocab.vectors.data[ids] - target[ids == OOV_RANK] = 0 - d_target, loss = distance(prediction, target) + vocab = docs[0].vocab + if vocab.vectors.mode == VectorsMode.default: + # The simplest way to implement this would be to vstack the + # token.vector values, but that's a bit inefficient, especially on GPU. + # Instead we fetch the index into the vectors table for each of our + # tokens, and look them up all at once. This prevents data copying. + ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) + target = docs[0].vocab.vectors.data[ids] + target[ids == OOV_RANK] = 0 + d_target, loss = distance(prediction, target) + elif vocab.vectors.mode == VectorsMode.floret: + keys = ops.flatten([cast(Ints1d, doc.to_array(ORTH)) for doc in docs]) + target = vocab.vectors.get_batch(keys) + target = ops.as_contig(target) + d_target, loss = distance(prediction, target) + else: + raise ValueError(Errors.E850.format(mode=vocab.vectors.mode)) return loss, d_target diff --git a/spacy/tests/training/test_pretraining.py b/spacy/tests/training/test_pretraining.py index c0d64f1e7..d1db92de5 100644 --- a/spacy/tests/training/test_pretraining.py +++ b/spacy/tests/training/test_pretraining.py @@ -359,19 +359,15 @@ def test_pretrain_default_vectors(): nlp.vocab.vectors = Vectors(shape=(10, 10)) create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model) + # floret vectors are supported + nlp.vocab.vectors = Vectors( + data=get_current_ops().xp.zeros((10, 10)), mode="floret", hash_count=1 + ) + create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model) + # error for no vectors with pytest.raises(ValueError, match="E875"): nlp.vocab.vectors = Vectors() create_pretrain_vectors(1, 1, "cosine")( nlp.vocab, nlp.get_pipe("tok2vec").model ) - - # error for floret vectors - with pytest.raises(ValueError, match="E850"): - ops = get_current_ops() - nlp.vocab.vectors = Vectors( - data=ops.xp.zeros((10, 10)), mode="floret", hash_count=1 - ) - create_pretrain_vectors(1, 1, "cosine")( - nlp.vocab, nlp.get_pipe("tok2vec").model - ) From 4380d750f96a4c9d29a62e5b872c597ebdb09462 Mon Sep 17 00:00:00 2001 From: kadarakos Date: Mon, 27 Mar 2023 10:27:11 +0200 Subject: [PATCH 14/29] add explanation about overwriting behaviour (#12464) * add explanation about overwriting behaviour * Update website/docs/api/spancategorizer.mdx Co-authored-by: Adriane Boyd * Update website/docs/api/spancategorizer.mdx Co-authored-by: Adriane Boyd * Update website/docs/api/spancategorizer.mdx Co-authored-by: Adriane Boyd * format --------- Co-authored-by: Adriane Boyd --- website/docs/api/spancategorizer.mdx | 9 ++++++--- 
1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/website/docs/api/spancategorizer.mdx b/website/docs/api/spancategorizer.mdx index c7de2324b..f54a8687b 100644 --- a/website/docs/api/spancategorizer.mdx +++ b/website/docs/api/spancategorizer.mdx @@ -20,8 +20,9 @@ output class probabilities are independent for each class. However, if you need to predict at most one true class for a span, then use `spancat_singlelabel`. It uses a `Softmax` layer and treats the task as a multi-class problem. -Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the doc. -Individual span scores can be found in `spangroup.attrs["scores"]`. +Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the doc +under `doc.spans[spans_key]`, where `spans_key` is a component config setting. +Individual span scores are stored in `doc.spans[spans_key].attrs["scores"]`. ## Assigned Attributes {id="assigned-attributes"} @@ -29,7 +30,9 @@ Predictions will be saved to `Doc.spans[spans_key]` as a [`SpanGroup`](/api/spangroup). The scores for the spans in the `SpanGroup` will be saved in `SpanGroup.attrs["scores"]`. -`spans_key` defaults to `"sc"`, but can be passed as a parameter. +`spans_key` defaults to `"sc"`, but can be passed as a parameter. The `spancat` +component will overwrite any existing spans under the spans key +`doc.spans[spans_key]`. | Location | Value | | -------------------------------------- | -------------------------------------------------------- | From 0ecbeff1a646036764745dc1fd176f35f731b49b Mon Sep 17 00:00:00 2001 From: Prajakta Darade <107802412+prajakta-1527@users.noreply.github.com> Date: Mon, 27 Mar 2023 15:02:49 +0530 Subject: [PATCH 15/29] corrected example code (#12466) --- website/docs/api/coref.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/coref.mdx b/website/docs/api/coref.mdx index 8647f35d1..0b9ebb888 100644 --- a/website/docs/api/coref.mdx +++ b/website/docs/api/coref.mdx @@ -64,7 +64,7 @@ details on the architectures and their arguments and hyperparameters. 
> config={ > "model": DEFAULT_COREF_MODEL, > "span_cluster_prefix": DEFAULT_CLUSTER_PREFIX, -> }, +> } > nlp.add_pipe("experimental_coref", config=config) > ``` From 79dcef17f758eaa84c9044272a1c5c037b60dd22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?sloev=20/=20Johannes=20Valbj=C3=B8rn?= Date: Mon, 27 Mar 2023 11:35:14 +0200 Subject: [PATCH 16/29] add spacy_onnx_sentiment_english to universe (#12422) * add spacy_onnx_sentiment_english to universe * rename to sentimental-onix * fix comma json error * fix typo * typo fix Co-authored-by: Adriane Boyd * mention need to download model before example works Co-authored-by: Adriane Boyd --------- Co-authored-by: Adriane Boyd --- website/meta/universe.json | 45 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 16e3bc361..5fd1c2287 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -3215,6 +3215,51 @@ "category": ["pipeline"], "tags": ["syllables", "multilingual"] }, + { + "id": "sentimental-onix", + "title": "Sentimental Onix", + "slogan": "Use onnx for sentiment models", + "description": "spaCy pipeline component for sentiment analysis using onnx", + "github": "sloev/sentimental-onix", + "pip": "sentimental-onix", + "code_example": [ + "# Download model:", + "# python -m sentimental_onix download en", + "import spacy", + "from sentimental_onix import pipeline", + "", + "nlp = spacy.load(\"en_core_web_sm\")", + "nlp.add_pipe(\"sentencizer\")", + "nlp.add_pipe(\"sentimental_onix\", after=\"sentencizer\")", + "", + "sentences = [", + " (sent.text, sent._.sentiment)", + " for doc in nlp.pipe(", + " [", + " \"i hate pasta on tuesdays\",", + " \"i like movies on wednesdays\",", + " \"i find your argument ridiculous\",", + " \"soda with straws are my favorite\",", + " ]", + " )", + " for sent in doc.sents", + "]", + "", + "assert sentences == [", + " (\"i hate pasta on tuesdays\", \"Negative\"),", + " (\"i like movies on wednesdays\", \"Positive\"),", + " (\"i find your argument ridiculous\", \"Negative\"),", + " (\"soda with straws are my favorite\", \"Positive\"),", + "]" + ], + "thumb": "https://raw.githubusercontent.com/sloev/sentimental-onix/master/.github/onix.webp", + "author": "Johannes Valbjørn", + "author_links": { + "github": "sloev" + }, + "category": ["pipeline"], + "tags": ["sentiment", "english"] + }, { "id": "gobbli", "title": "gobbli", From 1b4a67bc5467f8a18d70e452b35ce3fdaaf2459b Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 27 Mar 2023 12:44:04 +0200 Subject: [PATCH 17/29] Restrict github workflows to explosion (#12470) --- .github/workflows/explosionbot.yml | 1 + .github/workflows/issue-manager.yml | 1 + .github/workflows/lock.yml | 1 + .github/workflows/spacy_universe_alert.yml | 1 + .github/workflows/tests.yml | 1 + .github/workflows/universe_validation.yml | 1 + 6 files changed, 6 insertions(+) diff --git a/.github/workflows/explosionbot.yml b/.github/workflows/explosionbot.yml index 6b472cd12..910cfdc40 100644 --- a/.github/workflows/explosionbot.yml +++ b/.github/workflows/explosionbot.yml @@ -8,6 +8,7 @@ on: jobs: explosion-bot: + if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - name: Dump GitHub context diff --git a/.github/workflows/issue-manager.yml b/.github/workflows/issue-manager.yml index 8f3a151ea..6c7d7d5a6 100644 --- a/.github/workflows/issue-manager.yml +++ b/.github/workflows/issue-manager.yml @@ -13,6 +13,7 @@ on: jobs: issue-manager: + if: github.repository_owner 
== 'explosion' runs-on: ubuntu-latest steps: - uses: tiangolo/issue-manager@0.4.0 diff --git a/.github/workflows/lock.yml b/.github/workflows/lock.yml index 794adee85..6c3985a93 100644 --- a/.github/workflows/lock.yml +++ b/.github/workflows/lock.yml @@ -13,6 +13,7 @@ concurrency: jobs: action: + if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - uses: dessant/lock-threads@v4 diff --git a/.github/workflows/spacy_universe_alert.yml b/.github/workflows/spacy_universe_alert.yml index 837aaeb33..33851fbcc 100644 --- a/.github/workflows/spacy_universe_alert.yml +++ b/.github/workflows/spacy_universe_alert.yml @@ -7,6 +7,7 @@ on: jobs: build: + if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 41ea6ce50..f226057c9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -21,6 +21,7 @@ on: jobs: validate: name: Validate + if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - name: Check out repo diff --git a/.github/workflows/universe_validation.yml b/.github/workflows/universe_validation.yml index f9e317aaa..a1e3253a9 100644 --- a/.github/workflows/universe_validation.yml +++ b/.github/workflows/universe_validation.yml @@ -16,6 +16,7 @@ on: jobs: validate: name: Validate + if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - name: Check out repo From 888332dfb23eda3ee7dee2ada745236ee54b41f6 Mon Sep 17 00:00:00 2001 From: Edward <43848523+thomashacker@users.noreply.github.com> Date: Mon, 27 Mar 2023 13:15:14 +0200 Subject: [PATCH 18/29] Add info to stringstore and vocab (#12471) --- website/docs/api/stringstore.mdx | 7 +++++++ website/docs/api/vocab.mdx | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/website/docs/api/stringstore.mdx b/website/docs/api/stringstore.mdx index 47d3715c1..6a3e9d664 100644 --- a/website/docs/api/stringstore.mdx +++ b/website/docs/api/stringstore.mdx @@ -8,6 +8,13 @@ Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values instead of integer IDs. This ensures that strings always map to the same ID, even from different `StringStores`. + + +Note that a `StringStore` instance is not static. It increases in size as texts +with new tokens are processed. + + + ## StringStore.\_\_init\_\_ {id="init",tag="method"} Create the `StringStore`. diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx index 131e4ce0a..fe774d1a8 100644 --- a/website/docs/api/vocab.mdx +++ b/website/docs/api/vocab.mdx @@ -10,6 +10,13 @@ The `Vocab` object provides a lookup table that allows you to access [`StringStore`](/api/stringstore). It also owns underlying C-data that is shared between `Doc` objects. + + +Note that a `Vocab` instance is not static. It increases in size as texts with +new tokens are processed. + + + ## Vocab.\_\_init\_\_ {id="init",tag="method"} Create the vocabulary. 
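The behaviour documented by these new infoboxes is easy to observe directly. Below is a minimal sketch (illustrative only, not part of the patch; the sample text is arbitrary, and nothing beyond a blank English pipeline is assumed, since plain tokenization already interns strings):

```python
import spacy

nlp = spacy.blank("en")
size_before = len(nlp.vocab.strings)

# Tokenizing text that contains previously unseen tokens adds entries to
# the shared StringStore, so the store (and the Vocab built on top of it)
# grows as a side effect of processing.
nlp("quantized floret vectors for xylophones")

assert len(nlp.vocab.strings) > size_before
```

Because entries are keyed by stable hashes (as the page above notes, strings always map to the same ID), this growth is append-only and does not invalidate existing annotations.
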
From 26da226a39998d385e58334ca6b514fd11c30ed9 Mon Sep 17 00:00:00 2001 From: kadarakos Date: Wed, 29 Mar 2023 08:38:11 +0200 Subject: [PATCH 19/29] Fix spancat-singlelabel score (#12469) * debug argmax sort and add span scores * add missing tests for spanscores --- spacy/pipeline/spancat.py | 2 ++ spacy/tests/pipeline/test_spancat.py | 16 ++++++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 983e1fba9..ff68a3703 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -726,6 +726,7 @@ class SpanCategorizer(TrainablePipe): if not allow_overlap: # Get the probabilities sort_idx = (argmax_scores.squeeze() * -1).argsort() + argmax_scores = argmax_scores[sort_idx] predicted = predicted[sort_idx] indices = indices[sort_idx] keeps = keeps[sort_idx] @@ -748,4 +749,5 @@ class SpanCategorizer(TrainablePipe): attrs_scores.append(argmax_scores[i]) spans.append(Span(doc, start, end, label=self.labels[label])) + spans.attrs["scores"] = numpy.array(attrs_scores) return spans diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py index cf6304042..b06505a6d 100644 --- a/spacy/tests/pipeline/test_spancat.py +++ b/spacy/tests/pipeline/test_spancat.py @@ -190,17 +190,19 @@ def test_make_spangroup_singlelabel(threshold, allow_overlap, nr_results): spangroup = spancat._make_span_group_singlelabel( doc, indices, scores, allow_overlap ) - assert len(spangroup) == nr_results if threshold > 0.4: if allow_overlap: assert spangroup[0].text == "London" assert spangroup[0].label_ == "City" + assert_almost_equal(0.6, spangroup.attrs["scores"][0], 5) assert spangroup[1].text == "Greater London" assert spangroup[1].label_ == "GreatCity" - + assert spangroup.attrs["scores"][1] == 0.9 + assert_almost_equal(0.9, spangroup.attrs["scores"][1], 5) else: assert spangroup[0].text == "Greater London" assert spangroup[0].label_ == "GreatCity" + assert spangroup.attrs["scores"][0] == 0.9 else: if allow_overlap: assert spangroup[0].text == "Greater" @@ -256,22 +258,32 @@ def test_make_spangroup_negative_label(): assert len(spangroup_single) == 2 assert spangroup_single[0].text == "Greater" assert spangroup_single[0].label_ == "City" + assert_almost_equal(0.4, spangroup_single.attrs["scores"][0], 5) assert spangroup_single[1].text == "Greater London" assert spangroup_single[1].label_ == "GreatCity" + assert spangroup_single.attrs["scores"][1] == 0.9 + assert_almost_equal(0.9, spangroup_single.attrs["scores"][1], 5) assert len(spangroup_multi) == 6 assert spangroup_multi[0].text == "Greater" assert spangroup_multi[0].label_ == "City" + assert_almost_equal(0.4, spangroup_multi.attrs["scores"][0], 5) assert spangroup_multi[1].text == "Greater" assert spangroup_multi[1].label_ == "Person" + assert_almost_equal(0.3, spangroup_multi.attrs["scores"][1], 5) assert spangroup_multi[2].text == "London" assert spangroup_multi[2].label_ == "City" + assert_almost_equal(0.6, spangroup_multi.attrs["scores"][2], 5) assert spangroup_multi[3].text == "London" assert spangroup_multi[3].label_ == "GreatCity" + assert_almost_equal(0.4, spangroup_multi.attrs["scores"][3], 5) assert spangroup_multi[4].text == "Greater London" assert spangroup_multi[4].label_ == "Thing" + assert spangroup_multi[4].text == "Greater London" + assert_almost_equal(0.8, spangroup_multi.attrs["scores"][4], 5) assert spangroup_multi[5].text == "Greater London" assert spangroup_multi[5].label_ == "GreatCity" + assert_almost_equal(0.9, 
spangroup_multi.attrs["scores"][5], 5) def test_ngram_suggester(en_tokenizer): From 8d064872ff25c23ed6bfe0a7758456ce31a2ddf7 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 29 Mar 2023 18:54:47 +0200 Subject: [PATCH 20/29] Fix Span.sents for edge case of Span being the only Span in the last sentence of a Doc. (#12484) --- spacy/tests/doc/test_span.py | 15 +++++++++++++++ spacy/tokens/span.pyx | 4 ++++ 2 files changed, 19 insertions(+) diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index adef5922f..a5c512dc0 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -716,3 +716,18 @@ def test_for_partial_ent_sents(): # equal to the sentences referenced in ent.sents. for doc_sent, ent_sent in zip(doc.sents, doc.ents[0].sents): assert doc_sent == ent_sent + + +def test_for_no_ent_sents(): + """Span.sents() should set .sents correctly, even if Span in question is trailing and doesn't form a full + sentence. + """ + doc = Doc( + English().vocab, + words=["This", "is", "a", "test.", "ENTITY"], + sent_starts=[1, 0, 0, 0, 1], + ) + doc.set_ents([Span(doc, 4, 5, "WORK")]) + sents = list(doc.ents[0].sents) + assert len(sents) == 1 + assert str(sents[0]) == str(doc.ents[0].sent) == "ENTITY" diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 7750b16ed..29b8ce703 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -463,6 +463,10 @@ cdef class Span: elif i == self.doc.length - 1: yield Span(self.doc, start, self.doc.length) + # Ensure that trailing parts of the Span instance are included in last element of .sents. + if start == self.doc.length - 1: + yield Span(self.doc, start, self.doc.length) + @property def ents(self): """The named entities that fall completely within the span. Returns From b228875600d89b2b08eedaa54b717028e1f0ac37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ye=20Lei=20=28=E5=8F=B6=E7=A3=8A=29?= Date: Fri, 31 Mar 2023 15:44:01 +0800 Subject: [PATCH 21/29] Allow passing a Span to displacy.parse_deps (#12477) * Allow passing a Span to displacy.parse_deps * Update docstring Co-authored-by: Adriane Boyd * Update API docs --------- Co-authored-by: Adriane Boyd --- spacy/displacy/__init__.py | 8 ++++++-- spacy/tests/test_displacy.py | 14 ++++++++++++++ website/docs/api/top-level.mdx | 2 +- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index ea6bba2c9..f42dad0c9 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -125,13 +125,17 @@ def app(environ, start_response): return [res] -def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: +def parse_deps( + orig_doc: Union[Doc, Span], options: Dict[str, Any] = {} +) -> Dict[str, Any]: """Generate dependency parse in {'words': [], 'arcs': []} format. - orig_doc (Doc): Document to parse. + orig_doc (Union[Doc, Span]): Document to parse. options (Dict[str, Any]): Dependency parse specific visualisation options. RETURNS (dict): Generated dependency parse keyed by words and arcs. 
""" + if isinstance(orig_doc, Span): + orig_doc = orig_doc.as_doc() doc = Doc(orig_doc.vocab).from_bytes( orig_doc.to_bytes(exclude=["user_data", "user_hooks"]) ) diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index f298b38e0..837a92e02 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -275,6 +275,20 @@ def test_displacy_parse_deps(en_vocab): {"start": 2, "end": 3, "label": "det", "dir": "left"}, {"start": 1, "end": 3, "label": "attr", "dir": "right"}, ] + # Test that displacy.parse_deps converts Span to Doc + deps = displacy.parse_deps(doc[:]) + assert isinstance(deps, dict) + assert deps["words"] == [ + {"lemma": None, "text": words[0], "tag": pos[0]}, + {"lemma": None, "text": words[1], "tag": pos[1]}, + {"lemma": None, "text": words[2], "tag": pos[2]}, + {"lemma": None, "text": words[3], "tag": pos[3]}, + ] + assert deps["arcs"] == [ + {"start": 0, "end": 1, "label": "nsubj", "dir": "left"}, + {"start": 2, "end": 3, "label": "det", "dir": "left"}, + {"start": 1, "end": 3, "label": "attr", "dir": "right"}, + ] def test_displacy_invalid_arcs(): diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx index d0851a59f..9193b2a7b 100644 --- a/website/docs/api/top-level.mdx +++ b/website/docs/api/top-level.mdx @@ -291,7 +291,7 @@ the `manual=True` argument in `displacy.render`. | Name | Description | | ----------- | ------------------------------------------------------------------- | -| `orig_doc` | Doc to parse dependencies. ~~Doc~~ | +| `orig_doc` | Doc or span to parse dependencies. ~~Union[Doc, Span]~~ | | `options` | Dependency parse specific visualisation options. ~~Dict[str, Any]~~ | | **RETURNS** | Generated dependency parse keyed by words and arcs. ~~dict~~ | From 57ee1212de0beb014183438b7a4746304d249df2 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 31 Mar 2023 13:43:51 +0200 Subject: [PATCH 22/29] Fix pickle for ngram suggester (#12486) --- spacy/pipeline/spancat.py | 58 +++++++++++++++------------- spacy/tests/pipeline/test_spancat.py | 20 +++++++++- 2 files changed, 50 insertions(+), 28 deletions(-) diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index ff68a3703..5a087e42a 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -1,5 +1,6 @@ from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast, Union from dataclasses import dataclass +from functools import partial from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops from thinc.api import Optimizer from thinc.types import Ragged, Ints2d, Floats2d @@ -82,39 +83,42 @@ class Suggester(Protocol): ... 
From 57ee1212de0beb014183438b7a4746304d249df2 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Fri, 31 Mar 2023 13:43:51 +0200
Subject: [PATCH 22/29] Fix pickle for ngram suggester (#12486)

---
 spacy/pipeline/spancat.py            | 58 +++++++++++++++-------------
 spacy/tests/pipeline/test_spancat.py | 20 +++++++++-
 2 files changed, 50 insertions(+), 28 deletions(-)

diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index ff68a3703..5a087e42a 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -1,5 +1,6 @@
 from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast, Union
 from dataclasses import dataclass
+from functools import partial
 from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
 from thinc.api import Optimizer
 from thinc.types import Ragged, Ints2d, Floats2d
@@ -82,39 +83,42 @@ class Suggester(Protocol):
         ...
 
 
+def ngram_suggester(
+    docs: Iterable[Doc], sizes: List[int], *, ops: Optional[Ops] = None
+) -> Ragged:
+    if ops is None:
+        ops = get_current_ops()
+    spans = []
+    lengths = []
+    for doc in docs:
+        starts = ops.xp.arange(len(doc), dtype="i")
+        starts = starts.reshape((-1, 1))
+        length = 0
+        for size in sizes:
+            if size <= len(doc):
+                starts_size = starts[: len(doc) - (size - 1)]
+                spans.append(ops.xp.hstack((starts_size, starts_size + size)))
+                length += spans[-1].shape[0]
+            if spans:
+                assert spans[-1].ndim == 2, spans[-1].shape
+        lengths.append(length)
+    lengths_array = ops.asarray1i(lengths)
+    if len(spans) > 0:
+        output = Ragged(ops.xp.vstack(spans), lengths_array)
+    else:
+        output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
+
+    assert output.dataXd.ndim == 2
+    return output
+
+
 @registry.misc("spacy.ngram_suggester.v1")
 def build_ngram_suggester(sizes: List[int]) -> Suggester:
     """Suggest all spans of the given lengths. Spans are returned as a ragged
     array of integers. The array has two columns, indicating the start and end
     position."""
 
-    def ngram_suggester(docs: Iterable[Doc], *, ops: Optional[Ops] = None) -> Ragged:
-        if ops is None:
-            ops = get_current_ops()
-        spans = []
-        lengths = []
-        for doc in docs:
-            starts = ops.xp.arange(len(doc), dtype="i")
-            starts = starts.reshape((-1, 1))
-            length = 0
-            for size in sizes:
-                if size <= len(doc):
-                    starts_size = starts[: len(doc) - (size - 1)]
-                    spans.append(ops.xp.hstack((starts_size, starts_size + size)))
-                    length += spans[-1].shape[0]
-                if spans:
-                    assert spans[-1].ndim == 2, spans[-1].shape
-            lengths.append(length)
-        lengths_array = ops.asarray1i(lengths)
-        if len(spans) > 0:
-            output = Ragged(ops.xp.vstack(spans), lengths_array)
-        else:
-            output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
-
-        assert output.dataXd.ndim == 2
-        return output
-
-    return ngram_suggester
+    return partial(ngram_suggester, sizes=sizes)
 
 
 @registry.misc("spacy.ngram_range_suggester.v1")
diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py
index b06505a6d..199ef2b2a 100644
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@@ -1,7 +1,7 @@
 import pytest
 import numpy
 from numpy.testing import assert_array_equal, assert_almost_equal
-from thinc.api import get_current_ops, Ragged
+from thinc.api import get_current_ops, NumpyOps, Ragged
 
 from spacy import util
 from spacy.lang.en import English
@@ -577,3 +577,21 @@ def test_set_candidates(name):
     assert len(docs[0].spans["candidates"]) == 9
     assert docs[0].spans["candidates"][0].text == "Just"
     assert docs[0].spans["candidates"][4].text == "Just a"
+
+
+@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
+@pytest.mark.parametrize("n_process", [1, 2])
+def test_spancat_multiprocessing(name, n_process):
+    if isinstance(get_current_ops(), NumpyOps) or n_process < 2:
+        nlp = Language()
+        spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
+        train_examples = make_examples(nlp)
+        nlp.initialize(get_examples=lambda: train_examples)
+        texts = [
+            "Just a sentence.",
+            "I like London and Berlin",
+            "I like Berlin",
+            "I eat ham.",
+        ]
+        docs = list(nlp.pipe(texts, n_process=n_process))
+        assert len(docs) == len(texts)
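The reason the refactor fixes pickling: `pickle` serializes functions by qualified name, so a suggester defined inside a factory closure can never be located again at load time, while a module-level function wrapped in `functools.partial` can. A standalone sketch of the difference (not spaCy's actual suggester):

```python
import pickle
from functools import partial

def ngram_lengths(tokens, sizes):
    # Stand-in for the real suggester logic.
    return [size for size in sizes if size <= len(tokens)]

def build_closure(sizes):
    def suggester(tokens):
        return ngram_lengths(tokens, sizes)
    return suggester

# Module-level function + partial: picklable, so it survives the
# inter-process transfer that nlp.pipe(n_process=2) requires.
pickle.dumps(partial(ngram_lengths, sizes=[1, 2]))

# Closure returned by a factory: not picklable.
try:
    pickle.dumps(build_closure([1, 2]))
except (pickle.PicklingError, AttributeError) as err:
    print("closure is not picklable:", err)
```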
From a5406a6c457c7809221e365b7a14020e957fe539 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Fri, 31 Mar 2023 13:48:15 +0200
Subject: [PATCH 23/29] Allow cupy 12.0 for extras (#12490)

---
 setup.cfg | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 27499805b..eea557337 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -78,41 +78,41 @@ transformers =
 ray =
     spacy_ray>=0.1.0,<1.0.0
 cuda =
-    cupy>=5.0.0b4,<12.0.0
+    cupy>=5.0.0b4,<13.0.0
 cuda80 =
-    cupy-cuda80>=5.0.0b4,<12.0.0
+    cupy-cuda80>=5.0.0b4,<13.0.0
 cuda90 =
-    cupy-cuda90>=5.0.0b4,<12.0.0
+    cupy-cuda90>=5.0.0b4,<13.0.0
 cuda91 =
-    cupy-cuda91>=5.0.0b4,<12.0.0
+    cupy-cuda91>=5.0.0b4,<13.0.0
 cuda92 =
-    cupy-cuda92>=5.0.0b4,<12.0.0
+    cupy-cuda92>=5.0.0b4,<13.0.0
 cuda100 =
-    cupy-cuda100>=5.0.0b4,<12.0.0
+    cupy-cuda100>=5.0.0b4,<13.0.0
 cuda101 =
-    cupy-cuda101>=5.0.0b4,<12.0.0
+    cupy-cuda101>=5.0.0b4,<13.0.0
 cuda102 =
-    cupy-cuda102>=5.0.0b4,<12.0.0
+    cupy-cuda102>=5.0.0b4,<13.0.0
 cuda110 =
-    cupy-cuda110>=5.0.0b4,<12.0.0
+    cupy-cuda110>=5.0.0b4,<13.0.0
 cuda111 =
-    cupy-cuda111>=5.0.0b4,<12.0.0
+    cupy-cuda111>=5.0.0b4,<13.0.0
 cuda112 =
-    cupy-cuda112>=5.0.0b4,<12.0.0
+    cupy-cuda112>=5.0.0b4,<13.0.0
 cuda113 =
-    cupy-cuda113>=5.0.0b4,<12.0.0
+    cupy-cuda113>=5.0.0b4,<13.0.0
 cuda114 =
-    cupy-cuda114>=5.0.0b4,<12.0.0
+    cupy-cuda114>=5.0.0b4,<13.0.0
 cuda115 =
-    cupy-cuda115>=5.0.0b4,<12.0.0
+    cupy-cuda115>=5.0.0b4,<13.0.0
 cuda116 =
-    cupy-cuda116>=5.0.0b4,<12.0.0
+    cupy-cuda116>=5.0.0b4,<13.0.0
 cuda117 =
-    cupy-cuda117>=5.0.0b4,<12.0.0
+    cupy-cuda117>=5.0.0b4,<13.0.0
 cuda11x =
-    cupy-cuda11x>=11.0.0,<12.0.0
+    cupy-cuda11x>=11.0.0,<13.0.0
 cuda-autodetect =
-    cupy-wheel>=11.0.0,<12.0.0
+    cupy-wheel>=11.0.0,<13.0.0
 apple =
     thinc-apple-ops>=0.1.0.dev0,<1.0.0
 # Language tokenizers with external dependencies

From 0ec4dc5c29578f9857004b0c747e10529616b6cf Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 3 Apr 2023 11:38:56 +0200
Subject: [PATCH 24/29] Remove redundant strings.add for Doc.char_span (#12429)

---
 spacy/tokens/doc.pyx | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 7dfe0ca9f..3bc404dd0 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -544,10 +544,6 @@ cdef class Doc:
 
         DOCS: https://spacy.io/api/doc#char_span
         """
-        if not isinstance(label, int):
-            label = self.vocab.strings.add(label)
-        if not isinstance(kb_id, int):
-            kb_id = self.vocab.strings.add(kb_id)
         alignment_modes = ("strict", "contract", "expand")
         if alignment_mode not in alignment_modes:
             raise ValueError(
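The deleted lines above were redundant because `Doc.char_span` still accepts plain string labels: interning into the string store happens downstream when the `Span` itself is created. A quick sketch, assuming only `spacy` is installed:

```python
from spacy.lang.en import English

nlp = English()
doc = nlp("New York is a city")

# No manual vocab.strings.add needed; the string label is interned
# when the Span is constructed.
span = doc.char_span(0, 8, label="GPE")
print(span.text, span.label_)  # New York GPE
```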
len(new_doc.spans["content"]) == 1 + assert new_doc.spans["content"][0].label_ == "test_serialize_doc_span_groups_label" + assert new_doc.spans["content"][0].id_ == "test_serialize_doc_span_groups_id" + assert new_doc.spans["content"][0].kb_id_ == "test_serialize_doc_span_groups_kb_id" diff --git a/spacy/tests/serialize/test_serialize_docbin.py b/spacy/tests/serialize/test_serialize_docbin.py index 9f8e5e06b..6f7b1001c 100644 --- a/spacy/tests/serialize/test_serialize_docbin.py +++ b/spacy/tests/serialize/test_serialize_docbin.py @@ -49,7 +49,11 @@ def test_serialize_doc_bin(): nlp = English() for doc in nlp.pipe(texts): doc.cats = cats - doc.spans["start"] = [doc[0:2]] + span = doc[0:2] + span.label_ = "UNUSUAL_SPAN_LABEL" + span.id_ = "UNUSUAL_SPAN_ID" + span.kb_id_ = "UNUSUAL_SPAN_KB_ID" + doc.spans["start"] = [span] doc[0].norm_ = "UNUSUAL_TOKEN_NORM" doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID" doc_bin.add(doc) @@ -63,6 +67,9 @@ def test_serialize_doc_bin(): assert doc.text == texts[i] assert doc.cats == cats assert len(doc.spans) == 1 + assert doc.spans["start"][0].label_ == "UNUSUAL_SPAN_LABEL" + assert doc.spans["start"][0].id_ == "UNUSUAL_SPAN_ID" + assert doc.spans["start"][0].kb_id_ == "UNUSUAL_SPAN_KB_ID" assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM" assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID" diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index c4e8f26f4..73c857d1f 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -124,6 +124,10 @@ class DocBin: for key, group in doc.spans.items(): for span in group: self.strings.add(span.label_) + if span.kb_id in span.doc.vocab.strings: + self.strings.add(span.kb_id_) + if span.id in span.doc.vocab.strings: + self.strings.add(span.id_) def get_docs(self, vocab: Vocab) -> Iterator[Doc]: """Recover Doc objects from the annotations, using the given vocab. diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 3bc404dd0..a54b4ad3c 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1346,6 +1346,10 @@ cdef class Doc: for group in self.spans.values(): for span in group: strings.add(span.label_) + if span.kb_id in span.doc.vocab.strings: + strings.add(span.kb_id_) + if span.id in span.doc.vocab.strings: + strings.add(span.id_) # Msgpack doesn't distinguish between lists and tuples, which is # vexing for user data. As a best guess, we *know* that within # keys, we must have tuples. 
From 2fbd080a03ec7af20026a8938e72ea1a512b7285 Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Mon, 3 Apr 2023 15:24:03 +0200
Subject: [PATCH 26/29] Add model-last saving mechanism to pretraining (#12459)

* Adjust pretrain command

* change naming and add finally block

* Add unit test

* Add unit test assertions

* Update spacy/training/pretrain.py

Co-authored-by: Adriane Boyd

* change finally block

* Add to docs

* Update website/docs/usage/embeddings-transformers.mdx

* Add flag to skip saving model-last

---------

Co-authored-by: Adriane Boyd
---
 spacy/cli/pretrain.py                      |  2 +
 spacy/tests/training/test_pretraining.py   | 10 ++++-
 spacy/training/pretrain.py                 | 41 +++++++++++--------
 website/docs/api/cli.mdx                   | 23 ++++++-----
 .../docs/usage/embeddings-transformers.mdx |  9 ++--
 5 files changed, 53 insertions(+), 32 deletions(-)

diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 381d589cf..45042e605 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -23,6 +23,7 @@ def pretrain_cli(
     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+    skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"),
     # fmt: on
 ):
     """
@@ -74,6 +75,7 @@ def pretrain_cli(
         epoch_resume=epoch_resume,
         use_gpu=use_gpu,
         silent=False,
+        skip_last=skip_last,
     )
     msg.good("Successfully finished pretrain")
 
diff --git a/spacy/tests/training/test_pretraining.py b/spacy/tests/training/test_pretraining.py
index d1db92de5..6cfdeed20 100644
--- a/spacy/tests/training/test_pretraining.py
+++ b/spacy/tests/training/test_pretraining.py
@@ -165,7 +165,8 @@ def test_pretraining_default():
 
 @pytest.mark.parametrize("objective", CHAR_OBJECTIVES)
-def test_pretraining_tok2vec_characters(objective):
+@pytest.mark.parametrize("skip_last", (True, False))
+def test_pretraining_tok2vec_characters(objective, skip_last):
     """Test that pretraining works with the character objective"""
     config = Config().from_str(pretrain_string_listener)
     config["pretraining"]["objective"] = objective
@@ -178,10 +179,14 @@ def test_pretraining_tok2vec_characters(objective):
     filled["paths"]["raw_text"] = file_path
     filled = filled.interpolate()
     assert filled["pretraining"]["component"] == "tok2vec"
-    pretrain(filled, tmp_dir)
+    pretrain(filled, tmp_dir, skip_last=skip_last)
     assert Path(tmp_dir / "model0.bin").exists()
     assert Path(tmp_dir / "model4.bin").exists()
     assert not Path(tmp_dir / "model5.bin").exists()
+    if skip_last:
+        assert not Path(tmp_dir / "model-last.bin").exists()
+    else:
+        assert Path(tmp_dir / "model-last.bin").exists()
 
 
 @pytest.mark.parametrize("objective", VECTOR_OBJECTIVES)
@@ -237,6 +242,7 @@ def test_pretraining_tagger_tok2vec(config):
     pretrain(filled, tmp_dir)
     assert Path(tmp_dir / "model0.bin").exists()
     assert Path(tmp_dir / "model4.bin").exists()
+    assert Path(tmp_dir / "model-last.bin").exists()
     assert not Path(tmp_dir / "model5.bin").exists()
 
diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py
index 52af84aaf..ebbc5d837 100644
--- a/spacy/training/pretrain.py
+++ b/spacy/training/pretrain.py
@@ -24,6 +24,7 @@ def pretrain(
     epoch_resume: Optional[int] = None,
     use_gpu: int = -1,
     silent: bool = True,
+    skip_last: bool = False,
 ):
     msg = Printer(no_print=silent)
     if config["training"]["seed"] is not None:
@@ -60,10 +61,14 @@ def pretrain(
     row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
     msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
 
-    def _save_model(epoch, is_temp=False):
+    def _save_model(epoch, is_temp=False, is_last=False):
         is_temp_str = ".temp" if is_temp else ""
         with model.use_params(optimizer.averages):
-            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
+            if is_last:
+                save_path = output_dir / f"model-last.bin"
+            else:
+                save_path = output_dir / f"model{epoch}{is_temp_str}.bin"
+            with (save_path).open("wb") as file_:
                 file_.write(model.get_ref("tok2vec").to_bytes())
     log = {
         "nr_word": tracker.nr_word,
@@ -76,22 +81,26 @@ def pretrain(
     # TODO: I think we probably want this to look more like the
     # 'create_train_batches' function?
-    for epoch in range(epoch_resume, P["max_epochs"]):
-        for batch_id, batch in enumerate(batcher(corpus(nlp))):
-            docs = ensure_docs(batch)
-            loss = make_update(model, docs, optimizer, objective)
-            progress = tracker.update(epoch, loss, docs)
-            if progress:
-                msg.row(progress, **row_settings)
-            if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
-                _save_model(epoch, is_temp=True)
+    try:
+        for epoch in range(epoch_resume, P["max_epochs"]):
+            for batch_id, batch in enumerate(batcher(corpus(nlp))):
+                docs = ensure_docs(batch)
+                loss = make_update(model, docs, optimizer, objective)
+                progress = tracker.update(epoch, loss, docs)
+                if progress:
+                    msg.row(progress, **row_settings)
+                if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
+                    _save_model(epoch, is_temp=True)
 
-        if P["n_save_epoch"]:
-            if epoch % P["n_save_epoch"] == 0 or epoch == P["max_epochs"] - 1:
+            if P["n_save_epoch"]:
+                if epoch % P["n_save_epoch"] == 0 or epoch == P["max_epochs"] - 1:
+                    _save_model(epoch)
+            else:
                 _save_model(epoch)
-            else:
-                _save_model(epoch)
-        tracker.epoch_loss = 0.0
+            tracker.epoch_loss = 0.0
+    finally:
+        if not skip_last:
+            _save_model(P["max_epochs"], is_last=True)
 
 
 def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 2bb0199fc..323ea2a92 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -1122,17 +1122,18 @@ auto-generated by setting `--pretraining` on
 $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [--gpu-id] [overrides]
 ```
 
-| Name                    | Description |
-| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `config_path`           | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
-| `output_dir`            | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
-| `--code`, `-c`          | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--resume-path`, `-r`   | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
-| `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ |
-| `--gpu-id`, `-g`        | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
-| `--help`, `-h`          | Show help message and available arguments. ~~bool (flag)~~ |
-| overrides               | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
-| **CREATES**             | The pretrained weights that can be used to initialize `spacy train`. |
+| Name                                               | Description |
+| -------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `config_path`                                      | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
+| `output_dir`                                       | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
+| `--code`, `-c`                                     | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--resume-path`, `-r`                              | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
+| `--epoch-resume`, `-er`                            | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ |
+| `--gpu-id`, `-g`                                   | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
+| `--skip-last`, `-L` 3.5.2                          | Skip saving `model-last.bin`. Defaults to `False`. ~~bool (flag)~~ |
+| `--help`, `-h`                                     | Show help message and available arguments. ~~bool (flag)~~ |
+| overrides                                          | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
+| **CREATES**                                        | The pretrained weights that can be used to initialize `spacy train`. |
 
 ## evaluate {id="evaluate",version="2",tag="command"}
 
diff --git a/website/docs/usage/embeddings-transformers.mdx b/website/docs/usage/embeddings-transformers.mdx
index cf80822fb..5f1e5b817 100644
--- a/website/docs/usage/embeddings-transformers.mdx
+++ b/website/docs/usage/embeddings-transformers.mdx
@@ -746,13 +746,16 @@ this by setting `initialize.init_tok2vec` to the filename of the `.bin` file
 that you want to use from pretraining.
 
 A pretraining step that runs for 5 epochs with an output path of `pretrain/`, as
-an example, produces `pretrain/model0.bin` through `pretrain/model4.bin`. To
-make use of the final output, you could fill in this value in your config file:
+an example, produces `pretrain/model0.bin` through `pretrain/model4.bin` plus a
+copy of the last iteration as `pretrain/model-last.bin`. Additionally, you can
+configure `n_save_epoch` to tell pretraining in which epoch interval it should
+save the current training progress. To use the final output to initialize your
+`tok2vec` layer, you could fill in this value in your config file:
 
 ```ini {title="config.cfg"}
 [paths]
-init_tok2vec = "pretrain/model4.bin"
+init_tok2vec = "pretrain/model-last.bin"
 
 [initialize]
 init_tok2vec = ${paths.init_tok2vec}
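The same behaviour is reachable from Python as well as the CLI. A sketch, assuming a filled and interpolated pretraining config at a hypothetical `config.cfg` path (in practice the CLI fills and validates the config for you):

```python
from pathlib import Path
from spacy import util
from spacy.training.pretrain import pretrain

# Hypothetical paths; the config must contain a [pretraining] section.
config = util.load_config("config.cfg", interpolate=True)
output_dir = Path("pretrain_output")

pretrain(config, output_dir, skip_last=False)
# output_dir now holds model0.bin ... modelN.bin plus model-last.bin;
# passing skip_last=True (the new --skip-last/-L flag) omits model-last.bin.
```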
From 314a7cea7392be23a5123fddb2cfd6a3703d5dc9 Mon Sep 17 00:00:00 2001
From: Will Frey
Date: Tue, 4 Apr 2023 14:53:07 -0400
Subject: [PATCH 27/29] Fix invalid ConsoleLogger.v3 example config (#12498)

Replace `progress_bar = "all_steps"` with `progress_bar = "eval"`, which is
consistent with the default behavior for `spacy.ConsoleLogger.v1` and
`spacy.ConsoleLogger.v2`.
---
 website/docs/api/top-level.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx
index 9193b2a7b..975c16aaa 100644
--- a/website/docs/api/top-level.mdx
+++ b/website/docs/api/top-level.mdx
@@ -577,7 +577,7 @@ start decreasing across epochs.
 > ```ini
 > [training.logger]
 > @loggers = "spacy.ConsoleLogger.v3"
-> progress_bar = "all_steps"
+> progress_bar = "eval"
 > console_output = true
 > output_file = "training_log.jsonl"
 > ```

From 9fbb8ee912585fb2de64360a8b955a8d17e5b28a Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Thu, 6 Apr 2023 11:45:19 +0200
Subject: [PATCH 28/29] Add more information to custom code docs (#12491)

* Add info to sections

* Update website/docs/usage/training.mdx

---------

Co-authored-by: Adriane Boyd
---
 website/docs/api/top-level.mdx  | 5 ++++-
 website/docs/usage/training.mdx | 9 +++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx
index 975c16aaa..6de1acdf0 100644
--- a/website/docs/api/top-level.mdx
+++ b/website/docs/api/top-level.mdx
@@ -25,7 +25,10 @@ and call the package's own `load()` method. If a pipeline is loaded from a
 path, spaCy will assume it's a data directory, load its
 [`config.cfg`](/api/data-formats#config) and use the language and pipeline
 information to construct the `Language` class. The data will be loaded in via
-[`Language.from_disk`](/api/language#from_disk).
+[`Language.from_disk`](/api/language#from_disk). Loading a pipeline from a
+package will also import any custom code, if present, whereas loading from a
+directory does not. For these cases, you need to manually import your custom
+code.
 
diff --git a/website/docs/usage/training.mdx b/website/docs/usage/training.mdx
index 6cda975cb..6caf2e94b 100644
--- a/website/docs/usage/training.mdx
+++ b/website/docs/usage/training.mdx
@@ -758,6 +758,15 @@ any custom architectures, functions or
 your pipeline and registered when it's loaded. See the documentation on
 [saving and loading pipelines](/usage/saving-loading#models-custom) for
 details.
+
+
+Note that the unpackaged models produced by `spacy train` are data directories
+that **do not include custom code**. You need to import the code in your script
+before loading in unpackaged models. For more details, see
+[`spacy.load`](/api/top-level#spacy.load).
+
+
+
 #### Example: Modifying the nlp object {id="custom-code-nlp-callbacks"}
 
 For many use cases, you don't necessarily want to implement the whole
 `Language`
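What "manually import your custom code" looks like in practice: any registered functions must exist in the current process before the unpackaged pipeline directory is loaded. A sketch with hypothetical component and path names:

```python
import spacy
from spacy.language import Language

# In a real project this registration would live in e.g. my_components.py,
# imported at the top of the training/inference script.
@Language.component("my_debug_component")
def my_debug_component(doc):
    print("processing", len(doc), "tokens")
    return doc

# Loading from a bare data directory does not import any code, so the
# component above must already be registered here or loading fails.
nlp = spacy.load("./output/model-best")  # hypothetical path
```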
From f66d55fe5bba268ec3b4a747d0ea00c621d6d65a Mon Sep 17 00:00:00 2001
From: Madeesh Kannan
Date: Thu, 6 Apr 2023 11:45:58 +0200
Subject: [PATCH 29/29] `Docs`: Fix rule-based matching example that expands
 named entities (#12495)

---
 website/docs/usage/rule-based-matching.mdx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx
index 55c043015..7e88bdc1f 100644
--- a/website/docs/usage/rule-based-matching.mdx
+++ b/website/docs/usage/rule-based-matching.mdx
@@ -1682,6 +1682,8 @@ def expand_person_entities(doc):
             if prev_token.text in ("Dr", "Dr.", "Mr", "Mr.", "Ms", "Ms."):
                 new_ent = Span(doc, ent.start - 1, ent.end, label=ent.label)
                 new_ents.append(new_ent)
+            else:
+                new_ents.append(ent)
         else:
             new_ents.append(ent)
     doc.ents = new_ents
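The two added lines matter because, without the inner `else`, any `PERSON` entity whose preceding token was not a title was dropped from `new_ents` entirely rather than kept unchanged. The branch logic in isolation, as a condensed, self-contained sketch:

```python
# Hypothetical inputs standing in for entities and their preceding tokens.
ents = ["Alex Smith", "Robin Jones"]
prev_tokens = ["Dr.", "and"]  # "and" is not a title

new_ents = []
for ent, prev_token in zip(ents, prev_tokens):
    if prev_token in ("Dr", "Dr.", "Mr", "Mr.", "Ms", "Ms."):
        new_ents.append(prev_token + " " + ent)  # expanded entity
    else:
        new_ents.append(ent)  # the fix: keep the entity as-is

print(new_ents)  # ['Dr. Alex Smith', 'Robin Jones']; nothing is dropped
```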