From 6aa6b86d496c8d9271f42c077a79f9bfb88687ac Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Wed, 1 Mar 2023 16:02:55 +0100
Subject: [PATCH 1/8] Make generation of empty `KnowledgeBase` instances
 configurable in `EntityLinker` (#12320)

* Make empty_kb() configurable.

* Format.

* Update docs.

* Be more specific in KB serialization test.

* Update KB serialization tests. Update docs.

* Remove doc update for batched candidate generation.

* Fix serialization of subclassed KB in tests.

* Format.

* Update docstring.

* Update docstring.

* Switch from pickle to json for custom field serialization.
---
 spacy/ml/models/entity_linker.py           |  8 +++
 spacy/pipeline/entity_linker.py            | 11 +++-
 spacy/tests/serialize/test_serialize_kb.py | 71 +++++++++++++++++++---
 website/docs/api/architectures.mdx         | 10 ++-
 website/docs/api/entitylinker.mdx          | 28 +++++----
 5 files changed, 101 insertions(+), 27 deletions(-)

diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index 299b6bb52..7332ca199 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -89,6 +89,14 @@ def load_kb(
     return kb_from_file
 
 
+@registry.misc("spacy.EmptyKB.v2")
+def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]:
+    def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
+        return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)
+
+    return empty_kb_factory
+
+
 @registry.misc("spacy.EmptyKB.v1")
 def empty_kb(
     entity_vector_length: int,
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index a11964117..f2dae0529 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -54,6 +54,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
         "entity_vector_length": 64,
         "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
         "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
+        "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
         "overwrite": True,
         "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
         "use_gold_ents": True,
@@ -80,6 +81,7 @@ def make_entity_linker(
     get_candidates_batch: Callable[
         [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
     ],
+    generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
     overwrite: bool,
     scorer: Optional[Callable],
     use_gold_ents: bool,
@@ -101,6 +103,7 @@ def make_entity_linker(
     get_candidates_batch (
         Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
         ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
+    generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable that returns an empty KnowledgeBase, given a vocab and the entity vector length.
     scorer (Optional[Callable]): The scoring method.
     use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
         component must provide entity annotations.
@@ -135,6 +138,7 @@ def make_entity_linker(
         entity_vector_length=entity_vector_length,
         get_candidates=get_candidates,
         get_candidates_batch=get_candidates_batch,
+        generate_empty_kb=generate_empty_kb,
         overwrite=overwrite,
         scorer=scorer,
         use_gold_ents=use_gold_ents,
@@ -175,6 +179,7 @@ class EntityLinker(TrainablePipe):
         get_candidates_batch: Callable[
             [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
         ],
+        generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
         overwrite: bool = BACKWARD_OVERWRITE,
         scorer: Optional[Callable] = entity_linker_score,
         use_gold_ents: bool,
@@ -198,6 +203,7 @@ class EntityLinker(TrainablePipe):
             Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
             Iterable[Candidate]]
             ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
+        generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable that returns an empty KnowledgeBase, given a vocab and the entity vector length.
         scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
         use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
             component must provide entity annotations.
@@ -220,6 +226,7 @@ class EntityLinker(TrainablePipe):
         self.model = model
         self.name = name
         self.labels_discard = list(labels_discard)
+        # how many neighbour sentences to take into account
         self.n_sents = n_sents
         self.incl_prior = incl_prior
         self.incl_context = incl_context
@@ -227,9 +234,7 @@ class EntityLinker(TrainablePipe):
         self.get_candidates_batch = get_candidates_batch
         self.cfg: Dict[str, Any] = {"overwrite": overwrite}
         self.distance = CosineDistance(normalize=False)
-        # how many neighbour sentences to take into account
-        # create an empty KB by default
-        self.kb = empty_kb(entity_vector_length)(self.vocab)
+        self.kb = generate_empty_kb(self.vocab, entity_vector_length)
         self.scorer = scorer
         self.use_gold_ents = use_gold_ents
         self.candidates_batch_size = candidates_batch_size
diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py
index 8d3653ab1..f9d2e226b 100644
--- a/spacy/tests/serialize/test_serialize_kb.py
+++ b/spacy/tests/serialize/test_serialize_kb.py
@@ -1,7 +1,10 @@
-from typing import Callable
+from pathlib import Path
+from typing import Callable, Iterable, Any, Dict
 
-from spacy import util
-from spacy.util import ensure_path, registry, load_model_from_config
+import srsly
+
+from spacy import util, Errors
+from spacy.util import ensure_path, registry, load_model_from_config, SimpleFrozenList
 from spacy.kb.kb_in_memory import InMemoryLookupKB
 from spacy.vocab import Vocab
 from thinc.api import Config
@@ -91,7 +94,10 @@ def test_serialize_subclassed_kb():
 
     [components.entity_linker]
     factory = "entity_linker"
-
+    
+    [components.entity_linker.generate_empty_kb]
+    @misc = "kb_test.CustomEmptyKB.v1"
+    
     [initialize]
 
     [initialize.components]
@@ -99,7 +105,7 @@ def test_serialize_subclassed_kb():
     [initialize.components.entity_linker]
 
     [initialize.components.entity_linker.kb_loader]
-    @misc = "spacy.CustomKB.v1"
+    @misc = "kb_test.CustomKB.v1"
     entity_vector_length = 342
     custom_field = 666
     """
@@ -109,10 +115,57 @@ def test_serialize_subclassed_kb():
             super().__init__(vocab, entity_vector_length)
             self.custom_field = custom_field
 
-    @registry.misc("spacy.CustomKB.v1")
+        def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
+            """We override InMemoryLookupKB.to_disk() to ensure that self.custom_field is stored as well."""
+            path = ensure_path(path)
+            if not path.exists():
+                path.mkdir(parents=True)
+            if not path.is_dir():
+                raise ValueError(Errors.E928.format(loc=path))
+
+            def serialize_custom_fields(file_path: Path) -> None:
+                srsly.write_json(file_path, {"custom_field": self.custom_field})
+
+            serialize = {
+                "contents": lambda p: self.write_contents(p),
+                "strings.json": lambda p: self.vocab.strings.to_disk(p),
+                "custom_fields": lambda p: serialize_custom_fields(p),
+            }
+            util.to_disk(path, serialize, exclude)
+
+        def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
+            """We override InMemoryLookupKB.from_disk() to ensure that self.custom_field is loaded as well."""
+            path = ensure_path(path)
+            if not path.exists():
+                raise ValueError(Errors.E929.format(loc=path))
+            if not path.is_dir():
+                raise ValueError(Errors.E928.format(loc=path))
+
+            def deserialize_custom_fields(file_path: Path) -> None:
+                self.custom_field = srsly.read_json(file_path)["custom_field"]
+
+            deserialize: Dict[str, Callable[[Any], Any]] = {
+                "contents": lambda p: self.read_contents(p),
+                "strings.json": lambda p: self.vocab.strings.from_disk(p),
+                "custom_fields": lambda p: deserialize_custom_fields(p),
+            }
+            util.from_disk(path, deserialize, exclude)
+
+    @registry.misc("kb_test.CustomEmptyKB.v1")
+    def empty_custom_kb() -> Callable[[Vocab, int], SubInMemoryLookupKB]:
+        def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
+            return SubInMemoryLookupKB(
+                vocab=vocab,
+                entity_vector_length=entity_vector_length,
+                custom_field=0,
+            )
+
+        return empty_kb_factory
+
+    @registry.misc("kb_test.CustomKB.v1")
     def custom_kb(
         entity_vector_length: int, custom_field: int
-    ) -> Callable[[Vocab], InMemoryLookupKB]:
+    ) -> Callable[[Vocab], SubInMemoryLookupKB]:
         def custom_kb_factory(vocab):
             kb = SubInMemoryLookupKB(
                 vocab=vocab,
@@ -139,6 +192,6 @@ def test_serialize_subclassed_kb():
         nlp2 = util.load_model_from_path(tmp_dir)
         entity_linker2 = nlp2.get_pipe("entity_linker")
         # After IO, the KB is the standard one
-        assert type(entity_linker2.kb) == InMemoryLookupKB
+        assert type(entity_linker2.kb) == SubInMemoryLookupKB
         assert entity_linker2.kb.entity_vector_length == 342
-        assert not hasattr(entity_linker2.kb, "custom_field")
+        assert entity_linker2.kb.custom_field == 666
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index 966b5830a..268c04a07 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -899,15 +899,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a
 | `nO`        | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ |
 | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                                                    |
 
-### spacy.EmptyKB.v1 {id="EmptyKB"}
+### spacy.EmptyKB.v1 {id="EmptyKB.v1"}
 
 A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
-instance. This is the default when a new entity linker component is created.
+instance.
 
 | Name                   | Description                                                                         |
 | ---------------------- | ----------------------------------------------------------------------------------- |
 | `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ |
 
+### spacy.EmptyKB.v2 {id="EmptyKB"}
+
+A function returning a factory that creates an empty `KnowledgeBase` from a
+[`Vocab`](/api/vocab) instance and an entity vector length. This is the default when
+a new entity linker component is created. It returns a `Callable[[Vocab, int], InMemoryLookupKB]`.
+
 ### spacy.KBFromFile.v1 {id="KBFromFile"}
 
 A function that reads an existing `KnowledgeBase` from file.
diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx
index bafb2f2da..d84dd3ca9 100644
--- a/website/docs/api/entitylinker.mdx
+++ b/website/docs/api/entitylinker.mdx
@@ -53,19 +53,21 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("entity_linker", config=config)
 > ```
 
-| Setting                                  | Description                                                                                                                                                                                                                                                                                 |
-| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `labels_discard`                         | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                                                                                              |
-| `n_sents`                                | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~                                                                                                                                                                                                           |
-| `incl_prior`                             | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                        |
-| `incl_context`                           | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                      |
-| `model`                                  | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~                                                                                                                                      |
-| `entity_vector_length`                   | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~                                                                                                                                                                                                                               |
-| `use_gold_ents`                          | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~                                                                                                        |
-| `get_candidates`                         | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                    |
-| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                    |
-| `scorer` <Tag variant="new">3.2</Tag>    | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                     |
-| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
+| Setting                                             | Description                                                                                                                                                                                                                                                                                                      |
+| --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `labels_discard`                                    | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                                                                                                                   |
+| `n_sents`                                           | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~                                                                                                                                                                                                                                |
+| `incl_prior`                                        | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                             |
+| `incl_context`                                      | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                           |
+| `model`                                             | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~                                                                                                                                                           |
+| `entity_vector_length`                              | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~                                                                                                                                                                                                                                                    |
+| `use_gold_ents`                                     | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~                                                                                                                            |
+| `get_candidates`                                    | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                                         |
+| `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
+| `generate_empty_kb` <Tag variant="new">3.6</Tag>    | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~                                                                           |
+| `overwrite` <Tag variant="new">3.2</Tag>            | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                                         |
+| `scorer` <Tag variant="new">3.2</Tag>               | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                                          |
+| `threshold` <Tag variant="new">3.4</Tag>            | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~                     |
 
 ```python
 %%GITHUB_SPACY/spacy/pipeline/entity_linker.py

From 0bbc620dd80007ac22d8bf1c9f6202eebc748596 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 6 Mar 2023 14:48:57 +0100
Subject: [PATCH 2/8] Partially work around pending deprecation of
 pkg_resources (#12368)

* Handle deprecation of pkg_resources

* Replace `pkg_resources` with `importlib_metadata` for `spacy info
--url`
* Remove requirements check from `spacy project` given the lack of
alternatives

* Fix installed model URL method and CI test

* Fix types/handling, simplify catch-all return

* Move imports instead of disabling requirements check

* Format

* Reenable test with ignored deprecation warning

* Fix except

* Fix return
---
 .github/azure-steps.yml  |  5 +++++
 spacy/cli/info.py        | 17 ++++++++---------
 spacy/cli/project/run.py |  2 +-
 spacy/tests/test_cli.py  |  4 +++-
 4 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index ed69f611b..b2ccf3d81 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -59,6 +59,11 @@ steps:
     displayName: 'Test download CLI'
     condition: eq(variables['python_version'], '3.8')
 
+  - script: |
+      python -W error -m spacy info ca_core_news_sm | grep -q download_url
+    displayName: 'Test download_url in info CLI'
+    condition: eq(variables['python_version'], '3.8')
+
   - script: |
       python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
     displayName: 'Test no warnings on load (#11713)'
diff --git a/spacy/cli/info.py b/spacy/cli/info.py
index 974bc0f4e..d82bf3fbc 100644
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@@ -1,6 +1,5 @@
 from typing import Optional, Dict, Any, Union, List
 import platform
-import pkg_resources
 import json
 from pathlib import Path
 from wasabi import Printer, MarkdownRenderer
@@ -10,6 +9,7 @@ from ._util import app, Arg, Opt, string_to_list
 from .download import get_model_filename, get_latest_version
 from .. import util
 from .. import about
+from ..compat import importlib_metadata
 
 
 @app.command("info")
@@ -137,15 +137,14 @@ def info_installed_model_url(model: str) -> Optional[str]:
     dist-info available.
     """
     try:
-        dist = pkg_resources.get_distribution(model)
-        data = json.loads(dist.get_metadata("direct_url.json"))
-        return data["url"]
-    except pkg_resources.DistributionNotFound:
-        # no such package
-        return None
+        dist = importlib_metadata.distribution(model)
+        text = dist.read_text("direct_url.json")
+        if isinstance(text, str):
+            data = json.loads(text)
+            return data["url"]
     except Exception:
-        # something else, like no file or invalid JSON
-        return None
+        pass
+    return None
 
 
 def info_model_url(model: str) -> Dict[str, Any]:
diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py
index 6dd174902..0f4858a99 100644
--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
 import os.path
 from pathlib import Path
 
-import pkg_resources
 from wasabi import msg
 from wasabi.util import locale_escape
 import sys
@@ -331,6 +330,7 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
     RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts
         exist.
     """
+    import pkg_resources
 
     failed_pkgs_msgs: List[str] = []
     conflicting_pkgs_msgs: List[str] = []
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index dc7ce46fe..752750d33 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -2,7 +2,6 @@ import os
 import math
 from collections import Counter
 from typing import Tuple, List, Dict, Any
-import pkg_resources
 import time
 from pathlib import Path
 
@@ -1126,6 +1125,7 @@ def test_cli_find_threshold(capsys):
                 )
 
 
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
 @pytest.mark.parametrize(
     "reqs,output",
     [
@@ -1158,6 +1158,8 @@ def test_cli_find_threshold(capsys):
     ],
 )
 def test_project_check_requirements(reqs, output):
+    import pkg_resources
+
     # excessive guard against unlikely package name
     try:
         pkg_resources.require("spacyunknowndoesnotexist12345")

From 5ecb3babed28cf8d39da9943ae24186e2bef6133 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 6 Mar 2023 17:30:17 +0100
Subject: [PATCH 3/8] Update to use absolute imports in tests (#12372)

---
 spacy/tests/parser/test_ner.py   | 4 ++--
 spacy/tests/parser/test_parse.py | 6 +++---
 spacy/tests/test_cli.py          | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 00889efdc..030182a63 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -9,6 +9,8 @@ from spacy.lang.en import English
 from spacy.lang.it import Italian
 from spacy.language import Language
 from spacy.lookups import Lookups
+from spacy.pipeline import EntityRecognizer
+from spacy.pipeline.ner import DEFAULT_NER_MODEL
 from spacy.pipeline._parser_internals.ner import BiluoPushDown
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.tokens import Doc, Span
@@ -16,8 +18,6 @@ from spacy.vocab import Vocab
 import logging
 
 from ..util import make_tempdir
-from ...pipeline import EntityRecognizer
-from ...pipeline.ner import DEFAULT_NER_MODEL
 
 TRAIN_DATA = [
     ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index aaf31ed56..4b05c6721 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -8,11 +8,11 @@ from spacy.lang.en import English
 from spacy.tokens import Doc
 from spacy.training import Example
 from spacy.vocab import Vocab
+from spacy.pipeline import DependencyParser
+from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
+from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 
-from ...pipeline import DependencyParser
-from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL
 from ..util import apply_transition_sequence, make_tempdir
-from ...pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 
 TRAIN_DATA = [
     (
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 752750d33..f5bcdfd23 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -28,6 +28,7 @@ from spacy.cli.debug_data import _print_span_characteristics
 from spacy.cli.debug_data import _get_spans_length_freq_dist
 from spacy.cli.download import get_compatibility, get_version
 from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
+from spacy.cli.init_pipeline import _init_labels
 from spacy.cli.package import get_third_party_dependencies
 from spacy.cli.package import _is_permitted_package_name
 from spacy.cli.project.remote_storage import RemoteStorage
@@ -46,7 +47,6 @@ from spacy.training.converters import conll_ner_to_docs, conllu_to_docs
 from spacy.training.converters import iob_to_docs
 from spacy.util import ENV_VARS, get_minor_version, load_model_from_config, load_config
 
-from ..cli.init_pipeline import _init_labels
 from .util import make_tempdir
 
 

From 260cb9c6febc5c7c2c17686d4f4e62c13b6833b9 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 6 Mar 2023 18:06:31 +0100
Subject: [PATCH 4/8] Raise error for non-default vectors with PretrainVectors
 (#12366)

---
 spacy/errors.py                          |  2 +
 spacy/ml/models/multi_task.py            |  3 ++
 spacy/tests/training/test_pretraining.py | 47 +++++++++++++++++++-----
 3 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 2c8b98aad..1047ed21a 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -549,6 +549,8 @@ class Errors(metaclass=ErrorsWithCodes):
             "during training, make sure to include it in 'annotating components'")
 
     # New errors added in v3.x
+    E850 = ("The PretrainVectors objective currently only supports default "
+            "vectors, not {mode} vectors.")
     E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
             "but found value of '{val}'.")
     E852 = ("The tar file pulled from the remote attempted an unsafe path "
diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py
index a7d67c6dd..826fddd4f 100644
--- a/spacy/ml/models/multi_task.py
+++ b/spacy/ml/models/multi_task.py
@@ -8,6 +8,7 @@ from thinc.loss import Loss
 from ...util import registry, OOV_RANK
 from ...errors import Errors
 from ...attrs import ID
+from ...vectors import Mode as VectorsMode
 
 import numpy
 from functools import partial
@@ -23,6 +24,8 @@ def create_pretrain_vectors(
     maxout_pieces: int, hidden_size: int, loss: str
 ) -> Callable[["Vocab", Model], Model]:
     def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
+        if vocab.vectors.mode != VectorsMode.default:
+            raise ValueError(Errors.E850.format(mode=vocab.vectors.mode))
         if vocab.vectors.shape[1] == 0:
             raise ValueError(Errors.E875)
         model = build_cloze_multi_task_model(
diff --git a/spacy/tests/training/test_pretraining.py b/spacy/tests/training/test_pretraining.py
index 9359c8485..c0d64f1e7 100644
--- a/spacy/tests/training/test_pretraining.py
+++ b/spacy/tests/training/test_pretraining.py
@@ -2,17 +2,19 @@ from pathlib import Path
 import numpy as np
 import pytest
 import srsly
-from spacy.vocab import Vocab
-from thinc.api import Config
+from thinc.api import Config, get_current_ops
 
+from spacy import util
+from spacy.lang.en import English
+from spacy.training.initialize import init_nlp
+from spacy.training.loop import train
+from spacy.training.pretrain import pretrain
+from spacy.tokens import Doc, DocBin
+from spacy.language import DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH
+from spacy.ml.models.multi_task import create_pretrain_vectors
+from spacy.vectors import Vectors
+from spacy.vocab import Vocab
 from ..util import make_tempdir
-from ... import util
-from ...lang.en import English
-from ...training.initialize import init_nlp
-from ...training.loop import train
-from ...training.pretrain import pretrain
-from ...tokens import Doc, DocBin
-from ...language import DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH
 
 pretrain_string_listener = """
 [nlp]
@@ -346,3 +348,30 @@ def write_vectors_model(tmp_dir):
     nlp = English(vocab)
     nlp.to_disk(nlp_path)
     return str(nlp_path)
+
+
+def test_pretrain_default_vectors():
+    nlp = English()
+    nlp.add_pipe("tok2vec")
+    nlp.initialize()
+
+    # default vectors are supported
+    nlp.vocab.vectors = Vectors(shape=(10, 10))
+    create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model)
+
+    # error for no vectors
+    with pytest.raises(ValueError, match="E875"):
+        nlp.vocab.vectors = Vectors()
+        create_pretrain_vectors(1, 1, "cosine")(
+            nlp.vocab, nlp.get_pipe("tok2vec").model
+        )
+
+    # error for floret vectors
+    with pytest.raises(ValueError, match="E850"):
+        ops = get_current_ops()
+        nlp.vocab.vectors = Vectors(
+            data=ops.xp.zeros((10, 10)), mode="floret", hash_count=1
+        )
+        create_pretrain_vectors(1, 1, "cosine")(
+            nlp.vocab, nlp.get_pipe("tok2vec").model
+        )

From 3bf4539e31c5b2b151f46d22c9fa1a9ef346f972 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Tue, 7 Mar 2023 13:29:08 +0100
Subject: [PATCH 5/8] fix types (#12365)

---
 spacy/lexeme.pyi | 3 ++-
 spacy/lexeme.pyx | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/spacy/lexeme.pyi b/spacy/lexeme.pyi
index 4fcaa82cf..9b7a6156a 100644
--- a/spacy/lexeme.pyi
+++ b/spacy/lexeme.pyi
@@ -25,7 +25,8 @@ class Lexeme:
     def orth_(self) -> str: ...
     @property
     def text(self) -> str: ...
-    lower: str
+    orth: int
+    lower: int
     norm: int
     shape: int
     prefix: int
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 6c66effde..e70feaf9a 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -199,7 +199,7 @@ cdef class Lexeme:
         return self.orth_
 
     property lower:
-        """RETURNS (str): Lowercase form of the lexeme."""
+        """RETURNS (uint64): Lowercase form of the lexeme."""
         def __get__(self):
             return self.c.lower
 

From e656189ec35b15ea1fedbbafc115b91fea9f5957 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Wed, 8 Mar 2023 01:47:45 +0900
Subject: [PATCH 6/8] Change GPU efficient textcat to use CNN, not BOW in
 generated configs (#11900)

* Change GPU efficient textcat to use CNN, not BOW

If you generate a config with a textcat component using GPU
(transformers), the default option (efficiency) uses a BOW architecture,
which does not use tok2vec features. While that can make sense as part
of a larger pipeline, in the case of just a transformer and a textcat,
that means the transformer is doing a lot of work for no purpose.

This changes it so that the CNN architecture is used instead. It could
also be changed to be the same as the accuracy config, which uses the
ensemble architecture.

* Add the transformer when using a textcat with GPU

* Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928)

* Switch ubuntu-latest to ubuntu-20.04 in main tests

* Only use 20.04 for 3.6

* Require thinc v8.1.7

* Require thinc v8.1.8

* Break up longer expression

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 pyproject.toml                                |  2 +-
 requirements.txt                              |  2 +-
 setup.cfg                                     |  4 +--
 spacy/cli/templates/quickstart_training.jinja | 31 ++++++++++++++-----
 4 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7abd7a96f..9cd96ac2d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.1.0,<8.2.0",
+    "thinc>=8.1.8,<8.2.0",
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index bc9fc183c..63e03d558 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.1.0,<8.2.0
+thinc>=8.1.8,<8.2.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
diff --git a/setup.cfg b/setup.cfg
index cddc5148c..27499805b 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -39,7 +39,7 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.1.0,<8.2.0
+    thinc>=8.1.8,<8.2.0
 install_requires =
     # Our libraries
     spacy-legacy>=3.0.11,<3.1.0
@@ -47,7 +47,7 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.1.0,<8.2.0
+    thinc>=8.1.8,<8.2.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index b961ac892..441189341 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -24,8 +24,11 @@ gpu_allocator = null
 lang = "{{ lang }}"
 {%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
 {%- set with_accuracy = optimize == "accuracy" -%}
-{%- set has_accurate_textcat = has_textcat and with_accuracy -%}
-{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or has_accurate_textcat) -%}
+{# The BOW textcat doesn't need a source of features, so it can omit the
+tok2vec/transformer. #}
+{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
+{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
+{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
 {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
 {%- else -%}
 {%- set full_pipeline = components -%}
@@ -221,10 +224,16 @@ no_output_layer = false
 
 {% else -%}
 [components.textcat.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatCNN.v2"
 exclusive_classes = true
-ngram_size = 1
-no_output_layer = false
+nO = null
+
+[components.textcat.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.textcat.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
 {%- endif %}
 {%- endif %}
 
@@ -252,10 +261,16 @@ no_output_layer = false
 
 {% else -%}
 [components.textcat_multilabel.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatCNN.v2"
 exclusive_classes = false
-ngram_size = 1
-no_output_layer = false
+nO = null
+
+[components.textcat_multilabel.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.textcat_multilabel.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
 {%- endif %}
 {%- endif %}
 

From f53d945b2dcbbd81def1a140e4279e1118c2c0b6 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 8 Mar 2023 10:22:23 +0100
Subject: [PATCH 7/8] Fix merge

---
 pyproject.toml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index d0f38084d..9cd96ac2d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,6 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.1.6,<8.2.0",
     "thinc>=8.1.8,<8.2.0",
     "numpy>=1.15.0",
 ]

From 2713890eccbbaaa8fedfbbad86dc208c4b15a9de Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 8 Mar 2023 10:59:24 +0100
Subject: [PATCH 8/8] Update website/docs/api/spancategorizer.mdx

---
 website/docs/api/spancategorizer.mdx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/website/docs/api/spancategorizer.mdx b/website/docs/api/spancategorizer.mdx
index c9ae8e483..c7de2324b 100644
--- a/website/docs/api/spancategorizer.mdx
+++ b/website/docs/api/spancategorizer.mdx
@@ -68,11 +68,11 @@ architectures and their arguments and hyperparameters.
 >     "spans_key": "labeled_spans",
 >     "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
 >     "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
->     # Additional spancat_exclusive parameters
+>     # Additional spancat_singlelabel parameters
 >     "negative_weight": 0.8,
 >     "allow_overlap": True,
 > }
-> nlp.add_pipe("spancat_exclusive", config=config)
+> nlp.add_pipe("spancat_singlelabel", config=config)
 > ```
 
 | Setting                                             | Description                                                                                                                                                                                                                                                                                             |