From 79ef6cf0f9ca75468457c86d0d6fd0d8709a9308 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 2 Feb 2023 11:15:22 +0100 Subject: [PATCH 01/19] Have logging calls use string formatting types (#12215) * change logging call for spacy.LookupsDataLoader.v1 * substitutions in language and _util * various more substitutions * add string formatting guidelines to contribution guidelines --- CONTRIBUTING.md | 5 +++++ spacy/cli/_util.py | 4 ++-- spacy/cli/project/pull.py | 9 ++++++--- spacy/cli/project/push.py | 8 ++++---- spacy/language.py | 4 ++-- spacy/tests/test_language.py | 2 +- spacy/training/callbacks.py | 4 ++-- spacy/training/corpus.py | 2 +- spacy/training/initialize.py | 25 +++++++++++++------------ spacy/training/loop.py | 2 +- 10 files changed, 37 insertions(+), 28 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1f396bd71..f6f6dab59 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -173,6 +173,11 @@ formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its Python modules. If you've built spaCy from source, you'll already have both tools installed. +As a general rule of thumb, we use f-strings for any formatting of strings. +One exception is calls to Python's `logging` functionality. +To avoid unnecessary string conversions in these cases, we use string formatting +templates with `%s` and `%d` etc. + **⚠️ Note that formatting and linting is currently only possible for Python modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.** diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index ba3892b1d..f104feff9 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -90,9 +90,9 @@ def parse_config_overrides( cli_overrides = _parse_overrides(args, is_cli=True) if cli_overrides: keys = [k for k in cli_overrides if k not in env_overrides] - logger.debug(f"Config overrides from CLI: {keys}") + logger.debug("Config overrides from CLI: %s", keys) if env_overrides: - logger.debug(f"Config overrides from env variables: {list(env_overrides)}") + logger.debug("Config overrides from env variables: %s", list(env_overrides)) return {**cli_overrides, **env_overrides} diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py index 6e3cde88c..8894baa50 100644 --- a/spacy/cli/project/pull.py +++ b/spacy/cli/project/pull.py @@ -39,14 +39,17 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False): # in the list. while commands: for i, cmd in enumerate(list(commands)): - logger.debug(f"CMD: {cmd['name']}.") + logger.debug("CMD: %s.", cmd["name"]) deps = [project_dir / dep for dep in cmd.get("deps", [])] if all(dep.exists() for dep in deps): cmd_hash = get_command_hash("", "", deps, cmd["script"]) for output_path in cmd.get("outputs", []): url = storage.pull(output_path, command_hash=cmd_hash) logger.debug( - f"URL: {url} for {output_path} with command hash {cmd_hash}" + "URL: %s for %s with command hash %s", + url, + output_path, + cmd_hash, ) yield url, output_path commands.pop(i) break else: - logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.") + logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"]) else: # If we didn't break the for loop, break the while loop.
break diff --git a/spacy/cli/project/push.py b/spacy/cli/project/push.py index bc779e9cd..a8178de21 100644 --- a/spacy/cli/project/push.py +++ b/spacy/cli/project/push.py @@ -37,15 +37,15 @@ def project_push(project_dir: Path, remote: str): remote = config["remotes"][remote] storage = RemoteStorage(project_dir, remote) for cmd in config.get("commands", []): - logger.debug(f"CMD: cmd['name']") + logger.debug("CMD: %s", cmd["name"]) deps = [project_dir / dep for dep in cmd.get("deps", [])] if any(not dep.exists() for dep in deps): - logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs") + logger.debug("Dependency missing. Skipping %s outputs", cmd["name"]) continue cmd_hash = get_command_hash( "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"] ) - logger.debug(f"CMD_HASH: {cmd_hash}") + logger.debug("CMD_HASH: %s", cmd_hash) for output_path in cmd.get("outputs", []): output_loc = project_dir / output_path if output_loc.exists() and _is_not_empty_dir(output_loc): @@ -55,7 +55,7 @@ def project_push(project_dir: Path, remote: str): content_hash=get_content_hash(output_loc), ) logger.debug( - f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}" + "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash ) yield output_path, url diff --git a/spacy/language.py b/spacy/language.py index e0abfd5e7..9fdcf6328 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -104,7 +104,7 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]: @registry.misc("spacy.LookupsDataLoader.v1") def load_lookups_data(lang, tables): - util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}") + util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables) lookups = load_lookups(lang=lang, tables=tables) return lookups @@ -1969,7 +1969,7 @@ class Language: pipe = self.get_pipe(pipe_name) pipe_cfg = self._pipe_configs[pipe_name] if listeners: - util.logger.debug(f"Replacing listeners of component '{pipe_name}'") + util.logger.debug("Replacing listeners of component '%s'", pipe_name) if len(list(listeners)) != len(pipe_listeners): # The number of listeners defined in the component model doesn't # match the listeners to replace, so we won't be able to update diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 03790eb86..236856dad 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -46,7 +46,7 @@ def assert_sents_error(doc): def warn_error(proc_name, proc, docs, e): logger = logging.getLogger("spacy") - logger.warning(f"Trouble with component {proc_name}.") + logger.warning("Trouble with component %s.", proc_name) @pytest.fixture diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py index 426fddf90..7e2494f5b 100644 --- a/spacy/training/callbacks.py +++ b/spacy/training/callbacks.py @@ -11,7 +11,7 @@ def create_copy_from_base_model( ) -> Callable[[Language], Language]: def copy_from_base_model(nlp): if tokenizer: - logger.info(f"Copying tokenizer from: {tokenizer}") + logger.info("Copying tokenizer from: %s", tokenizer) base_nlp = load_model(tokenizer) if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]: nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"])) @@ -23,7 +23,7 @@ def create_copy_from_base_model( ) ) if vocab: - logger.info(f"Copying vocab from: {vocab}") + logger.info("Copying vocab from: %s", vocab) # only reload if the vocab is from a different model if tokenizer != vocab: base_nlp = load_model(vocab) diff 
--git a/spacy/training/corpus.py b/spacy/training/corpus.py index d626ad0e0..086ad831c 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -29,7 +29,7 @@ def create_docbin_reader( ) -> Callable[["Language"], Iterable[Example]]: if path is None: raise ValueError(Errors.E913) - util.logger.debug(f"Loading corpus from path: {path}") + util.logger.debug("Loading corpus from path: %s", path) return Corpus( path, gold_preproc=gold_preproc, diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 6304e4a84..e90617852 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -62,10 +62,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": frozen_components = T["frozen_components"] # Sourced components that require resume_training resume_components = [p for p in sourced if p not in frozen_components] - logger.info(f"Pipeline: {nlp.pipe_names}") + logger.info("Pipeline: %s", nlp.pipe_names) if resume_components: with nlp.select_pipes(enable=resume_components): - logger.info(f"Resuming training for: {resume_components}") + logger.info("Resuming training for: %s", resume_components) nlp.resume_training(sgd=optimizer) # Make sure that listeners are defined before initializing further nlp._link_components() @@ -73,16 +73,17 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": if T["max_epochs"] == -1: sample_size = 100 logger.debug( - f"Due to streamed train corpus, using only first {sample_size} " - f"examples for initialization. If necessary, provide all labels " - f"in [initialize]. More info: https://spacy.io/api/cli#init_labels" + "Due to streamed train corpus, using only first %s examples for initialization. " + "If necessary, provide all labels in [initialize]. 
" + "More info: https://spacy.io/api/cli#init_labels", + sample_size, ) nlp.initialize( lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer ) else: nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) - logger.info(f"Initialized pipeline components: {nlp.pipe_names}") + logger.info("Initialized pipeline components: %s", nlp.pipe_names) # Detect components with listeners that are not frozen consistently for name, proc in nlp.pipeline: for listener in getattr( @@ -109,7 +110,7 @@ def init_vocab( ) -> None: if lookups: nlp.vocab.lookups = lookups - logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}") + logger.info("Added vocab lookups: %s", ", ".join(lookups.tables)) data_path = ensure_path(data) if data_path is not None: lex_attrs = srsly.read_jsonl(data_path) @@ -125,11 +126,11 @@ def init_vocab( else: oov_prob = DEFAULT_OOV_PROB nlp.vocab.cfg.update({"oov_prob": oov_prob}) - logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab") + logger.info("Added %d lexical entries to the vocab", len(nlp.vocab)) logger.info("Created vocabulary") if vectors is not None: load_vectors_into_model(nlp, vectors) - logger.info(f"Added vectors: {vectors}") + logger.info("Added vectors: %s", vectors) # warn if source model vectors are not identical sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {}) vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"])) @@ -191,7 +192,7 @@ def init_tok2vec( if weights_data is not None: layer = get_tok2vec_ref(nlp, P) layer.from_bytes(weights_data) - logger.info(f"Loaded pretrained weights from {init_tok2vec}") + logger.info("Loaded pretrained weights from %s", init_tok2vec) return True return False @@ -216,13 +217,13 @@ def convert_vectors( nlp.vocab.deduplicate_vectors() else: if vectors_loc: - logger.info(f"Reading vectors from {vectors_loc}") + logger.info("Reading vectors from %s", vectors_loc) vectors_data, vector_keys, floret_settings = read_vectors( vectors_loc, truncate, mode=mode, ) - logger.info(f"Loaded vectors from {vectors_loc}") + logger.info("Loaded vectors from %s", vectors_loc) else: vectors_data, vector_keys = (None, None) if vector_keys is not None and mode != VectorsMode.floret: diff --git a/spacy/training/loop.py b/spacy/training/loop.py index 885257772..eca40e3d9 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -370,6 +370,6 @@ def clean_output_dir(path: Optional[Path]) -> None: if subdir.exists(): try: shutil.rmtree(str(subdir)) - logger.debug(f"Removed existing output directory: {subdir}") + logger.debug("Removed existing output directory: %s", subdir) except Exception as e: raise IOError(Errors.E901.format(path=path)) from e From 9a454676f3ccb0e2ecd53aa82e4108b84d5f3bb4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 3 Feb 2023 11:44:10 +0100 Subject: [PATCH 02/19] Use black version constraints from requirements.txt (#12220) --- .github/workflows/autoblack.yml | 2 +- azure-pipelines.yml | 2 +- requirements.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml index 70882c3cc..555322782 100644 --- a/.github/workflows/autoblack.yml +++ b/.github/workflows/autoblack.yml @@ -16,7 +16,7 @@ jobs: with: ref: ${{ github.head_ref }} - uses: actions/setup-python@v4 - - run: pip install black + - run: pip install black -c requirements.txt - name: Auto-format code if needed run: black spacy # We can't run black --check here because that returns a non-zero excit diff --git 
a/azure-pipelines.yml b/azure-pipelines.yml index 541656c3d..dba11bd1a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -41,7 +41,7 @@ jobs: inputs: versionSpec: "3.7" - script: | - pip install black==22.3.0 + pip install black -c requirements.txt python -m black spacy --check displayName: "black" - script: | diff --git a/requirements.txt b/requirements.txt index 1bd4518af..d6b0bc0dd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -37,4 +37,4 @@ types-mock>=0.1.1 types-setuptools>=57.0.0 types-requests types-setuptools>=57.0.0 -black>=22.0,<23.0 +black==22.3.0 From d38a88f0f3ca97776387780c2b79711d4971b09f Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 8 Feb 2023 14:18:33 +0100 Subject: [PATCH 03/19] Remove negation. (#12252) --- website/docs/api/cli.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index bd966015e..3f31bef95 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -1492,7 +1492,7 @@ $ python -m spacy project push [remote] [project_dir] ### project pull {id="project-pull",tag="command"} Download all files or directories listed as `outputs` for commands, unless they -are not already present locally. When searching for files in the remote, `pull` +are already present locally. When searching for files in the remote, `pull` won't just look at the output path, but will also consider the **command string** and the **hashes of the dependencies**. For instance, let's say you've previously pushed a checkpoint to the remote, but now you've changed some From 9d920bafcf4c03c6015deb67d6b0c335b8b04986 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 8 Feb 2023 14:33:16 +0100 Subject: [PATCH 04/19] Extend mypy to v1.0.x (#12245) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d6b0bc0dd..bc9fc183c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -31,7 +31,7 @@ pytest-timeout>=1.3.0,<2.0.0 mock>=2.0.0,<3.0.0 flake8>=3.8.0,<6.0.0 hypothesis>=3.27.0,<7.0.0 -mypy>=0.990,<0.1000; platform_machine != "aarch64" and python_version >= "3.7" +mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7" types-dataclasses>=0.1.3; python_version < "3.7" types-mock>=0.1.1 types-setuptools>=57.0.0 From 2d4fb94ba0a23523cc9adb65e0dcf92bbf6177b6 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 9 Feb 2023 12:58:14 +0100 Subject: [PATCH 05/19] Fix wrong file name in docs for rule-based matcher. (#12262) --- website/docs/usage/rule-based-matching.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx index 08d2b3b91..628c2953f 100644 --- a/website/docs/usage/rule-based-matching.mdx +++ b/website/docs/usage/rule-based-matching.mdx @@ -1442,8 +1442,8 @@ nlp.to_disk("/path/to/pipeline") The saved pipeline now includes the `"entity_ruler"` in its [`config.cfg`](/api/data-formats#config) and the pipeline directory contains a -file `entityruler.jsonl` with the patterns. When you load the pipeline back in, -all pipeline components will be restored and deserialized – including the entity +file `patterns.jsonl` with the patterns. When you load the pipeline back in, all +pipeline components will be restored and deserialized – including the entity ruler. This lets you ship powerful pipeline packages with binary weights _and_ rules included! 
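To make the corrected passage above concrete, here is a minimal sketch of the save/load round trip it describes (the path is a placeholder):

```python
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([{"label": "ORG", "pattern": "Explosion"}])

# Saving writes the patterns to patterns.jsonl inside the entity_ruler
# directory of the saved pipeline, alongside config.cfg.
nlp.to_disk("/path/to/pipeline")

# Loading restores the entity ruler together with its patterns.
nlp2 = spacy.load("/path/to/pipeline")
assert "entity_ruler" in nlp2.pipe_names
```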
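The logging guideline added to CONTRIBUTING.md in the first patch of this series is also worth a quick illustration. The point of the `%s`/`%d` templates is that `logging` defers the interpolation until a handler actually emits the record, whereas an f-string is always evaluated, even when the log level is disabled. A minimal sketch:

```python
import logging

logger = logging.getLogger("spacy")
tables = ["lexeme_norm", "lexeme_prob"]

# f-string: the message is formatted even if DEBUG logging is off.
logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")

# %-template: the message is only interpolated when the record is emitted.
logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
```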
From 61b84541378fddab19fc3507eb29745a3e5efd42 Mon Sep 17 00:00:00 2001 From: Edward <43848523+thomashacker@users.noreply.github.com> Date: Wed, 15 Feb 2023 12:32:53 +0100 Subject: [PATCH 06/19] Adjust return type of `registry.find` (#12227) * Fix registry find return type * add dot * Add type ignore for mypy * update black formatting version * add mypy ignore to package cli * mypy type fix (for real) * Update find description in spacy/util.py Co-authored-by: Raphael Mitsch * adjust mypy directive --------- Co-authored-by: Raphael Mitsch --- spacy/cli/package.py | 2 +- spacy/util.py | 13 +++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 324c5d1bb..6351f28eb 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -252,7 +252,7 @@ def get_third_party_dependencies( raise regerr from None module_name = func_info.get("module") # type: ignore[attr-defined] if module_name: # the code is part of a module, not a --code file - modules.add(func_info["module"].split(".")[0]) # type: ignore[index] + modules.add(func_info["module"].split(".")[0]) # type: ignore[union-attr] dependencies = [] for module_name in modules: if module_name in distributions: diff --git a/spacy/util.py b/spacy/util.py index 8bf8fb1b0..dc7a4efe0 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -144,8 +144,17 @@ class registry(thinc.registry): return func @classmethod - def find(cls, registry_name: str, func_name: str) -> Callable: - """Get info about a registered function from the registry.""" + def find( + cls, registry_name: str, func_name: str + ) -> Dict[str, Optional[Union[str, int]]]: + """Find information about a registered function, including the + module and path to the file it's defined in, the line number and the + docstring, if available. + + registry_name (str): Name of the catalogue registry. + func_name (str): Name of the registered function. + RETURNS (Dict[str, Optional[Union[str, int]]]): The function info. + """ # We're overwriting this classmethod so we're able to provide more # specific error messages and implement a fallback to spacy-legacy. if not hasattr(cls, registry_name): From 80bc140533092ab129568e85975fa6fb76f97dd4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 16 Feb 2023 17:57:02 +0100 Subject: [PATCH 07/19] Add grc to langs with lexeme norms in spacy-lookups-data (#12287) --- spacy/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index dc7a4efe0..38ba7b1b5 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -60,7 +60,7 @@ if TYPE_CHECKING: # fmt: off OOV_RANK = numpy.iinfo(numpy.uint64).max DEFAULT_OOV_PROB = -20 -LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"] +LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"] # Default order of sections in the config file. Not all sections needs to exist, # and additional sections are added at the end, in alphabetical order. From daedc45d050b15be8c5422aadff7b652439a562d Mon Sep 17 00:00:00 2001 From: andyjessen <62343929+andyjessen@users.noreply.github.com> Date: Thu, 23 Feb 2023 01:37:40 -0700 Subject: [PATCH 08/19] Fix FUZZY operator definition (#12318) * Fix FUZZY operator definition The default length of the FUZZY operator is 2 and not 3. 
* adjust edit distance in matcher usage docs too --------- Co-authored-by: svlandeg --- website/docs/usage/rule-based-matching.mdx | 4 ++-- website/docs/usage/v3-5.mdx | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx index 628c2953f..bad049479 100644 --- a/website/docs/usage/rule-based-matching.mdx +++ b/website/docs/usage/rule-based-matching.mdx @@ -384,10 +384,10 @@ the more specific attributes `FUZZY1`..`FUZZY9` you can specify the maximum allowed edit distance directly. ```python -# Match lowercase with fuzzy matching (allows 3 edits) +# Match lowercase with fuzzy matching (allows 2 edits by default) pattern = [{"LOWER": {"FUZZY": "definitely"}}] -# Match custom attribute values with fuzzy matching (allows 3 edits) +# Match custom attribute values with fuzzy matching (allows 2 edits by default) pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}] # Match with exact Levenshtein edit distance limits (allows 4 edits) diff --git a/website/docs/usage/v3-5.mdx b/website/docs/usage/v3-5.mdx index 3ca64f8a2..54c976fe5 100644 --- a/website/docs/usage/v3-5.mdx +++ b/website/docs/usage/v3-5.mdx @@ -70,13 +70,13 @@ distance of 2 and up to 30% of the pattern string length. `FUZZY1`..`FUZZY9` can be used to specify the exact number of allowed edits. ```python -# Match lowercase with fuzzy matching (allows up to 3 edits) +# Match lowercase with fuzzy matching (allows 2 edits by default) pattern = [{"LOWER": {"FUZZY": "definitely"}}] -# Match custom attribute values with fuzzy matching (allows up to 3 edits) +# Match custom attribute values with fuzzy matching (allows 2 edits by default) pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}] -# Match with exact Levenshtein edit distance limits (allows up to 4 edits) +# Match with exact Levenshtein edit distance limits (allows 4 edits) pattern = [{"_": {"country": {"FUZZY4": "Kyrgyzstan"}}}] ``` From 1e8bac99f3febd7c00ba53cc8efebf5d6f989a8b Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 23 Feb 2023 18:22:57 +0900 Subject: [PATCH 09/19] Add tests for projects to master (#12303) * Add tests for projects to master * Fix git clone related issues on Windows * Add stat import --- spacy/tests/test_cli_app.py | 137 ++++++++++++++++++++++++++++++++++++ spacy/util.py | 10 ++- 2 files changed, 146 insertions(+), 1 deletion(-) diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py index 40100412a..8aaadf686 100644 --- a/spacy/tests/test_cli_app.py +++ b/spacy/tests/test_cli_app.py @@ -1,5 +1,7 @@ import os from pathlib import Path +import pytest +import srsly from typer.testing import CliRunner from spacy.tokens import DocBin, Doc @@ -89,3 +91,138 @@ def test_debug_data_trainable_lemmatizer_cli(en_vocab): # Instead of checking specific wording of the output, which may change, # we'll check that this section of the debug output is present. 
assert "= Trainable Lemmatizer =" in result_debug_data.stdout + + +# project tests + +SAMPLE_PROJECT = { + "title": "Sample project", + "description": "This is a project for testing", + "assets": [ + { + "dest": "assets/spacy-readme.md", + "url": "https://github.com/explosion/spaCy/raw/dec81508d28b47f09a06203c472b37f00db6c869/README.md", + "checksum": "411b2c89ccf34288fae8ed126bf652f7", + }, + { + "dest": "assets/citation.cff", + "url": "https://github.com/explosion/spaCy/raw/master/CITATION.cff", + "checksum": "c996bfd80202d480eb2e592369714e5e", + "extra": True, + }, + ], + "commands": [ + { + "name": "ok", + "help": "print ok", + "script": ["python -c \"print('okokok')\""], + }, + { + "name": "create", + "help": "make a file", + "script": ["touch abc.txt"], + "outputs": ["abc.txt"], + }, + { + "name": "clean", + "help": "remove test file", + "script": ["rm abc.txt"], + }, + ], +} + +SAMPLE_PROJECT_TEXT = srsly.yaml_dumps(SAMPLE_PROJECT) + + +@pytest.fixture +def project_dir(): + with make_tempdir() as pdir: + (pdir / "project.yml").write_text(SAMPLE_PROJECT_TEXT) + yield pdir + + +def test_project_document(project_dir): + readme_path = project_dir / "README.md" + assert not readme_path.exists(), "README already exists" + result = CliRunner().invoke( + app, ["project", "document", str(project_dir), "-o", str(readme_path)] + ) + assert result.exit_code == 0 + assert readme_path.is_file() + text = readme_path.read_text("utf-8") + assert SAMPLE_PROJECT["description"] in text + + +def test_project_assets(project_dir): + asset_dir = project_dir / "assets" + assert not asset_dir.exists(), "Assets dir is already present" + result = CliRunner().invoke(app, ["project", "assets", str(project_dir)]) + assert result.exit_code == 0 + assert (asset_dir / "spacy-readme.md").is_file(), "Assets not downloaded" + # check that extras work + result = CliRunner().invoke(app, ["project", "assets", "--extra", str(project_dir)]) + assert result.exit_code == 0 + assert (asset_dir / "citation.cff").is_file(), "Extras not downloaded" + + +def test_project_run(project_dir): + # make sure dry run works + test_file = project_dir / "abc.txt" + result = CliRunner().invoke( + app, ["project", "run", "--dry", "create", str(project_dir)] + ) + assert result.exit_code == 0 + assert not test_file.is_file() + result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)]) + assert result.exit_code == 0 + assert test_file.is_file() + result = CliRunner().invoke(app, ["project", "run", "ok", str(project_dir)]) + assert result.exit_code == 0 + assert "okokok" in result.stdout + + +@pytest.mark.parametrize( + "options", + [ + "", + # "--sparse", + "--branch v3", + "--repo https://github.com/explosion/projects --branch v3", + ], +) +def test_project_clone(options): + with make_tempdir() as workspace: + out = workspace / "project" + target = "benchmarks/ner_conll03" + if not options: + options = [] + else: + options = options.split() + result = CliRunner().invoke( + app, ["project", "clone", target, *options, str(out)] + ) + assert result.exit_code == 0 + assert (out / "README.md").is_file() + + +def test_project_push_pull(project_dir): + proj = dict(SAMPLE_PROJECT) + remote = "xyz" + + with make_tempdir() as remote_dir: + proj["remotes"] = {remote: str(remote_dir)} + proj_text = srsly.yaml_dumps(proj) + (project_dir / "project.yml").write_text(proj_text) + + test_file = project_dir / "abc.txt" + result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)]) + assert result.exit_code == 0 + assert 
test_file.is_file() + result = CliRunner().invoke(app, ["project", "push", remote, str(project_dir)]) + assert result.exit_code == 0 + result = CliRunner().invoke(app, ["project", "run", "clean", str(project_dir)]) + assert result.exit_code == 0 + assert not test_file.exists() + result = CliRunner().invoke(app, ["project", "pull", remote, str(project_dir)]) + assert result.exit_code == 0 + assert test_file.is_file() diff --git a/spacy/util.py b/spacy/util.py index 38ba7b1b5..8cc89217d 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -32,6 +32,7 @@ import inspect import pkgutil import logging import socket +import stat try: import cupy.random @@ -1050,8 +1051,15 @@ def make_tempdir() -> Generator[Path, None, None]: """ d = Path(tempfile.mkdtemp()) yield d + + # On Windows, git clones use read-only files, which cause permission errors + # when being deleted. This forcibly fixes permissions. + def force_remove(rmfunc, path, ex): + os.chmod(path, stat.S_IWRITE) + rmfunc(path) + try: - shutil.rmtree(str(d)) + shutil.rmtree(str(d), onerror=force_remove) except PermissionError as e: warnings.warn(Warnings.W091.format(dir=d, msg=e)) From acdd993071319cd7b02a651ac0d046a16e89695e Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Sun, 26 Feb 2023 23:35:08 -0800 Subject: [PATCH 10/19] Matcher performance fix for extension predicates: use shared key function (#12272) * standardize predicate key format * single key function * Make optional args in key function keyword-only --------- Co-authored-by: Adriane Boyd --- spacy/matcher/matcher.pyx | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index ea1b4b66b..b886bd2ec 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -828,6 +828,11 @@ def _get_attr_values(spec, string_store): return attr_values +def _predicate_cache_key(attr, predicate, value, *, regex=False, fuzzy=None): + # tuple order affects performance + return (attr, regex, fuzzy, predicate, srsly.json_dumps(value, sort_keys=True)) + + # These predicate helper classes are used to match the REGEX, IN, >= etc # extensions to the matcher introduced in #3173. 
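As a rough illustration of why a single shared key function matters here: equivalent predicate specs coming from different patterns now produce identical cache keys, so the matcher can reuse one cached predicate object instead of evaluating duplicates for every token. A standalone sketch of the behavior (assuming `srsly` is installed):

```python
import srsly

def _predicate_cache_key(attr, predicate, value, *, regex=False, fuzzy=None):
    # tuple order affects performance
    return (attr, regex, fuzzy, predicate, srsly.json_dumps(value, sort_keys=True))

# Identical predicate specs from two different patterns map to one key,
# so the cached predicate is shared rather than evaluated twice per token.
key_a = _predicate_cache_key("LOWER", "IN", ["kyrgyzstan", "kirghizia"])
key_b = _predicate_cache_key("LOWER", "IN", ["kyrgyzstan", "kirghizia"])
assert key_a == key_b
```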
@@ -847,7 +852,7 @@ class _FuzzyPredicate: fuzz = self.predicate[len("FUZZY"):] # number after prefix self.fuzzy = int(fuzz) if fuzz else -1 self.fuzzy_compare = fuzzy_compare - self.key = (self.attr, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True)) + self.key = _predicate_cache_key(self.attr, self.predicate, value, fuzzy=self.fuzzy) def __call__(self, Token token): if self.is_extension: @@ -869,7 +874,7 @@ class _RegexPredicate: self.value = re.compile(value) self.predicate = predicate self.is_extension = is_extension - self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) + self.key = _predicate_cache_key(self.attr, self.predicate, value) if self.predicate not in self.operators: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) @@ -905,7 +910,7 @@ class _SetPredicate: self.value = set(get_string_id(v) for v in value) self.predicate = predicate self.is_extension = is_extension - self.key = (self.attr, self.regex, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True)) + self.key = _predicate_cache_key(self.attr, self.predicate, value, regex=self.regex, fuzzy=self.fuzzy) if self.predicate not in self.operators: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) @@ -977,7 +982,7 @@ class _ComparisonPredicate: self.value = value self.predicate = predicate self.is_extension = is_extension - self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) + self.key = _predicate_cache_key(self.attr, self.predicate, value) if self.predicate not in self.operators: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) @@ -1092,7 +1097,7 @@ def _get_extension_extra_predicates(spec, extra_predicates, predicate_types, if isinstance(value, dict): for type_, cls in predicate_types.items(): if type_ in value: - key = (attr, type_, srsly.json_dumps(value[type_], sort_keys=True)) + key = _predicate_cache_key(attr, type_, value[type_]) if key in seen_predicates: output.append(seen_predicates[key]) else: From 4539fbae176295fd271855cdccb25820eef1ca96 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 27 Feb 2023 09:48:36 +0100 Subject: [PATCH 11/19] Revert "Fix FUZZY operator definition (#12318)" (#12336) This reverts commit daedc45d050b15be8c5422aadff7b652439a562d. The default length depends on the length of the pattern string and was correct for this example. --- website/docs/usage/rule-based-matching.mdx | 4 ++-- website/docs/usage/v3-5.mdx | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx index bad049479..628c2953f 100644 --- a/website/docs/usage/rule-based-matching.mdx +++ b/website/docs/usage/rule-based-matching.mdx @@ -384,10 +384,10 @@ the more specific attributes `FUZZY1`..`FUZZY9` you can specify the maximum allowed edit distance directly. 
```python -# Match lowercase with fuzzy matching (allows 2 edits by default) +# Match lowercase with fuzzy matching (allows 3 edits) pattern = [{"LOWER": {"FUZZY": "definitely"}}] -# Match custom attribute values with fuzzy matching (allows 2 edits by default) +# Match custom attribute values with fuzzy matching (allows 3 edits) pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}] # Match with exact Levenshtein edit distance limits (allows 4 edits) diff --git a/website/docs/usage/v3-5.mdx b/website/docs/usage/v3-5.mdx index 54c976fe5..3ca64f8a2 100644 --- a/website/docs/usage/v3-5.mdx +++ b/website/docs/usage/v3-5.mdx @@ -70,13 +70,13 @@ distance of 2 and up to 30% of the pattern string length. `FUZZY1`..`FUZZY9` can be used to specify the exact number of allowed edits. ```python -# Match lowercase with fuzzy matching (allows 2 edits by default) +# Match lowercase with fuzzy matching (allows up to 3 edits) pattern = [{"LOWER": {"FUZZY": "definitely"}}] -# Match custom attribute values with fuzzy matching (allows 2 edits by default) +# Match custom attribute values with fuzzy matching (allows up to 3 edits) pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}] -# Match with exact Levenshtein edit distance limits (allows 4 edits) +# Match with exact Levenshtein edit distance limits (allows up to 4 edits) pattern = [{"_": {"country": {"FUZZY4": "Kyrgyzstan"}}}] ``` From e2de188cf1a70f8aa931cb4f9648fb906fece188 Mon Sep 17 00:00:00 2001 From: lise-brinck <104826278+lise-brinck@users.noreply.github.com> Date: Mon, 27 Feb 2023 10:53:45 +0100 Subject: [PATCH 12/19] Bugfix/swedish tokenizer (#12315) * add unittest for explosion#12311 * create punctuation.py for swedish * removed : from infixes in swedish punctuation.py * allow : as infix if succeeding char is uppercase --- spacy/lang/sv/__init__.py | 5 +-- spacy/lang/sv/punctuation.py | 33 +++++++++++++++++++ .../tests/lang/sv/test_prefix_suffix_infix.py | 7 ++++ 3 files changed, 41 insertions(+), 4 deletions(-) create mode 100644 spacy/lang/sv/punctuation.py diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 6963e8b79..28e5085a8 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -6,10 +6,7 @@ from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language, BaseDefaults from ...pipeline import Lemmatizer - - -# Punctuation stolen from Danish -from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES +from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES class SwedishDefaults(BaseDefaults): diff --git a/spacy/lang/sv/punctuation.py b/spacy/lang/sv/punctuation.py new file mode 100644 index 000000000..67f1bcdc4 --- /dev/null +++ b/spacy/lang/sv/punctuation.py @@ -0,0 +1,33 @@ +from ..char_classes import LIST_ELLIPSES, LIST_ICONS +from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER +from ..punctuation import TOKENIZER_SUFFIXES + + +_quotes = CONCAT_QUOTES.replace("'", "") + +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), + r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])[<>=](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}]):(?=[{a}])".format(a=ALPHA_UPPER), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes), + r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9]):(?=[{a}])".format(a=ALPHA_UPPER), + ] +) + +_suffixes = [ + 
suffix + for suffix in TOKENIZER_SUFFIXES + if suffix not in ["'s", "'S", "’s", "’S", r"\'"] +] +_suffixes += [r"(?<=[^sSxXzZ])\'"] + + +TOKENIZER_INFIXES = _infixes +TOKENIZER_SUFFIXES = _suffixes diff --git a/spacy/tests/lang/sv/test_prefix_suffix_infix.py b/spacy/tests/lang/sv/test_prefix_suffix_infix.py index bbb0ff415..0aa495992 100644 --- a/spacy/tests/lang/sv/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/sv/test_prefix_suffix_infix.py @@ -32,3 +32,10 @@ def test_tokenizer_splits_comma_infix(sv_tokenizer, text): def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text): tokens = sv_tokenizer(text) assert len(tokens) == 3 + + +@pytest.mark.issue(12311) +@pytest.mark.parametrize("text", ["99:e", "c:a", "EU:s", "Maj:t"]) +def test_sv_tokenizer_handles_colon(sv_tokenizer, text): + tokens = sv_tokenizer(text) + assert len(tokens) == 1 From 071667376a429da5420ba6332005c05a444d3f9f Mon Sep 17 00:00:00 2001 From: TAN Long <71320000+tanloong@users.noreply.github.com> Date: Tue, 28 Feb 2023 21:36:33 +0800 Subject: [PATCH 13/19] Add new REL_OPs: `>+`, `>-`, `<+`, and `<-` (#12334) * Add immediate left/right child/parent dependency relations * Add tests for new REL_OPs: `>+`, `>-`, `<+`, and `<-`. --------- Co-authored-by: Tan Long --- spacy/matcher/dependencymatcher.pyx | 26 +++++++++++++++++++ .../tests/matcher/test_dependency_matcher.py | 16 ++++++++++++ website/docs/api/dependencymatcher.mdx | 4 +++ website/docs/usage/rule-based-matching.mdx | 8 ++++++ 4 files changed, 54 insertions(+) diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 74c2d002f..adf96702b 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -82,8 +82,12 @@ cdef class DependencyMatcher: "$-": self._imm_left_sib, "$++": self._right_sib, "$--": self._left_sib, + ">+": self._imm_right_child, + ">-": self._imm_left_child, ">++": self._right_child, ">--": self._left_child, + "<+": self._imm_right_parent, + "<-": self._imm_left_parent, "<++": self._right_parent, "<--": self._left_parent, } @@ -427,12 +431,34 @@ cdef class DependencyMatcher: def _left_sib(self, doc, node): return [doc[child.i] for child in doc[node].head.children if child.i < node] + def _imm_right_child(self, doc, node): + for child in doc[node].children: + if child.i == node + 1: + return [doc[child.i]] + return [] + + def _imm_left_child(self, doc, node): + for child in doc[node].children: + if child.i == node - 1: + return [doc[child.i]] + return [] + def _right_child(self, doc, node): return [doc[child.i] for child in doc[node].children if child.i > node] def _left_child(self, doc, node): return [doc[child.i] for child in doc[node].children if child.i < node] + def _imm_right_parent(self, doc, node): + if doc[node].head.i == node + 1: + return [doc[node].head] + return [] + + def _imm_left_parent(self, doc, node): + if doc[node].head.i == node - 1: + return [doc[node].head] + return [] + def _right_parent(self, doc, node): if doc[node].head.i > node: return [doc[node].head] diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py index b4e19d69d..200384320 100644 --- a/spacy/tests/matcher/test_dependency_matcher.py +++ b/spacy/tests/matcher/test_dependency_matcher.py @@ -316,16 +316,32 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches): ("the", "brown", "$--", 0), ("brown", "the", "$--", 1), ("brown", "brown", "$--", 0), + ("over", "jumped", "<+", 0), + ("quick", "fox", "<+", 0), + ("the", "quick", 
"<+", 0), + ("brown", "fox", "<+", 1), ("quick", "fox", "<++", 1), ("quick", "over", "<++", 0), ("over", "jumped", "<++", 0), ("the", "fox", "<++", 2), + ("brown", "fox", "<-", 0), + ("fox", "over", "<-", 0), + ("the", "over", "<-", 0), + ("over", "jumped", "<-", 1), ("brown", "fox", "<--", 0), ("fox", "jumped", "<--", 0), ("fox", "over", "<--", 1), + ("fox", "brown", ">+", 0), + ("over", "fox", ">+", 0), + ("over", "the", ">+", 0), + ("jumped", "over", ">+", 1), ("jumped", "over", ">++", 1), ("fox", "lazy", ">++", 0), ("over", "the", ">++", 0), + ("jumped", "over", ">-", 0), + ("fox", "quick", ">-", 0), + ("brown", "quick", ">-", 0), + ("fox", "brown", ">-", 1), ("brown", "fox", ">--", 0), ("fox", "brown", ">--", 1), ("jumped", "fox", ">--", 1), diff --git a/website/docs/api/dependencymatcher.mdx b/website/docs/api/dependencymatcher.mdx index 390034a6c..cad5185f7 100644 --- a/website/docs/api/dependencymatcher.mdx +++ b/website/docs/api/dependencymatcher.mdx @@ -82,8 +82,12 @@ come directly from | `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | | `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | | `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| `A >+ B` | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A >- B` | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | | `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | | `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | +| `A <+ B` | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A <- B` | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | | `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | | `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx index 628c2953f..6a11ac8bd 100644 --- a/website/docs/usage/rule-based-matching.mdx +++ b/website/docs/usage/rule-based-matching.mdx @@ -1110,6 +1110,14 @@ come directly from | `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | | `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | | `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| `A >+ B` | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A >- B` | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | +| `A <+ B` | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A <- B` | `B` is a left immediate parent of `A`, i.e. 
`A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | ### Designing dependency matcher patterns {id="dependencymatcher-patterns"} From 8f058e39bd95da1f14d0071452b4d58103014dc7 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 28 Feb 2023 16:36:03 +0100 Subject: [PATCH 14/19] Fix error message for displacy auto_select_port (#12343) --- spacy/errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/errors.py b/spacy/errors.py index d143e341c..ab013f3eb 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -967,7 +967,7 @@ class Errors(metaclass=ErrorsWithCodes): E1049 = ("No available port found for displaCy on host {host}. Please specify an available port " "with `displacy.serve(doc, port=port)`") E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` " - "or use `auto_switch_port=True` to pick an available port automatically.") + "or use `auto_select_port=True` to pick an available port automatically.") # Deprecated model shortcuts, only used in errors and warnings From 33864f1d07cba3291aaa51a20eb9482d7d1ee734 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 1 Mar 2023 10:46:13 +0100 Subject: [PATCH 15/19] Add new tags in docs for #12334 (#12348) --- website/docs/api/dependencymatcher.mdx | 44 +++++++++++----------- website/docs/usage/rule-based-matching.mdx | 44 +++++++++++----------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/website/docs/api/dependencymatcher.mdx b/website/docs/api/dependencymatcher.mdx index cad5185f7..14e0916d1 100644 --- a/website/docs/api/dependencymatcher.mdx +++ b/website/docs/api/dependencymatcher.mdx @@ -68,28 +68,28 @@ The following operators are supported by the `DependencyMatcher`, most of which come directly from [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): -| Symbol | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------- | -| `A < B` | `A` is the immediate dependent of `B`. | -| `A > B` | `A` is the immediate head of `B`. | -| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | -| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | -| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | -| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | -| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | -| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | -| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | -| `A >+ B` | `B` is a right immediate child of `A`, i.e. 
`A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | -| `A >- B` | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | -| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | -| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | -| `A <+ B` | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | -| `A <- B` | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | -| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | -| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | +| Symbol | Description | +| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | +| `A < B` | `A` is the immediate dependent of `B`. | +| `A > B` | `A` is the immediate head of `B`. | +| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | +| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | +| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | +| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | +| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | +| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | +| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | +| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. 
| ## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"} diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx index 6a11ac8bd..55c043015 100644 --- a/website/docs/usage/rule-based-matching.mdx +++ b/website/docs/usage/rule-based-matching.mdx @@ -1096,28 +1096,28 @@ The following operators are supported by the `DependencyMatcher`, most of which come directly from [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): -| Symbol | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------- | -| `A < B` | `A` is the immediate dependent of `B`. | -| `A > B` | `A` is the immediate head of `B`. | -| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | -| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | -| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | -| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | -| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | -| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | -| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | -| `A >+ B` | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | -| `A >- B` | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | -| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | -| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | -| `A <+ B` | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | -| `A <- B` | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | -| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | -| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | +| Symbol | Description | +| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | +| `A < B` | `A` is the immediate dependent of `B`. | +| `A > B` | `A` is the immediate head of `B`. | +| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | +| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | +| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | +| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A ; B` | `A` immediately follows `B`, i.e. 
`A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | +| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | +| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | +| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | +| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | ### Designing dependency matcher patterns {id="dependencymatcher-patterns"} From efbc3d37b36fe1df14b23a746275cdbe19163e9b Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 1 Mar 2023 11:01:35 +0100 Subject: [PATCH 16/19] Update docs w.r.t. spacy.CandidateBatchGenerator.v1. (#12350) --- website/docs/api/architectures.mdx | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx index 2a1bc4380..966b5830a 100644 --- a/website/docs/api/architectures.mdx +++ b/website/docs/api/architectures.mdx @@ -924,6 +924,15 @@ plausible [`Candidate`](/api/kb/#candidate) objects. The default `CandidateGenerator` uses the text of a mention to find its potential aliases in the `KnowledgeBase`. Note that this function is case-dependent. +### spacy.CandidateBatchGenerator.v1 {id="CandidateBatchGenerator"} + +A function that takes as input a [`KnowledgeBase`](/api/kb) and an `Iterable` of +[`Span`](/api/span) objects denoting named entities, and returns a list of +plausible [`Candidate`](/api/kb/#candidate) objects per specified +[`Span`](/api/span). The default `CandidateBatchGenerator` uses the text of a +mention to find its potential aliases in the `KnowledgeBase`. Note that this +function is case-dependent. 
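For reference, a function satisfying this contract can be quite small. The sketch below (the function name is illustrative) simply delegates to the per-mention lookup; a real implementation might instead batch the queries against an external store:

```python
from typing import Iterable

from spacy.kb import Candidate, KnowledgeBase
from spacy.tokens import Span


def my_candidate_batch_generator(
    kb: KnowledgeBase, mentions: Iterable[Span]
) -> Iterable[Iterable[Candidate]]:
    # Delegate to the per-mention lookup; batched queries against an
    # external store would go here instead.
    return [kb.get_candidates(mention) for mention in mentions]
```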
+ ## Coreference {id="coref-architectures",tag="experimental"} A [`CoreferenceResolver`](/api/coref) component identifies tokens that refer to From 74cae47bf65d99dbe50b0fe95f04141779c8005b Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 1 Mar 2023 12:06:07 +0100 Subject: [PATCH 17/19] rely on is_empty property instead of __len__ (#12347) --- spacy/errors.py | 3 +-- spacy/kb/kb_in_memory.pyx | 3 +++ spacy/pipeline/entity_linker.py | 2 +- spacy/tests/pipeline/test_entity_linker.py | 3 +++ 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index ab013f3eb..2c8b98aad 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -444,8 +444,7 @@ class Errors(metaclass=ErrorsWithCodes): E133 = ("The sum of prior probabilities for alias '{alias}' should not " "exceed 1, but found {sum}.") E134 = ("Entity '{entity}' is not defined in the Knowledge Base.") - E139 = ("Knowledge base for component '{name}' is empty. Use the methods " - "`kb.add_entity` and `kb.add_alias` to add entries.") + E139 = ("Knowledge base for component '{name}' is empty.") E140 = ("The list of entities, prior probabilities and entity vectors " "should be of equal length.") E141 = ("Entity vectors should be of length {required} instead of the " diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index edba523cf..2a74d047b 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -46,6 +46,9 @@ cdef class InMemoryLookupKB(KnowledgeBase): self._alias_index = PreshMap(nr_aliases + 1) self._aliases_table = alias_vec(nr_aliases + 1) + def is_empty(self): + return len(self) == 0 + def __len__(self): return self.get_size_entities() diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 62845287b..a11964117 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -250,7 +250,7 @@ class EntityLinker(TrainablePipe): # Raise an error if the knowledge base is not initialized. 
if self.kb is None: raise ValueError(Errors.E1018.format(name=self.name)) - if len(self.kb) == 0: + if hasattr(self.kb, "is_empty") and self.kb.is_empty(): raise ValueError(Errors.E139.format(name=self.name)) def initialize( diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 99f164f15..2a6258386 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -353,6 +353,9 @@ def test_kb_default(nlp): """Test that the default (empty) KB is loaded upon construction""" entity_linker = nlp.add_pipe("entity_linker", config={}) assert len(entity_linker.kb) == 0 + with pytest.raises(ValueError, match="E139"): + # this raises an error because the KB is empty + entity_linker.validate_kb() assert entity_linker.kb.get_size_entities() == 0 assert entity_linker.kb.get_size_aliases() == 0 # 64 is the default value from pipeline.entity_linker From 56aa0cc75fbbfc55d95541392675092cb1e2e782 Mon Sep 17 00:00:00 2001 From: kadarakos Date: Wed, 1 Mar 2023 15:38:23 +0100 Subject: [PATCH 18/19] Displacy doc fix (#12352) * more details for color setting * more details for color setting * prettier --- website/docs/api/top-level.mdx | 32 +++++++++++++++--------------- website/docs/usage/visualizers.mdx | 12 +++++------ 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx index 9748719d7..d0851a59f 100644 --- a/website/docs/api/top-level.mdx +++ b/website/docs/api/top-level.mdx @@ -354,22 +354,22 @@ If a setting is not present in the options, the default value will be used. > displacy.serve(doc, style="dep", options=options) > ``` -| Name | Description | -| ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- | -| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ | -| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ | -| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ | -| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ | -| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | -| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ | -| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ | -| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | -| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ | -| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ | -| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ | -| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ | -| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ | -| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. 
From 56aa0cc75fbbfc55d95541392675092cb1e2e782 Mon Sep 17 00:00:00 2001
From: kadarakos
Date: Wed, 1 Mar 2023 15:38:23 +0100
Subject: [PATCH 18/19] Displacy doc fix (#12352)

* more details for color setting
* more details for color setting
* prettier

---
 website/docs/api/top-level.mdx     | 32 +++++++++++++++---------------
 website/docs/usage/visualizers.mdx | 12 +++++------
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx
index 9748719d7..d0851a59f 100644
--- a/website/docs/api/top-level.mdx
+++ b/website/docs/api/top-level.mdx
@@ -354,22 +354,22 @@ If a setting is not present in the options, the default value will be used.
 > displacy.serve(doc, style="dep", options=options)
 > ```

-| Name               | Description |
-| ------------------ | ----------- |
-| `fine_grained`     | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
-| `add_lemma`        | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
-| `collapse_punct`   | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
-| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
-| `compact`          | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
-| `color`            | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ |
-| `bg`               | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ |
-| `font`             | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
-| `offset_x`         | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ |
-| `arrow_stroke`     | Width of arrow path in px. Defaults to `2`. ~~int~~ |
-| `arrow_width`      | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ |
-| `arrow_spacing`    | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ |
-| `word_spacing`     | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ |
-| `distance`         | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ |
+| Name               | Description |
+| ------------------ | ----------- |
+| `fine_grained`     | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
+| `add_lemma`        | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
+| `collapse_punct`   | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
+| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
+| `compact`          | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
+| `color`            | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
+| `bg`               | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
+| `font`             | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
+| `offset_x`         | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ |
+| `arrow_stroke`     | Width of arrow path in px. Defaults to `2`. ~~int~~ |
+| `arrow_width`      | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ |
+| `arrow_spacing`    | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ |
+| `word_spacing`     | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ |
+| `distance`         | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ |

 #### Named Entity Visualizer options {id="displacy_options-ent"}

diff --git a/website/docs/usage/visualizers.mdx b/website/docs/usage/visualizers.mdx
index 1d3682af4..c372744de 100644
--- a/website/docs/usage/visualizers.mdx
+++ b/website/docs/usage/visualizers.mdx
@@ -58,12 +58,12 @@ arcs.

-| Argument  | Description |
-| --------- | ----------- |
-| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
-| `color`   | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ |
-| `bg`      | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ |
-| `font`    | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
+| Argument  | Description |
+| --------- | ----------- |
+| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
+| `color`   | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
+| `bg`      | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
+| `font`    | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |

 For a list of all available options, see the
 [`displacy` API documentation](/api/top-level#displacy_options).
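Since the updated tables only describe the accepted values, a short rendering example may help; it follows the `displacy.serve` snippet already shown in `top-level.mdx` (illustrative only; assumes the `en_core_web_sm` model is installed):

```python
# Any CSS color string from the updated tables works for "color" and "bg":
# "#00ff00", "rgb(0, 255, 0)", "hsl(120, 100%, 50%)" or "green".
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence.")
options = {
    "compact": True,
    "color": "hsl(120, 100%, 50%)",  # text in green, given as an HSL string
    "bg": "#1d1e23",                 # background as a HEX string
    "font": "Arial",
}
html = displacy.render(doc, style="dep", options=options)
```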
From 6aa6b86d496c8d9271f42c077a79f9bfb88687ac Mon Sep 17 00:00:00 2001
From: Raphael Mitsch
Date: Wed, 1 Mar 2023 16:02:55 +0100
Subject: [PATCH 19/19] Make generation of empty `KnowledgeBase` instances
 configurable in `EntityLinker` (#12320)

* Make empty_kb() configurable.
* Format.
* Update docs.
* Be more specific in KB serialization test.
* Update KB serialization tests. Update docs.
* Remove doc update for batched candidate generation.
* Fix serialization of subclassed KB in tests.
* Format.
* Update docstring.
* Update docstring.
* Switch from pickle to json for custom field serialization.

---
 spacy/ml/models/entity_linker.py           |  8 +++
 spacy/pipeline/entity_linker.py            | 11 +++-
 spacy/tests/serialize/test_serialize_kb.py | 71 +++++++++++++++++++---
 website/docs/api/architectures.mdx         | 10 ++-
 website/docs/api/entitylinker.mdx          | 28 +++++----
 5 files changed, 101 insertions(+), 27 deletions(-)

diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index 299b6bb52..7332ca199 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -89,6 +89,14 @@ def load_kb(
     return kb_from_file


+@registry.misc("spacy.EmptyKB.v2")
+def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]:
+    def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
+        return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)
+
+    return empty_kb_factory
+
+
 @registry.misc("spacy.EmptyKB.v1")
 def empty_kb(
     entity_vector_length: int,
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index a11964117..f2dae0529 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -54,6 +54,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
         "entity_vector_length": 64,
         "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
         "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
+        "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
         "overwrite": True,
         "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
         "use_gold_ents": True,
@@ -80,6 +81,7 @@ def make_entity_linker(
     get_candidates_batch: Callable[
         [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
     ],
+    generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
     overwrite: bool,
     scorer: Optional[Callable],
     use_gold_ents: bool,
@@ -101,6 +103,7 @@ def make_entity_linker(
    get_candidates_batch (
        Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
        ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
+    generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
    scorer (Optional[Callable]): The scoring method.
    use_gold_ents (bool): Whether to copy entities from gold docs or not. If false,
        another component must provide entity annotations.
@@ -135,6 +138,7 @@ def make_entity_linker(
        entity_vector_length=entity_vector_length,
        get_candidates=get_candidates,
        get_candidates_batch=get_candidates_batch,
+        generate_empty_kb=generate_empty_kb,
        overwrite=overwrite,
        scorer=scorer,
        use_gold_ents=use_gold_ents,
@@ -175,6 +179,7 @@ class EntityLinker(TrainablePipe):
        get_candidates_batch: Callable[
            [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
        ],
+        generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
        overwrite: bool = BACKWARD_OVERWRITE,
        scorer: Optional[Callable] = entity_linker_score,
        use_gold_ents: bool,
@@ -198,6 +203,7 @@ class EntityLinker(TrainablePipe):
            Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
            ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
+        generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
        scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
        use_gold_ents (bool): Whether to copy entities from gold docs or not. If false,
            another component must provide entity annotations.
@@ -220,6 +226,7 @@ class EntityLinker(TrainablePipe):
        self.model = model
        self.name = name
        self.labels_discard = list(labels_discard)
+        # how many neighbour sentences to take into account
        self.n_sents = n_sents
        self.incl_prior = incl_prior
        self.incl_context = incl_context
@@ -227,9 +234,7 @@ class EntityLinker(TrainablePipe):
        self.get_candidates_batch = get_candidates_batch
        self.cfg: Dict[str, Any] = {"overwrite": overwrite}
        self.distance = CosineDistance(normalize=False)
-        # how many neighbour sentences to take into account
-        # create an empty KB by default
-        self.kb = empty_kb(entity_vector_length)(self.vocab)
+        self.kb = generate_empty_kb(self.vocab, entity_vector_length)
        self.scorer = scorer
        self.use_gold_ents = use_gold_ents
        self.candidates_batch_size = candidates_batch_size
diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py
index 8d3653ab1..f9d2e226b 100644
--- a/spacy/tests/serialize/test_serialize_kb.py
+++ b/spacy/tests/serialize/test_serialize_kb.py
@@ -1,7 +1,10 @@
-from typing import Callable
+from pathlib import Path
+from typing import Callable, Iterable, Any, Dict

-from spacy import util
-from spacy.util import ensure_path, registry, load_model_from_config
+import srsly
+
+from spacy import util, Errors
+from spacy.util import ensure_path, registry, load_model_from_config, SimpleFrozenList
 from spacy.kb.kb_in_memory import InMemoryLookupKB
 from spacy.vocab import Vocab
 from thinc.api import Config
@@ -91,7 +94,10 @@ def test_serialize_subclassed_kb():

     [components.entity_linker]
     factory = "entity_linker"
-
+
+    [components.entity_linker.generate_empty_kb]
+    @misc = "kb_test.CustomEmptyKB.v1"
+
     [initialize]

     [initialize.components]

     [initialize.components.entity_linker]

     [initialize.components.entity_linker.kb_loader]
-    @misc = "spacy.CustomKB.v1"
+    @misc = "kb_test.CustomKB.v1"
     entity_vector_length = 342
     custom_field = 666
     """
@@ -109,10 +115,57 @@ def test_serialize_subclassed_kb():
             super().__init__(vocab, entity_vector_length)
             self.custom_field = custom_field

-    @registry.misc("spacy.CustomKB.v1")
+        def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
+            """We overwrite InMemoryLookupKB.to_disk() to ensure that self.custom_field is stored as well."""
+            path = ensure_path(path)
+            if not path.exists():
+                path.mkdir(parents=True)
+            if not path.is_dir():
+                raise ValueError(Errors.E928.format(loc=path))
+
+            def serialize_custom_fields(file_path: Path) -> None:
+                srsly.write_json(file_path, {"custom_field": self.custom_field})
+
+            serialize = {
+                "contents": lambda p: self.write_contents(p),
+                "strings.json": lambda p: self.vocab.strings.to_disk(p),
+                "custom_fields": lambda p: serialize_custom_fields(p),
+            }
+            util.to_disk(path, serialize, exclude)
+
+        def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
+            """We overwrite InMemoryLookupKB.from_disk() to ensure that self.custom_field is loaded as well."""
+            path = ensure_path(path)
+            if not path.exists():
+                raise ValueError(Errors.E929.format(loc=path))
+            if not path.is_dir():
+                raise ValueError(Errors.E928.format(loc=path))
+
+            def deserialize_custom_fields(file_path: Path) -> None:
+                self.custom_field = srsly.read_json(file_path)["custom_field"]
+
+            deserialize: Dict[str, Callable[[Any], Any]] = {
+                "contents": lambda p: self.read_contents(p),
+                "strings.json": lambda p: self.vocab.strings.from_disk(p),
+                "custom_fields": lambda p: deserialize_custom_fields(p),
+            }
+            util.from_disk(path, deserialize, exclude)
+
+    @registry.misc("kb_test.CustomEmptyKB.v1")
+    def empty_custom_kb() -> Callable[[Vocab, int], SubInMemoryLookupKB]:
+        def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
+            return SubInMemoryLookupKB(
+                vocab=vocab,
+                entity_vector_length=entity_vector_length,
+                custom_field=0,
+            )
+
+        return empty_kb_factory
+
+    @registry.misc("kb_test.CustomKB.v1")
     def custom_kb(
         entity_vector_length: int, custom_field: int
-    ) -> Callable[[Vocab], InMemoryLookupKB]:
+    ) -> Callable[[Vocab], SubInMemoryLookupKB]:
         def custom_kb_factory(vocab):
             kb = SubInMemoryLookupKB(
                 vocab=vocab,
@@ -139,6 +192,6 @@ def test_serialize_subclassed_kb():
         nlp2 = util.load_model_from_path(tmp_dir)
         entity_linker2 = nlp2.get_pipe("entity_linker")
         # After IO, the KB is the standard one
-        assert type(entity_linker2.kb) == InMemoryLookupKB
+        assert type(entity_linker2.kb) == SubInMemoryLookupKB
         assert entity_linker2.kb.entity_vector_length == 342
-        assert not hasattr(entity_linker2.kb, "custom_field")
+        assert entity_linker2.kb.custom_field == 666
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index 966b5830a..268c04a07 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -899,15 +899,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a
 | `nO`        | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ |
 | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

-### spacy.EmptyKB.v1 {id="EmptyKB"}
+### spacy.EmptyKB.v1 {id="EmptyKB.v1"}

 A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
-instance. This is the default when a new entity linker component is created.
+instance.

 | Name                   | Description |
 | ---------------------- | ----------- |
 | `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ |

+### spacy.EmptyKB.v2 {id="EmptyKB"}
+
+A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
+instance. This is the default when a new entity linker component is created. It
+returns a `Callable[[Vocab, int], InMemoryLookupKB]`.
+
 ### spacy.KBFromFile.v1 {id="KBFromFile"}

 A function that reads an existing `KnowledgeBase` from file.
diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx
index bafb2f2da..d84dd3ca9 100644
--- a/website/docs/api/entitylinker.mdx
+++ b/website/docs/api/entitylinker.mdx
@@ -53,19 +53,21 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("entity_linker", config=config)
 > ```

-| Setting | Description |
-| ------- | ----------- |
-| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
-| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
-| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
-| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
-| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
-| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
-| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
-| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
-| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
-| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
-| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
+| Setting | Description |
+| ------- | ----------- |
+| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
+| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
+| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
+| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
+| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
+| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
+| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
+| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
+| `get_candidates_batch` 3.5 | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
+| `generate_empty_kb` 3.6 | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
+| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
+| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
+| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |

 ```python
 %%GITHUB_SPACY/spacy/pipeline/entity_linker.py
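With `generate_empty_kb` in the config schema, the whole path is user-extensible: register a factory under `@misc`, point the component at it, and the entity linker builds its initial KB through that factory instead of the hard-coded `empty_kb(entity_vector_length)(self.vocab)` call it used before. A hedged usage sketch, modeled on the test added in this patch (the registry name `user.EmptyCustomKB.v1` is an illustrative placeholder, not shipped with spaCy):

```python
# Plug a custom empty-KB factory into the entity linker via the new
# "generate_empty_kb" setting. Requires spaCy >= 3.6 (this patch); the
# registry name "user.EmptyCustomKB.v1" is an illustrative placeholder.
import spacy
from spacy.kb import InMemoryLookupKB
from spacy.util import registry
from spacy.vocab import Vocab

@registry.misc("user.EmptyCustomKB.v1")
def empty_custom_kb():
    # v2-style factory: both arguments arrive at construction time, unlike
    # spacy.EmptyKB.v1, which bakes entity_vector_length in at config time.
    def factory(vocab: Vocab, entity_vector_length: int) -> InMemoryLookupKB:
        # Any KnowledgeBase subclass can be returned here; InMemoryLookupKB
        # mirrors the spacy.EmptyKB.v2 default.
        return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)
    return factory

nlp = spacy.blank("en")
entity_linker = nlp.add_pipe(
    "entity_linker",
    config={"generate_empty_kb": {"@misc": "user.EmptyCustomKB.v1"}},
)
assert entity_linker.kb.is_empty()  # empty until initialized with real entries
```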