Mirror of https://github.com/explosion/spaCy.git (synced 2025-07-30 10:00:04 +03:00)

commit 5a08596f92: Merge branch 'master' into feature/etl
.github/azure-steps.yml (vendored, 44 changed lines)

@@ -52,17 +52,17 @@ steps:
      python -W error -c "import spacy"
    displayName: "Test import"

-  # - script: |
-  #     python -m spacy download ca_core_news_sm
-  #     python -m spacy download ca_core_news_md
-  #     python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-  #   displayName: 'Test download CLI'
-  #   condition: eq(variables['python_version'], '3.8')
-  #
-  # - script: |
-  #     python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-  #   displayName: 'Test no warnings on load (#11713)'
-  #   condition: eq(variables['python_version'], '3.8')
+  - script: |
+      python -m spacy download ca_core_news_sm
+      python -m spacy download ca_core_news_md
+      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+    displayName: 'Test download CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+    displayName: 'Test no warnings on load (#11713)'
+    condition: eq(variables['python_version'], '3.8')

   - script: |
       python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .

@@ -86,17 +86,17 @@ steps:
    displayName: 'Test train CLI'
    condition: eq(variables['python_version'], '3.8')

-  # - script: |
-  #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-  #     PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-  #   displayName: 'Test assemble CLI'
-  #   condition: eq(variables['python_version'], '3.8')
-  #
-  # - script: |
-  #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-  #     python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-  #   displayName: 'Test assemble CLI vectors warning'
-  #   condition: eq(variables['python_version'], '3.8')
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+    displayName: 'Test assemble CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+    displayName: 'Test assemble CLI vectors warning'
+    condition: eq(variables['python_version'], '3.8')

   - script: |
       python -m pip install -U -r requirements.txt
.gitignore (vendored, 10 changed lines)

@@ -10,16 +10,6 @@ spacy/tests/package/setup.cfg
 spacy/tests/package/pyproject.toml
 spacy/tests/package/requirements.txt

-# Website
-website/.cache/
-website/public/
-website/node_modules
-website/.npm
-website/logs
-*.log
-npm-debug.log*
-quickstart-training-generator.js
-
 # Cython / C extensions
 cythonize.json
 spacy/*.html
@@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
 model packaging, deployment and workflow management. spaCy is commercial
 open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).

-💫 **Version 3.4 out now!**
+💫 **Version 3.5 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)

 [](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
@@ -22,7 +22,7 @@ langcodes>=3.2.0,<4.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0
-typing_extensions>=3.7.4.1,<4.2.0; python_version < "3.8"
+typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
 # Development dependencies
 pre-commit>=2.13.0
 cython>=0.25,<3.0
@@ -63,7 +63,7 @@ install_requires =
     # Official Python utilities
     setuptools
     packaging>=20.0
-    typing_extensions>=3.7.4,<4.2.0; python_version < "3.8"
+    typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
     langcodes>=3.2.0,<4.0.0

 [options.entry_points]
@@ -106,9 +106,7 @@ def serve(

     if is_in_jupyter():
         warnings.warn(Warnings.W011)
-    render(
-        docs, style=style, page=page, minify=minify, options=options, manual=manual
-    )
+    render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
     httpd = simple_server.make_server(host, port, app)
     print(f"\nUsing the '{style}' visualizer")
     print(f"Serving on http://{host}:{port} ...\n")
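Note: the hunk above only reflows the `render(...)` call inside `displacy.serve`; behavior is unchanged. For context, a minimal sketch of `serve` usage (assumes `en_core_web_sm` is installed):

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence about Berlin.")
# Outside Jupyter this starts a local web server; inside Jupyter it warns
# (W011, as in the code above) and falls back to rendering inline.
displacy.serve(doc, style="dep")
```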
@@ -25,7 +25,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
     """An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
     to support entity linking of named entities to real-world concepts.

-    DOCS: https://spacy.io/api/kb_in_memory
+    DOCS: https://spacy.io/api/inmemorylookupkb
     """

     def __init__(self, Vocab vocab, entity_vector_length):
@@ -22,7 +22,7 @@ cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int =
         max_edits = fuzzy
     else:
         # allow at least two edits (to allow at least one transposition) and up
-        # to 20% of the pattern string length
+        # to 30% of the pattern string length
         max_edits = max(2, round(0.3 * len(pattern_text)))
     return levenshtein(input_text, pattern_text, max_edits) <= max_edits
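Note: this hunk fixes only the comment; the code already budgeted 30% of the pattern length. A standalone sketch of the budget rule (the helper name is hypothetical, the formula is copied from the line above):

```python
# At least 2 edits (so one transposition is always possible), otherwise
# 30% of the pattern string length, rounded, per levenshtein_compare above.
def default_max_edits(pattern_text: str) -> int:
    return max(2, round(0.3 * len(pattern_text)))

print(default_max_edits("the"))         # 2
print(default_max_edits("definitely"))  # 3 (30% of 10 characters)
```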
@@ -5,8 +5,12 @@ from ..vocab import Vocab
 from ..tokens import Doc, Span

 class Matcher:
-    def __init__(self, vocab: Vocab, validate: bool = ...,
-                 fuzzy_compare: Callable[[str, str, int], bool] = ...) -> None: ...
+    def __init__(
+        self,
+        vocab: Vocab,
+        validate: bool = ...,
+        fuzzy_compare: Callable[[str, str, int], bool] = ...,
+    ) -> None: ...
     def __reduce__(self) -> Any: ...
     def __len__(self) -> int: ...
     def __contains__(self, key: str) -> bool: ...
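Note: the widened stub above documents the `fuzzy_compare` hook on `Matcher`. A sketch of supplying a custom comparator with the `Callable[[str, str, int], bool]` signature; the tightened budget is illustrative, and the import path assumes the `levenshtein` helper lives in `spacy.matcher.levenshtein` alongside `levenshtein_compare` from the previous hunk:

```python
import spacy
from spacy.matcher import Matcher
from spacy.matcher.levenshtein import levenshtein  # assumed import path

def strict_compare(input_text: str, pattern_text: str, fuzzy: int) -> bool:
    # fuzzy >= 0 when FUZZY1..FUZZY9 set an explicit budget; otherwise cap
    # the default budget at 1 edit instead of the 30%-of-length rule.
    max_edits = fuzzy if fuzzy >= 0 else 1
    return levenshtein(input_text, pattern_text, max_edits) <= max_edits

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab, fuzzy_compare=strict_compare)
matcher.add("GREETING", [[{"LOWER": {"FUZZY": "hello"}}]])
print(matcher(nlp("helo there")))  # "helo" is 1 edit away -> still matches
```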
@@ -5,8 +5,8 @@ from itertools import islice
 import numpy as np

 import srsly
-from thinc.api import Config, Model, SequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints1d, Ints2d
+from thinc.api import Config, Model, SequenceCategoricalCrossentropy, NumpyOps
+from thinc.types import Floats2d, Ints2d

 from ._edit_tree_internals.edit_trees import EditTrees
 from ._edit_tree_internals.schemas import validate_edit_tree

@@ -20,6 +20,10 @@ from ..vocab import Vocab
 from .. import util


+# The cutoff value of *top_k* above which an alternative method is used to process guesses.
+TOP_K_GUARDRAIL = 20
+
+
 default_model_config = """
 [model]
 @architectures = "spacy.Tagger.v2"

@@ -115,6 +119,7 @@ class EditTreeLemmatizer(TrainablePipe):

         self.cfg: Dict[str, Any] = {"labels": []}
         self.scorer = scorer
+        self.numpy_ops = NumpyOps()

     def get_loss(
         self, examples: Iterable[Example], scores: List[Floats2d]

@@ -144,6 +149,18 @@ class EditTreeLemmatizer(TrainablePipe):
         return float(loss), d_scores

     def predict(self, docs: Iterable[Doc]) -> List[Ints2d]:
+        if self.top_k == 1:
+            scores2guesses = self._scores2guesses_top_k_equals_1
+        elif self.top_k <= TOP_K_GUARDRAIL:
+            scores2guesses = self._scores2guesses_top_k_greater_1
+        else:
+            scores2guesses = self._scores2guesses_top_k_guardrail
+        # The behaviour of *_scores2guesses_top_k_greater_1()* is efficient for values
+        # of *top_k>1* that are likely to be useful when the edit tree lemmatizer is used
+        # for its principal purpose of lemmatizing tokens. However, the code could also
+        # be used for other purposes, and with very large values of *top_k* the method
+        # becomes inefficient. In such cases, *_scores2guesses_top_k_guardrail()* is used
+        # instead.
         n_docs = len(list(docs))
         if not any(len(doc) for doc in docs):
             # Handle cases where there are no tokens in any docs.

@@ -153,20 +170,52 @@ class EditTreeLemmatizer(TrainablePipe):
             return guesses
         scores = self.model.predict(docs)
         assert len(scores) == n_docs
-        guesses = self._scores2guesses(docs, scores)
+        guesses = scores2guesses(docs, scores)
         assert len(guesses) == n_docs
         return guesses

-    def _scores2guesses(self, docs, scores):
+    def _scores2guesses_top_k_equals_1(self, docs, scores):
         guesses = []
         for doc, doc_scores in zip(docs, scores):
-            if self.top_k == 1:
-                doc_guesses = doc_scores.argmax(axis=1).reshape(-1, 1)
-            else:
-                doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
-
-            if not isinstance(doc_guesses, np.ndarray):
-                doc_guesses = doc_guesses.get()
+            doc_guesses = doc_scores.argmax(axis=1)
+            doc_guesses = self.numpy_ops.asarray(doc_guesses)
+
+            doc_compat_guesses = []
+            for i, token in enumerate(doc):
+                tree_id = self.cfg["labels"][doc_guesses[i]]
+                if self.trees.apply(tree_id, token.text) is not None:
+                    doc_compat_guesses.append(tree_id)
+                else:
+                    doc_compat_guesses.append(-1)
+            guesses.append(np.array(doc_compat_guesses))
+
+        return guesses
+
+    def _scores2guesses_top_k_greater_1(self, docs, scores):
+        guesses = []
+        top_k = min(self.top_k, len(self.labels))
+        for doc, doc_scores in zip(docs, scores):
+            doc_scores = self.numpy_ops.asarray(doc_scores)
+            doc_compat_guesses = []
+            for i, token in enumerate(doc):
+                for _ in range(top_k):
+                    candidate = int(doc_scores[i].argmax())
+                    candidate_tree_id = self.cfg["labels"][candidate]
+                    if self.trees.apply(candidate_tree_id, token.text) is not None:
+                        doc_compat_guesses.append(candidate_tree_id)
+                        break
+                    doc_scores[i, candidate] = np.finfo(np.float32).min
+                else:
+                    doc_compat_guesses.append(-1)
+            guesses.append(np.array(doc_compat_guesses))
+
+        return guesses
+
+    def _scores2guesses_top_k_guardrail(self, docs, scores):
+        guesses = []
+        for doc, doc_scores in zip(docs, scores):
+            doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
+            doc_guesses = self.numpy_ops.asarray(doc_guesses)

             doc_compat_guesses = []
             for token, candidates in zip(doc, doc_guesses):
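Note: `predict` above now dispatches on `top_k`: `1` uses a plain argmax, values up to `TOP_K_GUARDRAIL` (20) use an iterative argmax, and larger values fall back to a full argsort. A sketch of how a user lands on each path via the pipe config, mirroring the parametrized tests later in this diff:

```python
import spacy

nlp = spacy.blank("en")
# top_k == 1  -> _scores2guesses_top_k_equals_1 (plain argmax)
# 2..20       -> _scores2guesses_top_k_greater_1 (iterative argmax)
# > 20        -> _scores2guesses_top_k_guardrail (full argsort)
lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": 5})
```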
@@ -163,15 +163,33 @@ class TokenPatternString(BaseModel):
     IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
     INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
     FUZZY: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy")
-    FUZZY1: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy1")
-    FUZZY2: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy2")
-    FUZZY3: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy3")
-    FUZZY4: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy4")
-    FUZZY5: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy5")
-    FUZZY6: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy6")
-    FUZZY7: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy7")
-    FUZZY8: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy8")
-    FUZZY9: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy9")
+    FUZZY1: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy1"
+    )
+    FUZZY2: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy2"
+    )
+    FUZZY3: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy3"
+    )
+    FUZZY4: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy4"
+    )
+    FUZZY5: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy5"
+    )
+    FUZZY6: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy6"
+    )
+    FUZZY7: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy7"
+    )
+    FUZZY8: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy8"
+    )
+    FUZZY9: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy9"
+    )

     class Config:
         extra = "forbid"
@@ -101,14 +101,15 @@ def test_initialize_from_labels():
     }


-def test_no_data():
+@pytest.mark.parametrize("top_k", (1, 5, 30))
+def test_no_data(top_k):
     # Test that the lemmatizer provides a nice error when there's no tagging data / labels
     TEXTCAT_DATA = [
         ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
         ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
     ]
     nlp = English()
-    nlp.add_pipe("trainable_lemmatizer")
+    nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
     nlp.add_pipe("textcat")

     train_examples = []

@@ -119,10 +120,11 @@ def test_no_data():
         nlp.initialize(get_examples=lambda: train_examples)


-def test_incomplete_data():
+@pytest.mark.parametrize("top_k", (1, 5, 30))
+def test_incomplete_data(top_k):
     # Test that the lemmatizer works with incomplete information
     nlp = English()
-    lemmatizer = nlp.add_pipe("trainable_lemmatizer")
+    lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
     lemmatizer.min_tree_freq = 1
     train_examples = []
     for t in PARTIAL_DATA:

@@ -154,9 +156,10 @@ def test_incomplete_data():
     assert xp.count_nonzero(dX[1][1]) == 0


-def test_overfitting_IO():
+@pytest.mark.parametrize("top_k", (1, 5, 30))
+def test_overfitting_IO(top_k):
     nlp = English()
-    lemmatizer = nlp.add_pipe("trainable_lemmatizer")
+    lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
     lemmatizer.min_tree_freq = 1
     train_examples = []
     for t in TRAIN_DATA:

@@ -189,7 +192,7 @@ def test_overfitting_IO():
     # Check model after a {to,from}_bytes roundtrip
     nlp_bytes = nlp.to_bytes()
     nlp3 = English()
-    nlp3.add_pipe("trainable_lemmatizer")
+    nlp3.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
     nlp3.from_bytes(nlp_bytes)
     doc3 = nlp3(test_text)
     assert doc3[0].lemma_ == "she"
@@ -618,7 +618,6 @@ def test_string_to_list_intify(value):
     assert string_to_list(value, intify=True) == [1, 2, 3]


-@pytest.mark.skip(reason="Temporarily skip for dev version")
 def test_download_compatibility():
     spec = SpecifierSet("==" + about.__version__)
     spec.prereleases = False

@@ -629,7 +628,6 @@ def test_download_compatibility():
     assert get_minor_version(about.__version__) == get_minor_version(version)


-@pytest.mark.skip(reason="Temporarily skip for dev version")
 def test_validate_compatibility_table():
     spec = SpecifierSet("==" + about.__version__)
     spec.prereleases = False
website/.dockerignore (new file, 9 lines)

@@ -0,0 +1,9 @@
+.cache/
+.next/
+public/
+node_modules
+.npm
+logs
+*.log
+npm-debug.log*
+quickstart-training-generator.js
website/.gitignore (vendored, 4 changed lines)

@@ -1,5 +1,7 @@
 # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.

+quickstart-training-generator.js
+
 # dependencies
 /node_modules
 /.pnp

@@ -41,4 +43,4 @@ next-env.d.ts
 public/robots.txt
 public/sitemap*
 public/sw.js*
-public/workbox*
\ No newline at end of file
+public/workbox*
@@ -1,16 +1,14 @@
-FROM node:11.15.0
+FROM node:18

-WORKDIR /spacy-io
-
-RUN npm install -g gatsby-cli@2.7.4
-
-COPY package.json .
-COPY package-lock.json .
-
-RUN npm install
+USER node

 # This is so the installed node_modules will be up one directory
 # from where a user mounts files, so that they don't accidentally mount
 # their own node_modules from a different build
 # https://nodejs.org/api/modules.html#modules_loading_from_node_modules_folders
-WORKDIR /spacy-io/website/
+WORKDIR /home/node
+COPY --chown=node package.json .
+COPY --chown=node package-lock.json .
+RUN npm install
+
+WORKDIR /home/node/website/
@@ -41,33 +41,27 @@ If you'd like to do this, **be sure you do _not_ include your local
 `node_modules` folder**, since there are some dependencies that need to be built
 for the image system. Rename it before using.

-```bash
-docker run -it \
-  -v $(pwd):/spacy-io/website \
-  -p 8000:8000 \
-  ghcr.io/explosion/spacy-io \
-  gatsby develop -H 0.0.0.0
-```
-
-This will allow you to access the built website at http://0.0.0.0:8000/ in your
-browser, and still edit code in your editor while having the site reflect those
-changes.
-
-**Note**: If you're working on a Mac with an M1 processor, you might see
-segfault errors from `qemu` if you use the default image. To fix this use the
-`arm64` tagged image in the `docker run` command
-(ghcr.io/explosion/spacy-io:arm64).
-
-### Building the Docker image
-
-If you'd like to build the image locally, you can do so like this:
+First build the Docker image. This only needs to be done on the first run
+or when changes are made to `Dockerfile` or the website dependencies:

 ```bash
 docker build -t spacy-io .
 ```

-This will take some time, so if you want to use the prebuilt image you'll save a
-bit of time.
+You can then build and run the website with:
+
+```bash
+docker run -it \
+  --rm \
+  -v $(pwd):/home/node/website \
+  -p 3000:3000 \
+  spacy-io \
+  npm run dev -- -H 0.0.0.0
+```
+
+This will allow you to access the built website at http://0.0.0.0:3000/ in your
+browser, and still edit code in your editor while having the site reflect those
+changes.

 ## Project structure
@@ -1215,19 +1215,19 @@ When a directory is provided it is traversed recursively to collect all files.
 $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
 ```

 | Name | Description |
-| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ |
 | `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ |
 | `output-file`, `-o` | Output `DocBin` path. ~~str (positional)~~ |
-| `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
 | `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ |
 | `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ |
 | `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
 | `--batch-size`, `-b` | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~ |
 | `--n-process`, `-n` | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~ |
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
 | **CREATES** | A `DocBin` with the annotations from the `model` for all the files found in `data-path`. |

 ## find-threshold {id="find-threshold",version="3.5",tag="command"}
@@ -15,7 +15,7 @@ world". It requires a `KnowledgeBase`, as well as a function to generate
 plausible candidates from that `KnowledgeBase` given a certain textual mention,
 and a machine learning model to pick the right candidate, given the local
 context of the mention. `EntityLinker` defaults to using the
-[`InMemoryLookupKB`](/api/kb_in_memory) implementation.
+[`InMemoryLookupKB`](/api/inmemorylookupkb) implementation.

 ## Assigned Attributes {id="assigned-attributes"}
@@ -43,7 +43,7 @@ The length of the fixed-size entity vectors in the knowledge base.

 Add an entity to the knowledge base, specifying its corpus frequency and entity
 vector, which should be of length
-[`entity_vector_length`](/api/kb_in_memory#entity_vector_length).
+[`entity_vector_length`](/api/inmemorylookupkb#entity_vector_length).

 > #### Example
 >

@@ -79,8 +79,9 @@ frequency and entity vector for each entity.

 Add an alias or mention to the knowledge base, specifying its potential KB
 identifiers and their prior probabilities. The entity identifiers should refer
-to entities previously added with [`add_entity`](/api/kb_in_memory#add_entity)
-or [`set_entities`](/api/kb_in_memory#set_entities). The sum of the prior
+to entities previously added with
+[`add_entity`](/api/inmemorylookupkb#add_entity) or
+[`set_entities`](/api/inmemorylookupkb#set_entities). The sum of the prior
 probabilities should not exceed 1. Note that an empty string can not be used as
 alias.

@@ -156,7 +157,7 @@ Get a list of all aliases in the knowledge base.

 Given a certain textual mention as input, retrieve a list of candidate entities
 of type [`Candidate`](/api/kb#candidate). Wraps
-[`get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
+[`get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).

 > #### Example
 >

@@ -174,7 +175,7 @@ of type [`Candidate`](/api/kb#candidate). Wraps

 ## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"}

-Same as [`get_candidates()`](/api/kb_in_memory#get_candidates), but for an
+Same as [`get_candidates()`](/api/inmemorylookupkb#get_candidates), but for an
 arbitrary number of mentions. The [`EntityLinker`](/api/entitylinker) component
 will call `get_candidates_batch()` instead of `get_candidates()`, if the config
 parameter `candidates_batch_size` is greater or equal than 1.

@@ -231,7 +232,7 @@ Given a certain entity ID, retrieve its pretrained entity vector.

 ## InMemoryLookupKB.get_vectors {id="get_vectors",tag="method"}

-Same as [`get_vector()`](/api/kb_in_memory#get_vector), but for an arbitrary
+Same as [`get_vector()`](/api/inmemorylookupkb#get_vector), but for an arbitrary
 number of entity IDs.

 The default implementation of `get_vectors()` executes `get_vector()` in a loop.
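Note: the renamed pages above document `add_entity`, `add_alias` and `get_alias_candidates` separately; a combined minimal sketch of building an `InMemoryLookupKB` (the entity IDs, frequencies and vectors here are made up):

```python
import spacy
from spacy.kb import InMemoryLookupKB

nlp = spacy.blank("en")
kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=3)

# Entity vectors must have length entity_vector_length
kb.add_entity(entity="Q1004791", freq=6, entity_vector=[0.0, 3.0, 5.0])
kb.add_entity(entity="Q42", freq=342, entity_vector=[1.0, 9.0, -3.0])

# Prior probabilities for one alias must sum to at most 1
kb.add_alias(alias="Douglas", entities=["Q1004791", "Q42"], probabilities=[0.6, 0.3])

print([c.entity_ for c in kb.get_alias_candidates("Douglas")])
```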
@@ -21,8 +21,8 @@ functions called by the [`EntityLinker`](/api/entitylinker) component.
 <Infobox variant="warning">

 This class was not abstract up to spaCy version 3.5. The `KnowledgeBase`
-implementation up to that point is available as `InMemoryLookupKB` from 3.5
-onwards.
+implementation up to that point is available as
+[`InMemoryLookupKB`](/api/inmemorylookupkb) from 3.5 onwards.

 </Infobox>

@@ -110,14 +110,15 @@ to you.
 </Infobox>

 From spaCy 3.5 on `KnowledgeBase` is an abstract class (with
-[`InMemoryLookupKB`](/api/kb_in_memory) being a drop-in replacement) to allow
-more flexibility in customizing knowledge bases. Some of its methods were moved
-to [`InMemoryLookupKB`](/api/kb_in_memory) during this refactoring, one of those
-being `get_alias_candidates()`. This method is now available as
-[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
-Note: [`InMemoryLookupKB.get_candidates()`](/api/kb_in_memory#get_candidates)
+[`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to
+allow more flexibility in customizing knowledge bases. Some of its methods were
+moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring,
+one of those being `get_alias_candidates()`. This method is now available as
+[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
+Note:
+[`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates)
 defaults to
-[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
+[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).

 ## KnowledgeBase.get_vector {id="get_vector",tag="method"}
@@ -79,7 +79,7 @@ operates on a `Doc` and gives you access to the matched tokens **in context**.
 | ------------------------------------------------ | -------------------------------------------------------------------------------------------------- |
 | [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. |
 | [`KnowledgeBase`](/api/kb) | Abstract base class for storage and retrieval of data for entity linking. |
-| [`InMemoryLookupKB`](/api/kb_in_memory) | Implementation of `KnowledgeBase` storing all data in memory. |
+| [`InMemoryLookupKB`](/api/inmemorylookupkb) | Implementation of `KnowledgeBase` storing all data in memory. |
 | [`Candidate`](/api/kb#candidate) | Object associating a textual mention with a specific entity contained in a `KnowledgeBase`. |
 | [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. |
 | [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis. |
@@ -384,14 +384,14 @@ the more specific attributes `FUZZY1`..`FUZZY9` you can specify the maximum
 allowed edit distance directly.

 ```python
-# Match lowercase with fuzzy matching (allows 2 edits)
+# Match lowercase with fuzzy matching (allows 3 edits)
 pattern = [{"LOWER": {"FUZZY": "definitely"}}]

-# Match custom attribute values with fuzzy matching (allows 2 edits)
+# Match custom attribute values with fuzzy matching (allows 3 edits)
 pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}]

-# Match with exact Levenshtein edit distance limits (allows 3 edits)
-pattern = [{"_": {"country": {"FUZZY3": "Kyrgyzstan"}}}]
+# Match with exact Levenshtein edit distance limits (allows 4 edits)
+pattern = [{"_": {"country": {"FUZZY4": "Kyrgyzstan"}}}]
 ```

 #### Regex and fuzzy matching with lists {id="regex-fuzzy-lists", version="3.5"}
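Note: the comment fixes above restate the default budget of `max(2, round(0.3 * len(pattern)))` edits. An end-to-end sketch of the first pattern in use:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# "definitely" has 10 characters, so the default budget is max(2, 3) = 3 edits
matcher.add("DEFINITELY", [[{"LOWER": {"FUZZY": "definitely"}}]])

doc = nlp("I will definately be there")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)  # "definately" (one substitution) matches
```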
@@ -684,10 +684,15 @@ If your pipeline includes
 [custom components](/usage/processing-pipelines#custom-components), model
 architectures or other [code](/usage/training#custom-code), those functions need
 to be registered **before** your pipeline is loaded. Otherwise, spaCy won't know
-how to create the objects referenced in the config. The
-[`spacy package`](/api/cli#package) command lets you provide one or more paths
-to Python files containing custom registered functions using the `--code`
-argument.
+how to create the objects referenced in the config. If you're loading your own
+pipeline in Python, you can make custom components available just by importing
+the code that defines them before calling
+[`spacy.load`](/api/top-level#spacy.load). This is also how the `--code`
+argument to CLI commands works.
+
+With the [`spacy package`](/api/cli#package) command, you can provide one or
+more paths to Python files containing custom registered functions using the
+`--code` argument.

 > #### \_\_init\_\_.py (excerpt)
 >
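Note: per the rewritten paragraph above, importing the module that defines your registered functions before `spacy.load` is enough. A minimal sketch (the module, component and pipeline names are hypothetical):

```python
# my_components.py -- defines and registers a custom component
from spacy.language import Language

@Language.component("debug_lengths")
def debug_lengths(doc):
    print(len(doc))
    return doc
```

```python
# load_pipeline.py -- the import runs the registration before loading
import spacy
import my_components  # noqa: F401

nlp = spacy.load("./my_custom_pipeline")  # config may reference "debug_lengths"
```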
website/docs/usage/v3-5.mdx (new file, 215 lines)

@@ -0,0 +1,215 @@
+---
+title: What's New in v3.5
+teaser: New features and how to upgrade
+menu:
+  - ['New Features', 'features']
+  - ['Upgrading Notes', 'upgrading']
+---
+
+## New features {id="features",hidden="true"}
+
+spaCy v3.5 introduces three new CLI commands, `apply`, `benchmark` and
+`find-threshold`, adds fuzzy matching, provides improvements to our entity
+linking functionality, and includes a range of language updates and bug fixes.
+
+### New CLI commands {id="cli"}
+
+#### apply CLI
+
+The [`apply` CLI](/api/cli#apply) can be used to apply a pipeline to one or more
+`.txt`, `.jsonl` or `.spacy` input files, saving the annotated docs in a single
+`.spacy` file.
+
+```bash
+$ spacy apply en_core_web_sm my_texts/ output.spacy
+```
+
+#### benchmark CLI
+
+The [`benchmark` CLI](/api/cli#benchmark) has been added to extend the existing
+`evaluate` functionality with a wider range of profiling subcommands.
+
+The `benchmark accuracy` CLI is introduced as an alias for `evaluate`. The new
+`benchmark speed` CLI performs warmup rounds before measuring the speed in words
+per second on batches of randomly shuffled documents from the provided data.
+
+```bash
+$ spacy benchmark speed my_pipeline data.spacy
+```
+
+The output is the mean performance using batches (`nlp.pipe`) with a 95%
+confidence interval, e.g., profiling `en_core_web_sm` on CPU:
+
+```none
+Outliers: 2.0%, extreme outliers: 0.0%
+Mean: 18904.1 words/s (95% CI: -256.9 +244.1)
+```
+
+#### find-threshold CLI
+
+The [`find-threshold` CLI](/api/cli#find-threshold) runs a series of trials
+across threshold values from `0.0` to `1.0` and identifies the best threshold
+for the provided score metric.
+
+The following command runs 20 trials for the `spancat` component in
+`my_pipeline`, recording the `spans_sc_f` score for each value of the threshold
+`[components.spancat.threshold]` from `0.0` to `1.0`:
+
+```bash
+$ spacy find-threshold my_pipeline data.spacy spancat threshold spans_sc_f --n_trials 20
+```
+
+The `find-threshold` CLI can be used with `textcat_multilabel`, `spancat` and
+custom components with thresholds that are applied while predicting or scoring.
+
+### Fuzzy matching {id="fuzzy"}
+
+New `FUZZY` operators support [fuzzy matching](/usage/rule-based-matching#fuzzy)
+with the `Matcher`. By default, the `FUZZY` operator allows a Levenshtein edit
+distance of 2 and up to 30% of the pattern string length. `FUZZY1`..`FUZZY9` can
+be used to specify the exact number of allowed edits.
+
+```python
+# Match lowercase with fuzzy matching (allows up to 3 edits)
+pattern = [{"LOWER": {"FUZZY": "definitely"}}]
+
+# Match custom attribute values with fuzzy matching (allows up to 3 edits)
+pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}]
+
+# Match with exact Levenshtein edit distance limits (allows up to 4 edits)
+pattern = [{"_": {"country": {"FUZZY4": "Kyrgyzstan"}}}]
+```
+
+Note that `FUZZY` uses Levenshtein edit distance rather than Damerau-Levenshtein
+edit distance, so a transposition like `teh` for `the` counts as two edits, one
+insertion and one deletion.
+
+If you'd prefer an alternate fuzzy matching algorithm, you can provide your own
+custom method to the `Matcher` or as a config option for an entity ruler and
+span ruler.
+
+### FUZZY and REGEX with lists {id="fuzzy-regex-lists"}
+
+The `FUZZY` and `REGEX` operators are also now supported for lists with `IN` and
+`NOT_IN`:
+
+```python
+pattern = [{"TEXT": {"FUZZY": {"IN": ["awesome", "cool", "wonderful"]}}}]
+pattern = [{"TEXT": {"REGEX": {"NOT_IN": ["^awe(some)?$", "^wonder(ful)?"]}}}]
+```
+
+### Entity linking generalization {id="el"}
+
+The knowledge base used for entity linking is now easier to customize and has a
+new default implementation [`InMemoryLookupKB`](/api/inmemorylookupkb).
+
+### Additional features and improvements {id="additional-features-and-improvements"}
+
+- Language updates:
+  - Extended support for Slovenian
+  - Fixed lookup fallback for French and Catalan lemmatizers
+  - Switch Russian and Ukrainian lemmatizers to `pymorphy3`
+  - Support for editorial punctuation in Ancient Greek
+  - Update to Russian tokenizer exceptions
+  - Small fix for Dutch stop words
+- Allow up to `typer` v0.7.x, `mypy` 0.990 and `typing_extensions` v4.4.x.
+- New `spacy.ConsoleLogger.v3` with expanded progress
+  [tracking](/api/top-level#ConsoleLogger).
+- Improved scoring behavior for `textcat` with `spacy.textcat_scorer.v2` and
+  `spacy.textcat_multilabel_scorer.v2`.
+- Updates so that downstream components can train properly on a frozen `tok2vec`
+  or `transformer` layer.
+- Allow interpolation of variables in directory names in projects.
+- Support for local file system [remotes](/usage/projects#remote) for projects.
+- Improve UX around `displacy.serve` when the default port is in use.
+- Optional `before_update` callback that is invoked at the start of each
+  [training step](/api/data-formats#config-training).
+- Improve performance of `SpanGroup` and fix typing issues for `SpanGroup` and
+  `Span` objects.
+- Patch a
+  [security vulnerability](https://github.com/advisories/GHSA-gw9q-c7gh-j9vm) in
+  extracting tar files.
+- Add equality definition for `Vectors`.
+- Ensure `Vocab.to_disk` respects the exclude setting for `lookups` and
+  `vectors`.
+- Correctly handle missing annotations in the edit tree lemmatizer.
+
+### Trained pipeline updates {id="pipelines"}
+
+- The CNN pipelines add `IS_SPACE` as a `tok2vec` feature for `tagger` and
+  `morphologizer` components to improve tagging of non-whitespace vs. whitespace
+  tokens.
+- The transformer pipelines require `spacy-transformers` v1.2, which uses the
+  exact alignment from `tokenizers` for fast tokenizers instead of the heuristic
+  alignment from `spacy-alignments`. For all trained pipelines except
+  `ja_core_news_trf`, the alignments between spaCy tokens and transformer tokens
+  may be slightly different. More details about the `spacy-transformers` changes
+  in the
+  [v1.2.0 release notes](https://github.com/explosion/spacy-transformers/releases/tag/v1.2.0).
+
+## Notes about upgrading from v3.4 {id="upgrading"}
+
+### Validation of textcat values {id="textcat-validation"}
+
+An error is now raised when unsupported values are given as input to train a
+`textcat` or `textcat_multilabel` model - ensure that values are `0.0` or `1.0`
+as explained in the [docs](/api/textcategorizer#assigned-attributes).
+
+### Updated scorers for tokenization and textcat {id="scores"}
+
+We fixed a bug that inflated the `token_acc` scores in v3.0-v3.4. The reported
+`token_acc` will drop from v3.4 to v3.5, but if `token_p/r/f` stay the same,
+your tokenization performance has not changed from v3.4.
+
+For new `textcat` or `textcat_multilabel` configs, the new default `v2` scorers:
+
+- ignore `threshold` for `textcat`, so the reported `cats_p/r/f` may increase
+  slightly in v3.5 even though the underlying predictions are unchanged
+- report the performance of only the **final** `textcat` or `textcat_multilabel`
+  component in the pipeline by default
+- allow custom scorers to be used to score multiple `textcat` and
+  `textcat_multilabel` components with `Scorer.score_cats` by restricting the
+  evaluation to the component's provided labels
+
+### Pipeline package version compatibility {id="version-compat"}
+
+> #### Using legacy implementations
+>
+> In spaCy v3, you'll still be able to load and reference legacy implementations
+> via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the
+> components or architectures change and newer versions are available in the
+> core library.
+
+When you're loading a pipeline package trained with an earlier version of spaCy
+v3, you will see a warning telling you that the pipeline may be incompatible.
+This doesn't necessarily have to be true, but we recommend running your
+pipelines against your test suite or evaluation data to make sure there are no
+unexpected results.
+
+If you're using one of the [trained pipelines](/models) we provide, you should
+run [`spacy download`](/api/cli#download) to update to the latest version. To
+see an overview of all installed packages and their compatibility, you can run
+[`spacy validate`](/api/cli#validate).
+
+If you've trained your own custom pipeline and you've confirmed that it's still
+working as expected, you can update the spaCy version requirements in the
+[`meta.json`](/api/data-formats#meta):
+
+```diff
+- "spacy_version": ">=3.4.0,<3.5.0",
++ "spacy_version": ">=3.4.0,<3.6.0",
+```
+
+### Updating v3.4 configs
+
+To update a config from spaCy v3.4 with the new v3.5 settings, run
+[`init fill-config`](/api/cli#init-fill-config):
+
+```cli
+$ python -m spacy init fill-config config-v3.4.cfg config-v3.5.cfg
+```
+
+In many cases ([`spacy train`](/api/cli#train),
+[`spacy.load`](/api/top-level#spacy.load)), the new defaults will be filled in
+automatically, but you'll need to fill in the new settings to run
+[`debug config`](/api/cli#debug) and [`debug data`](/api/cli#debug-data).
@@ -13,7 +13,8 @@
                 { "text": "New in v3.1", "url": "/usage/v3-1" },
                 { "text": "New in v3.2", "url": "/usage/v3-2" },
                 { "text": "New in v3.3", "url": "/usage/v3-3" },
-                { "text": "New in v3.4", "url": "/usage/v3-4" }
+                { "text": "New in v3.4", "url": "/usage/v3-4" },
+                { "text": "New in v3.5", "url": "/usage/v3-5" }
             ]
         },
         {

@@ -129,6 +130,7 @@
             "items": [
                 { "text": "Attributes", "url": "/api/attributes" },
                 { "text": "Corpus", "url": "/api/corpus" },
+                { "text": "InMemoryLookupKB", "url": "/api/inmemorylookupkb" },
                 { "text": "KnowledgeBase", "url": "/api/kb" },
                 { "text": "Lookups", "url": "/api/lookups" },
                 { "text": "MorphAnalysis", "url": "/api/morphology#morphanalysis" },
@@ -89,7 +89,7 @@ const Landing = () => {
                 </LandingCard>

                 <LandingCard title="Awesome ecosystem" url="/usage/projects" button="Read more">
-                    In the five years since its release, spaCy has become an industry standard with
+                    Since its release in 2015, spaCy has become an industry standard with
                     a huge ecosystem. Choose from a variety of plugins, integrate with your machine
                     learning stack and build custom components and workflows.
                 </LandingCard>
@@ -9,6 +9,8 @@ import socialImageLegacy from '../images/social_legacy.jpg'
 import siteMetadata from '../../meta/site.json'
 import Head from 'next/head'

+import { siteUrl } from '../../meta/dynamicMeta.mjs'
+
 function getPageTitle(title, sitename, slogan, sectionTitle, nightly, legacy) {
     if (sectionTitle && title) {
         const suffix = nightly ? ' (nightly)' : legacy ? ' (legacy)' : ''

@@ -25,7 +27,7 @@ function getImage(section, nightly, legacy) {
     if (legacy) return socialImageLegacy
     if (section === 'api') return socialImageApi
     if (section === 'universe') return socialImageUniverse
-    return socialImageDefault
+    return `${siteUrl}${socialImageDefault.src}`
 }

 export default function SEO({

@@ -46,7 +48,7 @@ export default function SEO({
         nightly,
         legacy
     )
-    const socialImage = getImage(section, nightly, legacy).src
+    const socialImage = getImage(section, nightly, legacy)
     const meta = [
         {
             name: 'description',
@@ -20,6 +20,10 @@
     display: inline-block
     margin-bottom: var(--spacing-sm)

+    .ol, .ul
+        margin-top: var(--spacing-xs)
+        margin-bottom: var(--spacing-xs)
+
     &:before
         content: '\25CF'
         position: relative
@@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => {
 }

 const navAlert = (
-    <Link to="/usage/v3-4" hidden>
-        <strong>💥 Out now:</strong> spaCy v3.4
+    <Link to="/usage/v3-5" hidden>
+        <strong>💥 Out now:</strong> spaCy v3.5
     </Link>
 )