Merge branch 'master' into feature/etl

2025-07-30 10:00:04 +03:00 · 2023-02-01 17:03:29 +01:00 · 2023-02-01 17:03:29 +01:00 · 5a08596f92
commit 5a08596f92
parent 994304f1e8 f9e020dd67
30 changed files with 440 additions and 147 deletions
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@ -52,17 +52,17 @@ steps:
      python -W error -c "import spacy"
    displayName: "Test import"

-#  - script: |
-#      python -m spacy download ca_core_news_sm
-#      python -m spacy download ca_core_news_md
-#      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-#    displayName: 'Test download CLI'
-#    condition: eq(variables['python_version'], '3.8')
-#
-#  - script: |
-#      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-#    displayName: 'Test no warnings on load (#11713)'
-#    condition: eq(variables['python_version'], '3.8')
+  - script: |
+      python -m spacy download ca_core_news_sm
+      python -m spacy download ca_core_news_md
+      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+    displayName: 'Test download CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+    displayName: 'Test no warnings on load (#11713)'
+    condition: eq(variables['python_version'], '3.8')

  - script: |
      python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
@ -86,17 +86,17 @@ steps:
    displayName: 'Test train CLI'
    condition: eq(variables['python_version'], '3.8')

-#  - script: |
-#      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-#      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-#    displayName: 'Test assemble CLI'
-#    condition: eq(variables['python_version'], '3.8')
-#
-#  - script: |
-#      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-#      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-#    displayName: 'Test assemble CLI vectors warning'
-#    condition: eq(variables['python_version'], '3.8')
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+    displayName: 'Test assemble CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+    displayName: 'Test assemble CLI vectors warning'
+    condition: eq(variables['python_version'], '3.8')

  - script: |
      python -m pip install -U -r requirements.txt
--- a/.gitignore
+++ b/.gitignore
@ -10,16 +10,6 @@ spacy/tests/package/setup.cfg
 spacy/tests/package/pyproject.toml
 spacy/tests/package/requirements.txt

-# Website
-website/.cache/
-website/public/
-website/node_modules
-website/.npm
-website/logs
-*.log
-npm-debug.log*
-quickstart-training-generator.js
-
 # Cython / C extensions
 cythonize.json
 spacy/*.html
--- a/README.md
+++ b/README.md
@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
 model packaging, deployment and workflow management. spaCy is commercial
 open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).

-💫 **Version 3.4 out now!**
+💫 **Version 3.5 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)

 [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
--- a/requirements.txt
+++ b/requirements.txt
@ -22,7 +22,7 @@ langcodes>=3.2.0,<4.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0
-typing_extensions>=3.7.4.1,<4.2.0; python_version < "3.8"
+typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
 # Development dependencies
 pre-commit>=2.13.0
 cython>=0.25,<3.0
--- a/setup.cfg
+++ b/setup.cfg
@ -63,7 +63,7 @@ install_requires =
    # Official Python utilities
    setuptools
    packaging>=20.0
-    typing_extensions>=3.7.4,<4.2.0; python_version < "3.8"
+    typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
    langcodes>=3.2.0,<4.0.0

 [options.entry_points]
--- a/spacy/displacy/init.py
+++ b/spacy/displacy/init.py
@ -106,9 +106,7 @@ def serve(

    if is_in_jupyter():
        warnings.warn(Warnings.W011)
-    render(
-        docs, style=style, page=page, minify=minify, options=options, manual=manual
-    )
+    render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
    httpd = simple_server.make_server(host, port, app)
    print(f"\nUsing the '{style}' visualizer")
    print(f"Serving on http://{host}:{port} ...\n")
--- a/spacy/kb/kb_in_memory.pyx
+++ b/spacy/kb/kb_in_memory.pyx
@ -25,7 +25,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
    """An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
    to support entity linking of named entities to real-world concepts.

-    DOCS: https://spacy.io/api/kb_in_memory
+    DOCS: https://spacy.io/api/inmemorylookupkb
    """

    def __init__(self, Vocab vocab, entity_vector_length):
--- a/spacy/matcher/levenshtein.pyx
+++ b/spacy/matcher/levenshtein.pyx
@ -22,7 +22,7 @@ cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int =
        max_edits = fuzzy
    else:
        # allow at least two edits (to allow at least one transposition) and up
-        # to 20% of the pattern string length
+        # to 30% of the pattern string length
        max_edits = max(2, round(0.3 * len(pattern_text)))
    return levenshtein(input_text, pattern_text, max_edits) <= max_edits

--- a/spacy/matcher/matcher.pyi
+++ b/spacy/matcher/matcher.pyi
@ -5,8 +5,12 @@ from ..vocab import Vocab
 from ..tokens import Doc, Span

 class Matcher:
-    def __init__(self, vocab: Vocab, validate: bool = ...,
-                 fuzzy_compare: Callable[[str, str, int], bool] = ...) -> None: ...
+    def __init__(
+        self,
+        vocab: Vocab,
+        validate: bool = ...,
+        fuzzy_compare: Callable[[str, str, int], bool] = ...,
+    ) -> None: ...
    def __reduce__(self) -> Any: ...
    def __len__(self) -> int: ...
    def __contains__(self, key: str) -> bool: ...
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@ -5,8 +5,8 @@ from itertools import islice
 import numpy as np

 import srsly
-from thinc.api import Config, Model, SequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints1d, Ints2d
+from thinc.api import Config, Model, SequenceCategoricalCrossentropy, NumpyOps
+from thinc.types import Floats2d, Ints2d

 from ._edit_tree_internals.edit_trees import EditTrees
 from ._edit_tree_internals.schemas import validate_edit_tree
@ -20,6 +20,10 @@ from ..vocab import Vocab
 from .. import util


+# The cutoff value of *top_k* above which an alternative method is used to process guesses.
+TOP_K_GUARDRAIL = 20
+
+
 default_model_config = """
 [model]
@architectures = "spacy.Tagger.v2"
@ -115,6 +119,7 @@ class EditTreeLemmatizer(TrainablePipe):

        self.cfg: Dict[str, Any] = {"labels": []}
        self.scorer = scorer
+        self.numpy_ops = NumpyOps()

    def get_loss(
        self, examples: Iterable[Example], scores: List[Floats2d]
@ -144,6 +149,18 @@ class EditTreeLemmatizer(TrainablePipe):
        return float(loss), d_scores

    def predict(self, docs: Iterable[Doc]) -> List[Ints2d]:
+        if self.top_k == 1:
+            scores2guesses = self._scores2guesses_top_k_equals_1
+        elif self.top_k <= TOP_K_GUARDRAIL:
+            scores2guesses = self._scores2guesses_top_k_greater_1
+        else:
+            scores2guesses = self._scores2guesses_top_k_guardrail
+        # The behaviour of *_scores2guesses_top_k_greater_1()* is efficient for values
+        # of *top_k>1* that are likely to be useful when the edit tree lemmatizer is used
+        # for its principal purpose of lemmatizing tokens. However, the code could also
+        # be used for other purposes, and with very large values of *top_k* the method
+        # becomes inefficient. In such cases, *_scores2guesses_top_k_guardrail()* is used
+        # instead.
        n_docs = len(list(docs))
        if not any(len(doc) for doc in docs):
            # Handle cases where there are no tokens in any docs.
@ -153,20 +170,52 @@ class EditTreeLemmatizer(TrainablePipe):
            return guesses
        scores = self.model.predict(docs)
        assert len(scores) == n_docs
-        guesses = self._scores2guesses(docs, scores)
+        guesses = scores2guesses(docs, scores)
        assert len(guesses) == n_docs
        return guesses

-    def _scores2guesses(self, docs, scores):
+    def _scores2guesses_top_k_equals_1(self, docs, scores):
        guesses = []
        for doc, doc_scores in zip(docs, scores):
-            if self.top_k == 1:
-                doc_guesses = doc_scores.argmax(axis=1).reshape(-1, 1)
-            else:
-                doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
+            doc_guesses = doc_scores.argmax(axis=1)
+            doc_guesses = self.numpy_ops.asarray(doc_guesses)

-            if not isinstance(doc_guesses, np.ndarray):
-                doc_guesses = doc_guesses.get()
+            doc_compat_guesses = []
+            for i, token in enumerate(doc):
+                tree_id = self.cfg["labels"][doc_guesses[i]]
+                if self.trees.apply(tree_id, token.text) is not None:
+                    doc_compat_guesses.append(tree_id)
+                else:
+                    doc_compat_guesses.append(-1)
+            guesses.append(np.array(doc_compat_guesses))
+
+        return guesses
+
+    def _scores2guesses_top_k_greater_1(self, docs, scores):
+        guesses = []
+        top_k = min(self.top_k, len(self.labels))
+        for doc, doc_scores in zip(docs, scores):
+            doc_scores = self.numpy_ops.asarray(doc_scores)
+            doc_compat_guesses = []
+            for i, token in enumerate(doc):
+                for _ in range(top_k):
+                    candidate = int(doc_scores[i].argmax())
+                    candidate_tree_id = self.cfg["labels"][candidate]
+                    if self.trees.apply(candidate_tree_id, token.text) is not None:
+                        doc_compat_guesses.append(candidate_tree_id)
+                        break
+                    doc_scores[i, candidate] = np.finfo(np.float32).min
+                else:
+                    doc_compat_guesses.append(-1)
+            guesses.append(np.array(doc_compat_guesses))
+
+        return guesses
+
+    def _scores2guesses_top_k_guardrail(self, docs, scores):
+        guesses = []
+        for doc, doc_scores in zip(docs, scores):
+            doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
+            doc_guesses = self.numpy_ops.asarray(doc_guesses)

            doc_compat_guesses = []
            for token, candidates in zip(doc, doc_guesses):
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@ -163,15 +163,33 @@ class TokenPatternString(BaseModel):
    IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
    INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
    FUZZY: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy")
-    FUZZY1: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy1")
-    FUZZY2: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy2")
-    FUZZY3: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy3")
-    FUZZY4: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy4")
-    FUZZY5: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy5")
-    FUZZY6: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy6")
-    FUZZY7: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy7")
-    FUZZY8: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy8")
-    FUZZY9: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy9")
+    FUZZY1: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy1"
+    )
+    FUZZY2: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy2"
+    )
+    FUZZY3: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy3"
+    )
+    FUZZY4: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy4"
+    )
+    FUZZY5: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy5"
+    )
+    FUZZY6: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy6"
+    )
+    FUZZY7: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy7"
+    )
+    FUZZY8: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy8"
+    )
+    FUZZY9: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+        None, alias="fuzzy9"
+    )

    class Config:
        extra = "forbid"
--- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
+++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
@ -101,14 +101,15 @@ def test_initialize_from_labels():
    }


-def test_no_data():
+@pytest.mark.parametrize("top_k", (1, 5, 30))
+def test_no_data(top_k):
    # Test that the lemmatizer provides a nice error when there's no tagging data / labels
    TEXTCAT_DATA = [
        ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
        ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
    ]
    nlp = English()
-    nlp.add_pipe("trainable_lemmatizer")
+    nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
    nlp.add_pipe("textcat")

    train_examples = []
@ -119,10 +120,11 @@ def test_no_data():
        nlp.initialize(get_examples=lambda: train_examples)


-def test_incomplete_data():
+@pytest.mark.parametrize("top_k", (1, 5, 30))
+def test_incomplete_data(top_k):
    # Test that the lemmatizer works with incomplete information
    nlp = English()
-    lemmatizer = nlp.add_pipe("trainable_lemmatizer")
+    lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
    lemmatizer.min_tree_freq = 1
    train_examples = []
    for t in PARTIAL_DATA:
@ -154,9 +156,10 @@ def test_incomplete_data():
    assert xp.count_nonzero(dX[1][1]) == 0


-def test_overfitting_IO():
+@pytest.mark.parametrize("top_k", (1, 5, 30))
+def test_overfitting_IO(top_k):
    nlp = English()
-    lemmatizer = nlp.add_pipe("trainable_lemmatizer")
+    lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
    lemmatizer.min_tree_freq = 1
    train_examples = []
    for t in TRAIN_DATA:
@ -189,7 +192,7 @@ def test_overfitting_IO():
    # Check model after a {to,from}_bytes roundtrip
    nlp_bytes = nlp.to_bytes()
    nlp3 = English()
-    nlp3.add_pipe("trainable_lemmatizer")
+    nlp3.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
    nlp3.from_bytes(nlp_bytes)
    doc3 = nlp3(test_text)
    assert doc3[0].lemma_ == "she"
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@ -618,7 +618,6 @@ def test_string_to_list_intify(value):
    assert string_to_list(value, intify=True) == [1, 2, 3]


-@pytest.mark.skip(reason="Temporarily skip for dev version")
 def test_download_compatibility():
    spec = SpecifierSet("==" + about.__version__)
    spec.prereleases = False
@ -629,7 +628,6 @@ def test_download_compatibility():
        assert get_minor_version(about.__version__) == get_minor_version(version)


-@pytest.mark.skip(reason="Temporarily skip for dev version")
 def test_validate_compatibility_table():
    spec = SpecifierSet("==" + about.__version__)
    spec.prereleases = False
--- a/website/.dockerignore
+++ b/website/.dockerignore
@ -0,0 +1,9 @@
+.cache/
+.next/
+public/
+node_modules
+.npm
+logs
+*.log
+npm-debug.log*
+quickstart-training-generator.js
--- a/website/.gitignore
+++ b/website/.gitignore
@ -1,5 +1,7 @@
 # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.

+quickstart-training-generator.js
+
 # dependencies
 /node_modules
 /.pnp
--- a/website/Dockerfile
+++ b/website/Dockerfile
@ -1,16 +1,14 @@
-FROM node:11.15.0 
+FROM node:18

-WORKDIR /spacy-io
-
-RUN npm install -g gatsby-cli@2.7.4
-
-COPY package.json .
-COPY package-lock.json . 
-
-RUN npm install
+USER node

 # This is so the installed node_modules will be up one directory
 # from where a user mounts files, so that they don't accidentally mount
 # their own node_modules from a different build
 # https://nodejs.org/api/modules.html#modules_loading_from_node_modules_folders
-WORKDIR /spacy-io/website/
+WORKDIR /home/node
+COPY --chown=node package.json .
+COPY --chown=node package-lock.json .
+RUN npm install
+
+WORKDIR /home/node/website/
--- a/website/README.md
+++ b/website/README.md
@ -41,33 +41,27 @@ If you'd like to do this, **be sure you do _not_ include your local
 `node_modules` folder**, since there are some dependencies that need to be built
 for the image system. Rename it before using.

-```bash
-docker run -it \
-  -v $(pwd):/spacy-io/website \
-  -p 8000:8000 \
-  ghcr.io/explosion/spacy-io \
-  gatsby develop -H 0.0.0.0
-```
-
-This will allow you to access the built website at http://0.0.0.0:8000/ in your
-browser, and still edit code in your editor while having the site reflect those
-changes.
-
-**Note**: If you're working on a Mac with an M1 processor, you might see
-segfault errors from `qemu` if you use the default image. To fix this use the
-`arm64` tagged image in the `docker run` command
-(ghcr.io/explosion/spacy-io:arm64).
-
-### Building the Docker image
-
-If you'd like to build the image locally, you can do so like this:
+First build the Docker image. This only needs to be done on the first run
+or when changes are made to `Dockerfile` or the website dependencies:

 ```bash
 docker build -t spacy-io .
 ```

-This will take some time, so if you want to use the prebuilt image you'll save a
-bit of time.
+You can then build and run the website with:
+
+```bash
+docker run -it \
+  --rm \
+  -v $(pwd):/home/node/website \
+  -p 3000:3000 \
+  spacy-io \
+  npm run dev -- -H 0.0.0.0
+```
+
+This will allow you to access the built website at http://0.0.0.0:3000/ in your
+browser, and still edit code in your editor while having the site reflect those
+changes.

 ## Project structure

--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@ -1215,19 +1215,19 @@ When a directory is provided it is traversed recursively to collect all files.
 $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
 ```

-| Name                                      | Description                                                                                                                                                                          |
-| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`                                   | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                  |
-| `data_path`                               | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~                                                 |
-| `output-file`, `-o`                       | Output `DocBin` path. ~~str (positional)~~                                                                                                                                           |
-| `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--text-key`, `-tk`                       | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~                                                                            |
-| `--force-overwrite`, `-F`                 | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~                    |
-| `--gpu-id`, `-g`                          | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
-| `--batch-size`, `-b`                      | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                                  |
-| `--n-process`, `-n`                       | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                         |
-| `--help`, `-h`                            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
-| **CREATES**                               | A `DocBin` with the annotations from the `model` for all the files found in `data-path`.                                                                                             |
+| Name                      | Description                                                                                                                                                                          |
+| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                   | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                  |
+| `data-path`               | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~                                                 |
+| `output-file`, `-o`       | Output `DocBin` path. ~~str (positional)~~                                                                                                                                           |
+| `--code`, `-c`            | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--text-key`, `-tk`       | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~                                                                            |
+| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~                    |
+| `--gpu-id`, `-g`          | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
+| `--batch-size`, `-b`      | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                                  |
+| `--n-process`, `-n`       | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                         |
+| `--help`, `-h`            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
+| **CREATES**               | A `DocBin` with the annotations from the `model` for all the files found in `data-path`.                                                                                             |

 ## find-threshold {id="find-threshold",version="3.5",tag="command"}

--- a/website/docs/api/entitylinker.mdx
+++ b/website/docs/api/entitylinker.mdx
@ -15,7 +15,7 @@ world". It requires a `KnowledgeBase`, as well as a function to generate
 plausible candidates from that `KnowledgeBase` given a certain textual mention,
 and a machine learning model to pick the right candidate, given the local
 context of the mention. `EntityLinker` defaults to using the
-[`InMemoryLookupKB`](/api/kb_in_memory) implementation.
+[`InMemoryLookupKB`](/api/inmemorylookupkb) implementation.

 ## Assigned Attributes {id="assigned-attributes"}

--- a/website/docs/api/inmemorylookupkb.mdx
+++ b/website/docs/api/inmemorylookupkb.mdx
@ -43,7 +43,7 @@ The length of the fixed-size entity vectors in the knowledge base.

 Add an entity to the knowledge base, specifying its corpus frequency and entity
 vector, which should be of length
-[`entity_vector_length`](/api/kb_in_memory#entity_vector_length).
+[`entity_vector_length`](/api/inmemorylookupkb#entity_vector_length).

 > #### Example
 >
@ -79,8 +79,9 @@ frequency and entity vector for each entity.

 Add an alias or mention to the knowledge base, specifying its potential KB
 identifiers and their prior probabilities. The entity identifiers should refer
-to entities previously added with [`add_entity`](/api/kb_in_memory#add_entity)
-or [`set_entities`](/api/kb_in_memory#set_entities). The sum of the prior
+to entities previously added with
+[`add_entity`](/api/inmemorylookupkb#add_entity) or
+[`set_entities`](/api/inmemorylookupkb#set_entities). The sum of the prior
 probabilities should not exceed 1. Note that an empty string can not be used as
 alias.

@ -156,7 +157,7 @@ Get a list of all aliases in the knowledge base.

 Given a certain textual mention as input, retrieve a list of candidate entities
 of type [`Candidate`](/api/kb#candidate). Wraps
-[`get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
+[`get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).

 > #### Example
 >
@ -174,7 +175,7 @@ of type [`Candidate`](/api/kb#candidate). Wraps

 ## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"}

-Same as [`get_candidates()`](/api/kb_in_memory#get_candidates), but for an
+Same as [`get_candidates()`](/api/inmemorylookupkb#get_candidates), but for an
 arbitrary number of mentions. The [`EntityLinker`](/api/entitylinker) component
 will call `get_candidates_batch()` instead of `get_candidates()`, if the config
 parameter `candidates_batch_size` is greater than or equal to 1.
@ -231,7 +232,7 @@ Given a certain entity ID, retrieve its pretrained entity vector.

 ## InMemoryLookupKB.get_vectors {id="get_vectors",tag="method"}

-Same as [`get_vector()`](/api/kb_in_memory#get_vector), but for an arbitrary
+Same as [`get_vector()`](/api/inmemorylookupkb#get_vector), but for an arbitrary
 number of entity IDs.

 The default implementation of `get_vectors()` executes `get_vector()` in a loop.
--- a/website/docs/api/kb.mdx
+++ b/website/docs/api/kb.mdx
@ -21,8 +21,8 @@ functions called by the [`EntityLinker`](/api/entitylinker) component.
 <Infobox variant="warning">

 This class was not abstract up to spaCy version 3.5. The `KnowledgeBase`
-implementation up to that point is available as `InMemoryLookupKB` from 3.5
-onwards.
+implementation up to that point is available as
+[`InMemoryLookupKB`](/api/inmemorylookupkb) from 3.5 onwards.

 </Infobox>

@ -110,14 +110,15 @@ to you.
 </Infobox>

 From spaCy 3.5 on `KnowledgeBase` is an abstract class (with
-[`InMemoryLookupKB`](/api/kb_in_memory) being a drop-in replacement) to allow
-more flexibility in customizing knowledge bases. Some of its methods were moved
-to [`InMemoryLookupKB`](/api/kb_in_memory) during this refactoring, one of those
-being `get_alias_candidates()`. This method is now available as
-[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
-Note: [`InMemoryLookupKB.get_candidates()`](/api/kb_in_memory#get_candidates)
+[`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to
+allow more flexibility in customizing knowledge bases. Some of its methods were
+moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring,
+one of those being `get_alias_candidates()`. This method is now available as
+[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
+Note:
+[`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates)
 defaults to
-[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
+[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).

 ## KnowledgeBase.get_vector {id="get_vector",tag="method"}

--- a/website/docs/usage/101/_architecture.mdx
+++ b/website/docs/usage/101/_architecture.mdx
@ -79,7 +79,7 @@ operates on a `Doc` and gives you access to the matched tokens **in context**.
 | ------------------------------------------------ | -------------------------------------------------------------------------------------------------- |
 | [`Corpus`](/api/corpus)                          | Class for managing annotated corpora for training and evaluation data.                             |
 | [`KnowledgeBase`](/api/kb)                       | Abstract base class for storage and retrieval of data for entity linking.                          |
-| [`InMemoryLookupKB`](/api/kb_in_memory)          | Implementation of `KnowledgeBase` storing all data in memory.                                      |
+| [`InMemoryLookupKB`](/api/inmemorylookupkb)      | Implementation of `KnowledgeBase` storing all data in memory.                                      |
 | [`Candidate`](/api/kb#candidate)                 | Object associating a textual mention with a specific entity contained in a `KnowledgeBase`.        |
 | [`Lookups`](/api/lookups)                        | Container for convenient access to large lookup tables and dictionaries.                           |
 | [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis.                                                                          |
--- a/website/docs/usage/rule-based-matching.mdx
+++ b/website/docs/usage/rule-based-matching.mdx
@ -384,14 +384,14 @@ the more specific attributes `FUZZY1`..`FUZZY9` you can specify the maximum
 allowed edit distance directly.

 ```python
-# Match lowercase with fuzzy matching (allows 2 edits)
+# Match lowercase with fuzzy matching (allows 3 edits)
 pattern = [{"LOWER": {"FUZZY": "definitely"}}]

-# Match custom attribute values with fuzzy matching (allows 2 edits)
+# Match custom attribute values with fuzzy matching (allows 3 edits)
 pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}]

-# Match with exact Levenshtein edit distance limits (allows 3 edits)
-pattern = [{"_": {"country": {"FUZZY3": "Kyrgyzstan"}}}]
+# Match with exact Levenshtein edit distance limits (allows 4 edits)
+pattern = [{"_": {"country": {"FUZZY4": "Kyrgyzstan"}}}]
 ```

 #### Regex and fuzzy matching with lists {id="regex-fuzzy-lists", version="3.5"}
--- a/website/docs/usage/saving-loading.mdx
+++ b/website/docs/usage/saving-loading.mdx
@ -684,10 +684,15 @@ If your pipeline includes
 [custom components](/usage/processing-pipelines#custom-components), model
 architectures or other [code](/usage/training#custom-code), those functions need
 to be registered **before** your pipeline is loaded. Otherwise, spaCy won't know
-how to create the objects referenced in the config. The
-[`spacy package`](/api/cli#package) command lets you provide one or more paths
-to Python files containing custom registered functions using the `--code`
-argument.
+how to create the objects referenced in the config. If you're loading your own
+pipeline in Python, you can make custom components available just by importing
+the code that defines them before calling
+[`spacy.load`](/api/top-level#spacy.load). This is also how the `--code`
+argument to CLI commands works.
+
+With the [`spacy package`](/api/cli#package) command, you can provide one or
+more paths to Python files containing custom registered functions using the
+`--code` argument.

 > #### \_\_init\_\_.py (excerpt)
 >
--- a/website/docs/usage/v3-5.mdx
+++ b/website/docs/usage/v3-5.mdx
@ -0,0 +1,215 @@
+---
+title: What's New in v3.5
+teaser: New features and how to upgrade
+menu:
+  - ['New Features', 'features']
+  - ['Upgrading Notes', 'upgrading']
+---
+
+## New features {id="features",hidden="true"}
+
+spaCy v3.5 introduces three new CLI commands, `apply`, `benchmark` and
+`find-threshold`, adds fuzzy matching, provides improvements to our entity
+linking functionality, and includes a range of language updates and bug fixes.
+
+### New CLI commands {id="cli"}
+
+#### apply CLI
+
+The [`apply` CLI](/api/cli#apply) can be used to apply a pipeline to one or more
+`.txt`, `.jsonl` or `.spacy` input files, saving the annotated docs in a single
+`.spacy` file.
+
+```bash
+$ spacy apply en_core_web_sm my_texts/ output.spacy
+```
+
+#### benchmark CLI
+
+The [`benchmark` CLI](/api/cli#benchmark) has been added to extend the existing
+`evaluate` functionality with a wider range of profiling subcommands.
+
+The `benchmark accuracy` CLI is introduced as an alias for `evaluate`. The new
+`benchmark speed` CLI performs warmup rounds before measuring the speed in words
+per second on batches of randomly shuffled documents from the provided data.
+
+```bash
+$ spacy benchmark speed my_pipeline data.spacy
+```
+
+The output is the mean performance using batches (`nlp.pipe`) with a 95%
+confidence interval, e.g., profiling `en_core_web_sm` on CPU:
+
+```none
+Outliers: 2.0%, extreme outliers: 0.0%
+Mean: 18904.1 words/s (95% CI: -256.9 +244.1)
+```
+
+#### find-threshold CLI
+
+The [`find-threshold` CLI](/api/cli#find-threshold) runs a series of trials
+across threshold values from `0.0` to `1.0` and identifies the best threshold
+for the provided score metric.
+
+The following command runs 20 trials for the `spancat` component in
+`my_pipeline`, recording the `spans_sc_f` score for each value of the threshold
+`[components.spancat.threshold]` from `0.0` to `1.0`:
+
+```bash
+$ spacy find-threshold my_pipeline data.spacy spancat threshold spans_sc_f --n_trials 20
+```
+
+The `find-threshold` CLI can be used with `textcat_multilabel`, `spancat` and
+custom components with thresholds that are applied while predicting or scoring.
+
+### Fuzzy matching {id="fuzzy"}
+
+New `FUZZY` operators support [fuzzy matching](/usage/rule-based-matching#fuzzy)
+with the `Matcher`. By default, the `FUZZY` operator allows a Levenshtein edit
+distance of at least 2 and up to 30% of the pattern string length.
+`FUZZY1`..`FUZZY9` can be used to specify the maximum number of allowed edits.
+
+```python
+# Match lowercase with fuzzy matching (allows up to 3 edits)
+pattern = [{"LOWER": {"FUZZY": "definitely"}}]
+
+# Match custom attribute values with fuzzy matching (allows up to 3 edits)
+pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}]
+
+# Match with exact Levenshtein edit distance limits (allows up to 4 edits)
+pattern = [{"_": {"country": {"FUZZY4": "Kyrgyzstan"}}}]
+```
+
+Note that `FUZZY` uses Levenshtein edit distance rather than Damerau-Levenshtein
+edit distance, so a transposition like `teh` for `the` counts as two edits, one
+insertion and one deletion.
+
+If you'd prefer an alternate fuzzy matching algorithm, you can provide your own
+custom method to the `Matcher` or as a config option for an entity ruler and
+span ruler.
+
+### FUZZY and REGEX with lists {id="fuzzy-regex-lists"}
+
+The `FUZZY` and `REGEX` operators are also now supported for lists with `IN` and
+`NOT_IN`:
+
+```python
+pattern = [{"TEXT": {"FUZZY": {"IN": ["awesome", "cool", "wonderful"]}}}]
+pattern = [{"TEXT": {"REGEX": {"NOT_IN": ["^awe(some)?$", "^wonder(ful)?"]}}}]
+```
+
+### Entity linking generalization {id="el"}
+
+The knowledge base used for entity linking is now easier to customize and has a
+new default implementation [`InMemoryLookupKB`](/api/inmemorylookupkb).
+
+### Additional features and improvements {id="additional-features-and-improvements"}
+
+- Language updates:
+  - Extended support for Slovenian
+  - Fixed lookup fallback for French and Catalan lemmatizers
+  - Switch Russian and Ukrainian lemmatizers to `pymorphy3`
+  - Support for editorial punctuation in Ancient Greek
+  - Update to Russian tokenizer exceptions
+  - Small fix for Dutch stop words
+- Allow up to `typer` v0.7.x, `mypy` 0.990 and `typing_extensions` v4.4.x.
+- New `spacy.ConsoleLogger.v3` with expanded progress
+  [tracking](/api/top-level#ConsoleLogger).
+- Improved scoring behavior for `textcat` with `spacy.textcat_scorer.v2` and
+  `spacy.textcat_multilabel_scorer.v2`.
+- Updates so that downstream components can train properly on a frozen `tok2vec`
+  or `transformer` layer.
+- Allow interpolation of variables in directory names in projects.
+- Support for local file system [remotes](/usage/projects#remote) for projects.
+- Improve UX around `displacy.serve` when the default port is in use.
+- Optional `before_update` callback that is invoked at the start of each
+  [training step](/api/data-formats#config-training).
+- Improve performance of `SpanGroup` and fix typing issues for `SpanGroup` and
+  `Span` objects.
+- Patch a
+  [security vulnerability](https://github.com/advisories/GHSA-gw9q-c7gh-j9vm) in
+  extracting tar files.
+- Add equality definition for `Vectors`.
+- Ensure `Vocab.to_disk` respects the exclude setting for `lookups` and
+  `vectors`.
+- Correctly handle missing annotations in the edit tree lemmatizer.
+
+### Trained pipeline updates {id="pipelines"}
+
+- The CNN pipelines add `IS_SPACE` as a `tok2vec` feature for `tagger` and
+  `morphologizer` components to improve tagging of non-whitespace vs. whitespace
+  tokens.
+- The transformer pipelines require `spacy-transformers` v1.2, which uses the
+  exact alignment from `tokenizers` for fast tokenizers instead of the heuristic
+  alignment from `spacy-alignments`. For all trained pipelines except
+  `ja_core_news_trf`, the alignments between spaCy tokens and transformer tokens
+  may be slightly different. More details about the `spacy-transformers` changes
+  in the
+  [v1.2.0 release notes](https://github.com/explosion/spacy-transformers/releases/tag/v1.2.0).
+
+## Notes about upgrading from v3.4 {id="upgrading"}
+
+### Validation of textcat values {id="textcat-validation"}
+
+An error is now raised when unsupported values are given as input to train a
+`textcat` or `textcat_multilabel` model. Ensure that all values are `0.0` or
+`1.0`, as explained in the [docs](/api/textcategorizer#assigned-attributes).
+
+### Updated scorers for tokenization and textcat {id="scores"}
+
+We fixed a bug that inflated the `token_acc` scores in v3.0-v3.4. The reported
+`token_acc` will drop from v3.4 to v3.5, but if `token_p/r/f` stay the same,
+your tokenization performance has not changed from v3.4.
+
+For new `textcat` or `textcat_multilabel` configs, the new default `v2` scorers:
+
+- ignore `threshold` for `textcat`, so the reported `cats_p/r/f` may increase
+  slightly in v3.5 even though the underlying predictions are unchanged
+- report the performance of only the **final** `textcat` or `textcat_multilabel`
+  component in the pipeline by default
+- allow custom scorers to be used to score multiple `textcat` and
+  `textcat_multilabel` components with `Scorer.score_cats` by restricting the
+  evaluation to the component's provided labels
+
+### Pipeline package version compatibility {id="version-compat"}
+
+> #### Using legacy implementations
+>
+> In spaCy v3, you'll still be able to load and reference legacy implementations
+> via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the
+> components or architectures change and newer versions are available in the
+> core library.
+
+When you're loading a pipeline package trained with an earlier version of spaCy
+v3, you will see a warning telling you that the pipeline may be incompatible.
+This doesn't necessarily have to be true, but we recommend running your
+pipelines against your test suite or evaluation data to make sure there are no
+unexpected results.
+
+If you're using one of the [trained pipelines](/models) we provide, you should
+run [`spacy download`](/api/cli#download) to update to the latest version. To
+see an overview of all installed packages and their compatibility, you can run
+[`spacy validate`](/api/cli#validate).
+
+If you've trained your own custom pipeline and you've confirmed that it's still
+working as expected, you can update the spaCy version requirements in the
+[`meta.json`](/api/data-formats#meta):
+
+```diff
+- "spacy_version": ">=3.4.0,<3.5.0",
++ "spacy_version": ">=3.4.0,<3.6.0",
+```
+
+### Updating v3.4 configs
+
+To update a config from spaCy v3.4 with the new v3.5 settings, run
+[`init fill-config`](/api/cli#init-fill-config):
+
+```cli
+$ python -m spacy init fill-config config-v3.4.cfg config-v3.5.cfg
+```
+
+In many cases ([`spacy train`](/api/cli#train),
+[`spacy.load`](/api/top-level#spacy.load)), the new defaults will be filled in
+automatically, but you'll need to fill in the new settings to run
+[`debug config`](/api/cli#debug) and [`debug data`](/api/cli#debug-data).
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@ -13,7 +13,8 @@
                    { "text": "New in v3.1", "url": "/usage/v3-1" },
                    { "text": "New in v3.2", "url": "/usage/v3-2" },
                    { "text": "New in v3.3", "url": "/usage/v3-3" },
-                    { "text": "New in v3.4", "url": "/usage/v3-4" }
+                    { "text": "New in v3.4", "url": "/usage/v3-4" },
+                    { "text": "New in v3.5", "url": "/usage/v3-5" }
                ]
            },
            {
@ -129,6 +130,7 @@
                "items": [
                    { "text": "Attributes", "url": "/api/attributes" },
                    { "text": "Corpus", "url": "/api/corpus" },
+                    { "text": "InMemoryLookupKB", "url": "/api/inmemorylookupkb" },
                    { "text": "KnowledgeBase", "url": "/api/kb" },
                    { "text": "Lookups", "url": "/api/lookups" },
                    { "text": "MorphAnalysis", "url": "/api/morphology#morphanalysis" },
--- a/website/pages/index.tsx
+++ b/website/pages/index.tsx
@ -89,7 +89,7 @@ const Landing = () => {
                </LandingCard>

                <LandingCard title="Awesome ecosystem" url="/usage/projects" button="Read more">
-                    In the five years since its release, spaCy has become an industry standard with
+                    Since its release in 2015, spaCy has become an industry standard with
                    a huge ecosystem. Choose from a variety of plugins, integrate with your machine
                    learning stack and build custom components and workflows.
                </LandingCard>
--- a/website/src/components/seo.js
+++ b/website/src/components/seo.js
@ -9,6 +9,8 @@ import socialImageLegacy from '../images/social_legacy.jpg'
 import siteMetadata from '../../meta/site.json'
 import Head from 'next/head'

+import { siteUrl } from '../../meta/dynamicMeta.mjs'
+
 function getPageTitle(title, sitename, slogan, sectionTitle, nightly, legacy) {
    if (sectionTitle && title) {
        const suffix = nightly ? ' (nightly)' : legacy ? ' (legacy)' : ''
@ -25,7 +27,7 @@ function getImage(section, nightly, legacy) {
    if (legacy) return socialImageLegacy
    if (section === 'api') return socialImageApi
    if (section === 'universe') return socialImageUniverse
-    return socialImageDefault
+    return `${siteUrl}${socialImageDefault.src}`
 }

 export default function SEO({
@ -46,7 +48,7 @@ export default function SEO({
        nightly,
        legacy
    )
-    const socialImage = getImage(section, nightly, legacy).src
+    const socialImage = getImage(section, nightly, legacy)
    const meta = [
        {
            name: 'description',
--- a/website/src/styles/list.module.sass
+++ b/website/src/styles/list.module.sass
@ -20,6 +20,10 @@
        display: inline-block
        margin-bottom: var(--spacing-sm)

+    .ol, .ul
+        margin-top: var(--spacing-xs)
+        margin-bottom: var(--spacing-xs)
+
    &:before
        content: '\25CF'
        position: relative
--- a/website/src/templates/index.js
+++ b/website/src/templates/index.js
@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => {
 }

 const navAlert = (
-    <Link to="/usage/v3-4" hidden>
-        <strong>💥 Out now:</strong> spaCy v3.4
+    <Link to="/usage/v3-5" hidden>
+        <strong>💥 Out now:</strong> spaCy v3.5
    </Link>
 )