Merge branch 'v4' into feature/multiple-code-files

2025-10-25 05:01:02 +03:00 · 2023-02-06 14:43:19 +09:00 · 2023-02-06 14:43:19 +09:00 · 7ef87e24ca
commit 7ef87e24ca
parent 5aff2b8204 eec5ccd72f
24 changed files with 206 additions and 84 deletions
--- a/spacy/language.py
+++ b/spacy/language.py
@ -1248,17 +1248,12 @@ class Language:
            component_cfg[name].setdefault("drop", drop)
            pipe_kwargs[name].setdefault("batch_size", self.batch_size)
        for name, proc in self.pipeline:
-            # ignore statements are used here because mypy ignores hasattr
+            if (
-            if name not in exclude and hasattr(proc, "update"):
+                name not in exclude
-                proc.update(examples, sgd=None, losses=losses, **component_cfg[name])  # type: ignore
+                and isinstance(proc, ty.TrainableComponent)
-            if sgd not in (None, False):
+                and proc.is_trainable
-                if (
+            ):
-                    name not in exclude
+                proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
                    and isinstance(proc, ty.TrainableComponent)
                    and proc.is_trainable
                    and proc.model not in (True, False, None)
                ):
                    proc.finish_update(sgd)
            if name in annotates:
                for doc, eg in zip(
                    _pipe(
@ -1271,6 +1266,17 @@ class Language:
                    examples,
                ):
                    eg.predicted = doc
        # Only finish the update after all component updates are done. Some
        # components may share weights (such as tok2vec) and we only want
        # to apply weight updates after all gradients are accumulated.
        for name, proc in self.pipeline:
            if (
                name not in exclude
                and isinstance(proc, ty.TrainableComponent)
                and proc.is_trainable
            ):
                proc.finish_update(sgd)
        return losses
    def rehearse(
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@ -27,9 +27,6 @@ ActivationsT = Dict[str, Union[List[Ragged], List[str]]]
 KNOWLEDGE_BASE_IDS = "kb_ids"
 # See #9050
 BACKWARD_OVERWRITE = True
 default_model_config = """
 [model]
@architectures = "spacy.EntityLinker.v2"
@ -60,7 +57,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
        "entity_vector_length": 64,
        "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
        "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
-        "overwrite": True,
+        "overwrite": False,
        "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
        "use_gold_ents": True,
        "candidates_batch_size": 1,
@ -191,7 +188,7 @@ class EntityLinker(TrainablePipe):
        get_candidates_batch: Callable[
            [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
        ],
-        overwrite: bool = BACKWARD_OVERWRITE,
+        overwrite: bool = False,
        scorer: Optional[Callable] = entity_linker_score,
        use_gold_ents: bool,
        candidates_batch_size: int,
@ -215,6 +212,7 @@ class EntityLinker(TrainablePipe):
            Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
            Iterable[Candidate]]
            ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
        overwrite (bool): Whether to overwrite existing non-empty annotations.
        scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
        use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
            component must provide entity annotations.
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@ -21,10 +21,6 @@ from ..scorer import Scorer
 from ..training import validate_examples, validate_get_examples
 from ..util import registry
 # See #9050
 BACKWARD_OVERWRITE = True
 BACKWARD_EXTEND = False
 default_model_config = """
 [model]
@architectures = "spacy.Tagger.v2"
@ -102,8 +98,8 @@ class Morphologizer(Tagger):
        model: Model,
        name: str = "morphologizer",
        *,
-        overwrite: bool = BACKWARD_OVERWRITE,
+        overwrite: bool = False,
-        extend: bool = BACKWARD_EXTEND,
+        extend: bool = False,
        scorer: Optional[Callable] = morphologizer_score,
        save_activations: bool = False,
    ):
@ -113,6 +109,8 @@ class Morphologizer(Tagger):
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
        name (str): The component instance name, used to add entries to the
            losses during training.
        overwrite (bool): Whether to overwrite existing annotations.
        extend (bool): Whether to extend existing annotations.
        scorer (Optional[Callable]): The scoring method. Defaults to
            Scorer.score_token_attr for the attributes "pos" and "morph" and
            Scorer.score_token_attr_per_feat for the attribute "morph".
--- a/spacy/pipeline/sentencizer.pyx
+++ b/spacy/pipeline/sentencizer.pyx
@ -10,9 +10,6 @@ from ..language import Language
 from ..scorer import Scorer
 from .. import util
 # see #9050
 BACKWARD_OVERWRITE = False
@Language.factory(
    "sentencizer",
    assigns=["token.is_sent_start", "doc.sents"],
@ -52,13 +49,14 @@ class Sentencizer(Pipe):
        name="sentencizer",
        *,
        punct_chars=None,
-        overwrite=BACKWARD_OVERWRITE,
+        overwrite=False,
        scorer=senter_score,
    ):
        """Initialize the sentencizer.
        punct_chars (list): Punctuation characters to split on. Will be
            serialized with the nlp object.
        overwrite (bool): Whether to overwrite existing annotations.
        scorer (Optional[Callable]): The scoring method. Defaults to
            Scorer.score_spans for the attribute "sents".
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@ -18,8 +18,6 @@ from ..training import validate_examples, validate_get_examples
 from ..util import registry
 from .. import util
 # See #9050
 BACKWARD_OVERWRITE = False
 default_model_config = """
 [model]
@ -83,7 +81,7 @@ class SentenceRecognizer(Tagger):
        model,
        name="senter",
        *,
-        overwrite=BACKWARD_OVERWRITE,
+        overwrite=False,
        scorer=senter_score,
        save_activations: bool = False,
    ):
@ -93,6 +91,7 @@ class SentenceRecognizer(Tagger):
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
        name (str): The component instance name, used to add entries to the
            losses during training.
        overwrite (bool): Whether to overwrite existing annotations.
        scorer (Optional[Callable]): The scoring method. Defaults to
            Scorer.score_spans for the attribute "sents".
        save_activations (bool): save model activations in Doc when annotating.
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@ -27,9 +27,6 @@ from .. import util
 ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
 # See #9050
 BACKWARD_OVERWRITE = False
 default_model_config = """
 [model]
@architectures = "spacy.Tagger.v2"
@ -99,7 +96,7 @@ class Tagger(TrainablePipe):
        model,
        name="tagger",
        *,
-        overwrite=BACKWARD_OVERWRITE,
+        overwrite=False,
        scorer=tagger_score,
        neg_prefix="!",
        save_activations: bool = False,
@ -110,6 +107,7 @@ class Tagger(TrainablePipe):
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
        name (str): The component instance name, used to add entries to the
            losses during training.
        overwrite (bool): Whether to overwrite existing annotations.
        scorer (Optional[Callable]): The scoring method. Defaults to
            Scorer.score_token_attr for the attribute "tag".
        save_activations (bool): save model activations in Doc when annotating.
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@ -175,6 +175,18 @@ def test_modify_span_group(doc):
    assert group[0].label == doc.vocab.strings["TEST"]
 def test_char_span_attributes(doc):
    """Doc.char_span and Span.char_span should produce matching spans.

    The same label, KB ID and span ID are passed to both calls; the resulting
    spans must agree on text and carry those attributes.
    """
    attrs = {"label": "LABEL", "kb_id": "KB_ID", "span_id": "SPAN_ID"}
    from_doc = doc.char_span(20, 45, **attrs)
    # Character offsets differ because the second call is made on a sub-span.
    from_span = doc[1:].char_span(15, 40, **attrs)
    assert from_doc.text == from_span.text
    assert from_doc.label_ == from_span.label_ == attrs["label"]
    assert from_doc.kb_id_ == from_span.kb_id_ == attrs["kb_id"]
    assert from_doc.id_ == from_span.id_ == attrs["span_id"]
 def test_spans_sent_spans(doc):
    sents = list(doc.sents)
    assert sents[0].start == 0
@ -354,6 +366,14 @@ def test_spans_by_character(doc):
            span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk"
        )
    # Span.char_span + alignment mode "contract"
    span2 = doc[0:2].char_span(
        span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract"
    )
    assert span1.start_char == span2.start_char
    assert span1.end_char == span2.end_char
    assert span2.label_ == "GPE"
 def test_span_to_array(doc):
    span = doc[1:-2]
--- a/spacy/tests/pipeline/test_annotates_on_update.py
+++ b/spacy/tests/pipeline/test_annotates_on_update.py
@ -54,9 +54,11 @@ def test_annotates_on_update():
        return AssertSents(name)
    class AssertSents:
        model = None
        is_trainable = True
        def __init__(self, name, **cfg):
            self.name = name
            pass
        def __call__(self, doc):
            if not doc.has_annotation("SENT_START"):
@ -64,10 +66,16 @@ def test_annotates_on_update():
            return doc
        def update(self, examples, *, drop=0.0, sgd=None, losses=None):
            losses.setdefault(self.name, 0.0)
            for example in examples:
                if not example.predicted.has_annotation("SENT_START"):
                    raise ValueError("No sents")
-            return {}
+
            return losses
        def finish_update(self, sgd=None):
            pass
    nlp = English()
    nlp.add_pipe("sentencizer")
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@ -1017,8 +1017,6 @@ def test_local_remote_storage_pull_missing():
 def test_cli_find_threshold(capsys):
    thresholds = numpy.linspace(0, 1, 10)
    def make_examples(nlp: Language) -> List[Example]:
        docs: List[Example] = []
@ -1082,8 +1080,6 @@ def test_cli_find_threshold(capsys):
                scores_key="cats_macro_f",
                silent=True,
            )
            assert best_threshold != thresholds[0]
            assert thresholds[0] < best_threshold < thresholds[9]
            assert best_score == max(res.values())
            assert res[1.0] == 0.0
@ -1091,7 +1087,7 @@ def test_cli_find_threshold(capsys):
        nlp, _ = init_nlp((("spancat", {}),))
        with make_tempdir() as nlp_dir:
            nlp.to_disk(nlp_dir)
-            res = find_threshold(
+            best_threshold, best_score, res = find_threshold(
                model=nlp_dir,
                data_path=docs_dir / "docs.spacy",
                pipe_name="spancat",
@ -1099,10 +1095,8 @@ def test_cli_find_threshold(capsys):
                scores_key="spans_sc_f",
                silent=True,
            )
-            assert res[0] != thresholds[0]
+            assert best_score == max(res.values())
-            assert thresholds[0] < res[0] < thresholds[8]
+            assert res[1.0] == 0.0
            assert res[1] >= 0.6
            assert res[2][1.0] == 0.0
        # Having multiple textcat_multilabel components should work, since the name has to be specified.
        nlp, _ = init_nlp((("textcat_multilabel", {}),))
--- a/spacy/tests/test_cli_app.py
+++ b/spacy/tests/test_cli_app.py
@ -9,7 +9,7 @@ import spacy
 from spacy.cli._util import app
 from spacy.language import Language
 from spacy.tokens import DocBin
-from .util import make_tempdir
+from .util import make_tempdir, normalize_whitespace
 def test_convert_auto():
@ -247,8 +247,8 @@ def test_benchmark_accuracy_alias():
    # Verify that the `evaluate` alias works correctly.
    result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])
    result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"])
-    assert result_benchmark.stdout == result_evaluate.stdout.replace(
+    assert normalize_whitespace(result_benchmark.stdout) == normalize_whitespace(
-        "spacy evaluate", "spacy benchmark accuracy"
+        result_evaluate.stdout.replace("spacy evaluate", "spacy benchmark accuracy")
    )
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@ -10,8 +10,9 @@ from spacy.training import Example
 from spacy.lang.en import English
 from spacy.lang.de import German
 from spacy.util import registry, ignore_error, raise_error, find_matching_language
 from spacy.util import load_model_from_config
 import spacy
-from thinc.api import CupyOps, NumpyOps, get_current_ops
+from thinc.api import Config, CupyOps, NumpyOps, get_array_module, get_current_ops
 from .util import add_vecs_to_vocab, assert_docs_equal
@ -25,6 +26,51 @@ try:
 except ImportError:
    pass
 TAGGER_CFG_STRING = """
    [nlp]
    lang = "en"
    pipeline = ["tok2vec","tagger"]
    [components]
    [components.tagger]
    factory = "tagger"
    [components.tagger.model]
    @architectures = "spacy.Tagger.v2"
    nO = null
    [components.tagger.model.tok2vec]
    @architectures = "spacy.Tok2VecListener.v1"
    width = ${components.tok2vec.model.encode.width}
    [components.tok2vec]
    factory = "tok2vec"
    [components.tok2vec.model]
    @architectures = "spacy.Tok2Vec.v2"
    [components.tok2vec.model.embed]
    @architectures = "spacy.MultiHashEmbed.v1"
    width = ${components.tok2vec.model.encode.width}
    rows = [2000, 1000, 1000, 1000]
    attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
    include_static_vectors = false
    [components.tok2vec.model.encode]
    @architectures = "spacy.MaxoutWindowEncoder.v2"
    width = 96
    depth = 4
    window_size = 1
    maxout_pieces = 3
    """
 TAGGER_TRAIN_DATA = [
    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
 ]
 TAGGER_TRAIN_DATA = [
    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
@ -91,6 +137,26 @@ def test_language_update(nlp):
        example = Example.from_dict(doc, wrongkeyannots)
 def test_language_update_updates():
    """A single `nlp.update` call with an optimizer must change the doc tensors."""
    nlp = load_model_from_config(
        Config().from_str(TAGGER_CFG_STRING), auto_fill=True, validate=True
    )
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annots)
        for text, annots in TAGGER_TRAIN_DATA
    ]
    optimizer = nlp.initialize(get_examples=lambda: train_examples)

    def run_pipeline():
        # Run on copies so the examples' predicted docs are left untouched.
        return list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))

    docs_before_update = run_pipeline()
    nlp.update(train_examples, sgd=optimizer)
    docs_after_update = run_pipeline()

    tensor_before = docs_before_update[0].tensor
    tensor_after = docs_after_update[0].tensor
    xp = get_array_module(tensor_after)
    assert xp.any(xp.not_equal(tensor_before, tensor_after))
 def test_language_evaluate(nlp):
    text = "hello world"
    annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}
--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@ -1,6 +1,7 @@
 import numpy
 import tempfile
 import contextlib
 import re
 import srsly
 from spacy.tokens import Doc
 from spacy.vocab import Vocab
@ -95,3 +96,7 @@ def assert_packed_msg_equal(b1, b2):
    for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
        assert k1 == k2
        assert v1 == v2
 def normalize_whitespace(s):
    """Return `s` with every run of whitespace replaced by a single space."""
    whitespace_run = r"\s+"
    collapsed = re.sub(whitespace_run, " ", s)
    return collapsed
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@ -108,6 +108,7 @@ class Doc:
        kb_id: Union[int, str] = ...,
        vector: Optional[Floats1d] = ...,
        alignment_mode: str = ...,
        span_id: Union[int, str] = ...,
    ) -> Span: ...
    def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
    @property
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -528,9 +528,9 @@ cdef class Doc:
        doc (Doc): The parent document.
        start_idx (int): The index of the first character of the span.
        end_idx (int): The index of the first character after the span.
-        label (uint64 or string): A label to attach to the Span, e.g. for
+        label (Union[int, str]): A label to attach to the Span, e.g. for
            named entities.
-        kb_id (uint64 or string):  An ID from a KB to capture the meaning of a
+        kb_id (Union[int, str]):  An ID from a KB to capture the meaning of a
            named entity.
        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
            the span.
@ -539,6 +539,7 @@ cdef class Doc:
            with token boundaries), "contract" (span of all tokens completely
            within the character span), "expand" (span of all tokens at least
            partially covered by the character span). Defaults to "strict".
        span_id (Union[int, str]): An identifier to associate with the span.
        RETURNS (Span): The newly constructed object.
        DOCS: https://spacy.io/api/doc#char_span
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@ -96,6 +96,9 @@ class Span:
        label: Union[int, str] = ...,
        kb_id: Union[int, str] = ...,
        vector: Optional[Floats1d] = ...,
        id: Union[int, str] = ...,
        alignment_mode: str = ...,
        span_id: Union[int, str] = ...,
    ) -> Span: ...
    @property
    def conjuncts(self) -> Tuple[Token]: ...
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -656,22 +656,29 @@ cdef class Span:
        else:
            return self.doc[root]
-    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0):
+    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict", span_id=0):
        """Create a `Span` object from the slice `span.text[start : end]`.
        start (int): The index of the first character of the span.
        end (int): The index of the first character after the span.
-        label (uint64 or string): A label to attach to the Span, e.g. for
+        label (Union[int, str]): A label to attach to the Span, e.g. for
            named entities.
-        kb_id (uint64 or string):  An ID from a KB to capture the meaning of a named entity.
+        kb_id (Union[int, str]):  An ID from a KB to capture the meaning of a named entity.
        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
            the span.
        id (Union[int, str]): Unused.
        alignment_mode (str): How character indices are aligned to token
            boundaries. Options: "strict" (character indices must be aligned
            with token boundaries), "contract" (span of all tokens completely
            within the character span), "expand" (span of all tokens at least
            partially covered by the character span). Defaults to "strict".
        span_id (Union[int, str]): An identifier to associate with the span.
        RETURNS (Span): The newly constructed object.
        """
        cdef SpanC* span_c = self.span_c()
        start_idx += span_c.start_char
        end_idx += span_c.start_char
-        return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector)
+        return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode, span_id=span_id)
    @property
    def conjuncts(self):
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@ -210,7 +210,7 @@ def train_while_improving(
                subbatch,
                drop=dropout,
                losses=losses,
-                sgd=False,  # type: ignore[arg-type]
+                sgd=None,
                exclude=exclude,
                annotates=annotating_components,
            )
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@ -1410,12 +1410,13 @@ $ python -m spacy project assets [project_dir]
 > $ python -m spacy project assets [--sparse]
 > ```
-| Name             | Description                                                                                                                                               |
+| Name                                           | Description                                                                                                                                               |
-| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ---------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `project_dir`    | Path to project directory. Defaults to current working directory. ~~Path (positional)~~                                                                   |
+| `project_dir`                                  | Path to project directory. Defaults to current working directory. ~~Path (positional)~~                                                                   |
-| `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v22.2+. ~~bool (flag)~~ |
+| `--extra`, `-e` <Tag variant="new">3.3.1</Tag> | Download assets marked as "extra". Default false. ~~bool (flag)~~                                                                                         |
-| `--help`, `-h`   | Show help message and available arguments. ~~bool (flag)~~                                                                                                |
+| `--sparse`, `-S`                               | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v2.22+. ~~bool (flag)~~ |
-| **CREATES**      | Downloaded or copied assets defined in the `project.yml`.                                                                                                 |
+| `--help`, `-h`                                 | Show help message and available arguments. ~~bool (flag)~~                                                                                                |
 | **CREATES**                                    | Downloaded or copied assets defined in the `project.yml`.                                                                                                 |
 ### project run {id="project-run",tag="command"}
--- a/website/docs/api/doc.mdx
+++ b/website/docs/api/doc.mdx
@ -37,7 +37,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 | `words`                                  | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~                                                                                            |
 | `spaces`                                 | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~            |
 | _keyword-only_                           |                                                                                                                                                                                                         |
-| `user\_data`                             | Optional extra data to attach to the Doc. ~~Dict~~                                                                                                                                                      |
+| `user_data`                              | Optional extra data to attach to the Doc. ~~Dict~~                                                                                                                                                      |
 | `tags` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                                   |
 | `pos` <Tag variant="new">3</Tag>         | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                                   |
 | `morphs` <Tag variant="new">3</Tag>      | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                                 |
@ -209,15 +209,16 @@ alignment mode `"strict".
 > assert span.text == "New York"
 > ```
-| Name             | Description                                                                                                                                                                                                                                                                  |
+| Name                                     | Description                                                                                                                                                                                                                                                                  |
-| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `start`          | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                        |
+| `start`                                  | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                        |
-| `end`            | The index of the last character after the span. ~~int~~                                                                                                                                                                                                                      |
+| `end`                                    | The index of the last character after the span. ~~int~~                                                                                                                                                                                                                      |
-| `label`          | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                  |
+| `label`                                  | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                  |
-| `kb_id`          | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                    |
+| `kb_id`                                  | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                    |
-| `vector`         | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                               |
+| `vector`                                 | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                               |
-| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
+| `alignment_mode`                         | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
-| **RETURNS**      | The newly constructed object or `None`. ~~Optional[Span]~~                                                                                                                                                                                                                   |
+| `span_id` <Tag variant="new">3.3.1</Tag> | An identifier to associate with the span. ~~Union[int, str]~~                                                                                                                                                                                                                |
 | **RETURNS**                              | The newly constructed object or `None`. ~~Optional[Span]~~                                                                                                                                                                                                                   |
 ## Doc.set_ents {id="set_ents",tag="method",version="3"}
--- a/website/docs/api/entitylinker.mdx
+++ b/website/docs/api/entitylinker.mdx
@ -63,7 +63,7 @@ architectures and their arguments and hyperparameters.
 | `entity_vector_length`                          | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~                                                                                                                                                                                                                               |
 | `use_gold_ents`                                 | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~                                                                                                       |
 | `get_candidates`                                | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                    |
-| `overwrite` <Tag variant="new">3.2</Tag>        | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                    |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                   |
 | `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                     |
 | `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~                                                                                                                                                                        |
 | `threshold` <Tag variant="new">3.4</Tag>        | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@ -45,7 +45,7 @@ architectures and their arguments and hyperparameters.
 | Setting                                         | Description                                                                                                                                                                                                                                                            |
 | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `model`                                         | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                |
-| `overwrite` <Tag variant="new">3.2</Tag>        | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                  |
+| `overwrite` <Tag variant="new">3.2</Tag>        | Whether the values of existing features are overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                 |
 | `extend` <Tag variant="new">3.2</Tag>           | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~                                                                                                                      |
 | `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
 | `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~                                                                                                                                       |
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@ -186,14 +186,17 @@ the character indices don't map to a valid span.
 > assert span.text == "New York"
 > ```
-| Name        | Description                                                                               |
+| Name                                            | Description                                                                                                                                                                                                                                                                  |
-| ----------- | ----------------------------------------------------------------------------------------- |
+| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `start`     | The index of the first character of the span. ~~int~~                                     |
+| `start`                                         | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                        |
-| `end`       | The index of the last character after the span. ~~int~~                                   |
+| `end`                                           | The index of the last character after the span. ~~int~~                                                                                                                                                                                                                      |
-| `label`     | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~               |
+| `label`                                         | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                  |
-| `kb_id`     | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
+| `kb_id`                                         | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                    |
-| `vector`    | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~            |
+| `vector`                                        | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                               |
-| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~                                |
+| `id`                                            | Unused. ~~Union[int, str]~~                                                                                                                                                                                                                                                  |
 | `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
 | `span_id` <Tag variant="new">3.5.1</Tag>        | An identifier to associate with the span. ~~Union[int, str]~~                                                                                                                                                                                                                |
 | **RETURNS**                                     | The newly constructed object or `None`. ~~Optional[Span]~~                                                                                                                                                                                                                   |
 ## Span.similarity {id="similarity",tag="method",model="vectors"}
--- a/website/docs/models/index.mdx
+++ b/website/docs/models/index.mdx
@ -21,8 +21,8 @@ menu:
 ## Package naming conventions {id="conventions"}
 In general, spaCy expects all pipeline packages to follow the naming convention
-of `[lang]\_[name]`. For spaCy's pipelines, we also chose to divide the name
+of `[lang]_[name]`. For spaCy's pipelines, we also chose to divide the name into
-into three components:
+three components:
 1. **Type:** Capabilities (e.g. `core` for general-purpose pipeline with
   tagging, parsing, lemmatization and named entity recognition, or `dep` for
--- a/website/docs/usage/v3-5.mdx
+++ b/website/docs/usage/v3-5.mdx
@ -155,6 +155,21 @@ An error is now raised when unsupported values are given as input to train a
 `textcat` or `textcat_multilabel` model - ensure that values are `0.0` or `1.0`
 as explained in the [docs](/api/textcategorizer#assigned-attributes).
 ### Using the default knowledge base
 As `KnowledgeBase` is now an abstract class, you should call the constructor of
 the new `InMemoryLookupKB` instead when you want to use spaCy's default KB
 implementation:
 ```diff
 - kb = KnowledgeBase()
 + kb = InMemoryLookupKB()
 ```
 If you've written a custom KB that inherits from `KnowledgeBase`, you'll need to
 implement its abstract methods, or alternatively inherit from `InMemoryLookupKB`
 instead.
 ### Updated scorers for tokenization and textcat {id="scores"}
 We fixed a bug that inflated the `token_acc` scores in v3.0-v3.4. The reported