Update docs and types

Commit e9e8fa2466 by Ines Montani, 2020-07-31 17:02:54 +02:00
Parent: dab31426e1
22 changed files with 232 additions and 137 deletions

View File

@@ -1,6 +1,7 @@
 from typing import List
 from thinc.api import Model
 from thinc.types import Floats2d
 from ..tokens import Doc
@@ -15,14 +16,14 @@ def CharacterEmbed(nM: int, nC: int) -> Model[List[Doc], List[Floats2d]]:
     )
-def init(model, X=None, Y=None):
+def init(model: Model, X=None, Y=None):
     vectors_table = model.ops.alloc3f(
         model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM")
     )
     model.set_param("E", vectors_table)
-def forward(model, docs, is_train):
+def forward(model: Model, docs: List[Doc], is_train: bool):
     if docs is None:
         return []
     ids = []

View File

@@ -14,7 +14,7 @@ def IOB() -> Model[Padded, Padded]:
     )
-def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None):
+def init(model: Model, X: Optional[Padded] = None, Y: Optional[Padded] = None) -> None:
     if X is not None and Y is not None:
         if X.data.shape != Y.data.shape:
             # TODO: Fix error

View File

@@ -4,14 +4,14 @@ from thinc.api import Model
 from ..attrs import LOWER
-def extract_ngrams(ngram_size, attr=LOWER) -> Model:
+def extract_ngrams(ngram_size: int, attr: int = LOWER) -> Model:
     model = Model("extract_ngrams", forward)
     model.attrs["ngram_size"] = ngram_size
     model.attrs["attr"] = attr
     return model
-def forward(model, docs, is_train: bool):
+def forward(model: Model, docs, is_train: bool):
     batch_keys = []
     batch_vals = []
     for doc in docs:
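For context, a quick sketch of what the factory above produces: a Thinc `Model` whose settings live in `attrs`. The `spacy.ml.extract_ngrams` import path is inferred from this file's relative imports and may differ:

```python
from spacy.attrs import LOWER
from spacy.ml.extract_ngrams import extract_ngrams  # assumed module path

model = extract_ngrams(2, attr=LOWER)  # bigrams over the LOWER attribute
print(model.attrs["ngram_size"])       # -> 2
```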

View File

@@ -1,5 +1,4 @@
-from pathlib import Path
+from typing import Optional
 from thinc.api import chain, clone, list2ragged, reduce_mean, residual
 from thinc.api import Model, Maxout, Linear
@@ -9,7 +8,7 @@ from ...vocab import Vocab
 @registry.architectures.register("spacy.EntityLinker.v1")
-def build_nel_encoder(tok2vec, nO=None):
+def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:
     with Model.define_operators({">>": chain, "**": clone}):
         token_width = tok2vec.get_dim("nO")
         output_layer = Linear(nO=nO, nI=token_width)
@@ -26,7 +25,7 @@ def build_nel_encoder(tok2vec, nO=None):
 @registry.assets.register("spacy.KBFromFile.v1")
-def load_kb(vocab_path, kb_path) -> KnowledgeBase:
+def load_kb(vocab_path: str, kb_path: str) -> KnowledgeBase:
     vocab = Vocab().from_disk(vocab_path)
     kb = KnowledgeBase(vocab=vocab)
     kb.load_bulk(kb_path)

View File

@@ -1,10 +1,20 @@
+from typing import Optional, Iterable, Tuple, List, TYPE_CHECKING
 import numpy
 from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
 from thinc.api import MultiSoftmax, list2array
+if TYPE_CHECKING:
+    # This lets us add type hints for mypy etc. without causing circular imports
+    from ...vocab import Vocab  # noqa: F401
+    from ...tokens import Doc  # noqa: F401
-def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None):
+def build_multi_task_model(
+    tok2vec: Model,
+    maxout_pieces: int,
+    token_vector_width: int,
+    nO: Optional[int] = None,
+) -> Model:
     softmax = Softmax(nO=nO, nI=token_vector_width * 2)
     model = chain(
         tok2vec,
@@ -22,7 +32,13 @@ def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None):
     return model
-def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, hidden_size, nO=None):
+def build_cloze_multi_task_model(
+    vocab: "Vocab",
+    tok2vec: Model,
+    maxout_pieces: int,
+    hidden_size: int,
+    nO: Optional[int] = None,
+) -> Model:
     # nO = vocab.vectors.data.shape[1]
     output_layer = chain(
         list2array(),
@@ -43,24 +59,24 @@ def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, hidden_size, nO=None):
 def build_cloze_characters_multi_task_model(
-    vocab, tok2vec, maxout_pieces, hidden_size, nr_char
-):
+    vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int, nr_char: int
+) -> Model:
     output_layer = chain(
         list2array(),
         Maxout(hidden_size, nP=maxout_pieces),
         LayerNorm(nI=hidden_size),
         MultiSoftmax([256] * nr_char, nI=hidden_size),
     )
     model = build_masked_language_model(vocab, chain(tok2vec, output_layer))
     model.set_ref("tok2vec", tok2vec)
     model.set_ref("output_layer", output_layer)
     return model
-def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
+def build_masked_language_model(
+    vocab: "Vocab", wrapped_model: Model, mask_prob: float = 0.15
+) -> Model:
     """Convert a model into a BERT-style masked language model"""
     random_words = _RandomWords(vocab)
     def mlm_forward(model, docs, is_train):
@@ -74,7 +90,7 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
         return output, mlm_backward
-    def mlm_initialize(model, X=None, Y=None):
+    def mlm_initialize(model: Model, X=None, Y=None):
         wrapped = model.layers[0]
         wrapped.initialize(X=X, Y=Y)
         for dim in wrapped.dim_names:
@@ -90,12 +106,11 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
         dims={dim: None for dim in wrapped_model.dim_names},
     )
     mlm_model.set_ref("wrapped", wrapped_model)
     return mlm_model
 class _RandomWords:
-    def __init__(self, vocab):
+    def __init__(self, vocab: "Vocab") -> None:
         self.words = [lex.text for lex in vocab if lex.prob != 0.0]
         self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
         self.words = self.words[:10000]
@@ -104,7 +119,7 @@ class _RandomWords:
         self.probs /= self.probs.sum()
         self._cache = []
-    def next(self):
+    def next(self) -> str:
         if not self._cache:
             self._cache.extend(
                 numpy.random.choice(len(self.words), 10000, p=self.probs)
@@ -113,9 +128,11 @@ class _RandomWords:
         return self.words[index]
-def _apply_mask(docs, random_words, mask_prob=0.15):
+def _apply_mask(
+    docs: Iterable["Doc"], random_words: _RandomWords, mask_prob: float = 0.15
+) -> Tuple[numpy.ndarray, List["Doc"]]:
     # This needs to be here to avoid circular imports
-    from ...tokens import Doc
+    from ...tokens import Doc  # noqa: F811
     N = sum(len(doc) for doc in docs)
     mask = numpy.random.uniform(0.0, 1.0, (N,))
@@ -141,7 +158,7 @@ def _apply_mask(docs, random_words, mask_prob=0.15):
     return mask, masked_docs
-def _replace_word(word, random_words, mask="[MASK]"):
+def _replace_word(word: str, random_words: _RandomWords, mask: str = "[MASK]") -> str:
     roll = numpy.random.random()
     if roll < 0.8:
         return mask
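For readers less familiar with the masking scheme this file implements, a self-contained sketch of the BERT-style replacement policy. Only the first branch (80% to the mask token) is visible in the hunk above, so the conventional 10% random-word / 10% unchanged split is an assumption here:

```python
import numpy

def replace_word_sketch(word: str, random_words: list, mask: str = "[MASK]") -> str:
    # Policy for a token that has been selected for masking:
    roll = numpy.random.random()
    if roll < 0.8:
        return mask  # 80%: replace with the mask symbol (matches the hunk above)
    elif roll < 0.9:
        # assumed: 10% are swapped for a random vocabulary word
        return str(numpy.random.choice(random_words))
    return word  # assumed: the remaining 10% are left unchanged
```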

View File

@@ -1,6 +1,5 @@
-from pydantic import StrictInt
-from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops, with_array
-from thinc.api import LayerNorm, Maxout, Mish
+from typing import Optional
+from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
 from ...util import registry
 from .._precomputable_affine import PrecomputableAffine
@@ -10,16 +9,15 @@ from ..tb_framework import TransitionModel
 @registry.architectures.register("spacy.TransitionBasedParser.v1")
 def build_tb_parser_model(
     tok2vec: Model,
-    nr_feature_tokens: StrictInt,
-    hidden_width: StrictInt,
-    maxout_pieces: StrictInt,
-    use_upper=True,
-    nO=None,
-):
+    nr_feature_tokens: int,
+    hidden_width: int,
+    maxout_pieces: int,
+    use_upper: bool = True,
+    nO: Optional[int] = None,
+) -> Model:
     t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
     tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),)
     tok2vec.set_dim("nO", hidden_width)
     lower = PrecomputableAffine(
         nO=hidden_width if use_upper else nO,
         nF=nr_feature_tokens,

View File

@@ -26,7 +26,6 @@ def BiluoTagger(
         with_array(softmax_activation()),
         padded2list(),
     )
     return Model(
         "biluo-tagger",
         forward,
@@ -52,7 +51,6 @@ def IOBTagger(
         with_array(softmax_activation()),
         padded2list(),
     )
     return Model(
         "iob-tagger",
         forward,

View File

@@ -1,10 +1,11 @@
+from typing import Optional
 from thinc.api import zero_init, with_array, Softmax, chain, Model
 from ...util import registry
 @registry.architectures.register("spacy.Tagger.v1")
-def build_tagger_model(tok2vec, nO=None) -> Model:
+def build_tagger_model(tok2vec: Model, nO: Optional[int] = None) -> Model:
     # TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
     t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
     output_layer = Softmax(nO, t2v_width, init_W=zero_init)

View File

@@ -2,10 +2,9 @@ from typing import Optional
 from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
 from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
 from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
-from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued
+from thinc.api import HashEmbed, with_array, with_cpu, uniqued
 from thinc.api import Relu, residual, expand_window, FeatureExtractor
-from ... import util
 from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
 from ...util import registry
 from ..extract_ngrams import extract_ngrams
@@ -40,7 +39,12 @@ def build_simple_cnn_text_classifier(
 @registry.architectures.register("spacy.TextCatBOW.v1")
-def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO=None):
+def build_bow_text_classifier(
+    exclusive_classes: bool,
+    ngram_size: int,
+    no_output_layer: bool,
+    nO: Optional[int] = None,
+) -> Model:
     with Model.define_operators({">>": chain}):
         sparse_linear = SparseLinear(nO)
         model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear
@@ -55,16 +59,16 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO=None):
 @registry.architectures.register("spacy.TextCatEnsemble.v1")
 def build_text_classifier(
-    width,
-    embed_size,
-    pretrained_vectors,
-    exclusive_classes,
-    ngram_size,
-    window_size,
-    conv_depth,
-    dropout,
-    nO=None,
-):
+    width: int,
+    embed_size: int,
+    pretrained_vectors: Optional[bool],
+    exclusive_classes: bool,
+    ngram_size: int,
+    window_size: int,
+    conv_depth: int,
+    dropout: Optional[float],
+    nO: Optional[int] = None,
+) -> Model:
     cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
     with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
         lower = HashEmbed(
@@ -91,7 +95,6 @@ def build_text_classifier(
             dropout=dropout,
             seed=13,
         )
         width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
         trained_vectors = FeatureExtractor(cols) >> with_array(
             uniqued(
@@ -100,7 +103,6 @@ def build_text_classifier(
                 column=cols.index(ORTH),
             )
         )
         if pretrained_vectors:
             static_vectors = StaticVectors(width)
             vector_layer = trained_vectors | static_vectors
@@ -152,7 +154,12 @@ def build_text_classifier(
 @registry.architectures.register("spacy.TextCatLowData.v1")
-def build_text_classifier_lowdata(width, pretrained_vectors, dropout, nO=None):
+def build_text_classifier_lowdata(
+    width: int,
+    pretrained_vectors: Optional[bool],
+    dropout: Optional[float],
+    nO: Optional[int] = None,
+) -> Model:
     # Note, before v.3, this was the default if setting "low_data" and "pretrained_dims"
     with Model.define_operators({">>": chain, "**": clone}):
         model = (

View File

@@ -6,16 +6,15 @@ from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
 from thinc.types import Floats2d
 from ...tokens import Doc
-from ... import util
 from ...util import registry
 from ...ml import _character_embed
 from ..staticvectors import StaticVectors
 from ...pipeline.tok2vec import Tok2VecListener
-from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE
+from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE
 @registry.architectures.register("spacy.Tok2VecListener.v1")
-def tok2vec_listener_v1(width, upstream="*"):
+def tok2vec_listener_v1(width: int, upstream: str = "*"):
     tok2vec = Tok2VecListener(upstream_name=upstream, width=width)
     return tok2vec
@@ -45,10 +44,11 @@ def build_hash_embed_cnn_tok2vec(
             width=width,
             depth=depth,
             window_size=window_size,
-            maxout_pieces=maxout_pieces
-        )
+            maxout_pieces=maxout_pieces,
+        ),
     )
 @registry.architectures.register("spacy.Tok2Vec.v1")
 def build_Tok2Vec_model(
     embed: Model[List[Doc], List[Floats2d]],
@@ -68,7 +68,6 @@ def MultiHashEmbed(
     width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool
 ):
     cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH]
     seed = 7
     def make_hash_embed(feature):
@@ -124,11 +123,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
         chain(
             FeatureExtractor([NORM]),
             list2ragged(),
-            with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5))
-        )
+            with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
+        ),
         ),
         with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
-        ragged2list()
+        ragged2list(),
     )
     return model
@@ -155,12 +154,7 @@ def MaxoutWindowEncoder(width: int, window_size: int, maxout_pieces: int, depth: int):
 def MishWindowEncoder(width, window_size, depth):
     cnn = chain(
         expand_window(window_size=window_size),
-        Mish(
-            nO=width,
-            nI=width * ((window_size * 2) + 1),
-            dropout=0.0,
-            normalize=True
-        ),
+        Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True),
     )
     model = clone(residual(cnn), depth)
     model.set_dim("nO", width)
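One detail worth spelling out from the collapsed `Mish` call above: `expand_window` concatenates each token vector with `window_size` neighbors on each side, which is where the `nI` expression comes from. A quick check with illustrative numbers:

```python
# expand_window yields (2 * window_size + 1) concatenated vectors per token,
# so the Mish layer's input width must be width * ((window_size * 2) + 1).
width, window_size = 96, 1  # illustrative values
nI = width * ((window_size * 2) + 1)
assert nI == 288
```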

View File

@@ -7,7 +7,7 @@ import importlib.util
 import re
 from pathlib import Path
 import thinc
-from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer, Model
+from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
 import functools
 import itertools
 import numpy.random
@@ -24,8 +24,6 @@ import tempfile
 import shutil
 import shlex
 import inspect
-from thinc.types import Unserializable
 try:
     import cupy.random

View File

@@ -6,6 +6,7 @@ menu:
   - ['Tok2Vec', 'tok2vec']
   - ['Transformers', 'transformers']
   - ['Parser & NER', 'parser']
+  - ['Tagging', 'tagger']
   - ['Text Classification', 'textcat']
   - ['Entity Linking', 'entitylinker']
 ---
@@ -18,6 +19,30 @@ TODO: intro and how architectures work, link to
 ### spacy.HashEmbedCNN.v1 {#HashEmbedCNN}
+<!-- TODO: intro -->
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.HashEmbedCNN.v1"
+> # TODO: ...
+>
+> [model.tok2vec]
+> # ...
+> ```
+| Name                 | Type  | Description |
+| -------------------- | ----- | ----------- |
+| `width`              | int   |             |
+| `depth`              | int   |             |
+| `embed_size`         | int   |             |
+| `window_size`        | int   |             |
+| `maxout_pieces`      | int   |             |
+| `subword_features`   | bool  |             |
+| `dropout`            | float |             |
+| `pretrained_vectors` | bool  |             |
 ### spacy.HashCharEmbedCNN.v1 {#HashCharEmbedCNN}
 ### spacy.HashCharEmbedBiLSTM.v1 {#HashCharEmbedBiLSTM}
@@ -99,6 +124,28 @@ architectures into your training config.
 | `use_upper` | bool | |
 | `nO`        | int  | |
+## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"}
+### spacy.Tagger.v1 {#Tagger}
+<!-- TODO: intro -->
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.Tagger.v1"
+> nO = null
+>
+> [model.tok2vec]
+> # ...
+> ```
+| Name      | Type                                       | Description |
+| --------- | ------------------------------------------ | ----------- |
+| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) |             |
+| `nO`      | int                                        |             |
 ## Text classification architectures {#textcat source="spacy/ml/models/textcat.py"}
 ### spacy.TextCatEnsemble.v1 {#TextCatEnsemble}
@@ -112,3 +159,21 @@ architectures into your training config.
 ## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}
 ### spacy.EntityLinker.v1 {#EntityLinker}
+<!-- TODO: intro -->
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.EntityLinker.v1"
+> nO = null
+>
+> [model.tok2vec]
+> # ...
+> ```
+| Name      | Type                                       | Description |
+| --------- | ------------------------------------------ | ----------- |
+| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) |             |
+| `nO`      | int                                        |             |

View File

@@ -29,9 +29,11 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("parser", config=config)
 > ```
+<!-- TODO: finish API docs -->
 | Setting | Type | Description | Default |
 | ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- |
-| `moves` | list | <!-- TODO: --> | `None` |
+| `moves` | list |  | `None` |
 | `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |
 ```python
@@ -59,17 +61,19 @@ Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#add_pipe).
+<!-- TODO: finish API docs -->
 | Name | Type | Description |
 | ----------------------------- | ------------------------------------------ | -------------------------------------------------------------------------------------------- |
 | `vocab` | `Vocab` | The shared vocabulary. |
 | `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
 | `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
-| `moves` | list | <!-- TODO: --> |
+| `moves` | list |  |
 | _keyword-only_ | | |
-| `update_with_oracle_cut_size` | int | <!-- TODO: --> |
-| `multitasks` | `Iterable` | <!-- TODO: --> |
-| `learn_tokens` | bool | <!-- TODO: --> |
-| `min_action_freq` | int | <!-- TODO: --> |
+| `update_with_oracle_cut_size` | int | |
+| `multitasks` | `Iterable` | |
+| `learn_tokens` | bool | |
+| `min_action_freq` | int | |
 ## DependencyParser.\_\_call\_\_ {#call tag="method"}

View File

@@ -32,12 +32,14 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("entity_linker", config=config)
 > ```
+<!-- TODO: finish API docs -->
 | Setting | Type | Description | Default |
 | ---------------- | ------------------------------------------ | ----------------- | ----------------------------------------------- |
-| `kb` | `KnowledgeBase` | <!-- TODO: --> | `None` |
-| `labels_discard` | `Iterable[str]` | <!-- TODO: --> | `[]` |
-| `incl_prior` | bool | <!-- TODO: --> | `True` |
-| `incl_context` | bool | <!-- TODO: --> | `True` |
+| `kb` | `KnowledgeBase` |  | `None` |
+| `labels_discard` | `Iterable[str]` |  | `[]` |
+| `incl_prior` | bool |  | `True` |
+| `incl_context` | bool |  | `True` |
 | `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) |
 ```python
@@ -65,16 +67,18 @@ Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#add_pipe).
+<!-- TODO: finish API docs -->
 | Name | Type | Description |
 | ---------------- | --------------- | -------------------------------------------------------------------------------------------- |
 | `vocab` | `Vocab` | The shared vocabulary. |
 | `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
 | `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
 | _keyword-only_ | | |
-| `kb` | `KnowledgeBase` | <!-- TODO: --> |
-| `labels_discard` | `Iterable[str]` | <!-- TODO: --> |
-| `incl_prior` | bool | <!-- TODO: --> |
-| `incl_context` | bool | <!-- TODO: --> |
+| `kb` | `KnowledgeBase` |  |
+| `labels_discard` | `Iterable[str]` |  |
+| `incl_prior` | bool |  |
+| `incl_context` | bool |  |
 ## EntityLinker.\_\_call\_\_ {#call tag="method"}

View File

@@ -29,9 +29,11 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("ner", config=config)
 > ```
+<!-- TODO: finish API docs -->
 | Setting | Type | Description | Default |
 | ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- |
-| `moves` | list | <!-- TODO: --> | `None` |
+| `moves` | list |  | `None` |
 | `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |
 ```python
@@ -59,17 +61,19 @@ Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#add_pipe).
+<!-- TODO: finish API docs -->
 | Name | Type | Description |
 | ----------------------------- | ------------------------------------------ | -------------------------------------------------------------------------------------------- |
 | `vocab` | `Vocab` | The shared vocabulary. |
 | `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
 | `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
-| `moves` | list | <!-- TODO: --> |
+| `moves` | list |  |
 | _keyword-only_ | | |
-| `update_with_oracle_cut_size` | int | <!-- TODO: --> |
-| `multitasks` | `Iterable` | <!-- TODO: --> |
-| `learn_tokens` | bool | <!-- TODO: --> |
-| `min_action_freq` | int | <!-- TODO: --> |
+| `update_with_oracle_cut_size` | int | |
+| `multitasks` | `Iterable` | |
+| `learn_tokens` | bool | |
+| `min_action_freq` | int | |
 ## EntityRecognizer.\_\_call\_\_ {#call tag="method"}

View File

@ -8,9 +8,8 @@ new: 3.0
An `Example` holds the information for one training instance. It stores two An `Example` holds the information for one training instance. It stores two
`Doc` objects: one for holding the gold-standard reference data, and one for `Doc` objects: one for holding the gold-standard reference data, and one for
holding the predictions of the pipeline. An `Alignment` <!-- TODO: link? --> holding the predictions of the pipeline. An `Alignment` object stores the
object stores the alignment between these two documents, as they can differ in alignment between these two documents, as they can differ in tokenization.
tokenization.
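As context for the `Example` description above, a minimal usage sketch. The `spacy.training` import path is assumed from the final v3 layout and may differ in this development snapshot:

```python
from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.training import Example  # assumed v3 path; earlier dev builds used spacy.gold

vocab = Vocab()
predicted = Doc(vocab, words=["Apply", "some", "sunscreen"])
reference = {"words": ["Apply", "some", "sunscreen"], "tags": ["VERB", "DET", "NOUN"]}
# The Example pairs the pipeline's predicted Doc with gold-standard annotations.
example = Example.from_dict(predicted, reference)
```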
 ## Example.\_\_init\_\_ {#init tag="method"}

View File

@@ -98,9 +98,9 @@ decorator. For more details and examples, see the
 | ----------------------- | -------------------- | ---------------------------------------------------------------------------- |
 | `name` | str | The name of the component factory. |
 | _keyword-only_ | | |
-| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
-| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
-| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something --> |
+| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
+| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
+| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
 | `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
 | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
 | `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
@@ -146,9 +146,9 @@ examples, see the
 | `name` | str | The name of the component factory. |
 | _keyword-only_ | | |
 | `default_config` | `Dict[str, any]` | The default config, describing the default values of the factory arguments. |
-| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
-| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
-| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something --> |
+| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
+| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
+| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
 | `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
 | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
 | `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
@@ -833,8 +833,8 @@ instance and factory instance.
 | ----------------------- | ------------------ | ---------------------------------------------------------------------------- |
 | `factory` | str | The name of the registered component factory. |
 | `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. |
-| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
-| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
-| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something --> |
+| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
+| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
+| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
 | `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
 | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |

View File

@@ -63,14 +63,16 @@ Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#add_pipe).
+<!-- TODO: finish API docs -->
 | Name | Type | Description |
 | -------------- | ------- | -------------------------------------------------------------------------------------------- |
 | `vocab` | `Vocab` | The shared vocabulary. |
 | `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
 | `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
 | _keyword-only_ | | |
-| `labels_morph` | dict | <!-- TODO: --> |
-| `labels_pos` | dict | <!-- TODO: --> |
+| `labels_morph` | dict | |
+| `labels_pos` | dict | |
 ## Morphologizer.\_\_call\_\_ {#call tag="method"}

View File

@@ -290,6 +290,8 @@ factories.
 > return Model("custom", forward, dims={"nO": nO})
 > ```
+<!-- TODO: finish table -->
 | Registry name | Description |
 | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |
@@ -297,7 +299,7 @@ factories.
 | `languages` | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). |
 | `lookups` | Registry for large lookup tables available via `vocab.lookups`. |
 | `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
-| `assets` | <!-- TODO: what is this used for again?--> |
+| `assets` | |
 | `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
 | `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
 | `layers` | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). |
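As a companion to the `architectures` row above, a minimal sketch of how a function is registered and later referenced from a config; the registry name `custom_docs_example.TinyModel.v1` is made up for illustration:

```python
from thinc.api import Linear, Model
from spacy.util import registry

@registry.architectures.register("custom_docs_example.TinyModel.v1")
def create_tiny_model(nO: int, nI: int) -> Model:
    # Any function returning a Thinc Model can back a [model] block;
    # the config system resolves @architectures to this factory.
    return Linear(nO=nO, nI=nI)
```

A config would then reference it with `@architectures = "custom_docs_example.TinyModel.v1"` and `nO`/`nI` as sibling settings in the same block.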

View File

@@ -347,50 +347,52 @@ serialization by passing in the string names via the `exclude` argument.
 Transformer tokens and outputs for one `Doc` object.
-| Name      | Type                                               | Description                               |
-| --------- | -------------------------------------------------- | ----------------------------------------- |
-| `tokens`  | `Dict`                                             | <!-- TODO: -->                            |
-| `tensors` | `List[FloatsXd]`                                   | <!-- TODO: -->                            |
-| `align`   | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | <!-- TODO: -->                            |
-| `width`   | int                                                | <!-- TODO: also mention it's property --> |
+<!-- TODO: finish API docs, also mention "width" is property -->
+| Name      | Type                                               | Description |
+| --------- | -------------------------------------------------- | ----------- |
+| `tokens`  | `Dict`                                             |             |
+| `tensors` | `List[FloatsXd]`                                   |             |
+| `align`   | [`Ragged`](https://thinc.ai/docs/api-types#ragged) |             |
+| `width`   | int                                                |             |
 ### TransformerData.empty {#transformerdata-emoty tag="classmethod"}
-<!-- TODO: -->
+<!-- TODO: finish API docs -->
 | Name        | Type              | Description |
 | ----------- | ----------------- | ----------- |
-| **RETURNS** | `TransformerData` | <!-- TODO: --> |
+| **RETURNS** | `TransformerData` |             |
 ## FullTransformerBatch {#fulltransformerbatch tag="dataclass"}
-<!-- TODO: -->
+<!-- TODO: write, also mention doc_data is property -->
 | Name       | Type | Description |
 | ---------- | ---- | ----------- |
-| `spans`    | `List[List[Span]]` | <!-- TODO: --> |
-| `tokens`   | [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) | <!-- TODO: --> |
-| `tensors`  | `List[torch.Tensor]` | <!-- TODO: --> |
-| `align`    | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | <!-- TODO: --> |
-| `doc_data` | `List[TransformerData]` | <!-- TODO: also mention it's property --> |
+| `spans`    | `List[List[Span]]` | |
+| `tokens`   | [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) | |
+| `tensors`  | `List[torch.Tensor]` | |
+| `align`    | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | |
+| `doc_data` | `List[TransformerData]` | |
 ### FullTransformerBatch.unsplit_by_doc {#fulltransformerbatch-unsplit_by_doc tag="method"}
-<!-- TODO: -->
+<!-- TODO: write -->
 | Name        | Type                   | Description |
 | ----------- | ---------------------- | ----------- |
-| `arrays`    | `List[List[Floats3d]]` | <!-- TODO: --> |
-| **RETURNS** | `FullTransformerBatch` | <!-- TODO: --> |
+| `arrays`    | `List[List[Floats3d]]` |             |
+| **RETURNS** | `FullTransformerBatch` |             |
 ### FullTransformerBatch.split_by_doc {#fulltransformerbatch-split_by_doc tag="method"}
 Split a `TransformerData` object that represents a batch into a list with one
 `TransformerData` per `Doc`.
 | Name        | Type                    | Description |
 | ----------- | ----------------------- | ----------- |
-| **RETURNS** | `List[TransformerData]` | <!-- TODO: --> |
+| **RETURNS** | `List[TransformerData]` |             |
 ## Span getters {#span_getters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}
@@ -421,11 +423,13 @@ getters using the `@registry.span_getters` decorator.
 The following built-in functions are available:
+<!-- TODO: finish API docs -->
 | Name               | Description                                                         |
 | ------------------ | ------------------------------------------------------------------- |
 | `doc_spans.v1`     | Create a span for each doc (no transformation, process each text).  |
 | `sent_spans.v1`    | Create a span for each sentence if sentence boundaries are set.     |
-| `strided_spans.v1` | <!-- TODO: -->                                                       |
+| `strided_spans.v1` |                                                                      |
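For the `strided_spans.v1` row above, a sketch of the windowing it performs, assuming the window/stride semantics from spacy-transformers (overlapping windows of `window` tokens, advancing `stride` tokens per step):

```python
def strided_spans_sketch(docs, window: int = 128, stride: int = 96):
    # For each Doc, emit slices [start : start + window), stepping by stride,
    # so consecutive spans overlap by (window - stride) tokens.
    spans = []
    for doc in docs:
        doc_spans = []
        start = 0
        while start < len(doc):
            doc_spans.append(doc[start : start + window])
            start += stride
        spans.append(doc_spans)
    return spans
```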
## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"} ## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"}

View File

@@ -231,10 +231,10 @@ available pipeline components and component functions.
 | `morphologizer` | [`Morphologizer`](/api/morphologizer) | Assign morphological features and coarse-grained POS tags. |
 | `senter` | [`SentenceRecognizer`](/api/sentencerecognizer) | Assign sentence boundaries. |
 | `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. |
-| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | <!-- TODO: --> |
+| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | |
 | `transformer` | [`Transformer`](/api/transformer) | Assign the tokens and outputs of a transformer model. |
-<!-- TODO: update with more components -->
+<!-- TODO: finish and update with more components -->
 <!-- TODO: explain default config and factories -->

View File

@@ -15,8 +15,6 @@ import Serialization101 from 'usage/101/\_serialization.md'
 ### Serializing the pipeline {#pipeline}
-<!-- TODO: update this -->
 When serializing the pipeline, keep in mind that this will only save out the
 **binary data for the individual components** to allow spaCy to restore them,
 not the entire objects. This is a good thing, because it makes serialization
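A minimal sketch of that round trip (the path and package name are illustrative):

```python
import spacy

nlp = spacy.load("en_core_web_sm")     # any loaded pipeline
nlp.to_disk("/tmp/my_pipeline")        # writes each component's binary data

nlp2 = spacy.load("/tmp/my_pipeline")  # reconstructs the pipeline from disk
```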