diff --git a/spacy/about.py b/spacy/about.py index 03de62539..eb4d2128c 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a5" +__version__ = "3.0.0a6" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 7202ccacf..ce0eb27a0 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -35,7 +35,7 @@ def pretrain_cli( config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False), code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"), - epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."), + epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), # fmt: on ): diff --git a/spacy/gold/batchers.py b/spacy/gold/batchers.py index 57c6b4b3a..ec1f35815 100644 --- a/spacy/gold/batchers.py +++ b/spacy/gold/batchers.py @@ -1,4 +1,4 @@ -from typing import Union, Iterator, Iterable, Sequence, TypeVar, List, Callable +from typing import Union, Iterable, Sequence, TypeVar, List, Callable from typing import Optional, Any from functools import partial import itertools @@ -19,6 +19,22 @@ def configure_minibatch_by_padded_size( discard_oversize: bool, get_length: Optional[Callable[[ItemT], int]] = None ) -> BatcherT: + """Create a batcher that uses the `batch_by_padded_size` strategy. + + The padded size is defined as the maximum length of sequences within the + batch multiplied by the number of sequences in the batch. + + size (int or Iterable[int]): The largest padded size to batch sequences into. + Can be a single integer, or a sequence, allowing for variable batch sizes. + buffer (int): The number of sequences to accumulate before sorting by length. + A larger buffer will result in more even sizing, but if the buffer is + very large, the iteration order will be less random, which can result + in suboptimal training. + discard_oversize (bool): Whether to discard sequences that are by themselves + longer than the largest padded batch size. + get_length (Callable or None): Function to get the length of a sequence item. + The `len` function is used by default. + """ # Avoid displacing optional values from the underlying function. optionals = {"get_length": get_length} if get_length is not None else {} return partial( @@ -38,6 +54,16 @@ def configure_minibatch_by_words( discard_oversize: bool, get_length: Optional[Callable[[ItemT], int]] = None ) -> BatcherT: + """Create a batcher that uses the "minibatch by words" strategy. + + size (int or Iterable[int]): The target number of words per batch. + Can be a single integer, or a sequence, allowing for variable batch sizes. + tolerance (float): What percentage of the size to allow batches to exceed. 
+ discard_oversize (bool): Whether to discard sequences that by themselves + exceed the tolerated size. + get_length (Callable or None): Function to get the length of a sequence + item. The `len` function is used by default. + """ optionals = {"get_length": get_length} if get_length is not None else {} return partial( minibatch_by_words, size=size, discard_oversize=discard_oversize, **optionals @@ -48,22 +74,43 @@ def configure_minibatch_by_words( def configure_minibatch( size: Sizing, get_length: Optional[Callable[[ItemT], int]] = None ) -> BatcherT: + """Create a batcher that creates batches of the specified size. + + size (int or Iterable[int]): The target number of items per batch. + Can be a single integer, or a sequence, allowing for variable batch sizes. + """ optionals = {"get_length": get_length} if get_length is not None else {} return partial(minibatch, size=size, **optionals) def minibatch_by_padded_size( - docs: Iterator["Doc"], + seqs: Iterable[ItemT], size: Sizing, buffer: int = 256, discard_oversize: bool = False, get_length: Callable = len, -) -> Iterator[Iterator["Doc"]]: +) -> Iterable[List[ItemT]]: + """Minibatch a sequence by the size of padded batches that would result, + with sequences binned by length within a window. + + The padded size is defined as the maximum length of sequences within the + batch multiplied by the number of sequences in the batch. + + size (int): The largest padded size to batch sequences into. + buffer (int): The number of sequences to accumulate before sorting by length. + A larger buffer will result in more even sizing, but if the buffer is + very large, the iteration order will be less random, which can result + in suboptimal training. + discard_oversize (bool): Whether to discard sequences that are by themselves + longer than the largest padded batch size. + get_length (Callable or None): Function to get the length of a sequence item. + The `len` function is used by default. + """ if isinstance(size, int): size_ = itertools.repeat(size) else: size_ = size - for outer_batch in minibatch(docs, size=buffer): + for outer_batch in minibatch(seqs, size=buffer): outer_batch = list(outer_batch) target_size = next(size_) for indices in _batch_by_length(outer_batch, target_size, get_length): @@ -76,12 +123,24 @@ def minibatch_by_padded_size( def minibatch_by_words( - docs, size, tolerance=0.2, discard_oversize=False, get_length=len -): + seqs: Iterable[ItemT], + size: Sizing, + tolerance=0.2, + discard_oversize=False, + get_length=len, +) -> Iterable[List[ItemT]]: """Create minibatches of roughly a given number of words. If any examples are longer than the specified batch length, they will appear in a batch by themselves, or be discarded if discard_oversize=True. - The argument 'docs' can be a list of strings, Docs or Examples. + + seqs (Iterable[Sequence]): The sequences to minibatch. + size (int or Iterable[int]): The target number of words per batch. + Can be a single integer, or a sequence, allowing for variable batch sizes. + tolerance (float): What percentage of the size to allow batches to exceed. + discard_oversize (bool): Whether to discard sequences that by themselves + exceed the tolerated size. + get_length (Callable or None): Function to get the length of a sequence + item. The `len` function is used by default. 
""" if isinstance(size, int): size_ = itertools.repeat(size) @@ -95,20 +154,20 @@ def minibatch_by_words( overflow = [] batch_size = 0 overflow_size = 0 - for doc in docs: - n_words = get_length(doc) + for seq in seqs: + n_words = get_length(seq) # if the current example exceeds the maximum batch size, it is returned separately # but only if discard_oversize=False. if n_words > target_size + tol_size: if not discard_oversize: - yield [doc] + yield [seq] # add the example to the current batch if there's no overflow yet and it still fits elif overflow_size == 0 and (batch_size + n_words) <= target_size: - batch.append(doc) + batch.append(seq) batch_size += n_words # add the example to the overflow buffer if it fits in the tolerance margin elif (batch_size + overflow_size + n_words) <= (target_size + tol_size): - overflow.append(doc) + overflow.append(seq) overflow_size += n_words # yield the previous batch and start a new one. The new one gets the overflow examples. else: @@ -122,11 +181,11 @@ def minibatch_by_words( overflow_size = 0 # this example still fits if (batch_size + n_words) <= target_size: - batch.append(doc) + batch.append(seq) batch_size += n_words # this example fits in overflow elif (batch_size + n_words) <= (target_size + tol_size): - overflow.append(doc) + overflow.append(seq) overflow_size += n_words # this example does not fit with the previous overflow: start another new batch else: @@ -134,7 +193,7 @@ def minibatch_by_words( yield batch target_size = next(size_) tol_size = target_size * tolerance - batch = [doc] + batch = [seq] batch_size = n_words batch.extend(overflow) if batch: diff --git a/spacy/lang/en/lemmatizer.py b/spacy/lang/en/lemmatizer.py index b8bef39b9..be389f117 100644 --- a/spacy/lang/en/lemmatizer.py +++ b/spacy/lang/en/lemmatizer.py @@ -1,5 +1,3 @@ -from typing import Optional - from ...pipeline import Lemmatizer from ...tokens import Token diff --git a/spacy/language.py b/spacy/language.py index 96661915a..85aac15ef 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -27,7 +27,6 @@ from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_INFIXES from .tokens import Doc -from .lookups import load_lookups from .tokenizer import Tokenizer from .errors import Errors, Warnings from .schemas import ConfigSchema @@ -1439,10 +1438,7 @@ class Language: or lang_cls is not cls ): raise ValueError(Errors.E943.format(value=type(lang_cls))) - nlp = lang_cls( - vocab=vocab, - create_tokenizer=create_tokenizer, - ) + nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer) if after_creation is not None: nlp = after_creation(nlp) if not isinstance(nlp, cls): diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 716af9909..e0a54e6f1 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -68,11 +68,11 @@ cdef class DependencyMatcher: key (str): The match ID. RETURNS (bool): Whether the matcher contains rules for this match ID. 
""" - return self._normalize_key(key) in self._patterns + return self.has_key(key) - def validateInput(self, pattern, key): + def validate_input(self, pattern, key): idx = 0 - visitedNodes = {} + visited_nodes = {} for relation in pattern: if "PATTERN" not in relation or "SPEC" not in relation: raise ValueError(Errors.E098.format(key=key)) @@ -83,7 +83,7 @@ cdef class DependencyMatcher: and "NBOR_NAME" not in relation["SPEC"] ): raise ValueError(Errors.E099.format(key=key)) - visitedNodes[relation["SPEC"]["NODE_NAME"]] = True + visited_nodes[relation["SPEC"]["NODE_NAME"]] = True else: if not( "NODE_NAME" in relation["SPEC"] @@ -92,22 +92,28 @@ cdef class DependencyMatcher: ): raise ValueError(Errors.E100.format(key=key)) if ( - relation["SPEC"]["NODE_NAME"] in visitedNodes - or relation["SPEC"]["NBOR_NAME"] not in visitedNodes + relation["SPEC"]["NODE_NAME"] in visited_nodes + or relation["SPEC"]["NBOR_NAME"] not in visited_nodes ): raise ValueError(Errors.E101.format(key=key)) - visitedNodes[relation["SPEC"]["NODE_NAME"]] = True - visitedNodes[relation["SPEC"]["NBOR_NAME"]] = True + visited_nodes[relation["SPEC"]["NODE_NAME"]] = True + visited_nodes[relation["SPEC"]["NBOR_NAME"]] = True idx = idx + 1 def add(self, key, patterns, *_patterns, on_match=None): + """Add a new matcher rule to the matcher. + + key (str): The match ID. + patterns (list): The patterns to add for the given key. + on_match (callable): Optional callback executed on match. + """ if patterns is None or hasattr(patterns, "__call__"): # old API on_match = patterns patterns = _patterns for pattern in patterns: if len(pattern) == 0: raise ValueError(Errors.E012.format(key=key)) - self.validateInput(pattern,key) + self.validate_input(pattern,key) key = self._normalize_key(key) _patterns = [] for pattern in patterns: @@ -187,8 +193,7 @@ cdef class DependencyMatcher: key (string or int): The key to check. RETURNS (bool): Whether the matcher has the rule. """ - key = self._normalize_key(key) - return key in self._patterns + return self._normalize_key(key) in self._patterns def get(self, key, default=None): """Retrieve the pattern stored for a key. @@ -202,6 +207,13 @@ cdef class DependencyMatcher: return (self._callbacks[key], self._patterns[key]) def __call__(self, Doc doc): + """Find all token sequences matching the supplied pattern. + + doclike (Doc or Span): The document to match over. + RETURNS (list): A list of `(key, start, end)` tuples, + describing the matches. A match tuple describes a span + `doc[start:end]`. The `label_id` and `key` are both integers. 
+ """ matched_key_trees = [] matches = self.token_matcher(doc) for key in list(self._patterns.keys()): @@ -241,25 +253,25 @@ cdef class DependencyMatcher: on_match(self, doc, i, matched_key_trees) return matched_key_trees - def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visitedNodes,matched_trees): + def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visited_nodes,matched_trees): cdef bool isValid; if(patternLength == len(id_to_position.keys())): isValid = True for node in range(patternLength): if(node in tree): for idx, (relop,nbor) in enumerate(tree[node]): - computed_nbors = numpy.asarray(_node_operator_map[visitedNodes[node]][relop]) + computed_nbors = numpy.asarray(_node_operator_map[visited_nodes[node]][relop]) isNbor = False for computed_nbor in computed_nbors: - if(computed_nbor.i == visitedNodes[nbor]): + if(computed_nbor.i == visited_nodes[nbor]): isNbor = True isValid = isValid & isNbor if(isValid): - matched_trees.append(visitedNodes) + matched_trees.append(visited_nodes) return allPatternNodes = numpy.asarray(id_to_position[patternLength]) for patternNode in allPatternNodes: - self.recurse(tree,id_to_position,_node_operator_map,patternLength+1,visitedNodes+[patternNode],matched_trees) + self.recurse(tree,id_to_position,_node_operator_map,patternLength+1,visited_nodes+[patternNode],matched_trees) # Given a node and an edge operator, to return the list of nodes # from the doc that belong to node+operator. This is used to store diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index a0f3f1655..16ab73735 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -70,7 +70,7 @@ cdef class Matcher: key (str): The match ID. RETURNS (bool): Whether the matcher contains rules for this match ID. """ - return self._normalize_key(key) in self._patterns + return self.has_key(key) def add(self, key, patterns, *, on_match=None, greedy: str=None): """Add a match-rule to the matcher. A match-rule consists of: an ID @@ -162,8 +162,7 @@ cdef class Matcher: key (string or int): The key to check. RETURNS (bool): Whether the matcher has the rule. """ - key = self._normalize_key(key) - return key in self._patterns + return self._normalize_key(key) in self._patterns def get(self, key, default=None): """Retrieve the pattern stored for a key. @@ -179,7 +178,7 @@ cdef class Matcher: def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False): """Match a stream of documents, yielding them in turn. - docs (iterable): A stream of documents. + docs (Iterable[Union[Doc, Span]]): A stream of documents or spans. batch_size (int): Number of documents to accumulate into a working set. return_matches (bool): Yield the match lists along with the docs, making results (doc, matches) tuples. diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index f85b5626a..801229af5 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -37,7 +37,6 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"] default_config={ "moves": None, "update_with_oracle_cut_size": 100, - "multitasks": [], "learn_tokens": False, "min_action_freq": 30, "model": DEFAULT_PARSER_MODEL, @@ -51,17 +50,52 @@ def make_parser( model: Model, moves: Optional[list], update_with_oracle_cut_size: int, - multitasks: Iterable, learn_tokens: bool, min_action_freq: int ): + """Create a transition-based DependencyParser component. 
The dependency parser
+    jointly learns sentence segmentation and labelled dependency parsing, and can
+    optionally learn to merge tokens that had been over-segmented by the tokenizer.
+
+    The parser uses a variant of the non-monotonic arc-eager transition-system
+    described by Honnibal and Johnson (2015), with the addition of a "break"
+    transition to perform the sentence segmentation. Nivre's pseudo-projective
+    dependency transformation is used to allow the parser to predict
+    non-projective parses.
+
+    The parser is trained using an imitation learning objective. It follows the
+    actions predicted by the current weights, and at each state, determines
+    which actions are compatible with the optimal parse that could be reached
+    from the current state. The weights are updated such that the scores
+    assigned to the set of optimal actions are increased, while scores assigned
+    to other actions are decreased. Note that more than one action may be
+    optimal for a given state.
+
+    model (Model): The model for the transition-based parser. The model needs
+        to have a specific substructure of named components --- see the
+        spacy.ml.tb_framework.TransitionModel for details.
+    moves (List[str]): A list of transition names. Inferred from the data if not
+        provided.
+    update_with_oracle_cut_size (int):
+        During training, cut long sequences into shorter segments by creating
+        intermediate states based on the gold-standard history. The model is
+        not very sensitive to this parameter, so you usually won't need to change
+        it. 100 is a good default.
+    learn_tokens (bool): Whether to learn to merge subtokens that are split
+        relative to the gold standard. Experimental.
+    min_action_freq (int): The minimum frequency of labelled actions to retain.
+        Rarer labelled actions have their label backed-off to "dep". While this
+        primarily affects the label accuracy, it can also affect the attachment
+        structure, as the labels are used to represent the pseudo-projectivity
+        transformation.
+    """
     return DependencyParser(
         nlp.vocab,
         model,
         name,
         moves=moves,
         update_with_oracle_cut_size=update_with_oracle_cut_size,
-        multitasks=multitasks,
+        multitasks=[],
         learn_tokens=learn_tokens,
         min_action_freq=min_action_freq
     )
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 840070c23..080273f57 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -62,6 +62,16 @@ def make_entity_linker(
     incl_prior: bool,
     incl_context: bool,
 ):
+    """Construct an EntityLinker component.
+
+    model (Model[List[Doc], Floats2d]): A model that learns document vector
+        representations. Given a batch of Doc objects, it should return a single
+        array, with one row per item in the batch.
+    kb (KnowledgeBase): The knowledge-base to link entities to.
+    labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
+    incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
+    incl_context (bool): Whether or not to include the local context in the model.
+    """
     return EntityLinker(
         nlp.vocab,
         model,
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 06c9f9a25..efc494181 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -75,8 +75,8 @@ class Morphologizer(Tagger):
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
- labels_morph (dict): TODO: - labels_pos (dict): TODO: + labels_morph (dict): Mapping of morph + POS tags to morph labels. + labels_pos (dict): Mapping of morph + POS tags to POS tags. DOCS: https://spacy.io/api/morphologizer#init """ diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index d13152a4f..a3bc3d920 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -35,9 +35,6 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"] default_config={ "moves": None, "update_with_oracle_cut_size": 100, - "multitasks": [], - "learn_tokens": False, - "min_action_freq": 30, "model": DEFAULT_NER_MODEL, }, scores=["ents_p", "ents_r", "ents_f", "ents_per_type"], @@ -50,19 +47,40 @@ def make_ner( model: Model, moves: Optional[list], update_with_oracle_cut_size: int, - multitasks: Iterable, - learn_tokens: bool, - min_action_freq: int ): + """Create a transition-based EntityRecognizer component. The entity recognizer + identifies non-overlapping labelled spans of tokens. + + The transition-based algorithm used encodes certain assumptions that are + effective for "traditional" named entity recognition tasks, but may not be + a good fit for every span identification problem. Specifically, the loss + function optimizes for whole entity accuracy, so if your inter-annotator + agreement on boundary tokens is low, the component will likely perform poorly + on your problem. The transition-based algorithm also assumes that the most + decisive information about your entities will be close to their initial tokens. + If your entities are long and characterised by tokens in their middle, the + component will likely do poorly on your task. + + model (Model): The model for the transition-based parser. The model needs + to have a specific substructure of named components --- see the + spacy.ml.tb_framework.TransitionModel for details. + moves (list[str]): A list of transition names. Inferred from the data if not + provided. + update_with_oracle_cut_size (int): + During training, cut long sequences into shorter segments by creating + intermediate states based on the gold-standard history. The model is + not very sensitive to this parameter, so you usually won't need to change + it. 100 is a good default. + """ return EntityRecognizer( nlp.vocab, model, name, moves=moves, update_with_oracle_cut_size=update_with_oracle_cut_size, - multitasks=multitasks, - learn_tokens=learn_tokens, - min_action_freq=min_action_freq + multitasks=[], + min_action_freq=1, + learn_tokens=False, ) @@ -74,9 +92,11 @@ cdef class EntityRecognizer(Parser): TransitionSystem = BiluoPushDown def add_multitask_objective(self, mt_component): + """Register another component as a multi-task objective. Experimental.""" self._multitasks.append(mt_component) def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): + """Setup multi-task objective components. Experimental and internal.""" # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ? 
for labeller in self._multitasks: labeller.model.set_dim("nO", len(self.labels)) diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index aa0399b33..9be562b61 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -1,8 +1,9 @@ # cython: infer_types=True, profile=True, binding=True +from typing import List import numpy import srsly - from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config +from thinc.types import Floats2d import warnings from ..tokens.doc cimport Doc @@ -42,7 +43,14 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"] scores=["tag_acc"], default_score_weights={"tag_acc": 1.0}, ) -def make_tagger(nlp: Language, name: str, model: Model): +def make_tagger(nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]]): + """Construct a part-of-speech tagger component. + + model (Model[List[Doc], List[Floats2d]]): A model instance that predicts + the tag probabilities. The output vectors should match the number of tags + in size, and be normalized as probabilities (all scores between 0 and 1, + with the rows summing to 1). + """ return Tagger(nlp.vocab, model, name) diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 06b72f8c7..d632825bd 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -1,5 +1,6 @@ from typing import Iterable, Tuple, Optional, Dict, List, Callable, Iterator, Any from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config +from thinc.types import Floats2d import numpy from .pipe import Pipe @@ -69,8 +70,22 @@ subword_features = true default_score_weights={"cats_score": 1.0}, ) def make_textcat( - nlp: Language, name: str, model: Model, labels: Iterable[str] + nlp: Language, + name: str, + model: Model[List[Doc], List[Floats2d]], + labels: Iterable[str], ) -> "TextCategorizer": + """Create a TextCategorizer compoment. The text categorizer predicts categories + over a whole document. It can learn one or more labels, and the labels can + be mutually exclusive (i.e. one true label per doc) or non-mutually exclusive + (i.e. zero or more labels may be true per doc). The multi-label setting is + controlled by the model instance that's provided. + + model (Model[List[Doc], List[Floats2d]]): A model instance that predicts + scores for each category. + labels (list): A list of categories to learn. If empty, the model infers the + categories from the data. + """ return TextCategorizer(nlp.vocab, model, name, labels=labels) diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index db6843e8f..c9f0a99e9 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -32,11 +32,28 @@ def make_tok2vec(nlp: Language, name: str, model: Model) -> "Tok2Vec": class Tok2Vec(Pipe): + """Apply a "token-to-vector" model and set its outputs in the doc.tensor + attribute. This is mostly useful to share a single subnetwork between multiple + components, e.g. to have one embedding and CNN network shared between a + parser, tagger and NER. + + In order to use the `Tok2Vec` predictions, subsequent components should use + the `Tok2VecListener` layer as the tok2vec subnetwork of their model. This + layer will read data from the `doc.tensor` attribute during prediction. + During training, the `Tok2Vec` component will save its prediction and backprop + callback for each batch, so that the subsequent components can backpropagate + to the shared weights. 
This implementation is used because it allows us to + avoid relying on object identity within the models to achieve the parameter + sharing. + """ + def __init__(self, vocab: Vocab, model: Model, name: str = "tok2vec") -> None: """Initialize a tok2vec component. vocab (Vocab): The shared vocabulary. - model (thinc.api.Model): The Thinc Model powering the pipeline component. + model (thinc.api.Model[List[Doc], List[Floats2d]]): + The Thinc Model powering the pipeline component. It should take + a list of Doc objects as input, and output a list of 2d float arrays. name (str): The component instance name. DOCS: https://spacy.io/api/tok2vec#init @@ -48,9 +65,18 @@ class Tok2Vec(Pipe): self.cfg = {} def add_listener(self, listener: "Tok2VecListener") -> None: + """Add a listener for a downstream component. Usually internals.""" self.listeners.append(listener) def find_listeners(self, model: Model) -> None: + """Walk over a model, looking for layers that are Tok2vecListener + subclasses that have an upstream_name that matches this component. + Listeners can also set their upstream_name attribute to the wildcard + string '*' to match any `Tok2Vec`. + + You're unlikely to ever need multiple `Tok2Vec` components, so it's + fine to leave your listeners upstream_name on '*'. + """ for node in model.walk(): if isinstance(node, Tok2VecListener) and node.upstream_name in ( "*", @@ -59,7 +85,8 @@ class Tok2Vec(Pipe): self.add_listener(node) def __call__(self, doc: Doc) -> Doc: - """Add context-sensitive embeddings to the Doc.tensor attribute. + """Add context-sensitive embeddings to the Doc.tensor attribute, allowing + them to be used as features by downstream components. docs (Doc): The Doc to preocess. RETURNS (Doc): The processed Doc. @@ -205,11 +232,27 @@ class Tok2Vec(Pipe): class Tok2VecListener(Model): """A layer that gets fed its answers from an upstream connection, for instance from a component earlier in the pipeline. + + The Tok2VecListener layer is used as a sublayer within a component such + as a parser, NER or text categorizer. Usually you'll have multiple listeners + connecting to a single upstream Tok2Vec component, that's earlier in the + pipeline. The Tok2VecListener layers act as proxies, passing the predictions + from the Tok2Vec component into downstream components, and communicating + gradients back upstream. """ name = "tok2vec-listener" def __init__(self, upstream_name: str, width: int) -> None: + """ + upstream_name (str): A string to identify the 'upstream' Tok2Vec component + to communicate with. The upstream name should either be the wildcard + string '*', or the name of the `Tok2Vec` component. You'll almost + never have multiple upstream Tok2Vec components, so the wildcard + string will almost always be fine. + width (int): + The width of the vectors produced by the upstream tok2vec component. + """ Model.__init__(self, name=self.name, forward=forward, dims={"nO": width}) self.upstream_name = upstream_name self._batch_id = None @@ -217,15 +260,25 @@ class Tok2VecListener(Model): self._backprop = None @classmethod - def get_batch_id(cls, inputs) -> int: + def get_batch_id(cls, inputs: List[Doc]) -> int: + """Calculate a content-sensitive hash of the batch of documents, to check + whether the next batch of documents is unexpected. + """ return sum(sum(token.orth for token in doc) for doc in inputs) def receive(self, batch_id: int, outputs, backprop) -> None: + """Store a batch of training predictions and a backprop callback. 
The + predictions and callback are produced by the upstream Tok2Vec component, + and later will be used when the listener's component's model is called. + """ self._batch_id = batch_id self._outputs = outputs self._backprop = backprop def verify_inputs(self, inputs) -> bool: + """Check that the batch of Doc objects matches the ones we have a + prediction for. + """ if self._batch_id is None and self._outputs is None: raise ValueError(Errors.E954) else: @@ -237,6 +290,7 @@ class Tok2VecListener(Model): def forward(model: Tok2VecListener, inputs, is_train: bool): + """Supply the outputs from the upstream Tok2Vec component.""" if is_train: model.verify_inputs(inputs) return model._outputs, model._backprop diff --git a/spacy/scorer.py b/spacy/scorer.py index 4a81d39d0..d77881ad0 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -426,7 +426,7 @@ class Scorer: f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()}, } if len(labels) == 2 and not multi_label and positive_label: - positive_label_f = results[f"{attr}_f_per_type"][positive_label]['f'] + positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"] results[f"{attr}_score"] = positive_label_f results[f"{attr}_score_desc"] = f"F ({positive_label})" elif not multi_label: diff --git a/spacy/tests/morphology/test_morph_pickle.py b/spacy/tests/morphology/test_morph_pickle.py index 0758a6c01..d9b0e3476 100644 --- a/spacy/tests/morphology/test_morph_pickle.py +++ b/spacy/tests/morphology/test_morph_pickle.py @@ -15,5 +15,7 @@ def morphology(): def test_morphology_pickle_roundtrip(morphology): b = pickle.dumps(morphology) reloaded_morphology = pickle.loads(b) - assert reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"]) == "Feat1=Val1|Feat2=Val2" - assert reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"]) == "Feat3=Val3|Feat4=Val4" + feat = reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"]) + assert feat == "Feat1=Val1|Feat2=Val2" + feat = reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"]) + assert feat == "Feat3=Val3|Feat4=Val4" diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index dbeb0a9cb..0ffe74273 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -144,10 +144,7 @@ def test_accept_blocked_token(): # 1. test normal behaviour nlp1 = English() doc1 = nlp1("I live in New York") - config = { - "learn_tokens": False, - "min_action_freq": 30, - } + config = {} ner1 = nlp1.create_pipe("ner", config=config) assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""] assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""] @@ -166,10 +163,7 @@ def test_accept_blocked_token(): # 2. 
test blocking behaviour nlp2 = English() doc2 = nlp2("I live in New York") - config = { - "learn_tokens": False, - "min_action_freq": 30, - } + config = {} ner2 = nlp2.create_pipe("ner", config=config) # set "New York" to a blocked entity @@ -224,10 +218,7 @@ def test_overwrite_token(): assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"] assert [token.ent_type_ for token in doc] == ["", "", "", "", ""] # Check that a new ner can overwrite O - config = { - "learn_tokens": False, - "min_action_freq": 30, - } + config = {} ner2 = nlp.create_pipe("ner", config=config) ner2.moves.add_action(5, "") ner2.add_label("GPE") diff --git a/spacy/tests/pipeline/test_lemmatizer.py b/spacy/tests/pipeline/test_lemmatizer.py index 644fa0f01..8a70fdeeb 100644 --- a/spacy/tests/pipeline/test_lemmatizer.py +++ b/spacy/tests/pipeline/test_lemmatizer.py @@ -1,8 +1,7 @@ import pytest - from spacy import util, registry from spacy.lang.en import English -from spacy.lookups import Lookups, load_lookups +from spacy.lookups import Lookups from ..util import make_tempdir diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index 5f27a0afa..1af4a5121 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -1,10 +1,8 @@ import pytest - from spacy import util from spacy.gold import Example from spacy.lang.en import English from spacy.language import Language -from spacy.symbols import POS, NOUN from ..util import make_tempdir diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 363a16a11..17add7391 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -117,9 +117,7 @@ def test_overfitting_IO(): assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1) # Test scoring - scores = nlp.evaluate( - train_examples, scorer_cfg={"positive_label": "POSITIVE"} - ) + scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"}) assert scores["cats_micro_f"] == 1.0 assert scores["cats_score"] == 1.0 assert "cats_score_desc" in scores diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index b642ca229..5c93ea3c8 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -1,11 +1,9 @@ import pytest import random - from spacy import util from spacy.gold import Example from spacy.matcher import Matcher from spacy.attrs import IS_PUNCT, ORTH, LOWER -from spacy.symbols import POS, VERB from spacy.vocab import Vocab from spacy.lang.en import English from spacy.lookups import Lookups diff --git a/spacy/tests/regression/test_issue1001-1500.py b/spacy/tests/regression/test_issue1001-1500.py index 0ac895546..d6a4600e3 100644 --- a/spacy/tests/regression/test_issue1001-1500.py +++ b/spacy/tests/regression/test_issue1001-1500.py @@ -6,8 +6,7 @@ from spacy.lang.en import English from spacy.lang.lex_attrs import LEX_ATTRS from spacy.matcher import Matcher from spacy.tokenizer import Tokenizer -from spacy.lookups import Lookups -from spacy.symbols import ORTH, LEMMA, POS, VERB +from spacy.symbols import ORTH, LEMMA, POS def test_issue1061(): diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 83afb11f3..4988575ea 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -271,10 +271,7 @@ def test_issue1963(en_tokenizer): 
@pytest.mark.parametrize("label", ["U-JOB-NAME"]) def test_issue1967(label): nlp = Language() - config = { - "learn_tokens": False, - "min_action_freq": 30, - } + config = {} ner = nlp.create_pipe("ner", config=config) example = Example.from_dict( Doc(ner.vocab, words=["word"]), diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py index e42779ad7..de554a5ec 100644 --- a/spacy/tests/regression/test_issue3501-4000.py +++ b/spacy/tests/regression/test_issue3501-4000.py @@ -157,7 +157,11 @@ def test_issue3540(en_vocab): with doc.retokenize() as retokenizer: heads = [(doc[3], 1), doc[2]] - attrs = {"POS": ["PROPN", "PROPN"], "LEMMA": ["New", "York"], "DEP": ["pobj", "compound"]} + attrs = { + "POS": ["PROPN", "PROPN"], + "LEMMA": ["New", "York"], + "DEP": ["pobj", "compound"], + } retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs) gold_text = ["I", "live", "in", "New", "York", "right", "now"] diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py index e1d03eaf5..423015106 100644 --- a/spacy/tests/regression/test_issue4001-4500.py +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -138,10 +138,7 @@ def test_issue4042_bug2(): if not output_dir.exists(): output_dir.mkdir() ner1.to_disk(output_dir) - config = { - "learn_tokens": False, - "min_action_freq": 30, - } + config = {} ner2 = nlp1.create_pipe("ner", config=config) ner2.from_disk(output_dir) assert len(ner2.labels) == 2 @@ -303,10 +300,7 @@ def test_issue4313(): beam_width = 16 beam_density = 0.0001 nlp = English() - config = { - "learn_tokens": False, - "min_action_freq": 30, - } + config = {} ner = nlp.create_pipe("ner", config=config) ner.add_label("SOME_LABEL") ner.begin_training([]) diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py index 0b3b4a9fc..96d4e1ca4 100644 --- a/spacy/tests/regression/test_issue4501-5000.py +++ b/spacy/tests/regression/test_issue4501-5000.py @@ -185,20 +185,16 @@ def test_issue4725_1(): vocab = Vocab(vectors_name="test_vocab_add_vector") nlp = English(vocab=vocab) config = { - "learn_tokens": False, - "min_action_freq": 342, "update_with_oracle_cut_size": 111, } ner = nlp.create_pipe("ner", config=config) with make_tempdir() as tmp_path: with (tmp_path / "ner.pkl").open("wb") as file_: pickle.dump(ner, file_) - assert ner.cfg["min_action_freq"] == 342 assert ner.cfg["update_with_oracle_cut_size"] == 111 with (tmp_path / "ner.pkl").open("rb") as file_: ner2 = pickle.load(file_) - assert ner2.cfg["min_action_freq"] == 342 assert ner2.cfg["update_with_oracle_cut_size"] == 111 diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 6865cd1e5..ebc804235 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -236,3 +236,33 @@ def test_language_from_config_before_after_init_invalid(): config = {"nlp": {"after_pipeline_creation": {"@callbacks": callback_name}}} with pytest.raises(ValueError): English.from_config(config) + + +def test_language_custom_tokenizer(): + """Test that a fully custom tokenizer can be plugged in via the registry.""" + name = "test_language_custom_tokenizer" + + class CustomTokenizer: + """Dummy "tokenizer" that splits on spaces and adds prefix to each word.""" + + def __init__(self, nlp, prefix): + self.vocab = nlp.vocab + self.prefix = prefix + + def __call__(self, text): + words = [f"{self.prefix}{word}" for word in text.split(" ")] + return Doc(self.vocab, 
words=words) + + @registry.tokenizers(name) + def custom_create_tokenizer(prefix: str = "_"): + def create_tokenizer(nlp): + return CustomTokenizer(nlp, prefix=prefix) + + return create_tokenizer + + config = {"nlp": {"tokenizer": {"@tokenizers": name}}} + nlp = English.from_config(config) + doc = nlp("hello world") + assert [t.text for t in doc] == ["_hello", "_world"] + doc = list(nlp.pipe(["hello world"]))[0] + assert [t.text for t in doc] == ["_hello", "_world"] diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 8d28a78c3..8b07102ce 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -3,7 +3,7 @@ title: Model Architectures teaser: Pre-defined model architectures included with the core library source: spacy/ml/models menu: - - ['Tok2Vec', 'tok2vec'] + - ['Tok2Vec', 'tok2vec-arch'] - ['Transformers', 'transformers'] - ['Parser & NER', 'parser'] - ['Tagging', 'tagger'] @@ -70,6 +70,47 @@ blog post for background. | `embed` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Embed tokens into context-independent word vector representations. | | `encode` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Floats2d]`. **Output:** `List[Floats2d]`. Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. | +### spacy.Tok2VecListener.v1 {#Tok2VecListener} + +> #### Example config +> +> ```ini +> [components.tok2vec] +> factory = "tok2vec" +> +> [components.tok2vec.model] +> @architectures = "spacy.HashEmbedCNN.v1" +> width = 342 +> +> [components.tagger] +> factory = "tagger" +> +> [components.tagger.model] +> @architectures = "spacy.Tagger.v1" +> +> [components.tagger.model.tok2vec] +> @architectures = "spacy.Tok2VecListener.v1" +> width = ${components.tok2vec.model:width} +> ``` + +A listener is used as a sublayer within a component such as a +[`DependencyParser`](/api/dependencyparser), +[`EntityRecognizer`](/api/entityrecognizer)or +[`TextCategorizer`](/api/textcategorizer). Usually you'll have multiple +listeners connecting to a single upstream [`Tok2Vec`](/api/tok2vec) component +that's earlier in the pipeline. The listener layers act as **proxies**, passing +the predictions from the `Tok2Vec` component into downstream components, and +communicating gradients back upstream. + +Instead of defining its own `Tok2Vec` instance, a model architecture like +[Tagger](/api/architectures#tagger) can define a listener as its `tok2vec` +argument that connects to the shared `tok2vec` component in the pipeline. + +| Name | Type | Description | +| ---------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `width` | int | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. | +| `upstream` | str | A string to identify the "upstream" `Tok2Vec` component to communicate with. The upstream name should either be the wildcard string `"*"`, or the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. | + ### spacy.MultiHashEmbed.v1 {#MultiHashEmbed} @@ -195,7 +236,7 @@ and residual connections. 
> depth = 4 > ``` -Encode context using bidirectonal LSTM layers. Requires +Encode context using bidirectional LSTM layers. Requires [PyTorch](https://pytorch.org). | Name | Type | Description | @@ -237,8 +278,6 @@ architectures into your training config. ### spacy-transformers.Tok2VecListener.v1 {#Tok2VecListener} - - > #### Example Config > > ```ini @@ -250,10 +289,41 @@ architectures into your training config. > @layers = "reduce_mean.v1" > ``` -| Name | Type | Description | -| ------------- | ------------------------- | ---------------------------------------------------------------------------------------------- | -| `grad_factor` | float | Factor for weighting the gradient if multiple components listen to the same transformer model. | -| `pooling` | `Model[Ragged, Floats2d]` | Pooling layer to determine how the vector for each spaCy token will be computed. | +Create a `TransformerListener` layer, which will connect to a +[`Transformer`](/api/transformer) component earlier in the pipeline. The layer +takes a list of [`Doc`](/api/doc) objects as input, and produces a list of +2-dimensional arrays as output, with each array having one row per token. Most +spaCy models expect a sublayer with this signature, making it easy to connect +them to a transformer model via this sublayer. Transformer models usually +operate over wordpieces, which usually don't align one-to-one against spaCy +tokens. The layer therefore requires a reduction operation in order to calculate +a single token vector given zero or more wordpiece vectors. + +| Name | Type | Description | +| ------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `pooling` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** [`Ragged`](https://thinc.ai/docs/api-types#ragged). **Output:** [`Floats2d`](https://thinc.ai/docs/api-types#types) | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. | +| `grad_factor` | float | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. | + +### spacy-transformers.Tok2VecTransformer.v1 {#Tok2VecTransformer} + +> #### Example Config +> +> ```ini +> # TODO: +> ``` + +Use a transformer as a [`Tok2Vec`](/api/tok2vec) layer directly. This does +**not** allow multiple components to share the transformer weights, and does +**not** allow the transformer to set annotations into the [`Doc`](/api/doc) +object, but it's a **simpler solution** if you only need the transformer within +one component. 
+
+| Name | Type | Description |
+| ------------------ | ------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_spans` | callable | Function that takes a batch of [`Doc`](/api/doc) objects and returns lists of [`Span`](/api/span) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. |
+| `tokenizer_config` | `Dict[str, Any]` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). |
+| `pooling` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** [`Ragged`](https://thinc.ai/docs/api-types#ragged). **Output:** [`Floats2d`](https://thinc.ai/docs/api-types#types). A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. |
+| `grad_factor` | float | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. |
 
 ## Parser & NER architectures {#parser}
 
@@ -417,20 +487,18 @@ network has an internal CNN Tok2Vec layer and uses attention.
 > nO = null
 > ```
 
-| Name | Type | Description |
-| -------------------- | ----- | --------------------------------------------------------------------------------------------------------------------------------------------- |
-| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
-| `pretrained_vectors` | bool | Whether or not pretrained vectors will be used in addition to the feature vectors. |
-| `width` | int | Output dimension of the feature encoding step. |
-| `embed_size` | int | Input dimension of the feature encoding step. |
-| `conv_depth` | int | Depth of the Tok2Vec layer. |
-| `window_size` | int | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. |
-| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. |
-| `dropout` | float | The dropout rate. |
-| `nO` | int | Output dimension, determined by the number of different labels. |
-
-If the `nO` dimension is not set, the TextCategorizer component will set it when
-`begin_training` is called.
+| Name | Type | Description |
+| -------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
+| `pretrained_vectors` | bool | Whether or not pretrained vectors will be used in addition to the feature vectors. |
+| `width` | int | Output dimension of the feature encoding step. |
+| `embed_size` | int | Input dimension of the feature encoding step. |
+| `conv_depth` | int | Depth of the Tok2Vec layer.
|
+| `window_size` | int | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. |
+| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. |
+| `dropout` | float | The dropout rate. |
+| `nO` | int | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. |
 
 ### spacy.TextCatCNN.v1 {#TextCatCNN}
 
 > #### Example Config
 >
 > ```ini
 > [model]
 > @architectures = "spacy.TextCatCNN.v1"
 > exclusive_classes = false
 > nO = null
 >
 > [model.tok2vec]
 > @architectures = "spacy.HashEmbedCNN.v1"
 > pretrained_vectors = null
 > width = 96
 > depth = 4
 > embed_size = 2000
 > window_size = 1
 > maxout_pieces = 3
 > subword_features = true
 > ```
 
 A neural network model where token vectors are calculated using a CNN. The
 vectors are mean pooled and used as features in a feed-forward network. This
 architecture is usually less accurate than the ensemble, but runs faster.
 
-| Name | Type | Description |
-| ------------------- | ------------------------------------------ | --------------------------------------------------------------- |
-| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
-| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | The [`tok2vec`](#tok2vec) layer of the model. |
-| `nO` | int | Output dimension, determined by the number of different labels. |
-
-If the `nO` dimension is not set, the TextCategorizer component will set it when
-`begin_training` is called.
+| Name | Type | Description |
+| ------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
+| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | The [`tok2vec`](#tok2vec) layer of the model. |
+| `nO` | int | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. |
 
 ### spacy.TextCatBOW.v1 {#TextCatBOW}
 
 > #### Example Config
 >
 > ```ini
 > [model]
 > @architectures = "spacy.TextCatBOW.v1"
 > exclusive_classes = false
 > ngram_size = 1
 > no_output_layer = false
 > nO = null
 > ```
 
 An ngram "bag-of-words" model. This architecture should run much faster than
 the others, but may not be as accurate, especially if texts are short.
 
-| Name | Type | Description |
-| ------------------- | ----- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
-| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
-| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. |
-| `no_output_layer` | float | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes=True`, else `Logistic`. |
-| `nO` | int | Output dimension, determined by the number of different labels. |
-
-If the `nO` dimension is not set, the TextCategorizer component will set it when
-`begin_training` is called.
+| Name | Type | Description |
+| ------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `exclusive_classes` | bool | Whether or not categories are mutually exclusive.
|
+| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. |
+| `no_output_layer` | float | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes=True`, else `Logistic`). |
+| `nO` | int | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. |
+
 ## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}
@@ -558,8 +624,6 @@ A function that creates a default, empty `KnowledgeBase` from a
 A function that takes as input a [`KnowledgeBase`](/api/kb) and a
 [`Span`](/api/span) object denoting a named entity, and returns a list of
-plausible [`Candidate` objects](/api/kb/#candidate_init).
-
-The default `CandidateGenerator` simply uses the text of a mention to find its
-potential aliases in the Knowledgebase. Note that this function is
-case-dependent.
+plausible [`Candidate` objects](/api/kb/#candidate_init). The default
+`CandidateGenerator` simply uses the text of a mention to find its potential
+aliases in the `KnowledgeBase`. Note that this function is case-dependent.
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 377b2456f..c4a774cd0 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -601,9 +601,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides
 
 ## Pretrain {#pretrain new="2.1" tag="experimental"}
 
-
-
-Pre-train the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
+Pretrain the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
 components on [raw text](/api/data-formats#pretrain), using an approximate
 language-modeling objective. Specifically, we load pretrained vectors, and train
 a component like a CNN, BiLSTM, etc to predict vectors which match the
@@ -611,7 +609,8 @@ pretrained ones. The weights are saved to a directory after each epoch. You can
 then include a **path to one of these pretrained weights files** in your
 [training config](/usage/training#config) as the `init_tok2vec` setting when you
 train your model. This technique may be especially helpful if you have little
-labelled data.
+labelled data. See the usage docs on [pretraining](/usage/training#pretraining)
+for more info.
 
 
 
@@ -634,8 +633,8 @@ $ python -m spacy pretrain [texts_loc] [output_dir] [config_path]
 | `output_dir` | positional | Directory to write models to on each epoch. |
 | `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
 | `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
-| `--resume-path`, `-r` | option | TODO: |
-| `--epoch-resume`, `-er` | option | TODO: |
+| `--resume-path`, `-r` | option | Path to pretrained weights from which to resume pretraining. |
+| `--epoch-resume`, `-er` | option | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. |
 | `--help`, `-h` | flag | Show help message and available arguments. |
 | overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`.
| | **CREATES** | weights | The pretrained weights that can be used to initialize `spacy train`. | diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index c0a87756d..af7cb26de 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -20,9 +20,9 @@ Config files define the training process and model pipeline and can be passed to [`spacy train`](/api/cli#train). They use [Thinc's configuration system](https://thinc.ai/docs/usage-config) under the hood. For details on how to use training configs, see the -[usage documentation](/usage/training#config). - - +[usage documentation](/usage/training#config). To get started with a blank +config or fill a partial config with all defaults, you can use the +[`init config`](/api/cli#init-config) command. > #### What does the @ mean? > @@ -52,8 +52,6 @@ your config and check that it's valid, you can run the - - ### nlp {#config-nlp tag="section"} > #### Example @@ -154,8 +152,6 @@ This section is optional and defines settings and controls for [language model pretraining](/usage/training#pretraining). It's used when you run [`spacy pretrain`](/api/cli#pretrain). - - | Name | Type | Description | Default | | ---------------------------- | --------------------------------------------------- | ----------------------------------------------------------------------------- | --------------------------------------------------- | | `max_epochs` | int | Maximum number of epochs. | `1000` | diff --git a/website/docs/api/dependencymatcher.md b/website/docs/api/dependencymatcher.md index 3638575df..4f192783f 100644 --- a/website/docs/api/dependencymatcher.md +++ b/website/docs/api/dependencymatcher.md @@ -5,4 +5,194 @@ tag: class source: spacy/matcher/dependencymatcher.pyx --- -TODO: write +The `DependencyMatcher` follows the same API as the [`Matcher`](/api/matcher) +and [`PhraseMatcher`](/api/phrasematcher) and lets you match on dependency trees +using the +[Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html). +It requires a pretrained [`DependencyParser`](/api/parser) or other component +that sets the `Token.dep` attribute. + +## Pattern format {#patterns} + +> ```json +> ### Example +> [ +> { +> "SPEC": {"NODE_NAME": "founded"}, +> "PATTERN": {"ORTH": "founded"} +> }, +> { +> "SPEC": { +> "NODE_NAME": "founder", +> "NBOR_RELOP": ">", +> "NBOR_NAME": "founded" +> }, +> "PATTERN": {"DEP": "nsubj"} +> }, +> { +> "SPEC": { +> "NODE_NAME": "object", +> "NBOR_RELOP": ">", +> "NBOR_NAME": "founded" +> }, +> "PATTERN": {"DEP": "dobj"} +> } +> ] +> ``` + +A pattern added to the `DependencyMatcher` consists of a list of dictionaries, +with each dictionary describing a node to match. Each pattern should have the +following top-level keys: + +| Name | Type | Description | +| --------- | ---- | --------------------------------------------------------------------------------------------------------------------------- | +| `PATTERN` | dict | The token attributes to match in the same format as patterns provided to the regular token-based [`Matcher`](/api/matcher). | +| `SPEC` | dict | The relationships of the nodes in the subtree that should be matched. 
|
+
+The `SPEC` includes the following fields:
+
+| Name | Type | Description |
+| ------------ | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `NODE_NAME` | str | A unique name for this node to refer to it in other specs. |
+| `NBOR_RELOP` | str | A [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html) operator that describes how the two nodes are related. |
+| `NBOR_NAME` | str | The unique name of the node that this node is connected to. |
+
+## DependencyMatcher.\_\_init\_\_ {#init tag="method"}
+
+Create a rule-based `DependencyMatcher`.
+
+> #### Example
+>
+> ```python
+> from spacy.matcher import DependencyMatcher
+> matcher = DependencyMatcher(nlp.vocab)
+> ```
+
+| Name | Type | Description |
+| ------- | ------- | --------------------------------------------------------------------------------------------- |
+| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. |
+
+## DependencyMatcher.\_\_call\_\_ {#call tag="method"}
+
+Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
+
+> #### Example
+>
+> ```python
+> from spacy.matcher import DependencyMatcher
+>
+> matcher = DependencyMatcher(nlp.vocab)
+> pattern = [
+> {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
+> {"SPEC": {"NODE_NAME": "founder", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
+> ]
+> matcher.add("Founder", [pattern])
+> doc = nlp("Bill Gates founded Microsoft.")
+> matches = matcher(doc)
+> ```
+
+| Name | Type | Description |
+| ----------- | ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `doclike` | `Doc`/`Span` | The `Doc` or `Span` to match over. |
+| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. |
+
+## DependencyMatcher.\_\_len\_\_ {#len tag="method"}
+
+Get the number of rules (edges) added to the dependency matcher. Note that this
+only returns the number of rules (identical with the number of IDs), not the
+number of individual patterns.
+
+> #### Example
+>
+> ```python
+> matcher = DependencyMatcher(nlp.vocab)
+> assert len(matcher) == 0
+> pattern = [
+> {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
+> {"SPEC": {"NODE_NAME": "START_ENTITY", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
+> ]
+> matcher.add("Rule", [pattern])
+> assert len(matcher) == 1
+> ```
+
+| Name | Type | Description |
+| ----------- | ---- | -------------------- |
+| **RETURNS** | int | The number of rules. |
+
+## DependencyMatcher.\_\_contains\_\_ {#contains tag="method"}
+
+Check whether the matcher contains rules for a match ID.
+
+> #### Example
+>
+> ```python
+> matcher = DependencyMatcher(nlp.vocab)
+> assert "Rule" not in matcher
+> matcher.add("Rule", [pattern])
+> assert "Rule" in matcher
+> ```
+
+| Name | Type | Description |
+| ----------- | ---- | ----------------------------------------------------- |
+| `key` | str | The match ID. |
+| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
+
+## DependencyMatcher.add {#add tag="method"}
+
+Add a rule to the matcher, consisting of an ID key, one or more patterns, and an
+optional callback function to act on the matches. The callback function will
+receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already
+exists for the given ID, the patterns will be extended. An `on_match` callback
+will be overwritten.
+
+> #### Example
+>
+> ```python
+> def on_match(matcher, doc, id, matches):
+>     print('Matched!', matches)
+>
+> matcher = DependencyMatcher(nlp.vocab)
+> matcher.add("TEST_PATTERNS", patterns)
+> ```
+
+| Name | Type | Description |
+| -------------- | ------------------ | -------------------------------------------------------------------------------------------------- |
+| `match_id` | str | An ID for the thing you're matching. |
+| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a node to match. |
+| _keyword-only_ | | |
+| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
+
+## DependencyMatcher.remove {#remove tag="method"}
+
+Remove a rule from the matcher. A `KeyError` is raised if the match ID does not
+exist.
+
+> #### Example
+>
+> ```python
+> matcher.add("Rule", [pattern])
+> assert "Rule" in matcher
+> matcher.remove("Rule")
+> assert "Rule" not in matcher
+> ```
+
+| Name | Type | Description |
+| ----- | ---- | ------------------------- |
+| `key` | str | The ID of the match rule. |
+
+## DependencyMatcher.get {#get tag="method"}
+
+Retrieve the pattern stored for a key. Returns the rule as an
+`(on_match, patterns)` tuple containing the callback and available patterns.
+
+> #### Example
+>
+> ```python
+> matcher.add("Rule", [pattern], on_match=on_match)
+> on_match, patterns = matcher.get("Rule")
+> ```
+
+| Name | Type | Description |
+| ----------- | ----- | --------------------------------------------- |
+| `key` | str | The ID of the match rule. |
+| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. |
diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md
index e56e85e64..6c9222781 100644
--- a/website/docs/api/dependencyparser.md
+++ b/website/docs/api/dependencyparser.md
@@ -8,6 +8,23 @@ api_string_name: parser
 api_trainable: true
 ---

+A transition-based dependency parser component. The dependency parser jointly
+learns sentence segmentation and labelled dependency parsing, and can optionally
+learn to merge tokens that had been over-segmented by the tokenizer. The parser
+uses a variant of the **non-monotonic arc-eager transition-system** described by
+[Honnibal and Johnson (2015)](https://www.aclweb.org/anthology/D15-1162/), with
+the addition of a "break" transition to perform the sentence segmentation.
+[Nivre (2005)](https://www.aclweb.org/anthology/P05-1013/)'s **pseudo-projective
+dependency transformation** is used to allow the parser to predict
+non-projective parses.
+
+The parser is trained using an **imitation learning objective**. It follows the
+actions predicted by the current weights, and at each state, determines which
+actions are compatible with the optimal parse that could be reached from the
+current state. The weights are updated such that the scores assigned to the set
+of optimal actions are increased, while scores assigned to other actions are
+decreased. Note that more than one action may be optimal for a given state.
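+
+For example, the labelled dependencies and sentence boundaries predicted by the
+component can be inspected on a processed `Doc` (a minimal sketch, assuming a
+pretrained pipeline such as `en_core_web_sm` is installed):
+
+```python
+import spacy
+
+nlp = spacy.load("en_core_web_sm")
+doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")
+for token in doc:
+    # Each token is assigned a syntactic head and a dependency label
+    print(token.text, token.dep_, token.head.text)
+# The parser also sets the sentence boundaries
+print([sent.text for sent in doc.sents])
+```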
+ ## Config and implementation {#config} The default config is defined by the pipeline component factory and describes @@ -23,18 +40,21 @@ architectures and their arguments and hyperparameters. > from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL > config = { > "moves": None, -> # TODO: rest +> "update_with_oracle_cut_size": 100, +> "learn_tokens": False, +> "min_action_freq": 30, > "model": DEFAULT_PARSER_MODEL, > } > nlp.add_pipe("parser", config=config) > ``` - - -| Setting | Type | Description | Default | -| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- | -| `moves` | list | | `None` | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) | +| Setting | Type | Description | Default | +| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------- | +| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. | `None` | +| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. | `100` | +| `learn_tokens` | bool | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. | `False` | +| `min_action_freq` | int | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. | `30` | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/dep_parser.pyx @@ -61,19 +81,16 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). - - -| Name | Type | Description | -| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | -| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. 
| -| `moves` | list | | -| _keyword-only_ | | | -| `update_with_oracle_cut_size` | int | | -| `multitasks` | `Iterable` | | -| `learn_tokens` | bool | | -| `min_action_freq` | int | | +| Name | Type | Description | +| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The shared vocabulary. | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | +| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | +| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. | +| _keyword-only_ | | | +| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. | +| `learn_tokens` | bool | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. | +| `min_action_freq` | int | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. | ## DependencyParser.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 0ab17f953..a6368e62b 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -8,6 +8,18 @@ api_string_name: ner api_trainable: true --- +A transition-based named entity recognition component. The entity recognizer +identifies **non-overlapping labelled spans** of tokens. The transition-based +algorithm used encodes certain assumptions that are effective for "traditional" +named entity recognition tasks, but may not be a good fit for every span +identification problem. Specifically, the loss function optimizes for **whole +entity accuracy**, so if your inter-annotator agreement on boundary tokens is +low, the component will likely perform poorly on your problem. The +transition-based algorithm also assumes that the most decisive information about +your entities will be close to their initial tokens. If your entities are long +and characterized by tokens in their middle, the component will likely not be a +good fit for your task. + ## Config and implementation {#config} The default config is defined by the pipeline component factory and describes @@ -23,18 +35,17 @@ architectures and their arguments and hyperparameters. 
> from spacy.pipeline.ner import DEFAULT_NER_MODEL > config = { > "moves": None, -> # TODO: rest +> "update_with_oracle_cut_size": 100, > "model": DEFAULT_NER_MODEL, > } > nlp.add_pipe("ner", config=config) > ``` - - -| Setting | Type | Description | Default | -| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- | -| `moves` | list | | `None` | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) | +| Setting | Type | Description | Default | +| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------- | +| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. | +| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. | `100` | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/ner.pyx @@ -61,19 +72,14 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). - - -| Name | Type | Description | -| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | -| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | -| `moves` | list | | -| _keyword-only_ | | | -| `update_with_oracle_cut_size` | int | | -| `multitasks` | `Iterable` | | -| `learn_tokens` | bool | | -| `min_action_freq` | int | | +| Name | Type | Description | +| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The shared vocabulary. | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | +| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | +| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. | +| _keyword-only_ | | | +| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. 
| ## EntityRecognizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 7464a029e..79782fd72 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -242,6 +242,21 @@ a batch of [Example](/api/example) objects. Update the models in the pipeline. + + +The `Language.update` method now takes a batch of [`Example`](/api/example) +objects instead of the raw texts and annotations or `Doc` and `GoldParse` +objects. An [`Example`](/api/example) streamlines how data is passed around. It +stores two `Doc` objects: one for holding the gold-standard reference data, and +one for holding the predictions of the pipeline. + +For most use cases, you shouldn't have to write your own training scripts +anymore. Instead, you can use [`spacy train`](/api/cli#train) with a config file +and custom registered functions if needed. See the +[training documentation](/usage/training) for details. + + + > #### Example > > ```python @@ -253,7 +268,7 @@ Update the models in the pipeline. | Name | Type | Description | | --------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------ | -| `examples` | `Iterable[Example]` | A batch of `Example` objects to learn from. | +| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | | _keyword-only_ | | | | `drop` | float | The dropout rate. | | `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index 6a6bb1244..f1242d193 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -9,6 +9,28 @@ api_string_name: lemmatizer api_trainable: false --- +Component for assigning base forms to tokens using rules based on part-of-speech +tags, or lookup tables. Functionality to train the component is coming soon. +Different [`Language`](/api/language) subclasses can implement their own +lemmatizer components via +[language-specific factories](/usage/processing-pipelines#factories-language). +The default data used is provided by the +[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) +extension package. + + + +As of v3.0, the `Lemmatizer` is a **standalone pipeline component** that can be +added to your pipeline, and not a hidden part of the vocab that runs behind the +scenes. This makes it easier to customize how lemmas should be assigned in your +pipeline. + +If the lemmatization mode is set to `"rule"` and requires part-of-speech tags to +be assigned, make sure a [`Tagger`](/api/tagger) or another component assigning +tags is available in the pipeline and runs _before_ the lemmatizer. + + + ## Config and implementation The default config is defined by the pipeline component factory and describes @@ -29,7 +51,7 @@ lemmatizers, see the | Setting | Type | Description | Default | | ----------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -| `mode` | str | The lemmatizer mode, e.g. "lookup" or "rule". | `"lookup"` | +| `mode` | str | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. 
| `"lookup"` | | `lookups` | [`Lookups`](/api/lookups) | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from `spacy-lookups-data`. | `None` | | `overwrite` | bool | Whether to overwrite existing lemmas. | `False` | | `model` | [`Model`](https://thinc.ai/docs/api-model) | **Not yet implemented:** the model to use. | `None` | @@ -55,15 +77,15 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). -| Name | Type | Description | -| -------------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | [`Vocab`](/api/vocab) | The vocab. | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model (not yet implemented). | -| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | -| _keyword-only_ | | | -| mode | str | The lemmatizer mode, e.g. "lookup" or "rule". Defaults to "lookup". | -| lookups | [`Lookups`](/api/lookups) | A lookups object containing the tables such as "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup". Defaults to `None`. | -| overwrite | bool | Whether to overwrite existing lemmas. | +| Name | Type | Description | +| -------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | [`Vocab`](/api/vocab) | The vocab. | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model (not yet implemented). | +| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | +| _keyword-only_ | | | +| mode | str | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. | +| lookups | [`Lookups`](/api/lookups) | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. | +| overwrite | bool | Whether to overwrite existing lemmas. | ## Lemmatizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index 925c9ad2e..b481f1972 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -5,6 +5,82 @@ tag: class source: spacy/matcher/matcher.pyx --- +The `Matcher` lets you find words and phrases using rules describing their token +attributes. Rules can refer to token annotations (like the text or +part-of-speech tags), as well as lexical attributes like `Token.is_punct`. +Applying the matcher to a [`Doc`](/api/doc) gives you access to the matched +tokens in context. For in-depth examples and workflows for combining rules and +statistical models, see the [usage guide](/usage/rule-based-matching) on +rule-based matching. + +## Pattern format {#patterns} + +> ```json +> ### Example +> [ +> {"LOWER": "i"}, +> {"LEMMA": {"IN": ["like", "love"]}}, +> {"POS": "NOUN", "OP": "+"} +> ] +> ``` + +A pattern added to the `Matcher` consists of a list of dictionaries. Each +dictionary describes **one token** and its attributes. The available token +pattern keys correspond to a number of +[`Token` attributes](/api/token#attributes). 
The supported attributes for +rule-based matching are: + +| Attribute | Type |  Description | +| -------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------ | +| `ORTH` | str | The exact verbatim text of a token. | +| `TEXT` 2.1 | str | The exact verbatim text of a token. | +| `LOWER` | str | The lowercase form of the token text. | +|  `LENGTH` | int | The length of the token text. | +|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. | +|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. | +|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. | +|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. | +|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. | +| `ENT_TYPE` | str | The token's entity label. | +| `_` 2.1 | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). | +| `OP` | str | Operator or quantifier to determine how often to match a token pattern. | + +Operators and quantifiers define **how often** a token pattern should be +matched: + +> ```json +> ### Example +> [ +> {"POS": "ADJ", "OP": "*"}, +> {"POS": "NOUN", "OP": "+"} +> ] +> ``` + +| OP | Description | +| --- | ---------------------------------------------------------------- | +| `!` | Negate the pattern, by requiring it to match exactly 0 times. | +| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. | +| `+` | Require the pattern to match 1 or more times. | +| `*` | Allow the pattern to match zero or more times. | + +Token patterns can also map to a **dictionary of properties** instead of a +single value to indicate whether the expected value is a member of a list or how +it compares to another value. + +> ```json +> ### Example +> [ +> {"LEMMA": {"IN": ["like", "love", "enjoy"]}}, +> {"POS": "PROPN", "LENGTH": {">=": 10}}, +> ] +> ``` + +| Attribute | Type | Description | +| -------------------------- | ---------- | --------------------------------------------------------------------------------- | +| `IN` | any | Attribute value is member of a list. | +| `NOT_IN` | any | Attribute value is _not_ member of a list. | +| `==`, `>=`, `<=`, `>`, `<` | int, float | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. | + ## Matcher.\_\_init\_\_ {#init tag="method"} Create the rule-based `Matcher`. If `validate=True` is set, all patterns added @@ -60,7 +136,7 @@ Match a stream of documents, yielding them in turn. | Name | Type | Description | | --------------------------------------------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `docs` | iterable | A stream of documents. | +| `docs` | iterable | A stream of documents or spans. | | `batch_size` | int | The number of documents to accumulate into a working set. | | `return_matches` 2.1 | bool | Yield the match lists along with the docs, making results `(doc, matches)` tuples. | | `as_tuples` | bool | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. 
If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. | @@ -105,11 +181,11 @@ Check whether the matcher contains rules for a match ID. ## Matcher.add {#add tag="method" new="2"} -Add a rule to the matcher, consisting of an ID key, one or more patterns, and a -callback function to act on the matches. The callback function will receive the -arguments `matcher`, `doc`, `i` and `matches`. If a pattern already exists for -the given ID, the patterns will be extended. An `on_match` callback will be -overwritten. +Add a rule to the matcher, consisting of an ID key, one or more patterns, and an +optional callback function to act on the matches. The callback function will +receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already +exists for the given ID, the patterns will be extended. An `on_match` callback +will be overwritten. > #### Example > @@ -141,12 +217,13 @@ patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]] -| Name | Type | Description | -| -------------- | ------------------ | --------------------------------------------------------------------------------------------- | -| `match_id` | str | An ID for the thing you're matching. | -| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. | -| _keyword-only_ | | | -| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | +| Name | Type | Description | +| ----------------------------------- | ------------------ | --------------------------------------------------------------------------------------------- | +| `match_id` | str | An ID for the thing you're matching. | +| `patterns` | `List[List[dict]]` | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. | +| _keyword-only_ | | | +| `on_match` | callable / `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | +| `greedy` 3 | str | Optional filter for greedy matches. Can either be `"FIRST"` or `"LONGEST"`. | ## Matcher.remove {#remove tag="method" new="2"} diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index bfe5c3c77..942440234 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -63,16 +63,14 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). - - | Name | Type | Description | | -------------- | ------- | ------------------------------------------------------------------------------------------- | | `vocab` | `Vocab` | The shared vocabulary. | | `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | | _keyword-only_ | | | -| `labels_morph` | dict | | -| `labels_pos` | dict | | +| `labels_morph` | dict | Mapping of morph + POS tags to morph labels. | +| `labels_pos` | dict | Mapping of morph + POS tags to POS tags. 
| ## Morphologizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md index 866aca096..71c7a463b 100644 --- a/website/docs/api/phrasematcher.md +++ b/website/docs/api/phrasematcher.md @@ -9,7 +9,8 @@ new: 2 The `PhraseMatcher` lets you efficiently match large terminology lists. While the [`Matcher`](/api/matcher) lets you match sequences based on lists of token descriptions, the `PhraseMatcher` accepts match patterns in the form of `Doc` -objects. +objects. See the [usage guide](/usage/rule-based-matching#phrasematcher) for +examples. ## PhraseMatcher.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index d9b8f4caf..233171779 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -28,10 +28,10 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("tagger", config=config) > ``` -| Setting | Type | Description | Default | -| ---------------- | ------------------------------------------ | -------------------------------------- | ----------------------------------- | -| `set_morphology` | bool | Whether to set morphological features. | `False` | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [Tagger](/api/architectures#Tagger) | +| Setting | Type | Description | Default | +| ---------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------- | +| `set_morphology` | bool | Whether to set morphological features. | `False` | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). | [Tagger](/api/architectures#Tagger) | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tagger.pyx @@ -58,13 +58,13 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). -| Name | Type | Description | -| ---------------- | ------- | ------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. | -| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | -| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | -| _keyword-only_ | | | -| `set_morphology` | bool | Whether to set morphological features. | +| Name | Type | Description | +| ---------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The shared vocabulary. | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). 
| +| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | +| _keyword-only_ | | | +| `set_morphology` | bool | Whether to set morphological features. | ## Tagger.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index 1efd5831c..5af540828 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -9,6 +9,12 @@ api_string_name: textcat api_trainable: true --- +The text categorizer predicts **categories over a whole document**. It can learn +one or more labels, and the labels can be mutually exclusive (i.e. one true +label per document) or non-mutually exclusive (i.e. zero or more labels may be +true per document). The multi-label setting is controlled by the model instance +that's provided. + ## Config and implementation {#config} The default config is defined by the pipeline component factory and describes @@ -29,10 +35,10 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("textcat", config=config) > ``` -| Setting | Type | Description | Default | -| -------- | ------------------------------------------ | ------------------ | ----------------------------------------------------- | -| `labels` | `Iterable[str]` | The labels to use. | `[]` | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TextCatEnsemble](/api/architectures#TextCatEnsemble) | +| Setting | Type | Description | Default | +| -------- | ------------------------------------------ | --------------------------------------------------------------------------------------- | ----------------------------------------------------- | +| `labels` | `List[str]` | A list of categories to learn. If empty, the model infers the categories from the data. | `[]` | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts scores for each category. | [TextCatEnsemble](/api/architectures#TextCatEnsemble) | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/textcat.py @@ -67,23 +73,6 @@ shortcut for this and instantiate the component using its string name and | _keyword-only_ | | | | `labels` | `Iterable[str]` | The labels to use. | - - ## TextCategorizer.\_\_call\_\_ {#call tag="method"} Apply the pipe to one document. The document is modified in place, and returned. diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md index f810793ce..dce595023 100644 --- a/website/docs/api/tok2vec.md +++ b/website/docs/api/tok2vec.md @@ -8,7 +8,20 @@ api_string_name: tok2vec api_trainable: true --- - +Apply a "token-to-vector" model and set its outputs in the `Doc.tensor` +attribute. This is mostly useful to **share a single subnetwork** between +multiple components, e.g. to have one embedding and CNN network shared between a +[`DependencyParser`](/api/dependencyparser), [`Tagger`](/api/tagger) and +[`EntityRecognizer`](/api/entityrecognizer). + +In order to use the `Tok2Vec` predictions, subsequent components should use the +[Tok2VecListener](/api/architectures#Tok2VecListener) layer as the tok2vec +subnetwork of their model. This layer will read data from the `doc.tensor` +attribute during prediction. During training, the `Tok2Vec` component will save +its prediction and backprop callback for each batch, so that the subsequent +components can backpropagate to the shared weights. 
This implementation is used +because it allows us to avoid relying on object identity within the models to +achieve the parameter sharing. ## Config and implementation {#config} @@ -27,9 +40,9 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("tok2vec", config=config) > ``` -| Setting | Type | Description | Default | -| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------- | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [HashEmbedCNN](/api/architectures#HashEmbedCNN) | +| Setting | Type | Description | Default | +| ------- | ------------------------------------------ | ----------------------------------------------------------------------- | ----------------------------------------------- | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. The model to use. | [HashEmbedCNN](/api/architectures#HashEmbedCNN) | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tok2vec.py @@ -64,9 +77,11 @@ shortcut for this and instantiate the component using its string name and ## Tok2Vec.\_\_call\_\_ {#call tag="method"} -Apply the pipe to one document. The document is modified in place, and returned. -This usually happens under the hood when the `nlp` object is called on a text -and all pipeline components are applied to the `Doc` in order. Both +Apply the pipe to one document and add context-sensitive embeddings to the +`Doc.tensor` attribute, allowing them to be used as features by downstream +components. The document is modified in place, and returned. This usually +happens under the hood when the `nlp` object is called on a text and all +pipeline components are applied to the `Doc` in order. Both [`__call__`](/api/tok2vec#call) and [`pipe`](/api/tok2vec#pipe) delegate to the [`predict`](/api/tok2vec#predict) and [`set_annotations`](/api/tok2vec#set_annotations) methods. diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index b63a4adba..0b3167901 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -340,7 +340,7 @@ See the [`Transformer`](/api/transformer) API reference and ## Batchers {#batchers source="spacy/gold/batchers.py" new="3"} - + #### batch_by_words.v1 {#batch_by_words tag="registered function"} @@ -361,19 +361,16 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument > get_length = null > ``` - - -| Name | Type | Description | -| ------------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `size` | `Iterable[int]` / int | The batch size. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). | -| `tolerance` | float | | -| `discard_oversize` | bool | Discard items that are longer than the specified batch length. | -| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence and returns its length. Defaults to the built-in `len()` if not set. | +| Name | Type | Description | +| ------------------ | ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `seqs` | `Iterable[Any]` | The sequences to minibatch. 
| +| `size` | `Iterable[int]` / int | The target number of words per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). | +| `tolerance` | float | What percentage of the size to allow batches to exceed. | +| `discard_oversize` | bool | Whether to discard sequences that by themselves exceed the tolerated size. | +| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. | #### batch_by_sequence.v1 {#batch_by_sequence tag="registered function"} - - > #### Example config > > ```ini @@ -383,34 +380,37 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument > get_length = null > ``` - +Create a batcher that creates batches of the specified size. -| Name | Type | Description | -| ------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `size` | `Iterable[int]` / int | The batch size. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). | -| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence and returns its length. Defaults to the built-in `len()` if not set. | +| Name | Type | Description | +| ------------ | ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `size` | `Iterable[int]` / int | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). | +| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. | #### batch_by_padded.v1 {#batch_by_padded tag="registered function"} - - > #### Example config > > ```ini > [training.batcher] -> @batchers = "batch_by_words.v1" +> @batchers = "batch_by_padded.v1" > size = 100 -> buffer = TODO: +> buffer = 256 > discard_oversize = false > get_length = null > ``` -| Name | Type | Description | -| ------------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `size` | `Iterable[int]` / int | The batch size. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). | -| `buffer` | int | | -| `discard_oversize` | bool | Discard items that are longer than the specified batch length. | -| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence and returns its length. Defaults to the built-in `len()` if not set. | +Minibatch a sequence by the size of padded batches that would result, with +sequences binned by length within a window. The padded size is defined as the +maximum length of sequences within the batch multiplied by the number of +sequences in the batch. 
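+
+As a rough sketch of what this strategy does (assuming the underlying helper in
+`spacy.gold.batchers` is called directly rather than via the config):
+
+```python
+from spacy.gold.batchers import minibatch_by_padded_size
+
+# Toy "sequences" of different lengths; get_length defaults to len()
+seqs = ["a", "bb", "ccc", "dddd", "eeeee", "ffffff"]
+for batch in minibatch_by_padded_size(seqs, size=8, buffer=6):
+    # Padded size = longest sequence in the batch * number of sequences
+    print([len(seq) for seq in batch])
+```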
+ +| Name | Type | Description | +| ------------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `size` | `Iterable[int]` / int | The largest padded size to batch sequences into. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). | +| `buffer` | int | The number of sequences to accumulate before sorting by length. A larger buffer will result in more even sizing, but if the buffer is very large, the iteration order will be less random, which can result in suboptimal training. | +| `discard_oversize` | bool | Whether to discard sequences that are by themselves longer than the largest padded batch size. | +| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. | ## Training data and alignment {#gold source="spacy/gold"} diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index 6b6be6bd0..57f06cd9e 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -25,8 +25,15 @@ work out-of-the-box. -This pipeline component lets you use transformer models in your pipeline. The -component assigns the output of the transformer to the Doc's extension +This pipeline component lets you use transformer models in your pipeline. +Supports all models that are available via the +[HuggingFace `transformers`](https://huggingface.co/transformers) library. +Usually you will connect subsequent components to the shared transformer using +the [TransformerListener](/api/architectures#TransformerListener) layer. This +works similarly to spaCy's [Tok2Vec](/api/tok2vec) component and +[Tok2VecListener](/api/architectures/Tok2VecListener) sublayer. + +The component assigns the output of the transformer to the `Doc`'s extension attributes. We also calculate an alignment between the word-piece tokens and the spaCy tokenization, so that we can use the last hidden states to set the `Doc.tensor` attribute. When multiple word-piece tokens align to the same spaCy @@ -53,11 +60,11 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("transformer", config=DEFAULT_CONFIG) > ``` -| Setting | Type | Description | Default | -| ------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | -| `max_batch_items` | int | Maximum size of a padded batch. | `4096` | -| `annotation_setter` | Callable | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. | `null_annotation_setter` | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. 
| [TransformerModel](/api/architectures#TransformerModel) | +| Setting | Type | Description | Default | +| ------------------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | +| `max_batch_items` | int | Maximum size of a padded batch. | `4096` | +| `annotation_setter` | Callable | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. By default, no additional annotations are set. | `null_annotation_setter` | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** [`FullTransformerBatch`](/api/transformer#fulltransformerbatch). The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. | [TransformerModel](/api/architectures#TransformerModel) | ```python https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py @@ -86,18 +93,22 @@ https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/p > trf = Transformer(nlp.vocab, model) > ``` -Create a new pipeline instance. In your application, you would normally use a -shortcut for this and instantiate the component using its string name and -[`nlp.add_pipe`](/api/language#create_pipe). +Construct a `Transformer` component. One or more subsequent spaCy components can +use the transformer outputs as features in its model, with gradients +backpropagated to the single shared weights. The activations from the +transformer are saved in the [`Doc._.trf_data`](#custom-attributes) extension +attribute. You can also provide a callback to set additional annotations. In +your application, you would normally use a shortcut for this and instantiate the +component using its string name and [`nlp.add_pipe`](/api/language#create_pipe). -| Name | Type | Description | -| ------------------- | ------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | -| `annotation_setter` | `Callable` | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. Defaults to `null_annotation_setter`, a function that does nothing. | -| _keyword-only_ | | | -| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | -| `max_batch_items` | int | Maximum size of a padded batch. Defaults to `128*32`. 
| +| Name | Type | Description | +| ------------------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The shared vocabulary. | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** [`FullTransformerBatch`](/api/transformer#fulltransformerbatch). The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. | +| `annotation_setter` | `Callable` | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. By default, no additional annotations are set. | +| _keyword-only_ | | | +| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | +| `max_batch_items` | int | Maximum size of a padded batch. Defaults to `128*32`. | ## Transformer.\_\_call\_\_ {#call tag="method"} @@ -184,7 +195,10 @@ Apply the pipeline's model to a batch of docs, without modifying them. ## Transformer.set_annotations {#set_annotations tag="method"} -Modify a batch of documents, using pre-computed scores. +Assign the extracted features to the Doc objects. By default, the +[`TransformerData`](/api/transformer#transformerdata) object is written to the +[`Doc._.trf_data`](#custom-attributes) attribute. Your annotation_setter +callback is then called, if provided. > #### Example > @@ -201,8 +215,19 @@ Modify a batch of documents, using pre-computed scores. ## Transformer.update {#update tag="method"} -Learn from a batch of documents and gold-standard information, updating the -pipe's model. Delegates to [`predict`](/api/transformer#predict). +Prepare for an update to the transformer. Like the [`Tok2Vec`](/api/tok2vec) +component, the `Transformer` component is unusual in that it does not receive +"gold standard" annotations to calculate a weight update. The optimal output of +the transformer data is unknown – it's a hidden layer inside the network that is +updated by backpropagating from output layers. + +The `Transformer` component therefore does **not** perform a weight update +during its own `update` method. Instead, it runs its transformer model and +communicates the output and the backpropagation callback to any **downstream +components** that have been connected to it via the +[TransformerListener](/api/architectures#TransformerListener) sublayer. If there +are multiple listeners, the last layer will actually backprop to the transformer +and call the optimizer, while the others simply increment the gradients. > #### Example > @@ -212,15 +237,15 @@ pipe's model. Delegates to [`predict`](/api/transformer#predict). > losses = trf.update(examples, sgd=optimizer) > ``` -| Name | Type | Description | -| ----------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. 
| -| _keyword-only_ | | | -| `drop` | float | The dropout rate. | -| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/transformer#set_annotations). | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | -| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | -| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | +| Name | Type | Description | +| ----------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. | +| _keyword-only_ | | | +| `drop` | float | The dropout rate. | +| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/transformer#set_annotations). | +| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | +| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | ## Transformer.create_optimizer {#create_optimizer tag="method"} @@ -394,21 +419,23 @@ Split a `TransformerData` object that represents a batch into a list with one | ----------- | ----------------------- | ----------- | | **RETURNS** | `List[TransformerData]` | | -## Span getters {#span_getters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"} - - +## Span getters {#span_getters source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"} Span getters are functions that take a batch of [`Doc`](/api/doc) objects and return a lists of [`Span`](/api/span) objects for each doc, to be processed by -the transformer. The returned spans can overlap. Span getters can be referenced -in the config's `[components.transformer.model.get_spans]` block to customize -the sequences processed by the transformer. You can also register custom span -getters using the `@registry.span_getters` decorator. +the transformer. This is used to manage long documents, by cutting them into +smaller sequences before running the transformer. The spans are allowed to +overlap, and you can also omit sections of the Doc if they are not relevant. + +Span getters can be referenced in the `[components.transformer.model.get_spans]` +block of the config to customize the sequences processed by the transformer. You +can also register custom span getters using the `@spacy.registry.span_getters` +decorator. > #### Example > > ```python -> @registry.span_getters("sent_spans.v1") +> @spacy.registry.span_getters("sent_spans.v1") > def configure_get_sent_spans() -> Callable: > def get_sent_spans(docs: Iterable[Doc]) -> List[List[Span]]: > return [list(doc.sents) for doc in docs] @@ -421,15 +448,55 @@ getters using the `@registry.span_getters` decorator. | `docs` | `Iterable[Doc]` | A batch of `Doc` objects. | | **RETURNS** | `List[List[Span]]` | The spans to process by the transformer. 
| -The following built-in functions are available: +### doc_spans.v1 {#doc_spans tag="registered function"} - +> #### Example config +> +> ```ini +> [transformer.model.get_spans] +> @span_getters = "doc_spans.v1" +> ``` -| Name | Description | -| ------------------ | ------------------------------------------------------------------ | -| `doc_spans.v1` | Create a span for each doc (no transformation, process each text). | -| `sent_spans.v1` | Create a span for each sentence if sentence boundaries are set. | -| `strided_spans.v1` | | +Create a span getter that uses the whole document as its spans. This is the best +approach if your [`Doc`](/api/doc) objects already refer to relatively short +texts. + +### sent_spans.v1 {#sent_spans tag="registered function"} + +> #### Example config +> +> ```ini +> [transformer.model.get_spans] +> @span_getters = "sent_spans.v1" +> ``` + +Create a span getter that uses sentence boundary markers to extract the spans. +This requires sentence boundaries to be set (e.g. by the +[`Sentencizer`](/api/sentencizer)), and may result in somewhat uneven batches, +depending on the sentence lengths. However, it does provide the transformer with +more meaningful windows to attend over. + +### strided_spans.v1 {#strided_spans tag="registered function"} + +> #### Example config +> +> ```ini +> [transformer.model.get_spans] +> @span_getters = "strided_spans.v1" +> window = 128 +> stride = 96 +> ``` + +Create a span getter for strided spans. If you set the `window` and `stride` to +the same value, the spans will cover each token once. Setting `stride` lower +than `window` will allow for an overlap, so that some tokens are counted twice. +This can be desirable, because it allows all tokens to have both a left and +right context. + +| Name | Type | Description | +| --------- | ---- | ---------------- | +|  `window` | int | The window size. | +| `stride` | int | The stride size. | ## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"} diff --git a/website/docs/usage/101/_architecture.md b/website/docs/usage/101/_architecture.md index d321b7cb9..42ba44dc8 100644 --- a/website/docs/usage/101/_architecture.md +++ b/website/docs/usage/101/_architecture.md @@ -1,54 +1,88 @@ -The central data structures in spaCy are the `Doc` and the `Vocab`. The `Doc` -object owns the **sequence of tokens** and all their annotations. The `Vocab` -object owns a set of **look-up tables** that make common information available -across documents. By centralizing strings, word vectors and lexical attributes, -we avoid storing multiple copies of this data. This saves memory, and ensures -there's a **single source of truth**. +The central data structures in spaCy are the [`Language`](/api/language) class, +the [`Vocab`](/api/vocab) and the [`Doc`](/api/doc) object. The `Language` class +is used to process a text and turn it into a `Doc` object. It's typically stored +as a variable called `nlp`. The `Doc` object owns the **sequence of tokens** and +all their annotations. By centralizing strings, word vectors and lexical +attributes in the `Vocab`, we avoid storing multiple copies of this data. This +saves memory, and ensures there's a **single source of truth**. Text annotations are also designed to allow a single source of truth: the `Doc` -object owns the data, and `Span` and `Token` are **views that point into it**. 
-The `Doc` object is constructed by the `Tokenizer`, and then **modified in -place** by the components of the pipeline. The `Language` object coordinates -these components. It takes raw text and sends it through the pipeline, returning -an **annotated document**. It also orchestrates training and serialization. +object owns the data, and [`Span`](/api/span) and [`Token`](/api/token) are +**views that point into it**. The `Doc` object is constructed by the +[`Tokenizer`](/api/tokenizer), and then **modified in place** by the components +of the pipeline. The `Language` object coordinates these components. It takes +raw text and sends it through the pipeline, returning an **annotated document**. +It also orchestrates training and serialization. - + ![Library architecture](../../images/architecture.svg) ### Container objects {#architecture-containers} -| Name | Description | -| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [`Doc`](/api/doc) | A container for accessing linguistic annotations. | -| [`Span`](/api/span) | A slice from a `Doc` object. | -| [`Token`](/api/token) | An individual token — i.e. a word, punctuation symbol, whitespace, etc. | -| [`Lexeme`](/api/lexeme) | An entry in the vocabulary. It's a word type with no context, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse etc. | -| [`MorphAnalysis`](/api/morphanalysis) | A morphological analysis. | +| Name | Description | +| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [`Language`](/api/language) | Processing class that turns text into `Doc` objects. Different languages implement their own subclasses of it. The variable is typically called `nlp`. | +| [`Doc`](/api/doc) | A container for accessing linguistic annotations. | +| [`Span`](/api/span) | A slice from a `Doc` object. | +| [`Token`](/api/token) | An individual token — i.e. a word, punctuation symbol, whitespace, etc. | +| [`Lexeme`](/api/lexeme) | An entry in the vocabulary. It's a word type with no context, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse etc. | +| [`Example`](/api/example) | A collection of training annotations, containing two `Doc` objects: the reference data and the predictions. | +| [`DocBin`](/api/docbin) | A collection of `Doc` objects for efficient binary serialization. Also used for [training data](/api/data-formats#binary-training). | ### Processing pipeline {#architecture-pipeline} -| Name | Description | -| ------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- | -| [`Language`](/api/language) | A text-processing pipeline. Usually you'll load this once per process as `nlp` and pass the instance around your application. | -| [`Tokenizer`](/api/tokenizer) | Segment text, and create `Doc` objects with the discovered segment boundaries. | -| [`Lemmatizer`](/api/lemmatizer) | Determine the base forms of words. | -| [`Morphology`](/api/morphology) | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag. | -| [`Tagger`](/api/tagger) | Annotate part-of-speech tags on `Doc` objects. 
| -| [`DependencyParser`](/api/dependencyparser) | Annotate syntactic dependencies on `Doc` objects. | -| [`EntityRecognizer`](/api/entityrecognizer) | Annotate named entities, e.g. persons or products, on `Doc` objects. | -| [`TextCategorizer`](/api/textcategorizer) | Assign categories or labels to `Doc` objects. | -| [`Matcher`](/api/matcher) | Match sequences of tokens, based on pattern rules, similar to regular expressions. | -| [`PhraseMatcher`](/api/phrasematcher) | Match sequences of tokens based on phrases. | -| [`EntityRuler`](/api/entityruler) | Add entity spans to the `Doc` using token-based rules or exact phrase matches. | -| [`Sentencizer`](/api/sentencizer) | Implement custom sentence boundary detection logic that doesn't require the dependency parse. | -| [Other functions](/api/pipeline-functions) | Automatically apply something to the `Doc`, e.g. to merge spans of tokens. | +The processing pipeline consists of one or more **pipeline components** that are +called on the `Doc` in order. The tokenizer runs before the components. Pipeline +components can be added using [`Language.add_pipe`](/api/language#add_pipe). +They can contain a statistical model and trained weights, or only make +rule-based modifications to the `Doc`. spaCy provides a range of built-in +components for different language processing tasks and also allows adding +[custom components](/usage/processing-pipelines#custom-components). + +![The processing pipeline](../../images/pipeline.svg) + +| Name | Description | +| ----------------------------------------------- | ------------------------------------------------------------------------------------------- | +| [`Tokenizer`](/api/tokenizer) | Segment raw text and create `Doc` objects from the words. | +| [`Tok2Vec`](/api/tok2vec) | Apply a "token-to-vector" model and set its outputs. | +| [`Transformer`](/api/transformer) | Use a transformer model and set its outputs. | +| [`Lemmatizer`](/api/lemmatizer) | Determine the base forms of words. | +| [`Morphologizer`](/api/morphologizer) | Predict morphological features and coarse-grained part-of-speech tags. | +| [`Tagger`](/api/tagger) | Predict part-of-speech tags. | +| [`AttributeRuler`](/api/attributeruler) | Set token attributes using matcher rules. | +| [`DependencyParser`](/api/dependencyparser) | Predict syntactic dependencies. | +| [`EntityRecognizer`](/api/entityrecognizer) | Predict named entities, e.g. persons or products. | +| [`EntityRuler`](/api/entityruler) | Add entity spans to the `Doc` using token-based rules or exact phrase matches. | +| [`EntityLinker`](/api/entitylinker) | Disambiguate named entities to nodes in a knowledge base. | +| [`TextCategorizer`](/api/textcategorizer) | Predict categories or labels over the whole document. | +| [`Sentencizer`](/api/sentencizer) | Implement rule-based sentence boundary detection that doesn't require the dependency parse. | +| [`SentenceRecognizer`](/api/sentencerecognizer) | Predict sentence boundaries. | +| [Other functions](/api/pipeline-functions) | Automatically apply something to the `Doc`, e.g. to merge spans of tokens. | +| [`Pipe`](/api/pipe) | Base class that all trainable pipeline components inherit from. | + +### Matchers {#architecture-matchers} + +Matchers help you find and extract information from [`Doc`](/api/doc) objects +based on match patterns describing the sequences you're looking for. A matcher +operates on a `Doc` and gives you access to the matched tokens **in context**. 
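+
+As a minimal sketch (the pattern and its name here are arbitrary examples, not
+a built-in rule set), a `Matcher` rule can be added and applied like this:
+
+```python
+import spacy
+from spacy.matcher import Matcher
+
+nlp = spacy.blank("en")
+matcher = Matcher(nlp.vocab)
+# One pattern: the lowercase form "hello", followed by a punctuation token
+matcher.add("HELLO_PUNCT", [[{"LOWER": "hello"}, {"IS_PUNCT": True}]])
+
+doc = nlp("Hello, world!")
+for match_id, start, end in matcher(doc):
+    # Each match gives you the matched tokens as a slice of the original Doc
+    print(nlp.vocab.strings[match_id], doc[start:end].text)
+```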
+
+| Name                                          | Description                                                                                                                                                                         |
+| --------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| [`Matcher`](/api/matcher)                     | Match sequences of tokens, based on pattern rules, similar to regular expressions.                                                                                                  |
+| [`PhraseMatcher`](/api/phrasematcher)         | Match sequences of tokens based on phrases.                                                                                                                                         |
+| [`DependencyMatcher`](/api/dependencymatcher) | Match sequences of tokens based on dependency trees using the [Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html). |

### Other classes {#architecture-other}

-| Name                              | Description                                                                    |
-| --------------------------------- | ------------------------------------------------------------------------------ |
-| [`Vocab`](/api/vocab)             | A lookup table for the vocabulary that allows you to access `Lexeme` objects.  |
-| [`StringStore`](/api/stringstore) | Map strings to and from hash values.                                           |
-| [`Vectors`](/api/vectors)         | Container class for vector data keyed by string.                               |
-| [`Example`](/api/example)         | Collection for training annotations.                                           |
+| Name                                  | Description                                                                                                        |
+| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
+| [`Vocab`](/api/vocab)                 | The shared vocabulary that stores strings and gives you access to [`Lexeme`](/api/lexeme) objects.                |
+| [`StringStore`](/api/stringstore)     | Map strings to and from hash values.                                                                               |
+| [`Vectors`](/api/vectors)             | Container class for vector data keyed by string.                                                                   |
+| [`Lookups`](/api/lookups)             | Container for convenient access to large lookup tables and dictionaries.                                          |
+| [`Morphology`](/api/morphology)       | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag.  |
+| [`MorphAnalysis`](/api/morphanalysis) | A morphological analysis.                                                                                          |
+| [`KnowledgeBase`](/api/kb)            | Storage for entities and aliases of a knowledge base for entity linking.                                           |
+| [`Scorer`](/api/scorer)               | Compute evaluation scores.                                                                                          |
+| [`Corpus`](/api/corpus)               | Class for managing annotated corpora for training and evaluation data.                                             |
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 5ad59482f..589cef44c 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -750,16 +750,13 @@ print([w.text for w in nlp("gimme that")]) # ['gim', 'me', 'that']

The special case doesn't have to match an entire whitespace-delimited substring.
The tokenizer will incrementally split off punctuation, and keep looking up the
-remaining substring:
+remaining substring. The special case rules also have precedence over the
+punctuation splitting.

```python
assert "gimme" not in [w.text for w in nlp("gimme!")]
assert "gimme" not in [w.text for w in nlp('("...gimme...?")')]
-```

-The special case rules have precedence over the punctuation splitting:
-
-```python
nlp.tokenizer.add_special_case("...gimme...?", [{"ORTH": "...gimme...?"}])
assert len(nlp("...gimme...?")) == 1
```
@@ -813,19 +810,6 @@ domain. There are six things you may need to define:

6. An optional boolean function `url_match`, which is similar to `token_match`
   except that prefixes and suffixes are removed before applying the match.
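
For example, these settings might be combined by building a `Tokenizer` from the
language defaults and swapping in your own `url_match` (the regular expression
below is only a simplified placeholder, not spaCy's built-in URL pattern):

```python
import re
import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex, compile_prefix_regex, compile_suffix_regex

nlp = spacy.blank("en")
# Re-use the language defaults for prefixes, suffixes, infixes and exceptions
prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
infix_re = compile_infix_regex(nlp.Defaults.infixes)
simple_url_re = re.compile(r"^https?://\S+$")  # placeholder URL pattern

nlp.tokenizer = Tokenizer(
    nlp.vocab,
    rules=nlp.Defaults.tokenizer_exceptions,
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
    url_match=simple_url_re.match,
)
# The trailing "!" is split off as a suffix before url_match is applied
print([t.text for t in nlp("Check out https://example.com!")])
```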
- - -In spaCy v2.2.2-v2.2.4, the `token_match` was equivalent to the `url_match` -above and there was no match pattern applied before prefixes and suffixes were -analyzed. As of spaCy v2.3.0, the `token_match` has been reverted to its -behavior in v2.2.1 and earlier with precedence over prefixes and suffixes. - -The `url_match` is introduced in v2.3.0 to handle cases like URLs where the -tokenizer should remove prefixes and suffixes (e.g., a comma at the end of a -URL) before applying the match. - - - You shouldn't usually need to create a `Tokenizer` subclass. Standard usage is to use `re.compile()` to build a regular expression object, and pass its `.search()` and `.finditer()` methods: @@ -905,12 +889,13 @@ function that behaves the same way. -If you're using a statistical model, writing to the `nlp.Defaults` or -`English.Defaults` directly won't work, since the regular expressions are read -from the model and will be compiled when you load it. If you modify -`nlp.Defaults`, you'll only see the effect if you call -[`spacy.blank`](/api/top-level#spacy.blank). If you want to modify the tokenizer -loaded from a statistical model, you should modify `nlp.tokenizer` directly. +If you're using a statistical model, writing to the +[`nlp.Defaults`](/api/language#defaults) or `English.Defaults` directly won't +work, since the regular expressions are read from the model and will be compiled +when you load it. If you modify `nlp.Defaults`, you'll only see the effect if +you call [`spacy.blank`](/api/top-level#spacy.blank). If you want to modify the +tokenizer loaded from a statistical model, you should modify `nlp.tokenizer` +directly. @@ -961,51 +946,50 @@ and language-specific definitions such as [`lang/de/punctuation.py`](https://github.com/explosion/spaCy/blob/master/spacy/lang/de/punctuation.py) for German. -### Hooking an arbitrary tokenizer into the pipeline {#custom-tokenizer} +### Hooking a custom tokenizer into the pipeline {#custom-tokenizer} The tokenizer is the first component of the processing pipeline and the only one that can't be replaced by writing to `nlp.pipeline`. This is because it has a different signature from all the other components: it takes a text and returns a -`Doc`, whereas all other components expect to already receive a tokenized `Doc`. +[`Doc`](/api/doc), whereas all other components expect to already receive a +tokenized `Doc`. ![The processing pipeline](../images/pipeline.svg) To overwrite the existing tokenizer, you need to replace `nlp.tokenizer` with a -custom function that takes a text, and returns a `Doc`. +custom function that takes a text, and returns a [`Doc`](/api/doc). + +> #### Creating a Doc +> +> Constructing a [`Doc`](/api/doc) object manually requires at least two +> arguments: the shared `Vocab` and a list of words. Optionally, you can pass in +> a list of `spaces` values indicating whether the token at this position is +> followed by a space (default `True`). See the section on +> [pre-tokenized text](#own-annotations) for more info. +> +> ```python +> words = ["Let", "'s", "go", "!"] +> spaces = [False, True, False, False] +> doc = Doc(nlp.vocab, words=words, spaces=spaces) +> ``` ```python -nlp = spacy.load("en_core_web_sm") +nlp = spacy.blank("en") nlp.tokenizer = my_tokenizer ``` -| Argument | Type | Description | -| ----------- | ----- | ------------------------- | -| `text` | str | The raw text to tokenize. | -| **RETURNS** | `Doc` | The tokenized document. 
| +| Argument | Type | Description | +| ----------- | ----------------- | ------------------------- | +| `text` | str | The raw text to tokenize. | +| **RETURNS** | [`Doc`](/api/doc) | The tokenized document. | - +#### Example 1: Basic whitespace tokenizer {#custom-tokenizer-example} -In spaCy v1.x, you had to add a custom tokenizer by passing it to the `make_doc` -keyword argument, or by passing a tokenizer "factory" to `create_make_doc`. This -was unnecessarily complicated. Since spaCy v2.0, you can write to -`nlp.tokenizer` instead. If your tokenizer needs the vocab, you can write a -function and use `nlp.vocab`. - -```diff -- nlp = spacy.load("en_core_web_sm", make_doc=my_tokenizer) -- nlp = spacy.load("en_core_web_sm", create_make_doc=my_tokenizer_factory) - -+ nlp.tokenizer = my_tokenizer -+ nlp.tokenizer = my_tokenizer_factory(nlp.vocab) -``` - - - -### Example: A custom whitespace tokenizer {#custom-tokenizer-example} - -To construct the tokenizer, we usually want attributes of the `nlp` pipeline. -Specifically, we want the tokenizer to hold a reference to the vocabulary -object. Let's say we have the following class as our tokenizer: +Here's an example of the most basic whitespace tokenizer. It takes the shared +vocab, so it can construct `Doc` objects. When it's called on a text, it returns +a `Doc` object consisting of the text split on single space characters. We can +then overwrite the `nlp.tokenizer` attribute with an instance of our custom +tokenizer. ```python ### {executable="true"} @@ -1017,68 +1001,189 @@ class WhitespaceTokenizer: self.vocab = vocab def __call__(self, text): - words = text.split(' ') - # All tokens 'own' a subsequent space character in this tokenizer - spaces = [True] * len(words) - return Doc(self.vocab, words=words, spaces=spaces) + words = text.split(" ") + return Doc(self.vocab, words=words) -nlp = spacy.load("en_core_web_sm") +nlp = spacy.blank("en") nlp.tokenizer = WhitespaceTokenizer(nlp.vocab) doc = nlp("What's happened to me? he thought. It wasn't a dream.") -print([t.text for t in doc]) +print([token.text for token in doc]) ``` -As you can see, we need a `Vocab` instance to construct this — but we won't have -it until we get back the loaded `nlp` object. The simplest solution is to build -the tokenizer in two steps. This also means that you can reuse the "tokenizer -factory" and initialize it with different instances of `Vocab`. +#### Example 2: Third-party tokenizers (BERT word pieces) {#custom-tokenizer-example2} -### Bringing your own annotations {#own-annotations} +You can use the same approach to plug in any other third-party tokenizers. Your +custom callable just needs to return a `Doc` object with the tokens produced by +your tokenizer. In this example, the wrapper uses the **BERT word piece +tokenizer**, provided by the +[`tokenizers`](https://github.com/huggingface/tokenizers) library. The tokens +available in the `Doc` object returned by spaCy now match the exact word pieces +produced by the tokenizer. -spaCy generally assumes by default that your data is raw text. However, +> #### 💡 Tip: spacy-transformers +> +> If you're working with transformer models like BERT, check out the +> [`spacy-transformers`](https://github.com/explosion/spacy-transformers) +> extension package and [documentation](/usage/transformers). It includes a +> pipeline component for using pretrained transformer weights and **training +> transformer models** in spaCy, as well as helpful utilities for aligning word +> pieces to linguistic tokenization. 
+
+```python
+### Custom BERT word piece tokenizer
+from tokenizers import BertWordPieceTokenizer
+from spacy.tokens import Doc
+import spacy
+
+class BertTokenizer:
+    def __init__(self, vocab, vocab_file, lowercase=True):
+        self.vocab = vocab
+        self._tokenizer = BertWordPieceTokenizer(vocab_file, lowercase=lowercase)
+
+    def __call__(self, text):
+        tokens = self._tokenizer.encode(text)
+        words = []
+        spaces = []
+        for i, (word, (start, end)) in enumerate(zip(tokens.tokens, tokens.offsets)):
+            words.append(word)
+            if i < len(tokens.tokens) - 1:
+                # If next start != current end we assume a space in between
+                next_start, next_end = tokens.offsets[i + 1]
+                spaces.append(next_start > end)
+            else:
+                spaces.append(True)
+        return Doc(self.vocab, words=words, spaces=spaces)
+
+nlp = spacy.blank("en")
+nlp.tokenizer = BertTokenizer(nlp.vocab, "bert-base-uncased-vocab.txt")
+doc = nlp("Justin Drew Bieber is a Canadian singer, songwriter, and actor.")
+print(doc.text, [token.text for token in doc])
+# [CLS]justin drew bi##eber is a canadian singer, songwriter, and actor.[SEP]
+# ['[CLS]', 'justin', 'drew', 'bi', '##eber', 'is', 'a', 'canadian', 'singer',
+#  ',', 'songwriter', ',', 'and', 'actor', '.', '[SEP]']
+```
+
+
+
+Keep in mind that your model's result may be less accurate if the tokenization
+during training differs from the tokenization at runtime. So if you modify a
+pretrained model's tokenization afterwards, it may produce very different
+predictions. You should therefore train your model with the **same tokenizer**
+it will be using at runtime. See the docs on
+[training with custom tokenization](#custom-tokenizer-training) for details.
+
+
+
+#### Training with custom tokenization {#custom-tokenizer-training new="3"}
+
+spaCy's [training config](/usage/training#config) describes the settings,
+hyperparameters, pipeline and tokenizer used for constructing and training the
+model. The `[nlp.tokenizer]` block refers to a **registered function** that
+takes the `nlp` object and returns a tokenizer. Here, we're registering a
+function called `whitespace_tokenizer` in the
+[`@tokenizers` registry](/api/registry). To make sure spaCy knows how to
+construct your tokenizer during training, you can pass in your Python file by
+setting `--code functions.py` when you run [`spacy train`](/api/cli#train).
+
+> #### config.cfg
+>
+> ```ini
+> [nlp.tokenizer]
+> @tokenizers = "whitespace_tokenizer"
+> ```
+
+```python
+### functions.py {highlight="1"}
+@spacy.registry.tokenizers("whitespace_tokenizer")
+def create_whitespace_tokenizer():
+    def create_tokenizer(nlp):
+        return WhitespaceTokenizer(nlp.vocab)
+
+    return create_tokenizer
+```
+
+Registered functions can also take arguments that are then passed in from the
+config. This allows you to quickly change and keep track of different settings.
+Here, the registered function called `bert_word_piece_tokenizer` takes two
+arguments: the path to a vocabulary file and whether to lowercase the text. The
+Python type hints `str` and `bool` ensure that the received values have the
+correct type.
+
+> #### config.cfg
+>
+> ```ini
+> [nlp.tokenizer]
+> @tokenizers = "bert_word_piece_tokenizer"
+> vocab_file = "bert-base-uncased-vocab.txt"
+> lowercase = true
+> ```
+
+```python
+### functions.py {highlight="1"}
+@spacy.registry.tokenizers("bert_word_piece_tokenizer")
+def create_bert_word_piece_tokenizer(vocab_file: str, lowercase: bool):
+    def create_tokenizer(nlp):
+        return BertTokenizer(nlp.vocab, vocab_file, lowercase)
+
+    return create_tokenizer
+```
+
+To avoid hard-coding local paths into your config file, you can also set the
+vocab path on the CLI by using the `--nlp.tokenizer.vocab_file`
+[override](/usage/training#config-overrides) when you run
+[`spacy train`](/api/cli#train). For more details on using registered functions,
+see the docs in [training with custom code](/usage/training#custom-code).
+
+
+
+Remember that a registered function should always be a function that spaCy
+**calls to create something**, not the "something" itself. In this case, it
+**creates a function** that takes the `nlp` object and returns a callable that
+takes a text and returns a `Doc`.
+
+
+
+#### Using pre-tokenized text {#own-annotations}
+
+spaCy generally assumes by default that your data is **raw text**. However,
sometimes your data is partially annotated, e.g. with pre-existing tokenization,
-part-of-speech tags, etc. The most common situation is that you have pre-defined
-tokenization. If you have a list of strings, you can create a `Doc` object
-directly. Optionally, you can also specify a list of boolean values, indicating
-whether each word has a subsequent space.
+part-of-speech tags, etc. The most common situation is that you have
+**pre-defined tokenization**. If you have a list of strings, you can create a
+[`Doc`](/api/doc) object directly. Optionally, you can also specify a list of
+boolean values, indicating whether each word is followed by a space.
+
+> #### ✏️ Things to try
+>
+> 1. Change a boolean value in the list of `spaces`. You should see it reflected
+>    in the `doc.text` and whether the token is followed by a space.
+> 2. Remove `spaces=spaces` from the `Doc`. You should see that every token is
+>    now followed by a space.
+> 3. Copy-paste a random sentence from the internet and manually construct a
+>    `Doc` with `words` and `spaces` so that the `doc.text` matches the original
+>    input text.

```python
### {executable="true"}
import spacy
from spacy.tokens import Doc
-from spacy.lang.en import English

-nlp = English()
-doc = Doc(nlp.vocab, words=["Hello", ",", "world", "!"],
-          spaces=[False, True, False, False])
+nlp = spacy.blank("en")
+words = ["Hello", ",", "world", "!"]
+spaces = [False, True, False, False]
+doc = Doc(nlp.vocab, words=words, spaces=spaces)
+print(doc.text)
print([(t.text, t.text_with_ws, t.whitespace_) for t in doc])
```

-If provided, the spaces list must be the same length as the words list. The
+If provided, the spaces list must be the **same length** as the words list. The
spaces list affects the `doc.text`, `span.text`, `token.idx`, `span.start_char`
and `span.end_char` attributes. If you don't provide a `spaces` sequence, spaCy
-will assume that all words are whitespace delimited.
+will assume that all words are followed by a space. Once you have a
+[`Doc`](/api/doc) object, you can write to its attributes to set the
+part-of-speech tags, syntactic dependencies, named entities and other
+attributes.
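+
+For example, a named entity can be added to a manually constructed `Doc` by
+assigning a `Span` to `doc.ents` (the example text and labels are arbitrary,
+this is only a sketch of the workflow):
+
+```python
+import spacy
+from spacy.tokens import Doc, Span
+
+nlp = spacy.blank("en")
+words = ["Apple", "is", "opening", "an", "office", "in", "Berlin", "."]
+spaces = [True, True, True, True, True, True, False, False]
+doc = Doc(nlp.vocab, words=words, spaces=spaces)
+# Mark "Apple" as an ORG entity and "Berlin" as a GPE entity
+doc.ents = [Span(doc, 0, 1, label="ORG"), Span(doc, 6, 7, label="GPE")]
+print([(ent.text, ent.label_) for ent in doc.ents])
+```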
-```python -### {executable="true"} -import spacy -from spacy.tokens import Doc -from spacy.lang.en import English - -nlp = English() -bad_spaces = Doc(nlp.vocab, words=["Hello", ",", "world", "!"]) -good_spaces = Doc(nlp.vocab, words=["Hello", ",", "world", "!"], - spaces=[False, True, False, False]) - -print(bad_spaces.text) # 'Hello , world !' -print(good_spaces.text) # 'Hello, world!' -``` - -Once you have a [`Doc`](/api/doc) object, you can write to its attributes to set -the part-of-speech tags, syntactic dependencies, named entities and other -attributes. For details, see the respective usage pages. - -### Aligning tokenization {#aligning-tokenization} +#### Aligning tokenization {#aligning-tokenization} spaCy's tokenization is non-destructive and uses language-specific rules optimized for compatibility with treebank annotations. Other tools and resources diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 2e07eff48..d7c3d49f8 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -979,8 +979,8 @@ added via [`nlp.add_pipe`](/api/language#add_pipe). When the `nlp` object is called on a text, it will find matches in the `doc` and add them as entities to the `doc.ents`, using the specified pattern label as the entity label. If any matches were to overlap, the pattern matching most tokens takes priority. If -they also happen to be equally long, then the match occuring first in the Doc is -chosen. +they also happen to be equally long, then the match occurring first in the `Doc` +is chosen. ```python ### {executable="true"} diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index d20d87863..36f934e96 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -6,25 +6,97 @@ menu: - ['New Features', 'features'] - ['Backwards Incompatibilities', 'incompat'] - ['Migrating from v2.x', 'migrating'] - - ['Migrating plugins', 'plugins'] --- ## Summary {#summary} ## New Features {#features} +### New training workflow and config system {#features-training} + +### Transformer-based pipelines {#features-transformers} + +### Custom models using any framework {#feautres-custom-models} + +### Manage end-to-end workflows with projects {#features-projects} + +### New built-in pipeline components {#features-pipeline-components} + +| Name | Description | +| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [`SentenceRecognizer`](/api/sentencerecognizer) | Trainable component for sentence segmentation. | +| [`Morphologizer`](/api/morphologizer) | Trainable component to predict morphological features. | +| [`Lemmatizer`](/api/lemmatizer) | Standalone component for rule-based and lookup lemmatization. | +| [`AttributeRuler`](/api/attributeruler) | Component for setting token attributes using match patterns. | +| [`Transformer`](/api/transformer) | Component for using [transformer models](/usage/transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). 
|
+
+### New and improved pipeline component APIs {#features-components}
+
+- `Language.factory`, `Language.component`
+- `Language.analyze_pipes`
+- Adding components from other models
+
+### Type hints and type-based data validation {#features-types}
+
+spaCy v3.0 officially drops support for Python 2 and now requires **Python
+3.6+**. This also means that the code base can take full advantage of
+[type hints](https://docs.python.org/3/library/typing.html). spaCy's user-facing
+API that's implemented in pure Python (as opposed to Cython) now comes with type
+hints. The new version of spaCy's machine learning library
+[Thinc](https://thinc.ai) also features extensive
+[type support](https://thinc.ai/docs/usage-type-checking/), including custom
+types for models and arrays, and a custom `mypy` plugin that can be used to
+type-check model definitions.
+
+For data validation, spaCy v3.0 adopts
+[`pydantic`](https://github.com/samuelcolvin/pydantic). It also powers the data
+validation of Thinc's [config system](https://thinc.ai/docs/usage-config), which
+lets you register **custom functions with typed arguments**, reference them
+in your config and see validation errors if the argument values don't match.
+
+### CLI
+
+| Name                                    | Description                                                                                               |
+| --------------------------------------- | ----------------------------------------------------------------------------------------------------------- |
+| [`init config`](/api/cli#init-config)   | Initialize a [training config](/usage/training) file for a blank language or auto-fill a partial config.  |
+| [`debug config`](/api/cli#debug-config) | Debug a [training config](/usage/training) file and show validation errors.                               |
+| [`project`](/api/cli#project)           | Subcommand for cloning and running [spaCy projects](/usage/projects).                                     |
+
 ## Backwards Incompatibilities {#incompat}

-### Removed or renamed objects, methods, attributes and arguments {#incompat-removed}
+As always, we've tried to keep the breaking changes to a minimum and focus on
+changes that were necessary to support the new features, fix problems or improve
+usability. The following section lists the relevant changes to the user-facing
+API. For specific examples of how to rewrite your code, check out the
+[migration guide](#migrating).

-| Removed                                                   | Replacement                               |
-| ---------------------------------------------------------- | ----------------------------------------- |
-| `GoldParse`                                                | [`Example`](/api/example)                 |
-| `GoldCorpus`                                               | [`Corpus`](/api/corpus)                   |
-| `spacy debug-data`                                         | [`spacy debug data`](/api/cli#debug-data) |
-| `spacy link`, `util.set_data_path`, `util.get_data_path`  | not needed, model symlinks are deprecated |
+### Compatibility {#incompat-compat}

-### Removed deprecated methods, attributes and arguments {#incompat-removed-deprecated}
+- spaCy now requires **Python 3.6+**.
+
+### API changes {#incompat-api}
+
+- [`Language.add_pipe`](/api/language#add_pipe) now takes the **string name** of
+  the component factory instead of the component function.
+- **Custom pipeline components** now need to be decorated with the
+  [`@Language.component`](/api/language#component) or
+  [`@Language.factory`](/api/language#factory) decorator.
+- [`Language.update`](/api/language#update) now takes a batch of
+  [`Example`](/api/example) objects instead of raw texts and annotations, or
+  `Doc` and `GoldParse` objects.
+- The `Language.disable_pipes` contextmanager has been replaced by
+  [`Language.select_pipes`](/api/language#select_pipes), which can explicitly
+  disable or enable components, as shown in the sketch below.
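+
+As a sketch of the new component API (the component and its name are only an
+illustration, not a built-in), a stateless custom component is now registered
+and added by its string name:
+
+```python
+import spacy
+from spacy.language import Language
+
+@Language.component("my_component")
+def my_component(doc):
+    # Modify the Doc in place and return it
+    return doc
+
+nlp = spacy.blank("en")
+nlp.add_pipe("my_component")  # v3: add the component by its string name
+
+# Temporarily run only the tokenizer and this component
+with nlp.select_pipes(enable=["my_component"]):
+    doc = nlp("This text only runs through the enabled components.")
+```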
+ +### Removed or renamed API {#incompat-removed} + +| Removed | Replacement | +| -------------------------------------------------------- | ----------------------------------------------------- | +| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes) | +| `GoldParse` | [`Example`](/api/example) | +| `GoldCorpus` | [`Corpus`](/api/corpus) | +| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) | +| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, model symlinks are deprecated | The following deprecated methods, attributes and arguments were removed in v3.0. Most of them have been **deprecated for a while** and many would previously @@ -214,17 +286,14 @@ python -m spacy package ./model ./packages - python setup.py sdist ``` -## Migration notes for plugin maintainers {#plugins} +#### Migration notes for plugin maintainers {#migrating-plugins} Thanks to everyone who's been contributing to the spaCy ecosystem by developing and maintaining one of the many awesome [plugins and extensions](/universe). -We've tried to keep breaking changes to a minimum and make it as easy as -possible for you to upgrade your packages for spaCy v3. - -### Custom pipeline components - -The most common use case for plugins is providing pipeline components and -extension attributes. +We've tried to make it as easy as possible for you to upgrade your packages for +spaCy v3. The most common use case for plugins is providing pipeline components +and extension attributes. When migrating your plugin, double-check the +following: - Use the [`@Language.factory`](/api/language#factory) decorator to register your component and assign it a name. This allows users to refer to your diff --git a/website/src/components/code.js b/website/src/components/code.js index a51986634..952014ed5 100644 --- a/website/src/components/code.js +++ b/website/src/components/code.js @@ -11,7 +11,7 @@ import Link from './link' import GitHubCode from './github' import classes from '../styles/code.module.sass' -const WRAP_THRESHOLD = 15 +const WRAP_THRESHOLD = 16 export default props => (