Merge branch 'develop' into nightly.spacy.io

This commit is contained in:
Ines Montani 2020-08-10 00:45:37 +02:00
commit 94da9f48de
48 changed files with 1374 additions and 499 deletions

View File

@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy-nightly"
__version__ = "3.0.0a5"
__version__ = "3.0.0a6"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@ -35,7 +35,7 @@ def pretrain_cli(
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
# fmt: on
):

View File

@ -1,4 +1,4 @@
from typing import Union, Iterator, Iterable, Sequence, TypeVar, List, Callable
from typing import Union, Iterable, Sequence, TypeVar, List, Callable
from typing import Optional, Any
from functools import partial
import itertools
@ -19,6 +19,22 @@ def configure_minibatch_by_padded_size(
discard_oversize: bool,
get_length: Optional[Callable[[ItemT], int]] = None
) -> BatcherT:
"""Create a batcher that uses the `batch_by_padded_size` strategy.
The padded size is defined as the maximum length of sequences within the
batch multiplied by the number of sequences in the batch.
size (int or Iterable[int]): The largest padded size to batch sequences into.
Can be a single integer, or a sequence, allowing for variable batch sizes.
buffer (int): The number of sequences to accumulate before sorting by length.
A larger buffer will result in more even sizing, but if the buffer is
very large, the iteration order will be less random, which can result
in suboptimal training.
discard_oversize (bool): Whether to discard sequences that are by themselves
longer than the largest padded batch size.
get_length (Callable or None): Function to get the length of a sequence item.
The `len` function is used by default.
"""
# Avoid displacing optional values from the underlying function.
optionals = {"get_length": get_length} if get_length is not None else {}
return partial(
@ -38,6 +54,16 @@ def configure_minibatch_by_words(
discard_oversize: bool,
get_length: Optional[Callable[[ItemT], int]] = None
) -> BatcherT:
"""Create a batcher that uses the "minibatch by words" strategy.
size (int or Iterable[int]): The target number of words per batch.
Can be a single integer, or a sequence, allowing for variable batch sizes.
tolerance (float): What percentage of the size to allow batches to exceed.
discard_oversize (bool): Whether to discard sequences that by themselves
exceed the tolerated size.
get_length (Callable or None): Function to get the length of a sequence
item. The `len` function is used by default.
"""
optionals = {"get_length": get_length} if get_length is not None else {}
return partial(
minibatch_by_words, size=size, discard_oversize=discard_oversize, **optionals
@ -48,22 +74,43 @@ def configure_minibatch_by_words(
def configure_minibatch(
size: Sizing, get_length: Optional[Callable[[ItemT], int]] = None
) -> BatcherT:
"""Create a batcher that creates batches of the specified size.
size (int or Iterable[int]): The target number of items per batch.
Can be a single integer, or a sequence, allowing for variable batch sizes.
"""
optionals = {"get_length": get_length} if get_length is not None else {}
return partial(minibatch, size=size, **optionals)
def minibatch_by_padded_size(
docs: Iterator["Doc"],
seqs: Iterable[ItemT],
size: Sizing,
buffer: int = 256,
discard_oversize: bool = False,
get_length: Callable = len,
) -> Iterator[Iterator["Doc"]]:
) -> Iterable[List[ItemT]]:
"""Minibatch a sequence by the size of padded batches that would result,
with sequences binned by length within a window.
The padded size is defined as the maximum length of sequences within the
batch multiplied by the number of sequences in the batch.
size (int): The largest padded size to batch sequences into.
buffer (int): The number of sequences to accumulate before sorting by length.
A larger buffer will result in more even sizing, but if the buffer is
very large, the iteration order will be less random, which can result
in suboptimal training.
discard_oversize (bool): Whether to discard sequences that are by themselves
longer than the largest padded batch size.
get_length (Callable or None): Function to get the length of a sequence item.
The `len` function is used by default.
"""
if isinstance(size, int):
size_ = itertools.repeat(size)
else:
size_ = size
for outer_batch in minibatch(docs, size=buffer):
for outer_batch in minibatch(seqs, size=buffer):
outer_batch = list(outer_batch)
target_size = next(size_)
for indices in _batch_by_length(outer_batch, target_size, get_length):
@ -76,12 +123,24 @@ def minibatch_by_padded_size(
def minibatch_by_words(
docs, size, tolerance=0.2, discard_oversize=False, get_length=len
):
seqs: Iterable[ItemT],
size: Sizing,
tolerance=0.2,
discard_oversize=False,
get_length=len,
) -> Iterable[List[ItemT]]:
"""Create minibatches of roughly a given number of words. If any examples
are longer than the specified batch length, they will appear in a batch by
themselves, or be discarded if discard_oversize=True.
The argument 'docs' can be a list of strings, Docs or Examples.
seqs (Iterable[Sequence]): The sequences to minibatch.
size (int or Iterable[int]): The target number of words per batch.
Can be a single integer, or a sequence, allowing for variable batch sizes.
tolerance (float): What percentage of the size to allow batches to exceed.
discard_oversize (bool): Whether to discard sequences that by themselves
exceed the tolerated size.
get_length (Callable or None): Function to get the length of a sequence
item. The `len` function is used by default.
"""
if isinstance(size, int):
size_ = itertools.repeat(size)
@ -95,20 +154,20 @@ def minibatch_by_words(
overflow = []
batch_size = 0
overflow_size = 0
for doc in docs:
n_words = get_length(doc)
for seq in seqs:
n_words = get_length(seq)
# if the current example exceeds the maximum batch size, it is returned separately
# but only if discard_oversize=False.
if n_words > target_size + tol_size:
if not discard_oversize:
yield [doc]
yield [seq]
# add the example to the current batch if there's no overflow yet and it still fits
elif overflow_size == 0 and (batch_size + n_words) <= target_size:
batch.append(doc)
batch.append(seq)
batch_size += n_words
# add the example to the overflow buffer if it fits in the tolerance margin
elif (batch_size + overflow_size + n_words) <= (target_size + tol_size):
overflow.append(doc)
overflow.append(seq)
overflow_size += n_words
# yield the previous batch and start a new one. The new one gets the overflow examples.
else:
@ -122,11 +181,11 @@ def minibatch_by_words(
overflow_size = 0
# this example still fits
if (batch_size + n_words) <= target_size:
batch.append(doc)
batch.append(seq)
batch_size += n_words
# this example fits in overflow
elif (batch_size + n_words) <= (target_size + tol_size):
overflow.append(doc)
overflow.append(seq)
overflow_size += n_words
# this example does not fit with the previous overflow: start another new batch
else:
@ -134,7 +193,7 @@ def minibatch_by_words(
yield batch
target_size = next(size_)
tol_size = target_size * tolerance
batch = [doc]
batch = [seq]
batch_size = n_words
batch.extend(overflow)
if batch:
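
The docstrings above define the padded-size strategy as the longest sequence in a batch multiplied by the number of sequences, and the by-words strategy as a word budget plus a tolerance margin. A minimal, self-contained sketch of the padded-size arithmetic (illustrative only, not the library code; the helper name `padded_size` is invented here):

```python
# Illustrative sketch of the "padded size" definition from the docstring:
# the longest sequence in the batch times the number of sequences.
def padded_size(batch, get_length=len):
    return max(get_length(seq) for seq in batch) * len(batch)

seqs = [["tok"] * n for n in (3, 4, 10, 2)]
print(padded_size(seqs[:2]))  # max(3, 4) * 2 == 8
print(padded_size(seqs))      # max(3, 4, 10, 2) * 4 == 40
```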

View File

@ -1,5 +1,3 @@
from typing import Optional
from ...pipeline import Lemmatizer
from ...tokens import Token

View File

@ -27,7 +27,6 @@ from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
from .tokens import Doc
from .lookups import load_lookups
from .tokenizer import Tokenizer
from .errors import Errors, Warnings
from .schemas import ConfigSchema
@ -1439,10 +1438,7 @@ class Language:
or lang_cls is not cls
):
raise ValueError(Errors.E943.format(value=type(lang_cls)))
nlp = lang_cls(
vocab=vocab,
create_tokenizer=create_tokenizer,
)
nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer)
if after_creation is not None:
nlp = after_creation(nlp)
if not isinstance(nlp, cls):

View File

@ -68,11 +68,11 @@ cdef class DependencyMatcher:
key (str): The match ID.
RETURNS (bool): Whether the matcher contains rules for this match ID.
"""
return self._normalize_key(key) in self._patterns
return self.has_key(key)
def validateInput(self, pattern, key):
def validate_input(self, pattern, key):
idx = 0
visitedNodes = {}
visited_nodes = {}
for relation in pattern:
if "PATTERN" not in relation or "SPEC" not in relation:
raise ValueError(Errors.E098.format(key=key))
@ -83,7 +83,7 @@ cdef class DependencyMatcher:
and "NBOR_NAME" not in relation["SPEC"]
):
raise ValueError(Errors.E099.format(key=key))
visitedNodes[relation["SPEC"]["NODE_NAME"]] = True
visited_nodes[relation["SPEC"]["NODE_NAME"]] = True
else:
if not(
"NODE_NAME" in relation["SPEC"]
@ -92,22 +92,28 @@ cdef class DependencyMatcher:
):
raise ValueError(Errors.E100.format(key=key))
if (
relation["SPEC"]["NODE_NAME"] in visitedNodes
or relation["SPEC"]["NBOR_NAME"] not in visitedNodes
relation["SPEC"]["NODE_NAME"] in visited_nodes
or relation["SPEC"]["NBOR_NAME"] not in visited_nodes
):
raise ValueError(Errors.E101.format(key=key))
visitedNodes[relation["SPEC"]["NODE_NAME"]] = True
visitedNodes[relation["SPEC"]["NBOR_NAME"]] = True
visited_nodes[relation["SPEC"]["NODE_NAME"]] = True
visited_nodes[relation["SPEC"]["NBOR_NAME"]] = True
idx = idx + 1
def add(self, key, patterns, *_patterns, on_match=None):
"""Add a new matcher rule to the matcher.
key (str): The match ID.
patterns (list): The patterns to add for the given key.
on_match (callable): Optional callback executed on match.
"""
if patterns is None or hasattr(patterns, "__call__"): # old API
on_match = patterns
patterns = _patterns
for pattern in patterns:
if len(pattern) == 0:
raise ValueError(Errors.E012.format(key=key))
self.validateInput(pattern,key)
self.validate_input(pattern, key)
key = self._normalize_key(key)
_patterns = []
for pattern in patterns:
@ -187,8 +193,7 @@ cdef class DependencyMatcher:
key (string or int): The key to check.
RETURNS (bool): Whether the matcher has the rule.
"""
key = self._normalize_key(key)
return key in self._patterns
return self._normalize_key(key) in self._patterns
def get(self, key, default=None):
"""Retrieve the pattern stored for a key.
@ -202,6 +207,13 @@ cdef class DependencyMatcher:
return (self._callbacks[key], self._patterns[key])
def __call__(self, Doc doc):
"""Find all token sequences matching the supplied pattern.
doclike (Doc or Span): The document to match over.
RETURNS (list): A list of `(key, start, end)` tuples,
describing the matches. A match tuple describes a span
`doc[start:end]`. The `label_id` and `key` are both integers.
"""
matched_key_trees = []
matches = self.token_matcher(doc)
for key in list(self._patterns.keys()):
@ -241,25 +253,25 @@ cdef class DependencyMatcher:
on_match(self, doc, i, matched_key_trees)
return matched_key_trees
def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visitedNodes,matched_trees):
def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visited_nodes,matched_trees):
cdef bool isValid;
if(patternLength == len(id_to_position.keys())):
isValid = True
for node in range(patternLength):
if(node in tree):
for idx, (relop,nbor) in enumerate(tree[node]):
computed_nbors = numpy.asarray(_node_operator_map[visitedNodes[node]][relop])
computed_nbors = numpy.asarray(_node_operator_map[visited_nodes[node]][relop])
isNbor = False
for computed_nbor in computed_nbors:
if(computed_nbor.i == visitedNodes[nbor]):
if(computed_nbor.i == visited_nodes[nbor]):
isNbor = True
isValid = isValid & isNbor
if(isValid):
matched_trees.append(visitedNodes)
matched_trees.append(visited_nodes)
return
allPatternNodes = numpy.asarray(id_to_position[patternLength])
for patternNode in allPatternNodes:
self.recurse(tree,id_to_position,_node_operator_map,patternLength+1,visitedNodes+[patternNode],matched_trees)
self.recurse(tree,id_to_position,_node_operator_map,patternLength+1,visited_nodes+[patternNode],matched_trees)
# Given a node and an edge operator, to return the list of nodes
# from the doc that belong to node+operator. This is used to store

View File

@ -70,7 +70,7 @@ cdef class Matcher:
key (str): The match ID.
RETURNS (bool): Whether the matcher contains rules for this match ID.
"""
return self._normalize_key(key) in self._patterns
return self.has_key(key)
def add(self, key, patterns, *, on_match=None, greedy: str=None):
"""Add a match-rule to the matcher. A match-rule consists of: an ID
@ -162,8 +162,7 @@ cdef class Matcher:
key (string or int): The key to check.
RETURNS (bool): Whether the matcher has the rule.
"""
key = self._normalize_key(key)
return key in self._patterns
return self._normalize_key(key) in self._patterns
def get(self, key, default=None):
"""Retrieve the pattern stored for a key.
@ -179,7 +178,7 @@ cdef class Matcher:
def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False):
"""Match a stream of documents, yielding them in turn.
docs (iterable): A stream of documents.
docs (Iterable[Union[Doc, Span]]): A stream of documents or spans.
batch_size (int): Number of documents to accumulate into a working set.
return_matches (bool): Yield the match lists along with the docs, making
results (doc, matches) tuples.
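
The updated `pipe` docstring notes that the stream may contain `Doc` or `Span` objects and that `return_matches=True` yields `(doc, matches)` tuples. A small usage sketch (the rule name and pattern are made up for illustration):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("HELLO", [[{"LOWER": "hello"}]])  # one rule, one token pattern
docs = nlp.pipe(["Hello world", "no match here"])
for doc, matches in matcher.pipe(docs, return_matches=True):
    print(doc.text, matches)  # matches are (match_id, start, end) tuples
```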

View File

@ -37,7 +37,6 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
default_config={
"moves": None,
"update_with_oracle_cut_size": 100,
"multitasks": [],
"learn_tokens": False,
"min_action_freq": 30,
"model": DEFAULT_PARSER_MODEL,
@ -51,17 +50,52 @@ def make_parser(
model: Model,
moves: Optional[list],
update_with_oracle_cut_size: int,
multitasks: Iterable,
learn_tokens: bool,
min_action_freq: int
):
"""Create a transition-based DependencyParser component. The dependency parser
jointly learns sentence segmentation and labelled dependency parsing, and can
optionally learn to merge tokens that had been over-segmented by the tokenizer.
The parser uses a variant of the non-monotonic arc-eager transition-system
described by Honnibal and Johnson (2014), with the addition of a "break"
transition to perform the sentence segmentation. Nivre's pseudo-projective
dependency transformation is used to allow the parser to predict
non-projective parses.
The parser is trained using an imitation learning objective. The parser follows
the actions predicted by the current weights, and at each state, determines
which actions are compatible with the optimal parse that could be reached
from the current state. The weights are updated such that the scores assigned
to the set of optimal actions are increased, while scores assigned to other
actions are decreased. Note that more than one action may be optimal for
a given state.
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (List[str]): A list of transition names. Inferred from the data if not
provided.
update_with_oracle_cut_size (int):
During training, cut long sequences into shorter segments by creating
intermediate states based on the gold-standard history. The model is
not very sensitive to this parameter, so you usually won't need to change
it. 100 is a good default.
learn_tokens (bool): Whether to learn to merge subtokens that are split
relative to the gold standard. Experimental.
min_action_freq (int): The minimum frequency of labelled actions to retain.
Rarer labelled actions have their label backed-off to "dep". While this
primarily affects the label accuracy, it can also affect the attachment
structure, as the labels are used to represent the pseudo-projectivity
transformation.
"""
return DependencyParser(
nlp.vocab,
model,
name,
moves=moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
multitasks=multitasks,
multitasks=[],
learn_tokens=learn_tokens,
min_action_freq=min_action_freq
)
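
As the diff shows, `multitasks` is dropped from the factory's default config while `learn_tokens` and `min_action_freq` stay configurable. A minimal sketch of creating the component with those two settings overridden, in the same style as the tests in this commit (assuming a blank English pipeline):

```python
from spacy.lang.en import English

nlp = English()
# "multitasks" is no longer part of the default config; the remaining
# settings can still be overridden explicitly if needed.
parser = nlp.create_pipe("parser", config={"learn_tokens": False, "min_action_freq": 30})
parser.add_label("nsubj")
```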

View File

@ -62,6 +62,16 @@ def make_entity_linker(
incl_prior: bool,
incl_context: bool,
):
"""Construct an EntityLinker component.
model (Model[List[Doc], Floats2d]): A model that learns document vector
representations. Given a batch of Doc objects, it should return a single
array, with one row per item in the batch.
kb (KnowledgeBase): The knowledge-base to link entities to.
labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
incl_context (bool): Whether or not to include the local context in the model.
"""
return EntityLinker(
nlp.vocab,
model,

View File

@ -75,8 +75,8 @@ class Morphologizer(Tagger):
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
labels_morph (dict): TODO:
labels_pos (dict): TODO:
labels_morph (dict): Mapping of morph + POS tags to morph labels.
labels_pos (dict): Mapping of morph + POS tags to POS tags.
DOCS: https://spacy.io/api/morphologizer#init
"""

View File

@ -35,9 +35,6 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
default_config={
"moves": None,
"update_with_oracle_cut_size": 100,
"multitasks": [],
"learn_tokens": False,
"min_action_freq": 30,
"model": DEFAULT_NER_MODEL,
},
scores=["ents_p", "ents_r", "ents_f", "ents_per_type"],
@ -50,19 +47,40 @@ def make_ner(
model: Model,
moves: Optional[list],
update_with_oracle_cut_size: int,
multitasks: Iterable,
learn_tokens: bool,
min_action_freq: int
):
"""Create a transition-based EntityRecognizer component. The entity recognizer
identifies non-overlapping labelled spans of tokens.
The transition-based algorithm used encodes certain assumptions that are
effective for "traditional" named entity recognition tasks, but may not be
a good fit for every span identification problem. Specifically, the loss
function optimizes for whole entity accuracy, so if your inter-annotator
agreement on boundary tokens is low, the component will likely perform poorly
on your problem. The transition-based algorithm also assumes that the most
decisive information about your entities will be close to their initial tokens.
If your entities are long and characterised by tokens in their middle, the
component will likely do poorly on your task.
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (list[str]): A list of transition names. Inferred from the data if not
provided.
update_with_oracle_cut_size (int):
During training, cut long sequences into shorter segments by creating
intermediate states based on the gold-standard history. The model is
not very sensitive to this parameter, so you usually won't need to change
it. 100 is a good default.
"""
return EntityRecognizer(
nlp.vocab,
model,
name,
moves=moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
multitasks=multitasks,
learn_tokens=learn_tokens,
min_action_freq=min_action_freq
multitasks=[],
min_action_freq=1,
learn_tokens=False,
)
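
For the entity recognizer, the three settings are removed from the config entirely and hardcoded in the factory call, which is why the tests in this commit now create the pipe with an empty config. A minimal sketch of the updated usage:

```python
from spacy.lang.en import English

nlp = English()
ner = nlp.create_pipe("ner", config={})  # no learn_tokens/min_action_freq/multitasks
ner.add_label("GPE")
```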
@ -74,9 +92,11 @@ cdef class EntityRecognizer(Parser):
TransitionSystem = BiluoPushDown
def add_multitask_objective(self, mt_component):
"""Register another component as a multi-task objective. Experimental."""
self._multitasks.append(mt_component)
def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
"""Setup multi-task objective components. Experimental and internal."""
# TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
for labeller in self._multitasks:
labeller.model.set_dim("nO", len(self.labels))

View File

@ -1,8 +1,9 @@
# cython: infer_types=True, profile=True, binding=True
from typing import List
import numpy
import srsly
from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
from thinc.types import Floats2d
import warnings
from ..tokens.doc cimport Doc
@ -42,7 +43,14 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
scores=["tag_acc"],
default_score_weights={"tag_acc": 1.0},
)
def make_tagger(nlp: Language, name: str, model: Model):
def make_tagger(nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]]):
"""Construct a part-of-speech tagger component.
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
the tag probabilities. The output vectors should match the number of tags
in size, and be normalized as probabilities (all scores between 0 and 1,
with the rows summing to 1).
"""
return Tagger(nlp.vocab, model, name)
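
The docstring spells out the model contract: one row of tag probabilities per token, with each row normalized to sum to 1. A toy sketch of outputs satisfying that contract (illustrative numbers, not produced by a real model):

```python
import numpy

doc_lengths = [3, 5]  # two toy "documents" with 3 and 5 tokens
n_tags = 4
# One Floats2d array per doc: shape (n_tokens, n_tags), rows summing to 1.
outputs = [numpy.full((n, n_tags), 1.0 / n_tags) for n in doc_lengths]
assert all(numpy.allclose(row.sum(), 1.0) for arr in outputs for row in arr)
```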

View File

@ -1,5 +1,6 @@
from typing import Iterable, Tuple, Optional, Dict, List, Callable, Iterator, Any
from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
from thinc.types import Floats2d
import numpy
from .pipe import Pipe
@ -69,8 +70,22 @@ subword_features = true
default_score_weights={"cats_score": 1.0},
)
def make_textcat(
nlp: Language, name: str, model: Model, labels: Iterable[str]
nlp: Language,
name: str,
model: Model[List[Doc], List[Floats2d]],
labels: Iterable[str],
) -> "TextCategorizer":
"""Create a TextCategorizer compoment. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels can
be mutually exclusive (i.e. one true label per doc) or non-mutually exclusive
(i.e. zero or more labels may be true per doc). The multi-label setting is
controlled by the model instance that's provided.
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
scores for each category.
labels (list): A list of categories to learn. If empty, the model infers the
categories from the data.
"""
return TextCategorizer(nlp.vocab, model, name, labels=labels)
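
A minimal sketch of creating the component with labels supplied up front, assuming the factory's default model config is used; per the docstring, `labels` can also be left empty so the categories are inferred from the data:

```python
from spacy.lang.en import English

nlp = English()
textcat = nlp.create_pipe("textcat", config={"labels": ["POSITIVE", "NEGATIVE"]})
```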

View File

@ -32,11 +32,28 @@ def make_tok2vec(nlp: Language, name: str, model: Model) -> "Tok2Vec":
class Tok2Vec(Pipe):
"""Apply a "token-to-vector" model and set its outputs in the doc.tensor
attribute. This is mostly useful to share a single subnetwork between multiple
components, e.g. to have one embedding and CNN network shared between a
parser, tagger and NER.
In order to use the `Tok2Vec` predictions, subsequent components should use
the `Tok2VecListener` layer as the tok2vec subnetwork of their model. This
layer will read data from the `doc.tensor` attribute during prediction.
During training, the `Tok2Vec` component will save its prediction and backprop
callback for each batch, so that the subsequent components can backpropagate
to the shared weights. This implementation is used because it allows us to
avoid relying on object identity within the models to achieve the parameter
sharing.
"""
def __init__(self, vocab: Vocab, model: Model, name: str = "tok2vec") -> None:
"""Initialize a tok2vec component.
vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component.
model (thinc.api.Model[List[Doc], List[Floats2d]]):
The Thinc Model powering the pipeline component. It should take
a list of Doc objects as input, and output a list of 2d float arrays.
name (str): The component instance name.
DOCS: https://spacy.io/api/tok2vec#init
@ -48,9 +65,18 @@ class Tok2Vec(Pipe):
self.cfg = {}
def add_listener(self, listener: "Tok2VecListener") -> None:
"""Add a listener for a downstream component. Usually internals."""
self.listeners.append(listener)
def find_listeners(self, model: Model) -> None:
"""Walk over a model, looking for layers that are Tok2vecListener
subclasses that have an upstream_name that matches this component.
Listeners can also set their upstream_name attribute to the wildcard
string '*' to match any `Tok2Vec`.
You're unlikely to ever need multiple `Tok2Vec` components, so it's
fine to leave your listeners' upstream_name on '*'.
"""
for node in model.walk():
if isinstance(node, Tok2VecListener) and node.upstream_name in (
"*",
@ -59,7 +85,8 @@ class Tok2Vec(Pipe):
self.add_listener(node)
def __call__(self, doc: Doc) -> Doc:
"""Add context-sensitive embeddings to the Doc.tensor attribute.
"""Add context-sensitive embeddings to the Doc.tensor attribute, allowing
them to be used as features by downstream components.
doc (Doc): The Doc to process.
RETURNS (Doc): The processed Doc.
@ -205,11 +232,27 @@ class Tok2Vec(Pipe):
class Tok2VecListener(Model):
"""A layer that gets fed its answers from an upstream connection,
for instance from a component earlier in the pipeline.
The Tok2VecListener layer is used as a sublayer within a component such
as a parser, NER or text categorizer. Usually you'll have multiple listeners
connecting to a single upstream Tok2Vec component that's earlier in the
pipeline. The Tok2VecListener layers act as proxies, passing the predictions
from the Tok2Vec component into downstream components, and communicating
gradients back upstream.
"""
name = "tok2vec-listener"
def __init__(self, upstream_name: str, width: int) -> None:
"""
upstream_name (str): A string to identify the 'upstream' Tok2Vec component
to communicate with. The upstream name should either be the wildcard
string '*', or the name of the `Tok2Vec` component. You'll almost
never have multiple upstream Tok2Vec components, so the wildcard
string will almost always be fine.
width (int):
The width of the vectors produced by the upstream tok2vec component.
"""
Model.__init__(self, name=self.name, forward=forward, dims={"nO": width})
self.upstream_name = upstream_name
self._batch_id = None
@ -217,15 +260,25 @@ class Tok2VecListener(Model):
self._backprop = None
@classmethod
def get_batch_id(cls, inputs) -> int:
def get_batch_id(cls, inputs: List[Doc]) -> int:
"""Calculate a content-sensitive hash of the batch of documents, to check
whether the next batch of documents is unexpected.
"""
return sum(sum(token.orth for token in doc) for doc in inputs)
def receive(self, batch_id: int, outputs, backprop) -> None:
"""Store a batch of training predictions and a backprop callback. The
predictions and callback are produced by the upstream Tok2Vec component,
and later will be used when the listener's component's model is called.
"""
self._batch_id = batch_id
self._outputs = outputs
self._backprop = backprop
def verify_inputs(self, inputs) -> bool:
"""Check that the batch of Doc objects matches the ones we have a
prediction for.
"""
if self._batch_id is None and self._outputs is None:
raise ValueError(Errors.E954)
else:
@ -237,6 +290,7 @@ class Tok2VecListener(Model):
def forward(model: Tok2VecListener, inputs, is_train: bool):
"""Supply the outputs from the upstream Tok2Vec component."""
if is_train:
model.verify_inputs(inputs)
return model._outputs, model._backprop
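
The `get_batch_id` method shown above hashes the batch content by summing token orth IDs, which is what `verify_inputs` later checks against. A stripped-down sketch of the same idea over plain integer IDs (illustrative only):

```python
# Content-sensitive batch id: summing per-token ids makes it cheap to detect
# that a listener was handed a different batch than the one it has outputs for.
def get_batch_id(batches_of_token_ids):
    return sum(sum(tokens) for tokens in batches_of_token_ids)

batch = [[101, 202, 303], [404, 505]]
assert get_batch_id(batch) == get_batch_id([list(t) for t in batch])  # same content
assert get_batch_id(batch) != get_batch_id(batch[:1])                 # different batch
```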

View File

@ -426,7 +426,7 @@ class Scorer:
f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
}
if len(labels) == 2 and not multi_label and positive_label:
positive_label_f = results[f"{attr}_f_per_type"][positive_label]['f']
positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
results[f"{attr}_score"] = positive_label_f
results[f"{attr}_score_desc"] = f"F ({positive_label})"
elif not multi_label:

View File

@ -15,5 +15,7 @@ def morphology():
def test_morphology_pickle_roundtrip(morphology):
b = pickle.dumps(morphology)
reloaded_morphology = pickle.loads(b)
assert reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"]) == "Feat1=Val1|Feat2=Val2"
assert reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"]) == "Feat3=Val3|Feat4=Val4"
feat = reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"])
assert feat == "Feat1=Val1|Feat2=Val2"
feat = reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"])
assert feat == "Feat3=Val3|Feat4=Val4"

View File

@ -144,10 +144,7 @@ def test_accept_blocked_token():
# 1. test normal behaviour
nlp1 = English()
doc1 = nlp1("I live in New York")
config = {
"learn_tokens": False,
"min_action_freq": 30,
}
config = {}
ner1 = nlp1.create_pipe("ner", config=config)
assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
@ -166,10 +163,7 @@ def test_accept_blocked_token():
# 2. test blocking behaviour
nlp2 = English()
doc2 = nlp2("I live in New York")
config = {
"learn_tokens": False,
"min_action_freq": 30,
}
config = {}
ner2 = nlp2.create_pipe("ner", config=config)
# set "New York" to a blocked entity
@ -224,10 +218,7 @@ def test_overwrite_token():
assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
# Check that a new ner can overwrite O
config = {
"learn_tokens": False,
"min_action_freq": 30,
}
config = {}
ner2 = nlp.create_pipe("ner", config=config)
ner2.moves.add_action(5, "")
ner2.add_label("GPE")

View File

@ -1,8 +1,7 @@
import pytest
from spacy import util, registry
from spacy.lang.en import English
from spacy.lookups import Lookups, load_lookups
from spacy.lookups import Lookups
from ..util import make_tempdir

View File

@ -1,10 +1,8 @@
import pytest
from spacy import util
from spacy.gold import Example
from spacy.lang.en import English
from spacy.language import Language
from spacy.symbols import POS, NOUN
from ..util import make_tempdir

View File

@ -117,9 +117,7 @@ def test_overfitting_IO():
assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
# Test scoring
scores = nlp.evaluate(
train_examples, scorer_cfg={"positive_label": "POSITIVE"}
)
scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"})
assert scores["cats_micro_f"] == 1.0
assert scores["cats_score"] == 1.0
assert "cats_score_desc" in scores

View File

@ -1,11 +1,9 @@
import pytest
import random
from spacy import util
from spacy.gold import Example
from spacy.matcher import Matcher
from spacy.attrs import IS_PUNCT, ORTH, LOWER
from spacy.symbols import POS, VERB
from spacy.vocab import Vocab
from spacy.lang.en import English
from spacy.lookups import Lookups

View File

@ -6,8 +6,7 @@ from spacy.lang.en import English
from spacy.lang.lex_attrs import LEX_ATTRS
from spacy.matcher import Matcher
from spacy.tokenizer import Tokenizer
from spacy.lookups import Lookups
from spacy.symbols import ORTH, LEMMA, POS, VERB
from spacy.symbols import ORTH, LEMMA, POS
def test_issue1061():

View File

@ -271,10 +271,7 @@ def test_issue1963(en_tokenizer):
@pytest.mark.parametrize("label", ["U-JOB-NAME"])
def test_issue1967(label):
nlp = Language()
config = {
"learn_tokens": False,
"min_action_freq": 30,
}
config = {}
ner = nlp.create_pipe("ner", config=config)
example = Example.from_dict(
Doc(ner.vocab, words=["word"]),

View File

@ -157,7 +157,11 @@ def test_issue3540(en_vocab):
with doc.retokenize() as retokenizer:
heads = [(doc[3], 1), doc[2]]
attrs = {"POS": ["PROPN", "PROPN"], "LEMMA": ["New", "York"], "DEP": ["pobj", "compound"]}
attrs = {
"POS": ["PROPN", "PROPN"],
"LEMMA": ["New", "York"],
"DEP": ["pobj", "compound"],
}
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
gold_text = ["I", "live", "in", "New", "York", "right", "now"]

View File

@ -138,10 +138,7 @@ def test_issue4042_bug2():
if not output_dir.exists():
output_dir.mkdir()
ner1.to_disk(output_dir)
config = {
"learn_tokens": False,
"min_action_freq": 30,
}
config = {}
ner2 = nlp1.create_pipe("ner", config=config)
ner2.from_disk(output_dir)
assert len(ner2.labels) == 2
@ -303,10 +300,7 @@ def test_issue4313():
beam_width = 16
beam_density = 0.0001
nlp = English()
config = {
"learn_tokens": False,
"min_action_freq": 30,
}
config = {}
ner = nlp.create_pipe("ner", config=config)
ner.add_label("SOME_LABEL")
ner.begin_training([])

View File

@ -185,20 +185,16 @@ def test_issue4725_1():
vocab = Vocab(vectors_name="test_vocab_add_vector")
nlp = English(vocab=vocab)
config = {
"learn_tokens": False,
"min_action_freq": 342,
"update_with_oracle_cut_size": 111,
}
ner = nlp.create_pipe("ner", config=config)
with make_tempdir() as tmp_path:
with (tmp_path / "ner.pkl").open("wb") as file_:
pickle.dump(ner, file_)
assert ner.cfg["min_action_freq"] == 342
assert ner.cfg["update_with_oracle_cut_size"] == 111
with (tmp_path / "ner.pkl").open("rb") as file_:
ner2 = pickle.load(file_)
assert ner2.cfg["min_action_freq"] == 342
assert ner2.cfg["update_with_oracle_cut_size"] == 111

View File

@ -236,3 +236,33 @@ def test_language_from_config_before_after_init_invalid():
config = {"nlp": {"after_pipeline_creation": {"@callbacks": callback_name}}}
with pytest.raises(ValueError):
English.from_config(config)
def test_language_custom_tokenizer():
"""Test that a fully custom tokenizer can be plugged in via the registry."""
name = "test_language_custom_tokenizer"
class CustomTokenizer:
"""Dummy "tokenizer" that splits on spaces and adds prefix to each word."""
def __init__(self, nlp, prefix):
self.vocab = nlp.vocab
self.prefix = prefix
def __call__(self, text):
words = [f"{self.prefix}{word}" for word in text.split(" ")]
return Doc(self.vocab, words=words)
@registry.tokenizers(name)
def custom_create_tokenizer(prefix: str = "_"):
def create_tokenizer(nlp):
return CustomTokenizer(nlp, prefix=prefix)
return create_tokenizer
config = {"nlp": {"tokenizer": {"@tokenizers": name}}}
nlp = English.from_config(config)
doc = nlp("hello world")
assert [t.text for t in doc] == ["_hello", "_world"]
doc = list(nlp.pipe(["hello world"]))[0]
assert [t.text for t in doc] == ["_hello", "_world"]

View File

@ -3,7 +3,7 @@ title: Model Architectures
teaser: Pre-defined model architectures included with the core library
source: spacy/ml/models
menu:
- ['Tok2Vec', 'tok2vec']
- ['Tok2Vec', 'tok2vec-arch']
- ['Transformers', 'transformers']
- ['Parser & NER', 'parser']
- ['Tagging', 'tagger']
@ -70,6 +70,47 @@ blog post for background.
| `embed` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Embed tokens into context-independent word vector representations. |
| `encode` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Floats2d]`. **Output:** `List[Floats2d]`. Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. |
### spacy.Tok2VecListener.v1 {#Tok2VecListener}
> #### Example config
>
> ```ini
> [components.tok2vec]
> factory = "tok2vec"
>
> [components.tok2vec.model]
> @architectures = "spacy.HashEmbedCNN.v1"
> width = 342
>
> [components.tagger]
> factory = "tagger"
>
> [components.tagger.model]
> @architectures = "spacy.Tagger.v1"
>
> [components.tagger.model.tok2vec]
> @architectures = "spacy.Tok2VecListener.v1"
> width = ${components.tok2vec.model:width}
> ```
A listener is used as a sublayer within a component such as a
[`DependencyParser`](/api/dependencyparser),
[`EntityRecognizer`](/api/entityrecognizer) or
[`TextCategorizer`](/api/textcategorizer). Usually you'll have multiple
listeners connecting to a single upstream [`Tok2Vec`](/api/tok2vec) component
that's earlier in the pipeline. The listener layers act as **proxies**, passing
the predictions from the `Tok2Vec` component into downstream components, and
communicating gradients back upstream.
Instead of defining its own `Tok2Vec` instance, a model architecture like
[Tagger](/api/architectures#tagger) can define a listener as its `tok2vec`
argument that connects to the shared `tok2vec` component in the pipeline.
| Name | Type | Description |
| ---------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `width` | int | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. |
| `upstream` | str | A string to identify the "upstream" `Tok2Vec` component to communicate with. The upstream name should either be the wildcard string `"*"`, or the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. |
### spacy.MultiHashEmbed.v1 {#MultiHashEmbed}
<!-- TODO: check example config -->
@ -195,7 +236,7 @@ and residual connections.
> depth = 4
> ```
Encode context using bidirectonal LSTM layers. Requires
Encode context using bidirectional LSTM layers. Requires
[PyTorch](https://pytorch.org).
| Name | Type | Description |
@ -237,8 +278,6 @@ architectures into your training config.
### spacy-transformers.Tok2VecListener.v1 {#Tok2VecListener}
<!-- TODO: description -->
> #### Example Config
>
> ```ini
@ -250,10 +289,41 @@ architectures into your training config.
> @layers = "reduce_mean.v1"
> ```
Create a `TransformerListener` layer, which will connect to a
[`Transformer`](/api/transformer) component earlier in the pipeline. The layer
takes a list of [`Doc`](/api/doc) objects as input, and produces a list of
2-dimensional arrays as output, with each array having one row per token. Most
spaCy models expect a sublayer with this signature, making it easy to connect
them to a transformer model via this sublayer. Transformer models usually
operate over wordpieces, which usually don't align one-to-one against spaCy
tokens. The layer therefore requires a reduction operation in order to calculate
a single token vector given zero or more wordpiece vectors.
| Name | Type | Description |
| ------------- | ------------------------- | ---------------------------------------------------------------------------------------------- |
| `grad_factor` | float | Factor for weighting the gradient if multiple components listen to the same transformer model. |
| `pooling` | `Model[Ragged, Floats2d]` | Pooling layer to determine how the vector for each spaCy token will be computed. |
| ------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `pooling` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** [`Ragged`](https://thinc.ai/docs/api-types#ragged). **Output:** [`Floats2d`](https://thinc.ai/docs/api-types#types). A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. |
| `grad_factor` | float | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. |
### spacy-transformers.Tok2VecTransformer.v1 {#Tok2VecTransformer}
> #### Example Config
>
> ```ini
> # TODO:
> ```
Use a transformer as a [`Tok2Vec`](/api/tok2vec) layer directly. This does
**not** allow multiple components to share the transformer weights, and does
**not** allow the transformer to set annotations into the [`Doc`](/api/doc)
object, but it's a **simpler solution** if you only need the transformer within
one component.
| Name | Type | Description |
| ------------------ | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_spans` | callable | Function that takes a batch of [`Doc`](/api/doc) objects and returns lists of [`Span`](/api/span) objects to be processed by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. |
| `tokenizer_config` | `Dict[str, Any]` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). |
| `pooling` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** [`Ragged`](https://thinc.ai/docs/api-types#ragged). **Output:** [`Floats2d`](https://thinc.ai/docs/api-types#types). A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. |
| `grad_factor` | float | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. |
## Parser & NER architectures {#parser}
@ -418,7 +488,7 @@ network has an internal CNN Tok2Vec layer and uses attention.
> ```
| Name | Type | Description |
| -------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------- |
| --------------------------- | ----- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
| `pretrained_vectors` | bool | Whether or not pretrained vectors will be used in addition to the feature vectors. |
| `width` | int | Output dimension of the feature encoding step. |
@ -427,10 +497,8 @@ network has an internal CNN Tok2Vec layer and uses attention.
| `window_size` | int | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. |
| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. |
| `dropout` | float | The dropout rate. |
| `nO` | int | Output dimension, determined by the number of different labels. |
If the `nO` dimension is not set, the TextCategorizer component will set it when
`begin_training` is called.
| `nO` | int | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. |
### spacy.TextCatCNN.v1 {#TextCatCNN}
@ -458,13 +526,11 @@ vectors are mean pooled and used as features in a feed-forward network. This
architecture is usually less accurate than the ensemble, but runs faster.
| Name | Type | Description |
| ------------------- | ------------------------------------------ | --------------------------------------------------------------- |
| --------------------------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | The [`tok2vec`](#tok2vec) layer of the model. |
| `nO` | int | Output dimension, determined by the number of different labels. |
If the `nO` dimension is not set, the TextCategorizer component will set it when
`begin_training` is called.
| `nO` | int | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. |
### spacy.TextCatBOW.v1 {#TextCatBOW}
@ -483,16 +549,16 @@ An ngram "bag-of-words" model. This architecture should run much faster than the
others, but may not be as accurate, especially if texts are short.
| Name | Type | Description |
| ------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------- |
| --------------------------- | ----- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. |
| `no_output_layer` | bool | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes=True`, else `Logistic`). |
| `nO` | int | Output dimension, determined by the number of different labels. |
If the `nO` dimension is not set, the TextCategorizer component will set it when
`begin_training` is called.
| `nO` | int | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. |
<!-- TODO:
### spacy.TextCatLowData.v1 {#TextCatLowData}
-->
## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}
@ -558,8 +624,6 @@ A function that creates a default, empty `KnowledgeBase` from a
A function that takes as input a [`KnowledgeBase`](/api/kb) and a
[`Span`](/api/span) object denoting a named entity, and returns a list of
plausible [`Candidate` objects](/api/kb/#candidate_init).
The default `CandidateGenerator` simply uses the text of a mention to find its
potential aliases in the Knowledgebase. Note that this function is
case-dependent.
plausible [`Candidate` objects](/api/kb/#candidate_init). The default
`CandidateGenerator` simply uses the text of a mention to find its potential
aliases in the `KnowledgeBase`. Note that this function is case-dependent.

View File

@ -601,9 +601,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides
## Pretrain {#pretrain new="2.1" tag="experimental"}
<!-- TODO: document new pretrain command and link to new pretraining docs -->
Pre-train the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
Pretrain the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
components on [raw text](/api/data-formats#pretrain), using an approximate
language-modeling objective. Specifically, we load pretrained vectors, and train
a component like a CNN, BiLSTM, etc to predict vectors which match the
@ -611,7 +609,8 @@ pretrained ones. The weights are saved to a directory after each epoch. You can
then include a **path to one of these pretrained weights files** in your
[training config](/usage/training#config) as the `init_tok2vec` setting when you
train your model. This technique may be especially helpful if you have little
labelled data.
labelled data. See the usage docs on [pretraining](/usage/training#pretraining)
for more info.
<Infobox title="Changed in v3.0" variant="warning">
@ -634,8 +633,8 @@ $ python -m spacy pretrain [texts_loc] [output_dir] [config_path]
| `output_dir` | positional | Directory to write models to on each epoch. |
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
| `--resume-path`, `-r` | option | TODO: |
| `--epoch-resume`, `-er` | option | TODO: |
| `--resume-path`, `-r` | option | Path to pretrained weights from which to resume pretraining. |
| `--epoch-resume`, `-er` | option | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. |
| **CREATES** | weights | The pretrained weights that can be used to initialize `spacy train`. |

View File

@ -20,9 +20,9 @@ Config files define the training process and model pipeline and can be passed to
[`spacy train`](/api/cli#train). They use
[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
hood. For details on how to use training configs, see the
[usage documentation](/usage/training#config).
<!-- TODO: add details on getting started and init config -->
[usage documentation](/usage/training#config). To get started with a blank
config or fill a partial config with all defaults, you can use the
[`init config`](/api/cli#init-config) command.
> #### What does the @ mean?
>
@ -52,8 +52,6 @@ your config and check that it's valid, you can run the
</Infobox>
<!-- TODO: once we know how we want to implement "starter config" workflow or outputting a full default config for the user, update this section with the command -->
### nlp {#config-nlp tag="section"}
> #### Example
@ -154,8 +152,6 @@ This section is optional and defines settings and controls for
[language model pretraining](/usage/training#pretraining). It's used when you
run [`spacy pretrain`](/api/cli#pretrain).
<!-- TODO: complete -->
| Name | Type | Description | Default |
| ---------------------------- | --------------------------------------------------- | ----------------------------------------------------------------------------- | --------------------------------------------------- |
| `max_epochs` | int | Maximum number of epochs. | `1000` |

View File

@ -5,4 +5,194 @@ tag: class
source: spacy/matcher/dependencymatcher.pyx
---
TODO: write
The `DependencyMatcher` follows the same API as the [`Matcher`](/api/matcher)
and [`PhraseMatcher`](/api/phrasematcher) and lets you match on dependency trees
using the
[Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html).
It requires a pretrained [`DependencyParser`](/api/parser) or other component
that sets the `Token.dep` attribute.
## Pattern format {#patterns}
> ```json
> ### Example
> [
> {
> "SPEC": {"NODE_NAME": "founded"},
> "PATTERN": {"ORTH": "founded"}
> },
> {
> "SPEC": {
> "NODE_NAME": "founder",
> "NBOR_RELOP": ">",
> "NBOR_NAME": "founded"
> },
> "PATTERN": {"DEP": "nsubj"}
> },
> {
> "SPEC": {
> "NODE_NAME": "object",
> "NBOR_RELOP": ">",
> "NBOR_NAME": "founded"
> },
> "PATTERN": {"DEP": "dobj"}
> }
> ]
> ```
A pattern added to the `DependencyMatcher` consists of a list of dictionaries,
with each dictionary describing a node to match. Each pattern should have the
following top-level keys:
| Name | Type | Description |
| --------- | ---- | --------------------------------------------------------------------------------------------------------------------------- |
| `PATTERN` | dict | The token attributes to match in the same format as patterns provided to the regular token-based [`Matcher`](/api/matcher). |
| `SPEC` | dict | The relationships of the nodes in the subtree that should be matched. |
The `SPEC` includes the following fields:
| Name | Type | Description |
| ------------ | ---- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `NODE_NAME` | str | A unique name for this node to refer to it in other specs. |
| `NBOR_RELOP` | str | A [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html) operator that describes how the two nodes are related. |
| `NBOR_NAME` | str | The unique name of the node that this node is connected to. |
## DependencyMatcher.\_\_init\_\_ {#init tag="method"}
Create a rule-based `DependencyMatcher`.
> #### Example
>
> ```python
> from spacy.matcher import DependencyMatcher
> matcher = DependencyMatcher(nlp.vocab)
> ```
| Name | Type | Description |
| ------- | ------- | ------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. |
## DependencyMatcher.\_\_call\_\_ {#call tag="method"}
Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
> #### Example
>
> ```python
> from spacy.matcher import DependencyMatcher
>
> matcher = DependencyMatcher(nlp.vocab)
> pattern = [
> {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
> {"SPEC": {"NODE_NAME": "founder", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
> ]
> matcher.add("Founder", [pattern])
> doc = nlp("Bill Gates founded Microsoft.")
> matches = matcher(doc)
> ```
| Name | Type | Description |
| ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `doclike` | `Doc`/`Span` | The `Doc` or `Span` to match over. |
| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. |
## DependencyMatcher.\_\_len\_\_ {#len tag="method"}
Get the number of rules (edges) added to the dependency matcher. Note that this
only returns the number of rules (identical with the number of IDs), not the
number of individual patterns.
> #### Example
>
> ```python
> matcher = DependencyMatcher(nlp.vocab)
> assert len(matcher) == 0
> pattern = [
> {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
> {"SPEC": {"NODE_NAME": "START_ENTITY", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
> ]
> matcher.add("Rule", [pattern])
> assert len(matcher) == 1
> ```
| Name | Type | Description |
| ----------- | ---- | -------------------- |
| **RETURNS** | int | The number of rules. |
## DependencyMatcher.\_\_contains\_\_ {#contains tag="method"}
Check whether the matcher contains rules for a match ID.
> #### Example
>
> ```python
> matcher = DependencyMatcher(nlp.vocab)
> assert "Rule" not in matcher
> matcher.add("Rule", [pattern])
> assert "Rule" in matcher
> ```
| Name | Type | Description |
| ----------- | ---- | ----------------------------------------------------- |
| `key` | str | The match ID. |
| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
## DependencyMatcher.add {#add tag="method"}
Add a rule to the matcher, consisting of an ID key, one or more patterns, and an
optional callback function to act on the matches. The callback function will
receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already
exists for the given ID, the patterns will be extended. An `on_match` callback
will be overwritten.
> #### Example
>
> ```python
> def on_match(matcher, doc, id, matches):
> print('Matched!', matches)
>
> matcher = DependencyMatcher(nlp.vocab)
> matcher.add("TEST_PATTERNS", patterns)
> ```
| Name | Type | Description |
| -------------- | ------------------ | --------------------------------------------------------------------------------------------- |
| `match_id` | str | An ID for the thing you're matching. |
| `patterns` | list | The patterns to add for the given key. Each pattern is a list of dicts, where each dict describes a node to match (see the pattern format above). |
| _keyword-only_ | | |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
## DependencyMatcher.remove {#remove tag="method"}
Remove a rule from the matcher. A `KeyError` is raised if the match ID does not
exist.
> #### Example
>
> ```python
> matcher.add("Rule", [pattern]])
> assert "Rule" in matcher
> matcher.remove("Rule")
> assert "Rule" not in matcher
> ```
| Name | Type | Description |
| ----- | ---- | ------------------------- |
| `key` | str | The ID of the match rule. |
## DependencyMatcher.get {#get tag="method"}
Retrieve the pattern stored for a key. Returns the rule as an
`(on_match, patterns)` tuple containing the callback and available patterns.
> #### Example
>
> ```python
> matcher.add("Rule", [pattern], on_match=on_match)
> on_match, patterns = matcher.get("Rule")
> ```
| Name | Type | Description |
| ----------- | ----- | --------------------------------------------- |
| `key` | str | The ID of the match rule. |
| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. |

View File

@ -8,6 +8,23 @@ api_string_name: parser
api_trainable: true
---
A transition-based dependency parser component. The dependency parser jointly
learns sentence segmentation and labelled dependency parsing, and can optionally
learn to merge tokens that had been over-segmented by the tokenizer. The parser
uses a variant of the **non-monotonic arc-eager transition-system** described by
[Honnibal and Johnson (2015)](https://www.aclweb.org/anthology/D15-1162/), with
the addition of a "break" transition to perform the sentence segmentation.
[Nivre (2005)](https://www.aclweb.org/anthology/P05-1013/)'s **pseudo-projective
dependency transformation** is used to allow the parser to predict
non-projective parses.
The parser is trained using an **imitation learning objective**. It follows the
actions predicted by the current weights, and at each state, determines which
actions are compatible with the optimal parse that could be reached from the
current state. The weights are updated such that the scores assigned to the set
of optimal actions are increased, while scores assigned to other actions are
decreased. Note
that more than one action may be optimal for a given state.
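As a quick illustration of the component's output (a minimal sketch, assuming `nlp` is a trained pipeline that includes a parser):

```python
# Minimal sketch, assuming a trained pipeline with a parser component.
doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")
for token in doc:
    # The parser assigns a dependency label and a syntactic head to each token
    print(token.text, token.dep_, token.head.text)
# The parser also sets sentence boundaries
print([sent.text for sent in doc.sents])
```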
## Config and implementation {#config}
The default config is defined by the pipeline component factory and describes
@ -23,17 +40,20 @@ architectures and their arguments and hyperparameters.
> from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
> config = {
> "moves": None,
> # TODO: rest
> "update_with_oracle_cut_size": 100,
> "learn_tokens": False,
> "min_action_freq": 30,
> "model": DEFAULT_PARSER_MODEL,
> }
> nlp.add_pipe("parser", config=config)
> ```
<!-- TODO: finish API docs -->
| Setting | Type | Description | Default |
| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- |
| `moves` | list | | `None` |
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------- |
| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. | `None` |
| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. | `100` |
| `learn_tokens` | bool | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. | `False` |
| `min_action_freq` | int | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. | `30` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |
```python
@ -61,19 +81,16 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
<!-- TODO: finish API docs -->
| Name | Type | Description |
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| `moves` | list | |
| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. |
| _keyword-only_ | | |
| `update_with_oracle_cut_size` | int | |
| `multitasks` | `Iterable` | |
| `learn_tokens` | bool | |
| `min_action_freq` | int | |
| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. |
| `learn_tokens` | bool | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. |
| `min_action_freq` | int | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. |
## DependencyParser.\_\_call\_\_ {#call tag="method"}

View File

@ -8,6 +8,18 @@ api_string_name: ner
api_trainable: true
---
A transition-based named entity recognition component. The entity recognizer
identifies **non-overlapping labelled spans** of tokens. The transition-based
algorithm used encodes certain assumptions that are effective for "traditional"
named entity recognition tasks, but may not be a good fit for every span
identification problem. Specifically, the loss function optimizes for **whole
entity accuracy**, so if your inter-annotator agreement on boundary tokens is
low, the component will likely perform poorly on your problem. The
transition-based algorithm also assumes that the most decisive information about
your entities will be close to their initial tokens. If your entities are long
and characterized by tokens in their middle, the component will likely not be a
good fit for your task.
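As a quick illustration (a minimal sketch, assuming `nlp` is a trained pipeline with an entity recognizer):

```python
# Minimal sketch, assuming a trained pipeline with an "ner" component.
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for ent in doc.ents:
    # Entities are non-overlapping labelled Span objects
    print(ent.text, ent.label_)
```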
## Config and implementation {#config}
The default config is defined by the pipeline component factory and describes
@ -23,17 +35,16 @@ architectures and their arguments and hyperparameters.
> from spacy.pipeline.ner import DEFAULT_NER_MODEL
> config = {
> "moves": None,
> # TODO: rest
> "update_with_oracle_cut_size": 100,
> "model": DEFAULT_NER_MODEL,
> }
> nlp.add_pipe("ner", config=config)
> ```
<!-- TODO: finish API docs -->
| Setting | Type | Description | Default |
| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- |
| `moves` | list | | `None` |
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------- |
| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. | `None` |
| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. | `100` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |
```python
@ -61,19 +72,14 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
<!-- TODO: finish API docs -->
| Name | Type | Description |
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| `moves` | list | |
| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. |
| _keyword-only_ | | |
| `update_with_oracle_cut_size` | int | |
| `multitasks` | `Iterable` | |
| `learn_tokens` | bool | |
| `min_action_freq` | int | |
| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. |
## EntityRecognizer.\_\_call\_\_ {#call tag="method"}

View File

@ -242,6 +242,21 @@ a batch of [Example](/api/example) objects.
Update the models in the pipeline.
<Infobox variant="warning" title="Changed in v3.0">
The `Language.update` method now takes a batch of [`Example`](/api/example)
objects instead of the raw texts and annotations or `Doc` and `GoldParse`
objects. An [`Example`](/api/example) streamlines how data is passed around. It
stores two `Doc` objects: one for holding the gold-standard reference data, and
one for holding the predictions of the pipeline.
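For instance, a minimal sketch of constructing `Example` objects from raw annotations and passing them to `nlp.update` (assuming a loaded `nlp` pipeline, and that `Example` is importable from `spacy.gold` in this nightly):

```python
from spacy.gold import Example

# Hypothetical entity annotations; the offsets refer to the text below
doc = nlp.make_doc("Bill Gates founded Microsoft.")
annotations = {"entities": [(0, 10, "PERSON"), (19, 28, "ORG")]}
example = Example.from_dict(doc, annotations)
losses = nlp.update([example])
```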
For most use cases, you shouldn't have to write your own training scripts
anymore. Instead, you can use [`spacy train`](/api/cli#train) with a config file
and custom registered functions if needed. See the
[training documentation](/usage/training) for details.
</Infobox>
> #### Example
>
> ```python
@ -253,7 +268,7 @@ Update the models in the pipeline.
| Name | Type | Description |
| --------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------ |
| `examples` | `Iterable[Example]` | A batch of `Example` objects to learn from. |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
| _keyword-only_ | | |
| `drop` | float | The dropout rate. |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |

View File

@ -9,6 +9,28 @@ api_string_name: lemmatizer
api_trainable: false
---
Component for assigning base forms to tokens using rules based on part-of-speech
tags, or lookup tables. Functionality to train the component is coming soon.
Different [`Language`](/api/language) subclasses can implement their own
lemmatizer components via
[language-specific factories](/usage/processing-pipelines#factories-language).
The default data used is provided by the
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
extension package.
<Infobox variant="warning" title="New in v3.0">
As of v3.0, the `Lemmatizer` is a **standalone pipeline component** that can be
added to your pipeline, and not a hidden part of the vocab that runs behind the
scenes. This makes it easier to customize how lemmas should be assigned in your
pipeline.
If the lemmatization mode is set to `"rule"` and requires part-of-speech tags to
be assigned, make sure a [`Tagger`](/api/tagger) or another component assigning
tags is available in the pipeline and runs _before_ the lemmatizer.
</Infobox>
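A minimal sketch of adding the component (assuming a pipeline that already assigns part-of-speech tags and that the `spacy-lookups-data` tables are installed):

```python
# Rule-based lemmatization needs POS tags, so a tagger (or similar component)
# has to run earlier in the pipeline.
nlp.add_pipe("lemmatizer", config={"mode": "rule"})
doc = nlp("She was reading the papers.")
print([token.lemma_ for token in doc])
```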
## Config and implementation
The default config is defined by the pipeline component factory and describes
@ -29,7 +51,7 @@ lemmatizers, see the
| Setting | Type | Description | Default |
| ----------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- |
| `mode` | str | The lemmatizer mode, e.g. "lookup" or "rule". | `"lookup"` |
| `mode` | str | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. | `"lookup"` |
| `lookups` | [`Lookups`](/api/lookups) | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from `spacy-lookups-data`. | `None` |
| `overwrite` | bool | Whether to overwrite existing lemmas. | `False` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Not yet implemented:** the model to use. | `None` |
@ -56,13 +78,13 @@ shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
| Name | Type | Description |
| -------------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------- |
| -------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | [`Vocab`](/api/vocab) | The vocab. |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model (not yet implemented). |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| _keyword-only_ | | |
| mode | str | The lemmatizer mode, e.g. "lookup" or "rule". Defaults to "lookup". |
| lookups | [`Lookups`](/api/lookups) | A lookups object containing the tables such as "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup". Defaults to `None`. |
| mode | str | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. |
| lookups | [`Lookups`](/api/lookups) | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. |
| overwrite | bool | Whether to overwrite existing lemmas. |
## Lemmatizer.\_\_call\_\_ {#call tag="method"}

View File

@ -5,6 +5,82 @@ tag: class
source: spacy/matcher/matcher.pyx
---
The `Matcher` lets you find words and phrases using rules describing their token
attributes. Rules can refer to token annotations (like the text or
part-of-speech tags), as well as lexical attributes like `Token.is_punct`.
Applying the matcher to a [`Doc`](/api/doc) gives you access to the matched
tokens in context. For in-depth examples and workflows for combining rules and
statistical models, see the [usage guide](/usage/rule-based-matching) on
rule-based matching.
## Pattern format {#patterns}
> ```json
> ### Example
> [
> {"LOWER": "i"},
> {"LEMMA": {"IN": ["like", "love"]}},
> {"POS": "NOUN", "OP": "+"}
> ]
> ```
A pattern added to the `Matcher` consists of a list of dictionaries. Each
dictionary describes **one token** and its attributes. The available token
pattern keys correspond to a number of
[`Token` attributes](/api/token#attributes). The supported attributes for
rule-based matching are:
| Attribute | Type |  Description |
| -------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------ |
| `ORTH` | str | The exact verbatim text of a token. |
| `TEXT` <Tag variant="new">2.1</Tag> | str | The exact verbatim text of a token. |
| `LOWER` | str | The lowercase form of the token text. |
|  `LENGTH` | int | The length of the token text. |
|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. |
|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. |
|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. |
|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. |
| `ENT_TYPE` | str | The token's entity label. |
| `_` <Tag variant="new">2.1</Tag> | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). |
| `OP` | str | Operator or quantifier to determine how often to match a token pattern. |
Operators and quantifiers define **how often** a token pattern should be
matched:
> ```json
> ### Example
> [
> {"POS": "ADJ", "OP": "*"},
> {"POS": "NOUN", "OP": "+"}
> ]
> ```
| OP | Description |
| --- | ---------------------------------------------------------------- |
| `!` | Negate the pattern, by requiring it to match exactly 0 times. |
| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
| `+` | Require the pattern to match 1 or more times. |
| `*` | Allow the pattern to match zero or more times. |
Token patterns can also map to a **dictionary of properties** instead of a
single value to indicate whether the expected value is a member of a list or how
it compares to another value.
> ```json
> ### Example
> [
> {"LEMMA": {"IN": ["like", "love", "enjoy"]}},
> {"POS": "PROPN", "LENGTH": {">=": 10}},
> ]
> ```
| Attribute | Type | Description |
| -------------------------- | ---------- | --------------------------------------------------------------------------------- |
| `IN` | any | Attribute value is a member of a list. |
| `NOT_IN` | any | Attribute value is _not_ a member of a list. |
| `==`, `>=`, `<=`, `>`, `<` | int, float | Attribute value is equal to, greater than or equal to, less than or equal to, greater than or less than the given value. |
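A sketch of these dictionary-valued attributes in use (assuming `nlp` is a pipeline that assigns part-of-speech tags and lemmas):

```python
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)
pattern = [
    {"LEMMA": {"IN": ["like", "love", "enjoy"]}},  # lemma is a member of the list
    {"POS": "PROPN", "LENGTH": {">=": 10}},        # proper noun of 10+ characters
]
matcher.add("PREFERENCE", [pattern])
doc = nlp("I really enjoy Philadelphia.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)
```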
## Matcher.\_\_init\_\_ {#init tag="method"}
Create the rule-based `Matcher`. If `validate=True` is set, all patterns added
@ -60,7 +136,7 @@ Match a stream of documents, yielding them in turn.
| Name | Type | Description |
| --------------------------------------------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `docs` | iterable | A stream of documents. |
| `docs` | iterable | A stream of documents or spans. |
| `batch_size` | int | The number of documents to accumulate into a working set. |
| `return_matches` <Tag variant="new">2.1</Tag> | bool | Yield the match lists along with the docs, making results `(doc, matches)` tuples. |
| `as_tuples` | bool | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. |
@ -105,11 +181,11 @@ Check whether the matcher contains rules for a match ID.
## Matcher.add {#add tag="method" new="2"}
Add a rule to the matcher, consisting of an ID key, one or more patterns, and a
callback function to act on the matches. The callback function will receive the
arguments `matcher`, `doc`, `i` and `matches`. If a pattern already exists for
the given ID, the patterns will be extended. An `on_match` callback will be
overwritten.
Add a rule to the matcher, consisting of an ID key, one or more patterns, and an
optional callback function to act on the matches. The callback function will
receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already
exists for the given ID, the patterns will be extended. An `on_match` callback
will be overwritten.
> #### Example
>
@ -142,11 +218,12 @@ patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
</Infobox>
| Name | Type | Description |
| -------------- | ------------------ | --------------------------------------------------------------------------------------------- |
| ----------------------------------- | ------------------ | --------------------------------------------------------------------------------------------- |
| `match_id` | str | An ID for the thing you're matching. |
| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
| `patterns` | `List[List[dict]]` | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
| _keyword-only_ | | |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
| `on_match` | callable / `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
| `greedy` <Tag variant="new">3</Tag> | str | Optional filter for greedy matches. Can either be `"FIRST"` or `"LONGEST"`. |
## Matcher.remove {#remove tag="method" new="2"}

View File

@ -63,16 +63,14 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
<!-- TODO: finish API docs -->
| Name | Type | Description |
| -------------- | ------- | ------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| _keyword-only_ | | |
| `labels_morph` | dict | |
| `labels_pos` | dict | |
| `labels_morph` | dict | Mapping of morph + POS tags to morph labels. |
| `labels_pos` | dict | Mapping of morph + POS tags to POS tags. |
## Morphologizer.\_\_call\_\_ {#call tag="method"}

View File

@ -9,7 +9,8 @@ new: 2
The `PhraseMatcher` lets you efficiently match large terminology lists. While
the [`Matcher`](/api/matcher) lets you match sequences based on lists of token
descriptions, the `PhraseMatcher` accepts match patterns in the form of `Doc`
objects.
objects. See the [usage guide](/usage/rule-based-matching#phrasematcher) for
examples.
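A minimal sketch (assuming a loaded pipeline as `nlp`):

```python
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)
terms = ["Barack Obama", "Angela Merkel"]
# Patterns are Doc objects; nlp.make_doc only runs the tokenizer, so it's cheap
patterns = [nlp.make_doc(term) for term in terms]
matcher.add("POLITICIANS", patterns)
doc = nlp("Angela Merkel met Barack Obama in Berlin.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)
```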
## PhraseMatcher.\_\_init\_\_ {#init tag="method"}

View File

@ -29,9 +29,9 @@ architectures and their arguments and hyperparameters.
> ```
| Setting | Type | Description | Default |
| ---------------- | ------------------------------------------ | -------------------------------------- | ----------------------------------- |
| ---------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------- |
| `set_morphology` | bool | Whether to set morphological features. | `False` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [Tagger](/api/architectures#Tagger) |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). | [Tagger](/api/architectures#Tagger) |
```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tagger.pyx
@ -59,9 +59,9 @@ shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
| Name | Type | Description |
| ---------------- | ------- | ------------------------------------------------------------------------------------------- |
| ---------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| _keyword-only_ | | |
| `set_morphology` | bool | Whether to set morphological features. |

View File

@ -9,6 +9,12 @@ api_string_name: textcat
api_trainable: true
---
The text categorizer predicts **categories over a whole document**. It can learn
one or more labels, and the labels can be mutually exclusive (i.e. one true
label per document) or non-mutually exclusive (i.e. zero or more labels may be
true per document). The multi-label setting is controlled by the model instance
that's provided.
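A minimal sketch of setting up the component (the labels shown are hypothetical; the scores are only meaningful after training):

```python
# Add the component and register hypothetical labels
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
# After training, the predicted scores per label are available on the Doc:
# doc = nlp("This is a terrific library!")
# print(doc.cats)  # e.g. {"POSITIVE": 0.95, "NEGATIVE": 0.05}
```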
## Config and implementation {#config}
The default config is defined by the pipeline component factory and describes
@ -30,9 +36,9 @@ architectures and their arguments and hyperparameters.
> ```
| Setting | Type | Description | Default |
| -------- | ------------------------------------------ | ------------------ | ----------------------------------------------------- |
| `labels` | `Iterable[str]` | The labels to use. | `[]` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TextCatEnsemble](/api/architectures#TextCatEnsemble) |
| -------- | ------------------------------------------ | --------------------------------------------------------------------------------------- | ----------------------------------------------------- |
| `labels` | `List[str]` | A list of categories to learn. If empty, the model infers the categories from the data. | `[]` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts scores for each category. | [TextCatEnsemble](/api/architectures#TextCatEnsemble) |
```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/textcat.py
@ -67,23 +73,6 @@ shortcut for this and instantiate the component using its string name and
| _keyword-only_ | | |
| `labels` | `Iterable[str]` | The labels to use. |
<!-- TODO move to config page
### Architectures {#architectures new="2.1"}
Text classification models can be used to solve a wide variety of problems.
Differences in text length, number of labels, difficulty, and runtime
performance constraints mean that no single algorithm performs well on all types
of problems. To handle a wider variety of problems, the `TextCategorizer` object
allows configuration of its model architecture, using the `architecture` keyword
argument.
| Name | Description |
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `"ensemble"` | **Default:** Stacked ensemble of a bag-of-words model and a neural network model. The neural network uses a CNN with mean pooling and attention. The "ngram_size" and "attr" arguments can be used to configure the feature extraction for the bag-of-words model. |
| `"simple_cnn"` | A neural network model where token vectors are calculated using a CNN. The vectors are mean pooled and used as features in a feed-forward network. This architecture is usually less accurate than the ensemble, but runs faster. |
| `"bow"` | An ngram "bag-of-words" model. This architecture should run much faster than the others, but may not be as accurate, especially if texts are short. The features extracted can be controlled using the keyword arguments `ngram_size` and `attr`. For instance, `ngram_size=3` and `attr="lower"` would give lower-cased unigram, trigram and bigram features. 2, 3 or 4 are usually good choices of ngram size. |
-->
## TextCategorizer.\_\_call\_\_ {#call tag="method"}
Apply the pipe to one document. The document is modified in place, and returned.

View File

@ -8,7 +8,20 @@ api_string_name: tok2vec
api_trainable: true
---
<!-- TODO: intro describing component -->
Apply a "token-to-vector" model and set its outputs in the `Doc.tensor`
attribute. This is mostly useful to **share a single subnetwork** between
multiple components, e.g. to have one embedding and CNN network shared between a
[`DependencyParser`](/api/dependencyparser), [`Tagger`](/api/tagger) and
[`EntityRecognizer`](/api/entityrecognizer).
In order to use the `Tok2Vec` predictions, subsequent components should use the
[Tok2VecListener](/api/architectures#Tok2VecListener) layer as the tok2vec
subnetwork of their model. This layer will read data from the `doc.tensor`
attribute during prediction. During training, the `Tok2Vec` component will save
its prediction and backprop callback for each batch, so that the subsequent
components can backpropagate to the shared weights. This implementation is used
because it allows us to avoid relying on object identity within the models to
achieve the parameter sharing.
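For example (a sketch, assuming `nlp` is a trained pipeline that includes a `tok2vec` component):

```python
# Sketch: assumes a trained pipeline with a "tok2vec" component.
doc = nlp("This is a sentence.")
# The component writes its context-sensitive vectors to Doc.tensor, where
# downstream components connected via Tok2VecListener read them during prediction.
print(doc.tensor.shape)  # (number of tokens, embedding width)
```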
## Config and implementation {#config}
@ -28,8 +41,8 @@ architectures and their arguments and hyperparameters.
> ```
| Setting | Type | Description | Default |
| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------- |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [HashEmbedCNN](/api/architectures#HashEmbedCNN) |
| ------- | ------------------------------------------ | ----------------------------------------------------------------------- | ----------------------------------------------- |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. The model to use. | [HashEmbedCNN](/api/architectures#HashEmbedCNN) |
```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tok2vec.py
@ -64,9 +77,11 @@ shortcut for this and instantiate the component using its string name and
## Tok2Vec.\_\_call\_\_ {#call tag="method"}
Apply the pipe to one document. The document is modified in place, and returned.
This usually happens under the hood when the `nlp` object is called on a text
and all pipeline components are applied to the `Doc` in order. Both
Apply the pipe to one document and add context-sensitive embeddings to the
`Doc.tensor` attribute, allowing them to be used as features by downstream
components. The document is modified in place, and returned. This usually
happens under the hood when the `nlp` object is called on a text and all
pipeline components are applied to the `Doc` in order. Both
[`__call__`](/api/tok2vec#call) and [`pipe`](/api/tok2vec#pipe) delegate to the
[`predict`](/api/tok2vec#predict) and
[`set_annotations`](/api/tok2vec#set_annotations) methods.

View File

@ -340,7 +340,7 @@ See the [`Transformer`](/api/transformer) API reference and
## Batchers {#batchers source="spacy/gold/batchers.py" new="3"}
<!-- TODO: intro and also describe signature of functions -->
<!-- TODO: intro -->
#### batch_by_words.v1 {#batch_by_words tag="registered function"}
@ -361,19 +361,16 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
> get_length = null
> ```
<!-- TODO: complete table -->
| Name | Type | Description |
| ------------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
| `size` | `Iterable[int]` / int | The batch size. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
| `tolerance` | float | |
| `discard_oversize` | bool | Discard items that are longer than the specified batch length. |
| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence and returns its length. Defaults to the built-in `len()` if not set. |
| ------------------ | ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `seqs` | `Iterable[Any]` | The sequences to minibatch. |
| `size` | `Iterable[int]` / int | The target number of words per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
| `tolerance` | float | What percentage of the size to allow batches to exceed. |
| `discard_oversize` | bool | Whether to discard sequences that by themselves exceed the tolerated size. |
| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. |
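The same strategy can also be called directly in Python. A sketch, assuming the underlying function is importable from `spacy.gold.batchers` in this nightly:

```python
from spacy.gold.batchers import minibatch_by_words

# Toy "sequences": plain lists standing in for Doc or Example objects
seqs = [["tok"] * n for n in (5, 12, 8, 30, 3)]
batches = minibatch_by_words(seqs, size=20, tolerance=0.2, discard_oversize=False)
for batch in batches:
    print([len(seq) for seq in batch])
```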
#### batch_by_sequence.v1 {#batch_by_sequence tag="registered function"}
<!-- TODO: -->
> #### Example config
>
> ```ini
@ -383,34 +380,37 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
> get_length = null
> ```
<!-- TODO: complete table -->
Create a batcher that creates batches of the specified size.
| Name | Type | Description |
| ------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
| `size` | `Iterable[int]` / int | The batch size. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence and returns its length. Defaults to the built-in `len()` if not set. |
| ------------ | ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `size` | `Iterable[int]` / int | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. |
#### batch_by_padded.v1 {#batch_by_padded tag="registered function"}
<!-- TODO: -->
> #### Example config
>
> ```ini
> [training.batcher]
> @batchers = "batch_by_words.v1"
> @batchers = "batch_by_padded.v1"
> size = 100
> buffer = TODO:
> buffer = 256
> discard_oversize = false
> get_length = null
> ```
Minibatch a sequence by the size of padded batches that would result, with
sequences binned by length within a window. The padded size is defined as the
maximum length of sequences within the batch multiplied by the number of
sequences in the batch.
| Name | Type | Description |
| ------------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
| `size` | `Iterable[int]` / int | The batch size. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
| `buffer` | int | |
| `discard_oversize` | bool | Discard items that are longer than the specified batch length. |
| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence and returns its length. Defaults to the built-in `len()` if not set. |
| ------------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `size` | `Iterable[int]` / int | The largest padded size to batch sequences into. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
| `buffer` | int | The number of sequences to accumulate before sorting by length. A larger buffer will result in more even sizing, but if the buffer is very large, the iteration order will be less random, which can result in suboptimal training. |
| `discard_oversize` | bool | Whether to discard sequences that are by themselves longer than the largest padded batch size. |
| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. |
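As a worked example of the padded-size calculation described above (plain Python, not spaCy internals):

```python
# Padded size = length of the longest sequence in the batch * number of sequences
lengths = [4, 7, 5]
padded_size = max(lengths) * len(lengths)
assert padded_size == 21  # every sequence is padded to length 7
```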
## Training data and alignment {#gold source="spacy/gold"}

View File

@ -25,8 +25,15 @@ work out-of-the-box.
</Infobox>
This pipeline component lets you use transformer models in your pipeline. The
component assigns the output of the transformer to the Doc's extension
This pipeline component lets you use transformer models in your pipeline.
Supports all models that are available via the
[HuggingFace `transformers`](https://huggingface.co/transformers) library.
Usually you will connect subsequent components to the shared transformer using
the [TransformerListener](/api/architectures#TransformerListener) layer. This
works similarly to spaCy's [Tok2Vec](/api/tok2vec) component and
[Tok2VecListener](/api/architectures#Tok2VecListener) sublayer.
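A minimal sketch of accessing the shared output (assuming `spacy-transformers` is installed and `nlp` is a transformer-based pipeline):

```python
# Sketch: assumes a pipeline that includes a "transformer" component.
doc = nlp("Apple shares rose on the news.")
# The raw transformer output is stored in a custom extension attribute
print(doc._.trf_data.tensors[0].shape)
```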
The component assigns the output of the transformer to the `Doc`'s extension
attributes. We also calculate an alignment between the word-piece tokens and the
spaCy tokenization, so that we can use the last hidden states to set the
`Doc.tensor` attribute. When multiple word-piece tokens align to the same spaCy
@ -54,10 +61,10 @@ architectures and their arguments and hyperparameters.
> ```
| Setting | Type | Description | Default |
| ------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- |
| ------------------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- |
| `max_batch_items` | int | Maximum size of a padded batch. | `4096` |
| `annotation_setter` | Callable | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. | `null_annotation_setter` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransformerModel](/api/architectures#TransformerModel) |
| `annotation_setter` | Callable | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. By default, no additional annotations are set. | `null_annotation_setter` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** [`FullTransformerBatch`](/api/transformer#fulltransformerbatch). The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. | [TransformerModel](/api/architectures#TransformerModel) |
```python
https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py
@ -86,15 +93,19 @@ https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/p
> trf = Transformer(nlp.vocab, model)
> ```
Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#create_pipe).
Construct a `Transformer` component. One or more subsequent spaCy components can
use the transformer outputs as features in its model, with gradients
backpropagated to the single shared weights. The activations from the
transformer are saved in the [`Doc._.trf_data`](#custom-attributes) extension
attribute. You can also provide a callback to set additional annotations. In
your application, you would normally use a shortcut for this and instantiate the
component using its string name and [`nlp.add_pipe`](/api/language#create_pipe).
| Name | Type | Description |
| ------------------- | ------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| ------------------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `annotation_setter` | `Callable` | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. Defaults to `null_annotation_setter`, a function that does nothing. |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** [`FullTransformerBatch`](/api/transformer#fulltransformerbatch). The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. |
| `annotation_setter` | `Callable` | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. By default, no additional annotations are set. |
| _keyword-only_ | | |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| `max_batch_items` | int | Maximum size of a padded batch. Defaults to `128*32`. |
@ -184,7 +195,10 @@ Apply the pipeline's model to a batch of docs, without modifying them.
## Transformer.set_annotations {#set_annotations tag="method"}
Modify a batch of documents, using pre-computed scores.
Assign the extracted features to the Doc objects. By default, the
[`TransformerData`](/api/transformer#transformerdata) object is written to the
[`Doc._.trf_data`](#custom-attributes) attribute. Your `annotation_setter`
callback is then called, if provided.
> #### Example
>
@ -201,8 +215,19 @@ Modify a batch of documents, using pre-computed scores.
## Transformer.update {#update tag="method"}
Learn from a batch of documents and gold-standard information, updating the
pipe's model. Delegates to [`predict`](/api/transformer#predict).
Prepare for an update to the transformer. Like the [`Tok2Vec`](/api/tok2vec)
component, the `Transformer` component is unusual in that it does not receive
"gold standard" annotations to calculate a weight update. The optimal output of
the transformer data is unknown; it's a hidden layer inside the network that is
updated by backpropagating from output layers.
The `Transformer` component therefore does **not** perform a weight update
during its own `update` method. Instead, it runs its transformer model and
communicates the output and the backpropagation callback to any **downstream
components** that have been connected to it via the
[TransformerListener](/api/architectures#TransformerListener) sublayer. If there
are multiple listeners, the last layer will actually backprop to the transformer
and call the optimizer, while the others simply increment the gradients.
> #### Example
>
@ -213,8 +238,8 @@ pipe's model. Delegates to [`predict`](/api/transformer#predict).
> ```
| Name | Type | Description |
| ----------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
| ----------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. |
| _keyword-only_ | | |
| `drop` | float | The dropout rate. |
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/transformer#set_annotations). |
@ -394,21 +419,23 @@ Split a `TransformerData` object that represents a batch into a list with one
| ----------- | ----------------------- | ----------- |
| **RETURNS** | `List[TransformerData]` | |
## Span getters {#span_getters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}
<!-- TODO: details on what this is for -->
## Span getters {#span_getters source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}
Span getters are functions that take a batch of [`Doc`](/api/doc) objects and
return a lists of [`Span`](/api/span) objects for each doc, to be processed by
the transformer. The returned spans can overlap. Span getters can be referenced
in the config's `[components.transformer.model.get_spans]` block to customize
the sequences processed by the transformer. You can also register custom span
getters using the `@registry.span_getters` decorator.
the transformer. This is used to manage long documents, by cutting them into
smaller sequences before running the transformer. The spans are allowed to
overlap, and you can also omit sections of the Doc if they are not relevant.
Span getters can be referenced in the `[components.transformer.model.get_spans]`
block of the config to customize the sequences processed by the transformer. You
can also register custom span getters using the `@spacy.registry.span_getters`
decorator.
> #### Example
>
> ```python
> @registry.span_getters("sent_spans.v1")
> @spacy.registry.span_getters("sent_spans.v1")
> def configure_get_sent_spans() -> Callable:
>     def get_sent_spans(docs: Iterable[Doc]) -> List[List[Span]]:
>         return [list(doc.sents) for doc in docs]
@ -421,15 +448,55 @@ getters using the `@registry.span_getters` decorator.
| `docs` | `Iterable[Doc]` | A batch of `Doc` objects. |
| **RETURNS** | `List[List[Span]]` | The spans to process by the transformer. |
The following built-in functions are available:
### doc_spans.v1 {#doc_spans tag="registered function"}
<!-- TODO: finish API docs -->
> #### Example config
>
> ```ini
> [transformer.model.get_spans]
> @span_getters = "doc_spans.v1"
> ```
| Name | Description |
| ------------------ | ------------------------------------------------------------------ |
| `doc_spans.v1` | Create a span for each doc (no transformation, process each text). |
| `sent_spans.v1` | Create a span for each sentence if sentence boundaries are set. |
| `strided_spans.v1` | |
Create a span getter that uses the whole document as its spans. This is the best
approach if your [`Doc`](/api/doc) objects already refer to relatively short
texts.
### sent_spans.v1 {#sent_spans tag="registered function"}
> #### Example config
>
> ```ini
> [transformer.model.get_spans]
> @span_getters = "sent_spans.v1"
> ```
Create a span getter that uses sentence boundary markers to extract the spans.
This requires sentence boundaries to be set (e.g. by the
[`Sentencizer`](/api/sentencizer)), and may result in somewhat uneven batches,
depending on the sentence lengths. However, it does provide the transformer with
more meaningful windows to attend over.
### strided_spans.v1 {#strided_spans tag="registered function"}
> #### Example config
>
> ```ini
> [transformer.model.get_spans]
> @span_getters = "strided_spans.v1"
> window = 128
> stride = 96
> ```
Create a span getter for strided spans. If you set the `window` and `stride` to
the same value, the spans will cover each token once. Setting `stride` lower
than `window` will allow for an overlap, so that some tokens are counted twice.
This can be desirable, because it allows all tokens to have both a left and
right context.
| Name | Type | Description |
| --------- | ---- | ---------------- |
|  `window` | int | The window size. |
| `stride` | int | The stride size. |
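A small worked example of how the window and stride interact (plain Python, not the spaCy internals):

```python
# With window=128 and stride=96, consecutive spans overlap by 32 tokens,
# so tokens near a span boundary still get context from both sides.
window, stride = 128, 96
doc_length = 300
spans = [(start, min(start + window, doc_length))
         for start in range(0, doc_length, stride)]
print(spans)  # [(0, 128), (96, 224), (192, 300), (288, 300)]
```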
## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"}

View File

@ -1,54 +1,88 @@
The central data structures in spaCy are the `Doc` and the `Vocab`. The `Doc`
object owns the **sequence of tokens** and all their annotations. The `Vocab`
object owns a set of **look-up tables** that make common information available
across documents. By centralizing strings, word vectors and lexical attributes,
we avoid storing multiple copies of this data. This saves memory, and ensures
there's a **single source of truth**.
The central data structures in spaCy are the [`Language`](/api/language) class,
the [`Vocab`](/api/vocab) and the [`Doc`](/api/doc) object. The `Language` class
is used to process a text and turn it into a `Doc` object. It's typically stored
as a variable called `nlp`. The `Doc` object owns the **sequence of tokens** and
all their annotations. By centralizing strings, word vectors and lexical
attributes in the `Vocab`, we avoid storing multiple copies of this data. This
saves memory, and ensures there's a **single source of truth**.
Text annotations are also designed to allow a single source of truth: the `Doc`
object owns the data, and `Span` and `Token` are **views that point into it**.
The `Doc` object is constructed by the `Tokenizer`, and then **modified in
place** by the components of the pipeline. The `Language` object coordinates
these components. It takes raw text and sends it through the pipeline, returning
an **annotated document**. It also orchestrates training and serialization.
object owns the data, and [`Span`](/api/span) and [`Token`](/api/token) are
**views that point into it**. The `Doc` object is constructed by the
[`Tokenizer`](/api/tokenizer), and then **modified in place** by the components
of the pipeline. The `Language` object coordinates these components. It takes
raw text and sends it through the pipeline, returning an **annotated document**.
It also orchestrates training and serialization.
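For example (a minimal sketch):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("Bill Gates founded Microsoft.")
span = doc[0:2]           # a Span is a view into the Doc
token = doc[2]            # a Token is a view into the Doc
assert span.text == "Bill Gates"
assert token.doc is doc   # views point back to the single source of truth
```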
<!-- TODO: update architecture and tables below to match sidebar in API docs etc. -->
<!-- TODO: update graphic -->
![Library architecture](../../images/architecture.svg)
### Container objects {#architecture-containers}
| Name | Description |
| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [`Language`](/api/language) | Processing class that turns text into `Doc` objects. Different languages implement their own subclasses of it. The variable is typically called `nlp`. |
| [`Doc`](/api/doc) | A container for accessing linguistic annotations. |
| [`Span`](/api/span) | A slice from a `Doc` object. |
| [`Token`](/api/token) | An individual token — i.e. a word, punctuation symbol, whitespace, etc. |
| [`Lexeme`](/api/lexeme) | An entry in the vocabulary. It's a word type with no context, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse etc. |
| [`MorphAnalysis`](/api/morphanalysis) | A morphological analysis. |
| [`Example`](/api/example) | A collection of training annotations, containing two `Doc` objects: the reference data and the predictions. |
| [`DocBin`](/api/docbin) | A collection of `Doc` objects for efficient binary serialization. Also used for [training data](/api/data-formats#binary-training). |
### Processing pipeline {#architecture-pipeline}
The processing pipeline consists of one or more **pipeline components** that are
called on the `Doc` in order. The tokenizer runs before the components. Pipeline
components can be added using [`Language.add_pipe`](/api/language#add_pipe).
They can contain a statistical model and trained weights, or only make
rule-based modifications to the `Doc`. spaCy provides a range of built-in
components for different language processing tasks and also allows adding
[custom components](/usage/processing-pipelines#custom-components).
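For example (a minimal sketch using a rule-based component that needs no training):

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")  # components are added by their string names
doc = nlp("This is a sentence. This is another one.")
print([sent.text for sent in doc.sents])
```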
![The processing pipeline](../../images/pipeline.svg)
| Name | Description |
| ------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
| [`Language`](/api/language) | A text-processing pipeline. Usually you'll load this once per process as `nlp` and pass the instance around your application. |
| [`Tokenizer`](/api/tokenizer) | Segment text, and create `Doc` objects with the discovered segment boundaries. |
| ----------------------------------------------- | ------------------------------------------------------------------------------------------- |
| [`Tokenizer`](/api/tokenizer) | Segment raw text and create `Doc` objects from the words. |
| [`Tok2Vec`](/api/tok2vec) | Apply a "token-to-vector" model and set its outputs. |
| [`Transformer`](/api/transformer) | Use a transformer model and set its outputs. |
| [`Lemmatizer`](/api/lemmatizer) | Determine the base forms of words. |
| [`Morphology`](/api/morphology) | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag. |
| [`Tagger`](/api/tagger) | Annotate part-of-speech tags on `Doc` objects. |
| [`DependencyParser`](/api/dependencyparser) | Annotate syntactic dependencies on `Doc` objects. |
| [`EntityRecognizer`](/api/entityrecognizer) | Annotate named entities, e.g. persons or products, on `Doc` objects. |
| [`TextCategorizer`](/api/textcategorizer) | Assign categories or labels to `Doc` objects. |
| [`Morphologizer`](/api/morphologizer) | Predict morphological features and coarse-grained part-of-speech tags. |
| [`Tagger`](/api/tagger) | Predict part-of-speech tags. |
| [`AttributeRuler`](/api/attributeruler) | Set token attributes using matcher rules. |
| [`DependencyParser`](/api/dependencyparser) | Predict syntactic dependencies. |
| [`EntityRecognizer`](/api/entityrecognizer) | Predict named entities, e.g. persons or products. |
| [`EntityRuler`](/api/entityruler) | Add entity spans to the `Doc` using token-based rules or exact phrase matches. |
| [`EntityLinker`](/api/entitylinker) | Disambiguate named entities to nodes in a knowledge base. |
| [`TextCategorizer`](/api/textcategorizer) | Predict categories or labels over the whole document. |
| [`Sentencizer`](/api/sentencizer) | Implement rule-based sentence boundary detection that doesn't require the dependency parse. |
| [`SentenceRecognizer`](/api/sentencerecognizer) | Predict sentence boundaries. |
| [Other functions](/api/pipeline-functions) | Automatically apply something to the `Doc`, e.g. to merge spans of tokens. |
| [`Pipe`](/api/pipe) | Base class that all trainable pipeline components inherit from. |
### Matchers {#architecture-matchers}
Matchers help you find and extract information from [`Doc`](/api/doc) objects
based on match patterns describing the sequences you're looking for. A matcher
operates on a `Doc` and gives you access to the matched tokens **in context**.
| Name | Description |
| --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [`Matcher`](/api/matcher) | Match sequences of tokens, based on pattern rules, similar to regular expressions. |
| [`PhraseMatcher`](/api/phrasematcher) | Match sequences of tokens based on phrases. |
| [`DependencyMatcher`](/api/dependencymatcher) | Match sequences of tokens based on dependency trees using the [Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html). |
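For example, here's a minimal sketch of the token-based `Matcher`, assuming a blank English pipeline and a made-up pattern name:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# Match the token "hello" (case-insensitive) followed by a punctuation token
matcher.add("HelloPattern", [[{"LOWER": "hello"}, {"IS_PUNCT": True}]])
doc = nlp("Hello, world!")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)
```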
### Other classes {#architecture-other}
| Name | Description |
| ------------------------------------- | ---------------------------------------------------------------------------------------------------------------- |
| [`Vocab`](/api/vocab) | The shared vocabulary that stores strings and gives you access to [`Lexeme`](/api/lexeme) objects. |
| [`StringStore`](/api/stringstore) | Map strings to and from hash values. |
| [`Vectors`](/api/vectors) | Container class for vector data keyed by string. |
| [`Example`](/api/example) | Collection for training annotations. |
| [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. |
| [`Morphology`](/api/morphology) | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag. |
| [`MorphAnalysis`](/api/morphanalysis) | A morphological analysis. |
| [`KnowledgeBase`](/api/kb) | Storage for entities and aliases of a knowledge base for entity linking. |
| [`Scorer`](/api/scorer) | Compute evaluation scores. |
| [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. |
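For example, here's a minimal sketch of how the shared `Vocab` and `StringStore` interact, assuming a blank English pipeline:

```python
import spacy

nlp = spacy.blank("en")
# Looking up a lexeme adds the string to the shared vocab and string store
lexeme = nlp.vocab["coffee"]
print(lexeme.text, lexeme.is_alpha)      # coffee True
# The StringStore maps strings to 64-bit hashes and back
coffee_hash = nlp.vocab.strings["coffee"]
print(coffee_hash == lexeme.orth)        # True
print(nlp.vocab.strings[coffee_hash])    # coffee
```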

View File

@ -750,16 +750,13 @@ print([w.text for w in nlp("gimme that")]) # ['gim', 'me', 'that']
The special case doesn't have to match an entire whitespace-delimited substring.
The tokenizer will incrementally split off punctuation, and keep looking up the
remaining substring:
```python
assert "gimme" not in [w.text for w in nlp("gimme!")]
assert "gimme" not in [w.text for w in nlp('("...gimme...?")')]
```
The special case rules have precedence over the punctuation splitting:
```python
nlp.tokenizer.add_special_case("...gimme...?", [{"ORTH": "...gimme...?"}])
assert len(nlp("...gimme...?")) == 1
```
@ -813,19 +810,6 @@ domain. There are six things you may need to define:
6. An optional boolean function `url_match`, which is similar to `token_match`
except that prefixes and suffixes are removed before applying the match.
<Infobox title="Important note: token match in spaCy v2.2" variant="warning">
In spaCy v2.2.2-v2.2.4, the `token_match` was equivalent to the `url_match`
above and there was no match pattern applied before prefixes and suffixes were
analyzed. As of spaCy v2.3.0, the `token_match` has been reverted to its
behavior in v2.2.1 and earlier with precedence over prefixes and suffixes.
The `url_match` is introduced in v2.3.0 to handle cases like URLs where the
tokenizer should remove prefixes and suffixes (e.g., a comma at the end of a
URL) before applying the match.
</Infobox>
You shouldn't usually need to create a `Tokenizer` subclass. Standard usage is
to use `re.compile()` to build a regular expression object, and pass its
`.search()` and `.finditer()` methods:
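A minimal sketch of that approach might look like this. The regular expressions below are purely illustrative and not spaCy's default rules:

```python
import re
import spacy
from spacy.tokenizer import Tokenizer

# Illustrative patterns only, not spaCy's default punctuation rules
prefix_re = re.compile(r'''^[\[\("']''')
suffix_re = re.compile(r'''[\]\)"']$''')
infix_re = re.compile(r'''[-~]''')

def custom_tokenizer(nlp):
    return Tokenizer(nlp.vocab, rules=nlp.Defaults.tokenizer_exceptions,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer)

nlp = spacy.blank("en")
nlp.tokenizer = custom_tokenizer(nlp)
```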
@ -905,12 +889,13 @@ function that behaves the same way.
<Infobox title="Important note" variant="warning">
If you're using a statistical model, writing to the
[`nlp.Defaults`](/api/language#defaults) or `English.Defaults` directly won't
work, since the regular expressions are read from the model and will be compiled
when you load it. If you modify `nlp.Defaults`, you'll only see the effect if
you call [`spacy.blank`](/api/top-level#spacy.blank). If you want to modify the
tokenizer loaded from a statistical model, you should modify `nlp.tokenizer`
directly.
</Infobox>
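For example, here's a minimal sketch of extending the suffix rules of a loaded pipeline's tokenizer. It assumes a v3-compatible `en_core_web_sm` package is installed, and the added pattern is only illustrative:

```python
import spacy
from spacy.util import compile_suffix_regex

nlp = spacy.load("en_core_web_sm")
# Extend the existing suffix rules and recompile the suffix regex
suffixes = list(nlp.Defaults.suffixes) + [r"-+$"]
suffix_regex = compile_suffix_regex(suffixes)
nlp.tokenizer.suffix_search = suffix_regex.search
```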
@ -961,51 +946,50 @@ and language-specific definitions such as
[`lang/de/punctuation.py`](https://github.com/explosion/spaCy/blob/master/spacy/lang/de/punctuation.py)
for German.
### Hooking a custom tokenizer into the pipeline {#custom-tokenizer}
The tokenizer is the first component of the processing pipeline and the only one
that can't be replaced by writing to `nlp.pipeline`. This is because it has a
different signature from all the other components: it takes a text and returns a
[`Doc`](/api/doc), whereas all other components expect to already receive a
tokenized `Doc`.
![The processing pipeline](../images/pipeline.svg)
To overwrite the existing tokenizer, you need to replace `nlp.tokenizer` with a
custom function that takes a text, and returns a [`Doc`](/api/doc).
> #### Creating a Doc
>
> Constructing a [`Doc`](/api/doc) object manually requires at least two
> arguments: the shared `Vocab` and a list of words. Optionally, you can pass in
> a list of `spaces` values indicating whether the token at this position is
> followed by a space (default `True`). See the section on
> [pre-tokenized text](#own-annotations) for more info.
>
> ```python
> words = ["Let", "'s", "go", "!"]
> spaces = [False, True, False, False]
> doc = Doc(nlp.vocab, words=words, spaces=spaces)
> ```
```python
nlp = spacy.blank("en")
nlp.tokenizer = my_tokenizer
```
| Argument | Type | Description |
| ----------- | ----------------- | ------------------------- |
| `text` | str | The raw text to tokenize. |
| **RETURNS** | [`Doc`](/api/doc) | The tokenized document. |
#### Example 1: Basic whitespace tokenizer {#custom-tokenizer-example}

<Infobox title="Important note: using a custom tokenizer" variant="warning">
In spaCy v1.x, you had to add a custom tokenizer by passing it to the `make_doc`
keyword argument, or by passing a tokenizer "factory" to `create_make_doc`. This
was unnecessarily complicated. Since spaCy v2.0, you can write to
`nlp.tokenizer` instead. If your tokenizer needs the vocab, you can write a
function and use `nlp.vocab`.
```diff
- nlp = spacy.load("en_core_web_sm", make_doc=my_tokenizer)
- nlp = spacy.load("en_core_web_sm", create_make_doc=my_tokenizer_factory)
+ nlp.tokenizer = my_tokenizer
+ nlp.tokenizer = my_tokenizer_factory(nlp.vocab)
```
</Infobox>
Here's an example of the most basic whitespace tokenizer. It takes the shared
vocab, so it can construct `Doc` objects. When it's called on a text, it returns
a `Doc` object consisting of the text split on single space characters. We can
then overwrite the `nlp.tokenizer` attribute with an instance of our custom
tokenizer.
```python
### {executable="true"}
@ -1017,68 +1001,189 @@ class WhitespaceTokenizer:
self.vocab = vocab
def __call__(self, text):
words = text.split(" ")
return Doc(self.vocab, words=words)
nlp = spacy.blank("en")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp("What's happened to me? he thought. It wasn't a dream.")
print([token.text for token in doc])
```
As you can see, we need a `Vocab` instance to construct this — but we won't have
it until we get back the loaded `nlp` object. The simplest solution is to build
the tokenizer in two steps. This also means that you can reuse the "tokenizer
factory" and initialize it with different instances of `Vocab`.
#### Example 2: Third-party tokenizers (BERT word pieces) {#custom-tokenizer-example2}
You can use the same approach to plug in any other third-party tokenizers. Your
custom callable just needs to return a `Doc` object with the tokens produced by
your tokenizer. In this example, the wrapper uses the **BERT word piece
tokenizer**, provided by the
[`tokenizers`](https://github.com/huggingface/tokenizers) library. The tokens
available in the `Doc` object returned by spaCy now match the exact word pieces
produced by the tokenizer.
> #### 💡 Tip: spacy-transformers
>
> If you're working with transformer models like BERT, check out the
> [`spacy-transformers`](https://github.com/explosion/spacy-transformers)
> extension package and [documentation](/usage/transformers). It includes a
> pipeline component for using pretrained transformer weights and **training
> transformer models** in spaCy, as well as helpful utilities for aligning word
> pieces to linguistic tokenization.
```python
### Custom BERT word piece tokenizer
from tokenizers import BertWordPieceTokenizer
from spacy.tokens import Doc
import spacy
class BertTokenizer:
def __init__(self, vocab, vocab_file, lowercase=True):
self.vocab = vocab
self._tokenizer = BertWordPieceTokenizer(vocab_file, lowercase=lowercase)
def __call__(self, text):
tokens = self._tokenizer.encode(text)
words = []
spaces = []
for i, (text, (start, end)) in enumerate(zip(tokens.tokens, tokens.offsets)):
words.append(text)
if i < len(tokens.tokens) - 1:
# If next start != current end we assume a space in between
next_start, next_end = tokens.offsets[i + 1]
spaces.append(next_start > end)
else:
spaces.append(True)
return Doc(self.vocab, words=words, spaces=spaces)
nlp = spacy.blank("en")
nlp.tokenizer = BertTokenizer(nlp.vocab, "bert-base-uncased-vocab.txt")
doc = nlp("Justin Drew Bieber is a Canadian singer, songwriter, and actor.")
print(doc.text, [token.text for token in doc])
# [CLS]justin drew bi##eber is a canadian singer, songwriter, and actor.[SEP]
# ['[CLS]', 'justin', 'drew', 'bi', '##eber', 'is', 'a', 'canadian', 'singer',
# ',', 'songwriter', ',', 'and', 'actor', '.', '[SEP]']
```
<Infobox title="Important note on tokenization and models" variant="warning">
Keep in mind that your model's result may be less accurate if the tokenization
during training differs from the tokenization at runtime. So if you modify a
pretrained model's tokenization afterwards, it may produce very different
predictions. You should therefore train your model with the **same tokenizer**
it will be using at runtime. See the docs on
[training with custom tokenization](#custom-tokenizer-training) for details.
</Infobox>
#### Training with custom tokenization {#custom-tokenizer-training new="3"}
spaCy's [training config](/usage/training#config) describes the settings,
hyperparameters, pipeline and tokenizer used for constructing and training the
model. The `[nlp.tokenizer]` block refers to a **registered function** that
takes the `nlp` object and returns a tokenizer. Here, we're registering a
function called `whitespace_tokenizer` in the
[`@tokenizers` registry](/api/registry). To make sure spaCy knows how to
construct your tokenizer during training, you can pass in your Python file by
setting `--code functions.py` when you run [`spacy train`](/api/cli#train).
> #### config.cfg
>
> ```ini
> [nlp.tokenizer]
> @tokenizers = "whitespace_tokenizer"
> ```
```python
### functions.py {highlight="1"}
@spacy.registry.tokenizers("whitespace_tokenizer")
def create_whitespace_tokenizer():
def create_tokenizer(nlp):
return WhitespaceTokenizer(nlp.vocab)
return create_tokenizer
```
Registered functions can also take arguments that are then passed in from the
config. This allows you to quickly change and keep track of different settings.
Here, the registered function called `bert_word_piece_tokenizer` takes two
arguments: the path to a vocabulary file and whether to lowercase the text. The
Python type hints `str` and `bool` ensure that the received values have the
correct type.
> #### config.cfg
>
> ```ini
> [nlp.tokenizer]
> @tokenizers = "bert_word_piece_tokenizer"
> vocab_file = "bert-base-uncased-vocab.txt"
> lowercase = true
> ```
```python
### functions.py {highlight="1"}
@spacy.registry.tokenizers("bert_word_piece_tokenizer")
def create_bert_tokenizer(vocab_file: str, lowercase: bool):
    def create_tokenizer(nlp):
        return BertTokenizer(nlp.vocab, vocab_file, lowercase)
    return create_tokenizer
```
To avoid hard-coding local paths into your config file, you can also set the
vocab path on the CLI by using the `--nlp.tokenizer.vocab_file`
[override](/usage/training#config-overrides) when you run
[`spacy train`](/api/cli#train). For more details on using registered functions,
see the docs in [training with custom code](/usage/training#custom-code).
<Infobox variant="warning">
Remember that a registered function should always be a function that spaCy
**calls to create something**, not the "something" itself. In this case, it
**creates a function** that takes the `nlp` object and returns a callable that
takes a text and returns a `Doc`.
</Infobox>
#### Using pre-tokenized text {#own-annotations}
spaCy generally assumes by default that your data is **raw text**. However,
sometimes your data is partially annotated, e.g. with pre-existing tokenization,
part-of-speech tags, etc. The most common situation is that you have
**pre-defined tokenization**. If you have a list of strings, you can create a
[`Doc`](/api/doc) object directly. Optionally, you can also specify a list of
boolean values, indicating whether each word is followed by a space.
> #### ✏️ Things to try
>
> 1. Change a boolean value in the list of `spaces`. You should see it reflected
> in the `doc.text` and whether the token is followed by a space.
> 2. Remove `spaces=spaces` from the `Doc`. You should see that every token is
> now followed by a space.
> 3. Copy-paste a random sentence from the internet and manually construct a
> `Doc` with `words` and `spaces` so that the `doc.text` matches the original
> input text.
```python
### {executable="true"}
import spacy
from spacy.tokens import Doc
nlp = spacy.blank("en")
words = ["Hello", ",", "world", "!"]
spaces = [False, True, False, False]
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)
print([(t.text, t.text_with_ws, t.whitespace_) for t in doc])
```
If provided, the spaces list must be the **same length** as the words list. The
spaces list affects the `doc.text`, `span.text`, `token.idx`, `span.start_char`
and `span.end_char` attributes. If you don't provide a `spaces` sequence, spaCy
will assume that all words are followed by a space.
```python
### {executable="true"}
import spacy
from spacy.tokens import Doc
from spacy.lang.en import English
nlp = English()
bad_spaces = Doc(nlp.vocab, words=["Hello", ",", "world", "!"])
good_spaces = Doc(nlp.vocab, words=["Hello", ",", "world", "!"],
spaces=[False, True, False, False])
print(bad_spaces.text) # 'Hello , world !'
print(good_spaces.text) # 'Hello, world!'
```
Once you have a [`Doc`](/api/doc) object, you can write to its attributes to set
the part-of-speech tags, syntactic dependencies, named entities and other
attributes. For details, see the respective usage pages.
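For example, here's a minimal sketch of setting a named entity on a manually constructed `Doc`. The words, spaces and entity label are only illustrative:

```python
import spacy
from spacy.tokens import Doc, Span

nlp = spacy.blank("en")
words = ["I", "like", "London", "."]
spaces = [True, True, False, False]
doc = Doc(nlp.vocab, words=words, spaces=spaces)
# Write annotations to the Doc, e.g. a named entity span over "London"
doc.ents = [Span(doc, 2, 3, label="GPE")]
print([(ent.text, ent.label_) for ent in doc.ents])  # [('London', 'GPE')]
```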
#### Aligning tokenization {#aligning-tokenization}
spaCy's tokenization is non-destructive and uses language-specific rules
optimized for compatibility with treebank annotations. Other tools and resources

View File

@ -979,8 +979,8 @@ added via [`nlp.add_pipe`](/api/language#add_pipe). When the `nlp` object is
called on a text, it will find matches in the `doc` and add them as entities to
the `doc.ents`, using the specified pattern label as the entity label. If any
matches were to overlap, the pattern matching most tokens takes priority. If
they also happen to be equally long, then the match occurring first in the `Doc`
is chosen.
```python
### {executable="true"}

View File

@ -6,26 +6,98 @@ menu:
- ['New Features', 'features']
- ['Backwards Incompatibilities', 'incompat']
- ['Migrating from v2.x', 'migrating']
- ['Migrating plugins', 'plugins']
---
## Summary {#summary}
## New Features {#features}
### New training workflow and config system {#features-training}
### Transformer-based pipelines {#features-transformers}
### Custom models using any framework {#feautres-custom-models}
### Manage end-to-end workflows with projects {#features-projects}
### New built-in pipeline components {#features-pipeline-components}
| Name | Description |
| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| [`SentenceRecognizer`](/api/sentencerecognizer) | Trainable component for sentence segmentation. |
| [`Morphologizer`](/api/morphologizer) | Trainable component to predict morphological features. |
| [`Lemmatizer`](/api/lemmatizer) | Standalone component for rule-based and lookup lemmatization. |
| [`AttributeRuler`](/api/attributeruler) | Component for setting token attributes using match patterns. |
| [`Transformer`](/api/transformer) | Component for using [transformer models](/usage/transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). |
### New and improved pipeline component APIs {#features-components}
- `Language.factory`, `Language.component` (see the sketch below)
- `Language.analyze_pipes`
- Adding components from other models
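For example, a minimal sketch of the new decorator-based component API. The component name `print_doc_length` is made up for illustration:

```python
import spacy
from spacy.language import Language

@Language.component("print_doc_length")
def print_doc_length(doc):
    # Stateless function components take a Doc and return it
    print("Doc length:", len(doc))
    return doc

nlp = spacy.blank("en")
# add_pipe now takes the registered string name of the component
nlp.add_pipe("print_doc_length")
nlp("Hello world")
print(nlp.pipe_names)  # ['print_doc_length']
```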
### Type hints and type-based data validation {#features-types}
spaCy v3.0 officially drops support for Python 2 and now requires **Python
3.6+**. This also means that the code base can take full advantage of
[type hints](https://docs.python.org/3/library/typing.html). spaCy's user-facing
API that's implemented in pure Python (as opposed to Cython) now comes with type
hints. The new version of spaCy's machine learning library
[Thinc](https://thinc.ai) also features extensive
[type support](https://thinc.ai/docs/usage-type-checking/), including custom
types for models and arrays, and a custom `mypy` plugin that can be used to
type-check model definitions.
For data validation, spaCy v3.0 adopts
[`pydantic`](https://github.com/samuelcolvin/pydantic). It also powers the data
validation of Thinc's [config system](https://thinc.ai/docs/usage-config), which
lets you register **custom functions with typed arguments**, reference them
in your config and see validation errors if the argument values don't match.
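For example, a minimal sketch of a registered function with typed arguments. The registry entry `"my_linear_schedule.v1"` is a made-up name used for illustration:

```python
import spacy

@spacy.registry.schedules("my_linear_schedule.v1")
def my_linear_schedule(start: float, step: float):
    # Yields an infinite sequence of values, e.g. for a learning rate
    value = start
    while True:
        yield value
        value += step
```

In a config, this could then be referenced as `@schedules = "my_linear_schedule.v1"` with `start = 0.1` and `step = 0.05`. Passing a value like `start = "high"` would raise a validation error, because it doesn't match the `float` type hint.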
### CLI
| Name | Description |
| --------------------------------------- | -------------------------------------------------------------------------------------------------------- |
| [`init config`](/api/cli#init-config) | Initialize a [training config](/usage/training) file for a blank language or auto-fill a partial config. |
| [`debug config`](/api/cli#debug-config) | Debug a [training config](/usage/training) file and show validation errors. |
| [`project`](/api/cli#project) | Subcommand for cloning and running [spaCy projects](/usage/projects). |
## Backwards Incompatibilities {#incompat}
As always, we've tried to keep the breaking changes to a minimum and focus on
changes that were necessary to support the new features, fix problems or improve
usability. The following section lists the relevant changes to the user-facing
API. For specific examples of how to rewrite your code, check out the
[migration guide](#migrating).
### Compatibility {#incompat-compat}
- spaCy now requires **Python 3.6+**.
### API changes {#incompat-api}
- [`Language.add_pipe`](/api/language#add_pipe) now takes the **string name** of
the component factory instead of the component function.
- **Custom pipeline components** now need to be decorated with the
[`@Language.component`](/api/language#component) or
[`@Language.factory`](/api/language#factory) decorator.
- [`Language.update`](/api/language#update) now takes a batch of
[`Example`](/api/example) objects instead of raw texts and annotations, or
`Doc` and `GoldParse` objects.
- The `Language.disable_pipes` contextmanager has been replaced by
[`Language.select_pipes`](/api/language#select_pipes), which can explicitly
disable or enable components (see the example below).
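For example, a minimal sketch of the new API, assuming a v3-compatible `en_core_web_sm` pipeline is installed:

```python
import spacy

nlp = spacy.load("en_core_web_sm")
# Temporarily disable the tagger and parser; they're restored after the block
with nlp.select_pipes(disable=["tagger", "parser"]):
    doc = nlp("Only the remaining components run on this text.")
print(nlp.pipe_names)
```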
### Removed or renamed API {#incompat-removed}
| Removed | Replacement |
| -------------------------------------------------------- | ----------------------------------------------------- |
| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes) |
| `GoldParse` | [`Example`](/api/example) |
| `GoldCorpus` | [`Corpus`](/api/corpus) |
| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) |
| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, model symlinks are deprecated |
### Removed deprecated methods, attributes and arguments {#incompat-removed-deprecated}
The following deprecated methods, attributes and arguments were removed in v3.0.
Most of them have been **deprecated for a while** and many would previously
raise errors. Many of them were also mostly internals. If you've been working
@ -214,17 +286,14 @@ python -m spacy package ./model ./packages
- python setup.py sdist
```
#### Migration notes for plugin maintainers {#migrating-plugins}
Thanks to everyone who's been contributing to the spaCy ecosystem by developing
and maintaining one of the many awesome [plugins and extensions](/universe).
We've tried to make it as easy as possible for you to upgrade your packages for
spaCy v3. The most common use case for plugins is providing pipeline components
and extension attributes. When migrating your plugin, double-check the
following:
- Use the [`@Language.factory`](/api/language#factory) decorator to register
your component and assign it a name. This allows users to refer to your

View File

@ -11,7 +11,7 @@ import Link from './link'
import GitHubCode from './github'
import classes from '../styles/code.module.sass'
const WRAP_THRESHOLD = 16
export default props => (
<Pre>