Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 01:48:04 +03:00)

Merge branch 'develop' into nightly.spacy.io

Commit 24f5fe8839
@@ -36,3 +36,44 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+scikit-learn
+------------
+
+* Files: scorer.py
+
+The following implementation of roc_auc_score() is adapted from
+scikit-learn, which is distributed under the following license:
+
+New BSD License
+
+Copyright (c) 2007–2019 The scikit-learn developers.
+All rights reserved.
+
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+  a. Redistributions of source code must retain the above copyright notice,
+     this list of conditions and the following disclaimer.
+  b. Redistributions in binary form must reproduce the above copyright
+     notice, this list of conditions and the following disclaimer in the
+     documentation and/or other materials provided with the distribution.
+  c. Neither the name of the Scikit-learn Developers  nor the names of
+     its contributors may be used to endorse or promote products
+     derived from this software without specific prior written
+     permission.
+
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGE.

@@ -35,7 +35,7 @@ def download_cli(
 
 
 def download(model: str, direct: bool = False, *pip_args) -> None:
-    if not is_package("spacy") and "--no-deps" not in pip_args:
+    if not (is_package("spacy") or is_package("spacy-nightly")) and "--no-deps" not in pip_args:
         msg.warn(
             "Skipping pipeline package dependencies and setting `--no-deps`. "
             "You don't seem to have the spaCy package itself installed "
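For orientation: the hunk above widens the CLI's installed-package check so that the nightly distribution also counts as having spaCy installed. A standalone sketch of what an `is_package`-style check does, using only the standard library (spaCy's real helper lives in `spacy.util`; the names below are illustrative):

```python
# Illustrative stand-in for an is_package-style check, built on the stdlib.
from importlib.metadata import version, PackageNotFoundError

def is_installed(name: str) -> bool:
    try:
        version(name)  # raises PackageNotFoundError if not installed
        return True
    except PackageNotFoundError:
        return False

# The patched condition: only skip dependency resolution when neither the
# stable nor the nightly spaCy distribution is present.
skip_deps = not (is_installed("spacy") or is_installed("spacy-nightly"))
```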

@@ -103,6 +103,9 @@ def package(
             )
     Path.mkdir(package_path, parents=True)
     shutil.copytree(str(input_dir), str(package_path / model_name_v))
+    license_path = package_path / model_name_v / "LICENSE"
+    if license_path.exists():
+        shutil.move(str(license_path), str(main_path))
     create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
     create_file(main_path / "setup.py", TEMPLATE_SETUP)
     create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)

@@ -238,7 +241,7 @@ if __name__ == '__main__':
 
 TEMPLATE_MANIFEST = """
 include meta.json
-include config.cfg
+include LICENSE
 """.strip()
 
 
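Taken together, these two hunks make the packaging step carry an existing LICENSE file from the trained pipeline directory up into the package root, where the updated MANIFEST.in template will include it in the built archive. A minimal sketch of the moved-file logic, with illustrative path arguments:

```python
# Sketch of the new LICENSE handling (illustrative names). If the trained
# pipeline directory ships a LICENSE file, it is hoisted to the package
# root, where MANIFEST.in's "include LICENSE" picks it up at build time.
import shutil
from pathlib import Path

def move_license(package_path: Path, model_name_v: str, main_path: Path) -> None:
    license_path = package_path / model_name_v / "LICENSE"
    if license_path.exists():  # only move it if the pipeline ships one
        shutil.move(str(license_path), str(main_path))
```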

@@ -125,8 +125,9 @@ class Warnings:
 class Errors:
     E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
     E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
-            "This usually happens when spaCy calls `nlp.{method}` with custom "
+            "This usually happens when spaCy calls `nlp.{method}` with a custom "
             "component name that's not registered on the current language class. "
+            "If you're using a Transformer, make sure to install 'spacy-transformers'. "
             "If you're using a custom component, make sure you've added the "
             "decorator `@Language.component` (for function components) or "
             "`@Language.factory` (for class components).\n\nAvailable "

@@ -67,9 +67,6 @@ class Morphologizer(Tagger):
         vocab: Vocab,
         model: Model,
         name: str = "morphologizer",
-        *,
-        labels_morph: Optional[dict] = None,
-        labels_pos: Optional[dict] = None,
     ):
         """Initialize a morphologizer.
 

@@ -77,8 +74,6 @@ class Morphologizer(Tagger):
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
-        labels_morph (dict): Mapping of morph + POS tags to morph labels.
-        labels_pos (dict): Mapping of morph + POS tags to POS tags.
 
         DOCS: https://nightly.spacy.io/api/morphologizer#init
         """

@@ -90,7 +85,7 @@ class Morphologizer(Tagger):
         # store mappings from morph+POS labels to token-level annotations:
         # 1) labels_morph stores a mapping from morph+POS->morph
         # 2) labels_pos stores a mapping from morph+POS->POS
-        cfg = {"labels_morph": labels_morph or {}, "labels_pos": labels_pos or {}}
+        cfg = {"labels_morph": {}, "labels_pos": {}}
         self.cfg = dict(sorted(cfg.items()))
 
     @property

@@ -47,7 +47,7 @@ class MultitaskObjective(Tagger):
     side-objective.
     """
 
-    def __init__(self, vocab, model, name="nn_labeller", *, labels, target):
+    def __init__(self, vocab, model, name="nn_labeller", *, target):
         self.vocab = vocab
         self.model = model
         self.name = name

@@ -67,7 +67,7 @@ class MultitaskObjective(Tagger):
             self.make_label = target
         else:
             raise ValueError(Errors.E016)
-        cfg = {"labels": labels or {}, "target": target}
+        cfg = {"labels": {}, "target": target}
         self.cfg = dict(cfg)
 
     @property

@@ -81,15 +81,18 @@ class MultitaskObjective(Tagger):
     def set_annotations(self, docs, dep_ids):
         pass
 
-    def initialize(self, get_examples, nlp=None):
+    def initialize(self, get_examples, nlp=None, labels=None):
         if not hasattr(get_examples, "__call__"):
             err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
             raise ValueError(err)
-        for example in get_examples():
-            for token in example.y:
-                label = self.make_label(token)
-                if label is not None and label not in self.labels:
-                    self.labels[label] = len(self.labels)
+        if labels is not None:
+            self.labels = labels
+        else:
+            for example in get_examples():
+                for token in example.y:
+                    label = self.make_label(token)
+                    if label is not None and label not in self.labels:
+                        self.labels[label] = len(self.labels)
         self.model.initialize()   # TODO: fix initialization by defining X and Y
 
     def predict(self, docs):
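The new `labels` argument to `initialize()` lets a caller hand over a precomputed label inventory instead of rescanning the training data. The pattern in isolation, as a runnable sketch with illustrative names:

```python
# Standalone sketch of the labels-override pattern introduced above.
def build_label_table(get_examples, make_label, labels=None):
    if labels is not None:
        return dict(labels)  # trust the caller-provided inventory
    table = {}
    for example in get_examples():
        for token in example.y:
            label = make_label(token)
            if label is not None and label not in table:
                table[label] = len(table)  # assign the next free index
    return table
```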

@@ -61,14 +61,13 @@ class Tagger(TrainablePipe):
 
     DOCS: https://nightly.spacy.io/api/tagger
     """
-    def __init__(self, vocab, model, name="tagger", *, labels=None):
+    def __init__(self, vocab, model, name="tagger"):
         """Initialize a part-of-speech tagger.
 
         vocab (Vocab): The shared vocabulary.
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
-        labels (List): The set of labels. Defaults to None.
 
         DOCS: https://nightly.spacy.io/api/tagger#init
         """

@@ -76,7 +75,7 @@ class Tagger(TrainablePipe):
         self.model = model
         self.name = name
         self._rehearsal_model = None
-        cfg = {"labels": labels or []}
+        cfg = {"labels": []}
         self.cfg = dict(sorted(cfg.items()))
 
     @property
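With `labels` gone from the constructor, a tagger now starts with an empty label set, and the inventory is supplied while the pipeline is assembled, for instance with `add_label()` before `nlp.initialize()`. A minimal usage sketch, assuming the v3 nightly API and illustrative tag names:

```python
import spacy

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")
for tag in ("NOUN", "VERB"):   # illustrative tag set
    tagger.add_label(tag)
nlp.initialize()               # cfg["labels"] is populated here, not in __init__
```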

@@ -720,44 +720,10 @@ def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]:
         }
 
 
-#############################################################################
-#
 # The following implementation of roc_auc_score() is adapted from
-# scikit-learn, which is distributed under the following license:
-#
-# New BSD License
-#
+# scikit-learn, which is distributed under the New BSD License.
 # Copyright (c) 2007–2019 The scikit-learn developers.
-# All rights reserved.
+# See licenses/3rd_party_licenses.txt
-#
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-#   a. Redistributions of source code must retain the above copyright notice,
-#      this list of conditions and the following disclaimer.
-#   b. Redistributions in binary form must reproduce the above copyright
-#      notice, this list of conditions and the following disclaimer in the
-#      documentation and/or other materials provided with the distribution.
-#   c. Neither the name of the Scikit-learn Developers  nor the names of
-#      its contributors may be used to endorse or promote products
-#      derived from this software without specific prior written
-#      permission.
-#
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
-# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
-# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
-# DAMAGE.
-
-
 def _roc_auc_score(y_true, y_score):
     """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
     from prediction scores.
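This hunk only trims the inline license text down to a pointer at `licenses/3rd_party_licenses.txt`; the adapted `_roc_auc_score()` itself is unchanged. For orientation, a tiny self-contained illustration of what ROC AUC measures, via the rank-based definition (a sketch, not spaCy's or scikit-learn's implementation):

```python
# AUC as the probability that a random positive outranks a random negative,
# with ties counted as half. Binary labels only; illustrative code.
def roc_auc(y_true, y_score):
    pos = [s for t, s in zip(y_true, y_score) if t == 1]
    neg = [s for t, s in zip(y_true, y_score) if t == 0]
    pairs = [(p > n) + 0.5 * (p == n) for p in pos for n in neg]
    return sum(pairs) / len(pairs)

assert roc_auc([0, 0, 1, 1], [0.1, 0.4, 0.35, 0.8]) == 0.75
```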

@@ -1,35 +1,38 @@
-from thinc.api import fix_random_seed
+import pytest
+from thinc.api import Config, fix_random_seed
+
 from spacy.lang.en import English
+from spacy.pipeline.textcat import default_model_config, bow_model_config
+from spacy.pipeline.textcat import cnn_model_config
 from spacy.tokens import Span
 from spacy import displacy
 from spacy.pipeline import merge_entities
+from spacy.training import Example
 
 
-def test_issue5551():
+@pytest.mark.parametrize(
+    "textcat_config", [default_model_config, bow_model_config, cnn_model_config]
+)
+def test_issue5551(textcat_config):
     """Test that after fixing the random seed, the results of the pipeline are truly identical"""
     component = "textcat"
-    pipe_cfg = {
-        "model": {
-            "@architectures": "spacy.TextCatBOW.v1",
-            "exclusive_classes": True,
-            "ngram_size": 2,
-            "no_output_layer": False,
-        }
-    }
+
+    pipe_cfg = Config().from_str(textcat_config)
     results = []
     for i in range(3):
         fix_random_seed(0)
         nlp = English()
-        example = (
-            "Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g.",
-            {"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}},
-        )
+        text = "Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g."
+        annots = {"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}}
         pipe = nlp.add_pipe(component, config=pipe_cfg, last=True)
-        for label in set(example[1]["cats"]):
+        for label in set(annots["cats"]):
             pipe.add_label(label)
+        # Train
         nlp.initialize()
+        doc = nlp.make_doc(text)
+        nlp.update([Example.from_dict(doc, annots)])
         # Store the result of each iteration
-        result = pipe.model.predict([nlp.make_doc(example[0])])
+        result = pipe.model.predict([doc])
         results.append(list(result[0]))
     # All results should be the same because of the fixed seed
     assert len(results) == 3
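The regression test relies on `fix_random_seed` making repeated runs bit-for-bit identical. Reduced to its core, the determinism pattern looks like this (a sketch; `fix_random_seed` seeds Python's and NumPy's generators, plus GPU frameworks when available):

```python
import numpy
from thinc.api import fix_random_seed

draws = []
for _ in range(3):
    fix_random_seed(0)  # reset all RNGs before each run
    draws.append(numpy.random.random(5).tolist())
# Identical seeds must give identical results
assert draws[0] == draws[1] == draws[2]
```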

@@ -72,6 +72,10 @@ def test_readers():
 def test_cat_readers(reader, additional_config):
     nlp_config_string = """
     [training]
+    seed = 0
+
+    [training.score_weights]
+    cats_macro_auc = 1.0
 
     [corpora]
     @readers = "PLACEHOLDER"
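The embedded config string is parsed with thinc's `Config` elsewhere in the test; a minimal sketch of how the newly added `[training.score_weights]` section round-trips through it:

```python
from thinc.api import Config

# Config parses the INI-style string into nested, typed sections.
config = Config().from_str("""
[training]
seed = 0

[training.score_weights]
cats_macro_auc = 1.0
""")
assert config["training"]["seed"] == 0
assert config["training"]["score_weights"]["cats_macro_auc"] == 1.0
```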

@@ -92,9 +96,7 @@ def test_cat_readers(reader, additional_config):
     config["corpora"]["@readers"] = reader
     config["corpora"].update(additional_config)
     nlp = load_model_from_config(config, auto_fill=True)
-    T = registry.resolve(
-        nlp.config["training"].interpolate(), schema=ConfigSchemaTraining
-    )
+    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
     dot_names = [T["train_corpus"], T["dev_corpus"]]
     train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
     optimizer = T["optimizer"]

@@ -17,7 +17,7 @@ from ..ml.models.multi_task import build_cloze_multi_task_model
 from ..ml.models.multi_task import build_cloze_characters_multi_task_model
 from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
 from ..errors import Errors
-from ..util import registry, load_model_from_config, resolve_dot_names
+from ..util import registry, load_model_from_config, dot_to_object
 
 
 def pretrain(

@@ -38,7 +38,8 @@ def pretrain(
     _config = nlp.config.interpolate()
     T = registry.resolve(_config["training"], schema=ConfigSchemaTraining)
     P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain)
-    corpus = resolve_dot_names(_config, [P["corpus"]])[0]
+    corpus = dot_to_object(_config, P["corpus"])
+    corpus = registry.resolve({"corpus": corpus})["corpus"]
     batcher = P["batcher"]
     model = create_pretraining_model(nlp, P)
     optimizer = P["optimizer"]
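`dot_to_object` walks a dotted section path into the still-unresolved config, after which the corpus block is resolved on its own. A rough stand-in for the lookup on a toy config dict (the reader name is illustrative):

```python
# Rough stand-in for spacy.util.dot_to_object on a plain dict.
def dot_to_object(config: dict, path: str):
    node = config
    for key in path.split("."):
        node = node[key]
    return node

config = {"corpora": {"pretrain": {"@readers": "spacy.JsonlCorpus.v1"}}}
assert dot_to_object(config, "corpora.pretrain")["@readers"] == "spacy.JsonlCorpus.v1"
```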

@@ -143,10 +143,10 @@ argument that connects to the shared `tok2vec` component in the pipeline.
 
 Construct an embedding layer that separately embeds a number of lexical
 attributes using hash embedding, concatenates the results, and passes it through
-a feed-forward subnetwork to build a mixed representation. The features used
-can be configured with the `attrs` argument. The suggested attributes are
-`NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account
-some subword information, without construction a fully character-based
+a feed-forward subnetwork to build a mixed representation. The features used can
+be configured with the `attrs` argument. The suggested attributes are `NORM`,
+`PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account some
+subword information, without construction a fully character-based
 representation. If pretrained vectors are available, they can be included in the
 representation as well, with the vectors table will be kept static (i.e. it's
 not updated).

@@ -393,11 +393,12 @@ operate over wordpieces, which usually don't align one-to-one against spaCy
 tokens. The layer therefore requires a reduction operation in order to calculate
 a single token vector given zero or more wordpiece vectors.
 
 | Name          | Description                                                                                                                                                                                                                                                                                                                                      |
 | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `pooling`     | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. ~~Model[Ragged, Floats2d]~~                                                                                               |
 | `grad_factor` | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. ~~float~~                                                                    |
+| `upstream`    | A string to identify the "upstream" `Transformer` component to communicate with. By default, the upstream name is the wildcard string `"*"`, but you could also specify the name of the `Transformer` component. You'll almost never have multiple upstream `Transformer` components, so the wildcard string will almost always be fine. ~~str~~ |
 | **CREATES**   | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                                                                                           |
 
 ### spacy-transformers.Tok2VecTransformer.v1 {#Tok2VecTransformer}
 

@@ -563,7 +564,8 @@ from the linear model, where it is stored in `model.attrs["multi_label"]`.
 
 <Accordion title="spacy.TextCatEnsemble.v1 definition" spaced>
 
-The v1 was functionally similar, but used an internal `tok2vec` instead of taking it as argument.
+The v1 was functionally similar, but used an internal `tok2vec` instead of
+taking it as argument.
 
 | Name                 | Description                                                                                                                                                                                      |
 | -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |

@@ -66,9 +66,6 @@ shortcut for this and instantiate the component using its string name and
 | `vocab`        | The shared vocabulary. ~~Vocab~~                                                                                     |
 | `model`        | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
 | `name`         | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                  |
-| _keyword-only_ |                                                                                                                      |
-| `labels_morph` | Mapping of morph + POS tags to morph labels. ~~Dict[str, str]~~                                                      |
-| `labels_pos`   | Mapping of morph + POS tags to POS tags. ~~Dict[str, str]~~                                                          |
 
 ## Morphologizer.\_\_call\_\_ {#call tag="method"}
 

@@ -21,16 +21,12 @@ architectures and their arguments and hyperparameters.
 >
 > ```python
 > from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL
-> config = {
->    "set_morphology": False,
->    "model": DEFAULT_TAGGER_MODEL,
-> }
+> config = {"model": DEFAULT_TAGGER_MODEL}
 > nlp.add_pipe("tagger", config=config)
 > ```
 
 | Setting          | Description                                                                                                                                                                                                                                                                                            |
 | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `set_morphology` | Whether to set morphological features. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                                   |
 | `model`          | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
 
 ```python

@@ -63,8 +59,6 @@ shortcut for this and instantiate the component using its string name and
 | `vocab`          | The shared vocabulary. ~~Vocab~~                                                                                                                                                                                                                      |
 | `model`          | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ |
 | `name`           | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                                                                                                                                                   |
-| _keyword-only_   |                                                                                                                                                                                                                                                       |
-| `set_morphology` | Whether to set morphological features. ~~bool~~                                                                                                                                                                                                       |
 
 ## Tagger.\_\_call\_\_ {#call tag="method"}
 

@@ -502,7 +502,7 @@ with Model.define_operators({">>": chain}):
 
 ## Create new trainable components {#components}
 
-In addition to [swapping out](#swap-architectures) default models in built-in
+In addition to [swapping out](#swap-architectures) layers in existing
 components, you can also implement an entirely new,
 [trainable](/usage/processing-pipelines#trainable-components) pipeline component
 from scratch. This can be done by creating a new class inheriting from

@@ -523,20 +523,28 @@ overview of the `TrainablePipe` methods used by
 This section outlines an example use-case of implementing a **novel relation
 extraction component** from scratch. We'll implement a binary relation
 extraction method that determines whether or not **two entities** in a document
-are related, and if so, what type of relation. We'll allow multiple types of
-relations between two such entities (multi-label setting). There are two major
-steps required:
+are related, and if so, what type of relation connects them. We allow multiple
+types of relations between two such entities (a multi-label setting). There are
+two major steps required:
 
 1. Implement a [machine learning model](#component-rel-model) specific to this
-   task. It will have to extract candidates from a [`Doc`](/api/doc) and predict
-   a relation for the available candidate pairs.
-2. Implement a custom [pipeline component](#component-rel-pipe) powered by the
-   machine learning model that sets annotations on the [`Doc`](/api/doc) passing
-   through the pipeline.
+   task. It will have to extract candidate relation instances from a
+   [`Doc`](/api/doc) and predict the corresponding scores for each relation
+   label.
+2. Implement a custom [pipeline component](#component-rel-pipe) - powered by the
+   machine learning model from step 1 - that translates the predicted scores
+   into annotations that are stored on the [`Doc`](/api/doc) objects as they
+   pass through the `nlp` pipeline.
 
-<!-- TODO: <Project id="tutorials/ner-relations">
-
-</Project> -->
+<Project id="tutorials/rel_component">
+Run this example use-case by using our project template. It includes all the
+code to create the ML model and the pipeline component from scratch.
+It also contains two config files to train the model:
+one to run on CPU with a Tok2Vec layer, and one for the GPU using a transformer.
+The project applies the relation extraction component to identify biomolecular
+interactions in a sample dataset, but you can easily swap in your own dataset
+for your experiments in any other domain.
+</Project>
 
 #### Step 1: Implementing the Model {#component-rel-model}
 

@@ -552,41 +560,17 @@ matrix** (~~Floats2d~~) of predictions:
 > for details.
 
 ```python
-### Register the model architecture
-@registry.architectures.register("rel_model.v1")
+### The model architecture
+@spacy.registry.architectures.register("rel_model.v1")
 def create_relation_model(...) -> Model[List[Doc], Floats2d]:
     model = ...  # 👈 model will go here
     return model
 ```
 
-The first layer in this model will typically be an
-[embedding layer](/usage/embeddings-transformers) such as a
-[`Tok2Vec`](/api/tok2vec) component or a [`Transformer`](/api/transformer). This
-layer is assumed to be of type ~~Model[List[Doc], List[Floats2d]]~~ as it
-transforms each **document into a list of tokens**, with each token being
-represented by its embedding in the vector space.
-
-Next, we need a method that **generates pairs of entities** that we want to
-classify as being related or not. As these candidate pairs are typically formed
-within one document, this function takes a [`Doc`](/api/doc) as input and
-outputs a `List` of `Span` tuples. For instance, a very straightforward
-implementation would be to just take any two entities from the same document:
-
-```python
-### Simple candiate generation
-def get_candidates(doc: Doc) -> List[Tuple[Span, Span]]:
-    candidates = []
-    for ent1 in doc.ents:
-        for ent2 in doc.ents:
-            candidates.append((ent1, ent2))
-    return candidates
-```
-
-But we could also refine this further by **excluding relations** of an entity
-with itself, and posing a **maximum distance** (in number of tokens) between two
-entities. We register this function in the
-[`@misc` registry](/api/top-level#registry) so we can refer to it from the
-config, and easily swap it out for any other candidate generation function.
+We adapt a **modular approach** to the definition of this relation model, and
+define it as chaining two layers together: the first layer that generates an
+instance tensor from a given set of documents, and the second layer that
+transforms the instance tensor into a final tensor holding the predictions:
 
 > #### config.cfg (excerpt)
 >
| 
						 | 
					@ -594,18 +578,159 @@ config, and easily swap it out for any other candidate generation function.
 | 
				
			||||||
> [model]
 | 
					> [model]
 | 
				
			||||||
> @architectures = "rel_model.v1"
 | 
					> @architectures = "rel_model.v1"
 | 
				
			||||||
>
 | 
					>
 | 
				
			||||||
> [model.tok2vec]
 | 
					> [model.create_instance_tensor]
 | 
				
			||||||
> # ...
 | 
					> # ...
 | 
				
			||||||
>
 | 
					>
 | 
				
			||||||
> [model.get_candidates]
 | 
					> [model.classification_layer]
 | 
				
			||||||
> @misc = "rel_cand_generator.v1"
 | 
					> # ...
 | 
				
			||||||
> max_length = 20
 | 
					 | 
				
			||||||
> ```
 | 
					> ```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
```python
 | 
					```python
 | 
				
			||||||
### Extended candidate generation {highlight="1,2,7,8"}
 | 
					### The model architecture {highlight="6"}
 | 
				
			||||||
@registry.misc.register("rel_cand_generator.v1")
 | 
					@spacy.registry.architectures.register("rel_model.v1")
 | 
				
			||||||
def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
 | 
					def create_relation_model(
 | 
				
			||||||
 | 
					    create_instance_tensor: Model[List[Doc], Floats2d],
 | 
				
			||||||
 | 
					    classification_layer: Model[Floats2d, Floats2d],
 | 
				
			||||||
 | 
					) -> Model[List[Doc], Floats2d]:
 | 
				
			||||||
 | 
					    model = chain(create_instance_tensor, classification_layer)
 | 
				
			||||||
 | 
					    return model
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The `classification_layer` could be something like a
 | 
				
			||||||
 | 
					[Linear](https://thinc.ai/docs/api-layers#linear) layer followed by a
 | 
				
			||||||
 | 
					[logistic](https://thinc.ai/docs/api-layers#logistic) activation function:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					> #### config.cfg (excerpt)
 | 
				
			||||||
 | 
					>
 | 
				
			||||||
 | 
					> ```ini
 | 
				
			||||||
 | 
					> [model.classification_layer]
 | 
				
			||||||
 | 
					> @architectures = "rel_classification_layer.v1"
 | 
				
			||||||
 | 
					> nI = null
 | 
				
			||||||
 | 
					> nO = null
 | 
				
			||||||
 | 
					> ```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					```python
 | 
				
			||||||
 | 
					### The classification layer
 | 
				
			||||||
 | 
					@spacy.registry.architectures.register("rel_classification_layer.v1")
 | 
				
			||||||
 | 
					def create_classification_layer(
 | 
				
			||||||
 | 
					    nO: int = None, nI: int = None
 | 
				
			||||||
 | 
					) -> Model[Floats2d, Floats2d]:
 | 
				
			||||||
 | 
					    return chain(Linear(nO=nO, nI=nI), Logistic())
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The first layer that **creates the instance tensor** can be defined by
 | 
				
			||||||
 | 
					implementing a
 | 
				
			||||||
 | 
					[custom forward function](https://thinc.ai/docs/usage-models#weights-layers-forward)
 | 
				
			||||||
 | 
					with an appropriate backpropagation callback. We also define an
 | 
				
			||||||
 | 
					[initialization method](https://thinc.ai/docs/usage-models#weights-layers-init)
 | 
				
			||||||
 | 
					that ensures that the layer is properly set up for training.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					We omit some of the implementation details here, and refer to the
 | 
				
			||||||
 | 
					[spaCy project](https://github.com/explosion/projects/tree/v3/tutorials/rel_component)
 | 
				
			||||||
 | 
					that has the full implementation.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					> #### config.cfg (excerpt)
 | 
				
			||||||
 | 
					>
 | 
				
			||||||
 | 
					> ```ini
 | 
				
			||||||
 | 
					> [model.create_instance_tensor]
 | 
				
			||||||
 | 
					> @architectures = "rel_instance_tensor.v1"
 | 
				
			||||||
 | 
					>
 | 
				
			||||||
 | 
					> [model.create_instance_tensor.tok2vec]
 | 
				
			||||||
 | 
					> @architectures = "spacy.HashEmbedCNN.v1"
 | 
				
			||||||
 | 
					> # ...
 | 
				
			||||||
 | 
					>
 | 
				
			||||||
 | 
					> [model.create_instance_tensor.pooling]
 | 
				
			||||||
 | 
					> @layers = "reduce_mean.v1"
 | 
				
			||||||
 | 
					>
 | 
				
			||||||
 | 
					> [model.create_instance_tensor.get_instances]
 | 
				
			||||||
 | 
					> # ...
 | 
				
			||||||
 | 
					> ```

```python
### The layer that creates the instance tensor
@spacy.registry.architectures.register("rel_instance_tensor.v1")
def create_tensors(
    tok2vec: Model[List[Doc], List[Floats2d]],
    pooling: Model[Ragged, Floats2d],
    get_instances: Callable[[Doc], List[Tuple[Span, Span]]],
) -> Model[List[Doc], Floats2d]:

    return Model(
        "instance_tensors",
        instance_forward,
        init=instance_init,
        layers=[tok2vec, pooling],
        refs={"tok2vec": tok2vec, "pooling": pooling},
        attrs={"get_instances": get_instances},
    )


# The custom forward function
def instance_forward(
    model: Model[List[Doc], Floats2d],
    docs: List[Doc],
    is_train: bool,
) -> Tuple[Floats2d, Callable]:
    tok2vec = model.get_ref("tok2vec")
    tokvecs, bp_tokvecs = tok2vec(docs, is_train)
    get_instances = model.attrs["get_instances"]
    all_instances = [get_instances(doc) for doc in docs]
    pooling = model.get_ref("pooling")
    relations = ...

    def backprop(d_relations: Floats2d) -> List[Doc]:
        d_tokvecs = ...
        return bp_tokvecs(d_tokvecs)

    return relations, backprop


# The custom initialization method
def instance_init(
    model: Model,
    X: List[Doc] = None,
    Y: Floats2d = None,
) -> Model:
    tok2vec = model.get_ref("tok2vec")
    tok2vec.initialize(X)
    return model
```

This custom layer uses an [embedding layer](/usage/embeddings-transformers) such
as a [`Tok2Vec`](/api/tok2vec) component or a [`Transformer`](/api/transformer).
This layer is assumed to be of type ~~Model[List[Doc], List[Floats2d]]~~ as it
transforms each **document into a list of tokens**, with each token being
represented by its embedding in the vector space.

The `pooling` layer will be applied to summarize the token vectors into **entity
vectors**, as named entities (represented by ~~Span~~ objects) can consist of
one or multiple tokens. For instance, the pooling layer could compute the
average of all token vectors in an entity. Thinc provides several
[built-in pooling operators](https://thinc.ai/docs/api-layers#reduction-ops) for
this purpose.
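
To make the pooling step concrete, the sketch below (with made-up shapes,
independent of the relation model) shows how `reduce_mean.v1` turns a ragged
batch of token vectors into one fixed-size vector per entity:

```python
import numpy
from thinc.api import reduce_mean
from thinc.types import Ragged

# Two entities: the first spans 2 tokens, the second 3, with 4-dimensional
# token vectors. Ragged stores the flat (5, 4) array plus the span lengths.
tokvecs = numpy.random.rand(5, 4).astype("float32")
entities = Ragged(tokvecs, lengths=numpy.asarray([2, 3], dtype="int32"))

pooling = reduce_mean()
entity_vectors = pooling.predict(entities)  # shape (2, 4): one row per entity
```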

Finally, we need a `get_instances` method that **generates pairs of entities**
that we want to classify as being related or not. As these candidate pairs are
typically formed within one document, this function takes a [`Doc`](/api/doc) as
input and outputs a `List` of `Span` tuples. For instance, the following
implementation takes any two entities from the same document, as long as they
are within a **maximum distance** (in number of tokens) of each other:

> #### config.cfg (excerpt)
>
> ```ini
> [model.create_instance_tensor.get_instances]
> @misc = "rel_instance_generator.v1"
> max_length = 100
> ```

```python
### Candidate generation
@spacy.registry.misc.register("rel_instance_generator.v1")
def create_instances(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
    def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
        candidates = []
        for ent1 in doc.ents:
            for ent2 in doc.ents:
                # keep pairs of distinct entities within max_length tokens
                if ent1 != ent2 and abs(ent2.start - ent1.start) <= max_length:
                    candidates.append((ent1, ent2))
        return candidates
    return get_candidates
```

This function is added to the [`@misc` registry](/api/top-level#registry) so we
can refer to it from the config, and easily swap it out for any other candidate
generation function.
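
As a quick check, the registered function can be resolved by name, which mirrors
what the config machinery does under the hood (a sketch; with a blank pipeline,
`doc.ents` is empty, so no candidates are produced):

```python
import spacy

# Resolve the factory from the registry and build the candidate generator
factory = spacy.registry.misc.get("rel_instance_generator.v1")
get_instances = factory(max_length=100)

nlp = spacy.blank("en")
doc = nlp("Amsterdam is the capital of the Netherlands.")
print(get_instances(doc))  # [] until an NER component sets doc.ents
```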

#### Intermezzo: define how to store the relations data {#component-rel-attribute}

> #### Example output
>
> ```python
> doc = nlp("Amsterdam is the capital of the Netherlands.")
> print("spans", [(e.start, e.text, e.label_) for e in doc.ents])
> for value, rel_dict in doc._.rel.items():
>     print(f"{value}: {rel_dict}")
>
> # spans [(0, 'Amsterdam', 'LOC'), (6, 'Netherlands', 'LOC')]
> # (0, 6): {'CAPITAL_OF': 0.89, 'LOCATED_IN': 0.75, 'UNRELATED': 0.002}
> # (6, 0): {'CAPITAL_OF': 0.01, 'LOCATED_IN': 0.13, 'UNRELATED': 0.017}
> ```

For our new relation extraction component, we will use a custom
[extension attribute](/usage/processing-pipelines#custom-components-attributes)
`doc._.rel` in which we store relation data. The attribute refers to a
dictionary, keyed by the **start offsets of each entity** involved in the
candidate relation. The values in the dictionary refer to another dictionary
where relation labels are mapped to values between 0 and 1. We assume anything
above 0.5 to be a `True` relation. The ~~Example~~ instances that we'll use as
training data will include their gold-standard relation annotations in
`example.reference._.rel`.
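
A sketch of consuming these predictions, reusing the `doc` from the example
output above and the 0.5 threshold:

```python
threshold = 0.5
for (start1, start2), label_scores in doc._.rel.items():
    for label, score in label_scores.items():
        if score >= threshold:
            print(f"tokens {start1} -> {start2}: {label} ({score:.2f})")
# tokens 0 -> 6: CAPITAL_OF (0.89)
# tokens 0 -> 6: LOCATED_IN (0.75)
```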

```python
### Registering the extension attribute
from spacy.tokens import Doc

Doc.set_extension("rel", default={})
```

#### Step 2: Implementing the pipeline component {#component-rel-pipe}

```python
### Pipeline component skeleton
from spacy.pipeline import TrainablePipe

class RelationExtractor(TrainablePipe):
    ...
```

Typically, the **constructor** defines the vocab, the machine learning model,
and the name of this component. Additionally, this component, just like the
`textcat` and the `tagger`, stores an **internal list of labels**. The ML model
will predict scores for each label. We add convenience methods to easily
retrieve and add to them.

```python
### The constructor (continued)
    def __init__(self, vocab, model, name="rel"):
        """Create a component instance."""
        # ...
        self.cfg = {"labels": []}

    @property
    def labels(self) -> Tuple[str]:
        """Returns the labels currently added to the component."""
        return tuple(self.cfg["labels"])

    def add_label(self, label: str):
        """Add a new label to the pipe."""
        self.cfg["labels"] = list(self.labels) + [label]
```
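
For example, assuming `component` is an instance of this class:

```python
component.add_label("CAPITAL_OF")
component.add_label("LOCATED_IN")
print(component.labels)  # ('CAPITAL_OF', 'LOCATED_IN')
```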

After creation, the component needs to be
[initialized](/usage/training#initialization). This method can define the
relevant labels in two ways: explicitly, by setting the `labels` argument in the
[`initialize` block](/api/data-formats#config-initialize) of the config, or
implicitly, by deducing them from the `get_examples` callback that generates the
full **training data set**, or a representative sample.
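
For instance, a hypothetical `initialize` block for a component named
`relation_extractor`, using the labels from the example output above, could look
like this:

```ini
[initialize.components.relation_extractor]
labels = ["CAPITAL_OF", "LOCATED_IN", "UNRELATED"]
```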

The final number of labels defines the output dimensionality of the network, and
will be used to do
[shape inference](https://thinc.ai/docs/usage-models#validation) throughout the
layers of the neural network. This is triggered by calling
[`Model.initialize`](https://thinc.ai/api/model#initialize).
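
As a minimal illustration of shape inference in Thinc (made-up dimensions,
separate from the relation model itself):

```python
import numpy
from thinc.api import Linear

model = Linear()  # input and output widths left unspecified
X = numpy.zeros((4, 16), dtype="float32")  # 4 samples, 16 features
Y = numpy.zeros((4, 3), dtype="float32")   # e.g. 3 relation labels
model.initialize(X=X, Y=Y)                 # missing dimensions are inferred
print(model.get_dim("nI"), model.get_dim("nO"))  # 16 3
```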

```python
### The initialize method {highlight="12,15,18,22"}
from itertools import islice

def initialize(
    self,
    get_examples: Callable[[], Iterable[Example]],
    *,
    nlp: Language = None,
    labels: Optional[List[str]] = None,
):
    # ... add the labels, either those passed in explicitly or those deduced
    # from the examples, then call self.model.initialize with a data sample
```

Typically, this happens when the pipeline is set up before training in
[`spacy train`](/api/cli#training). After initialization, the pipeline component
and its internal model can be trained and used to make predictions.

During training, the method [`update`](/api/pipe#update) is invoked, which
delegates to
[`Model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a
[`get_loss`](/api/pipe#get_loss) function that **calculates the loss** for a
batch of examples:

```python
### The update method
def update(
    self,
    examples: Iterable[Example],
    *,
    # ...
    sgd: Optional[Optimizer] = None,
    losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
    # ...
    docs = [eg.predicted for eg in examples]
    predictions, backprop = self.model.begin_update(docs)
    loss, gradient = self.get_loss(examples, predictions)
    backprop(gradient)
    losses[self.name] += loss
    # ...
    return losses
```
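
The `get_loss` method itself is elided in this excerpt. A minimal sketch,
assuming a squared-error loss and a hypothetical `_examples_to_truth` helper
that assembles the gold-standard scores from `example.reference._.rel`:

```python
def get_loss(self, examples, scores):
    # _examples_to_truth is a hypothetical helper returning a Floats2d of
    # gold-standard scores aligned with the rows of `scores`
    truths = self._examples_to_truth(examples)
    gradient = scores - truths
    loss = float((gradient ** 2).sum())
    return loss, gradient
```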

After training the model, the component can be used to make novel
**predictions**. The [`predict`](/api/pipe#predict) method needs to be
implemented for each subclass of `TrainablePipe`. In our case, we can simply
delegate to the internal model's
[predict](https://thinc.ai/docs/api-model#predict) function that takes a batch
of `Doc` objects.
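
The body of `predict` is elided in this excerpt; its signature comes from the
excerpt, and given the delegation described above, a minimal sketch could be:

```python
### The predict method
def predict(self, docs: Iterable[Doc]) -> Floats2d:
    # delegate directly to the internal Thinc model
    return self.model.predict(docs)
```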

The final method that needs to be implemented is
[`set_annotations`](/api/pipe#set_annotations). This function takes the
predictions, and modifies the given `Doc` object in place to store them. For our
relation extraction component, we store the data in the
[custom attribute](#component-rel-attribute) `doc._.rel`.

To interpret the scores predicted by the relation extraction model correctly, we
need to refer to the model's `get_instances` function that defined which pairs
of entities were relevant candidates, so that the predictions can be linked to
those exact entities:

```python
### The set_annotations method {highlight="5-6,10"}
def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d):
    c = 0
    get_instances = self.model.attrs["get_instances"]
    for doc in docs:
        for (e1, e2) in get_instances(doc):
            offset = (e1.start, e2.start)
            if offset not in doc._.rel:
                doc._.rel[offset] = {}
            for j, label in enumerate(self.labels):
                doc._.rel[offset][label] = predictions[c, j]
            c += 1
```

Under the hood, when the pipe is applied to a document, it delegates to the
`predict` and `set_annotations` methods:

```python
### The __call__ method
def __call__(self, doc: Doc):
    predictions = self.predict([doc])
    self.set_annotations([doc], predictions)
    return doc
```

There is one more optional method to implement: [`score`](/api/pipe#score)
calculates the performance of your component on a set of examples, and returns
the results as a dictionary:

```python
### The score method
def score(self, examples: Iterable[Example]) -> Dict[str, Any]:
    # ... compute micro-averaged PRF scores over the predicted relations
    return {
        # ... e.g. "rel_micro_p", "rel_micro_r" and "rel_micro_f"
    }
```

This is particularly useful for calculating relevant scores on the development
corpus when training the component with [`spacy train`](/api/cli#training).

Once our `TrainablePipe` subclass is fully implemented, we can
[register](/usage/processing-pipelines#custom-components-factories) the
component with the `@Language.factory` decorator. This
assigns it a name and lets you create the component with
[`nlp.add_pipe`](/api/language#add_pipe) or via the config:

> #### config.cfg (excerpt)
>
> ```ini
> [components.relation_extractor.model]
> @architectures = "rel_model.v1"
> # ...
>
> [training.score_weights]
> rel_micro_p = 0.0
> rel_micro_r = 0.0
> rel_micro_f = 1.0
> ```

```python
### Registering the component factory
from spacy.language import Language

@Language.factory("relation_extractor")
def make_relation_extractor(nlp, name, model):
    return RelationExtractor(nlp.vocab, model, name)
```

<Project id="tutorials/rel_component">

Run this example use case by using our project template. It includes all the
code to create the ML model and the pipeline component from scratch. It
contains two config files to train the model: one to run on CPU with a Tok2Vec
layer, and one for the GPU using a transformer. The project applies the
relation extraction component to identify biomolecular interactions, but you
can easily swap in your own dataset for your experiments in any other domain.

</Project>

The [`Language.update`](/api/language#update) and related training methods now
take batches of [`Example`](/api/example) objects instead of
raw text and a dictionary of annotations.

```python
### Training loop {highlight="5-8,12"}
TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London.", {"entities": [(7, 13, "LOC")]}),
]
examples = []
for text, annots in TRAIN_DATA:
    examples.append(Example.from_dict(nlp.make_doc(text), annots))
nlp.initialize(lambda: examples)
for i in range(20):
    random.shuffle(examples)
    for batch in minibatch(examples, size=8):
        nlp.update(batch)
```
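
For this loop to run, some setup is assumed; a sketch of the surrounding
imports and pipeline creation (an `ner` component is one plausible choice,
given the entity annotations above):

```python
import random

import spacy
from spacy.training import Example
from spacy.util import minibatch

nlp = spacy.blank("en")
nlp.add_pipe("ner")  # labels are deduced from the examples by nlp.initialize
```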

The [`nlp.initialize`](/api/language#initialize) method replaces
`nlp.begin_training` and can take a callback providing the example data. This is
used to initialize the weights of the network, setting up the label scheme:

```diff
- nlp.begin_training()
+ nlp.initialize(lambda: examples)
```