From 2c4b2ee5e9b29442c119e9c8bb2b5bce761a78aa Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Sat, 3 Oct 2020 23:27:05 +0200
Subject: [PATCH 1/7] REL intro and get_candidates function

---
 website/docs/usage/layers-architectures.md | 54 ++++++++++++++++++++++
 website/docs/usage/processing-pipelines.md |  2 +-
 2 files changed, 55 insertions(+), 1 deletion(-)
diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index b65c3d903..678f70667 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -486,6 +486,60 @@ with Model.define_operators({">>": chain}):
 
 ## Create new trainable components {#components}
 
+In addition to [swapping out](#swap-architectures) default models in built-in
+components, you can also implement an entirely new,
+[trainable pipeline component](usage/processing-pipelines#trainable-components)
+from scratch. This can be done by creating a new class inheriting from [`Pipe`](/api/pipe), 
+and linking it up to your custom model implementation.
+
+### Example: Pipeline component for relation extraction {#component-rel}
+
+This section will run through an example of implementing a novel relation extraction 
+component from scratch. As a first step, we need a method that will generate pairs of
+entities that we want to classify as being related or not. These candidate pairs are 
+typically formed within one document, which means we'll have a function that takes a 
+`Doc` as input and outputs a `List` of `Span` tuples. In this example, we will focus 
+on binary relation extraction, i.e. the tuple will be of length 2.
+
+We register this function in the 'misc' register so we can easily refer to it from the config, 
+and allow swapping it out for any candidate 
+generation function. For instance, a very straightforward implementation would be to just 
+take any two entities from the same document:
+
+```python
+@registry.misc.register("rel_cand_generator.v1")
+def create_candidate_indices() -> Callable[[Doc], List[Tuple[Span, Span]]]:
+    def get_candidate_indices(doc: "Doc"):
+        indices = []
+        for ent1 in doc.ents:
+            for ent2 in doc.ents:
+                indices.append((ent1, ent2))
+        return indices
+    return get_candidate_indices
+```
+
+But we could also refine this further by excluding relations of an entity with itself, 
+and posing a maximum distance (in number of tokens) between two entities:
+
+```python
+### {highlight="1,2,7,8"}
+@registry.misc.register("rel_cand_generator.v2")
+def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
+    def get_candidate_indices(doc: "Doc"):
+        indices = []
+        for ent1 in doc.ents:
+            for ent2 in doc.ents:
+                if ent1 != ent2:
+                    if max_length and abs(ent2.start - ent1.start) <= max_length:
+                        indices.append((ent1, ent2))
+        return indices
+    return get_candidate_indices
+```
+
+
+
+
+
 <Infobox title="This section is still under construction" emoji="🚧" variant="warning">
 </Infobox>
 
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index c98bd08bc..3619993c5 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -1035,7 +1035,7 @@ plug fully custom machine learning components into your pipeline. You'll need
 the following:
 
 1. **Model:** A Thinc [`Model`](https://thinc.ai/docs/api-model) instance. This
-   can be a model using implemented in
+   can be a model implemented in
    [Thinc](/usage/layers-architectures#thinc), or a
    [wrapped model](/usage/layers-architectures#frameworks) implemented in
    PyTorch, TensorFlow, MXNet or a fully custom solution. The model must take a

From 08ad349a1851c3310a4ae7f34170eea37c9e2e3b Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Sun, 4 Oct 2020 00:08:02 +0200
Subject: [PATCH 2/7] tok2vec layer

---
 website/docs/usage/layers-architectures.md | 87 ++++++++++++++--------
 1 file changed, 58 insertions(+), 29 deletions(-)

diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index 678f70667..6f79cc6e8 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -489,51 +489,80 @@ with Model.define_operators({">>": chain}):
 In addition to [swapping out](#swap-architectures) default models in built-in
 components, you can also implement an entirely new,
 [trainable pipeline component](usage/processing-pipelines#trainable-components)
-from scratch. This can be done by creating a new class inheriting from [`Pipe`](/api/pipe), 
-and linking it up to your custom model implementation.
+from scratch. This can be done by creating a new class inheriting from
+[`Pipe`](/api/pipe), and linking it up to your custom model implementation.
 
 ### Example: Pipeline component for relation extraction {#component-rel}
 
-This section will run through an example of implementing a novel relation extraction 
-component from scratch. As a first step, we need a method that will generate pairs of
-entities that we want to classify as being related or not. These candidate pairs are 
-typically formed within one document, which means we'll have a function that takes a 
-`Doc` as input and outputs a `List` of `Span` tuples. In this example, we will focus 
-on binary relation extraction, i.e. the tuple will be of length 2.
-
-We register this function in the 'misc' register so we can easily refer to it from the config, 
-and allow swapping it out for any candidate 
-generation function. For instance, a very straightforward implementation would be to just 
-take any two entities from the same document:
+This section will run through an example of implementing a novel relation
+extraction component from scratch. As a first step, we need a method that will
+generate pairs of entities that we want to classify as being related or not.
+These candidate pairs are typically formed within one document, which means
+we'll have a function that takes a `Doc` as input and outputs a `List` of `Span`
+tuples. In this example, we will focus on binary relation extraction, i.e. the
+tuple will be of length 2. For instance, a very straightforward implementation
+would be to just take any two entities from the same document:
 
 ```python
-@registry.misc.register("rel_cand_generator.v1")
-def create_candidate_indices() -> Callable[[Doc], List[Tuple[Span, Span]]]:
-    def get_candidate_indices(doc: "Doc"):
-        indices = []
-        for ent1 in doc.ents:
-            for ent2 in doc.ents:
-                indices.append((ent1, ent2))
-        return indices
-    return get_candidate_indices
+def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
+    candidates = []
+    for ent1 in doc.ents:
+        for ent2 in doc.ents:
+            candidates.append((ent1, ent2))
+    return candidates
 ```
 
-But we could also refine this further by excluding relations of an entity with itself, 
-and posing a maximum distance (in number of tokens) between two entities:
+But we could also refine this further by excluding relations of an entity with
+itself, and posing a maximum distance (in number of tokens) between two
+entities. We'll also register this function in the
+[`@misc` registry](/api/top-level#registry) so we can refer to it from the
+config, and easily swap it out for any other candidate generation function.
+
+> ```
+> [get_candidates]
+> @misc = "rel_cand_generator.v2"
+> max_length = 6
+> ```
 
 ```python
 ### {highlight="1,2,7,8"}
 @registry.misc.register("rel_cand_generator.v2")
 def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
-    def get_candidate_indices(doc: "Doc"):
-        indices = []
+    def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
+        candidates = []
         for ent1 in doc.ents:
             for ent2 in doc.ents:
                 if ent1 != ent2:
                     if max_length and abs(ent2.start - ent1.start) <= max_length:
-                        indices.append((ent1, ent2))
-        return indices
-    return get_candidate_indices
+                        candidates.append((ent1, ent2))
+        return candidates
+    return get_candidates
+```
+
+> ```
+> [tok2vec]
+> @architectures = "spacy.HashEmbedCNN.v1"
+> pretrained_vectors = null
+> width = 96
+> depth = 2
+> embed_size = 300
+> window_size = 1
+> maxout_pieces = 3
+> subword_features = true
+> ```
+
+Next, we'll assume we have access to an
+[embedding layer](/usage/embeddings-transformers) such as a
+[`Tok2Vec`](/api/tok2vec) component or [`Transformer`](/api/transformer). This
+layer is assumed to be of type `Model[List["Doc"], List[Floats2d]]` as it
+transforms a list of documents into a list of 2D vectors. Further, this
+`tok2vec` component will be trainable, which means that, following the Thinc
+paradigm, we'll apply it to some input, and receive the predicted results as
+well as a callback to perform backpropagation:
+
+```python
+tok2vec = model.get_ref("tok2vec")
+tokvecs, bp_tokvecs = tok2vec(docs, is_train=True)
 ```
 
 

From 452b8309f9e34530e5f592699a3601400f40ffb0 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Sun, 4 Oct 2020 13:26:46 +0200
Subject: [PATCH 3/7] slight rewrite to hide some thinc implementation details

---
 website/docs/usage/layers-architectures.md | 98 ++++++++++++++--------
 1 file changed, 63 insertions(+), 35 deletions(-)

diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index 6f79cc6e8..25f9a568c 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -373,7 +373,7 @@ gpu_allocator = "pytorch"
 Of course it's also possible to define the `Model` from the previous section
 entirely in Thinc. The Thinc documentation provides details on the
 [various layers](https://thinc.ai/docs/api-layers) and helper functions
-available. Combinators can also be used to
+available. Combinators can be used to
 [overload operators](https://thinc.ai/docs/usage-models#operators) and a common
 usage pattern is to bind `chain` to `>>`. The "native" Thinc version of our
 simple neural network would then become:
@@ -494,13 +494,34 @@ from scratch. This can be done by creating a new class inheriting from
 
 ### Example: Pipeline component for relation extraction {#component-rel}
 
-This section will run through an example of implementing a novel relation
-extraction component from scratch. As a first step, we need a method that will
+This section outlines an example use-case of implementing a novel relation
+extraction component from scratch. We assume we want to implement a binary 
+relation extraction method that determines whether two entities in a document 
+are related or not, and if so, with what type of relation. We'll allow multiple 
+types of relations between two such entities - i.e. it is a multi-label setting.
+
+We'll need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes 
+a list of documents as input, and outputs a two-dimensional matrix of scores:
+
+```python
+@registry.architectures.register("rel_model.v1")
+def create_relation_model(...) -> Model[List[Doc], Floats2d]:
+    model = _create_my_model()
+    return model
+```
+
+The first layer in this model will typically be an
+[embedding layer](/usage/embeddings-transformers) such as a
+[`Tok2Vec`](/api/tok2vec) component or [`Transformer`](/api/transformer). This
+layer is assumed to be of type `Model[List["Doc"], List[Floats2d]]` as it
+transforms each document into a list of tokens, with each token being 
+represented by its embedding in the vector space.
+
+Next, we need a method that will
 generate pairs of entities that we want to classify as being related or not.
 These candidate pairs are typically formed within one document, which means
 we'll have a function that takes a `Doc` as input and outputs a `List` of `Span`
-tuples. In this example, we will focus on binary relation extraction, i.e. the
-tuple will be of length 2. For instance, a very straightforward implementation
+tuples. For instance, a very straightforward implementation
 would be to just take any two entities from the same document:
 
 ```python
@@ -512,18 +533,24 @@ def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
     return candidates
 ```
 
-But we could also refine this further by excluding relations of an entity with
-itself, and posing a maximum distance (in number of tokens) between two
-entities. We'll also register this function in the
-[`@misc` registry](/api/top-level#registry) so we can refer to it from the
-config, and easily swap it out for any other candidate generation function.
-
 > ```
-> [get_candidates]
+> [model]
+> @architectures = "rel_model.v1"
+> 
+> [model.tok2vec]
+> ...
+> 
+> [model.get_candidates]
 > @misc = "rel_cand_generator.v2"
 > max_length = 6
 > ```
 
+But we could also refine this further by excluding relations of an entity with
+itself, and posing a maximum distance (in number of tokens) between two
+entities. We'll register this function in the
+[`@misc` registry](/api/top-level#registry) so we can refer to it from the
+config, and easily swap it out for any other candidate generation function.
+
 ```python
 ### {highlight="1,2,7,8"}
 @registry.misc.register("rel_cand_generator.v2")
@@ -539,32 +566,33 @@ def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span
     return get_candidates
 ```
 
+Finally, we'll require a method that transforms the candidate pairs of entities into 
+a 2D tensor using the specified Tok2Vec function, and this `Floats2d` object will then be 
+processed by a final `output_layer` of the network. Taking all this together, we can define 
+our relation model like this in the config:
+
 > ```
-> [tok2vec]
-> @architectures = "spacy.HashEmbedCNN.v1"
-> pretrained_vectors = null
-> width = 96
-> depth = 2
-> embed_size = 300
-> window_size = 1
-> maxout_pieces = 3
-> subword_features = true
+> [model]
+> @architectures = "rel_model.v1"
+> nO = null
+> 
+> [model.tok2vec]
+> ...
+> 
+> [model.get_candidates]
+> @misc = "rel_cand_generator.v2"
+> max_length = 6
+> 
+> [components.relation_extractor.model.create_candidate_tensor]
+> @misc = "rel_cand_tensor.v1"
+> 
+> [components.relation_extractor.model.output_layer]
+> @architectures = "rel_output_layer.v1"
+> nI = null
+> nO = null
 > ```
 
-Next, we'll assume we have access to an
-[embedding layer](/usage/embeddings-transformers) such as a
-[`Tok2Vec`](/api/tok2vec) component or [`Transformer`](/api/transformer). This
-layer is assumed to be of type `Model[List["Doc"], List[Floats2d]]` as it
-transforms a list of documents into a list of 2D vectors. Further, this
-`tok2vec` component will be trainable, which means that, following the Thinc
-paradigm, we'll apply it to some input, and receive the predicted results as
-well as a callback to perform backpropagation:
-
-```python
-tok2vec = model.get_ref("tok2vec")
-tokvecs, bp_tokvecs = tok2vec(docs, is_train=True)
-```
-
+<!-- Link to project for implementation details -->
 
 
 

From 9f40d963fd92d2dc5de04af2bda45d79d440113e Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Sun, 4 Oct 2020 14:11:53 +0200
Subject: [PATCH 4/7] highlight the two steps: the model and the pipeline
 component

---
 website/docs/usage/layers-architectures.md | 126 ++++++++++++++-------
 1 file changed, 88 insertions(+), 38 deletions(-)

diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index 25f9a568c..c4b3fb9dc 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -495,12 +495,19 @@ from scratch. This can be done by creating a new class inheriting from
 ### Example: Pipeline component for relation extraction {#component-rel}
 
 This section outlines an example use-case of implementing a novel relation
-extraction component from scratch. We assume we want to implement a binary 
-relation extraction method that determines whether two entities in a document 
-are related or not, and if so, with what type of relation. We'll allow multiple 
+extraction component from scratch. We assume we want to implement a binary
+relation extraction method that determines whether two entities in a document
+are related or not, and if so, with what type of relation. We'll allow multiple
 types of relations between two such entities - i.e. it is a multi-label setting.
 
-We'll need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes 
+There are two major steps required: first, we need to
+[implement a machine learning model](#component-rel-model) specific to this
+task, and then we'll use this model to
+[implement a custom pipeline component](#component-rel-pipe).
+
+#### Step 1: Implementing the Model {#component-rel-model}
+
+We'll need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes
 a list of documents as input, and outputs a two-dimensional matrix of scores:
 
 ```python
@@ -514,15 +521,15 @@ The first layer in this model will typically be an
 [embedding layer](/usage/embeddings-transformers) such as a
 [`Tok2Vec`](/api/tok2vec) component or [`Transformer`](/api/transformer). This
 layer is assumed to be of type `Model[List["Doc"], List[Floats2d]]` as it
-transforms each document into a list of tokens, with each token being 
+transforms each document into a list of tokens, with each token being
 represented by its embedding in the vector space.
 
-Next, we need a method that will
-generate pairs of entities that we want to classify as being related or not.
-These candidate pairs are typically formed within one document, which means
-we'll have a function that takes a `Doc` as input and outputs a `List` of `Span`
-tuples. For instance, a very straightforward implementation
-would be to just take any two entities from the same document:
+Next, we need a method that will generate pairs of entities that we want to
+classify as being related or not. These candidate pairs are typically formed
+within one document, which means we'll have a function that takes a `Doc` as
+input and outputs a `List` of `Span` tuples. For instance, a very
+straightforward implementation would be to just take any two entities from the
+same document:
 
 ```python
 def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
@@ -536,10 +543,10 @@ def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
 > ```
 > [model]
 > @architectures = "rel_model.v1"
-> 
+>
 > [model.tok2vec]
 > ...
-> 
+>
 > [model.get_candidates]
 > @misc = "rel_cand_generator.v2"
 > max_length = 6
@@ -566,33 +573,76 @@ def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span
     return get_candidates
 ```
 
-Finally, we'll require a method that transforms the candidate pairs of entities into 
-a 2D tensor using the specified Tok2Vec function, and this `Floats2d` object will then be 
-processed by a final `output_layer` of the network. Taking all this together, we can define 
-our relation model like this in the config:
+Finally, we'll require a method that transforms the candidate pairs of entities
+into a 2D tensor using the specified Tok2Vec function, and this `Floats2d`
+object will then be processed by a final `output_layer` of the network. Taking
+all this together, we can define our relation model like this in the config:
 
-> ```
-> [model]
-> @architectures = "rel_model.v1"
-> nO = null
-> 
-> [model.tok2vec]
-> ...
-> 
-> [model.get_candidates]
-> @misc = "rel_cand_generator.v2"
-> max_length = 6
-> 
-> [components.relation_extractor.model.create_candidate_tensor]
-> @misc = "rel_cand_tensor.v1"
-> 
-> [components.relation_extractor.model.output_layer]
-> @architectures = "rel_output_layer.v1"
-> nI = null
-> nO = null
-> ```
+```
+[model]
+@architectures = "rel_model.v1"
+...
 
-<!-- Link to project for implementation details -->
+[model.tok2vec]
+...
+
+[model.get_candidates]
+@misc = "rel_cand_generator.v2"
+max_length = 6
+
+[model.create_candidate_tensor]
+@misc = "rel_cand_tensor.v1"
+
+[model.output_layer]
+@architectures = "rel_output_layer.v1"
+...
+```
+
+<!-- TODO: Link to project for implementation details -->
+
+When creating this model, we'll store the custom functions as
+[attributes](https://thinc.ai/docs/api-model#properties) and the sublayers as
+references, so we can access them easily:
+
+```python
+tok2vec_layer = model.get_ref("tok2vec")
+output_layer = model.get_ref("output_layer")
+create_candidate_tensor = model.attrs["create_candidate_tensor"]
+get_candidates = model.attrs["get_candidates"]
+```
+
+#### Step 2: Implementing the pipeline component {#component-rel-pipe}
+
+To use our new relation extraction model as part of a custom component, we 
+create a subclass of [`Pipe`](/api/pipe) that will hold the model:
+
+```python
+from spacy.pipeline import Pipe
+from spacy.language import Language
+
+class RelationExtractor(Pipe):
+     def __init__(self, vocab, model, name="rel", labels=[]):
+        ...
+
+    def predict(self, docs):
+        ...
+
+    def set_annotations(self, docs, scores):
+         ...
+
+@Language.factory("relation_extractor")
+def make_relation_extractor(nlp, name, model, labels):
+    return RelationExtractor(nlp.vocab, model, name, labels=labels)
+```
+
+The [`predict`](/api/pipe#predict ) function needs to be implemented for each subclass. 
+In our case, we can simply delegate to the internal model's 
+[predict](https://thinc.ai/docs/api-model#predict) function:
+```python
+def predict(self, docs: Iterable[Doc]) -> Floats2d:
+    scores = self.model.predict(docs)
+    return self.model.ops.asarray(scores)
+```
 
 
 

From b0463fbf75a83127352d52d6ac295bb73d16a6d0 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Sun, 4 Oct 2020 14:56:48 +0200
Subject: [PATCH 5/7] set_annotations explanation

---
 website/docs/usage/layers-architectures.md | 48 ++++++++++++++++++++--
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index c4b3fb9dc..7e563cb5c 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -613,7 +613,7 @@ get_candidates = model.attrs["get_candidates"]
 
 #### Step 2: Implementing the pipeline component {#component-rel-pipe}
 
-To use our new relation extraction model as part of a custom component, we 
+To use our new relation extraction model as part of a custom component, we
 create a subclass of [`Pipe`](/api/pipe) that will hold the model:
 
 ```python
@@ -635,15 +635,57 @@ def make_relation_extractor(nlp, name, model, labels):
     return RelationExtractor(nlp.vocab, model, name, labels=labels)
 ```
 
-The [`predict`](/api/pipe#predict ) function needs to be implemented for each subclass. 
-In our case, we can simply delegate to the internal model's 
+The [`predict`](/api/pipe#predict) function needs to be implemented for each
+subclass. In our case, we can simply delegate to the internal model's
 [predict](https://thinc.ai/docs/api-model#predict) function:
+
 ```python
 def predict(self, docs: Iterable[Doc]) -> Floats2d:
     scores = self.model.predict(docs)
     return self.model.ops.asarray(scores)
 ```
 
+The other method that needs to be implemented, is
+[`set_annotations`](/api/pipe#set_annotations). It takes the predicted scores,
+and modifies the given `Doc` object in place to hold the predictions. For our
+relation extraction component, we'll store the data as a dictionary in a custom
+extension attribute `doc._.rel`. As keys, we represent the candidate pair by the
+start offsets of each entity, as this defines an entity uniquely within one
+document.
+
+To interpret the scores predicted by the REL model correctly, we need to 
+refer to the model's `get_candidates` function that originally defined which 
+pairs of entities would be run through the model, so that the scores can be 
+related to those exact entities:
+
+> #### Example output
+>
+> ```python
+> doc = nlp("Amsterdam is the capital of the Netherlands.")
+> print(f"spans: {[(e.start, e.text, e.label_) for e in doc.ents]}")
+> for value, rel_dict in doc._.rel.items():
+>     print(f"{value}: {rel_dict}")
+> ```
+
+> ```
+> spans: [(0, 'Amsterdam', 'LOC'), (6, 'Netherlands', 'LOC')]
+> (0, 6): {'CAPITAL_OF': 0.89, 'LOCATED_IN': 0.75, 'UNRELATED': 0.002}
+> (6, 0): {'CAPITAL_OF': 0.01, 'LOCATED_IN': 0.13, 'UNRELATED': 0.017}
+> ```
+
+```python
+def set_annotations(self, docs: Iterable[Doc], rel_scores: Floats2d):
+    c = 0
+    get_candidates = self.model.attrs["get_candidates"]
+    for doc in docs:
+        for (e1, e2) in get_candidates(doc):
+            offset = (e1.start, e2.start)
+            if offset not in doc._.rel:
+                doc._.rel[offset] = {}
+            for j, label in enumerate(self.labels):
+                doc._.rel[offset][label] = rel_scores[c, j]
+            c += 1
+```
 
 
 

From 52b660e9dcc412fc1d4bbdf269c1bd31d9e7d3a4 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 5 Oct 2020 00:39:36 +0200
Subject: [PATCH 6/7] initialize and update explanation

---
 website/docs/api/pipe.md                   |   6 +
 website/docs/usage/layers-architectures.md | 149 ++++++++++++++++-----
 2 files changed, 119 insertions(+), 36 deletions(-)

diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md
index 4f5ac6f61..de35f9eb4 100644
--- a/website/docs/api/pipe.md
+++ b/website/docs/api/pipe.md
@@ -226,6 +226,12 @@ the "catastrophic forgetting" problem. This feature is experimental.
 Find the loss and gradient of loss for the batch of documents and their
 predicted scores.
 
+<Infobox variant="danger">
+
+This method needs to be overridden with your own custom `get_loss` method.
+
+</Infobox>
+
 > #### Example
 >
 > ```python
diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index 7e563cb5c..130a7144e 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -618,31 +618,97 @@ create a subclass of [`Pipe`](/api/pipe) that will hold the model:
 
 ```python
 from spacy.pipeline import Pipe
-from spacy.language import Language
 
 class RelationExtractor(Pipe):
      def __init__(self, vocab, model, name="rel", labels=[]):
+        self.model = model
         ...
 
     def predict(self, docs):
         ...
 
-    def set_annotations(self, docs, scores):
+    def set_annotations(self, docs, predictions):
          ...
-
-@Language.factory("relation_extractor")
-def make_relation_extractor(nlp, name, model, labels):
-    return RelationExtractor(nlp.vocab, model, name, labels=labels)
 ```
 
+Before the model can be used however, it needs to be 
+[initialized](/api/pipe#initialize). This function receives either the full 
+training data set, or a representative sample. The training data can be used 
+to deduce all relevant labels. Alternatively, a list of labels can be provided, 
+or a script can call `rel_component.add_label()` to add each label separately.
+
+The number of labels will define the output dimensionality of the network, 
+and will be used to do 
+[shape inference](https://thinc.ai/docs/usage-models#validation) throughout 
+the layers of the neural network. This is triggered by calling `model.initialize`.
+
+```python
+from itertools import islice
+
+def initialize(
+    self,
+    get_examples: Callable[[], Iterable[Example]],
+    *,
+    nlp: Language = None,
+    labels: Optional[List[str]] = None,
+):
+    if labels is not None:
+        for label in labels:
+            self.add_label(label)
+    else:
+        for example in get_examples():
+            relations = example.reference._.rel
+            for indices, label_dict in relations.items():
+                for label in label_dict.keys():
+                    self.add_label(label)
+    subbatch = list(islice(get_examples(), 10))
+    doc_sample = [eg.reference for eg in subbatch]
+    label_sample = self._examples_to_truth(subbatch)
+    self.model.initialize(X=doc_sample, Y=label_sample)
+```
+ 
+The `initialize` method will be triggered whenever this component is part of an 
+`nlp` pipeline, and `nlp.initialize()` is invoked. After doing so, the pipeline 
+component and its internal model can be trained and used to make predictions.
+
+During training, the function [`update`](/api/pipe#update) is invoked, which delegates to 
+[`self.model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and 
+needs a function [`get_loss`](/api/pipe#get_loss) that will calculate the 
+loss for a batch of examples, as well as the gradient of loss that will be used to update 
+the weights of the model layers.
+
+```python
+def update(
+    self,
+    examples: Iterable[Example],
+    *,
+    drop: float = 0.0,
+    set_annotations: bool = False,
+    sgd: Optional[Optimizer] = None,
+    losses: Optional[Dict[str, float]] = None,
+) -> Dict[str, float]:
+    ...
+    docs = [ex.predicted for ex in examples]
+    predictions, backprop = self.model.begin_update(docs)
+    loss, gradient = self.get_loss(examples, predictions)
+    backprop(gradient)
+    losses[self.name] += loss
+    ...
+    return losses
+```
+
+Thinc provides some [loss functions](https://thinc.ai/docs/api-loss) that can be used 
+for the implementation of the `get_loss` function.
+
+When the internal model is trained, the component can be used to make novel predictions. 
 The [`predict`](/api/pipe#predict) function needs to be implemented for each
-subclass. In our case, we can simply delegate to the internal model's
+subclass of `Pipe`. In our case, we can simply delegate to the internal model's
 [predict](https://thinc.ai/docs/api-model#predict) function:
 
 ```python
 def predict(self, docs: Iterable[Doc]) -> Floats2d:
-    scores = self.model.predict(docs)
-    return self.model.ops.asarray(scores)
+    predictions = self.model.predict(docs)
+    return self.model.ops.asarray(predictions)
 ```
 
 The other method that needs to be implemented, is
@@ -650,7 +716,7 @@ The other method that needs to be implemented, is
 and modifies the given `Doc` object in place to hold the predictions. For our
 relation extraction component, we'll store the data as a dictionary in a custom
 extension attribute `doc._.rel`. As keys, we represent the candidate pair by the
-start offsets of each entity, as this defines an entity uniquely within one
+start offsets of each entity, as this defines an entity pair uniquely within one
 document.
 
 To interpret the scores predicted by the REL model correctly, we need to 
@@ -674,7 +740,7 @@ related to those exact entities:
 > ```
 
 ```python
-def set_annotations(self, docs: Iterable[Doc], rel_scores: Floats2d):
+def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d):
     c = 0
     get_candidates = self.model.attrs["get_candidates"]
     for doc in docs:
@@ -683,34 +749,45 @@ def set_annotations(self, docs: Iterable[Doc], rel_scores: Floats2d):
             if offset not in doc._.rel:
                 doc._.rel[offset] = {}
             for j, label in enumerate(self.labels):
-                doc._.rel[offset][label] = rel_scores[c, j]
+                doc._.rel[offset][label] = predictions[c, j]
             c += 1
 ```
 
-
-
-<Infobox title="This section is still under construction" emoji="🚧" variant="warning">
-</Infobox>
-
-<!-- TODO: write trainable component section
-- Interaction with `predict`, `get_loss` and `set_annotations`
-- Initialization life-cycle with `initialize`, correlation with add_label
-Example: relation extraction component (implemented as project template)
-Avoid duplication with usage/processing-pipelines#trainable-components ?
--->
-
-<!-- ![Diagram of a pipeline component with its model](../images/layers-architectures.svg)
+Under the hood, when the pipe is applied to a document, it will delegate to these 
+two methods: 
 
 ```python
-def update(self, examples):
-    docs = [ex.predicted for ex in examples]
-    refs = [ex.reference for ex in examples]
-    predictions, backprop = self.model.begin_update(docs)
-    gradient = self.get_loss(predictions, refs)
-    backprop(gradient)
-
-def __call__(self, doc):
-    predictions = self.model([doc])
-    self.set_annotations(predictions)
+def __call__(self, Doc doc):
+    predictions = self.predict([doc])
+    self.set_annotations([doc], predictions)
+    return doc
 ```
--->
+
+Once our `Pipe` subclass is fully implemented, we can 
+[register](http://localhost:8000/usage/processing-pipelines#custom-components-factories) 
+the component with the 
+`Language.factory` decorator. This will enable the creation of the component with 
+`nlp.add_pipe`, or via the config.
+
+> ```
+> 
+> [components.relation_extractor]
+> factory = "relation_extractor"
+> labels = []
+> 
+> [components.relation_extractor.model]
+> @architectures = "rel_model.v1"
+> ...
+> ```
+
+```python
+from spacy.language import Language
+
+@Language.factory("relation_extractor")
+def make_relation_extractor(nlp, name, model, labels):
+    return RelationExtractor(nlp.vocab, model, name, labels=labels)
+```
+
+<!-- TODO: refer once more to example project -->
+
+<!-- ![Diagram of a pipeline component with its model](../images/layers-architectures.svg) -->

From 9a6c9b133b796d4b766189740ef1fc88f6dbe3ee Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 5 Oct 2020 01:05:37 +0200
Subject: [PATCH 7/7] various small fixes

---
 website/docs/usage/layers-architectures.md | 142 +++++++++++----------
 1 file changed, 74 insertions(+), 68 deletions(-)

diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index 130a7144e..414562d6d 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -288,7 +288,7 @@ those parts of the network.
 
 To use our custom model including the PyTorch subnetwork, all we need to do is
 register the architecture using the
-[`architectures` registry](/api/top-level#registry). This will assign the
+[`architectures` registry](/api/top-level#registry). This assigns the
 architecture a name so spaCy knows how to find it, and allows passing in
 arguments like hyperparameters via the [config](/usage/training#config). The
 full example then becomes:
@@ -488,27 +488,27 @@ with Model.define_operators({">>": chain}):
 
 In addition to [swapping out](#swap-architectures) default models in built-in
 components, you can also implement an entirely new,
-[trainable pipeline component](usage/processing-pipelines#trainable-components)
+[trainable pipeline component](/usage/processing-pipelines#trainable-components)
 from scratch. This can be done by creating a new class inheriting from
 [`Pipe`](/api/pipe), and linking it up to your custom model implementation.
 
 ### Example: Pipeline component for relation extraction {#component-rel}
 
 This section outlines an example use-case of implementing a novel relation
-extraction component from scratch. We assume we want to implement a binary
-relation extraction method that determines whether two entities in a document
-are related or not, and if so, with what type of relation. We'll allow multiple
-types of relations between two such entities - i.e. it is a multi-label setting.
+extraction component from scratch. We'll implement a binary relation extraction
+method that determines whether or not two entities in a document are related,
+and if so, what type of relation. We'll allow multiple types of relations
+between two such entities (multi-label setting).
 
 There are two major steps required: first, we need to
 [implement a machine learning model](#component-rel-model) specific to this
-task, and then we'll use this model to
+task, and subsequently we use this model to
 [implement a custom pipeline component](#component-rel-pipe).
 
 #### Step 1: Implementing the Model {#component-rel-model}
 
-We'll need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes
-a list of documents as input, and outputs a two-dimensional matrix of scores:
+We need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes a
+list of documents as input, and outputs a two-dimensional matrix of predictions:
 
 ```python
 @registry.architectures.register("rel_model.v1")
@@ -519,17 +519,16 @@ def create_relation_model(...) -> Model[List[Doc], Floats2d]:
 
 The first layer in this model will typically be an
 [embedding layer](/usage/embeddings-transformers) such as a
-[`Tok2Vec`](/api/tok2vec) component or [`Transformer`](/api/transformer). This
-layer is assumed to be of type `Model[List["Doc"], List[Floats2d]]` as it
+[`Tok2Vec`](/api/tok2vec) component or a [`Transformer`](/api/transformer). This
+layer is assumed to be of type ~~Model[List[Doc], List[Floats2d]]~~ as it
 transforms each document into a list of tokens, with each token being
 represented by its embedding in the vector space.
 
-Next, we need a method that will generate pairs of entities that we want to
-classify as being related or not. These candidate pairs are typically formed
-within one document, which means we'll have a function that takes a `Doc` as
-input and outputs a `List` of `Span` tuples. For instance, a very
-straightforward implementation would be to just take any two entities from the
-same document:
+Next, we need a method that generates pairs of entities that we want to classify
+as being related or not. As these candidate pairs are typically formed within
+one document, this function takes a `Doc` as input and outputs a `List` of
+`Span` tuples. For instance, a very straightforward implementation would be to
+just take any two entities from the same document:
 
 ```python
 def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
@@ -549,12 +548,12 @@ def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
 >
 > [model.get_candidates]
 > @misc = "rel_cand_generator.v2"
-> max_length = 6
+> max_length = 20
 > ```
 
 But we could also refine this further by excluding relations of an entity with
 itself, and posing a maximum distance (in number of tokens) between two
-entities. We'll register this function in the
+entities. We register this function in the
 [`@misc` registry](/api/top-level#registry) so we can refer to it from the
 config, and easily swap it out for any other candidate generation function.
 
@@ -573,10 +572,10 @@ def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span
     return get_candidates
 ```
 
-Finally, we'll require a method that transforms the candidate pairs of entities
-into a 2D tensor using the specified Tok2Vec function, and this `Floats2d`
-object will then be processed by a final `output_layer` of the network. Taking
-all this together, we can define our relation model like this in the config:
+Finally, we require a method that transforms the candidate entity pairs into a
+2D tensor using the specified `Tok2Vec` function. The resulting `Floats2d`
+object will then be processed by a final `output_layer` of the network. Putting
+all this together, we can define our relation model in a config file as such:
 
 ```
 [model]
@@ -588,7 +587,7 @@ all this together, we can define our relation model like this in the config:
 
 [model.get_candidates]
 @misc = "rel_cand_generator.v2"
-max_length = 6
+max_length = 20
 
 [model.create_candidate_tensor]
 @misc = "rel_cand_tensor.v1"
@@ -600,7 +599,7 @@ max_length = 6
 
 <!-- TODO: Link to project for implementation details -->
 
-When creating this model, we'll store the custom functions as
+When creating this model, we store the custom functions as
 [attributes](https://thinc.ai/docs/api-model#properties) and the sublayers as
 references, so we can access them easily:
 
@@ -614,7 +613,7 @@ get_candidates = model.attrs["get_candidates"]
 #### Step 2: Implementing the pipeline component {#component-rel-pipe}
 
 To use our new relation extraction model as part of a custom component, we
-create a subclass of [`Pipe`](/api/pipe) that will hold the model:
+create a subclass of [`Pipe`](/api/pipe) that holds the model:
 
 ```python
 from spacy.pipeline import Pipe
@@ -624,6 +623,9 @@ class RelationExtractor(Pipe):
         self.model = model
         ...
 
+    def update(self, examples, ...):
+        ...
+
     def predict(self, docs):
         ...
 
@@ -631,18 +633,19 @@ class RelationExtractor(Pipe):
          ...
 ```
 
-Before the model can be used however, it needs to be 
-[initialized](/api/pipe#initialize). This function recieves either the full 
-training data set, or a representative sample. The training data can be used 
-to deduce all relevant labels. Alternatively, a list of labels can be provided, 
-or a script can call `rel_component.add_label()` to add each label separately.
+Before the model can be used, it needs to be
+[initialized](/api/pipe#initialize). This function receives either the full
+training data set, or a representative sample. This data set can be used to
+deduce all relevant labels. Alternatively, a list of labels can be provided, or
+a script can call `rel_component.add_label()` directly.
 
-The number of labels will define the output dimensionality of the network, 
-and will be used to do 
-[shape inference](https://thinc.ai/docs/usage-models#validation) throughout 
-the layers of the neural network. This is triggerd by calling `model.initialize`.
+The number of labels defines the output dimensionality of the network, and will
+be used to do [shape inference](https://thinc.ai/docs/usage-models#validation)
+throughout the layers of the neural network. This is triggered by calling
+`model.initialize`.
 
 ```python
+### {highlight="12,18,22"}
 from itertools import islice
 
 def initialize(
@@ -666,18 +669,21 @@ def initialize(
     label_sample = self._examples_to_truth(subbatch)
     self.model.initialize(X=doc_sample, Y=label_sample)
 ```
- 
-The `initialize` method will be triggered whenever this component is part of an 
-`nlp` pipeline, and `nlp.initialize()` is invoked. After doing so, the pipeline 
-component and its internal model can be trained and used to make predictions.
 
-During training the function [`update`](/api/pipe#update) is invoked which delegates to 
-[`self.model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and 
-needs a function [`get_loss`](/api/pipe#get_loss) that will calculate the 
-loss for a batch of examples, as well as the gradient of loss that will be used to update 
-the weights of the model layers.
+The `initialize` method is triggered whenever this component is part of an `nlp`
+pipeline, and [`nlp.initialize()`](/api/language#initialize) is invoked. After
+doing so, the pipeline component and its internal model can be trained and used
+to make predictions.
+
+During training, the function [`update`](/api/pipe#update) is invoked which
+delegates to
+[`self.model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a
+[`get_loss`](/api/pipe#get_loss) function that calculates the loss for a batch of
+examples, as well as the gradient of loss that will be used to update the
+weights of the model layers.
 
 ```python
+### {highlight="12-14"}
 def update(
     self,
     examples: Iterable[Example],
@@ -697,13 +703,13 @@ def update(
     return losses
 ```
 
-Thinc provides some [loss functions](https://thinc.ai/docs/api-loss) that can be used 
-for the implementation of the `get_loss` function.
+Thinc provides several [loss functions](https://thinc.ai/docs/api-loss) that can
+be used for the implementation of the `get_loss` function.
 
-When the internal model is trained, the component can be used to make novel predictions. 
-The [`predict`](/api/pipe#predict) function needs to be implemented for each
-subclass of `Pipe`. In our case, we can simply delegate to the internal model's
-[predict](https://thinc.ai/docs/api-model#predict) function:
+When the internal model is trained, the component can be used to make novel
+predictions. The [`predict`](/api/pipe#predict) function needs to be implemented
+for each subclass of `Pipe`. In our case, we can simply delegate to the internal
+model's [predict](https://thinc.ai/docs/api-model#predict) function:
 
 ```python
 def predict(self, docs: Iterable[Doc]) -> Floats2d:
@@ -711,24 +717,24 @@ def predict(self, docs: Iterable[Doc]) -> Floats2d:
     return self.model.ops.asarray(predictions)
 ```
 
-The other method that needs to be implemented, is
-[`set_annotations`](/api/pipe#set_annotations). It takes the predicted scores,
-and modifies the given `Doc` object in place to hold the predictions. For our
-relation extraction component, we'll store the data as a dictionary in a custom
+The final method that needs to be implemented is
+[`set_annotations`](/api/pipe#set_annotations). This function takes the
+predictions, and modifies the given `Doc` object in place to store them. For our
+relation extraction component, we store the data as a dictionary in a custom
 extension attribute `doc._.rel`. As keys, we represent the candidate pair by the
 start offsets of each entity, as this defines an entity pair uniquely within one
 document.
 
-To interpret the scores predicted by the REL model correctly, we need to 
-refer to the model's `get_candidates` function that originally defined which 
-pairs of entities would be run through the model, so that the scores can be 
-related to those exact entities:
+To interpret the scores predicted by the REL model correctly, we need to refer
+to the model's `get_candidates` function that defined which pairs of entities
+were relevant candidates, so that the predictions can be linked to those exact
+entities:
 
 > #### Example output
 >
 > ```python
 > doc = nlp("Amsterdam is the capital of the Netherlands.")
-> print(f"spans: {[(e.start, e.text, e.label_) for e in doc.ents]}")
+> print("spans:", [(e.start, e.text, e.label_) for e in doc.ents])
 > for value, rel_dict in doc._.rel.items():
 >     print(f"{value}: {rel_dict}")
 > ```
@@ -740,6 +746,7 @@ related to those exact entities:
 > ```
 
 ```python
+### {highlight="5-6,10"}
 def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d):
     c = 0
     get_candidates = self.model.attrs["get_candidates"]
@@ -753,8 +760,8 @@ def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d):
             c += 1
 ```
 
-Under the hood, when the pipe is applied to a document, it will delegate to these 
-two methods: 
+Under the hood, when the pipe is applied to a document, it delegates to the
+`predict` and `set_annotations` functions:
 
 ```python
 def __call__(self, Doc doc):
@@ -763,18 +770,17 @@ def __call__(self, Doc doc):
     return doc
 ```
 
-Once our `Pipe` subclass is fully implemented, we can 
-[register](http://localhost:8000/usage/processing-pipelines#custom-components-factories) 
-the component with the 
-`Language.factory` decorator. This will enable the creation of the component with 
-`nlp.add_pipe`, or via the config.
+Once our `Pipe` subclass is fully implemented, we can
+[register](/usage/processing-pipelines#custom-components-factories)
+the component with the `Language.factory` decorator. This enables the creation
+of the component with `nlp.add_pipe`, or via the config.
 
 > ```
-> 
+>
 > [components.relation_extractor]
 > factory = "relation_extractor"
 > labels = []
-> 
+>
 > [components.relation_extractor.model]
 > @architectures = "rel_model.v1"
 > ...