From 2c4b2ee5e9b29442c119e9c8bb2b5bce761a78aa Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Sat, 3 Oct 2020 23:27:05 +0200
Subject: [PATCH 01/12] REL intro and get_candidates function

---
 website/docs/usage/layers-architectures.md | 54 ++++++++++++++++++++++
 website/docs/usage/processing-pipelines.md |  2 +-
 2 files changed, 55 insertions(+), 1 deletion(-)
diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index b65c3d903..678f70667 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -486,6 +486,60 @@ with Model.define_operators({">>": chain}):
 
 ## Create new trainable components {#components}
 
+In addition to [swapping out](#swap-architectures) default models in built-in
+components, you can also implement an entirely new,
+[trainable pipeline component](usage/processing-pipelines#trainable-components)
+from scratch. This can be done by creating a new class inheriting from [`Pipe`](/api/pipe), 
+and linking it up to your custom model implementation.
+
+### Example: Pipeline component for relation extraction {#component-rel}
+
+This section will run through an example of implementing a novel relation extraction 
+component from scratch. As a first step, we need a method that will generate pairs of
+entities that we want to classify as being related or not. These candidate pairs are 
+typically formed within one document, which means we'll have a function that takes a 
+`Doc` as input and outputs a `List` of `Span` tuples. In this example, we will focus 
+on binary relation extraction, i.e. the tuple will be of length 2.
+
+We register this function in the 'misc' register so we can easily refer to it from the config, 
+and allow swapping it out for any candidate 
+generation function. For instance, a very straightforward implementation would be to just 
+take any two entities from the same document:
+
+```python
+@registry.misc.register("rel_cand_generator.v1")
+def create_candidate_indices() -> Callable[[Doc], List[Tuple[Span, Span]]]:
+    def get_candidate_indices(doc: "Doc"):
+        indices = []
+        for ent1 in doc.ents:
+            for ent2 in doc.ents:
+                indices.append((ent1, ent2))
+        return indices
+    return get_candidate_indices
+```
+
+But we could also refine this further by excluding relations of an entity with itself, 
+and posing a maximum distance (in number of tokens) between two entities:
+
+```python
+### {highlight="1,2,7,8"}
+@registry.misc.register("rel_cand_generator.v2")
+def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
+    def get_candidate_indices(doc: "Doc"):
+        indices = []
+        for ent1 in doc.ents:
+            for ent2 in doc.ents:
+                if ent1 != ent2:
+                    if max_length and abs(ent2.start - ent1.start) <= max_length:
+                        indices.append((ent1, ent2))
+        return indices
+    return get_candidate_indices
+```
+
+
+
+
+
 <Infobox title="This section is still under construction" emoji="🚧" variant="warning">
 </Infobox>
 
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index c98bd08bc..3619993c5 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -1035,7 +1035,7 @@ plug fully custom machine learning components into your pipeline. You'll need
 the following:
 
 1. **Model:** A Thinc [`Model`](https://thinc.ai/docs/api-model) instance. This
-   can be a model using implemented in
+   can be a model implemented in
    [Thinc](/usage/layers-architectures#thinc), or a
    [wrapped model](/usage/layers-architectures#frameworks) implemented in
    PyTorch, TensorFlow, MXNet or a fully custom solution. The model must take a

From 08ad349a1851c3310a4ae7f34170eea37c9e2e3b Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Sun, 4 Oct 2020 00:08:02 +0200
Subject: [PATCH 02/12] tok2vec layer

---
 website/docs/usage/layers-architectures.md | 87 ++++++++++++++--------
 1 file changed, 58 insertions(+), 29 deletions(-)

diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index 678f70667..6f79cc6e8 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -489,51 +489,80 @@ with Model.define_operators({">>": chain}):
 In addition to [swapping out](#swap-architectures) default models in built-in
 components, you can also implement an entirely new,
 [trainable pipeline component](usage/processing-pipelines#trainable-components)
-from scratch. This can be done by creating a new class inheriting from [`Pipe`](/api/pipe), 
-and linking it up to your custom model implementation.
+from scratch. This can be done by creating a new class inheriting from
+[`Pipe`](/api/pipe), and linking it up to your custom model implementation.
 
 ### Example: Pipeline component for relation extraction {#component-rel}
 
-This section will run through an example of implementing a novel relation extraction 
-component from scratch. As a first step, we need a method that will generate pairs of
-entities that we want to classify as being related or not. These candidate pairs are 
-typically formed within one document, which means we'll have a function that takes a 
-`Doc` as input and outputs a `List` of `Span` tuples. In this example, we will focus 
-on binary relation extraction, i.e. the tuple will be of length 2.
-
-We register this function in the 'misc' register so we can easily refer to it from the config, 
-and allow swapping it out for any candidate 
-generation function. For instance, a very straightforward implementation would be to just 
-take any two entities from the same document:
+This section will run through an example of implementing a novel relation
+extraction component from scratch. As a first step, we need a method that will
+generate pairs of entities that we want to classify as being related or not.
+These candidate pairs are typically formed within one document, which means
+we'll have a function that takes a `Doc` as input and outputs a `List` of `Span`
+tuples. In this example, we will focus on binary relation extraction, i.e. the
+tuple will be of length 2. For instance, a very straightforward implementation
+would be to just take any two entities from the same document:
 
 ```python
-@registry.misc.register("rel_cand_generator.v1")
-def create_candidate_indices() -> Callable[[Doc], List[Tuple[Span, Span]]]:
-    def get_candidate_indices(doc: "Doc"):
-        indices = []
-        for ent1 in doc.ents:
-            for ent2 in doc.ents:
-                indices.append((ent1, ent2))
-        return indices
-    return get_candidate_indices
+def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
+    candidates = []
+    for ent1 in doc.ents:
+        for ent2 in doc.ents:
+            candidates.append((ent1, ent2))
+    return candidates
 ```
 
-But we could also refine this further by excluding relations of an entity with itself, 
-and posing a maximum distance (in number of tokens) between two entities:
+But we could also refine this further by excluding relations of an entity with
+itself, and posing a maximum distance (in number of tokens) between two
+entities. We'll also register this function in the
+[`@misc` registry](/api/top-level#registry) so we can refer to it from the
+config, and easily swap it out for any other candidate generation function.
+
+> ```
+> [get_candidates]
+> @misc = "rel_cand_generator.v2"
+> max_length = 6
+> ```
 
 ```python
 ### {highlight="1,2,7,8"}
 @registry.misc.register("rel_cand_generator.v2")
 def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
-    def get_candidate_indices(doc: "Doc"):
-        indices = []
+    def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
+        candidates = []
         for ent1 in doc.ents:
             for ent2 in doc.ents:
                 if ent1 != ent2:
                     if max_length and abs(ent2.start - ent1.start) <= max_length:
-                        indices.append((ent1, ent2))
-        return indices
-    return get_candidate_indices
+                        candidates.append((ent1, ent2))
+        return candidates
+    return get_candidates
+```
+
+> ```
+> [tok2vec]
+> @architectures = "spacy.HashEmbedCNN.v1"
+> pretrained_vectors = null
+> width = 96
+> depth = 2
+> embed_size = 300
+> window_size = 1
+> maxout_pieces = 3
+> subword_features = true
+> ```
+
+Next, we'll assume we have access to an
+[embedding layer](/usage/embeddings-transformers) such as a
+[`Tok2Vec`](/api/tok2vec) component or [`Transformer`](/api/transformer). This
+layer is assumed to be of type `Model[List["Doc"], List[Floats2d]]` as it
+transforms a list of documents into a list of 2D vectors. Further, this
+`tok2vec` component will be trainable, which means that, following the Thinc
+paradigm, we'll apply it to some input, and receive the predicted results as
+well as a callback to perform backpropagation:
+
+```python
+tok2vec = model.get_ref("tok2vec")
+tokvecs, bp_tokvecs = tok2vec(docs, is_train=True)
 ```
 
 

From 452b8309f9e34530e5f592699a3601400f40ffb0 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Sun, 4 Oct 2020 13:26:46 +0200
Subject: [PATCH 03/12] slight rewrite to hide some thinc implementation
 details

---
 website/docs/usage/layers-architectures.md | 98 ++++++++++++++--------
 1 file changed, 63 insertions(+), 35 deletions(-)

diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index 6f79cc6e8..25f9a568c 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -373,7 +373,7 @@ gpu_allocator = "pytorch"
 Of course it's also possible to define the `Model` from the previous section
 entirely in Thinc. The Thinc documentation provides details on the
 [various layers](https://thinc.ai/docs/api-layers) and helper functions
-available. Combinators can also be used to
+available. Combinators can be used to
 [overload operators](https://thinc.ai/docs/usage-models#operators) and a common
 usage pattern is to bind `chain` to `>>`. The "native" Thinc version of our
 simple neural network would then become:
@@ -494,13 +494,34 @@ from scratch. This can be done by creating a new class inheriting from
 
 ### Example: Pipeline component for relation extraction {#component-rel}
 
-This section will run through an example of implementing a novel relation
-extraction component from scratch. As a first step, we need a method that will
+This section outlines an example use-case of implementing a novel relation
+extraction component from scratch. We assume we want to implement a binary 
+relation extraction method that determines whether two entities in a document 
+are related or not, and if so, with what type of relation. We'll allow multiple 
+types of relations between two such entities - i.e. it is a multi-label setting.
+
+We'll need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes 
+a list of documents as input, and outputs a two-dimensional matrix of scores:
+
+```python
+@registry.architectures.register("rel_model.v1")
+def create_relation_model(...) -> Model[List[Doc], Floats2d]:
+    model = _create_my_model()
+    return model
+```
+
+The first layer in this model will typically be an
+[embedding layer](/usage/embeddings-transformers) such as a
+[`Tok2Vec`](/api/tok2vec) component or [`Transformer`](/api/transformer). This
+layer is assumed to be of type `Model[List["Doc"], List[Floats2d]]` as it
+transforms each document into a list of tokens, with each token being 
+represented by its embedding in the vector space.
+
+Next, we need a method that will
 generate pairs of entities that we want to classify as being related or not.
 These candidate pairs are typically formed within one document, which means
 we'll have a function that takes a `Doc` as input and outputs a `List` of `Span`
-tuples. In this example, we will focus on binary relation extraction, i.e. the
-tuple will be of length 2. For instance, a very straightforward implementation
+tuples. For instance, a very straightforward implementation
 would be to just take any two entities from the same document:
 
 ```python
@@ -512,18 +533,24 @@ def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
     return candidates
 ```
 
-But we could also refine this further by excluding relations of an entity with
-itself, and posing a maximum distance (in number of tokens) between two
-entities. We'll also register this function in the
-[`@misc` registry](/api/top-level#registry) so we can refer to it from the
-config, and easily swap it out for any other candidate generation function.
-
 > ```
-> [get_candidates]
+> [model]
+> @architectures = "rel_model.v1"
+> 
+> [model.tok2vec]
+> ...
+> 
+> [model.get_candidates]
 > @misc = "rel_cand_generator.v2"
 > max_length = 6
 > ```
 
+But we could also refine this further by excluding relations of an entity with
+itself, and posing a maximum distance (in number of tokens) between two
+entities. We'll register this function in the
+[`@misc` registry](/api/top-level#registry) so we can refer to it from the
+config, and easily swap it out for any other candidate generation function.
+
 ```python
 ### {highlight="1,2,7,8"}
 @registry.misc.register("rel_cand_generator.v2")
@@ -539,32 +566,33 @@ def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span
     return get_candidates
 ```
 
+Finally, we'll require a method that transforms the candidate pairs of entities into 
+a 2D tensor using the specified Tok2Vec function, and this `Floats2d` object will then be 
+processed by a final `output_layer` of the network. Taking all this together, we can define 
+our relation model like this in the config:
+
 > ```
-> [tok2vec]
-> @architectures = "spacy.HashEmbedCNN.v1"
-> pretrained_vectors = null
-> width = 96
-> depth = 2
-> embed_size = 300
-> window_size = 1
-> maxout_pieces = 3
-> subword_features = true
+> [model]
+> @architectures = "rel_model.v1"
+> nO = null
+> 
+> [model.tok2vec]
+> ...
+> 
+> [model.get_candidates]
+> @misc = "rel_cand_generator.v2"
+> max_length = 6
+> 
+> [components.relation_extractor.model.create_candidate_tensor]
+> @misc = "rel_cand_tensor.v1"
+> 
+> [components.relation_extractor.model.output_layer]
+> @architectures = "rel_output_layer.v1"
+> nI = null
+> nO = null
 > ```
 
-Next, we'll assume we have access to an
-[embedding layer](/usage/embeddings-transformers) such as a
-[`Tok2Vec`](/api/tok2vec) component or [`Transformer`](/api/transformer). This
-layer is assumed to be of type `Model[List["Doc"], List[Floats2d]]` as it
-transforms a list of documents into a list of 2D vectors. Further, this
-`tok2vec` component will be trainable, which means that, following the Thinc
-paradigm, we'll apply it to some input, and receive the predicted results as
-well as a callback to perform backpropagation:
-
-```python
-tok2vec = model.get_ref("tok2vec")
-tokvecs, bp_tokvecs = tok2vec(docs, is_train=True)
-```
-
+<!-- Link to project for implementation details -->
 
 
 

From 9f40d963fd92d2dc5de04af2bda45d79d440113e Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Sun, 4 Oct 2020 14:11:53 +0200
Subject: [PATCH 04/12] highlight the two steps: the model and the pipeline
 component

---
 website/docs/usage/layers-architectures.md | 126 ++++++++++++++-------
 1 file changed, 88 insertions(+), 38 deletions(-)

diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index 25f9a568c..c4b3fb9dc 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -495,12 +495,19 @@ from scratch. This can be done by creating a new class inheriting from
 ### Example: Pipeline component for relation extraction {#component-rel}
 
 This section outlines an example use-case of implementing a novel relation
-extraction component from scratch. We assume we want to implement a binary 
-relation extraction method that determines whether two entities in a document 
-are related or not, and if so, with what type of relation. We'll allow multiple 
+extraction component from scratch. We assume we want to implement a binary
+relation extraction method that determines whether two entities in a document
+are related or not, and if so, with what type of relation. We'll allow multiple
 types of relations between two such entities - i.e. it is a multi-label setting.
 
-We'll need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes 
+There are two major steps required: first, we need to
+[implement a machine learning model](#component-rel-model) specific to this
+task, and then we'll use this model to
+[implement a custom pipeline component](#component-rel-pipe).
+
+#### Step 1: Implementing the Model {#component-rel-model}
+
+We'll need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes
 a list of documents as input, and outputs a two-dimensional matrix of scores:
 
 ```python
@@ -514,15 +521,15 @@ The first layer in this model will typically be an
 [embedding layer](/usage/embeddings-transformers) such as a
 [`Tok2Vec`](/api/tok2vec) component or [`Transformer`](/api/transformer). This
 layer is assumed to be of type `Model[List["Doc"], List[Floats2d]]` as it
-transforms each document into a list of tokens, with each token being 
+transforms each document into a list of tokens, with each token being
 represented by its embedding in the vector space.
 
-Next, we need a method that will
-generate pairs of entities that we want to classify as being related or not.
-These candidate pairs are typically formed within one document, which means
-we'll have a function that takes a `Doc` as input and outputs a `List` of `Span`
-tuples. For instance, a very straightforward implementation
-would be to just take any two entities from the same document:
+Next, we need a method that will generate pairs of entities that we want to
+classify as being related or not. These candidate pairs are typically formed
+within one document, which means we'll have a function that takes a `Doc` as
+input and outputs a `List` of `Span` tuples. For instance, a very
+straightforward implementation would be to just take any two entities from the
+same document:
 
 ```python
 def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
@@ -536,10 +543,10 @@ def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
 > ```
 > [model]
 > @architectures = "rel_model.v1"
-> 
+>
 > [model.tok2vec]
 > ...
-> 
+>
 > [model.get_candidates]
 > @misc = "rel_cand_generator.v2"
 > max_length = 6
@@ -566,33 +573,76 @@ def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span
     return get_candidates
 ```
 
-Finally, we'll require a method that transforms the candidate pairs of entities into 
-a 2D tensor using the specified Tok2Vec function, and this `Floats2d` object will then be 
-processed by a final `output_layer` of the network. Taking all this together, we can define 
-our relation model like this in the config:
+Finally, we'll require a method that transforms the candidate pairs of entities
+into a 2D tensor using the specified Tok2Vec function, and this `Floats2d`
+object will then be processed by a final `output_layer` of the network. Taking
+all this together, we can define our relation model like this in the config:
 
-> ```
-> [model]
-> @architectures = "rel_model.v1"
-> nO = null
-> 
-> [model.tok2vec]
-> ...
-> 
-> [model.get_candidates]
-> @misc = "rel_cand_generator.v2"
-> max_length = 6
-> 
-> [components.relation_extractor.model.create_candidate_tensor]
-> @misc = "rel_cand_tensor.v1"
-> 
-> [components.relation_extractor.model.output_layer]
-> @architectures = "rel_output_layer.v1"
-> nI = null
-> nO = null
-> ```
+```
+[model]
+@architectures = "rel_model.v1"
+...
 
-<!-- Link to project for implementation details -->
+[model.tok2vec]
+...
+
+[model.get_candidates]
+@misc = "rel_cand_generator.v2"
+max_length = 6
+
+[model.create_candidate_tensor]
+@misc = "rel_cand_tensor.v1"
+
+[model.output_layer]
+@architectures = "rel_output_layer.v1"
+...
+```
+
+<!-- TODO: Link to project for implementation details -->
+
+When creating this model, we'll store the custom functions as
+[attributes](https://thinc.ai/docs/api-model#properties) and the sublayers as
+references, so we can access them easily:
+
+```python
+tok2vec_layer = model.get_ref("tok2vec")
+output_layer = model.get_ref("output_layer")
+create_candidate_tensor = model.attrs["create_candidate_tensor"]
+get_candidates = model.attrs["get_candidates"]
+```
+
+#### Step 2: Implementing the pipeline component {#component-rel-pipe}
+
+To use our new relation extraction model as part of a custom component, we 
+create a subclass of [`Pipe`](/api/pipe) that will hold the model:
+
+```python
+from spacy.pipeline import Pipe
+from spacy.language import Language
+
+class RelationExtractor(Pipe):
+     def __init__(self, vocab, model, name="rel", labels=[]):
+        ...
+
+    def predict(self, docs):
+        ...
+
+    def set_annotations(self, docs, scores):
+         ...
+
+@Language.factory("relation_extractor")
+def make_relation_extractor(nlp, name, model, labels):
+    return RelationExtractor(nlp.vocab, model, name, labels=labels)
+```
+
+The [`predict`](/api/pipe#predict ) function needs to be implemented for each subclass. 
+In our case, we can simply delegate to the internal model's 
+[predict](https://thinc.ai/docs/api-model#predict) function:
+```python
+def predict(self, docs: Iterable[Doc]) -> Floats2d:
+    scores = self.model.predict(docs)
+    return self.model.ops.asarray(scores)
+```
 
 
 

From b0463fbf75a83127352d52d6ac295bb73d16a6d0 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Sun, 4 Oct 2020 14:56:48 +0200
Subject: [PATCH 05/12] set_annotations explanation

---
 website/docs/usage/layers-architectures.md | 48 ++++++++++++++++++++--
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index c4b3fb9dc..7e563cb5c 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -613,7 +613,7 @@ get_candidates = model.attrs["get_candidates"]
 
 #### Step 2: Implementing the pipeline component {#component-rel-pipe}
 
-To use our new relation extraction model as part of a custom component, we 
+To use our new relation extraction model as part of a custom component, we
 create a subclass of [`Pipe`](/api/pipe) that will hold the model:
 
 ```python
@@ -635,15 +635,57 @@ def make_relation_extractor(nlp, name, model, labels):
     return RelationExtractor(nlp.vocab, model, name, labels=labels)
 ```
 
-The [`predict`](/api/pipe#predict ) function needs to be implemented for each subclass. 
-In our case, we can simply delegate to the internal model's 
+The [`predict`](/api/pipe#predict) function needs to be implemented for each
+subclass. In our case, we can simply delegate to the internal model's
 [predict](https://thinc.ai/docs/api-model#predict) function:
+
 ```python
 def predict(self, docs: Iterable[Doc]) -> Floats2d:
     scores = self.model.predict(docs)
     return self.model.ops.asarray(scores)
 ```
 
+The other method that needs to be implemented, is
+[`set_annotations`](/api/pipe#set_annotations). It takes the predicted scores,
+and modifies the given `Doc` object in place to hold the predictions. For our
+relation extraction component, we'll store the data as a dictionary in a custom
+extension attribute `doc._.rel`. As keys, we represent the candidate pair by the
+start offsets of each entity, as this defines an entity uniquely within one
+document.
+
+To interpret the scores predicted by the REL model correctly, we need to 
+refer to the model's `get_candidates` function that originally defined which 
+pairs of entities would be run through the model, so that the scores can be 
+related to those exact entities:
+
+> #### Example output
+>
+> ```python
+> doc = nlp("Amsterdam is the capital of the Netherlands.")
+> print(f"spans: {[(e.start, e.text, e.label_) for e in doc.ents]}")
+> for value, rel_dict in doc._.rel.items():
+>     print(f"{value}: {rel_dict}")
+> ```
+
+> ```
+> spans: [(0, 'Amsterdam', 'LOC'), (6, 'Netherlands', 'LOC')]
+> (0, 6): {'CAPITAL_OF': 0.89, 'LOCATED_IN': 0.75, 'UNRELATED': 0.002}
+> (6, 0): {'CAPITAL_OF': 0.01, 'LOCATED_IN': 0.13, 'UNRELATED': 0.017}
+> ```
+
+```python
+def set_annotations(self, docs: Iterable[Doc], rel_scores: Floats2d):
+    c = 0
+    get_candidates = self.model.attrs["get_candidates"]
+    for doc in docs:
+        for (e1, e2) in get_candidates(doc):
+            offset = (e1.start, e2.start)
+            if offset not in doc._.rel:
+                doc._.rel[offset] = {}
+            for j, label in enumerate(self.labels):
+                doc._.rel[offset][label] = rel_scores[c, j]
+            c += 1
+```
 
 
 

From 52b660e9dcc412fc1d4bbdf269c1bd31d9e7d3a4 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 5 Oct 2020 00:39:36 +0200
Subject: [PATCH 06/12] initialize and update explanation

---
 website/docs/api/pipe.md                   |   6 +
 website/docs/usage/layers-architectures.md | 149 ++++++++++++++++-----
 2 files changed, 119 insertions(+), 36 deletions(-)

diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md
index 4f5ac6f61..de35f9eb4 100644
--- a/website/docs/api/pipe.md
+++ b/website/docs/api/pipe.md
@@ -226,6 +226,12 @@ the "catastrophic forgetting" problem. This feature is experimental.
 Find the loss and gradient of loss for the batch of documents and their
 predicted scores.
 
+<Infobox variant="danger">
+
+This method needs to be overridden with your own custom `get_loss` method.
+
+</Infobox>
+
 > #### Example
 >
 > ```python
diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index 7e563cb5c..130a7144e 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -618,31 +618,97 @@ create a subclass of [`Pipe`](/api/pipe) that will hold the model:
 
 ```python
 from spacy.pipeline import Pipe
-from spacy.language import Language
 
 class RelationExtractor(Pipe):
      def __init__(self, vocab, model, name="rel", labels=[]):
+        self.model = model
         ...
 
     def predict(self, docs):
         ...
 
-    def set_annotations(self, docs, scores):
+    def set_annotations(self, docs, predictions):
          ...
-
-@Language.factory("relation_extractor")
-def make_relation_extractor(nlp, name, model, labels):
-    return RelationExtractor(nlp.vocab, model, name, labels=labels)
 ```
 
+Before the model can be used, however, it needs to be
+[initialized](/api/pipe#initialize). This function receives either the full
+training data set, or a representative sample. The training data can be used 
+to deduce all relevant labels. Alternatively, a list of labels can be provided, 
+or a script can call `rel_component.add_label()` to add each label separately.
+
+The number of labels will define the output dimensionality of the network, 
+and will be used to do 
+[shape inference](https://thinc.ai/docs/usage-models#validation) throughout 
+the layers of the neural network. This is triggered by calling `model.initialize`.
+
+```python
+from itertools import islice
+
+def initialize(
+    self,
+    get_examples: Callable[[], Iterable[Example]],
+    *,
+    nlp: Language = None,
+    labels: Optional[List[str]] = None,
+):
+    if labels is not None:
+        for label in labels:
+            self.add_label(label)
+    else:
+        for example in get_examples():
+            relations = example.reference._.rel
+            for indices, label_dict in relations.items():
+                for label in label_dict.keys():
+                    self.add_label(label)
+    subbatch = list(islice(get_examples(), 10))
+    doc_sample = [eg.reference for eg in subbatch]
+    label_sample = self._examples_to_truth(subbatch)
+    self.model.initialize(X=doc_sample, Y=label_sample)
+```
+ 
+The `initialize` method will be triggered whenever this component is part of an 
+`nlp` pipeline, and `nlp.initialize()` is invoked. After doing so, the pipeline 
+component and its internal model can be trained and used to make predictions.
+
+During training the function [`update`](/api/pipe#update) is invoked which delegates to 
+[`self.model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and 
+needs a function [`get_loss`](/api/pipe#get_loss) that will calculate the 
+loss for a batch of examples, as well as the gradient of loss that will be used to update 
+the weights of the model layers.
+
+```python
+def update(
+    self,
+    examples: Iterable[Example],
+    *,
+    drop: float = 0.0,
+    set_annotations: bool = False,
+    sgd: Optional[Optimizer] = None,
+    losses: Optional[Dict[str, float]] = None,
+) -> Dict[str, float]:
+    ...
+    docs = [ex.predicted for ex in examples]
+    predictions, backprop = self.model.begin_update(docs)
+    loss, gradient = self.get_loss(examples, predictions)
+    backprop(gradient)
+    losses[self.name] += loss
+    ...
+    return losses
+```
+
+Thinc provides some [loss functions](https://thinc.ai/docs/api-loss) that can be used 
+for the implementation of the `get_loss` function.
+
+When the internal model is trained, the component can be used to make novel predictions. 
 The [`predict`](/api/pipe#predict) function needs to be implemented for each
-subclass. In our case, we can simply delegate to the internal model's
+subclass of `Pipe`. In our case, we can simply delegate to the internal model's
 [predict](https://thinc.ai/docs/api-model#predict) function:
 
 ```python
 def predict(self, docs: Iterable[Doc]) -> Floats2d:
-    scores = self.model.predict(docs)
-    return self.model.ops.asarray(scores)
+    predictions = self.model.predict(docs)
+    return self.model.ops.asarray(predictions)
 ```
 
 The other method that needs to be implemented, is
@@ -650,7 +716,7 @@ The other method that needs to be implemented, is
 and modifies the given `Doc` object in place to hold the predictions. For our
 relation extraction component, we'll store the data as a dictionary in a custom
 extension attribute `doc._.rel`. As keys, we represent the candidate pair by the
-start offsets of each entity, as this defines an entity uniquely within one
+start offsets of each entity, as this defines an entity pair uniquely within one
 document.
 
 To interpret the scores predicted by the REL model correctly, we need to 
@@ -674,7 +740,7 @@ related to those exact entities:
 > ```
 
 ```python
-def set_annotations(self, docs: Iterable[Doc], rel_scores: Floats2d):
+def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d):
     c = 0
     get_candidates = self.model.attrs["get_candidates"]
     for doc in docs:
@@ -683,34 +749,45 @@ def set_annotations(self, docs: Iterable[Doc], rel_scores: Floats2d):
             if offset not in doc._.rel:
                 doc._.rel[offset] = {}
             for j, label in enumerate(self.labels):
-                doc._.rel[offset][label] = rel_scores[c, j]
+                doc._.rel[offset][label] = predictions[c, j]
             c += 1
 ```
 
-
-
-<Infobox title="This section is still under construction" emoji="🚧" variant="warning">
-</Infobox>
-
-<!-- TODO: write trainable component section
-- Interaction with `predict`, `get_loss` and `set_annotations`
-- Initialization life-cycle with `initialize`, correlation with add_label
-Example: relation extraction component (implemented as project template)
-Avoid duplication with usage/processing-pipelines#trainable-components ?
--->
-
-<!-- ![Diagram of a pipeline component with its model](../images/layers-architectures.svg)
+Under the hood, when the pipe is applied to a document, it will delegate to these 
+two methods: 
 
 ```python
-def update(self, examples):
-    docs = [ex.predicted for ex in examples]
-    refs = [ex.reference for ex in examples]
-    predictions, backprop = self.model.begin_update(docs)
-    gradient = self.get_loss(predictions, refs)
-    backprop(gradient)
-
-def __call__(self, doc):
-    predictions = self.model([doc])
-    self.set_annotations(predictions)
+def __call__(self, Doc doc):
+    predictions = self.predict([doc])
+    self.set_annotations([doc], predictions)
+    return doc
 ```
--->
+
+Once our `Pipe` subclass is fully implemented, we can 
+[register](http://localhost:8000/usage/processing-pipelines#custom-components-factories) 
+the component with the 
+`Language.factory` decorator. This will enable the creation of the component with 
+`nlp.add_pipe`, or via the config.
+
+> ```
+> 
+> [components.relation_extractor]
+> factory = "relation_extractor"
+> labels = []
+> 
+> [components.relation_extractor.model]
+> @architectures = "rel_model.v1"
+> ...
+> ```
+
+```python
+from spacy.language import Language
+
+@Language.factory("relation_extractor")
+def make_relation_extractor(nlp, name, model, labels):
+    return RelationExtractor(nlp.vocab, model, name, labels=labels)
+```
+
+<!-- TODO: refer once more to example project -->
+
+<!-- ![Diagram of a pipeline component with its model](../images/layers-architectures.svg) -->

From 9a6c9b133b796d4b766189740ef1fc88f6dbe3ee Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 5 Oct 2020 01:05:37 +0200
Subject: [PATCH 07/12] various small fixes

---
 website/docs/usage/layers-architectures.md | 142 +++++++++++----------
 1 file changed, 74 insertions(+), 68 deletions(-)

diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index 130a7144e..414562d6d 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -288,7 +288,7 @@ those parts of the network.
 
 To use our custom model including the PyTorch subnetwork, all we need to do is
 register the architecture using the
-[`architectures` registry](/api/top-level#registry). This will assign the
+[`architectures` registry](/api/top-level#registry). This assigns the
 architecture a name so spaCy knows how to find it, and allows passing in
 arguments like hyperparameters via the [config](/usage/training#config). The
 full example then becomes:
@@ -488,27 +488,27 @@ with Model.define_operators({">>": chain}):
 
 In addition to [swapping out](#swap-architectures) default models in built-in
 components, you can also implement an entirely new,
-[trainable pipeline component](usage/processing-pipelines#trainable-components)
+[trainable pipeline component](/usage/processing-pipelines#trainable-components)
 from scratch. This can be done by creating a new class inheriting from
 [`Pipe`](/api/pipe), and linking it up to your custom model implementation.
 
 ### Example: Pipeline component for relation extraction {#component-rel}
 
 This section outlines an example use-case of implementing a novel relation
-extraction component from scratch. We assume we want to implement a binary
-relation extraction method that determines whether two entities in a document
-are related or not, and if so, with what type of relation. We'll allow multiple
-types of relations between two such entities - i.e. it is a multi-label setting.
+extraction component from scratch. We'll implement a binary relation extraction
+method that determines whether or not two entities in a document are related,
+and if so, what type of relation. We'll allow multiple types of relations
+between two such entities (multi-label setting).
 
 There are two major steps required: first, we need to
 [implement a machine learning model](#component-rel-model) specific to this
-task, and then we'll use this model to
+task, and subsequently we use this model to
 [implement a custom pipeline component](#component-rel-pipe).
 
 #### Step 1: Implementing the Model {#component-rel-model}
 
-We'll need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes
-a list of documents as input, and outputs a two-dimensional matrix of scores:
+We need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes a
+list of documents as input, and outputs a two-dimensional matrix of predictions:
 
 ```python
 @registry.architectures.register("rel_model.v1")
@@ -519,17 +519,16 @@ def create_relation_model(...) -> Model[List[Doc], Floats2d]:
 
 The first layer in this model will typically be an
 [embedding layer](/usage/embeddings-transformers) such as a
-[`Tok2Vec`](/api/tok2vec) component or [`Transformer`](/api/transformer). This
-layer is assumed to be of type `Model[List["Doc"], List[Floats2d]]` as it
+[`Tok2Vec`](/api/tok2vec) component or a [`Transformer`](/api/transformer). This
+layer is assumed to be of type ~~Model[List[Doc], List[Floats2d]]~~ as it
 transforms each document into a list of tokens, with each token being
 represented by its embedding in the vector space.
 
-Next, we need a method that will generate pairs of entities that we want to
-classify as being related or not. These candidate pairs are typically formed
-within one document, which means we'll have a function that takes a `Doc` as
-input and outputs a `List` of `Span` tuples. For instance, a very
-straightforward implementation would be to just take any two entities from the
-same document:
+Next, we need a method that generates pairs of entities that we want to classify
+as being related or not. As these candidate pairs are typically formed within
+one document, this function takes a `Doc` as input and outputs a `List` of
+`Span` tuples. For instance, a very straightforward implementation would be to
+just take any two entities from the same document:
 
 ```python
 def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
@@ -549,12 +548,12 @@ def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
 >
 > [model.get_candidates]
 > @misc = "rel_cand_generator.v2"
-> max_length = 6
+> max_length = 20
 > ```
 
 But we could also refine this further by excluding relations of an entity with
 itself, and posing a maximum distance (in number of tokens) between two
-entities. We'll register this function in the
+entities. We register this function in the
 [`@misc` registry](/api/top-level#registry) so we can refer to it from the
 config, and easily swap it out for any other candidate generation function.
 
@@ -573,10 +572,10 @@ def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span
     return get_candidates
 ```
 
-Finally, we'll require a method that transforms the candidate pairs of entities
-into a 2D tensor using the specified Tok2Vec function, and this `Floats2d`
-object will then be processed by a final `output_layer` of the network. Taking
-all this together, we can define our relation model like this in the config:
+Finally, we require a method that transforms the candidate entity pairs into a
+2D tensor using the specified `Tok2Vec` function. The resulting `Floats2d`
+object will then be processed by a final `output_layer` of the network. Putting
+all this together, we can define our relation model in a config file as such:
 
 ```
 [model]
@@ -588,7 +587,7 @@ all this together, we can define our relation model like this in the config:
 
 [model.get_candidates]
 @misc = "rel_cand_generator.v2"
-max_length = 6
+max_length = 20
 
 [model.create_candidate_tensor]
 @misc = "rel_cand_tensor.v1"
@@ -600,7 +599,7 @@ max_length = 6
 
 <!-- TODO: Link to project for implementation details -->
 
-When creating this model, we'll store the custom functions as
+When creating this model, we store the custom functions as
 [attributes](https://thinc.ai/docs/api-model#properties) and the sublayers as
 references, so we can access them easily:
 
@@ -614,7 +613,7 @@ get_candidates = model.attrs["get_candidates"]
 #### Step 2: Implementing the pipeline component {#component-rel-pipe}
 
 To use our new relation extraction model as part of a custom component, we
-create a subclass of [`Pipe`](/api/pipe) that will hold the model:
+create a subclass of [`Pipe`](/api/pipe) that holds the model:
 
 ```python
 from spacy.pipeline import Pipe
@@ -624,6 +623,9 @@ class RelationExtractor(Pipe):
         self.model = model
         ...
 
+    def update(self, examples, ...):
+        ...
+
     def predict(self, docs):
         ...
 
@@ -631,18 +633,19 @@ class RelationExtractor(Pipe):
          ...
 ```
 
-Before the model can be used however, it needs to be 
-[initialized](/api/pipe#initialize). This function recieves either the full 
-training data set, or a representative sample. The training data can be used 
-to deduce all relevant labels. Alternatively, a list of labels can be provided, 
-or a script can call `rel_component.add_label()` to add each label separately.
+Before the model can be used, it needs to be
+[initialized](/api/pipe#initialize). This function receives either the full
+training data set, or a representative sample. This data set can be used to
+deduce all relevant labels. Alternatively, a list of labels can be provided, or
+a script can call `rel_component.add_label()` directly.
 
-The number of labels will define the output dimensionality of the network, 
-and will be used to do 
-[shape inference](https://thinc.ai/docs/usage-models#validation) throughout 
-the layers of the neural network. This is triggerd by calling `model.initialize`.
+The number of labels defines the output dimensionality of the network, and will
+be used to do [shape inference](https://thinc.ai/docs/usage-models#validation)
+throughout the layers of the neural network. This is triggered by calling
+`model.initialize`.
 
 ```python
+### {highlight="12,18,22"}
 from itertools import islice
 
 def initialize(
@@ -666,18 +669,21 @@ def initialize(
     label_sample = self._examples_to_truth(subbatch)
     self.model.initialize(X=doc_sample, Y=label_sample)
 ```
- 
-The `initialize` method will be triggered whenever this component is part of an 
-`nlp` pipeline, and `nlp.initialize()` is invoked. After doing so, the pipeline 
-component and its internal model can be trained and used to make predictions.
 
-During training the function [`update`](/api/pipe#update) is invoked which delegates to 
-[`self.model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and 
-needs a function [`get_loss`](/api/pipe#get_loss) that will calculate the 
-loss for a batch of examples, as well as the gradient of loss that will be used to update 
-the weights of the model layers.
+The `initialize` method is triggered whenever this component is part of an `nlp`
+pipeline, and [`nlp.initialize()`](/api/language#initialize) is invoked. After
+doing so, the pipeline component and its internal model can be trained and used
+to make predictions.
+
+During training, the function [`update`](/api/pipe#update) is invoked which
+delegates to
+[`self.model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a
+[`get_loss`](/api/pipe#get_loss) function that calculate the loss for a batch of
+examples, as well as the gradient of loss that will be used to update the
+weights of the model layers.
 
 ```python
+### {highlight="12-14"}
 def update(
     self,
     examples: Iterable[Example],
@@ -697,13 +703,13 @@ def update(
     return losses
 ```
 
-Thinc provides some [loss functions](https://thinc.ai/docs/api-loss) that can be used 
-for the implementation of the `get_loss` function.
+Thinc provides several [loss functions](https://thinc.ai/docs/api-loss) that can
+be used for the implementation of the `get_loss` function.
 
-When the internal model is trained, the component can be used to make novel predictions. 
-The [`predict`](/api/pipe#predict) function needs to be implemented for each
-subclass of `Pipe`. In our case, we can simply delegate to the internal model's
-[predict](https://thinc.ai/docs/api-model#predict) function:
+When the internal model is trained, the component can be used to make novel
+predictions. The [`predict`](/api/pipe#predict) function needs to be implemented
+for each subclass of `Pipe`. In our case, we can simply delegate to the internal
+model's [predict](https://thinc.ai/docs/api-model#predict) function:
 
 ```python
 def predict(self, docs: Iterable[Doc]) -> Floats2d:
@@ -711,24 +717,24 @@ def predict(self, docs: Iterable[Doc]) -> Floats2d:
     return self.model.ops.asarray(predictions)
 ```
 
-The other method that needs to be implemented, is
-[`set_annotations`](/api/pipe#set_annotations). It takes the predicted scores,
-and modifies the given `Doc` object in place to hold the predictions. For our
-relation extraction component, we'll store the data as a dictionary in a custom
+The final method that needs to be implemented, is
+[`set_annotations`](/api/pipe#set_annotations). This function takes the
+predictions, and modifies the given `Doc` object in place to store them. For our
+relation extraction component, we store the data as a dictionary in a custom
 extension attribute `doc._.rel`. As keys, we represent the candidate pair by the
 start offsets of each entity, as this defines an entity pair uniquely within one
 document.
 
-To interpret the scores predicted by the REL model correctly, we need to 
-refer to the model's `get_candidates` function that originally defined which 
-pairs of entities would be run through the model, so that the scores can be 
-related to those exact entities:
+To interpret the scores predicted by the REL model correctly, we need to refer
+to the model's `get_candidates` function that defined which pairs of entities
+were relevant candidates, so that the predictions can be linked to those exact
+entities:
 
 > #### Example output
 >
 > ```python
 > doc = nlp("Amsterdam is the capital of the Netherlands.")
-> print(f"spans: {[(e.start, e.text, e.label_) for e in doc.ents]}")
+> print(f"spans: {[(e.start, e.text, e.label_) for e in doc.ents]}")
 > for value, rel_dict in doc._.rel.items():
 >     print(f"{value}: {rel_dict}")
 > ```
@@ -740,6 +746,7 @@ related to those exact entities:
 > ```
 
 ```python
+###  {highlight="5-6,10"}
 def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d):
     c = 0
     get_candidates = self.model.attrs["get_candidates"]
@@ -753,8 +760,8 @@ def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d):
             c += 1
 ```
 
-Under the hood, when the pipe is applied to a document, it will delegate to these 
-two methods: 
+Under the hood, when the pipe is applied to a document, it delegates to the
+`predict` and `set_annotations` functions:
 
 ```python
 def __call__(self, Doc doc):
@@ -763,18 +770,17 @@ def __call__(self, Doc doc):
     return doc
 ```
 
-Once our `Pipe` subclass is fully implemented, we can 
-[register](http://localhost:8000/usage/processing-pipelines#custom-components-factories) 
-the component with the 
-`Language.factory` decorator. This will enable the creation of the component with 
-`nlp.add_pipe`, or via the config.
+Once our `Pipe` subclass is fully implemented, we can
+[register](/usage/processing-pipelines#custom-components-factories)
+the component with the `Language.factory` decorator. This enables the creation
+of the component with `nlp.add_pipe`, or via the config.
 
 > ```
-> 
+>
 > [components.relation_extractor]
 > factory = "relation_extractor"
 > labels = []
-> 
+>
 > [components.relation_extractor.model]
 > @architectures = "rel_model.v1"
 > ...

From b0b93854cb2c522090c87544e33a19e6b361ed19 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 5 Oct 2020 09:26:43 +0200
Subject: [PATCH 08/12] Update ru/uk lemmatizers for new nlp.initialize

---
 spacy/lang/ru/__init__.py   | 10 ++++++++--
 spacy/lang/ru/lemmatizer.py |  5 ++---
 spacy/lang/uk/__init__.py   |  4 ++--
 spacy/lang/uk/lemmatizer.py |  5 ++---
 spacy/tests/conftest.py     |  1 -
 5 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py
index 1d59ca043..2f3965fcc 100644
--- a/spacy/lang/ru/__init__.py
+++ b/spacy/lang/ru/__init__.py
@@ -25,8 +25,14 @@ class Russian(Language):
     default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
-    return RussianLemmatizer(nlp.vocab, model, name, mode=mode)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool = False,
+):
+    return RussianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
 
 
 __all__ = ["Russian"]
diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py
index 8d7996c63..3bcac8730 100644
--- a/spacy/lang/ru/lemmatizer.py
+++ b/spacy/lang/ru/lemmatizer.py
@@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Tuple
 
 from thinc.api import Model
 
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 from ...symbols import POS
 from ...tokens import Token
@@ -22,9 +21,9 @@ class RussianLemmatizer(Lemmatizer):
         name: str = "lemmatizer",
         *,
         mode: str = "pymorphy2",
-        lookups: Optional[Lookups] = None,
+        overwrite: bool = False,
     ) -> None:
-        super().__init__(vocab, model, name, mode=mode, lookups=lookups)
+        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
 
         try:
             from pymorphy2 import MorphAnalyzer
diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py
index 73c065379..0abe9170e 100644
--- a/spacy/lang/uk/__init__.py
+++ b/spacy/lang/uk/__init__.py
@@ -26,8 +26,8 @@ class Ukrainian(Language):
     default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
-    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False,):
+    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
 
 
 __all__ = ["Ukrainian"]
diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py
index 0d6febce6..009ec5044 100644
--- a/spacy/lang/uk/lemmatizer.py
+++ b/spacy/lang/uk/lemmatizer.py
@@ -3,7 +3,6 @@ from typing import Optional
 from thinc.api import Model
 
 from ..ru.lemmatizer import RussianLemmatizer
-from ...lookups import Lookups
 from ...vocab import Vocab
 
 
@@ -15,9 +14,9 @@ class UkrainianLemmatizer(RussianLemmatizer):
         name: str = "lemmatizer",
         *,
         mode: str = "pymorphy2",
-        lookups: Optional[Lookups] = None,
+        overwrite: bool = False,
     ) -> None:
-        super().__init__(vocab, model, name, mode=mode, lookups=lookups)
+        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
         try:
             from pymorphy2 import MorphAnalyzer
         except ImportError:
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 4a3d126d7..67860b7e4 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -248,7 +248,6 @@ def tt_tokenizer():
 @pytest.fixture(scope="session")
 def uk_tokenizer():
     pytest.importorskip("pymorphy2")
-    pytest.importorskip("pymorphy2.lang")
     return get_lang_class("uk")().tokenizer
 
 

From 1c641e41c3d46c5b555891427833200c0f0087b5 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 5 Oct 2020 11:50:11 +0200
Subject: [PATCH 09/12] Remove unused import [ci skip]

---
 spacy/tests/regression/test_issue5918.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/tests/regression/test_issue5918.py b/spacy/tests/regression/test_issue5918.py
index e4ee0135d..d25323ef6 100644
--- a/spacy/tests/regression/test_issue5918.py
+++ b/spacy/tests/regression/test_issue5918.py
@@ -1,6 +1,5 @@
 from spacy.lang.en import English
 from spacy.pipeline import merge_entities
-import pytest
 
 
 def test_issue5918():

From e3acad626443c9cf0b81f600aae2b3b9529b63cd Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 5 Oct 2020 13:06:20 +0200
Subject: [PATCH 10/12] Update docs [ci skip]

---
 website/docs/usage/layers-architectures.md | 261 +++++++++++++--------
 1 file changed, 162 insertions(+), 99 deletions(-)

diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index 414562d6d..24c7bf1cf 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -86,7 +86,8 @@ see are: ​
 | ~~Ragged~~         | A container to handle variable-length sequence data in an unpadded contiguous array.                 |
 | ~~Padded~~         | A container to handle variable-length sequence data in a padded contiguous array.                    |
 
-The model type signatures help you figure out which model architectures and
+See the [Thinc type reference](https://thinc.ai/docs/api-types) for details. The
+model type signatures help you figure out which model architectures and
 components can **fit together**. For instance, the
 [`TextCategorizer`](/api/textcategorizer) class expects a model typed
 ~~Model[List[Doc], Floats2d]~~, because the model will predict one row of
@@ -488,32 +489,57 @@ with Model.define_operators({">>": chain}):
 
 In addition to [swapping out](#swap-architectures) default models in built-in
 components, you can also implement an entirely new,
-[trainable pipeline component](/usage/processing-pipelines#trainable-components)
+[trainable](/usage/processing-pipelines#trainable-components) pipeline component
 from scratch. This can be done by creating a new class inheriting from
 [`Pipe`](/api/pipe), and linking it up to your custom model implementation.
 
-### Example: Pipeline component for relation extraction {#component-rel}
+<Infobox title="Trainable component API" emoji="💡">
 
-This section outlines an example use-case of implementing a novel relation
-extraction component from scratch. We'll implement a binary relation extraction
-method that determines whether or not two entities in a document are related,
-and if so, what type of relation. We'll allow multiple types of relations
-between two such entities (multi-label setting).
+For details on how to implement pipeline components, check out the usage guide
+on [custom components](/usage/processing-pipelines#custom-component) and the
+overview of the `Pipe` methods used by
+[trainable components](/usage/processing-pipelines#trainable-components).
 
-There are two major steps required: first, we need to
-[implement a machine learning model](#component-rel-model) specific to this
-task, and subsequently we use this model to
-[implement a custom pipeline component](#component-rel-pipe).
+</Infobox>
+
+### Example: Entity relation extraction component {#component-rel}
+
+This section outlines an example use-case of implementing a **novel relation
+extraction component** from scratch. We'll implement a binary relation
+extraction method that determines whether or not **two entities** in a document
+are related, and if so, what type of relation. We'll allow multiple types of
+relations between two such entities (multi-label setting). There are two major
+steps required:
+
+1. Implement a [machine learning model](#component-rel-model) specific to this
+   task. It will have to extract candidates from a [`Doc`](/api/doc) and predict
+   a relation for the available candidate pairs.
+2. Implement a custom [pipeline component](#component-rel-pipe) powered by the
+   machine learning model that sets annotations on the [`Doc`](/api/doc) passing
+   through the pipeline.
+
+<!-- TODO: <Project id="tutorials/ner-relations">
+
+</Project> -->
 
 #### Step 1: Implementing the Model {#component-rel-model}
 
 We need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes a
-list of documents as input, and outputs a two-dimensional matrix of predictions:
+**list of documents** (~~List[Doc]~~) as input, and outputs a **two-dimensional
+matrix** (~~Floats2d~~) of predictions:
+
+> #### Model type annotations
+>
+> The `Model` class is a generic type that can specify its input and output
+> types, e.g. ~~Model[List[Doc], Floats2d]~~. Type hints are used for static
+> type checks and validation. See the section on [type signatures](#type-sigs)
+> for details.
 
 ```python
+### Register the model architecture
 @registry.architectures.register("rel_model.v1")
 def create_relation_model(...) -> Model[List[Doc], Floats2d]:
-    model = _create_my_model()
+    model = ...  # 👈 model will go here
     return model
 ```
 
@@ -521,17 +547,18 @@ The first layer in this model will typically be an
 [embedding layer](/usage/embeddings-transformers) such as a
 [`Tok2Vec`](/api/tok2vec) component or a [`Transformer`](/api/transformer). This
 layer is assumed to be of type ~~Model[List[Doc], List[Floats2d]]~~ as it
-transforms each document into a list of tokens, with each token being
+transforms each **document into a list of tokens**, with each token being
 represented by its embedding in the vector space.
 
-Next, we need a method that generates pairs of entities that we want to classify
-as being related or not. As these candidate pairs are typically formed within
-one document, this function takes a `Doc` as input and outputs a `List` of
-`Span` tuples. For instance, a very straightforward implementation would be to
-just take any two entities from the same document:
+Next, we need a method that **generates pairs of entities** that we want to
+classify as being related or not. As these candidate pairs are typically formed
+within one document, this function takes a [`Doc`](/api/doc) as input and
+outputs a `List` of `Span` tuples. For instance, a very straightforward
+implementation would be to just take any two entities from the same document:
 
 ```python
-def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
+### Simple candidate generation
+def get_candidates(doc: Doc) -> List[Tuple[Span, Span]]:
     candidates = []
     for ent1 in doc.ents:
         for ent2 in doc.ents:
@@ -539,27 +566,29 @@ def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
     return candidates
 ```
 
-> ```
-> [model]
-> @architectures = "rel_model.v1"
->
-> [model.tok2vec]
-> ...
->
-> [model.get_candidates]
-> @misc = "rel_cand_generator.v2"
-> max_length = 20
-> ```
-
-But we could also refine this further by excluding relations of an entity with
-itself, and posing a maximum distance (in number of tokens) between two
+But we could also refine this further by **excluding relations** of an entity
+with itself, and posing a **maximum distance** (in number of tokens) between two
 entities. We register this function in the
 [`@misc` registry](/api/top-level#registry) so we can refer to it from the
 config, and easily swap it out for any other candidate generation function.
 
+> #### config.cfg (excerpt)
+>
+> ```ini
+> [model]
+> @architectures = "rel_model.v1"
+>
+> [model.tok2vec]
+> # ...
+>
+> [model.get_candidates]
+> @misc = "rel_cand_generator.v1"
+> max_length = 20
+> ```
+
 ```python
-### {highlight="1,2,7,8"}
-@registry.misc.register("rel_cand_generator.v2")
+### Extended candidate generation {highlight="1,2,7,8"}
+@registry.misc.register("rel_cand_generator.v1")
 def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
     def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
         candidates = []
@@ -573,17 +602,19 @@ def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span
 ```
 
 Finally, we require a method that transforms the candidate entity pairs into a
-2D tensor using the specified `Tok2Vec` function. The resulting `Floats2d`
-object will then be processed by a final `output_layer` of the network. Putting
-all this together, we can define our relation model in a config file as such:
+2D tensor using the specified [`Tok2Vec`](/api/tok2vec) or
+[`Transformer`](/api/transformer). The resulting ~~Floats2d~~ object will then be
+processed by a final `output_layer` of the network. Putting all this together,
+we can define our relation model in a config file as such:
 
-```
+```ini
+### config.cfg
 [model]
 @architectures = "rel_model.v1"
-...
+# ...
 
 [model.tok2vec]
-...
+# ...
 
 [model.get_candidates]
 @misc = "rel_cand_generator.v2"
@@ -594,10 +625,11 @@ max_length = 20
 
 [model.output_layer]
 @architectures = "rel_output_layer.v1"
-...
+# ...
 ```
 
-<!-- TODO: Link to project for implementation details -->
+<!-- TODO: link to project for implementation details -->
+<!-- TODO: maybe embed files from project that show the architectures? -->
 
 When creating this model, we store the custom functions as
 [attributes](https://thinc.ai/docs/api-model#properties) and the sublayers as
@@ -612,40 +644,55 @@ get_candidates = model.attrs["get_candidates"]
 
 #### Step 2: Implementing the pipeline component {#component-rel-pipe}
 
-To use our new relation extraction model as part of a custom component, we
+To use our new relation extraction model as part of a custom
+[trainable component](/usage/processing-pipelines#trainable-components), we
 create a subclass of [`Pipe`](/api/pipe) that holds the model:
 
 ```python
+### Pipeline component skeleton
 from spacy.pipeline import Pipe
 
 class RelationExtractor(Pipe):
-     def __init__(self, vocab, model, name="rel", labels=[]):
+    def __init__(self, vocab, model, name="rel"):
+        """Create a component instance."""
         self.model = model
-        ...
+        self.vocab = vocab
+        self.name = name
 
-    def update(self, examples, ...):
+    def update(self, examples, drop=0.0, set_annotations=False, sgd=None, losses=None):
+        """Learn from a batch of Example objects."""
         ...
 
     def predict(self, docs):
+        """Apply the model to a batch of Doc objects."""
         ...
 
     def set_annotations(self, docs, predictions):
+        """Modify a batch of Doc objects using the predictions."""
          ...
+
+    def initialize(self, get_examples, nlp=None, labels=None):
+        """Initialize the model before training."""
+        ...
+
+    def add_label(self, label):
+        """Add a label to the component."""
+        ...
 ```
 
 Before the model can be used, it needs to be
-[initialized](/api/pipe#initialize). This function receives either the full
-training data set, or a representative sample. This data set can be used to
-deduce all relevant labels. Alternatively, a list of labels can be provided, or
-a script can call `rel_component.add_label()` directly.
-
-The number of labels defines the output dimensionality of the network, and will
-be used to do [shape inference](https://thinc.ai/docs/usage-models#validation)
-throughout the layers of the neural network. This is triggered by calling
-`model.initialize`.
+[initialized](/usage/training#initialization). This function receives a callback
+to access the full **training data set**, or a representative sample. This data
+set can be used to deduce all **relevant labels**. Alternatively, a list of
+labels can be provided to `initialize`, or you can call the
+`RelationExtractor.add_label` method directly. The number of labels defines the output
+dimensionality of the network, and will be used to do
+[shape inference](https://thinc.ai/docs/usage-models#validation) throughout the
+layers of the neural network. This is triggered by calling
+[`Model.initialize`](https://thinc.ai/docs/api-model#initialize).
 
 ```python
-### {highlight="12,18,22"}
+### The initialize method {highlight="12,18,22"}
 from itertools import islice
 
 def initialize(
@@ -671,19 +718,22 @@ def initialize(
 ```
 
 The `initialize` method is triggered whenever this component is part of an `nlp`
-pipeline, and [`nlp.initialize()`](/api/language#initialize) is invoked. After
-doing so, the pipeline component and its internal model can be trained and used
-to make predictions.
+pipeline, and [`nlp.initialize`](/api/language#initialize) is invoked.
+Typically, this happens when the pipeline is set up before training in
+[`spacy train`](/api/cli#training). After initialization, the pipeline component
+and its internal model can be trained and used to make predictions.
 
 During training, the function [`update`](/api/pipe#update) is invoked which
 delegates to
-[`self.model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a
-[`get_loss`](/api/pipe#get_loss) function that calculate the loss for a batch of
-examples, as well as the gradient of loss that will be used to update the
-weights of the model layers.
+[`Model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a
+[`get_loss`](/api/pipe#get_loss) function that **calculate the loss** for a
+batch of examples, as well as the **gradient** of loss that will be used to
+update the weights of the model layers. Thinc provides several
+[loss functions](https://thinc.ai/docs/api-loss) that can be used for the
+implementation of the `get_loss` function.
 
 ```python
-### {highlight="12-14"}
+### The update method {highlight="12-14"}
 def update(
     self,
     examples: Iterable[Example],
@@ -703,15 +753,14 @@ def update(
     return losses
 ```
 
-Thinc provides several [loss functions](https://thinc.ai/docs/api-loss) that can
-be used for the implementation of the `get_loss` function.
-
 When the internal model is trained, the component can be used to make novel
-predictions. The [`predict`](/api/pipe#predict) function needs to be implemented
-for each subclass of `Pipe`. In our case, we can simply delegate to the internal
-model's [predict](https://thinc.ai/docs/api-model#predict) function:
+**predictions**. The [`predict`](/api/pipe#predict) function needs to be
+implemented for each subclass of `Pipe`. In our case, we can simply delegate to
+the internal model's [predict](https://thinc.ai/docs/api-model#predict) function
+that takes a batch of `Doc` objects and returns a ~~Floats2d~~ array:
 
 ```python
+### The predict method
 def predict(self, docs: Iterable[Doc]) -> Floats2d:
     predictions = self.model.predict(docs)
     return self.model.ops.asarray(predictions)
@@ -721,32 +770,36 @@ The final method that needs to be implemented, is
 [`set_annotations`](/api/pipe#set_annotations). This function takes the
 predictions, and modifies the given `Doc` object in place to store them. For our
 relation extraction component, we store the data as a dictionary in a custom
-extension attribute `doc._.rel`. As keys, we represent the candidate pair by the
-start offsets of each entity, as this defines an entity pair uniquely within one
-document.
+[extension attribute](/usage/processing-pipelines#custom-components-attributes)
+`doc._.rel`. As keys, we represent the candidate pair by the **start offsets of
+each entity**, as this defines an entity pair uniquely within one document.
 
-To interpret the scores predicted by the REL model correctly, we need to refer
-to the model's `get_candidates` function that defined which pairs of entities
-were relevant candidates, so that the predictions can be linked to those exact
-entities:
+To interpret the scores predicted by the relation extraction model correctly, we
+need to refer to the model's `get_candidates` function that defined which pairs
+of entities were relevant candidates, so that the predictions can be linked to
+those exact entities:
 
 > #### Example output
 >
 > ```python
 > doc = nlp("Amsterdam is the capital of the Netherlands.")
-> print(f"spans: [(e.start, e.text, e.label_) for e in doc.ents]")
+> print("spans", [(e.start, e.text, e.label_) for e in doc.ents])
 > for value, rel_dict in doc._.rel.items():
 >     print(f"{value}: {rel_dict}")
-> ```
-
-> ```
-> spans [(0, 'Amsterdam', 'LOC'), (6, 'Netherlands', 'LOC')]
-> (0, 6): {'CAPITAL_OF': 0.89, 'LOCATED_IN': 0.75, 'UNRELATED': 0.002}
-> (6, 0): {'CAPITAL_OF': 0.01, 'LOCATED_IN': 0.13, 'UNRELATED': 0.017}
+>
+> # spans [(0, 'Amsterdam', 'LOC'), (6, 'Netherlands', 'LOC')]
+> # (0, 6): {'CAPITAL_OF': 0.89, 'LOCATED_IN': 0.75, 'UNRELATED': 0.002}
+> # (6, 0): {'CAPITAL_OF': 0.01, 'LOCATED_IN': 0.13, 'UNRELATED': 0.017}
 > ```
 
 ```python
-###  {highlight="5-6,10"}
+### Registering the extension attribute
+from spacy.tokens import Doc
+Doc.set_extension("rel", default={})
+```
+
+```python
+### The set_annotations method {highlight="5-6,10"}
 def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d):
     c = 0
     get_candidates = self.model.attrs["get_candidates"]
@@ -761,9 +814,10 @@ def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d):
 ```
 
 Under the hood, when the pipe is applied to a document, it delegates to the
-`predict` and `set_annotations` functions:
+`predict` and `set_annotations` methods:
 
 ```python
+### The __call__ method
 def __call__(self, Doc doc):
     predictions = self.predict([doc])
     self.set_annotations([doc], predictions)
@@ -771,29 +825,38 @@ def __call__(self, Doc doc):
 ```
 
 Once our `Pipe` subclass is fully implemented, we can
-[register](http://localhost:8000/usage/processing-pipelines#custom-components-factories)
-the component with the `Language.factory` decorator. This enables the creation
-of the component with `nlp.add_pipe`, or via the config.
+[register](/usage/processing-pipelines#custom-components-factories) the
+component with the [`@Language.factory`](/api/language#factory) decorator. This
+assigns it a name and lets you create the component with
+[`nlp.add_pipe`](/api/language#add_pipe) and via the
+[config](/usage/training#config).
 
-> ```
+> #### config.cfg (excerpt)
 >
+> ```ini
 > [components.relation_extractor]
 > factory = "relation_extractor"
-> labels = []
 >
 > [components.relation_extractor.model]
 > @architectures = "rel_model.v1"
-> ...
+>
+> [components.relation_extractor.model.tok2vec]
+> # ...
+>
+> [components.relation_extractor.model.get_candidates]
+> @misc = "rel_cand_generator.v1"
+> max_length = 20
 > ```
 
 ```python
+### Registering the pipeline component
 from spacy.language import Language
 
 @Language.factory("relation_extractor")
-def make_relation_extractor(nlp, name, model, labels):
-    return RelationExtractor(nlp.vocab, model, name, labels=labels)
+def make_relation_extractor(nlp, name, model):
+    return RelationExtractor(nlp.vocab, model, name)
 ```
 
-<!-- TODO: refer once more to example project -->
+<!-- TODO: <Project id="tutorials/ner-relations">
 
-<!-- ![Diagram of a pipeline component with its model](../images/layers-architectures.svg) -->
+</Project> -->

From fd2d48556c1e77f4492693e4a69dc8f4a34cfe34 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 5 Oct 2020 13:43:32 +0200
Subject: [PATCH 11/12] fix E902 and E903 numbering

---
 spacy/errors.py                                | 4 ++--
 spacy/training/converters/conll_ner_to_docs.py | 2 +-
 spacy/training/converters/iob_to_docs.py       | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 20edf45b5..9d9a716d2 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -456,10 +456,10 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")
 
     # TODO: fix numbering after merging develop into master
-    E092 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
+    E902 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
             "Try checking whitespace and delimiters. See "
             "https://nightly.spacy.io/api/cli#convert")
-    E093 = ("The token-per-line NER file is not formatted correctly. Try checking "
+    E903 = ("The token-per-line NER file is not formatted correctly. Try checking "
             "whitespace and delimiters. See https://nightly.spacy.io/api/cli#convert")
     E904 = ("Cannot initialize StaticVectors layer: nO dimension unset. This "
             "dimension refers to the output width, after the linear projection "
diff --git a/spacy/training/converters/conll_ner_to_docs.py b/spacy/training/converters/conll_ner_to_docs.py
index 28f0f87c3..c01686aee 100644
--- a/spacy/training/converters/conll_ner_to_docs.py
+++ b/spacy/training/converters/conll_ner_to_docs.py
@@ -103,7 +103,7 @@ def conll_ner_to_docs(
             lines = [line.strip() for line in conll_sent.split("\n") if line.strip()]
             cols = list(zip(*[line.split() for line in lines]))
             if len(cols) < 2:
-                raise ValueError(Errors.E093)
+                raise ValueError(Errors.E903)
             length = len(cols[0])
             words.extend(cols[0])
             sent_starts.extend([True] + [False] * (length - 1))
diff --git a/spacy/training/converters/iob_to_docs.py b/spacy/training/converters/iob_to_docs.py
index 73ad8953d..a2185fef7 100644
--- a/spacy/training/converters/iob_to_docs.py
+++ b/spacy/training/converters/iob_to_docs.py
@@ -46,7 +46,7 @@ def read_iob(raw_sents, vocab, n_sents):
                 sent_words, sent_iob = zip(*sent_tokens)
                 sent_tags = ["-"] * len(sent_words)
             else:
-                raise ValueError(Errors.E092)
+                raise ValueError(Errors.E902)
             words.extend(sent_words)
             tags.extend(sent_tags)
             iob.extend(sent_iob)

From 20f2a17a09dc053b5f2f06cff637fb92647137ad Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 5 Oct 2020 13:45:57 +0200
Subject: [PATCH 12/12] Merge test_misc and test_util

---
 spacy/tests/test_misc.py | 134 ++++++++++++++++++++++++++++++++++++++
 spacy/tests/test_util.py | 137 ---------------------------------------
 2 files changed, 134 insertions(+), 137 deletions(-)
 delete mode 100644 spacy/tests/test_util.py

diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index e6ef45f90..bdf54ad6a 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -7,6 +7,15 @@ from spacy import util
 from spacy import prefer_gpu, require_gpu
 from spacy.ml._precomputable_affine import PrecomputableAffine
 from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
+from spacy.util import dot_to_object, SimpleFrozenList
+from thinc.api import Config, Optimizer, ConfigValidationError
+from spacy.training.batchers import minibatch_by_words
+from spacy.lang.en import English
+from spacy.lang.nl import Dutch
+from spacy.language import DEFAULT_CONFIG_PATH
+from spacy.schemas import ConfigSchemaTraining
+
+from .util import get_random_doc
 
 
 @pytest.fixture
@@ -157,3 +166,128 @@ def test_dot_to_dict(dot_notation, expected):
     result = util.dot_to_dict(dot_notation)
     assert result == expected
     assert util.dict_to_dot(result) == dot_notation
+
+
+@pytest.mark.parametrize(
+    "doc_sizes, expected_batches",
+    [
+        ([400, 400, 199], [3]),
+        ([400, 400, 199, 3], [4]),
+        ([400, 400, 199, 3, 200], [3, 2]),
+        ([400, 400, 199, 3, 1], [5]),
+        ([400, 400, 199, 3, 1, 1500], [5]),  # 1500 will be discarded
+        ([400, 400, 199, 3, 1, 200], [3, 3]),
+        ([400, 400, 199, 3, 1, 999], [3, 3]),
+        ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]),
+        ([1, 2, 999], [3]),
+        ([1, 2, 999, 1], [4]),
+        ([1, 200, 999, 1], [2, 2]),
+        ([1, 999, 200, 1], [2, 2]),
+    ],
+)
+def test_util_minibatch(doc_sizes, expected_batches):
+    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
+    tol = 0.2
+    batch_size = 1000
+    batches = list(
+        minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True)
+    )
+    assert [len(batch) for batch in batches] == expected_batches
+
+    max_size = batch_size + batch_size * tol
+    for batch in batches:
+        assert sum([len(doc) for doc in batch]) < max_size
+
+
+@pytest.mark.parametrize(
+    "doc_sizes, expected_batches",
+    [
+        ([400, 4000, 199], [1, 2]),
+        ([400, 400, 199, 3000, 200], [1, 4]),
+        ([400, 400, 199, 3, 1, 1500], [1, 5]),
+        ([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]),
+        ([1, 2, 9999], [1, 2]),
+        ([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]),
+    ],
+)
+def test_util_minibatch_oversize(doc_sizes, expected_batches):
+    """ Test that oversized documents are returned in their own batch"""
+    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
+    tol = 0.2
+    batch_size = 1000
+    batches = list(
+        minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False)
+    )
+    assert [len(batch) for batch in batches] == expected_batches
+
+
+def test_util_dot_section():
+    cfg_string = """
+    [nlp]
+    lang = "en"
+    pipeline = ["textcat"]
+
+    [components]
+
+    [components.textcat]
+    factory = "textcat"
+
+    [components.textcat.model]
+    @architectures = "spacy.TextCatBOW.v1"
+    exclusive_classes = true
+    ngram_size = 1
+    no_output_layer = false
+    """
+    nlp_config = Config().from_str(cfg_string)
+    en_nlp = util.load_model_from_config(nlp_config, auto_fill=True)
+    default_config = Config().from_disk(DEFAULT_CONFIG_PATH)
+    default_config["nlp"]["lang"] = "nl"
+    nl_nlp = util.load_model_from_config(default_config, auto_fill=True)
+    # Test that creation went OK
+    assert isinstance(en_nlp, English)
+    assert isinstance(nl_nlp, Dutch)
+    assert nl_nlp.pipe_names == []
+    assert en_nlp.pipe_names == ["textcat"]
+    # not exclusive_classes
+    assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
+    # Test that default values got overwritten
+    assert en_nlp.config["nlp"]["pipeline"] == ["textcat"]
+    assert nl_nlp.config["nlp"]["pipeline"] == []  # default value []
+    # Test proper functioning of 'dot_to_object'
+    with pytest.raises(KeyError):
+        dot_to_object(en_nlp.config, "nlp.pipeline.tagger")
+    with pytest.raises(KeyError):
+        dot_to_object(en_nlp.config, "nlp.unknownattribute")
+    T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining)
+    assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer)
+
+
+def test_simple_frozen_list():
+    t = SimpleFrozenList(["foo", "bar"])
+    assert t == ["foo", "bar"]
+    assert t.index("bar") == 1  # okay method
+    with pytest.raises(NotImplementedError):
+        t.append("baz")
+    with pytest.raises(NotImplementedError):
+        t.sort()
+    with pytest.raises(NotImplementedError):
+        t.extend(["baz"])
+    with pytest.raises(NotImplementedError):
+        t.pop()
+    t = SimpleFrozenList(["foo", "bar"], error="Error!")
+    with pytest.raises(NotImplementedError):
+        t.append("baz")
+
+
+def test_resolve_dot_names():
+    config = {
+        "training": {"optimizer": {"@optimizers": "Adam.v1"}},
+        "foo": {"bar": "training.optimizer", "baz": "training.xyz"},
+    }
+    result = util.resolve_dot_names(config, ["training.optimizer"])
+    assert isinstance(result[0], Optimizer)
+    with pytest.raises(ConfigValidationError) as e:
+        util.resolve_dot_names(config, ["training.xyz", "training.optimizer"])
+    errors = e.value.errors
+    assert len(errors) == 1
+    assert errors[0]["loc"] == ["training", "xyz"]
diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py
deleted file mode 100644
index f710a38eb..000000000
--- a/spacy/tests/test_util.py
+++ /dev/null
@@ -1,137 +0,0 @@
-import pytest
-
-from spacy import util
-from spacy.util import dot_to_object, SimpleFrozenList
-from thinc.api import Config, Optimizer, ConfigValidationError
-from spacy.training.batchers import minibatch_by_words
-from spacy.lang.en import English
-from spacy.lang.nl import Dutch
-from spacy.language import DEFAULT_CONFIG_PATH
-from spacy.schemas import ConfigSchemaTraining
-
-from .util import get_random_doc
-
-
-@pytest.mark.parametrize(
-    "doc_sizes, expected_batches",
-    [
-        ([400, 400, 199], [3]),
-        ([400, 400, 199, 3], [4]),
-        ([400, 400, 199, 3, 200], [3, 2]),
-        ([400, 400, 199, 3, 1], [5]),
-        ([400, 400, 199, 3, 1, 1500], [5]),  # 1500 will be discarded
-        ([400, 400, 199, 3, 1, 200], [3, 3]),
-        ([400, 400, 199, 3, 1, 999], [3, 3]),
-        ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]),
-        ([1, 2, 999], [3]),
-        ([1, 2, 999, 1], [4]),
-        ([1, 200, 999, 1], [2, 2]),
-        ([1, 999, 200, 1], [2, 2]),
-    ],
-)
-def test_util_minibatch(doc_sizes, expected_batches):
-    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
-    tol = 0.2
-    batch_size = 1000
-    batches = list(
-        minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True)
-    )
-    assert [len(batch) for batch in batches] == expected_batches
-
-    max_size = batch_size + batch_size * tol
-    for batch in batches:
-        assert sum([len(doc) for doc in batch]) < max_size
-
-
-@pytest.mark.parametrize(
-    "doc_sizes, expected_batches",
-    [
-        ([400, 4000, 199], [1, 2]),
-        ([400, 400, 199, 3000, 200], [1, 4]),
-        ([400, 400, 199, 3, 1, 1500], [1, 5]),
-        ([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]),
-        ([1, 2, 9999], [1, 2]),
-        ([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]),
-    ],
-)
-def test_util_minibatch_oversize(doc_sizes, expected_batches):
-    """ Test that oversized documents are returned in their own batch"""
-    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
-    tol = 0.2
-    batch_size = 1000
-    batches = list(
-        minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False)
-    )
-    assert [len(batch) for batch in batches] == expected_batches
-
-
-def test_util_dot_section():
-    cfg_string = """
-    [nlp]
-    lang = "en"
-    pipeline = ["textcat"]
-
-    [components]
-
-    [components.textcat]
-    factory = "textcat"
-
-    [components.textcat.model]
-    @architectures = "spacy.TextCatBOW.v1"
-    exclusive_classes = true
-    ngram_size = 1
-    no_output_layer = false
-    """
-    nlp_config = Config().from_str(cfg_string)
-    en_nlp = util.load_model_from_config(nlp_config, auto_fill=True)
-    default_config = Config().from_disk(DEFAULT_CONFIG_PATH)
-    default_config["nlp"]["lang"] = "nl"
-    nl_nlp = util.load_model_from_config(default_config, auto_fill=True)
-    # Test that creation went OK
-    assert isinstance(en_nlp, English)
-    assert isinstance(nl_nlp, Dutch)
-    assert nl_nlp.pipe_names == []
-    assert en_nlp.pipe_names == ["textcat"]
-    # not exclusive_classes
-    assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
-    # Test that default values got overwritten
-    assert en_nlp.config["nlp"]["pipeline"] == ["textcat"]
-    assert nl_nlp.config["nlp"]["pipeline"] == []  # default value []
-    # Test proper functioning of 'dot_to_object'
-    with pytest.raises(KeyError):
-        dot_to_object(en_nlp.config, "nlp.pipeline.tagger")
-    with pytest.raises(KeyError):
-        dot_to_object(en_nlp.config, "nlp.unknownattribute")
-    T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining)
-    assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer)
-
-
-def test_simple_frozen_list():
-    t = SimpleFrozenList(["foo", "bar"])
-    assert t == ["foo", "bar"]
-    assert t.index("bar") == 1  # okay method
-    with pytest.raises(NotImplementedError):
-        t.append("baz")
-    with pytest.raises(NotImplementedError):
-        t.sort()
-    with pytest.raises(NotImplementedError):
-        t.extend(["baz"])
-    with pytest.raises(NotImplementedError):
-        t.pop()
-    t = SimpleFrozenList(["foo", "bar"], error="Error!")
-    with pytest.raises(NotImplementedError):
-        t.append("baz")
-
-
-def test_resolve_dot_names():
-    config = {
-        "training": {"optimizer": {"@optimizers": "Adam.v1"}},
-        "foo": {"bar": "training.optimizer", "baz": "training.xyz"},
-    }
-    result = util.resolve_dot_names(config, ["training.optimizer"])
-    assert isinstance(result[0], Optimizer)
-    with pytest.raises(ConfigValidationError) as e:
-        util.resolve_dot_names(config, ["training.xyz", "training.optimizer"])
-    errors = e.value.errors
-    assert len(errors) == 1
-    assert errors[0]["loc"] == ["training", "xyz"]