From 75d90193434ce13ff22b4210bfa063f9c3096936 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 7 Jan 2021 06:39:27 +0100 Subject: [PATCH] Fix types of Tok2Vec encoding architectures (#6442) * fix TorchBiLSTMEncoder documentation * ensure the types of the encoding Tok2vec layers are correct * update references from v1 to v2 for the new architectures --- spacy/cli/templates/quickstart_training.jinja | 4 +- spacy/ml/models/tok2vec.py | 85 ++++++++++++++++++- spacy/pipeline/morphologizer.pyx | 4 +- spacy/pipeline/textcat.py | 4 +- spacy/tests/pipeline/test_tok2vec.py | 4 +- website/docs/api/architectures.md | 34 ++++---- website/docs/usage/embeddings-transformers.md | 8 +- website/docs/usage/layers-architectures.md | 10 +-- 8 files changed, 116 insertions(+), 37 deletions(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 71ba5f6bc..ab1d69894 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -205,7 +205,7 @@ no_output_layer = false factory = "tok2vec" [components.tok2vec.model] -@architectures = "spacy.Tok2Vec.v1" +@architectures = "spacy.Tok2Vec.v2" [components.tok2vec.model.embed] @architectures = "spacy.MultiHashEmbed.v1" @@ -220,7 +220,7 @@ rows = [5000, 2500] include_static_vectors = {{ "true" if optimize == "accuracy" else "false" }} [components.tok2vec.model.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" +@architectures = "spacy.MaxoutWindowEncoder.v2" width = {{ 96 if optimize == "efficiency" else 256 }} depth = {{ 4 if optimize == "efficiency" else 8 }} window_size = 1 diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 0f727d85f..2bb420004 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -87,6 +87,7 @@ def build_hash_embed_cnn_tok2vec( ) +# TODO: archive @registry.architectures.register("spacy.Tok2Vec.v1") def build_Tok2Vec_model( embed: Model[List[Doc], List[Floats2d]], @@ -108,6 +109,28 @@ def build_Tok2Vec_model( return tok2vec + +@registry.architectures.register("spacy.Tok2Vec.v2") +def build_Tok2Vec_model( + embed: Model[List[Doc], List[Floats2d]], + encode: Model[List[Floats2d], List[Floats2d]], +) -> Model[List[Doc], List[Floats2d]]: + """Construct a tok2vec model out of embedding and encoding subnetworks. + See https://explosion.ai/blog/deep-learning-formula-nlp + + embed (Model[List[Doc], List[Floats2d]]): Embed tokens into context-independent + word vector representations. + encode (Model[List[Floats2d], List[Floats2d]]): Encode context into the + embeddings, using an architecture such as a CNN, BiLSTM or transformer. 
+ """ + tok2vec = chain(embed, encode) + tok2vec.set_dim("nO", encode.get_dim("nO")) + tok2vec.set_ref("embed", embed) + tok2vec.set_ref("encode", encode) + return tok2vec + + + @registry.architectures.register("spacy.MultiHashEmbed.v1") def MultiHashEmbed( width: int, @@ -255,6 +278,7 @@ def CharacterEmbed( return model +# TODO: archive @registry.architectures.register("spacy.MaxoutWindowEncoder.v1") def MaxoutWindowEncoder( width: int, window_size: int, maxout_pieces: int, depth: int @@ -286,7 +310,39 @@ def MaxoutWindowEncoder( model.attrs["receptive_field"] = window_size * depth return model +@registry.architectures.register("spacy.MaxoutWindowEncoder.v2") +def MaxoutWindowEncoder( + width: int, window_size: int, maxout_pieces: int, depth: int +) -> Model[List[Floats2d], List[Floats2d]]: + """Encode context using convolutions with maxout activation, layer + normalization and residual connections. + width (int): The input and output width. These are required to be the same, + to allow residual connections. This value will be determined by the + width of the inputs. Recommended values are between 64 and 300. + window_size (int): The number of words to concatenate around each token + to construct the convolution. Recommended value is 1. + maxout_pieces (int): The number of maxout pieces to use. Recommended + values are 2 or 3. + depth (int): The number of convolutional layers. Recommended value is 4. + """ + cnn = chain( + expand_window(window_size=window_size), + Maxout( + nO=width, + nI=width * ((window_size * 2) + 1), + nP=maxout_pieces, + dropout=0.0, + normalize=True, + ), + ) + model = clone(residual(cnn), depth) + model.set_dim("nO", width) + receptive_field = window_size * depth + return with_array(model, pad=receptive_field) + + +# TODO: archive @registry.architectures.register("spacy.MishWindowEncoder.v1") def MishWindowEncoder( width: int, window_size: int, depth: int @@ -310,6 +366,29 @@ def MishWindowEncoder( return model +@registry.architectures.register("spacy.MishWindowEncoder.v2") +def MishWindowEncoder( + width: int, window_size: int, depth: int +) -> Model[List[Floats2d], List[Floats2d]]: + """Encode context using convolutions with mish activation, layer + normalization and residual connections. + + width (int): The input and output width. These are required to be the same, + to allow residual connections. This value will be determined by the + width of the inputs. Recommended values are between 64 and 300. + window_size (int): The number of words to concatenate around each token + to construct the convolution. Recommended value is 1. + depth (int): The number of convolutional layers. Recommended value is 4. + """ + cnn = chain( + expand_window(window_size=window_size), + Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True), + ) + model = clone(residual(cnn), depth) + model.set_dim("nO", width) + return with_array(model) + + @registry.architectures.register("spacy.TorchBiLSTMEncoder.v1") def BiLSTMEncoder( width: int, depth: int, dropout: float @@ -319,9 +398,9 @@ def BiLSTMEncoder( width (int): The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between 64 and 300. - window_size (int): The number of words to concatenate around each token - to construct the convolution. Recommended value is 1. - depth (int): The number of convolutional layers. Recommended value is 4. + depth (int): The number of recurrent layers. 
+ dropout (float): Creates a Dropout layer on the outputs of each LSTM layer + except the last layer. Set to 0 to disable this functionality. """ if depth == 0: return noop() diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 66e0787ef..e34b2e750 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -24,7 +24,7 @@ default_model_config = """ @architectures = "spacy.Tagger.v1" [model.tok2vec] -@architectures = "spacy.Tok2Vec.v1" +@architectures = "spacy.Tok2Vec.v2" [model.tok2vec.embed] @architectures = "spacy.CharacterEmbed.v1" @@ -35,7 +35,7 @@ nC = 8 include_static_vectors = false [model.tok2vec.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" +@architectures = "spacy.MaxoutWindowEncoder.v2" width = 128 depth = 4 window_size = 1 diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 8a5d6938e..456a8bb38 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -19,7 +19,7 @@ single_label_default_config = """ @architectures = "spacy.TextCatEnsemble.v2" [model.tok2vec] -@architectures = "spacy.Tok2Vec.v1" +@architectures = "spacy.Tok2Vec.v2" [model.tok2vec.embed] @architectures = "spacy.MultiHashEmbed.v1" @@ -29,7 +29,7 @@ attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] include_static_vectors = false [model.tok2vec.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" +@architectures = "spacy.MaxoutWindowEncoder.v2" width = ${model.tok2vec.embed.width} window_size = 1 maxout_pieces = 3 diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index ec4ed17dd..90052a9c8 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -113,7 +113,7 @@ cfg_string = """ factory = "tok2vec" [components.tok2vec.model] - @architectures = "spacy.Tok2Vec.v1" + @architectures = "spacy.Tok2Vec.v2" [components.tok2vec.model.embed] @architectures = "spacy.MultiHashEmbed.v1" @@ -123,7 +123,7 @@ cfg_string = """ include_static_vectors = false [components.tok2vec.model.encode] - @architectures = "spacy.MaxoutWindowEncoder.v1" + @architectures = "spacy.MaxoutWindowEncoder.v2" width = 96 depth = 4 window_size = 1 diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 1ab10f410..d8f0ce022 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -26,20 +26,20 @@ usage documentation on ## Tok2Vec architectures {#tok2vec-arch source="spacy/ml/models/tok2vec.py"} -### spacy.Tok2Vec.v1 {#Tok2Vec} +### spacy.Tok2Vec.v2 {#Tok2Vec} > #### Example config > > ```ini > [model] -> @architectures = "spacy.Tok2Vec.v1" +> @architectures = "spacy.Tok2Vec.v2" > > [model.embed] > @architectures = "spacy.CharacterEmbed.v1" > # ... > > [model.encode] -> @architectures = "spacy.MaxoutWindowEncoder.v1" +> @architectures = "spacy.MaxoutWindowEncoder.v2" > # ... > ``` @@ -197,13 +197,13 @@ network to construct a single vector to represent the information. | `nC` | The number of UTF-8 bytes to embed per word. Recommended values are between `3` and `8`, although it may depend on the length of words in the language. ~~int~~ | | **CREATES** | The model using the architecture. 
~~Model[List[Doc], List[Floats2d]]~~ | -### spacy.MaxoutWindowEncoder.v1 {#MaxoutWindowEncoder} +### spacy.MaxoutWindowEncoder.v2 {#MaxoutWindowEncoder} > #### Example config > > ```ini > [model] -> @architectures = "spacy.MaxoutWindowEncoder.v1" +> @architectures = "spacy.MaxoutWindowEncoder.v2" > width = 128 > window_size = 1 > maxout_pieces = 3 @@ -221,13 +221,13 @@ and residual connections. | `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ | | **CREATES** | The model using the architecture. ~~Model[List[Floats2d], List[Floats2d]]~~ | -### spacy.MishWindowEncoder.v1 {#MishWindowEncoder} +### spacy.MishWindowEncoder.v2 {#MishWindowEncoder} > #### Example config > > ```ini > [model] -> @architectures = "spacy.MishWindowEncoder.v1" +> @architectures = "spacy.MishWindowEncoder.v2" > width = 64 > window_size = 1 > depth = 4 @@ -252,19 +252,19 @@ and residual connections. > [model] > @architectures = "spacy.TorchBiLSTMEncoder.v1" > width = 64 -> window_size = 1 -> depth = 4 +> depth = 2 +> dropout = 0.0 > ``` Encode context using bidirectional LSTM layers. Requires [PyTorch](https://pytorch.org). -| Name | Description | -| ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `width` | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. ~~int~~ | -| `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. ~~int~~ | -| `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Floats2d], List[Floats2d]]~~ | +| Name | Description | +| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `width` | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. ~~int~~ | +| `depth` | The number of recurrent layers, for instance `depth=2` results in stacking two LSTMs together. ~~int~~ | +| `dropout` | Creates a Dropout layer on the outputs of each LSTM layer except the last layer. Set to 0.0 to disable this functionality. ~~float~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Floats2d], List[Floats2d]]~~ | ### spacy.StaticVectors.v1 {#StaticVectors} @@ -600,7 +600,7 @@ specific data and challenge. > no_output_layer = false > > [model.tok2vec] -> @architectures = "spacy.Tok2Vec.v1" +> @architectures = "spacy.Tok2Vec.v2" > > [model.tok2vec.embed] > @architectures = "spacy.MultiHashEmbed.v1" @@ -610,7 +610,7 @@ specific data and challenge. 
> include_static_vectors = false > > [model.tok2vec.encode] -> @architectures = "spacy.MaxoutWindowEncoder.v1" +> @architectures = "spacy.MaxoutWindowEncoder.v2" > width = ${model.tok2vec.embed.width} > window_size = 1 > maxout_pieces = 3 diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index a7318e4ef..7e47ac9d2 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -129,13 +129,13 @@ the entity recognizer, use a factory = "tok2vec" [components.tok2vec.model] -@architectures = "spacy.Tok2Vec.v1" +@architectures = "spacy.Tok2Vec.v2" [components.tok2vec.model.embed] @architectures = "spacy.MultiHashEmbed.v1" [components.tok2vec.model.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" +@architectures = "spacy.MaxoutWindowEncoder.v2" [components.ner] factory = "ner" @@ -161,13 +161,13 @@ factory = "ner" @architectures = "spacy.TransitionBasedParser.v1" [components.ner.model.tok2vec] -@architectures = "spacy.Tok2Vec.v1" +@architectures = "spacy.Tok2Vec.v2" [components.ner.model.tok2vec.embed] @architectures = "spacy.MultiHashEmbed.v1" [components.ner.model.tok2vec.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" +@architectures = "spacy.MaxoutWindowEncoder.v2" ``` diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index 72669b5e8..d0a2ac819 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -134,7 +134,7 @@ labels = [] nO = null [components.textcat.model.tok2vec] -@architectures = "spacy.Tok2Vec.v1" +@architectures = "spacy.Tok2Vec.v2" [components.textcat.model.tok2vec.embed] @architectures = "spacy.MultiHashEmbed.v1" @@ -144,7 +144,7 @@ attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] include_static_vectors = false [components.textcat.model.tok2vec.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" +@architectures = "spacy.MaxoutWindowEncoder.v2" width = ${components.textcat.model.tok2vec.embed.width} window_size = 1 maxout_pieces = 3 @@ -201,14 +201,14 @@ tokens, and their combination forms a typical factory = "tok2vec" [components.tok2vec.model] -@architectures = "spacy.Tok2Vec.v1" +@architectures = "spacy.Tok2Vec.v2" [components.tok2vec.model.embed] @architectures = "spacy.MultiHashEmbed.v1" # ... [components.tok2vec.model.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" +@architectures = "spacy.MaxoutWindowEncoder.v2" # ... ``` @@ -224,7 +224,7 @@ architecture: # ... [components.tok2vec.model.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" +@architectures = "spacy.MaxoutWindowEncoder.v2" # ... ```
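For reference, a minimal sketch (not part of the patch) of how the renamed v2 architectures compose when built directly in Python rather than through the config system. It assumes spaCy 3.x with this change applied; the hyperparameter values are copied from the textcat default config in this diff.

```python
from spacy import registry

# Look up the factories registered above by their registry names.
build_tok2vec = registry.architectures.get("spacy.Tok2Vec.v2")
build_embed = registry.architectures.get("spacy.MultiHashEmbed.v1")
build_encode = registry.architectures.get("spacy.MaxoutWindowEncoder.v2")

width = 64
embed = build_embed(
    width=width,
    rows=[2000, 2000, 1000, 1000, 1000, 1000],
    attrs=["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"],
    include_static_vectors=False,
)
encode = build_encode(width=width, window_size=1, maxout_pieces=3, depth=4)

# chain(embed, encode); the output width ("nO") is copied from the encoder,
# exactly as build_Tok2Vec_model does above.
tok2vec = build_tok2vec(embed, encode)
assert tok2vec.get_dim("nO") == width
```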
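The substance of the type fix is that the v2 encoders are wrapped with `with_array` internally and are therefore genuinely `Model[List[Floats2d], List[Floats2d]]`: one 2d array in and one out per `Doc`, with shapes preserved. An illustrative smoke test, under the same assumptions as the sketch above:

```python
import numpy
from spacy import registry

# MishWindowEncoder.v2 is used here; MaxoutWindowEncoder.v2 behaves the same way.
encode = registry.architectures.get("spacy.MishWindowEncoder.v2")(
    width=64, window_size=1, depth=4
)
encode.initialize()

# One float32 array per document, shaped (n_tokens, width).
docs = [
    numpy.zeros((7, 64), dtype="f"),
    numpy.zeros((3, 64), dtype="f"),
]
encoded = encode.predict(docs)
assert [y.shape for y in encoded] == [(7, 64), (3, 64)]
```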
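Finally, per the corrected `TorchBiLSTMEncoder.v1` documentation, the layer is configured with `depth` and `dropout` (there is no `window_size` parameter). A hypothetical construction, assuming PyTorch is installed; the values mirror the fixed example config above:

```python
from spacy import registry

build_bilstm = registry.architectures.get("spacy.TorchBiLSTMEncoder.v1")

# Two stacked bidirectional LSTM layers with no inter-layer dropout.
bilstm = build_bilstm(width=64, depth=2, dropout=0.0)

# As the implementation shows, depth=0 short-circuits to a no-op layer.
passthrough = build_bilstm(width=64, depth=0, dropout=0.0)
```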