Fix types of Tok2Vec encoding architectures (#6442)

* fix TorchBiLSTMEncoder documentation

* ensure the types of the encoding Tok2vec layers are correct

* update references from v1 to v2 for the new architectures
This commit is contained in:
Sofie Van Landeghem 2021-01-07 06:39:27 +01:00 committed by GitHub
parent 8c1a23209f
commit 75d9019343
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 116 additions and 37 deletions

View File

@ -205,7 +205,7 @@ no_output_layer = false
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"
@architectures = "spacy.Tok2Vec.v2"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
@ -220,7 +220,7 @@ rows = [5000, 2500]
include_static_vectors = {{ "true" if optimize == "accuracy" else "false" }}
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = {{ 96 if optimize == "efficiency" else 256 }}
depth = {{ 4 if optimize == "efficiency" else 8 }}
window_size = 1

View File

@ -87,6 +87,7 @@ def build_hash_embed_cnn_tok2vec(
)
# TODO: archive
@registry.architectures.register("spacy.Tok2Vec.v1")
def build_Tok2Vec_model(
embed: Model[List[Doc], List[Floats2d]],
@ -108,6 +109,28 @@ def build_Tok2Vec_model(
return tok2vec
@registry.architectures.register("spacy.Tok2Vec.v2")
def build_Tok2Vec_model(
    embed: Model[List[Doc], List[Floats2d]],
    encode: Model[List[Floats2d], List[Floats2d]],
) -> Model[List[Doc], List[Floats2d]]:
    """Build a tok2vec model by chaining an embedding and an encoding subnetwork.
    See https://explosion.ai/blog/deep-learning-formula-nlp

    embed (Model[List[Doc], List[Floats2d]]): Maps tokens to
        context-independent word vector representations.
    encode (Model[List[Floats2d], List[Floats2d]]): Mixes context into the
        embeddings, e.g. via a CNN, BiLSTM or transformer.
    """
    model = chain(embed, encode)
    # The combined model's output width is determined by the encoder.
    model.set_dim("nO", encode.get_dim("nO"))
    # Keep named references so downstream code can access the sublayers.
    model.set_ref("embed", embed)
    model.set_ref("encode", encode)
    return model
@registry.architectures.register("spacy.MultiHashEmbed.v1")
def MultiHashEmbed(
width: int,
@ -255,6 +278,7 @@ def CharacterEmbed(
return model
# TODO: archive
@registry.architectures.register("spacy.MaxoutWindowEncoder.v1")
def MaxoutWindowEncoder(
width: int, window_size: int, maxout_pieces: int, depth: int
@ -286,7 +310,39 @@ def MaxoutWindowEncoder(
model.attrs["receptive_field"] = window_size * depth
return model
@registry.architectures.register("spacy.MaxoutWindowEncoder.v2")
def MaxoutWindowEncoder(
    width: int, window_size: int, maxout_pieces: int, depth: int
) -> Model[List[Floats2d], List[Floats2d]]:
    """Encode context using convolutions with maxout activation, layer
    normalization and residual connections.

    width (int): The input and output width. These are required to be the same,
        to allow residual connections. This value will be determined by the
        width of the inputs. Recommended values are between 64 and 300.
    window_size (int): The number of words to concatenate around each token
        to construct the convolution. Recommended value is 1.
    maxout_pieces (int): The number of maxout pieces to use. Recommended
        values are 2 or 3.
    depth (int): The number of convolutional layers. Recommended value is 4.
    """
    # Each token is concatenated with window_size neighbours on both sides,
    # so the maxout layer's input width is (2 * window_size + 1) * width.
    concatenated_width = width * ((window_size * 2) + 1)
    convolution = chain(
        expand_window(window_size=window_size),
        Maxout(
            nO=width,
            nI=concatenated_width,
            nP=maxout_pieces,
            dropout=0.0,
            normalize=True,
        ),
    )
    stacked = clone(residual(convolution), depth)
    stacked.set_dim("nO", width)
    # With depth layers each seeing window_size tokens per side, the stack's
    # receptive field is window_size * depth; pad sequences accordingly.
    return with_array(stacked, pad=window_size * depth)
# TODO: archive
@registry.architectures.register("spacy.MishWindowEncoder.v1")
def MishWindowEncoder(
width: int, window_size: int, depth: int
@ -310,6 +366,29 @@ def MishWindowEncoder(
return model
@registry.architectures.register("spacy.MishWindowEncoder.v2")
def MishWindowEncoder(
    width: int, window_size: int, depth: int
) -> Model[List[Floats2d], List[Floats2d]]:
    """Encode context using convolutions with mish activation, layer
    normalization and residual connections.

    width (int): The input and output width. These are required to be the same,
        to allow residual connections. This value will be determined by the
        width of the inputs. Recommended values are between 64 and 300.
    window_size (int): The number of words to concatenate around each token
        to construct the convolution. Recommended value is 1.
    depth (int): The number of convolutional layers. Recommended value is 4.
    """
    # The Mish layer consumes the window-concatenated representation:
    # (2 * window_size + 1) tokens of `width` dimensions each.
    concatenated_width = width * ((window_size * 2) + 1)
    mish_layer = Mish(nO=width, nI=concatenated_width, dropout=0.0, normalize=True)
    convolution = chain(expand_window(window_size=window_size), mish_layer)
    stacked = clone(residual(convolution), depth)
    stacked.set_dim("nO", width)
    return with_array(stacked)
@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
def BiLSTMEncoder(
width: int, depth: int, dropout: float
@ -319,9 +398,9 @@ def BiLSTMEncoder(
width (int): The input and output width. These are required to be the same,
to allow residual connections. This value will be determined by the
width of the inputs. Recommended values are between 64 and 300.
window_size (int): The number of words to concatenate around each token
to construct the convolution. Recommended value is 1.
depth (int): The number of convolutional layers. Recommended value is 4.
depth (int): The number of recurrent layers.
dropout (float): Creates a Dropout layer on the outputs of each LSTM layer
except the last layer. Set to 0 to disable this functionality.
"""
if depth == 0:
return noop()

View File

@ -24,7 +24,7 @@ default_model_config = """
@architectures = "spacy.Tagger.v1"
[model.tok2vec]
@architectures = "spacy.Tok2Vec.v1"
@architectures = "spacy.Tok2Vec.v2"
[model.tok2vec.embed]
@architectures = "spacy.CharacterEmbed.v1"
@ -35,7 +35,7 @@ nC = 8
include_static_vectors = false
[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 128
depth = 4
window_size = 1

View File

@ -19,7 +19,7 @@ single_label_default_config = """
@architectures = "spacy.TextCatEnsemble.v2"
[model.tok2vec]
@architectures = "spacy.Tok2Vec.v1"
@architectures = "spacy.Tok2Vec.v2"
[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
@ -29,7 +29,7 @@ attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
include_static_vectors = false
[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3

View File

@ -113,7 +113,7 @@ cfg_string = """
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"
@architectures = "spacy.Tok2Vec.v2"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
@ -123,7 +123,7 @@ cfg_string = """
include_static_vectors = false
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1

View File

@ -26,20 +26,20 @@ usage documentation on
## Tok2Vec architectures {#tok2vec-arch source="spacy/ml/models/tok2vec.py"}
### spacy.Tok2Vec.v1 {#Tok2Vec}
### spacy.Tok2Vec.v2 {#Tok2Vec}
> #### Example config
>
> ```ini
> [model]
> @architectures = "spacy.Tok2Vec.v1"
> @architectures = "spacy.Tok2Vec.v2"
>
> [model.embed]
> @architectures = "spacy.CharacterEmbed.v1"
> # ...
>
> [model.encode]
> @architectures = "spacy.MaxoutWindowEncoder.v1"
> @architectures = "spacy.MaxoutWindowEncoder.v2"
> # ...
> ```
@ -197,13 +197,13 @@ network to construct a single vector to represent the information.
| `nC` | The number of UTF-8 bytes to embed per word. Recommended values are between `3` and `8`, although it may depend on the length of words in the language. ~~int~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
### spacy.MaxoutWindowEncoder.v1 {#MaxoutWindowEncoder}
### spacy.MaxoutWindowEncoder.v2 {#MaxoutWindowEncoder}
> #### Example config
>
> ```ini
> [model]
> @architectures = "spacy.MaxoutWindowEncoder.v1"
> @architectures = "spacy.MaxoutWindowEncoder.v2"
> width = 128
> window_size = 1
> maxout_pieces = 3
@ -221,13 +221,13 @@ and residual connections.
| `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Floats2d], List[Floats2d]]~~ |
### spacy.MishWindowEncoder.v1 {#MishWindowEncoder}
### spacy.MishWindowEncoder.v2 {#MishWindowEncoder}
> #### Example config
>
> ```ini
> [model]
> @architectures = "spacy.MishWindowEncoder.v1"
> @architectures = "spacy.MishWindowEncoder.v2"
> width = 64
> window_size = 1
> depth = 4
@ -252,19 +252,19 @@ and residual connections.
> [model]
> @architectures = "spacy.TorchBiLSTMEncoder.v1"
> width = 64
> window_size = 1
> depth = 4
> depth = 2
> dropout = 0.0
> ```
Encode context using bidirectional LSTM layers. Requires
[PyTorch](https://pytorch.org).
| Name | Description |
| ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `width` | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. ~~int~~ |
| `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. ~~int~~ |
| `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Floats2d], List[Floats2d]]~~ |
| Name | Description |
| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `width` | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. ~~int~~ |
| `depth` | The number of recurrent layers, for instance `depth=2` results in stacking two LSTMs together. ~~int~~ |
| `dropout` | Creates a Dropout layer on the outputs of each LSTM layer except the last layer. Set to 0.0 to disable this functionality. ~~float~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Floats2d], List[Floats2d]]~~ |
### spacy.StaticVectors.v1 {#StaticVectors}
@ -600,7 +600,7 @@ specific data and challenge.
> no_output_layer = false
>
> [model.tok2vec]
> @architectures = "spacy.Tok2Vec.v1"
> @architectures = "spacy.Tok2Vec.v2"
>
> [model.tok2vec.embed]
> @architectures = "spacy.MultiHashEmbed.v1"
@ -610,7 +610,7 @@ specific data and challenge.
> include_static_vectors = false
>
> [model.tok2vec.encode]
> @architectures = "spacy.MaxoutWindowEncoder.v1"
> @architectures = "spacy.MaxoutWindowEncoder.v2"
> width = ${model.tok2vec.embed.width}
> window_size = 1
> maxout_pieces = 3

View File

@ -129,13 +129,13 @@ the entity recognizer, use a
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"
@architectures = "spacy.Tok2Vec.v2"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
@architectures = "spacy.MaxoutWindowEncoder.v2"
[components.ner]
factory = "ner"
@ -161,13 +161,13 @@ factory = "ner"
@architectures = "spacy.TransitionBasedParser.v1"
[components.ner.model.tok2vec]
@architectures = "spacy.Tok2Vec.v1"
@architectures = "spacy.Tok2Vec.v2"
[components.ner.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
[components.ner.model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
@architectures = "spacy.MaxoutWindowEncoder.v2"
```
<!-- TODO: Once rehearsal is tested, mention it here. -->

View File

@ -134,7 +134,7 @@ labels = []
nO = null
[components.textcat.model.tok2vec]
@architectures = "spacy.Tok2Vec.v1"
@architectures = "spacy.Tok2Vec.v2"
[components.textcat.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
@ -144,7 +144,7 @@ attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
include_static_vectors = false
[components.textcat.model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = ${components.textcat.model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
@ -201,14 +201,14 @@ tokens, and their combination forms a typical
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"
@architectures = "spacy.Tok2Vec.v2"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
# ...
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
@architectures = "spacy.MaxoutWindowEncoder.v2"
# ...
```
@ -224,7 +224,7 @@ architecture:
# ...
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
@architectures = "spacy.MaxoutWindowEncoder.v2"
# ...
```