Replace negative rows with 0 in StaticVectors (#7674)

* Replace negative rows with 0 in StaticVectors

Replace negative row indices with 0-vectors in `StaticVectors`.

* Increase versions related to StaticVectors

* Increase versions of all architectures and layers related to
`StaticVectors`
* Improve efficiency of 0-vector operations

Parallel `spacy-legacy` PR: https://github.com/explosion/spacy-legacy/pull/5

* Update config defaults to new versions

* Update docs
This commit is contained in:
Adriane Boyd 2021-04-22 10:04:15 +02:00 committed by GitHub
parent 6f565cf39d
commit d2bdaa7823
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 106 additions and 69 deletions

View File

@ -1,5 +1,5 @@
# Our libraries # Our libraries
spacy-legacy>=3.0.2,<3.1.0 spacy-legacy>=3.0.3,<3.1.0
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0
thinc>=8.0.2,<8.1.0 thinc>=8.0.2,<8.1.0

View File

@ -37,7 +37,7 @@ setup_requires =
thinc>=8.0.2,<8.1.0 thinc>=8.0.2,<8.1.0
install_requires = install_requires =
# Our libraries # Our libraries
spacy-legacy>=3.0.2,<3.1.0 spacy-legacy>=3.0.3,<3.1.0
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0

View File

@ -206,7 +206,7 @@ factory = "tok2vec"
@architectures = "spacy.Tok2Vec.v2" @architectures = "spacy.Tok2Vec.v2"
[components.tok2vec.model.embed] [components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1" @architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width} width = ${components.tok2vec.model.encode.width}
{% if has_letters -%} {% if has_letters -%}
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]

View File

@ -31,7 +31,7 @@ def get_tok2vec_width(model: Model):
return nO return nO
@registry.architectures("spacy.HashEmbedCNN.v1") @registry.architectures("spacy.HashEmbedCNN.v2")
def build_hash_embed_cnn_tok2vec( def build_hash_embed_cnn_tok2vec(
*, *,
width: int, width: int,
@ -108,7 +108,7 @@ def build_Tok2Vec_model(
return tok2vec return tok2vec
@registry.architectures("spacy.MultiHashEmbed.v1") @registry.architectures("spacy.MultiHashEmbed.v2")
def MultiHashEmbed( def MultiHashEmbed(
width: int, width: int,
attrs: List[Union[str, int]], attrs: List[Union[str, int]],
@ -182,7 +182,7 @@ def MultiHashEmbed(
return model return model
@registry.architectures("spacy.CharacterEmbed.v1") @registry.architectures("spacy.CharacterEmbed.v2")
def CharacterEmbed( def CharacterEmbed(
width: int, width: int,
rows: int, rows: int,

View File

@ -8,7 +8,7 @@ from ..tokens import Doc
from ..errors import Errors from ..errors import Errors
@registry.layers("spacy.StaticVectors.v1") @registry.layers("spacy.StaticVectors.v2")
def StaticVectors( def StaticVectors(
nO: Optional[int] = None, nO: Optional[int] = None,
nM: Optional[int] = None, nM: Optional[int] = None,
@ -46,6 +46,8 @@ def forward(
vectors_data = model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True) vectors_data = model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True)
except ValueError: except ValueError:
raise RuntimeError(Errors.E896) raise RuntimeError(Errors.E896)
# Convert negative indices to 0-vectors (TODO: more options for UNK tokens)
vectors_data[rows < 0] = 0
output = Ragged( output = Ragged(
vectors_data, model.ops.asarray([len(doc) for doc in docs], dtype="i") vectors_data, model.ops.asarray([len(doc) for doc in docs], dtype="i")
) )

View File

@ -24,7 +24,7 @@ maxout_pieces = 2
use_upper = true use_upper = true
[model.tok2vec] [model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1" @architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null pretrained_vectors = null
width = 96 width = 96
depth = 4 depth = 4

View File

@ -26,7 +26,7 @@ default_model_config = """
@architectures = "spacy.EntityLinker.v1" @architectures = "spacy.EntityLinker.v1"
[model.tok2vec] [model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1" @architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null pretrained_vectors = null
width = 96 width = 96
depth = 2 depth = 2

View File

@ -27,7 +27,7 @@ default_model_config = """
@architectures = "spacy.Tok2Vec.v2" @architectures = "spacy.Tok2Vec.v2"
[model.tok2vec.embed] [model.tok2vec.embed]
@architectures = "spacy.CharacterEmbed.v1" @architectures = "spacy.CharacterEmbed.v2"
width = 128 width = 128
rows = 7000 rows = 7000
nM = 64 nM = 64

View File

@ -22,7 +22,7 @@ maxout_pieces = 3
token_vector_width = 96 token_vector_width = 96
[model.tok2vec] [model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1" @architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null pretrained_vectors = null
width = 96 width = 96
depth = 4 depth = 4

View File

@ -21,7 +21,7 @@ maxout_pieces = 2
use_upper = true use_upper = true
[model.tok2vec] [model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1" @architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null pretrained_vectors = null
width = 96 width = 96
depth = 4 depth = 4

View File

@ -19,7 +19,7 @@ default_model_config = """
@architectures = "spacy.Tagger.v1" @architectures = "spacy.Tagger.v1"
[model.tok2vec] [model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1" @architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null pretrained_vectors = null
width = 12 width = 12
depth = 1 depth = 1

View File

@ -26,7 +26,7 @@ default_model_config = """
@architectures = "spacy.Tagger.v1" @architectures = "spacy.Tagger.v1"
[model.tok2vec] [model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1" @architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null pretrained_vectors = null
width = 96 width = 96
depth = 4 depth = 4

View File

@ -21,7 +21,7 @@ single_label_default_config = """
@architectures = "spacy.Tok2Vec.v2" @architectures = "spacy.Tok2Vec.v2"
[model.tok2vec.embed] [model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1" @architectures = "spacy.MultiHashEmbed.v2"
width = 64 width = 64
rows = [2000, 2000, 1000, 1000, 1000, 1000] rows = [2000, 2000, 1000, 1000, 1000, 1000]
attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
@ -56,7 +56,7 @@ single_label_cnn_config = """
exclusive_classes = true exclusive_classes = true
[model.tok2vec] [model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1" @architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null pretrained_vectors = null
width = 96 width = 96
depth = 4 depth = 4

View File

@ -21,7 +21,7 @@ multi_label_default_config = """
@architectures = "spacy.Tok2Vec.v1" @architectures = "spacy.Tok2Vec.v1"
[model.tok2vec.embed] [model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1" @architectures = "spacy.MultiHashEmbed.v2"
width = 64 width = 64
rows = [2000, 2000, 1000, 1000, 1000, 1000] rows = [2000, 2000, 1000, 1000, 1000, 1000]
attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
@ -56,7 +56,7 @@ multi_label_cnn_config = """
exclusive_classes = false exclusive_classes = false
[model.tok2vec] [model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1" @architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null pretrained_vectors = null
width = 96 width = 96
depth = 4 depth = 4

View File

@ -11,7 +11,7 @@ from ..errors import Errors
default_model_config = """ default_model_config = """
[model] [model]
@architectures = "spacy.HashEmbedCNN.v1" @architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null pretrained_vectors = null
width = 96 width = 96
depth = 4 depth = 4

View File

@ -35,7 +35,7 @@ usage documentation on
> @architectures = "spacy.Tok2Vec.v2" > @architectures = "spacy.Tok2Vec.v2"
> >
> [model.embed] > [model.embed]
> @architectures = "spacy.CharacterEmbed.v1" > @architectures = "spacy.CharacterEmbed.v2"
> # ... > # ...
> >
> [model.encode] > [model.encode]
@ -54,13 +54,13 @@ blog post for background.
| `encode` | Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder). ~~Model[List[Floats2d], List[Floats2d]]~~ | | `encode` | Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder). ~~Model[List[Floats2d], List[Floats2d]]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
### spacy.HashEmbedCNN.v1 {#HashEmbedCNN} ### spacy.HashEmbedCNN.v2 {#HashEmbedCNN}
> #### Example Config > #### Example Config
> >
> ```ini > ```ini
> [model] > [model]
> @architectures = "spacy.HashEmbedCNN.v1" > @architectures = "spacy.HashEmbedCNN.v2"
> pretrained_vectors = null > pretrained_vectors = null
> width = 96 > width = 96
> depth = 4 > depth = 4
@ -96,7 +96,7 @@ consisting of a CNN and a layer-normalized maxout activation function.
> factory = "tok2vec" > factory = "tok2vec"
> >
> [components.tok2vec.model] > [components.tok2vec.model]
> @architectures = "spacy.HashEmbedCNN.v1" > @architectures = "spacy.HashEmbedCNN.v2"
> width = 342 > width = 342
> >
> [components.tagger] > [components.tagger]
@ -129,13 +129,13 @@ argument that connects to the shared `tok2vec` component in the pipeline.
| `upstream` | A string to identify the "upstream" `Tok2Vec` component to communicate with. By default, the upstream name is the wildcard string `"*"`, but you could also specify the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. ~~str~~ | | `upstream` | A string to identify the "upstream" `Tok2Vec` component to communicate with. By default, the upstream name is the wildcard string `"*"`, but you could also specify the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. ~~str~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
### spacy.MultiHashEmbed.v1 {#MultiHashEmbed} ### spacy.MultiHashEmbed.v2 {#MultiHashEmbed}
> #### Example config > #### Example config
> >
> ```ini > ```ini
> [model] > [model]
> @architectures = "spacy.MultiHashEmbed.v1" > @architectures = "spacy.MultiHashEmbed.v2"
> width = 64 > width = 64
> attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] > attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
> rows = [2000, 1000, 1000, 1000] > rows = [2000, 1000, 1000, 1000]
@ -160,13 +160,13 @@ not updated).
| `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [`Doc`](/api/doc) objects' vocab. ~~bool~~ | | `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [`Doc`](/api/doc) objects' vocab. ~~bool~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
### spacy.CharacterEmbed.v1 {#CharacterEmbed} ### spacy.CharacterEmbed.v2 {#CharacterEmbed}
> #### Example config > #### Example config
> >
> ```ini > ```ini
> [model] > [model]
> @architectures = "spacy.CharacterEmbed.v1" > @architectures = "spacy.CharacterEmbed.v2"
> width = 128 > width = 128
> rows = 7000 > rows = 7000
> nM = 64 > nM = 64
@ -266,13 +266,13 @@ Encode context using bidirectional LSTM layers. Requires
| `dropout` | Creates a Dropout layer on the outputs of each LSTM layer except the last layer. Set to 0.0 to disable this functionality. ~~float~~ | | `dropout` | Creates a Dropout layer on the outputs of each LSTM layer except the last layer. Set to 0.0 to disable this functionality. ~~float~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Floats2d], List[Floats2d]]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Floats2d], List[Floats2d]]~~ |
### spacy.StaticVectors.v1 {#StaticVectors} ### spacy.StaticVectors.v2 {#StaticVectors}
> #### Example config > #### Example config
> >
> ```ini > ```ini
> [model] > [model]
> @architectures = "spacy.StaticVectors.v1" > @architectures = "spacy.StaticVectors.v2"
> nO = null > nO = null
> nM = null > nM = null
> dropout = 0.2 > dropout = 0.2
@ -283,8 +283,9 @@ Encode context using bidirectional LSTM layers. Requires
> ``` > ```
Embed [`Doc`](/api/doc) objects with their vocab's vectors table, applying a Embed [`Doc`](/api/doc) objects with their vocab's vectors table, applying a
learned linear projection to control the dimensionality. See the documentation learned linear projection to control the dimensionality. Unknown tokens are
on [static vectors](/usage/embeddings-transformers#static-vectors) for details. mapped to a zero vector. See the documentation on [static
vectors](/usage/embeddings-transformers#static-vectors) for details.
| Name |  Description | | Name |  Description |
| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@ -513,7 +514,7 @@ for a Tok2Vec layer.
> use_upper = true > use_upper = true
> >
> [model.tok2vec] > [model.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v1" > @architectures = "spacy.HashEmbedCNN.v2"
> pretrained_vectors = null > pretrained_vectors = null
> width = 96 > width = 96
> depth = 4 > depth = 4
@ -619,7 +620,7 @@ single-label use-cases where `exclusive_classes = true`, while the
> @architectures = "spacy.Tok2Vec.v2" > @architectures = "spacy.Tok2Vec.v2"
> >
> [model.tok2vec.embed] > [model.tok2vec.embed]
> @architectures = "spacy.MultiHashEmbed.v1" > @architectures = "spacy.MultiHashEmbed.v2"
> width = 64 > width = 64
> rows = [2000, 2000, 1000, 1000, 1000, 1000] > rows = [2000, 2000, 1000, 1000, 1000, 1000]
> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] > attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
@ -676,7 +677,7 @@ taking it as argument:
> nO = null > nO = null
> >
> [model.tok2vec] > [model.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v1" > @architectures = "spacy.HashEmbedCNN.v2"
> pretrained_vectors = null > pretrained_vectors = null
> width = 96 > width = 96
> depth = 4 > depth = 4
@ -744,7 +745,7 @@ into the "real world". This requires 3 main components:
> nO = null > nO = null
> >
> [model.tok2vec] > [model.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v1" > @architectures = "spacy.HashEmbedCNN.v2"
> pretrained_vectors = null > pretrained_vectors = null
> width = 96 > width = 96
> depth = 2 > depth = 2

View File

@ -29,8 +29,8 @@ recommended settings for your use case, check out the
> >
> The `@` syntax lets you refer to function names registered in the > The `@` syntax lets you refer to function names registered in the
> [function registry](/api/top-level#registry). For example, > [function registry](/api/top-level#registry). For example,
> `@architectures = "spacy.HashEmbedCNN.v1"` refers to a registered function of > `@architectures = "spacy.HashEmbedCNN.v2"` refers to a registered function of
> the name [spacy.HashEmbedCNN.v1](/api/architectures#HashEmbedCNN) and all > the name [spacy.HashEmbedCNN.v2](/api/architectures#HashEmbedCNN) and all
> other values defined in its block will be passed into that function as > other values defined in its block will be passed into that function as
> arguments. Those arguments depend on the registered function. See the usage > arguments. Those arguments depend on the registered function. See the usage
> guide on [registered functions](/usage/training#config-functions) for details. > guide on [registered functions](/usage/training#config-functions) for details.

View File

@ -5,11 +5,12 @@ source: spacy/legacy
--- ---
The [`spacy-legacy`](https://github.com/explosion/spacy-legacy) package includes The [`spacy-legacy`](https://github.com/explosion/spacy-legacy) package includes
outdated registered functions and architectures. It is installed automatically as outdated registered functions and architectures. It is installed automatically
a dependency of spaCy, and provides backwards compatibility for archived functions as a dependency of spaCy, and provides backwards compatibility for archived
that may still be used in projects. functions that may still be used in projects.
You can find the detailed documentation of each such legacy function on this page. You can find the detailed documentation of each such legacy function on this
page.
## Architectures {#architectures} ## Architectures {#architectures}
@ -44,15 +45,14 @@ blog post for background.
| Name | Description | | Name | Description |
| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `embed` | Embed tokens into context-independent word vector representations. For example, [CharacterEmbed](/api/architectures#CharacterEmbed) or [MultiHashEmbed](/api/architectures#MultiHashEmbed). ~~Model[List[Doc], List[Floats2d]]~~ | | `embed` | Embed tokens into context-independent word vector representations. For example, [CharacterEmbed](/api/architectures#CharacterEmbed) or [MultiHashEmbed](/api/architectures#MultiHashEmbed). ~~Model[List[Doc], List[Floats2d]]~~ |
| `encode` | Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder.v1](/api/legacy#MaxoutWindowEncoder_v1). ~~Model[Floats2d, Floats2d]~~ | | `encode` | Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder.v1](/api/legacy#MaxoutWindowEncoder_v1). ~~Model[Floats2d, Floats2d]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
### spacy.MaxoutWindowEncoder.v1 {#MaxoutWindowEncoder_v1} ### spacy.MaxoutWindowEncoder.v1 {#MaxoutWindowEncoder_v1}
The `spacy.MaxoutWindowEncoder.v1` architecture was producing a model of type The `spacy.MaxoutWindowEncoder.v1` architecture was producing a model of type
`Model[Floats2D, Floats2D]`. Since `spacy.MaxoutWindowEncoder.v2`, this has been changed to output `Model[Floats2D, Floats2D]`. Since `spacy.MaxoutWindowEncoder.v2`, this has been
type `Model[List[Floats2d], List[Floats2d]]`. changed to output type `Model[List[Floats2d], List[Floats2d]]`.
> #### Example config > #### Example config
> >
@ -79,8 +79,8 @@ and residual connections.
### spacy.MishWindowEncoder.v1 {#MishWindowEncoder_v1} ### spacy.MishWindowEncoder.v1 {#MishWindowEncoder_v1}
The `spacy.MishWindowEncoder.v1` architecture was producing a model of type The `spacy.MishWindowEncoder.v1` architecture was producing a model of type
`Model[Floats2D, Floats2D]`. Since `spacy.MishWindowEncoder.v2`, this has been changed to output `Model[Floats2D, Floats2D]`. Since `spacy.MishWindowEncoder.v2`, this has been
type `Model[List[Floats2d], List[Floats2d]]`. changed to output type `Model[List[Floats2d], List[Floats2d]]`.
> #### Example config > #### Example config
> >
@ -103,12 +103,11 @@ and residual connections.
| `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ | | `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ |
| **CREATES** | The model using the architecture. ~~Model[Floats2d, Floats2d]~~ | | **CREATES** | The model using the architecture. ~~Model[Floats2d, Floats2d]~~ |
### spacy.TextCatEnsemble.v1 {#TextCatEnsemble_v1} ### spacy.TextCatEnsemble.v1 {#TextCatEnsemble_v1}
The `spacy.TextCatEnsemble.v1` architecture built an internal `tok2vec` and `linear_model`. The `spacy.TextCatEnsemble.v1` architecture built an internal `tok2vec` and
Since `spacy.TextCatEnsemble.v2`, this has been refactored so that the `TextCatEnsemble` takes these `linear_model`. Since `spacy.TextCatEnsemble.v2`, this has been refactored so
two sublayers as input. that the `TextCatEnsemble` takes these two sublayers as input.
> #### Example Config > #### Example Config
> >
@ -142,6 +141,40 @@ network has an internal CNN Tok2Vec layer and uses attention.
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
### spacy.HashEmbedCNN.v1 {#HashEmbedCNN_v1}
Identical to [`spacy.HashEmbedCNN.v2`](/api/architectures#HashEmbedCNN) except
using [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are included.
### spacy.MultiHashEmbed.v1 {#MultiHashEmbed_v1}
Identical to [`spacy.MultiHashEmbed.v2`](/api/architectures#MultiHashEmbed)
except with [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are
included.
### spacy.CharacterEmbed.v1 {#CharacterEmbed_v1}
Identical to [`spacy.CharacterEmbed.v2`](/api/architectures#CharacterEmbed)
except using [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are
included.
## Layers {#layers}
These functions are available from `@spacy.registry.layers`.
### spacy.StaticVectors.v1 {#StaticVectors_v1}
Identical to [`spacy.StaticVectors.v2`](/api/architectures#StaticVectors) except
for the handling of tokens without vectors.
<Infobox title="Bugs for tokens without vectors" variant="warning">
`spacy.StaticVectors.v1` maps tokens without vectors to the final row in the
vectors table, which causes the model predictions to change if new vectors are
added to an existing vectors table. See more details in
[issue #7662](https://github.com/explosion/spaCy/issues/7662#issuecomment-813925655).
</Infobox>
## Loggers {#loggers} ## Loggers {#loggers}
@ -160,7 +193,8 @@ support the `log_dataset_dir` and `model_log_interval` arguments.
> project_name = "monitor_spacy_training" > project_name = "monitor_spacy_training"
> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"] > remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]
> ``` > ```
| Name | Description | >
| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------- | > | Name | Description |
| `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ | > | ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| `remove_config_values` | A list of values to exclude from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ | > | `remove_config_values` | A list of values to exclude from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ |
> | `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ |

View File

@ -132,7 +132,7 @@ factory = "tok2vec"
@architectures = "spacy.Tok2Vec.v2" @architectures = "spacy.Tok2Vec.v2"
[components.tok2vec.model.embed] [components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1" @architectures = "spacy.MultiHashEmbed.v2"
[components.tok2vec.model.encode] [components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2" @architectures = "spacy.MaxoutWindowEncoder.v2"
@ -164,7 +164,7 @@ factory = "ner"
@architectures = "spacy.Tok2Vec.v2" @architectures = "spacy.Tok2Vec.v2"
[components.ner.model.tok2vec.embed] [components.ner.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1" @architectures = "spacy.MultiHashEmbed.v2"
[components.ner.model.tok2vec.encode] [components.ner.model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2" @architectures = "spacy.MaxoutWindowEncoder.v2"
@ -541,7 +541,7 @@ word vector tables using the `include_static_vectors` flag.
```ini ```ini
[tagger.model.tok2vec.embed] [tagger.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1" @architectures = "spacy.MultiHashEmbed.v2"
width = 128 width = 128
attrs = ["LOWER","PREFIX","SUFFIX","SHAPE"] attrs = ["LOWER","PREFIX","SUFFIX","SHAPE"]
rows = [5000,2500,2500,2500] rows = [5000,2500,2500,2500]
@ -550,7 +550,7 @@ include_static_vectors = true
<Infobox title="How it works" emoji="💡"> <Infobox title="How it works" emoji="💡">
The configuration system will look up the string `"spacy.MultiHashEmbed.v1"` in The configuration system will look up the string `"spacy.MultiHashEmbed.v2"` in
the `architectures` [registry](/api/top-level#registry), and call the returned the `architectures` [registry](/api/top-level#registry), and call the returned
object with the rest of the arguments from the block. This will result in a call object with the rest of the arguments from the block. This will result in a call
to the to the

View File

@ -137,7 +137,7 @@ nO = null
@architectures = "spacy.Tok2Vec.v2" @architectures = "spacy.Tok2Vec.v2"
[components.textcat.model.tok2vec.embed] [components.textcat.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1" @architectures = "spacy.MultiHashEmbed.v2"
width = 64 width = 64
rows = [2000, 2000, 1000, 1000, 1000, 1000] rows = [2000, 2000, 1000, 1000, 1000, 1000]
attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
@ -204,7 +204,7 @@ factory = "tok2vec"
@architectures = "spacy.Tok2Vec.v2" @architectures = "spacy.Tok2Vec.v2"
[components.tok2vec.model.embed] [components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1" @architectures = "spacy.MultiHashEmbed.v2"
# ... # ...
[components.tok2vec.model.encode] [components.tok2vec.model.encode]
@ -220,7 +220,7 @@ architecture:
```ini ```ini
### config.cfg (excerpt) ### config.cfg (excerpt)
[components.tok2vec.model.embed] [components.tok2vec.model.embed]
@architectures = "spacy.CharacterEmbed.v1" @architectures = "spacy.CharacterEmbed.v2"
# ... # ...
[components.tok2vec.model.encode] [components.tok2vec.model.encode]
@ -638,7 +638,7 @@ that has the full implementation.
> @architectures = "rel_instance_tensor.v1" > @architectures = "rel_instance_tensor.v1"
> >
> [model.create_instance_tensor.tok2vec] > [model.create_instance_tensor.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v1" > @architectures = "spacy.HashEmbedCNN.v2"
> # ... > # ...
> >
> [model.create_instance_tensor.pooling] > [model.create_instance_tensor.pooling]