Replace negative rows with 0 in StaticVectors (#7674)

* Replace negative rows with 0 in StaticVectors

Replace negative row indices with 0-vectors in `StaticVectors`.

* Increase versions related to StaticVectors

* Increase versions of all architctures and layers related to
`StaticVectors`
* Improve efficiency of 0-vector operations

Parallel `spacy-legacy` PR: https://github.com/explosion/spacy-legacy/pull/5

* Update config defaults to new versions

* Update docs
This commit is contained in:
Adriane Boyd 2021-04-22 10:04:15 +02:00 committed by GitHub
parent 6f565cf39d
commit d2bdaa7823
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 106 additions and 69 deletions

View File

@ -1,5 +1,5 @@
# Our libraries
spacy-legacy>=3.0.2,<3.1.0
spacy-legacy>=3.0.3,<3.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.2,<8.1.0

View File

@ -37,7 +37,7 @@ setup_requires =
thinc>=8.0.2,<8.1.0
install_requires =
# Our libraries
spacy-legacy>=3.0.2,<3.1.0
spacy-legacy>=3.0.3,<3.1.0
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0

View File

@ -206,7 +206,7 @@ factory = "tok2vec"
@architectures = "spacy.Tok2Vec.v2"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
{% if has_letters -%}
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]

View File

@ -31,7 +31,7 @@ def get_tok2vec_width(model: Model):
return nO
@registry.architectures("spacy.HashEmbedCNN.v1")
@registry.architectures("spacy.HashEmbedCNN.v2")
def build_hash_embed_cnn_tok2vec(
*,
width: int,
@ -108,7 +108,7 @@ def build_Tok2Vec_model(
return tok2vec
@registry.architectures("spacy.MultiHashEmbed.v1")
@registry.architectures("spacy.MultiHashEmbed.v2")
def MultiHashEmbed(
width: int,
attrs: List[Union[str, int]],
@ -182,7 +182,7 @@ def MultiHashEmbed(
return model
@registry.architectures("spacy.CharacterEmbed.v1")
@registry.architectures("spacy.CharacterEmbed.v2")
def CharacterEmbed(
width: int,
rows: int,

View File

@ -8,7 +8,7 @@ from ..tokens import Doc
from ..errors import Errors
@registry.layers("spacy.StaticVectors.v1")
@registry.layers("spacy.StaticVectors.v2")
def StaticVectors(
nO: Optional[int] = None,
nM: Optional[int] = None,
@ -46,6 +46,8 @@ def forward(
vectors_data = model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True)
except ValueError:
raise RuntimeError(Errors.E896)
# Convert negative indices to 0-vectors (TODO: more options for UNK tokens)
vectors_data[rows < 0] = 0
output = Ragged(
vectors_data, model.ops.asarray([len(doc) for doc in docs], dtype="i")
)

View File

@ -24,7 +24,7 @@ maxout_pieces = 2
use_upper = true
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4

View File

@ -26,7 +26,7 @@ default_model_config = """
@architectures = "spacy.EntityLinker.v1"
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 2

View File

@ -27,7 +27,7 @@ default_model_config = """
@architectures = "spacy.Tok2Vec.v2"
[model.tok2vec.embed]
@architectures = "spacy.CharacterEmbed.v1"
@architectures = "spacy.CharacterEmbed.v2"
width = 128
rows = 7000
nM = 64

View File

@ -22,7 +22,7 @@ maxout_pieces = 3
token_vector_width = 96
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4

View File

@ -21,7 +21,7 @@ maxout_pieces = 2
use_upper = true
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4

View File

@ -19,7 +19,7 @@ default_model_config = """
@architectures = "spacy.Tagger.v1"
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 12
depth = 1

View File

@ -26,7 +26,7 @@ default_model_config = """
@architectures = "spacy.Tagger.v1"
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4

View File

@ -21,7 +21,7 @@ single_label_default_config = """
@architectures = "spacy.Tok2Vec.v2"
[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
@architectures = "spacy.MultiHashEmbed.v2"
width = 64
rows = [2000, 2000, 1000, 1000, 1000, 1000]
attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
@ -56,7 +56,7 @@ single_label_cnn_config = """
exclusive_classes = true
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4

View File

@ -21,7 +21,7 @@ multi_label_default_config = """
@architectures = "spacy.Tok2Vec.v1"
[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
@architectures = "spacy.MultiHashEmbed.v2"
width = 64
rows = [2000, 2000, 1000, 1000, 1000, 1000]
attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
@ -56,7 +56,7 @@ multi_label_cnn_config = """
exclusive_classes = false
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4

View File

@ -11,7 +11,7 @@ from ..errors import Errors
default_model_config = """
[model]
@architectures = "spacy.HashEmbedCNN.v1"
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4

View File

@ -35,7 +35,7 @@ usage documentation on
> @architectures = "spacy.Tok2Vec.v2"
>
> [model.embed]
> @architectures = "spacy.CharacterEmbed.v1"
> @architectures = "spacy.CharacterEmbed.v2"
> # ...
>
> [model.encode]
@ -54,13 +54,13 @@ blog post for background.
| `encode` | Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder). ~~Model[List[Floats2d], List[Floats2d]]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
### spacy.HashEmbedCNN.v1 {#HashEmbedCNN}
### spacy.HashEmbedCNN.v2 {#HashEmbedCNN}
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.HashEmbedCNN.v1"
> @architectures = "spacy.HashEmbedCNN.v2"
> pretrained_vectors = null
> width = 96
> depth = 4
@ -96,7 +96,7 @@ consisting of a CNN and a layer-normalized maxout activation function.
> factory = "tok2vec"
>
> [components.tok2vec.model]
> @architectures = "spacy.HashEmbedCNN.v1"
> @architectures = "spacy.HashEmbedCNN.v2"
> width = 342
>
> [components.tagger]
@ -129,13 +129,13 @@ argument that connects to the shared `tok2vec` component in the pipeline.
| `upstream` | A string to identify the "upstream" `Tok2Vec` component to communicate with. By default, the upstream name is the wildcard string `"*"`, but you could also specify the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. ~~str~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
### spacy.MultiHashEmbed.v1 {#MultiHashEmbed}
### spacy.MultiHashEmbed.v2 {#MultiHashEmbed}
> #### Example config
>
> ```ini
> [model]
> @architectures = "spacy.MultiHashEmbed.v1"
> @architectures = "spacy.MultiHashEmbed.v2"
> width = 64
> attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
> rows = [2000, 1000, 1000, 1000]
@ -160,13 +160,13 @@ not updated).
| `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [`Doc`](/api/doc) objects' vocab. ~~bool~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
### spacy.CharacterEmbed.v1 {#CharacterEmbed}
### spacy.CharacterEmbed.v2 {#CharacterEmbed}
> #### Example config
>
> ```ini
> [model]
> @architectures = "spacy.CharacterEmbed.v1"
> @architectures = "spacy.CharacterEmbed.v2"
> width = 128
> rows = 7000
> nM = 64
@ -266,13 +266,13 @@ Encode context using bidirectional LSTM layers. Requires
| `dropout` | Creates a Dropout layer on the outputs of each LSTM layer except the last layer. Set to 0.0 to disable this functionality. ~~float~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Floats2d], List[Floats2d]]~~ |
### spacy.StaticVectors.v1 {#StaticVectors}
### spacy.StaticVectors.v2 {#StaticVectors}
> #### Example config
>
> ```ini
> [model]
> @architectures = "spacy.StaticVectors.v1"
> @architectures = "spacy.StaticVectors.v2"
> nO = null
> nM = null
> dropout = 0.2
@ -283,8 +283,9 @@ Encode context using bidirectional LSTM layers. Requires
> ```
Embed [`Doc`](/api/doc) objects with their vocab's vectors table, applying a
learned linear projection to control the dimensionality. See the documentation
on [static vectors](/usage/embeddings-transformers#static-vectors) for details.
learned linear projection to control the dimensionality. Unknown tokens are
mapped to a zero vector. See the documentation on [static
vectors](/usage/embeddings-transformers#static-vectors) for details.
| Name |  Description |
| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@ -513,7 +514,7 @@ for a Tok2Vec layer.
> use_upper = true
>
> [model.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v1"
> @architectures = "spacy.HashEmbedCNN.v2"
> pretrained_vectors = null
> width = 96
> depth = 4
@ -619,7 +620,7 @@ single-label use-cases where `exclusive_classes = true`, while the
> @architectures = "spacy.Tok2Vec.v2"
>
> [model.tok2vec.embed]
> @architectures = "spacy.MultiHashEmbed.v1"
> @architectures = "spacy.MultiHashEmbed.v2"
> width = 64
> rows = [2000, 2000, 1000, 1000, 1000, 1000]
> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
@ -676,7 +677,7 @@ taking it as argument:
> nO = null
>
> [model.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v1"
> @architectures = "spacy.HashEmbedCNN.v2"
> pretrained_vectors = null
> width = 96
> depth = 4
@ -744,7 +745,7 @@ into the "real world". This requires 3 main components:
> nO = null
>
> [model.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v1"
> @architectures = "spacy.HashEmbedCNN.v2"
> pretrained_vectors = null
> width = 96
> depth = 2

View File

@ -29,8 +29,8 @@ recommended settings for your use case, check out the
>
> The `@` syntax lets you refer to function names registered in the
> [function registry](/api/top-level#registry). For example,
> `@architectures = "spacy.HashEmbedCNN.v1"` refers to a registered function of
> the name [spacy.HashEmbedCNN.v1](/api/architectures#HashEmbedCNN) and all
> `@architectures = "spacy.HashEmbedCNN.v2"` refers to a registered function of
> the name [spacy.HashEmbedCNN.v2](/api/architectures#HashEmbedCNN) and all
> other values defined in its block will be passed into that function as
> arguments. Those arguments depend on the registered function. See the usage
> guide on [registered functions](/usage/training#config-functions) for details.

View File

@ -4,12 +4,13 @@ teaser: Archived implementations available through spacy-legacy
source: spacy/legacy
---
The [`spacy-legacy`](https://github.com/explosion/spacy-legacy) package includes
outdated registered functions and architectures. It is installed automatically as
a dependency of spaCy, and provides backwards compatibility for archived functions
that may still be used in projects.
The [`spacy-legacy`](https://github.com/explosion/spacy-legacy) package includes
outdated registered functions and architectures. It is installed automatically
as a dependency of spaCy, and provides backwards compatibility for archived
functions that may still be used in projects.
You can find the detailed documentation of each such legacy function on this page.
You can find the detailed documentation of each such legacy function on this
page.
## Architectures {#architectures}
@ -17,8 +18,8 @@ These functions are available from `@spacy.registry.architectures`.
### spacy.Tok2Vec.v1 {#Tok2Vec_v1}
The `spacy.Tok2Vec.v1` architecture was expecting an `encode` model of type
`Model[Floats2D, Floats2D]` such as `spacy.MaxoutWindowEncoder.v1` or
The `spacy.Tok2Vec.v1` architecture was expecting an `encode` model of type
`Model[Floats2D, Floats2D]` such as `spacy.MaxoutWindowEncoder.v1` or
`spacy.MishWindowEncoder.v1`.
> #### Example config
@ -44,15 +45,14 @@ blog post for background.
| Name | Description |
| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `embed` | Embed tokens into context-independent word vector representations. For example, [CharacterEmbed](/api/architectures#CharacterEmbed) or [MultiHashEmbed](/api/architectures#MultiHashEmbed). ~~Model[List[Doc], List[Floats2d]]~~ |
| `encode` | Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder.v1](/api/legacy#MaxoutWindowEncoder_v1). ~~Model[Floats2d, Floats2d]~~ |
| `encode` | Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder.v1](/api/legacy#MaxoutWindowEncoder_v1). ~~Model[Floats2d, Floats2d]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
### spacy.MaxoutWindowEncoder.v1 {#MaxoutWindowEncoder_v1}
The `spacy.MaxoutWindowEncoder.v1` architecture was producing a model of type
`Model[Floats2D, Floats2D]`. Since `spacy.MaxoutWindowEncoder.v2`, this has been changed to output
type `Model[List[Floats2d], List[Floats2d]]`.
The `spacy.MaxoutWindowEncoder.v1` architecture was producing a model of type
`Model[Floats2D, Floats2D]`. Since `spacy.MaxoutWindowEncoder.v2`, this has been
changed to output type `Model[List[Floats2d], List[Floats2d]]`.
> #### Example config
>
@ -78,9 +78,9 @@ and residual connections.
### spacy.MishWindowEncoder.v1 {#MishWindowEncoder_v1}
The `spacy.MishWindowEncoder.v1` architecture was producing a model of type
`Model[Floats2D, Floats2D]`. Since `spacy.MishWindowEncoder.v2`, this has been changed to output
type `Model[List[Floats2d], List[Floats2d]]`.
The `spacy.MishWindowEncoder.v1` architecture was producing a model of type
`Model[Floats2D, Floats2D]`. Since `spacy.MishWindowEncoder.v2`, this has been
changed to output type `Model[List[Floats2d], List[Floats2d]]`.
> #### Example config
>
@ -103,12 +103,11 @@ and residual connections.
| `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ |
| **CREATES** | The model using the architecture. ~~Model[Floats2d, Floats2d]~~ |
### spacy.TextCatEnsemble.v1 {#TextCatEnsemble_v1}
The `spacy.TextCatEnsemble.v1` architecture built an internal `tok2vec` and `linear_model`.
Since `spacy.TextCatEnsemble.v2`, this has been refactored so that the `TextCatEnsemble` takes these
two sublayers as input.
The `spacy.TextCatEnsemble.v1` architecture built an internal `tok2vec` and
`linear_model`. Since `spacy.TextCatEnsemble.v2`, this has been refactored so
that the `TextCatEnsemble` takes these two sublayers as input.
> #### Example Config
>
@ -142,6 +141,40 @@ network has an internal CNN Tok2Vec layer and uses attention.
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
### spacy.HashEmbedCNN.v1 {#HashEmbedCNN_v1}
Identical to [`spacy.HashEmbedCNN.v2`](/api/architectures#HashEmbedCNN) except
using [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are included.
### spacy.MultiHashEmbed.v1 {#MultiHashEmbed_v1}
Identical to [`spacy.MultiHashEmbed.v2`](/api/architectures#MultiHashEmbed)
except with [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are
included.
### spacy.CharacterEmbed.v1 {#CharacterEmbed_v1}
Identical to [`spacy.CharacterEmbed.v2`](/api/architectures#CharacterEmbed)
except using [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are
included.
## Layers {#layers}
These functions are available from `@spacy.registry.layers`.
### spacy.StaticVectors.v1 {#StaticVectors_v1}
Identical to [`spacy.StaticVectors.v2`](/api/architectures#StaticVectors) except
for the handling of tokens without vectors.
<Infobox title="Bugs for tokens without vectors" variant="warning">
`spacy.StaticVectors.v1` maps tokens without vectors to the final row in the
vectors table, which causes the model predictions to change if new vectors are
added to an existing vectors table. See more details in
[issue #7662](https://github.com/explosion/spaCy/issues/7662#issuecomment-813925655).
</Infobox>
## Loggers {#loggers}
@ -149,7 +182,7 @@ These functions are available from `@spacy.registry.loggers`.
### spacy.WandbLogger.v1 {#WandbLogger_v1}
The first version of the [`WandbLogger`](/api/top-level#WandbLogger) did not yet
The first version of the [`WandbLogger`](/api/top-level#WandbLogger) did not yet
support the `log_dataset_dir` and `model_log_interval` arguments.
> #### Example config
@ -160,7 +193,8 @@ support the `log_dataset_dir` and `model_log_interval` arguments.
> project_name = "monitor_spacy_training"
> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]
> ```
| Name | Description |
| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ |
| `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ |
>
> | Name | Description |
> | ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
> | `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ |
> | `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ |

View File

@ -132,7 +132,7 @@ factory = "tok2vec"
@architectures = "spacy.Tok2Vec.v2"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
@architectures = "spacy.MultiHashEmbed.v2"
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
@ -164,7 +164,7 @@ factory = "ner"
@architectures = "spacy.Tok2Vec.v2"
[components.ner.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
@architectures = "spacy.MultiHashEmbed.v2"
[components.ner.model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
@ -541,7 +541,7 @@ word vector tables using the `include_static_vectors` flag.
```ini
[tagger.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
@architectures = "spacy.MultiHashEmbed.v2"
width = 128
attrs = ["LOWER","PREFIX","SUFFIX","SHAPE"]
rows = [5000,2500,2500,2500]
@ -550,7 +550,7 @@ include_static_vectors = true
<Infobox title="How it works" emoji="💡">
The configuration system will look up the string `"spacy.MultiHashEmbed.v1"` in
The configuration system will look up the string `"spacy.MultiHashEmbed.v2"` in
the `architectures` [registry](/api/top-level#registry), and call the returned
object with the rest of the arguments from the block. This will result in a call
to the

View File

@ -137,7 +137,7 @@ nO = null
@architectures = "spacy.Tok2Vec.v2"
[components.textcat.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
@architectures = "spacy.MultiHashEmbed.v2"
width = 64
rows = [2000, 2000, 1000, 1000, 1000, 1000]
attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
@ -204,7 +204,7 @@ factory = "tok2vec"
@architectures = "spacy.Tok2Vec.v2"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
@architectures = "spacy.MultiHashEmbed.v2"
# ...
[components.tok2vec.model.encode]
@ -220,7 +220,7 @@ architecture:
```ini
### config.cfg (excerpt)
[components.tok2vec.model.embed]
@architectures = "spacy.CharacterEmbed.v1"
@architectures = "spacy.CharacterEmbed.v2"
# ...
[components.tok2vec.model.encode]
@ -638,7 +638,7 @@ that has the full implementation.
> @architectures = "rel_instance_tensor.v1"
>
> [model.create_instance_tensor.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v1"
> @architectures = "spacy.HashEmbedCNN.v2"
> # ...
>
> [model.create_instance_tensor.pooling]