Merge pull request #6231 from adrianeboyd/feature/include-static-vectors

This commit is contained in:
Ines Montani 2020-10-09 15:54:52 +02:00 committed by GitHub
commit 9fb3244672
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 10 additions and 10 deletions

View File

@ -177,7 +177,7 @@ def CharacterEmbed(
rows: int, rows: int,
nM: int, nM: int,
nC: int, nC: int,
also_use_static_vectors: bool, include_static_vectors: bool,
feature: Union[int, str] = "LOWER", feature: Union[int, str] = "LOWER",
) -> Model[List[Doc], List[Floats2d]]: ) -> Model[List[Doc], List[Floats2d]]:
"""Construct an embedded representation based on character embeddings, using """Construct an embedded representation based on character embeddings, using
@ -204,13 +204,13 @@ def CharacterEmbed(
nC (int): The number of UTF-8 bytes to embed per word. Recommended values nC (int): The number of UTF-8 bytes to embed per word. Recommended values
are between 3 and 8, although it may depend on the length of words in the are between 3 and 8, although it may depend on the length of words in the
language. language.
also_use_static_vectors (bool): Whether to also use static word vectors. include_static_vectors (bool): Whether to also use static word vectors.
Requires a vectors table to be loaded in the Doc objects' vocab. Requires a vectors table to be loaded in the Doc objects' vocab.
""" """
feature = intify_attr(feature) feature = intify_attr(feature)
if feature is None: if feature is None:
raise ValueError(Errors.E911(feat=feature)) raise ValueError(Errors.E911(feat=feature))
if also_use_static_vectors: if include_static_vectors:
model = chain( model = chain(
concatenate( concatenate(
chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()), chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),

View File

@ -32,7 +32,7 @@ width = 128
rows = 7000 rows = 7000
nM = 64 nM = 64
nC = 8 nC = 8
also_use_static_vectors = false include_static_vectors = false
[model.tok2vec.encode] [model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1" @architectures = "spacy.MaxoutWindowEncoder.v1"

View File

@ -63,8 +63,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
[ [
(8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}), (8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
(8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), (8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
(8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
(8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}), (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
], ],
) )
# fmt: on # fmt: on

View File

@ -516,7 +516,7 @@ Many neural network models are able to use word vector tables as additional
features, which sometimes results in significant improvements in accuracy. features, which sometimes results in significant improvements in accuracy.
spaCy's built-in embedding layer, spaCy's built-in embedding layer,
[MultiHashEmbed](/api/architectures#MultiHashEmbed), can be configured to use [MultiHashEmbed](/api/architectures#MultiHashEmbed), can be configured to use
word vector tables using the `also_use_static_vectors` flag. This setting is word vector tables using the `include_static_vectors` flag. This setting is
also available on the [MultiHashEmbedCNN](/api/architectures#MultiHashEmbedCNN) also available on the [MultiHashEmbedCNN](/api/architectures#MultiHashEmbedCNN)
layer, which builds the default token-to-vector encoding architecture. layer, which builds the default token-to-vector encoding architecture.
@ -524,9 +524,9 @@ layer, which builds the default token-to-vector encoding architecture.
[tagger.model.tok2vec.embed] [tagger.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1" @architectures = "spacy.MultiHashEmbed.v1"
width = 128 width = 128
rows = 7000 attrs = ["LOWER","PREFIX","SUFFIX","SHAPE"]
also_embed_subwords = true rows = [5000,2500,2500,2500]
also_use_static_vectors = true include_static_vectors = true
``` ```
<Infobox title="How it works" emoji="💡"> <Infobox title="How it works" emoji="💡">