mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-28 02:04:07 +03:00
Merge pull request #6231 from adrianeboyd/feature/include-static-vectors
This commit is contained in:
commit
9fb3244672
|
@ -177,7 +177,7 @@ def CharacterEmbed(
|
||||||
rows: int,
|
rows: int,
|
||||||
nM: int,
|
nM: int,
|
||||||
nC: int,
|
nC: int,
|
||||||
also_use_static_vectors: bool,
|
include_static_vectors: bool,
|
||||||
feature: Union[int, str] = "LOWER",
|
feature: Union[int, str] = "LOWER",
|
||||||
) -> Model[List[Doc], List[Floats2d]]:
|
) -> Model[List[Doc], List[Floats2d]]:
|
||||||
"""Construct an embedded representation based on character embeddings, using
|
"""Construct an embedded representation based on character embeddings, using
|
||||||
|
@ -204,13 +204,13 @@ def CharacterEmbed(
|
||||||
nC (int): The number of UTF-8 bytes to embed per word. Recommended values
|
nC (int): The number of UTF-8 bytes to embed per word. Recommended values
|
||||||
are between 3 and 8, although it may depend on the length of words in the
|
are between 3 and 8, although it may depend on the length of words in the
|
||||||
language.
|
language.
|
||||||
also_use_static_vectors (bool): Whether to also use static word vectors.
|
include_static_vectors (bool): Whether to also use static word vectors.
|
||||||
Requires a vectors table to be loaded in the Doc objects' vocab.
|
Requires a vectors table to be loaded in the Doc objects' vocab.
|
||||||
"""
|
"""
|
||||||
feature = intify_attr(feature)
|
feature = intify_attr(feature)
|
||||||
if feature is None:
|
if feature is None:
|
||||||
raise ValueError(Errors.E911(feat=feature))
|
raise ValueError(Errors.E911(feat=feature))
|
||||||
if also_use_static_vectors:
|
if include_static_vectors:
|
||||||
model = chain(
|
model = chain(
|
||||||
concatenate(
|
concatenate(
|
||||||
chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
|
chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
|
||||||
|
|
|
@ -32,7 +32,7 @@ width = 128
|
||||||
rows = 7000
|
rows = 7000
|
||||||
nM = 64
|
nM = 64
|
||||||
nC = 8
|
nC = 8
|
||||||
also_use_static_vectors = false
|
include_static_vectors = false
|
||||||
|
|
||||||
[model.tok2vec.encode]
|
[model.tok2vec.encode]
|
||||||
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||||
|
|
|
@ -63,8 +63,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
|
||||||
[
|
[
|
||||||
(8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
|
(8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
|
||||||
(8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
|
(8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
|
||||||
(8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
|
(8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
|
||||||
(8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
|
(8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
|
@ -516,7 +516,7 @@ Many neural network models are able to use word vector tables as additional
|
||||||
features, which sometimes results in significant improvements in accuracy.
|
features, which sometimes results in significant improvements in accuracy.
|
||||||
spaCy's built-in embedding layer,
|
spaCy's built-in embedding layer,
|
||||||
[MultiHashEmbed](/api/architectures#MultiHashEmbed), can be configured to use
|
[MultiHashEmbed](/api/architectures#MultiHashEmbed), can be configured to use
|
||||||
word vector tables using the `also_use_static_vectors` flag. This setting is
|
word vector tables using the `include_static_vectors` flag. This setting is
|
||||||
also available on the [MultiHashEmbedCNN](/api/architectures#MultiHashEmbedCNN)
|
also available on the [MultiHashEmbedCNN](/api/architectures#MultiHashEmbedCNN)
|
||||||
layer, which builds the default token-to-vector encoding architecture.
|
layer, which builds the default token-to-vector encoding architecture.
|
||||||
|
|
||||||
|
@ -524,9 +524,9 @@ layer, which builds the default token-to-vector encoding architecture.
|
||||||
[tagger.model.tok2vec.embed]
|
[tagger.model.tok2vec.embed]
|
||||||
@architectures = "spacy.MultiHashEmbed.v1"
|
@architectures = "spacy.MultiHashEmbed.v1"
|
||||||
width = 128
|
width = 128
|
||||||
rows = 7000
|
attrs = ["LOWER","PREFIX","SUFFIX","SHAPE"]
|
||||||
also_embed_subwords = true
|
rows = [5000,2500,2500,2500]
|
||||||
also_use_static_vectors = true
|
include_static_vectors = true
|
||||||
```
|
```
|
||||||
|
|
||||||
<Infobox title="How it works" emoji="💡">
|
<Infobox title="How it works" emoji="💡">
|
||||||
|
|
Loading…
Reference in New Issue
Block a user