mirror of
https://github.com/explosion/spaCy.git
synced 2025-10-24 12:41:23 +03:00
Merge pull request #6205 from explosion/feature/embed-features
This commit is contained in:
commit
181039bd17
|
@ -171,9 +171,14 @@ factory = "tok2vec"
|
||||||
[components.tok2vec.model.embed]
|
[components.tok2vec.model.embed]
|
||||||
@architectures = "spacy.MultiHashEmbed.v1"
|
@architectures = "spacy.MultiHashEmbed.v1"
|
||||||
width = ${components.tok2vec.model.encode.width}
|
width = ${components.tok2vec.model.encode.width}
|
||||||
rows = {{ 2000 if optimize == "efficiency" else 7000 }}
|
{% if has_letters -%}
|
||||||
also_embed_subwords = {{ "true" if has_letters else "false" }}
|
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
|
||||||
also_use_static_vectors = {{ "true" if optimize == "accuracy" else "false" }}
|
rows = [5000, 2500, 2500, 2500]
|
||||||
|
{% else -%}
|
||||||
|
attrs = ["ORTH", "SHAPE"]
|
||||||
|
rows = [5000, 2500]
|
||||||
|
{% endif -%}
|
||||||
|
include_static_vectors = {{ "true" if optimize == "accuracy" else "false" }}
|
||||||
|
|
||||||
[components.tok2vec.model.encode]
|
[components.tok2vec.model.encode]
|
||||||
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional, List, Union
|
from typing import Optional, List, Union, Dict
|
||||||
from thinc.types import Floats2d
|
from thinc.types import Floats2d
|
||||||
from thinc.api import chain, clone, concatenate, with_array, with_padded
|
from thinc.api import chain, clone, concatenate, with_array, with_padded
|
||||||
from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
|
from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
|
||||||
|
@ -11,7 +11,7 @@ from ...ml import _character_embed
|
||||||
from ..staticvectors import StaticVectors
|
from ..staticvectors import StaticVectors
|
||||||
from ..featureextractor import FeatureExtractor
|
from ..featureextractor import FeatureExtractor
|
||||||
from ...pipeline.tok2vec import Tok2VecListener
|
from ...pipeline.tok2vec import Tok2VecListener
|
||||||
from ...attrs import ORTH, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr
|
from ...attrs import ORTH, NORM, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.Tok2VecListener.v1")
|
@registry.architectures.register("spacy.Tok2VecListener.v1")
|
||||||
|
@ -54,12 +54,18 @@ def build_hash_embed_cnn_tok2vec(
|
||||||
a language such as Chinese.
|
a language such as Chinese.
|
||||||
pretrained_vectors (bool): Whether to also use static vectors.
|
pretrained_vectors (bool): Whether to also use static vectors.
|
||||||
"""
|
"""
|
||||||
|
if subword_features:
|
||||||
|
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
|
||||||
|
row_sizes = [embed_size, embed_size//2, embed_size//2, embed_size//2]
|
||||||
|
else:
|
||||||
|
attrs = ["NORM"]
|
||||||
|
row_sizes = [embed_size]
|
||||||
return build_Tok2Vec_model(
|
return build_Tok2Vec_model(
|
||||||
embed=MultiHashEmbed(
|
embed=MultiHashEmbed(
|
||||||
width=width,
|
width=width,
|
||||||
rows=embed_size,
|
rows=row_sizes,
|
||||||
also_embed_subwords=subword_features,
|
attrs=attrs,
|
||||||
also_use_static_vectors=bool(pretrained_vectors),
|
include_static_vectors=bool(pretrained_vectors),
|
||||||
),
|
),
|
||||||
encode=MaxoutWindowEncoder(
|
encode=MaxoutWindowEncoder(
|
||||||
width=width,
|
width=width,
|
||||||
|
@ -93,58 +99,65 @@ def build_Tok2Vec_model(
|
||||||
|
|
||||||
@registry.architectures.register("spacy.MultiHashEmbed.v1")
|
@registry.architectures.register("spacy.MultiHashEmbed.v1")
|
||||||
def MultiHashEmbed(
|
def MultiHashEmbed(
|
||||||
width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool
|
width: int,
|
||||||
|
attrs: List[Union[str, int]],
|
||||||
|
rows: List[int],
|
||||||
|
include_static_vectors: bool,
|
||||||
) -> Model[List[Doc], List[Floats2d]]:
|
) -> Model[List[Doc], List[Floats2d]]:
|
||||||
"""Construct an embedding layer that separately embeds a number of lexical
|
"""Construct an embedding layer that separately embeds a number of lexical
|
||||||
attributes using hash embedding, concatenates the results, and passes it
|
attributes using hash embedding, concatenates the results, and passes it
|
||||||
through a feed-forward subnetwork to build a mixed representations.
|
through a feed-forward subnetwork to build a mixed representations.
|
||||||
|
|
||||||
The features used are the LOWER, PREFIX, SUFFIX and SHAPE, which can have
|
The features used can be configured with the 'attrs' argument. The suggested
|
||||||
varying definitions depending on the Vocab of the Doc object passed in.
|
attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
|
||||||
Vectors from pretrained static vectors can also be incorporated into the
|
account some subword information, without constructing a fully character-based
|
||||||
concatenated representation.
|
representation. If pretrained vectors are available, they can be included in
|
||||||
|
the representation as well, with the vectors table kept static
|
||||||
|
(i.e. it's not updated).
|
||||||
|
|
||||||
|
The `width` parameter specifies the output width of the layer and the widths
|
||||||
|
of all embedding tables. If static vectors are included, a learned linear
|
||||||
|
layer is used to map the vectors to the specified width before concatenating
|
||||||
|
it with the other embedding outputs. A single Maxout layer is then used to
|
||||||
|
reduce the concatenated vectors to the final width.
|
||||||
|
|
||||||
|
The `rows` parameter controls the number of rows used by the `HashEmbed`
|
||||||
|
tables. The HashEmbed layer needs surprisingly few rows, due to its use of
|
||||||
|
the hashing trick. Generally between 2000 and 10000 rows is sufficient,
|
||||||
|
even for very large vocabularies. A number of rows must be specified for each
|
||||||
|
table, so the `rows` list must be of the same length as the `attrs` parameter.
|
||||||
|
|
||||||
width (int): The output width. Also used as the width of the embedding tables.
|
width (int): The output width. Also used as the width of the embedding tables.
|
||||||
Recommended values are between 64 and 300.
|
Recommended values are between 64 and 300.
|
||||||
rows (int): The number of rows for the embedding tables. Can be low, due
|
attrs (list of attr IDs): The token attributes to embed. A separate
|
||||||
to the hashing trick. Embeddings for prefix, suffix and word shape
|
embedding table will be constructed for each attribute.
|
||||||
use half as many rows. Recommended values are between 2000 and 10000.
|
rows (List[int]): The number of rows in the embedding tables. Must have the
|
||||||
also_embed_subwords (bool): Whether to use the PREFIX, SUFFIX and SHAPE
|
same length as attrs.
|
||||||
features in the embeddings. If not using these, you may need more
|
include_static_vectors (bool): Whether to also use static word vectors.
|
||||||
rows in your hash embeddings, as there will be increased chance of
|
|
||||||
collisions.
|
|
||||||
also_use_static_vectors (bool): Whether to also use static word vectors.
|
|
||||||
Requires a vectors table to be loaded in the Doc objects' vocab.
|
Requires a vectors table to be loaded in the Doc objects' vocab.
|
||||||
"""
|
"""
|
||||||
cols = [LOWER, PREFIX, SUFFIX, SHAPE, ORTH]
|
if len(rows) != len(attrs):
|
||||||
|
raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}")
|
||||||
seed = 7
|
seed = 7
|
||||||
|
|
||||||
def make_hash_embed(feature):
|
def make_hash_embed(index):
|
||||||
nonlocal seed
|
nonlocal seed
|
||||||
seed += 1
|
seed += 1
|
||||||
return HashEmbed(
|
return HashEmbed(
|
||||||
width,
|
width,
|
||||||
rows if feature == LOWER else rows // 2,
|
rows[index],
|
||||||
column=cols.index(feature),
|
column=index,
|
||||||
seed=seed,
|
seed=seed,
|
||||||
dropout=0.0,
|
dropout=0.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
if also_embed_subwords:
|
embeddings = [make_hash_embed(i) for i in range(len(attrs))]
|
||||||
embeddings = [
|
concat_size = width * (len(embeddings) + include_static_vectors)
|
||||||
make_hash_embed(LOWER),
|
if include_static_vectors:
|
||||||
make_hash_embed(PREFIX),
|
|
||||||
make_hash_embed(SUFFIX),
|
|
||||||
make_hash_embed(SHAPE),
|
|
||||||
]
|
|
||||||
else:
|
|
||||||
embeddings = [make_hash_embed(LOWER)]
|
|
||||||
concat_size = width * (len(embeddings) + also_use_static_vectors)
|
|
||||||
if also_use_static_vectors:
|
|
||||||
model = chain(
|
model = chain(
|
||||||
concatenate(
|
concatenate(
|
||||||
chain(
|
chain(
|
||||||
FeatureExtractor(cols),
|
FeatureExtractor(attrs),
|
||||||
list2ragged(),
|
list2ragged(),
|
||||||
with_array(concatenate(*embeddings)),
|
with_array(concatenate(*embeddings)),
|
||||||
),
|
),
|
||||||
|
@ -155,7 +168,7 @@ def MultiHashEmbed(
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
model = chain(
|
model = chain(
|
||||||
FeatureExtractor(cols),
|
FeatureExtractor(list(attrs)),
|
||||||
list2ragged(),
|
list2ragged(),
|
||||||
with_array(concatenate(*embeddings)),
|
with_array(concatenate(*embeddings)),
|
||||||
with_array(Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)),
|
with_array(Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)),
|
||||||
|
|
|
@ -24,9 +24,9 @@ def test_empty_doc():
|
||||||
tok2vec = build_Tok2Vec_model(
|
tok2vec = build_Tok2Vec_model(
|
||||||
MultiHashEmbed(
|
MultiHashEmbed(
|
||||||
width=width,
|
width=width,
|
||||||
rows=embed_size,
|
rows=[embed_size, embed_size, embed_size, embed_size],
|
||||||
also_use_static_vectors=False,
|
include_static_vectors=False,
|
||||||
also_embed_subwords=True,
|
attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
|
||||||
),
|
),
|
||||||
MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
|
MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
|
||||||
)
|
)
|
||||||
|
@ -44,9 +44,9 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
|
||||||
tok2vec = build_Tok2Vec_model(
|
tok2vec = build_Tok2Vec_model(
|
||||||
MultiHashEmbed(
|
MultiHashEmbed(
|
||||||
width=width,
|
width=width,
|
||||||
rows=embed_size,
|
rows=[embed_size] * 4,
|
||||||
also_use_static_vectors=False,
|
include_static_vectors=False,
|
||||||
also_embed_subwords=True,
|
attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
|
||||||
),
|
),
|
||||||
MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
|
MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
|
||||||
)
|
)
|
||||||
|
@ -61,8 +61,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"width,embed_arch,embed_config,encode_arch,encode_config",
|
"width,embed_arch,embed_config,encode_arch,encode_config",
|
||||||
[
|
[
|
||||||
(8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
|
(8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
|
||||||
(8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
|
(8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
|
||||||
(8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
|
(8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
|
||||||
(8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
|
(8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
|
||||||
],
|
],
|
||||||
|
@ -118,9 +118,9 @@ cfg_string = """
|
||||||
[components.tok2vec.model.embed]
|
[components.tok2vec.model.embed]
|
||||||
@architectures = "spacy.MultiHashEmbed.v1"
|
@architectures = "spacy.MultiHashEmbed.v1"
|
||||||
width = ${components.tok2vec.model.encode.width}
|
width = ${components.tok2vec.model.encode.width}
|
||||||
rows = 2000
|
rows = [2000, 1000, 1000, 1000]
|
||||||
also_embed_subwords = true
|
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
|
||||||
also_use_static_vectors = false
|
include_static_vectors = false
|
||||||
|
|
||||||
[components.tok2vec.model.encode]
|
[components.tok2vec.model.encode]
|
||||||
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||||
|
|
|
@ -89,9 +89,9 @@ def my_parser():
|
||||||
tok2vec = build_Tok2Vec_model(
|
tok2vec = build_Tok2Vec_model(
|
||||||
MultiHashEmbed(
|
MultiHashEmbed(
|
||||||
width=321,
|
width=321,
|
||||||
rows=5432,
|
attrs=["LOWER", "SHAPE"],
|
||||||
also_embed_subwords=True,
|
rows=[5432, 5432],
|
||||||
also_use_static_vectors=False,
|
include_static_vectors=False,
|
||||||
),
|
),
|
||||||
MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2),
|
MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2),
|
||||||
)
|
)
|
||||||
|
|
|
@ -61,7 +61,10 @@ def get_tok2vec_kwargs():
|
||||||
# This actually creates models, so seems best to put it in a function.
|
# This actually creates models, so seems best to put it in a function.
|
||||||
return {
|
return {
|
||||||
"embed": MultiHashEmbed(
|
"embed": MultiHashEmbed(
|
||||||
width=32, rows=500, also_embed_subwords=True, also_use_static_vectors=False
|
width=32,
|
||||||
|
rows=[500, 500, 500],
|
||||||
|
attrs=["NORM", "PREFIX", "SHAPE"],
|
||||||
|
include_static_vectors=False
|
||||||
),
|
),
|
||||||
"encode": MaxoutWindowEncoder(
|
"encode": MaxoutWindowEncoder(
|
||||||
width=32, depth=2, maxout_pieces=2, window_size=1
|
width=32, depth=2, maxout_pieces=2, window_size=1
|
||||||
|
@ -73,6 +76,32 @@ def test_tok2vec():
|
||||||
return build_Tok2Vec_model(**get_tok2vec_kwargs())
|
return build_Tok2Vec_model(**get_tok2vec_kwargs())
|
||||||
|
|
||||||
|
|
||||||
|
def test_multi_hash_embed():
|
||||||
|
embed = MultiHashEmbed(
|
||||||
|
width=32,
|
||||||
|
rows=[500, 500, 500],
|
||||||
|
attrs=["NORM", "PREFIX", "SHAPE"],
|
||||||
|
include_static_vectors=False
|
||||||
|
)
|
||||||
|
hash_embeds = [node for node in embed.walk() if node.name == "hashembed"]
|
||||||
|
assert len(hash_embeds) == 3
|
||||||
|
# Check they look at different columns.
|
||||||
|
assert list(sorted(he.attrs["column"] for he in hash_embeds)) == [0, 1, 2]
|
||||||
|
# Check they use different seeds
|
||||||
|
assert len(set(he.attrs["seed"] for he in hash_embeds)) == 3
|
||||||
|
# Check they all have the same number of rows
|
||||||
|
assert [he.get_dim("nV") for he in hash_embeds] == [500, 500, 500]
|
||||||
|
# Now try with different row factors
|
||||||
|
embed = MultiHashEmbed(
|
||||||
|
width=32,
|
||||||
|
rows=[1000, 50, 250],
|
||||||
|
attrs=["NORM", "PREFIX", "SHAPE"],
|
||||||
|
include_static_vectors=False
|
||||||
|
)
|
||||||
|
hash_embeds = [node for node in embed.walk() if node.name == "hashembed"]
|
||||||
|
assert [he.get_dim("nV") for he in hash_embeds] == [1000, 50, 250]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"seed,model_func,kwargs",
|
"seed,model_func,kwargs",
|
||||||
[
|
[
|
||||||
|
|
|
@ -136,25 +136,50 @@ argument that connects to the shared `tok2vec` component in the pipeline.
|
||||||
> [model]
|
> [model]
|
||||||
> @architectures = "spacy.MultiHashEmbed.v1"
|
> @architectures = "spacy.MultiHashEmbed.v1"
|
||||||
> width = 64
|
> width = 64
|
||||||
> rows = 2000
|
> attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
|
||||||
> also_embed_subwords = false
|
> rows = [2000, 1000, 1000, 1000]
|
||||||
> also_use_static_vectors = false
|
> include_static_vectors = true
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
Construct an embedding layer that separately embeds a number of lexical
|
Construct an embedding layer that separately embeds a number of lexical
|
||||||
attributes using hash embedding, concatenates the results, and passes it through
|
attributes using hash embedding, concatenates the results, and passes it
|
||||||
a feed-forward subnetwork to build mixed representations. The features used are
|
through a feed-forward subnetwork to build mixed representations.
|
||||||
the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, and they are extracted with a
|
|
||||||
[FeatureExtractor](/api/architectures#FeatureExtractor) layer. Vectors from pretrained static
|
The features used can be configured with the 'attrs' argument. The suggested
|
||||||
vectors can also be incorporated into the concatenated representation.
|
attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
|
||||||
|
account some subword information, without constructing a fully character-based
|
||||||
|
representation. If pretrained vectors are available, they can be included in
|
||||||
|
the representation as well, with the vectors table kept static
|
||||||
|
(i.e. it's not updated).
|
||||||
|
|
||||||
|
The `width` parameter specifies the output width of the layer and the widths
|
||||||
|
of all embedding tables. If static vectors are included, a learned linear
|
||||||
|
layer is used to map the vectors to the specified width before concatenating
|
||||||
|
it with the other embedding outputs. A single Maxout layer is then used to
|
||||||
|
reduce the concatenated vectors to the final width.
|
||||||
|
|
||||||
|
The `rows` parameter controls the number of rows used by the `HashEmbed`
|
||||||
|
tables. The HashEmbed layer needs surprisingly few rows, due to its use of
|
||||||
|
the hashing trick. Generally between 2000 and 10000 rows is sufficient,
|
||||||
|
even for very large vocabularies. A number of rows must be specified for each
|
||||||
|
table, so the `rows` list must be of the same length as the `attrs` parameter.
|
||||||
|
|
||||||
|
attrs (list of attr IDs): The token attributes to embed. A separate
|
||||||
|
embedding table will be constructed for each attribute.
|
||||||
|
rows (List[int]): The number of rows in the embedding tables. Must have the
|
||||||
|
same length as attrs.
|
||||||
|
include_static_vectors (bool): Whether to also use static word vectors.
|
||||||
|
Requires a vectors table to be loaded in the Doc objects' vocab.
|
||||||
|
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. ~~int~~ |
|
| `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. ~~int~~ |
|
||||||
| `rows` | The number of rows for the embedding tables. Can be low, due to the hashing trick. Embeddings for prefix, suffix and word shape use half as many rows. Recommended values are between `2000` and `10000`. ~~int~~ |
|
| `attrs` | The token attributes to embed. A separate |
|
||||||
| `also_embed_subwords` | Whether to use the `PREFIX`, `SUFFIX` and `SHAPE` features in the embeddings. If not using these, you may need more rows in your hash embeddings, as there will be increased chance of collisions. ~~bool~~ |
|
embedding table will be constructed for each attribute. ~~List[Union[int, str]]~~ |
|
||||||
| `also_use_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [Doc](/api/doc) objects' vocab. ~~bool~~ |
|
| `rows` | The number of rows for each embedding table. Can be low, due to the hashing trick. Recommended values are between `1000` and `10000`. ~~List[int]~~ |
|
||||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
|
| `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [Doc](/api/doc) objects' vocab. ~~bool~~ |
|
||||||
|
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||||
|
|
||||||
### spacy.CharacterEmbed.v1 {#CharacterEmbed}
|
### spacy.CharacterEmbed.v1 {#CharacterEmbed}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user