mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
Add tok2vec docstrings
This commit is contained in:
parent
547bc8a82b
commit
234c52a91e
|
@ -28,11 +28,31 @@ def build_hash_embed_cnn_tok2vec(
|
||||||
window_size: int,
|
window_size: int,
|
||||||
maxout_pieces: int,
|
maxout_pieces: int,
|
||||||
subword_features: bool,
|
subword_features: bool,
|
||||||
dropout: Optional[float],
|
|
||||||
pretrained_vectors: Optional[bool]
|
pretrained_vectors: Optional[bool]
|
||||||
) -> Model[List[Doc], List[Floats2d]]:
|
) -> Model[List[Doc], List[Floats2d]]:
|
||||||
"""Build spaCy's 'standard' tok2vec layer, which uses hash embedding
|
"""Build spaCy's 'standard' tok2vec layer, which uses hash embedding
|
||||||
with subword features and a CNN with layer-normalized maxout."""
|
with subword features and a CNN with layer-normalized maxout.
|
||||||
|
|
||||||
|
width (int): The width of the input and output. These are required to be the
|
||||||
|
same, so that residual connections can be used. Recommended values are
|
||||||
|
96, 128 or 300.
|
||||||
|
depth (int): The number of convolutional layers to use. Recommended values
|
||||||
|
are between 2 and 8.
|
||||||
|
window_size (int): The number of tokens on either side to concatenate during
|
||||||
|
the convolutions. The receptive field of the CNN will be
|
||||||
|
depth * window_size * 2 + 1, so a 4-layer network with window_size of
|
||||||
|
2 will be sensitive to 17 words at a time. Recommended value is 1.
|
||||||
|
embed_size (int): The number of rows in the hash embedding tables. This can
|
||||||
|
be surprisingly small, due to the use of the hash embeddings. Recommended
|
||||||
|
values are between 2000 and 10000.
|
||||||
|
maxout_pieces (int): The number of pieces to use in the maxout non-linearity.
|
||||||
|
If 1, the Mish non-linearity is used instead. Recommended values are 1-3.
|
||||||
|
subword_features (bool): Whether to also embed subword features, specifically
|
||||||
|
the prefix, suffix and word shape. This is recommended for alphabetic
|
||||||
|
languages like English, but not if single-character tokens are used for
|
||||||
|
a language such as Chinese.
|
||||||
|
pretrained_vectors (bool): Whether to also use static vectors.
|
||||||
|
"""
|
||||||
return build_Tok2Vec_model(
|
return build_Tok2Vec_model(
|
||||||
embed=MultiHashEmbed(
|
embed=MultiHashEmbed(
|
||||||
width=width,
|
width=width,
|
||||||
|
@ -54,7 +74,14 @@ def build_Tok2Vec_model(
|
||||||
embed: Model[List[Doc], List[Floats2d]],
|
embed: Model[List[Doc], List[Floats2d]],
|
||||||
encode: Model[List[Floats2d], List[Floats2d]],
|
encode: Model[List[Floats2d], List[Floats2d]],
|
||||||
) -> Model[List[Doc], List[Floats2d]]:
|
) -> Model[List[Doc], List[Floats2d]]:
|
||||||
|
"""Construct a tok2vec model out of embedding and encoding subnetworks.
|
||||||
|
See https://explosion.ai/blog/deep-learning-formula-nlp
|
||||||
|
|
||||||
|
embed (Model[List[Doc], List[Floats2d]]): Embed tokens into context-independent
|
||||||
|
word vector representations.
|
||||||
|
encode (Model[List[Floats2d], List[Floats2d]]): Encode context into the
|
||||||
|
embeddings, using an architecture such as a CNN, BiLSTM or transformer.
|
||||||
|
"""
|
||||||
receptive_field = encode.attrs.get("receptive_field", 0)
|
receptive_field = encode.attrs.get("receptive_field", 0)
|
||||||
tok2vec = chain(embed, with_array(encode, pad=receptive_field))
|
tok2vec = chain(embed, with_array(encode, pad=receptive_field))
|
||||||
tok2vec.set_dim("nO", encode.get_dim("nO"))
|
tok2vec.set_dim("nO", encode.get_dim("nO"))
|
||||||
|
@ -67,6 +94,27 @@ def build_Tok2Vec_model(
|
||||||
def MultiHashEmbed(
|
def MultiHashEmbed(
|
||||||
width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool
|
width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool
|
||||||
):
|
):
|
||||||
|
"""Construct an embedding layer that separately embeds a number of lexical
|
||||||
|
attributes using hash embedding, concatenates the results, and passes it
|
||||||
|
through a feed-forward subnetwork to build a mixed representation.
|
||||||
|
|
||||||
|
The features used are the NORM, PREFIX, SUFFIX and SHAPE, which can have
|
||||||
|
varying definitions depending on the Vocab of the Doc object passed in.
|
||||||
|
Vectors from pretrained static vectors can also be incorporated into the
|
||||||
|
concatenated representation.
|
||||||
|
|
||||||
|
width (int): The output width. Also used as the width of the embedding tables.
|
||||||
|
Recommended values are between 64 and 300.
|
||||||
|
rows (int): The number of rows for the embedding tables. Can be low, due
|
||||||
|
to the hashing trick. Embeddings for prefix, suffix and word shape
|
||||||
|
use half as many rows. Recommended values are between 2000 and 10000.
|
||||||
|
also_embed_subwords (bool): Whether to use the PREFIX, SUFFIX and SHAPE
|
||||||
|
features in the embeddings. If not using these, you may need more
|
||||||
|
rows in your hash embeddings, as there will be increased chance of
|
||||||
|
collisions.
|
||||||
|
also_use_static_vectors (bool): Whether to also use static word vectors.
|
||||||
|
Requires a vectors table to be loaded in the Doc objects' vocab.
|
||||||
|
"""
|
||||||
cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||||
seed = 7
|
seed = 7
|
||||||
|
|
||||||
|
@ -117,6 +165,30 @@ def MultiHashEmbed(
|
||||||
|
|
||||||
@registry.architectures.register("spacy.CharacterEmbed.v1")
|
@registry.architectures.register("spacy.CharacterEmbed.v1")
|
||||||
def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
|
def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
|
||||||
|
"""Construct an embedded representation based on character embeddings, using
|
||||||
|
a feed-forward network. A fixed number of UTF-8 byte characters are used for
|
||||||
|
each word, taken from the beginning and end of the word equally. Padding is
|
||||||
|
used in the centre for words that are too short.
|
||||||
|
|
||||||
|
For instance, let's say nC=4, and the word is "jumping". The characters
|
||||||
|
used will be "jung" (two from the start, two from the end). If we had nC=8,
|
||||||
|
the characters would be "jumpping": 4 from the start, 4 from the end. This
|
||||||
|
ensures that the final character is always in the last position, instead
|
||||||
|
of being in an arbitrary position depending on the word length.
|
||||||
|
|
||||||
|
The characters are embedded in an embedding table with 256 rows, and the
|
||||||
|
vectors concatenated. A hash-embedded vector of the NORM of the word is
|
||||||
|
also concatenated on, and the result is then passed through a feed-forward
|
||||||
|
network to construct a single vector to represent the information.
|
||||||
|
|
||||||
|
width (int): The width of the output vector and the NORM hash embedding.
|
||||||
|
rows (int): The number of rows in the NORM hash embedding table.
|
||||||
|
nM (int): The dimensionality of the character embeddings. Recommended values
|
||||||
|
are between 16 and 64.
|
||||||
|
nC (int): The number of UTF-8 bytes to embed per word. Recommended values
|
||||||
|
are between 3 and 8, although it may depend on the length of words in the
|
||||||
|
language.
|
||||||
|
"""
|
||||||
model = chain(
|
model = chain(
|
||||||
concatenate(
|
concatenate(
|
||||||
chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
|
chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
|
||||||
|
@ -133,7 +205,19 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.MaxoutWindowEncoder.v1")
|
@registry.architectures.register("spacy.MaxoutWindowEncoder.v1")
|
||||||
def MaxoutWindowEncoder(width: int, window_size: int, maxout_pieces: int, depth: int):
|
def MaxoutWindowEncoder(width: int, window_size: int, maxout_pieces: int, depth: int) -> Model[List[Floats2d], List[Floats2d]]:
|
||||||
|
"""Encode context using convolutions with maxout activation, layer
|
||||||
|
normalization and residual connections.
|
||||||
|
|
||||||
|
width (int): The input and output width. These are required to be the same,
|
||||||
|
to allow residual connections. This value will be determined by the
|
||||||
|
width of the inputs. Recommended values are between 64 and 300.
|
||||||
|
window_size (int): The number of words to concatenate around each token
|
||||||
|
to construct the convolution. Recommended value is 1.
|
||||||
|
maxout_pieces (int): The number of maxout pieces to use. Recommended
|
||||||
|
values are 2 or 3.
|
||||||
|
depth (int): The number of convolutional layers. Recommended value is 4.
|
||||||
|
"""
|
||||||
cnn = chain(
|
cnn = chain(
|
||||||
expand_window(window_size=window_size),
|
expand_window(window_size=window_size),
|
||||||
Maxout(
|
Maxout(
|
||||||
|
@ -151,7 +235,17 @@ def MaxoutWindowEncoder(width: int, window_size: int, maxout_pieces: int, depth:
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.MishWindowEncoder.v1")
|
@registry.architectures.register("spacy.MishWindowEncoder.v1")
|
||||||
def MishWindowEncoder(width, window_size, depth):
|
def MishWindowEncoder(width: int, window_size: int, depth: int) -> Model[List[Floats2d], List[Floats2d]]:
|
||||||
|
"""Encode context using convolutions with mish activation, layer
|
||||||
|
normalization and residual connections.
|
||||||
|
|
||||||
|
width (int): The input and output width. These are required to be the same,
|
||||||
|
to allow residual connections. This value will be determined by the
|
||||||
|
width of the inputs. Recommended values are between 64 and 300.
|
||||||
|
window_size (int): The number of words to concatenate around each token
|
||||||
|
to construct the convolution. Recommended value is 1.
|
||||||
|
depth (int): The number of convolutional layers. Recommended value is 4.
|
||||||
|
"""
|
||||||
cnn = chain(
|
cnn = chain(
|
||||||
expand_window(window_size=window_size),
|
expand_window(window_size=window_size),
|
||||||
Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True),
|
Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True),
|
||||||
|
@ -162,7 +256,16 @@ def MishWindowEncoder(width, window_size, depth):
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
|
@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
|
||||||
def BiLSTMEncoder(width, depth, dropout):
|
def BiLSTMEncoder(width: int, depth: int, dropout: float) -> Model[List[Floats2d], List[Floats2d]]:
|
||||||
|
"""Encode context using bidirectional LSTM layers. Requires PyTorch.
|
||||||
|
|
||||||
|
width (int): The input and output width. These are required to be the same,
|
||||||
|
to allow residual connections. This value will be determined by the
|
||||||
|
width of the inputs. Recommended values are between 64 and 300.
|
||||||
|
depth (int): The number of recurrent LSTM layers. If 0, a no-op layer is
|
||||||
|
returned instead.
|
||||||
|
dropout (float): The dropout rate passed to the PyTorch LSTM layers.
|
||||||
|
"""
|
||||||
if depth == 0:
|
if depth == 0:
|
||||||
return noop()
|
return noop()
|
||||||
return with_padded(PyTorchLSTM(width, width, bi=True, depth=depth, dropout=dropout))
|
return with_padded(PyTorchLSTM(width, width, bi=True, depth=depth, dropout=dropout))
|
||||||
|
|
Loading…
Reference in New Issue
Block a user