2020-10-05 16:22:00 +03:00
|
|
|
from typing import Optional, List, Union, Dict
|
2020-10-01 17:22:48 +03:00
|
|
|
from thinc.types import Floats2d
|
2020-07-28 14:51:43 +03:00
|
|
|
from thinc.api import chain, clone, concatenate, with_array, with_padded
|
2020-10-01 17:22:48 +03:00
|
|
|
from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
|
2020-07-29 15:45:09 +03:00
|
|
|
from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
|
2020-02-27 20:42:27 +03:00
|
|
|
|
2020-07-28 16:51:40 +03:00
|
|
|
from ...tokens import Doc
|
2020-03-08 15:23:18 +03:00
|
|
|
from ...util import registry
|
2020-10-04 12:16:31 +03:00
|
|
|
from ...errors import Errors
|
2020-02-27 20:42:27 +03:00
|
|
|
from ...ml import _character_embed
|
2020-07-28 16:51:40 +03:00
|
|
|
from ..staticvectors import StaticVectors
|
2020-10-01 17:22:48 +03:00
|
|
|
from ..featureextractor import FeatureExtractor
|
2020-02-27 20:42:27 +03:00
|
|
|
from ...pipeline.tok2vec import Tok2VecListener
|
2020-10-05 16:22:00 +03:00
|
|
|
from ...attrs import ORTH, NORM, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr
|
2020-02-27 20:42:27 +03:00
|
|
|
|
|
|
|
|
2020-07-28 14:51:43 +03:00
|
|
|
@registry.architectures.register("spacy.Tok2VecListener.v1")
def tok2vec_listener_v1(width: int, upstream: str = "*"):
    """Create a Tok2VecListener layer.

    width (int): Forwarded to the listener as its `width`.
    upstream (str): Forwarded to the listener as `upstream_name`.
        Defaults to "*".
    RETURNS: The newly constructed Tok2VecListener.
    """
    return Tok2VecListener(upstream_name=upstream, width=width)
|
|
|
|
|
|
|
|
|
2020-07-28 23:43:06 +03:00
|
|
|
@registry.architectures.register("spacy.HashEmbedCNN.v1")
def build_hash_embed_cnn_tok2vec(
    *,
    width: int,
    depth: int,
    embed_size: int,
    window_size: int,
    maxout_pieces: int,
    subword_features: bool,
    pretrained_vectors: Optional[bool]
) -> Model[List[Doc], List[Floats2d]]:
    """Build spaCy's 'standard' tok2vec layer: a MultiHashEmbed embedding
    layer followed by a MaxoutWindowEncoder CNN, combined with
    build_Tok2Vec_model.

    width (int): The width of the input and output. These are required to be
        the same, so that residual connections can be used. Recommended values
        are 96, 128 or 300.
    depth (int): The number of convolutional layers to use. Recommended values
        are between 2 and 8.
    window_size (int): The number of tokens on either side to concatenate
        during the convolutions. The encoder reaches window_size * depth
        tokens to either side, so a 4-layer network with window_size of 2 will
        be sensitive to 17 words at a time. Recommended value is 1.
    embed_size (int): The number of rows in the NORM hash embedding table.
        When subword features are enabled, the PREFIX, SUFFIX and SHAPE tables
        each get embed_size // 2 rows. This can be surprisingly small, due to
        the use of the hash embeddings. Recommended values are between 2000
        and 10000.
    maxout_pieces (int): The number of pieces to use in the maxout
        non-linearity. Recommended values are 1-3.
    subword_features (bool): Whether to also embed subword features,
        specifically the prefix, suffix and word shape. This is recommended
        for alphabetic languages like English, but not if single-character
        tokens are used for a language such as Chinese.
    pretrained_vectors (Optional[bool]): Whether to also use static vectors.
        The value is coerced with bool(), so None means False.
    """
    # NORM is always embedded; the subword attributes are optional extras
    # that each get a half-sized table.
    attrs = ["NORM"]
    row_sizes = [embed_size]
    if subword_features:
        subword_attrs = ["PREFIX", "SUFFIX", "SHAPE"]
        attrs.extend(subword_attrs)
        row_sizes.extend(embed_size // 2 for _ in subword_attrs)
    embed = MultiHashEmbed(
        width=width,
        rows=row_sizes,
        attrs=attrs,
        include_static_vectors=bool(pretrained_vectors),
    )
    encode = MaxoutWindowEncoder(
        width=width,
        depth=depth,
        window_size=window_size,
        maxout_pieces=maxout_pieces,
    )
    return build_Tok2Vec_model(embed=embed, encode=encode)
|
|
|
|
|
2020-07-31 18:02:54 +03:00
|
|
|
|
2020-02-27 20:42:27 +03:00
|
|
|
@registry.architectures.register("spacy.Tok2Vec.v1")
def build_Tok2Vec_model(
    embed: Model[List[Doc], List[Floats2d]],
    encode: Model[List[Floats2d], List[Floats2d]],
) -> Model[List[Doc], List[Floats2d]]:
    """Construct a tok2vec model out of embedding and encoding subnetworks.
    See https://explosion.ai/blog/deep-learning-formula-nlp

    embed (Model[List[Doc], List[Floats2d]]): Embed tokens into
        context-independent word vector representations.
    encode (Model[List[Floats2d], List[Floats2d]]): Encode context into the
        embeddings, using an architecture such as a CNN, BiLSTM or transformer.
    RETURNS (Model[List[Doc], List[Floats2d]]): The chained model, with its
        "nO" dim copied from `encode` and "embed" / "encode" refs set.
    """
    # Encoders may advertise how far they look around a token; that value is
    # used as the `pad` for with_array. Encoders without the attr get 0.
    padding = encode.attrs.get("receptive_field", 0)
    padded_encoder = with_array(encode, pad=padding)
    model = chain(embed, padded_encoder)
    model.set_dim("nO", encode.get_dim("nO"))
    for ref_name, layer in (("embed", embed), ("encode", encode)):
        model.set_ref(ref_name, layer)
    return model
|
|
|
|
|
|
|
|
|
2020-07-28 16:51:40 +03:00
|
|
|
@registry.architectures.register("spacy.MultiHashEmbed.v1")
def MultiHashEmbed(
    width: int,
    attrs: List[Union[str, int]],
    rows: List[int],
    include_static_vectors: bool,
) -> Model[List[Doc], List[Floats2d]]:
    """Construct an embedding layer that separately embeds a number of lexical
    attributes using hash embedding, concatenates the results, and passes them
    through a feed-forward subnetwork to build a mixed representation.

    The features used can be configured with the 'attrs' argument. The
    suggested attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the
    model take into account some subword information, without constructing a
    fully character-based representation. If pretrained vectors are available,
    they can be included in the representation as well, with the vectors table
    kept static (i.e. it's not updated).

    The `width` parameter specifies the output width of the layer and the
    widths of all embedding tables. If static vectors are included, a learned
    linear layer is used to map the vectors to the specified width before
    concatenating them with the other embedding outputs. A single Maxout layer
    is then used to reduce the concatenated vectors to the final width.

    The `rows` parameter controls the number of rows used by the `HashEmbed`
    tables. The HashEmbed layer needs surprisingly few rows, due to its use of
    the hashing trick. Generally between 2000 and 10000 rows is sufficient,
    even for very large vocabularies. A number of rows must be specified for
    each table, so the `rows` list must be of the same length as the `attrs`
    parameter.

    width (int): The output width. Also used as the width of the embedding
        tables. Recommended values are between 64 and 300.
    attrs (list of attr IDs): The token attributes to embed. A separate
        embedding table will be constructed for each attribute.
    rows (List[int]): The number of rows in the embedding tables. Must have
        the same length as attrs.
    include_static_vectors (bool): Whether to also use static word vectors.
        Requires a vectors table to be loaded in the Doc objects' vocab.
    RAISES (ValueError): If `rows` and `attrs` differ in length.
    """
    if len(rows) != len(attrs):
        raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}")

    # One table per attribute. Table i reads feature column i and gets its own
    # seed (8, 9, 10, ...) so the tables hash differently.
    first_seed = 8
    embeddings = [
        HashEmbed(width, n_rows, column=col, seed=first_seed + col, dropout=0.0)
        for col, n_rows in enumerate(rows)
    ]
    # Every concatenated input (each table, plus the static vectors when
    # enabled) contributes `width` columns to the Maxout input.
    n_inputs = len(embeddings) + include_static_vectors
    concat_size = width * n_inputs

    if include_static_vectors:
        hashed = chain(
            FeatureExtractor(attrs),
            list2ragged(),
            with_array(concatenate(*embeddings)),
        )
        combined = concatenate(hashed, StaticVectors(width, dropout=0.0))
        mixer = with_array(
            Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)
        )
        return chain(combined, mixer, ragged2list())

    return chain(
        FeatureExtractor(list(attrs)),
        list2ragged(),
        with_array(concatenate(*embeddings)),
        with_array(Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)),
        ragged2list(),
    )
|
2020-07-28 23:02:34 +03:00
|
|
|
|
2020-02-27 20:42:27 +03:00
|
|
|
|
2020-03-08 15:23:18 +03:00
|
|
|
@registry.architectures.register("spacy.CharacterEmbed.v1")
def CharacterEmbed(
    width: int,
    rows: int,
    nM: int,
    nC: int,
    also_use_static_vectors: bool,
    feature: Union[int, str] = "LOWER",
) -> Model[List[Doc], List[Floats2d]]:
    """Construct an embedded representation based on character embeddings,
    using a feed-forward network. A fixed number of UTF-8 byte characters are
    used for each word, taken from the beginning and end of the word equally.
    Padding is used in the centre for words that are too short.

    For instance, let's say nC=4, and the word is "jumping". The characters
    used will be jung (two from the start, two from the end). If we had nC=8,
    the characters would be "jumpping": 4 from the start, 4 from the end. This
    ensures that the final character is always in the last position, instead
    of being in an arbitrary position depending on the word length.

    The characters are embedded in an embedding table with a given number of
    rows, and the vectors concatenated. A hash-embedded vector of the chosen
    `feature` of the word (LOWER by default) is also concatenated on, and the
    result is then passed through a feed-forward network to construct a single
    vector to represent the information.

    feature (int or str): An attribute to embed, to concatenate with the
        characters.
    width (int): The width of the output vector and the feature embedding.
    rows (int): The number of rows in the feature hash embedding table.
    nM (int): The dimensionality of the character embeddings. Recommended
        values are between 16 and 64.
    nC (int): The number of UTF-8 bytes to embed per word. Recommended values
        are between 3 and 8, although it may depend on the length of words in
        the language.
    also_use_static_vectors (bool): Whether to also use static word vectors.
        Requires a vectors table to be loaded in the Doc objects' vocab.
    RAISES (ValueError): If `feature` cannot be resolved by intify_attr.
    """
    # Keep the caller's value separate from the resolved ID, so the error
    # message can report what was actually passed in rather than None.
    feature_id = intify_attr(feature)
    if feature_id is None:
        # NOTE(review): assumes Errors.E911 is a string template with a
        # `feat` placeholder (hence .format rather than a call) — confirm.
        raise ValueError(Errors.E911.format(feat=feature))

    char_embed = chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged())
    feature_embed = chain(
        FeatureExtractor([feature_id]),
        list2ragged(),
        with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
    )
    embed_layers = [char_embed, feature_embed]
    if also_use_static_vectors:
        embed_layers.append(StaticVectors(width, dropout=0.0))

    # The character embedding contributes nM * nC columns; every other
    # concatenated input (feature embedding, static vectors) adds `width`.
    max_in = nM * nC + width * (len(embed_layers) - 1)
    model = chain(
        concatenate(*embed_layers),
        with_array(Maxout(width, max_in, nP=3, normalize=True, dropout=0.0)),
        ragged2list(),
    )
    return model
|
2020-02-27 20:42:27 +03:00
|
|
|
|
|
|
|
|
|
|
|
@registry.architectures.register("spacy.MaxoutWindowEncoder.v1")
def MaxoutWindowEncoder(
    width: int, window_size: int, maxout_pieces: int, depth: int
) -> Model[List[Floats2d], List[Floats2d]]:
    """Encode context using convolutions with maxout activation, layer
    normalization and residual connections.

    width (int): The input and output width. These are required to be the
        same, to allow residual connections. This value will be determined by
        the width of the inputs. Recommended values are between 64 and 300.
    window_size (int): The number of words to concatenate around each token
        to construct the convolution. Recommended value is 1.
    maxout_pieces (int): The number of maxout pieces to use. Recommended
        values are 2 or 3.
    depth (int): The number of convolutional layers. Recommended value is 4.
    """
    # Each token is expanded to itself plus `window_size` neighbours on both
    # sides, so the Maxout input is (2 * window_size + 1) times wider.
    window_width = width * ((window_size * 2) + 1)
    conv_layer = chain(
        expand_window(window_size=window_size),
        Maxout(
            nO=width,
            nI=window_width,
            nP=maxout_pieces,
            dropout=0.0,
            normalize=True,
        ),
    )
    encoder = clone(residual(conv_layer), depth)
    encoder.set_dim("nO", width)
    # Exposed so callers can read how far the stacked layers reach per side.
    encoder.attrs["receptive_field"] = window_size * depth
    return encoder
|
|
|
|
|
|
|
|
|
|
|
|
@registry.architectures.register("spacy.MishWindowEncoder.v1")
def MishWindowEncoder(
    width: int, window_size: int, depth: int
) -> Model[List[Floats2d], List[Floats2d]]:
    """Encode context using convolutions with mish activation, layer
    normalization and residual connections.

    width (int): The input and output width. These are required to be the
        same, to allow residual connections. This value will be determined by
        the width of the inputs. Recommended values are between 64 and 300.
    window_size (int): The number of words to concatenate around each token
        to construct the convolution. Recommended value is 1.
    depth (int): The number of convolutional layers. Recommended value is 4.
    """
    # Each token is expanded to itself plus `window_size` neighbours on both
    # sides, so the Mish input is (2 * window_size + 1) times wider.
    window_width = width * ((window_size * 2) + 1)
    conv_layer = chain(
        expand_window(window_size=window_size),
        Mish(nO=width, nI=window_width, dropout=0.0, normalize=True),
    )
    encoder = clone(residual(conv_layer), depth)
    encoder.set_dim("nO", width)
    # NOTE(review): unlike MaxoutWindowEncoder, no "receptive_field" attr is
    # set here, so build_Tok2Vec_model falls back to pad=0 — confirm intended.
    return encoder
|
|
|
|
|
|
|
|
|
|
|
|
@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
def BiLSTMEncoder(
    width: int, depth: int, dropout: float
) -> Model[List[Floats2d], List[Floats2d]]:
    """Encode context using bidirectional LSTM layers. Requires PyTorch.

    width (int): The input and output width. The same value is passed to
        PyTorchLSTM for both. Recommended values are between 64 and 300.
    depth (int): The number of recurrent layers, passed to PyTorchLSTM as
        `depth`. If 0, no LSTM is built and a no-op layer is returned instead.
    dropout (float): Passed to PyTorchLSTM as `dropout`.
    """
    if depth != 0:
        lstm = PyTorchLSTM(width, width, bi=True, depth=depth, dropout=dropout)
        return with_padded(lstm)
    # Zero layers requested: pass the input through unchanged.
    return noop()
|