2020-07-28 01:52:50 +03:00
|
|
|
from typing import Optional, List
|
2020-07-28 14:51:43 +03:00
|
|
|
from thinc.api import chain, clone, concatenate, with_array, with_padded
|
2020-07-28 16:51:40 +03:00
|
|
|
from thinc.api import Model, noop, list2ragged, ragged2list
|
|
|
|
from thinc.api import FeatureExtractor, HashEmbed
|
2020-07-29 15:45:09 +03:00
|
|
|
from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
|
2020-07-28 01:52:50 +03:00
|
|
|
from thinc.types import Floats2d
|
2020-02-27 20:42:27 +03:00
|
|
|
|
2020-07-28 16:51:40 +03:00
|
|
|
from ...tokens import Doc
|
2020-03-08 15:23:18 +03:00
|
|
|
from ...util import registry
|
2020-02-27 20:42:27 +03:00
|
|
|
from ...ml import _character_embed
|
2020-07-28 16:51:40 +03:00
|
|
|
from ..staticvectors import StaticVectors
|
2020-02-27 20:42:27 +03:00
|
|
|
from ...pipeline.tok2vec import Tok2VecListener
|
2020-07-31 18:02:54 +03:00
|
|
|
from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE
|
2020-02-27 20:42:27 +03:00
|
|
|
|
|
|
|
|
2020-07-28 14:51:43 +03:00
|
|
|
@registry.architectures.register("spacy.Tok2VecListener.v1")
def tok2vec_listener_v1(width: int, upstream: str = "*"):
    """Create a Tok2VecListener layer.

    width: Passed to the listener as its `width`.
    upstream: Passed to the listener as `upstream_name`. Defaults to "*"
        (presumably a match-any wildcard -- confirm against Tok2VecListener).
    RETURNS: The newly constructed Tok2VecListener.
    """
    return Tok2VecListener(upstream_name=upstream, width=width)
|
|
|
|
|
|
|
|
|
2020-07-28 23:43:06 +03:00
|
|
|
@registry.architectures.register("spacy.HashEmbedCNN.v1")
def build_hash_embed_cnn_tok2vec(
    *,
    width: int,
    depth: int,
    embed_size: int,
    window_size: int,
    maxout_pieces: int,
    subword_features: bool,
    dropout: Optional[float],
    pretrained_vectors: Optional[bool],
) -> Model[List[Doc], List[Floats2d]]:
    """Build spaCy's 'standard' tok2vec layer: a MultiHashEmbed embedding
    followed by a MaxoutWindowEncoder CNN, joined by build_Tok2Vec_model.

    width: Shared by the embedding layer and the encoder.
    depth: Forwarded to the encoder.
    embed_size: Forwarded to the embedding layer as `rows`.
    window_size: Forwarded to the encoder.
    maxout_pieces: Forwarded to the encoder.
    subword_features: Forwarded to the embedding layer as
        `also_embed_subwords`.
    dropout: Accepted as part of the signature but not referenced anywhere
        in this function.
    pretrained_vectors: Coerced with bool() (so None becomes False) and
        forwarded to the embedding layer as `also_use_static_vectors`.
    RETURNS: The combined tok2vec model.
    """
    # The embedding layer is constructed before the encoder, as before.
    embedder = MultiHashEmbed(
        width=width,
        rows=embed_size,
        also_embed_subwords=subword_features,
        also_use_static_vectors=bool(pretrained_vectors),
    )
    encoder = MaxoutWindowEncoder(
        width=width,
        depth=depth,
        window_size=window_size,
        maxout_pieces=maxout_pieces,
    )
    return build_Tok2Vec_model(embed=embedder, encode=encoder)
|
|
|
|
|
2020-07-31 18:02:54 +03:00
|
|
|
|
2020-02-27 20:42:27 +03:00
|
|
|
@registry.architectures.register("spacy.Tok2Vec.v1")
def build_Tok2Vec_model(
    embed: Model[List[Doc], List[Floats2d]],
    encode: Model[List[Floats2d], List[Floats2d]],
) -> Model[List[Doc], List[Floats2d]]:
    """Chain an embedding layer and an encoding layer into one tok2vec model.

    embed: The embedding layer; becomes the first layer of the chain and is
        registered under the ref "embed".
    encode: The encoding layer; wrapped in with_array before chaining and
        registered (unwrapped) under the ref "encode".
    RETURNS: The chained model, with its "nO" dim copied from `encode`.
    """
    # Encoders may advertise how much padding they need via the
    # "receptive_field" attr; encoders that don't set it get pad=0.
    pad = encode.attrs.get("receptive_field", 0)
    padded_encode = with_array(encode, pad=pad)
    model = chain(embed, padded_encode)
    model.set_dim("nO", encode.get_dim("nO"))
    for ref_name, layer in (("embed", embed), ("encode", encode)):
        model.set_ref(ref_name, layer)
    return model
|
|
|
|
|
|
|
|
|
2020-07-28 16:51:40 +03:00
|
|
|
@registry.architectures.register("spacy.MultiHashEmbed.v1")
def MultiHashEmbed(
    width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool
):
    """Build an embedding layer from one or more HashEmbed tables, optionally
    concatenated with StaticVectors, and mixed down by a Maxout layer.

    width: Output width of every HashEmbed table, of StaticVectors, and of
        the final Maxout layer.
    rows: Number of rows of the NORM table; every other table gets
        `rows // 2`.
    also_embed_subwords: If true, PREFIX, SUFFIX and SHAPE tables are added
        next to the NORM table; otherwise only NORM is embedded.
    also_use_static_vectors: If true, a StaticVectors layer is concatenated
        with the hash embeddings before the Maxout layer.
    RETURNS: The assembled embedding model.
    """
    cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    if also_embed_subwords:
        embedded_attrs = [NORM, PREFIX, SUFFIX, SHAPE]
    else:
        embedded_attrs = [NORM]
    # Each table gets its own seed: 8 for the first table, 9 for the next,
    # and so on, in the order the attributes are listed above.
    embeddings = [
        HashEmbed(
            width,
            rows if attr == NORM else rows // 2,
            column=cols.index(attr),
            seed=table_seed,
            dropout=0.0,
        )
        for table_seed, attr in enumerate(embedded_attrs, start=8)
    ]
    # The bool counts as 0 or 1: one extra `width`-sized slice when the
    # static vectors are concatenated in.
    concat_size = width * (len(embeddings) + also_use_static_vectors)
    hash_layers = [
        FeatureExtractor(cols),
        list2ragged(),
        with_array(concatenate(*embeddings)),
    ]
    if also_use_static_vectors:
        # Hash features form their own sub-chain so they can be concatenated
        # with the static vectors as a single branch.
        leading = [
            concatenate(chain(*hash_layers), StaticVectors(width, dropout=0.0))
        ]
    else:
        # Without static vectors the hash layers sit directly in the outer
        # chain, with no extra nesting.
        leading = hash_layers
    model = chain(
        *leading,
        with_array(Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)),
        ragged2list(),
    )
    return model
|
2020-07-28 23:02:34 +03:00
|
|
|
|
2020-02-27 20:42:27 +03:00
|
|
|
|
2020-03-08 15:23:18 +03:00
|
|
|
@registry.architectures.register("spacy.CharacterEmbed.v1")
def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
    """Build an embedding layer that concatenates the output of
    `_character_embed.CharacterEmbed` with a hash embedding of NORM, then
    mixes the result down to `width` with a Maxout layer.

    width: Output width of the NORM HashEmbed table and of the final Maxout.
    rows: Number of rows (`nV`) of the NORM HashEmbed table.
    nM: Forwarded to `_character_embed.CharacterEmbed`.
    nC: Forwarded to `_character_embed.CharacterEmbed`.
    RETURNS: The assembled embedding model.
    """
    char_branch = chain(
        _character_embed.CharacterEmbed(nM=nM, nC=nC),
        list2ragged(),
    )
    norm_branch = chain(
        FeatureExtractor([NORM]),
        list2ragged(),
        with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
    )
    # The Maxout input size is declared as nM * nC (character branch) plus
    # `width` (NORM branch).
    mixer = with_array(
        Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)
    )
    return chain(concatenate(char_branch, norm_branch), mixer, ragged2list())
|
2020-02-27 20:42:27 +03:00
|
|
|
|
|
|
|
|
|
|
|
@registry.architectures.register("spacy.MaxoutWindowEncoder.v1")
def MaxoutWindowEncoder(width: int, window_size: int, maxout_pieces: int, depth: int):
    """Build a stack of residual window-convolution blocks with a normalized
    Maxout activation.

    width: Input and output width of every block.
    window_size: Passed to expand_window; the Maxout input size is
        `width * (2 * window_size + 1)`.
    maxout_pieces: Number of Maxout pieces (`nP`).
    depth: Number of residual blocks (passed to clone).
    RETURNS: The encoder, with dim "nO" set to `width` and the attr
        "receptive_field" set to `window_size * depth`.
    """
    windows = expand_window(window_size=window_size)
    activation = Maxout(
        nO=width,
        nI=width * ((window_size * 2) + 1),
        nP=maxout_pieces,
        dropout=0.0,
        normalize=True,
    )
    block = residual(chain(windows, activation))
    encoder = clone(block, depth)
    encoder.set_dim("nO", width)
    # Read back by build_Tok2Vec_model to decide how much padding to use.
    encoder.attrs["receptive_field"] = window_size * depth
    return encoder
|
|
|
|
|
|
|
|
|
|
|
|
@registry.architectures.register("spacy.MishWindowEncoder.v1")
def MishWindowEncoder(width: int, window_size: int, depth: int):
    """Build a stack of residual window-convolution blocks with a normalized
    Mish activation.

    width: Input and output width of every block.
    window_size: Passed to expand_window; the Mish input size is
        `width * (2 * window_size + 1)`.
    depth: Number of residual blocks (passed to clone).
    RETURNS: The encoder, with dim "nO" set to `width`.

    NOTE(review): unlike MaxoutWindowEncoder, no "receptive_field" attr is
    set here, so build_Tok2Vec_model falls back to pad=0 for this encoder --
    confirm that is intended.
    """
    cnn = chain(
        expand_window(window_size=window_size),
        Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True),
    )
    model = clone(residual(cnn), depth)
    model.set_dim("nO", width)
    return model
|
|
|
|
|
|
|
|
|
|
|
|
@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
def BiLSTMEncoder(width: int, depth: int, dropout: float):
    """Build a bidirectional PyTorch LSTM encoder.

    width: Used for both positional size arguments of PyTorchLSTM.
    depth: Forwarded to PyTorchLSTM as `depth`. A depth of 0 short-circuits
        to a no-op layer.
    dropout: Forwarded to PyTorchLSTM as `dropout`; ignored when depth is 0.
    RETURNS: noop() when `depth == 0`, otherwise the PyTorchLSTM wrapped in
        with_padded.
    """
    if depth == 0:
        # No LSTM is constructed at all in this case.
        return noop()
    return with_padded(PyTorchLSTM(width, width, bi=True, depth=depth, dropout=dropout))
|