Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-26 09:14:32 +03:00)
Refactor Tok2Vec
commit 980fb6e854
parent d9124f1aa3
spacy/_ml.py (59 changed lines)
@@ -227,45 +227,52 @@ def drop_layer(layer, factor=2.):
         model.predict = layer
     return model
 
 
+def link_vectors_to_models(vocab):
+    vectors = vocab.vectors
+    ops = Model.ops
+    for word in vocab:
+        if word.orth in vectors.key2row:
+            word.rank = vectors.key2row[word.orth]
+        else:
+            word.rank = 0
+    data = ops.asarray(vectors.data)
+    # Set an entry here, so that vectors are accessed by StaticVectors
+    # (unideal, I know)
+    thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data
+
+
-def Tok2Vec(width, embed_size, pretrained_dims=0, **kwargs):
-    assert pretrained_dims is not None
+def Tok2Vec(width, embed_size, **kwargs):
+    pretrained_dims = kwargs.get('pretrained_dims', 0)
     cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3)
     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
-    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
+    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
+                                 '*': reapply}):
         norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
         prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix')
         suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
         shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
-        trained_vectors = (
-            FeatureExtracter(cols)
-            >> with_flatten(
-                uniqued(
-                    (norm | prefix | suffix | shape)
-                    >> LN(Maxout(width, width*4, pieces=3)), column=5)
-            )
-        )
+        if pretrained_dims is not None and pretrained_dims >= 1:
+            glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID))
+
+            embed = uniqued(
+                (glove | norm | prefix | suffix | shape)
+                >> LN(Maxout(width, width*5, pieces=3)), column=5)
+        else:
+            embed = uniqued(
+                (norm | prefix | suffix | shape)
+                >> LN(Maxout(width, width*4, pieces=3)), column=5)
 
         convolution = Residual(
             ExtractWindow(nW=1)
             >> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
         )
 
-        if pretrained_dims >= 1:
-            embed = concatenate_lists(trained_vectors, SpacyVectors)
-            tok2vec = (
-                embed
-                >> with_flatten(
-                    Affine(width, width+pretrained_dims)
-                    >> convolution ** 4,
-                    pad=4)
-            )
-        else:
-            embed = trained_vectors
-            tok2vec = (
-                embed
-                >> with_flatten(convolution ** 4, pad=4)
-            )
+        tok2vec = (
+            FeatureExtracter(cols)
+            >> with_flatten(
+                embed >> (convolution * 4), pad=4)
+        )
 
         # Work around thinc API limitations :(. TODO: Revise in Thinc 7
         tok2vec.nO = width
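A note on what the diff does, plus a minimal usage sketch. The refactored Tok2Vec keeps the hash-embedding-plus-CNN design, but the embedding is now wrapped in a single uniqued table, with the StaticVectors column concatenated in when pretrained_dims >= 1 (widening the Maxout input from width*4 to width*5). The operator table also binds '*' to reapply, so `convolution * 4` reapplies one residual CNN block four times (presumably sharing its weights), where the old `convolution ** 4` used clone to stack four independently parameterized copies. The sketch below is hypothetical usage, not part of the commit; it assumes a spaCy 2.0-alpha-era install where spacy._ml exposes these helpers as shown.

    # Hypothetical usage sketch (not from the commit); names follow the diff above.
    from spacy.lang.en import English
    from spacy._ml import Tok2Vec, link_vectors_to_models

    nlp = English()

    # Publish the vocab's vectors into thinc.extra.load_nlp.VECTORS under
    # VECTORS_KEY, which is where the StaticVectors branch of Tok2Vec now
    # looks them up, and set each word's rank to its vector-table row.
    link_vectors_to_models(nlp.vocab)

    # pretrained_dims moved into **kwargs; the default of 0 skips the
    # StaticVectors column and selects the width*4 embedding.
    tok2vec = Tok2Vec(width=128, embed_size=7500, pretrained_dims=0)

    # The model starts with FeatureExtracter(cols), so it consumes a list
    # of Docs and returns one (n_tokens, width) array per Doc.
    doc = nlp(u'A short example sentence.')
    vectors = tok2vec([doc])

With pretrained vectors loaded, passing pretrained_dims=nlp.vocab.vectors.data.shape[1] would enable the StaticVectors branch instead.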