mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Refactor Tok2Vec
This commit is contained in:
parent
d9124f1aa3
commit
980fb6e854
49
spacy/_ml.py
49
spacy/_ml.py
|
@ -227,44 +227,51 @@ def drop_layer(layer, factor=2.):
|
|||
model.predict = layer
|
||||
return model
|
||||
|
||||
def link_vectors_to_models(vocab):
|
||||
vectors = vocab.vectors
|
||||
ops = Model.ops
|
||||
for word in vocab:
|
||||
if word.orth in vectors.key2row:
|
||||
word.rank = vectors.key2row[word.orth]
|
||||
else:
|
||||
word.rank = 0
|
||||
data = ops.asarray(vectors.data)
|
||||
# Set an entry here, so that vectors are accessed by StaticVectors
|
||||
# (unideal, I know)
|
||||
thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data
|
||||
|
||||
def Tok2Vec(width, embed_size, pretrained_dims=0, **kwargs):
|
||||
assert pretrained_dims is not None
|
||||
|
||||
def Tok2Vec(width, embed_size, **kwargs):
|
||||
pretrained_dims = kwargs.get('pretrained_dims', 0)
|
||||
cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3)
|
||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
|
||||
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
|
||||
'*': reapply}):
|
||||
norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
|
||||
prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix')
|
||||
suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
|
||||
shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
|
||||
if pretrained_dims is not None and pretrained_dims >= 1:
|
||||
glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID))
|
||||
|
||||
trained_vectors = (
|
||||
FeatureExtracter(cols)
|
||||
>> with_flatten(
|
||||
uniqued(
|
||||
embed = uniqued(
|
||||
(glove | norm | prefix | suffix | shape)
|
||||
>> LN(Maxout(width, width*5, pieces=3)), column=5)
|
||||
else:
|
||||
embed = uniqued(
|
||||
(norm | prefix | suffix | shape)
|
||||
>> LN(Maxout(width, width*4, pieces=3)), column=5)
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
convolution = Residual(
|
||||
ExtractWindow(nW=1)
|
||||
>> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
|
||||
)
|
||||
|
||||
if pretrained_dims >= 1:
|
||||
embed = concatenate_lists(trained_vectors, SpacyVectors)
|
||||
tok2vec = (
|
||||
embed
|
||||
FeatureExtracter(cols)
|
||||
>> with_flatten(
|
||||
Affine(width, width+pretrained_dims)
|
||||
>> convolution ** 4,
|
||||
pad=4)
|
||||
)
|
||||
else:
|
||||
embed = trained_vectors
|
||||
tok2vec = (
|
||||
embed
|
||||
>> with_flatten(convolution ** 4, pad=4)
|
||||
embed >> (convolution * 4), pad=4)
|
||||
)
|
||||
|
||||
# Work around thinc API limitations :(. TODO: Revise in Thinc 7
|
||||
|
|
Loading…
Reference in New Issue
Block a user