Refactor Tok2Vec

This commit is contained in:
Matthew Honnibal 2017-09-22 09:38:36 -05:00
parent d9124f1aa3
commit 980fb6e854

View File

@@ -227,45 +227,52 @@ def drop_layer(layer, factor=2.):
model.predict = layer
return model
def link_vectors_to_models(vocab):
    """Point every vocab entry at its vector row and publish the table.

    Each item yielded by iterating ``vocab`` gets its ``rank`` set to the
    row recorded for its ``orth`` in ``vocab.vectors.key2row``; items with
    no entry there get rank 0. The vectors data is then converted with
    ``Model.ops.asarray`` and stored in ``thinc.extra.load_nlp.VECTORS``
    under the key ``(ops.device, VECTORS_KEY)``.
    """
    vecs = vocab.vectors
    backend = Model.ops
    row_of = vecs.key2row
    for lexeme in vocab:
        key = lexeme.orth
        lexeme.rank = row_of[key] if key in row_of else 0
    table = backend.asarray(vecs.data)
    # Register the table here so that StaticVectors can access the vectors
    # (not ideal, but this is how they are currently looked up).
    registry_key = (backend.device, VECTORS_KEY)
    thinc.extra.load_nlp.VECTORS[registry_key] = table
def Tok2Vec(width, embed_size, pretrained_dims=0, **kwargs):
assert pretrained_dims is not None
def Tok2Vec(width, embed_size, **kwargs):
pretrained_dims = kwargs.get('pretrained_dims', 0)
cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3)
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
'*': reapply}):
norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix')
suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
if pretrained_dims is not None and pretrained_dims >= 1:
glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID))
embed = uniqued(
(glove | norm | prefix | suffix | shape)
>> LN(Maxout(width, width*5, pieces=3)), column=5)
else:
embed = uniqued(
(norm | prefix | suffix | shape)
>> LN(Maxout(width, width*4, pieces=3)), column=5)
trained_vectors = (
FeatureExtracter(cols)
>> with_flatten(
uniqued(
(norm | prefix | suffix | shape)
>> LN(Maxout(width, width*4, pieces=3)), column=5)
)
)
convolution = Residual(
ExtractWindow(nW=1)
>> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
)
if pretrained_dims >= 1:
embed = concatenate_lists(trained_vectors, SpacyVectors)
tok2vec = (
embed
>> with_flatten(
Affine(width, width+pretrained_dims)
>> convolution ** 4,
pad=4)
)
else:
embed = trained_vectors
tok2vec = (
embed
>> with_flatten(convolution ** 4, pad=4)
)
tok2vec = (
FeatureExtracter(cols)
>> with_flatten(
embed >> (convolution * 4), pad=4)
)
# Work around thinc API limitations :(. TODO: Revise in Thinc 7
tok2vec.nO = width