From 980fb6e85482d0325897775be86ef4343f880941 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 22 Sep 2017 09:38:36 -0500
Subject: [PATCH] Refactor Tok2Vec

---
 spacy/_ml.py | 59 +++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 33 insertions(+), 26 deletions(-)

diff --git a/spacy/_ml.py b/spacy/_ml.py
index 65ffb42a6..34f66233d 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -227,45 +227,52 @@ def drop_layer(layer, factor=2.):
     model.predict = layer
     return model
 
+def link_vectors_to_models(vocab):
+    vectors = vocab.vectors
+    ops = Model.ops
+    for word in vocab:
+        if word.orth in vectors.key2row:
+            word.rank = vectors.key2row[word.orth]
+        else:
+            word.rank = 0
+    data = ops.asarray(vectors.data)
+    # Set an entry here, so that vectors are accessed by StaticVectors
+    # (unideal, I know)
+    thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data
 
-def Tok2Vec(width, embed_size, pretrained_dims=0, **kwargs):
-    assert pretrained_dims is not None
+
+def Tok2Vec(width, embed_size, **kwargs):
+    pretrained_dims = kwargs.get('pretrained_dims', 0)
     cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3)
     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
-    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
+    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
+                                 '*': reapply}):
         norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
         prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix')
         suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
         shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
+        if pretrained_dims is not None and pretrained_dims >= 1:
+            glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID))
+
+            embed = uniqued(
+                (glove | norm | prefix | suffix | shape)
+                >> LN(Maxout(width, width*5, pieces=3)), column=5)
+        else:
+            embed = uniqued(
+                (norm | prefix | suffix | shape)
+                >> LN(Maxout(width, width*4, pieces=3)), column=5)
+
 
-        trained_vectors = (
-            FeatureExtracter(cols)
-            >> with_flatten(
-                uniqued(
-                    (norm | prefix | suffix | shape)
-                    >> LN(Maxout(width, width*4, pieces=3)), column=5)
-            )
-        )
         convolution = Residual(
             ExtractWindow(nW=1)
             >> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
         )
-        if pretrained_dims >= 1:
-            embed = concatenate_lists(trained_vectors, SpacyVectors)
-            tok2vec = (
-                embed
-                >> with_flatten(
-                    Affine(width, width+pretrained_dims)
-                    >> convolution ** 4,
-                    pad=4)
-            )
-        else:
-            embed = trained_vectors
-            tok2vec = (
-                embed
-                >> with_flatten(convolution ** 4, pad=4)
-            )
+        tok2vec = (
+            FeatureExtracter(cols)
+            >> with_flatten(
+                embed >> (convolution * 4), pad=4)
+        )
         # Work around thinc API limitations :(. TODO: Revise in Thinc 7
         tok2vec.nO = width
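
Note: the hunk binds '*' to a `reapply` combinator, but its definition is
outside this diff. Below is a minimal sketch of the intended semantics,
assuming thinc v6's `begin_update` protocol and its `wrap` helper: where
clone ('**') stacks n independent copies of a layer, reapply runs the
*same* layer, with shared weights, n times in sequence.

    from thinc.api import wrap

    def reapply(layer, n_times):
        def reapply_fwd(X, drop=0.):
            # Forward: feed each application's output back in as input,
            # keeping the backprop callback from every step.
            backprops = []
            for _ in range(n_times):
                X, backprop = layer.begin_update(X, drop=drop)
                backprops.append(backprop)

            def reapply_bwd(d_output, sgd=None):
                # Backward: chain the gradient through each application in
                # reverse. Weight gradients accumulate on the shared layer
                # each time its backprop runs.
                for backprop in reversed(backprops):
                    d_output = backprop(d_output, sgd=sgd)
                return d_output

            return X, reapply_bwd
        return wrap(reapply_fwd, layer)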
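
Note: a hypothetical usage sketch of the refactored factory; the model
package name and the width/embed_size values below are illustrative, not
taken from the diff. `link_vectors_to_models` should run first, so the
StaticVectors ("glove") column can look up the table it registers under
VECTORS_KEY.

    import spacy
    from spacy._ml import Tok2Vec, link_vectors_to_models

    nlp = spacy.load('en_core_web_md')  # any pipeline whose vocab has vectors
    link_vectors_to_models(nlp.vocab)   # set word ranks + register the table
    tok2vec = Tok2Vec(128, 7500,
                      pretrained_dims=nlp.vocab.vectors.data.shape[1])
    docs = [nlp(u'A quick check of the refactored factory.')]
    outputs = tok2vec(docs)             # one (n_tokens, 128) array per doc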