Support optional pre-trained vectors in tensorizer model

Matthew Honnibal 2017-09-16 12:45:37 -05:00
parent e0a2aa9289
commit 2a93404da6


@@ -21,7 +21,7 @@ from thinc.api import FeatureExtracter, with_getitem
 from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool
 from thinc.neural._classes.attention import ParametricAttention
 from thinc.linear.linear import LinearModel
-from thinc.api import uniqued, wrap, flatten_add_lengths
+from thinc.api import uniqued, wrap, flatten_add_lengths, noop
 from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER
@@ -226,7 +226,7 @@ def drop_layer(layer, factor=2.):
     return model
 
 
-def Tok2Vec(width, embed_size, preprocess=None):
+def Tok2Vec(width, embed_size, preprocess=True, pretrained_dims=0):
     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
         norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
@@ -234,18 +234,30 @@ def Tok2Vec(width, embed_size, preprocess=None):
         suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
         shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
-        embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3))
-        tok2vec = (
-            with_flatten(
-                asarray(Model.ops, dtype='uint64')
-                >> uniqued(embed, column=5)
-                >> Residual(
-                    (ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
-                ) ** 4, pad=4
-            )
-        )
-        if preprocess not in (False, None):
-            tok2vec = preprocess >> tok2vec
+        trained_vectors = (
+            FeatureExtracter(cols)
+            >> with_flatten(
+                uniqued(
+                    (norm | prefix | suffix | shape)
+                    >> LN(Maxout(width, width*4, pieces=3)), column=5)
+            )
+        )
+        if pretrained_dims:
+            embed = concatenate_lists(trained_vectors, SpacyVectors)
+        else:
+            embed = trained_vectors
+        convolution = Residual(ExtractWindow(nW=1) >> LN(Maxout(width, width*3, pieces=3)))
+        tok2vec = (
+            embed
+            >> with_flatten(
+                Affine(width, width+pretrained_dims)
+                >> convolution
+                >> convolution
+                >> convolution
+                >> convolution,
+                pad=1)
+        )
         # Work around thinc API limitations :(. TODO: Revise in Thinc 7
         tok2vec.nO = width
         tok2vec.embed = embed
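
A minimal usage sketch of the new signature, assuming the file above is spacy/_ml.py and that SpacyVectors and concatenate_lists are defined elsewhere in that module; the width, embed_size, and vector dimensionality below are illustrative values, not part of this commit:

# Hedged sketch: build the tok2vec layer with and without pre-trained vectors.
# The import path and the example sizes are assumptions, not from this commit.
from spacy._ml import Tok2Vec

# Default behaviour: hash-embedding features only (pretrained_dims=0).
tok2vec_plain = Tok2Vec(width=128, embed_size=7500)

# With pre-trained vectors: the hash-embedding output is concatenated with
# SpacyVectors via concatenate_lists, and the Affine layer projects
# width + pretrained_dims back down to `width` before the four convolutions.
tok2vec_pretrained = Tok2Vec(width=128, embed_size=7500, pretrained_dims=300)
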
@@ -457,10 +469,11 @@ def getitem(i):
 def build_tagger_model(nr_class, token_vector_width, **cfg):
     embed_size = util.env_opt('embed_size', 7500)
+    pretrained_dims = cfg.get('pretrained_dims', 0)
     with Model.define_operators({'>>': chain, '+': add}):
         # Input: (doc, tensor) tuples
-        private_tok2vec = Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats())
+        private_tok2vec = Tok2Vec(token_vector_width, embed_size,
+                                   pretrained_dims=pretrained_dims)
         model = (
             fine_tune(private_tok2vec)
             >> with_flatten(
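
A matching sketch for the tagger side, again assuming spacy/_ml.py and illustrative sizes: build_tagger_model now reads pretrained_dims from its keyword config and forwards it to the private Tok2Vec layer.

# Hedged sketch: pretrained_dims flows from the tagger config into Tok2Vec.
# nr_class and token_vector_width values are illustrative assumptions.
from spacy._ml import build_tagger_model

tagger_model = build_tagger_model(
    nr_class=17,               # e.g. number of tag classes
    token_vector_width=128,
    pretrained_dims=300,       # defaults to 0 when not supplied
)
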