mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 02:36:32 +03:00
Support optional pre-trained vectors in tensorizer model
This commit is contained in:
parent
e0a2aa9289
commit
2a93404da6
41
spacy/_ml.py
41
spacy/_ml.py
|
@@ -21,7 +21,7 @@ from thinc.api import FeatureExtracter, with_getitem
|
||||||
from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool
|
from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool
|
||||||
from thinc.neural._classes.attention import ParametricAttention
|
from thinc.neural._classes.attention import ParametricAttention
|
||||||
from thinc.linear.linear import LinearModel
|
from thinc.linear.linear import LinearModel
|
||||||
from thinc.api import uniqued, wrap, flatten_add_lengths
|
from thinc.api import uniqued, wrap, flatten_add_lengths, noop
|
||||||
|
|
||||||
|
|
||||||
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER
|
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER
|
||||||
|
@@ -226,7 +226,7 @@ def drop_layer(layer, factor=2.):
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
def Tok2Vec(width, embed_size, preprocess=None):
|
def Tok2Vec(width, embed_size, preprocess=True, pretrained_dims=0):
|
||||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||||
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
|
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
|
||||||
norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
|
norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
|
||||||
|
@@ -234,18 +234,30 @@ def Tok2Vec(width, embed_size, preprocess=None):
|
||||||
suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
|
suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
|
||||||
shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
|
shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
|
||||||
|
|
||||||
embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3))
|
trained_vectors = (
|
||||||
tok2vec = (
|
FeatureExtracter(cols)
|
||||||
with_flatten(
|
>> with_flatten(
|
||||||
asarray(Model.ops, dtype='uint64')
|
uniqued(
|
||||||
>> uniqued(embed, column=5)
|
(norm | prefix | suffix | shape)
|
||||||
>> Residual(
|
>> LN(Maxout(width, width*4, pieces=3)), column=5)
|
||||||
(ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
|
|
||||||
) ** 4, pad=4
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
if preprocess not in (False, None):
|
if pretrained_dims:
|
||||||
tok2vec = preprocess >> tok2vec
|
embed = concatenate_lists(trained_vectors, SpacyVectors)
|
||||||
|
else:
|
||||||
|
embed = trained_vectors
|
||||||
|
convolution = Residual(ExtractWindow(nW=1) >> LN(Maxout(width, width*3, pieces=3)))
|
||||||
|
|
||||||
|
tok2vec = (
|
||||||
|
embed
|
||||||
|
>> with_flatten(
|
||||||
|
Affine(width, width+pretrained_dims)
|
||||||
|
>> convolution
|
||||||
|
>> convolution
|
||||||
|
>> convolution
|
||||||
|
>> convolution,
|
||||||
|
pad=1)
|
||||||
|
)
|
||||||
# Work around thinc API limitations :(. TODO: Revise in Thinc 7
|
# Work around thinc API limitations :(. TODO: Revise in Thinc 7
|
||||||
tok2vec.nO = width
|
tok2vec.nO = width
|
||||||
tok2vec.embed = embed
|
tok2vec.embed = embed
|
||||||
|
@@ -457,10 +469,11 @@ def getitem(i):
|
||||||
|
|
||||||
def build_tagger_model(nr_class, token_vector_width, **cfg):
|
def build_tagger_model(nr_class, token_vector_width, **cfg):
|
||||||
embed_size = util.env_opt('embed_size', 7500)
|
embed_size = util.env_opt('embed_size', 7500)
|
||||||
|
pretrained_dims = cfg.get('pretrained_dims', 0)
|
||||||
with Model.define_operators({'>>': chain, '+': add}):
|
with Model.define_operators({'>>': chain, '+': add}):
|
||||||
# Input: (doc, tensor) tuples
|
# Input: (doc, tensor) tuples
|
||||||
private_tok2vec = Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats())
|
private_tok2vec = Tok2Vec(token_vector_width, embed_size,
|
||||||
|
pretrained_dims=pretrained_dims)
|
||||||
model = (
|
model = (
|
||||||
fine_tune(private_tok2vec)
|
fine_tune(private_tok2vec)
|
||||||
>> with_flatten(
|
>> with_flatten(
|
||||||
|
|
Loading…
Reference in New Issue
Block a user