Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-26 09:14:32 +03:00)
Refactor Tok2Vec
commit 980fb6e854
parent d9124f1aa3
spacy/_ml.py (59 changed lines)
@@ -227,45 +227,52 @@ def drop_layer(layer, factor=2.):
         model.predict = layer
     return model
 
 
+def link_vectors_to_models(vocab):
+    vectors = vocab.vectors
+    ops = Model.ops
+    for word in vocab:
+        if word.orth in vectors.key2row:
+            word.rank = vectors.key2row[word.orth]
+        else:
+            word.rank = 0
+    data = ops.asarray(vectors.data)
+    # Set an entry here, so that vectors are accessed by StaticVectors
+    # (unideal, I know)
+    thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data
+
+
-def Tok2Vec(width, embed_size, pretrained_dims=0, **kwargs):
-    assert pretrained_dims is not None
+def Tok2Vec(width, embed_size, **kwargs):
+    pretrained_dims = kwargs.get('pretrained_dims', 0)
     cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3)
     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
-    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
+    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
+                                 '*': reapply}):
         norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
         prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix')
         suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
         shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
-        trained_vectors = (
-            FeatureExtracter(cols)
-            >> with_flatten(
-                uniqued(
-                    (norm | prefix | suffix | shape)
-                    >> LN(Maxout(width, width*4, pieces=3)), column=5)
-            )
-        )
+        if pretrained_dims is not None and pretrained_dims >= 1:
+            glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID))
+
+            embed = uniqued(
+                (glove | norm | prefix | suffix | shape)
+                >> LN(Maxout(width, width*5, pieces=3)), column=5)
+        else:
+            embed = uniqued(
+                (norm | prefix | suffix | shape)
+                >> LN(Maxout(width, width*4, pieces=3)), column=5)
 
         convolution = Residual(
             ExtractWindow(nW=1)
             >> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
         )
 
-        if pretrained_dims >= 1:
-            embed = concatenate_lists(trained_vectors, SpacyVectors)
-            tok2vec = (
-                embed
-                >> with_flatten(
-                    Affine(width, width+pretrained_dims)
-                    >> convolution ** 4,
-                    pad=4)
-            )
-        else:
-            embed = trained_vectors
-            tok2vec = (
-                embed
-                >> with_flatten(convolution ** 4, pad=4)
-            )
+        tok2vec = (
+            FeatureExtracter(cols)
+            >> with_flatten(
+                embed >> (convolution * 4), pad=4)
+        )
 
         # Work around thinc API limitations :(. TODO: Revise in Thinc 7
         tok2vec.nO = width
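A note on what the diff does, plus a minimal usage sketch. The refactored Tok2Vec keeps the hash-embedding-plus-CNN design, but the embedding is now wrapped in a single uniqued table, with the StaticVectors column concatenated in when pretrained_dims >= 1 (widening the Maxout input from width*4 to width*5). The operator table also binds '*' to reapply, so `convolution * 4` reapplies one residual CNN block four times (presumably sharing its weights), where the old `convolution ** 4` used clone to stack four independently parameterized copies. The sketch below is hypothetical usage, not part of the commit; it assumes a spaCy 2.0-alpha-era install where spacy._ml exposes these helpers as shown.

    # Hypothetical usage sketch (not from the commit); names follow the diff above.
    from spacy.lang.en import English
    from spacy._ml import Tok2Vec, link_vectors_to_models

    nlp = English()

    # Publish the vocab's vectors into thinc.extra.load_nlp.VECTORS under
    # VECTORS_KEY, which is where the StaticVectors branch of Tok2Vec now
    # looks them up, and set each word's rank to its vector-table row.
    link_vectors_to_models(nlp.vocab)

    # pretrained_dims moved into **kwargs; the default of 0 skips the
    # StaticVectors column and selects the width*4 embedding.
    tok2vec = Tok2Vec(width=128, embed_size=7500, pretrained_dims=0)

    # The model starts with FeatureExtracter(cols), so it consumes a list
    # of Docs and returns one (n_tokens, width) array per Doc.
    doc = nlp(u'A short example sentence.')
    vectors = tok2vec([doc])

With pretrained vectors loaded, passing pretrained_dims=nlp.vocab.vectors.data.shape[1] would enable the StaticVectors branch instead.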