Refactor Tok2Vec

2025-08-24 14:04:56 +03:00 · 2017-09-22 09:38:36 -05:00 · 2017-09-22 09:38:36 -05:00 · 980fb6e854
commit 980fb6e854
parent d9124f1aa3
1 changed file with 33 additions and 26 deletions
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -227,45 +227,52 @@ def drop_layer(layer, factor=2.):
    model.predict = layer
    return model

+def link_vectors_to_models(vocab):
+    vectors = vocab.vectors
+    ops = Model.ops
+    for word in vocab:
+        if word.orth in vectors.key2row:
+            word.rank = vectors.key2row[word.orth]
+        else:
+            word.rank = 0
+    data = ops.asarray(vectors.data)
+    # Set an entry here, so that vectors are accessed by StaticVectors
+    # (unideal, I know)
+    thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data

-def Tok2Vec(width, embed_size, pretrained_dims=0, **kwargs):
-    assert pretrained_dims is not None
+
+def Tok2Vec(width, embed_size, **kwargs):
+    pretrained_dims = kwargs.get('pretrained_dims', 0)
    cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3)
    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
-    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
+    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
+                                 '*': reapply}):
        norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
        prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix')
        suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
        shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
+        if pretrained_dims is not None and pretrained_dims >= 1:
+            glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID))
+
+            embed = uniqued(
+                (glove | norm | prefix | suffix | shape)
+                >> LN(Maxout(width, width*5, pieces=3)), column=5)
+        else:
+            embed = uniqued(
+                (norm | prefix | suffix | shape)
+                >> LN(Maxout(width, width*4, pieces=3)), column=5)
+

-        trained_vectors = (
-            FeatureExtracter(cols)
-            >> with_flatten(
-                uniqued(
-                    (norm | prefix | suffix | shape)
-                    >> LN(Maxout(width, width*4, pieces=3)), column=5)
-            )
-        )
        convolution = Residual(
            ExtractWindow(nW=1)
            >> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
        )

-        if pretrained_dims >= 1:
-            embed = concatenate_lists(trained_vectors, SpacyVectors)
-            tok2vec = (
-                embed
-                >> with_flatten(
-                    Affine(width, width+pretrained_dims)
-                    >> convolution ** 4,
-                    pad=4)
-            )
-        else:
-            embed = trained_vectors
-            tok2vec = (
-                embed
-                >> with_flatten(convolution ** 4, pad=4)
-            )
+        tok2vec = (
+            FeatureExtracter(cols)
+            >> with_flatten(
+                embed >> (convolution * 4), pad=4)
+        )

        # Work around thinc API limitations :(. TODO: Revise in Thinc 7
        tok2vec.nO = width