From 980fb6e85482d0325897775be86ef4343f880941 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 22 Sep 2017 09:38:36 -0500
Subject: [PATCH] Refactor Tok2Vec

---
 spacy/_ml.py | 59 +++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 33 insertions(+), 26 deletions(-)

diff --git a/spacy/_ml.py b/spacy/_ml.py
index 65ffb42a6..34f66233d 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -227,45 +227,52 @@ def drop_layer(layer, factor=2.):
     model.predict = layer
     return model
 
+def link_vectors_to_models(vocab):
+    vectors = vocab.vectors
+    ops = Model.ops
+    for word in vocab:
+        if word.orth in vectors.key2row:
+            word.rank = vectors.key2row[word.orth]
+        else:
+            word.rank = 0
+    data = ops.asarray(vectors.data)
+    # Set an entry here, so that vectors are accessed by StaticVectors
+    # (unideal, I know)
+    thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data
 
-def Tok2Vec(width, embed_size, pretrained_dims=0, **kwargs):
-    assert pretrained_dims is not None
+
+def Tok2Vec(width, embed_size, **kwargs):
+    pretrained_dims = kwargs.get('pretrained_dims', 0)
     cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3)
     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
-    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
+    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
+                                 '*': reapply}):
         norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
         prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix')
         suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
         shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
+        if pretrained_dims is not None and pretrained_dims >= 1:
+            glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID))
+
+            embed = uniqued(
+                (glove | norm | prefix | suffix | shape)
+                >> LN(Maxout(width, width*5, pieces=3)), column=5)
+        else:
+            embed = uniqued(
+                (norm | prefix | suffix | shape)
+                >> LN(Maxout(width, width*4, pieces=3)), column=5)
+
 
-        trained_vectors = (
-            FeatureExtracter(cols)
-            >> with_flatten(
-                uniqued(
-                    (norm | prefix | suffix | shape)
-                    >> LN(Maxout(width, width*4, pieces=3)), column=5)
-            )
-        )
         convolution = Residual(
             ExtractWindow(nW=1)
             >> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
         )
-        if pretrained_dims >= 1:
-            embed = concatenate_lists(trained_vectors, SpacyVectors)
-            tok2vec = (
-                embed
-                >> with_flatten(
-                    Affine(width, width+pretrained_dims)
-                    >> convolution ** 4,
-                    pad=4)
-            )
-        else:
-            embed = trained_vectors
-            tok2vec = (
-                embed
-                >> with_flatten(convolution ** 4, pad=4)
-            )
+        tok2vec = (
+            FeatureExtracter(cols)
+            >> with_flatten(
+                embed >> (convolution * 4), pad=4)
+        )
         # Work around thinc API limitations :(. TODO: Revise in Thinc 7
         tok2vec.nO = width
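
Note: the hunk binds '*' to a `reapply` combinator, but its definition is
outside this diff. Below is a minimal sketch of the intended semantics,
assuming thinc v6's `begin_update` protocol and its `wrap` helper: where
clone ('**') stacks n independent copies of a layer, reapply runs the
*same* layer, with shared weights, n times in sequence.

    from thinc.api import wrap

    def reapply(layer, n_times):
        def reapply_fwd(X, drop=0.):
            # Forward: feed each application's output back in as input,
            # keeping the backprop callback from every step.
            backprops = []
            for _ in range(n_times):
                X, backprop = layer.begin_update(X, drop=drop)
                backprops.append(backprop)

            def reapply_bwd(d_output, sgd=None):
                # Backward: chain the gradient through each application in
                # reverse. Weight gradients accumulate on the shared layer
                # each time its backprop runs.
                for backprop in reversed(backprops):
                    d_output = backprop(d_output, sgd=sgd)
                return d_output

            return X, reapply_bwd
        return wrap(reapply_fwd, layer)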
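
Note: a hypothetical usage sketch of the refactored factory; the model
package name and the width/embed_size values below are illustrative, not
taken from the diff. `link_vectors_to_models` should run first, so the
StaticVectors ("glove") column can look up the table it registers under
VECTORS_KEY.

    import spacy
    from spacy._ml import Tok2Vec, link_vectors_to_models

    nlp = spacy.load('en_core_web_md')  # any pipeline whose vocab has vectors
    link_vectors_to_models(nlp.vocab)   # set word ranks + register the table
    tok2vec = Tok2Vec(128, 7500,
                      pretrained_dims=nlp.vocab.vectors.data.shape[1])
    docs = [nlp(u'A quick check of the refactored factory.')]
    outputs = tok2vec(docs)             # one (n_tokens, 128) array per doc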