Implement fancier initialisation for precomputed layer

Matthew Honnibal 2017-10-20 03:07:45 +02:00
parent 827cd8a883
commit 64658e02e5


@@ -13,7 +13,8 @@ from thinc.api import uniqued, wrap, flatten_add_lengths, noop
 from thinc.linear.linear import LinearModel
 from thinc.neural.ops import NumpyOps, CupyOps
-from thinc.neural.util import get_array_module
+from thinc.neural.util import get_array_module, copy_array
+from thinc.neural._lsuv import svd_orthonormal
 import random
 import cytoolz
@@ -22,6 +23,7 @@ from thinc import describe
 from thinc.describe import Dimension, Synapses, Biases, Gradient
 from thinc.neural._classes.affine import _set_dimensions_if_needed
 import thinc.extra.load_nlp
+from thinc.neural._lsuv import svd_orthonormal
 from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER
 from .tokens.doc import Doc
@@ -102,22 +104,14 @@ def _preprocess_doc(docs, drop=0.):
     return (keys, vals, lengths), None
 
-def _init_for_precomputed(W, ops):
-    if (W**2).sum() != 0.:
-        return
-    W = W.reshape((W.shape[0] * W.shape[1], W.shape[2]))
-    ops.xavier_uniform_init(W, inplace=True)
-    return W
 
-@describe.on_data(_set_dimensions_if_needed)
+@describe.on_data(_set_dimensions_if_needed,
+    lambda model, X, y: model.init_weights(model))
 @describe.attributes(
     nI=Dimension("Input size"),
     nF=Dimension("Number of features"),
     nO=Dimension("Output size"),
     W=Synapses("Weights matrix",
-        lambda obj: (obj.nI, obj.nF, obj.nO),
-        lambda W, ops: _init_for_precomputed(W, ops)),
+        lambda obj: (obj.nI, obj.nF, obj.nO)),
     b=Biases("Bias vector",
         lambda obj: (obj.nO,)),
     d_W=Gradient("W"),
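The hunk above rewires initialisation: instead of a per-weights init callback on the `Synapses` description, the new `init_weights` runs as a `describe.on_data` hook, i.e. once, when the model first sees training data and its dimensions are known. A minimal sketch of that deferred-hook pattern (not thinc's actual implementation; `ToyModel` and the hook bookkeeping are illustrative):

```python
from typing import Callable, List

def on_data(*callbacks: Callable) -> Callable:
    # Register callbacks to run once, when the model first sees real
    # data, so shapes and weight init can depend on that data.
    def decorator(cls):
        hooks: List[Callable] = list(getattr(cls, "on_data_hooks", []))
        hooks.extend(callbacks)
        cls.on_data_hooks = hooks
        return cls
    return decorator

class ToyModel:
    on_data_hooks: List[Callable] = []

    def begin_training(self, X, y):
        for hook in self.on_data_hooks:
            hook(self, X, y)  # e.g. set dimensions, then init weights
```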
@@ -173,6 +167,52 @@ class PrecomputableAffine(Model):
         weights = self.ops.xp.ascontiguousarray(weights)
         return weights.reshape(shape)
 
+    @staticmethod
+    def init_weights(model):
+        '''This is like the 'layer sequential unit variance', but instead
+        of taking the actual inputs, we randomly generate whitened data.
+
+        Why's this all so complicated? We have a huge number of inputs,
+        and the maxout unit makes guessing the dynamics tricky. Instead
+        we set the maxout weights to values that empirically result in
+        whitened outputs given whitened inputs.
+        '''
+        if (model.W**2).sum() != 0.:
+            return
+        model.ops.normal_init(model.W, model.nF*model.nI, inplace=True)
+
+        ids = numpy.zeros((5000, model.nF), dtype='i')
+        ids += numpy.asarray(numpy.random.uniform(0, 1000, ids.shape), dtype='i')
+        tokvecs = numpy.zeros((5000, model.nI), dtype='f')
+        tokvecs += numpy.random.normal(loc=0., scale=1.,
+                                       size=tokvecs.size).reshape(tokvecs.shape)
+
+        def predict(ids, tokvecs):
+            hiddens = model(tokvecs)
+            vector = model.ops.allocate((hiddens.shape[0], model.nO))
+            model.ops.scatter_add(vector, ids, hiddens)
+            vector += model.b
+            if model.nP >= 2:
+                vector = vector.reshape((ids.shape[0], model.nO//model.nP, model.nP))
+                return model.ops.maxout(vector)[0]
+            else:
+                return vector * (vector >= 0)
+
+        tol_var = 0.01
+        tol_mean = 0.01
+        t_max = 10
+        for t_i in range(t_max):
+            acts1 = predict(ids, tokvecs)
+            var = numpy.var(acts1)
+            mean = numpy.mean(acts1)
+            if abs(var - 1.0) >= tol_var:
+                model.W /= numpy.sqrt(var)
+            elif abs(mean) >= tol_mean:
+                model.b -= mean
+            else:
+                break
 
 # Thinc's Embed class is a bit broken atm, so drop this here.
 from thinc import describe
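The loop in `init_weights` is easiest to see on a toy layer. Below is a minimal standalone sketch of the same idea (all names hypothetical; a plain ReLU layer in numpy stands in for the precomputed maxout layer): generate whitened random inputs, then alternately rescale the weights until the output variance is near 1 and shift the bias until the output mean is near 0.

```python
import numpy

def lsuv_like_init(W, b, n_samples=5000, tol_var=0.01, tol_mean=0.01, t_max=10):
    # W: (nI, nO) weights, b: (nO,) bias, both modified in place.
    nI, nO = W.shape
    # Whitened synthetic data stands in for real inputs.
    X = numpy.random.normal(loc=0., scale=1., size=(n_samples, nI)).astype('f')
    for _ in range(t_max):
        acts = numpy.maximum(X @ W + b, 0.)   # toy ReLU forward pass
        var, mean = numpy.var(acts), numpy.mean(acts)
        if abs(var - 1.0) >= tol_var:
            W /= numpy.sqrt(var)              # drive output variance towards 1
        elif abs(mean) >= tol_mean:
            b -= mean                         # then drive output mean towards 0
        else:
            break
    return W, b

W = numpy.random.normal(scale=1. / numpy.sqrt(300), size=(300, 128)).astype('f')
b = numpy.zeros(128, dtype='f')
W, b = lsuv_like_init(W, b)
```

As in the commit, the loop never measures the real training data: whitened fake inputs are assumed to be representative enough to set the output scale, which is what makes the hook cheap to run at `begin_training` time.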