Mirror of https://github.com/explosion/spaCy.git (synced 2025-07-13 01:32:32 +03:00)
Implement fancier initialisation for precomputed layer
This commit is contained in:
parent 827cd8a883
commit 64658e02e5

spacy/_ml.py | 64
@@ -13,7 +13,8 @@ from thinc.api import uniqued, wrap, flatten_add_lengths, noop
 
 from thinc.linear.linear import LinearModel
 from thinc.neural.ops import NumpyOps, CupyOps
-from thinc.neural.util import get_array_module
+from thinc.neural.util import get_array_module, copy_array
+from thinc.neural._lsuv import svd_orthonormal
 
 import random
 import cytoolz
@@ -22,6 +23,7 @@ from thinc import describe
 from thinc.describe import Dimension, Synapses, Biases, Gradient
 from thinc.neural._classes.affine import _set_dimensions_if_needed
 import thinc.extra.load_nlp
+from thinc.neural._lsuv import svd_orthonormal
 
 from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER
 from .tokens.doc import Doc
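The newly imported `svd_orthonormal` is thinc's LSUV helper, after Mishkin & Matas (2015): draw Gaussian noise, then replace it with an orthonormal factor of its own SVD. It is not called anywhere in the hunks shown here. As a rough NumPy sketch of the construction (an illustration of the idea, not thinc's exact implementation):

    import numpy

    def svd_orthonormal(shape):
        # LSUV-style orthonormal init: start from Gaussian noise and keep
        # whichever SVD factor has the right shape (Mishkin & Matas, 2015).
        if len(shape) < 2:
            raise ValueError("Only shapes of length 2 or more are supported.")
        flat_shape = (shape[0], int(numpy.prod(shape[1:])))
        a = numpy.random.standard_normal(flat_shape)
        u, _, v = numpy.linalg.svd(a, full_matrices=False)
        q = u if u.shape == flat_shape else v
        return q.reshape(shape)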
@@ -102,22 +104,14 @@ def _preprocess_doc(docs, drop=0.):
     return (keys, vals, lengths), None
 
 
-def _init_for_precomputed(W, ops):
-    if (W**2).sum() != 0.:
-        return
-    W = W.reshape((W.shape[0] * W.shape[1], W.shape[2]))
-    ops.xavier_uniform_init(W, inplace=True)
-    return W
-
-
-@describe.on_data(_set_dimensions_if_needed)
+@describe.on_data(_set_dimensions_if_needed,
+    lambda model, X, y: model.init_weights(model))
 @describe.attributes(
     nI=Dimension("Input size"),
     nF=Dimension("Number of features"),
     nO=Dimension("Output size"),
     W=Synapses("Weights matrix",
-        lambda obj: (obj.nI, obj.nF, obj.nO),
-        lambda W, ops: _init_for_precomputed(W, ops)),
+        lambda obj: (obj.nI, obj.nF, obj.nO)),
     b=Biases("Bias vector",
         lambda obj: (obj.nO,)),
     d_W=Gradient("W"),
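For contrast, the deleted `_init_for_precomputed` simply flattened the (nI, nF, nO) weight tensor into an (nI*nF, nO) matrix and filled it with Xavier-uniform noise. A minimal NumPy stand-in for that old scheme (the scaling rule is Glorot & Bengio's; `xavier_uniform_init` here is an illustrative re-implementation, not thinc's op):

    import numpy

    def xavier_uniform_init(W):
        # Uniform noise in [-limit, limit], limit = sqrt(6 / (fan_in + fan_out)).
        limit = numpy.sqrt(6. / (W.shape[0] + W.shape[1]))
        W[:] = numpy.random.uniform(-limit, limit, W.shape)

    nI, nF, nO = 64, 10, 32
    W = numpy.zeros((nI, nF, nO), dtype='f')
    xavier_uniform_init(W.reshape((nI * nF, nO)))  # writes through the view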
@@ -173,6 +167,52 @@ class PrecomputableAffine(Model):
         weights = self.ops.xp.ascontiguousarray(weights)
         return weights.reshape(shape)
 
+    @staticmethod
+    def init_weights(model):
+        '''This is like the 'layer sequential unit variance', but instead
+        of taking the actual inputs, we randomly generate whitened data.
+
+        Why's this all so complicated? We have a huge number of inputs,
+        and the maxout unit makes guessing the dynamics tricky. Instead
+        we set the maxout weights to values that empirically result in
+        whitened outputs given whitened inputs.
+        '''
+        if (model.W**2).sum() != 0.:
+            return
+        model.ops.normal_init(model.W, model.nF*model.nI, inplace=True)
+
+        ids = numpy.zeros((5000, model.nF), dtype='i')
+        ids += numpy.asarray(numpy.random.uniform(0, 1000, ids.shape), dtype='i')
+        tokvecs = numpy.zeros((5000, model.nI), dtype='f')
+        tokvecs += numpy.random.normal(loc=0., scale=1.,
+                                       size=tokvecs.size).reshape(tokvecs.shape)
+
+        def predict(ids, tokvecs):
+            hiddens = model(tokvecs)
+            vector = model.ops.allocate((hiddens.shape[0], model.nO))
+            model.ops.scatter_add(vector, ids, hiddens)
+            vector += model.b
+            if model.nP >= 2:
+                vector = vector.reshape((ids.shape[0], model.nO//model.nP, model.nP))
+                return model.ops.maxout(vector)[0]
+            else:
+                return vector * (vector >= 0)
+
+        tol_var = 0.01
+        tol_mean = 0.01
+        t_max = 10
+        t_i = 0
+        for t_i in range(t_max):
+            acts1 = predict(ids, tokvecs)
+            var = numpy.var(acts1)
+            mean = numpy.mean(acts1)
+            if abs(var - 1.0) >= tol_var:
+                model.W /= numpy.sqrt(var)
+            elif abs(mean) >= tol_mean:
+                model.b -= mean
+            else:
+                break
+
 
 # Thinc's Embed class is a bit broken atm, so drop this here.
 from thinc import describe
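The new `init_weights` is, as its docstring says, LSUV-like: rather than deriving the maxout layer's dynamics analytically, it pushes synthetic whitened data through the layer and iteratively rescales until the activations are roughly zero-mean, unit-variance. A self-contained sketch of the same loop on a toy dense maxout layer (pure NumPy; `forward` and the dimensions are illustrative stand-ins, not spaCy's API):

    import numpy

    nI, nO, nP = 64, 32, 2          # input size, output size, maxout pieces
    W = numpy.random.normal(0., 1. / numpy.sqrt(nI), (nO * nP, nI))
    b = numpy.zeros((nO * nP,), dtype='f')

    def forward(X):
        # Dense affine followed by a maxout over nP pieces.
        acts = X @ W.T + b
        return acts.reshape((X.shape[0], nO, nP)).max(axis=-1)

    X = numpy.random.normal(0., 1., (5000, nI))  # synthetic whitened inputs
    tol_var = tol_mean = 0.01
    for _ in range(10):
        acts = forward(X)
        var, mean = numpy.var(acts), numpy.mean(acts)
        if abs(var - 1.0) >= tol_var:
            W /= numpy.sqrt(var)   # rescale weights until unit variance
        elif abs(mean) >= tol_mean:
            b -= mean              # then shift the bias until zero mean
        else:
            break

As in the commit, variance is corrected before mean: rescaling W moves the mean again, so the bias shift only happens once the variance has settled.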