Mirror of https://github.com/explosion/spaCy.git, synced 2025-11-04 01:48:04 +03:00
After the precomputable affine table of shape [nB, nF, nO, nP] is computed, padding with shape [1, nF, nO, nP] is assigned to the first row of the table. However, indexing a single row of the table yields an array of shape [nF, nO, nP]. CuPy versions before 10.0 cannot broadcast away this leading dimension of size one during assignment. This change fixes compatibility with CuPy < 10.0 by squeezing the first dimension of the padding before the assignment.
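A minimal sketch of the shape mismatch (illustrative only; NumPy stands in for the GPU array module):

    import numpy as xp  # stand-in for cupy

    nB, nF, nO, nP = 4, 3, 2, 2
    table = xp.zeros((nB + 1, nF, nO, nP), dtype="f")  # row 0 is reserved for padding
    pad = xp.ones((1, nF, nO, nP), dtype="f")          # the parameter keeps this stored shape

    # table[0] has shape (nF, nO, nP). NumPy (and CuPy >= 10.0) broadcasts the
    # (1, nF, nO, nP) source into it; CuPy < 10.0 rejects the assignment, per
    # the note above. Squeezing the leading axis first works on every version:
    table[0] = xp.squeeze(pad, 0)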
165 lines | 5.7 KiB | Python
from thinc.api import Model, normal_init

from ..util import registry


@registry.layers("spacy.PrecomputableAffine.v1")
def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
    model = Model(
        "precomputable_affine",
        forward,
        init=init,
        dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
        params={"W": None, "b": None, "pad": None},
        attrs={"dropout_rate": dropout},
    )
    return model


def forward(model, X, is_train):
    nF = model.get_dim("nF")
    nO = model.get_dim("nO")
    nP = model.get_dim("nP")
    nI = model.get_dim("nI")
    W = model.get_param("W")
    # Preallocate array for layer output, including padding.
    Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP, zeros=False)
    model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:])
    Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))

    # Set padding. Padding has shape (1, nF, nO, nP). Unfortunately, we cannot
    # change its shape to (nF, nO, nP) without breaking existing models. So
    # we'll squeeze the first dimension here.
    Yf[0] = model.ops.xp.squeeze(model.get_param("pad"), 0)

    def backward(dY_ids):
        # This backprop is particularly tricky, because we get back a different
        # thing from what we put out. We put out an array of shape:
        # (nB, nF, nO, nP), and get back:
        # (nB, nO, nP) and ids (nB, nF)
        # The ids tell us the values of nF, so we would have:
        #
        # dYf = zeros((nB, nF, nO, nP))
        # for b in range(nB):
        #     for f in range(nF):
        #         dYf[b, ids[b, f]] += dY[b]
        #
        # However, we avoid building that array for efficiency -- and just pass
        # in the indices.
        dY, ids = dY_ids
        assert dY.ndim == 3
        assert dY.shape[1] == nO, dY.shape
        assert dY.shape[2] == nP, dY.shape
        # nB = dY.shape[0]
        model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
        Xf = X[ids]
        Xf = Xf.reshape((Xf.shape[0], nF * nI))

        model.inc_grad("b", dY.sum(axis=0))
        dY = dY.reshape((dY.shape[0], nO * nP))

        Wopfi = W.transpose((1, 2, 0, 3))
        Wopfi = Wopfi.reshape((nO * nP, nF * nI))
        dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)

        dWopfi = model.ops.gemm(dY, Xf, trans1=True)
        dWopfi = dWopfi.reshape((nO, nP, nF, nI))
        # (o, p, f, i) --> (f, o, p, i)
        dWopfi = dWopfi.transpose((2, 0, 1, 3))
        model.inc_grad("W", dWopfi)
        return dXf.reshape((dXf.shape[0], nF, nI))

    return Yf, backward


def _backprop_precomputable_affine_padding(model, dY, ids):
    nB = dY.shape[0]
    nF = model.get_dim("nF")
    nP = model.get_dim("nP")
    nO = model.get_dim("nO")
    # Backprop the "padding", used as a filler for missing values.
    # Values that are missing are set to -1, and each state vector could
    # have multiple missing values. The padding has different values for
    # different missing features. The gradient of the padding vector is:
    #
    # for b in range(nB):
    #     for f in range(nF):
    #         if ids[b, f] < 0:
    #             d_pad[f] += dY[b]
    #
    # Which can be rewritten as:
    #
    # (ids < 0).T @ dY
    #
    # (A commented sanity check of this identity follows the function.)
    mask = model.ops.asarray(ids < 0, dtype="f")
    d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
    return d_pad.reshape((1, nF, nO, nP))
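
# A minimal NumPy sanity check of the padding-gradient identity above (an
# illustrative sketch, not part of the spaCy module; a plain matrix product
# replaces model.ops.gemm):
#
#     import numpy as np
#     rng = np.random.default_rng(0)
#     nB, nF, nO, nP = 4, 3, 2, 2
#     dY = rng.normal(size=(nB, nO, nP)).astype("f")
#     ids = rng.integers(-1, 3, size=(nB, nF))   # -1 marks a missing feature
#     d_pad = np.zeros((nF, nO, nP), dtype="f")
#     for b in range(nB):
#         for f in range(nF):
#             if ids[b, f] < 0:
#                 d_pad[f] += dY[b]
#     mask = (ids < 0).astype("f")               # (nB, nF)
#     vectorised = mask.T @ dY.reshape(nB, nO * nP)
#     assert np.allclose(d_pad, vectorised.reshape(nF, nO, nP))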

def init(model, X=None, Y=None):
    """This is like the 'layer sequential unit variance', but instead
    of taking the actual inputs, we randomly generate whitened data.

    Why's this all so complicated? We have a huge number of inputs,
    and the maxout unit makes guessing the dynamics tricky. Instead
    we set the maxout weights to values that empirically result in
    whitened outputs given whitened inputs.
    """
    if model.has_param("W") and model.get_param("W").any():
        return

    nF = model.get_dim("nF")
    nO = model.get_dim("nO")
    nP = model.get_dim("nP")
    nI = model.get_dim("nI")
    W = model.ops.alloc4f(nF, nO, nP, nI)
    b = model.ops.alloc2f(nO, nP)
    pad = model.ops.alloc4f(1, nF, nO, nP)

    ops = model.ops
    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
    pad = normal_init(ops, pad.shape, mean=1.0)
    model.set_param("W", W)
    model.set_param("b", b)
    model.set_param("pad", pad)

    ids = ops.alloc((5000, nF), dtype="f")
    ids += ops.xp.random.uniform(0, 1000, ids.shape)
    ids = ops.asarray(ids, dtype="i")
    tokvecs = ops.alloc((5000, nI), dtype="f")
    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
        tokvecs.shape
    )

    def predict(ids, tokvecs):
        # nS ids. nW tokvecs. Exclude the padding array.
        hiddens = model.predict(tokvecs[:-1])  # (nW, f, o, p)
        vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f")
        # need nS vectors
        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP))
        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
        vectors = vectors.reshape((vectors.shape[0], nO, nP))
        vectors += b
        vectors = model.ops.asarray(vectors)
        if nP >= 2:
            return model.ops.maxout(vectors)[0]
        else:
            return vectors * (vectors >= 0)

    tol_var = 0.01
    tol_mean = 0.01
    t_max = 10
    W = model.get_param("W").copy()
    b = model.get_param("b").copy()
    for t_i in range(t_max):
        acts1 = predict(ids, tokvecs)
        var = model.ops.xp.var(acts1)
        mean = model.ops.xp.mean(acts1)
        if abs(var - 1.0) >= tol_var:
            W /= model.ops.xp.sqrt(var)
            model.set_param("W", W)
        elif abs(mean) >= tol_mean:
            b -= mean
            model.set_param("b", b)
        else:
            break
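
# Illustrative usage sketch (an addition for demonstration, not part of the
# original module). The dimensions below are hypothetical; because of the
# relative import above, the module needs a package context to execute, e.g.
# via `python -m` inside the spacy package.
if __name__ == "__main__":
    import numpy

    model = PrecomputableAffine(nO=64, nI=96, nF=6, nP=2)
    model.initialize()                    # runs init() above: allocates and whitens W, b, pad
    X = numpy.zeros((10, 96), dtype="f")  # (nB, nI) batch of token vectors
    Yf = model.predict(X)                 # (nB + 1, nF, nO, nP); row 0 is the padding row
    print(Yf.shape)                       # -> (11, 6, 64, 2)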