mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-10 16:22:29 +03:00
Delete _precomputable_affine module
This commit is contained in:
parent
9b459f9ef2
commit
0279aa036a
|
@ -1,155 +0,0 @@
|
||||||
from thinc.api import Model, normal_init
|
|
||||||
|
|
||||||
|
|
||||||
def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
|
|
||||||
model = Model(
|
|
||||||
"precomputable_affine",
|
|
||||||
forward,
|
|
||||||
init=init,
|
|
||||||
dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
|
|
||||||
params={"W": None, "b": None, "pad": None},
|
|
||||||
attrs={"dropout_rate": dropout},
|
|
||||||
)
|
|
||||||
return model
|
|
||||||
|
|
||||||
|
|
||||||
def forward(model, X, is_train):
|
|
||||||
nF = model.get_dim("nF")
|
|
||||||
nO = model.get_dim("nO")
|
|
||||||
nP = model.get_dim("nP")
|
|
||||||
nI = model.get_dim("nI")
|
|
||||||
W = model.get_param("W")
|
|
||||||
Yf = model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True)
|
|
||||||
Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
|
|
||||||
Yf = model.ops.xp.vstack((model.get_param("pad"), Yf))
|
|
||||||
|
|
||||||
def backward(dY_ids):
|
|
||||||
# This backprop is particularly tricky, because we get back a different
|
|
||||||
# thing from what we put out. We put out an array of shape:
|
|
||||||
# (nB, nF, nO, nP), and get back:
|
|
||||||
# (nB, nO, nP) and ids (nB, nF)
|
|
||||||
# The ids tell us the values of nF, so we would have:
|
|
||||||
#
|
|
||||||
# dYf = zeros((nB, nF, nO, nP))
|
|
||||||
# for b in range(nB):
|
|
||||||
# for f in range(nF):
|
|
||||||
# dYf[b, ids[b, f]] += dY[b]
|
|
||||||
#
|
|
||||||
# However, we avoid building that array for efficiency -- and just pass
|
|
||||||
# in the indices.
|
|
||||||
dY, ids = dY_ids
|
|
||||||
assert dY.ndim == 3
|
|
||||||
assert dY.shape[1] == nO, dY.shape
|
|
||||||
assert dY.shape[2] == nP, dY.shape
|
|
||||||
# nB = dY.shape[0]
|
|
||||||
model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
|
|
||||||
Xf = X[ids]
|
|
||||||
Xf = Xf.reshape((Xf.shape[0], nF * nI))
|
|
||||||
|
|
||||||
model.inc_grad("b", dY.sum(axis=0))
|
|
||||||
dY = dY.reshape((dY.shape[0], nO * nP))
|
|
||||||
|
|
||||||
Wopfi = W.transpose((1, 2, 0, 3))
|
|
||||||
Wopfi = Wopfi.reshape((nO * nP, nF * nI))
|
|
||||||
dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
|
|
||||||
|
|
||||||
dWopfi = model.ops.gemm(dY, Xf, trans1=True)
|
|
||||||
dWopfi = dWopfi.reshape((nO, nP, nF, nI))
|
|
||||||
# (o, p, f, i) --> (f, o, p, i)
|
|
||||||
dWopfi = dWopfi.transpose((2, 0, 1, 3))
|
|
||||||
model.inc_grad("W", dWopfi)
|
|
||||||
return dXf.reshape((dXf.shape[0], nF, nI))
|
|
||||||
|
|
||||||
return Yf, backward
|
|
||||||
|
|
||||||
|
|
||||||
def _backprop_precomputable_affine_padding(model, dY, ids):
|
|
||||||
nB = dY.shape[0]
|
|
||||||
nF = model.get_dim("nF")
|
|
||||||
nP = model.get_dim("nP")
|
|
||||||
nO = model.get_dim("nO")
|
|
||||||
# Backprop the "padding", used as a filler for missing values.
|
|
||||||
# Values that are missing are set to -1, and each state vector could
|
|
||||||
# have multiple missing values. The padding has different values for
|
|
||||||
# different missing features. The gradient of the padding vector is:
|
|
||||||
#
|
|
||||||
# for b in range(nB):
|
|
||||||
# for f in range(nF):
|
|
||||||
# if ids[b, f] < 0:
|
|
||||||
# d_pad[f] += dY[b]
|
|
||||||
#
|
|
||||||
# Which can be rewritten as:
|
|
||||||
#
|
|
||||||
# (ids < 0).T @ dY
|
|
||||||
mask = model.ops.asarray(ids < 0, dtype="f")
|
|
||||||
d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
|
|
||||||
return d_pad.reshape((1, nF, nO, nP))
|
|
||||||
|
|
||||||
|
|
||||||
def init(model, X=None, Y=None):
|
|
||||||
"""This is like the 'layer sequential unit variance', but instead
|
|
||||||
of taking the actual inputs, we randomly generate whitened data.
|
|
||||||
|
|
||||||
Why's this all so complicated? We have a huge number of inputs,
|
|
||||||
and the maxout unit makes guessing the dynamics tricky. Instead
|
|
||||||
we set the maxout weights to values that empirically result in
|
|
||||||
whitened outputs given whitened inputs.
|
|
||||||
"""
|
|
||||||
if model.has_param("W") and model.get_param("W").any():
|
|
||||||
return
|
|
||||||
|
|
||||||
nF = model.get_dim("nF")
|
|
||||||
nO = model.get_dim("nO")
|
|
||||||
nP = model.get_dim("nP")
|
|
||||||
nI = model.get_dim("nI")
|
|
||||||
W = model.ops.alloc4f(nF, nO, nP, nI)
|
|
||||||
b = model.ops.alloc2f(nO, nP)
|
|
||||||
pad = model.ops.alloc4f(1, nF, nO, nP)
|
|
||||||
|
|
||||||
ops = model.ops
|
|
||||||
W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
|
|
||||||
pad = normal_init(ops, pad.shape, mean=1.0)
|
|
||||||
model.set_param("W", W)
|
|
||||||
model.set_param("b", b)
|
|
||||||
model.set_param("pad", pad)
|
|
||||||
|
|
||||||
ids = ops.alloc((5000, nF), dtype="f")
|
|
||||||
ids += ops.xp.random.uniform(0, 1000, ids.shape)
|
|
||||||
ids = ops.asarray(ids, dtype="i")
|
|
||||||
tokvecs = ops.alloc((5000, nI), dtype="f")
|
|
||||||
tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
|
|
||||||
tokvecs.shape
|
|
||||||
)
|
|
||||||
|
|
||||||
def predict(ids, tokvecs):
|
|
||||||
# nS ids. nW tokvecs. Exclude the padding array.
|
|
||||||
hiddens = model.predict(tokvecs[:-1]) # (nW, f, o, p)
|
|
||||||
vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f")
|
|
||||||
# need nS vectors
|
|
||||||
hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP))
|
|
||||||
model.ops.scatter_add(vectors, ids.flatten(), hiddens)
|
|
||||||
vectors = vectors.reshape((vectors.shape[0], nO, nP))
|
|
||||||
vectors += b
|
|
||||||
vectors = model.ops.asarray(vectors)
|
|
||||||
if nP >= 2:
|
|
||||||
return model.ops.maxout(vectors)[0]
|
|
||||||
else:
|
|
||||||
return vectors * (vectors >= 0)
|
|
||||||
|
|
||||||
tol_var = 0.01
|
|
||||||
tol_mean = 0.01
|
|
||||||
t_max = 10
|
|
||||||
W = model.get_param("W").copy()
|
|
||||||
b = model.get_param("b").copy()
|
|
||||||
for t_i in range(t_max):
|
|
||||||
acts1 = predict(ids, tokvecs)
|
|
||||||
var = model.ops.xp.var(acts1)
|
|
||||||
mean = model.ops.xp.mean(acts1)
|
|
||||||
if abs(var - 1.0) >= tol_var:
|
|
||||||
W /= model.ops.xp.sqrt(var)
|
|
||||||
model.set_param("W", W)
|
|
||||||
elif abs(mean) >= tol_mean:
|
|
||||||
b -= mean
|
|
||||||
model.set_param("b", b)
|
|
||||||
else:
|
|
||||||
break
|
|
Loading…
Reference in New Issue
Block a user