Mirror of https://github.com/explosion/spaCy.git
Try to move parser to simpler PrecomputedAffine class. Currently broken -- maybe the previous change
parent 3ff8c35a79
commit b272890a8c
spacy/_ml.py | 96
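For orientation, the point of PrecomputableAffine: the affine output for every token is computed once, up front, so each parser state only gathers and sums nF cached rows instead of re-running the matrix multiply. A minimal numpy sketch of that idea (shapes and names are illustrative, not spaCy's API):

import numpy

b, f, o, i = 32, 13, 64, 128            # batch, features, outputs, inputs (illustrative)
X = numpy.random.randn(b, i)            # token vectors
W = numpy.random.randn(f, o, i)         # one affine block per feature slot

# Precompute once: an (n_tokens, f, o) table of affine outputs.
Yf = numpy.tensordot(X, W, axes=[[1], [2]])          # (b, f, o)

# Per parser state: gather f cached rows by token id and sum them.
ids = numpy.random.randint(0, b, size=f)             # one state's feature token ids
state_vector = Yf[ids, numpy.arange(f)].sum(axis=0)  # (o,)

This gather-and-sum is what precompute_hiddens does on the CPU side, further down in this diff.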
@@ -17,14 +17,19 @@ from .tokens.doc import Doc
 import numpy
 
 
+def _init_for_precomputed(W, ops):
+    reshaped = W.reshape((W.shape[1], W.shape[0] * W.shape[2]))
+    ops.xavier_uniform_init(reshaped)
+    W[:] = reshaped.reshape(W.shape)
+
+
 @describe.on_data(_set_dimensions_if_needed)
 @describe.attributes(
     nI=Dimension("Input size"),
     nF=Dimension("Number of features"),
     nO=Dimension("Output size"),
     W=Synapses("Weights matrix",
-        lambda obj: (obj.nO, obj.nF, obj.nI),
-        lambda W, ops: ops.xavier_uniform_init(W)),
+        lambda obj: (obj.nF, obj.nO, obj.nI),
+        lambda W, ops: _init_for_precomputed(W, ops)),
     b=Biases("Bias vector",
         lambda obj: (obj.nO,)),
     d_W=Gradient("W"),
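The new _init_for_precomputed runs Xavier init on a 2d view of the 3d weights, so the scale is derived from fan-out nO and fan-in nF*nI rather than from each 2d slice. A rough numpy equivalent, assuming thinc's xavier_uniform_init is Glorot-uniform (the helper below is a hypothetical stand-in, not thinc's code):

import numpy

def xavier_uniform_init_2d(W2d):
    # Glorot uniform: limit = sqrt(6 / (fan_in + fan_out)) for a (fan_out, fan_in) matrix.
    fan_out, fan_in = W2d.shape
    limit = numpy.sqrt(6.0 / (fan_in + fan_out))
    W2d[:] = numpy.random.uniform(-limit, limit, W2d.shape)

W = numpy.zeros((13, 64, 128))       # (nF, nO, nI)
reshaped = W.reshape((W.shape[1], W.shape[0] * W.shape[2]))   # (nO, nF*nI)
xavier_uniform_init_2d(reshaped)
W[:] = reshaped.reshape(W.shape)     # no-op when reshape returned a view, but safe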
@@ -39,25 +44,25 @@ class PrecomputableAffine(Model):
 
     def begin_update(self, X, drop=0.):
         # X: (b, i)
         # Xf: (b, f, i)
         # Yf: (b, f, o)
         # dY: (b, o)
         # dYf: (b, f, o)
-        #Yf = numpy.einsum('bi,ofi->bfo', X, self.W)
+        #Yf = numpy.einsum('bi,foi->bfo', X, self.W)
         Yf = self.ops.xp.tensordot(
-            X, self.W, axes=[[1], [2]]).transpose((0, 2, 1))
+            X, self.W, axes=[[1], [2]])
         Yf += self.b
         def backward(dY_ids, sgd=None):
+            tensordot = self.ops.xp.tensordot
             dY, ids = dY_ids
             Xf = X[ids]
 
+            #dXf = numpy.einsum('bo,foi->bfi', dY, self.W)
+            dXf = tensordot(dY, self.W, axes=[[1], [1]])
             #dW = numpy.einsum('bo,bfi->ofi', dY, Xf)
-            dW = self.ops.xp.tensordot(dY, Xf, axes=[[0], [0]])
-            db = dY.sum(axis=0)
-            #dXf = numpy.einsum('bo,ofi->bfi', dY, self.W)
-            dXf = self.ops.xp.tensordot(dY, self.W, axes=[[1], [0]])
-
-            self.d_W += dW
-            self.d_b += db
+            dW = tensordot(dY, Xf, axes=[[0], [0]])
+            # ofi -> foi
+            self.d_W += dW.transpose((1, 0, 2))
+            self.d_b += dY.sum(axis=0)
 
             if sgd is not None:
                 sgd(self._mem.weights, self._mem.gradient, key=self.id)
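The axis bookkeeping in those tensordot calls is easy to get wrong, so here is a self-contained numpy check of the three contractions against their einsum comments (the shapes are arbitrary):

import numpy

b, f, o, i = 4, 3, 5, 6
X = numpy.random.randn(b, i)
W = numpy.random.randn(f, o, i)
dY = numpy.random.randn(b, o)
Xf = numpy.random.randn(b, f, i)

Yf = numpy.tensordot(X, W, axes=[[1], [2]])        # (b, f, o)
assert numpy.allclose(Yf, numpy.einsum('bi,foi->bfo', X, W))

dXf = numpy.tensordot(dY, W, axes=[[1], [1]])      # (b, f, i)
assert numpy.allclose(dXf, numpy.einsum('bo,foi->bfi', dY, W))

dW = numpy.tensordot(dY, Xf, axes=[[0], [0]])      # (o, f, i); transpose to (f, o, i)
assert dW.transpose((1, 0, 2)).shape == W.shape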
@@ -144,14 +149,70 @@ def Tok2Vec(width, embed_size, preprocess=None):
     return tok2vec
 
 
-def get_col(idx):
+def foreach(layer):
+    def forward(Xs, drop=0.):
+        results = []
+        backprops = []
+        for X in Xs:
+            result, bp = layer.begin_update(X, drop=drop)
+            results.append(result)
+            backprops.append(bp)
+        def backward(d_results, sgd=None):
+            dXs = []
+            for d_result, backprop in zip(d_results, backprops):
+                dXs.append(backprop(d_result, sgd))
+            return dXs
+        return results, backward
+    model = layerize(forward)
+    model._layers.append(layer)
+    return model
+
+
+def rebatch(size, layer):
+    ops = layer.ops
+    def forward(X, drop=0.):
+        if X.shape[0] < size:
+            return layer.begin_update(X)
+        parts = _divide_array(X, size)
+        results, bp_results = zip(*[layer.begin_update(p, drop=drop)
+                                    for p in parts])
+        y = ops.flatten(results)
+        def backward(dy, sgd=None):
+            d_parts = [bp(y, sgd=sgd) for bp, y in
+                       zip(bp_results, _divide_array(dy, size))]
+            try:
+                dX = ops.flatten(d_parts)
+            except TypeError:
+                dX = None
+            except ValueError:
+                dX = None
+            return dX
+        return y, backward
+    model = layerize(forward)
+    model._layers.append(layer)
+    return model
+
+
+def _divide_array(X, size):
+    parts = []
+    index = 0
+    while index < len(X):
+        parts.append(X[index : index + size])
+        index += size
+    return parts
+
+
+def get_col(idx):
+    assert idx >= 0, idx
     def forward(X, drop=0.):
+        assert idx >= 0, idx
         if isinstance(X, numpy.ndarray):
             ops = NumpyOps()
         else:
             ops = CupyOps()
         output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)
         def backward(y, sgd=None):
+            assert idx >= 0, idx
             dX = ops.allocate(X.shape)
             dX[:, idx] += y
             return dX
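rebatch splits an oversized batch into size-row chunks, runs the wrapped layer on each, and flattens the results back together. A quick illustration of the _divide_array slicing plus the flatten step, using numpy.concatenate as a stand-in for ops.flatten:

import numpy

def _divide_array(X, size):
    parts = []
    index = 0
    while index < len(X):
        parts.append(X[index : index + size])
        index += size
    return parts

X = numpy.arange(10).reshape(10, 1)
parts = _divide_array(X, 4)                            # row counts: 4, 4, 2
assert [len(p) for p in parts] == [4, 4, 2]
assert numpy.array_equal(numpy.concatenate(parts), X)  # ops.flatten analogue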
@@ -171,12 +232,9 @@ def doc2feats(cols=None):
     def forward(docs, drop=0.):
         feats = []
         for doc in docs:
-            if 'cached_feats' not in doc.user_data:
-                doc.user_data['cached_feats'] = model.ops.asarray(
-                    doc.to_array(cols),
-                    dtype='uint64')
-            feats.append(doc.user_data['cached_feats'])
-            assert feats[-1].dtype == 'uint64'
+            feats.append(
+                model.ops.asarray(doc.to_array(cols),
+                                  dtype='uint64'))
         return feats, None
     model = layerize(forward)
     model.cols = cols
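doc2feats now recomputes the feature array on every call instead of caching it in doc.user_data. The return feats, None line follows thinc's forward contract: a layer returns its output plus a backward callback, and None marks a layer with nothing to backprop through. A toy example of the contract (illustrative, not spaCy code):

def double(X, drop=0.):
    def backward(dY, sgd=None):
        return dY * 2.0      # gradient of Y = 2*X
    return X * 2.0, backward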
@@ -84,7 +84,7 @@ cdef class precompute_hiddens:
     we can do all our hard maths up front, packed into large multiplications,
     and do the hard-to-program parsing on the CPU.
     '''
-    cdef int nF, nO, nP
+    cdef int nF, nO
     cdef bint _is_synchronized
     cdef public object ops
     cdef np.ndarray _features
@@ -104,9 +104,8 @@ cdef class precompute_hiddens:
         cached = gpu_cached
         self.nF = cached.shape[1]
         self.nO = cached.shape[2]
-        self.nP = cached.shape[3]
         self.ops = lower_model.ops
-        self._features = numpy.zeros((batch_size, self.nO, self.nP), dtype='f')
+        self._features = numpy.zeros((batch_size, self.nO), dtype='f')
         self._is_synchronized = False
         self._cuda_stream = cuda_stream
         self._cached = cached
@@ -133,24 +132,15 @@ cdef class precompute_hiddens:
         cdef int[:, ::1] ids = token_ids
         self._sum_features(<float*>state_vector.data,
             <float*>hiddens.data, &ids[0,0],
-            token_ids.shape[0], self.nF, self.nO*self.nP)
+            token_ids.shape[0], self.nF, self.nO)
 
-        output, bp_output = self._apply_nonlinearity(state_vector)
-
-        def backward(d_output, sgd=None):
+        def backward(d_state_vector, sgd=None):
             # This will usually be on GPU
-            if isinstance(d_output, numpy.ndarray):
-                d_output = self.ops.xp.array(d_output)
-            d_state_vector = bp_output(d_output, sgd)
+            if isinstance(d_state_vector, numpy.ndarray):
+                d_state_vector = self.ops.xp.array(d_state_vector)
             d_tokens = bp_hiddens((d_state_vector, token_ids), sgd)
             return d_tokens
-        return output, backward
-
-    def _apply_nonlinearity(self, X):
-        if self.nP < 2:
-            return X.reshape(X.shape[:2]), lambda dX, sgd=None: dX.reshape(X.shape)
-        best, which = self.ops.maxout(X)
-        return best, lambda dX, sgd=None: self.ops.backprop_maxout(dX, which, self.nP)
+        return state_vector, backward
 
     cdef void _sum_features(self, float* output,
             const float* cached, const int* token_ids, int B, int F, int O) nogil:
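_sum_features is the per-state gather-and-sum over the cached table, now over plain (nF, nO) rows rather than (nF, nO*nP). A pure-numpy stand-in for that nogil C loop (illustrative only, not the spaCy implementation):

import numpy

def sum_features(cached, ids):
    # cached: (n_tokens, nF, nO) precomputed affine outputs
    # ids: (batch, nF) token indices, one row of feature slots per state
    nF = ids.shape[1]
    # pick cached[ids[b, f], f] for every (b, f), then sum over the nF slots
    return cached[ids, numpy.arange(nF)].sum(axis=1)   # (batch, nO)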
@@ -223,11 +213,9 @@ cdef class Parser:
     def Model(cls, nr_class, token_vector_width=128, hidden_width=128, **cfg):
         token_vector_width = util.env_opt('token_vector_width', token_vector_width)
         hidden_width = util.env_opt('hidden_width', hidden_width)
-        maxout_pieces = util.env_opt('parser_maxout_pieces', 1)
-        lower = PrecomputableMaxouts(hidden_width,
+        lower = PrecomputableAffine(hidden_width,
             nF=cls.nr_feature,
-            nI=token_vector_width,
-            pieces=maxout_pieces)
+            nI=token_vector_width)
 
         with Model.use_device('cpu'):
             upper = chain(
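The dropped pieces argument is the same nP that disappears from precompute_hiddens above: maxout caches nP candidate outputs per unit and keeps the best, while the affine layer has a single piece and needs no reduction. Roughly, in numpy:

import numpy

X = numpy.random.randn(32, 64, 3)   # (batch, nO, nP) maxout pre-activations

# PrecomputableMaxouts: reduce over the nP pieces, remembering the argmax
# so the gradient can be routed back through the winning piece.
best = X.max(axis=-1)                # (batch, nO)
which = X.argmax(axis=-1)            # what backprop_maxout would consume

# PrecomputableAffine: nP == 1, so there is nothing to reduce -- hence
# nP and _apply_nonlinearity drop out of precompute_hiddens.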