Try to move parser to simpler PrecomputableAffine class. Currently broken -- maybe the previous change…

This commit is contained in:
Matthew Honnibal 2017-05-20 06:40:10 -05:00
parent 3ff8c35a79
commit b272890a8c
2 changed files with 86 additions and 40 deletions

View File

@ -17,14 +17,19 @@ from .tokens.doc import Doc
import numpy import numpy
def _init_for_precomputed(W, ops):
reshaped = W.reshape((W.shape[1], W.shape[0] * W.shape[2]))
ops.xavier_uniform_init(reshaped)
W[:] = reshaped.reshape(W.shape)
@describe.on_data(_set_dimensions_if_needed) @describe.on_data(_set_dimensions_if_needed)
@describe.attributes( @describe.attributes(
nI=Dimension("Input size"), nI=Dimension("Input size"),
nF=Dimension("Number of features"), nF=Dimension("Number of features"),
nO=Dimension("Output size"), nO=Dimension("Output size"),
W=Synapses("Weights matrix", W=Synapses("Weights matrix",
lambda obj: (obj.nO, obj.nF, obj.nI), lambda obj: (obj.nF, obj.nO, obj.nI),
lambda W, ops: ops.xavier_uniform_init(W)), lambda W, ops: _init_for_precomputed(W, ops)),
b=Biases("Bias vector", b=Biases("Bias vector",
lambda obj: (obj.nO,)), lambda obj: (obj.nO,)),
d_W=Gradient("W"), d_W=Gradient("W"),
@ -39,25 +44,25 @@ class PrecomputableAffine(Model):
def begin_update(self, X, drop=0.): def begin_update(self, X, drop=0.):
# X: (b, i) # X: (b, i)
# Xf: (b, f, i) # Yf: (b, f, i)
# dY: (b, o) # dY: (b, o)
# dYf: (b, f, o) # dYf: (b, f, o)
#Yf = numpy.einsum('bi,ofi->bfo', X, self.W) #Yf = numpy.einsum('bi,foi->bfo', X, self.W)
Yf = self.ops.xp.tensordot( Yf = self.ops.xp.tensordot(
X, self.W, axes=[[1], [2]]).transpose((0, 2, 1)) X, self.W, axes=[[1], [2]])
Yf += self.b Yf += self.b
def backward(dY_ids, sgd=None): def backward(dY_ids, sgd=None):
tensordot = self.ops.xp.tensordot
dY, ids = dY_ids dY, ids = dY_ids
Xf = X[ids] Xf = X[ids]
#dXf = numpy.einsum('bo,foi->bfi', dY, self.W)
dXf = tensordot(dY, self.W, axes=[[1], [1]])
#dW = numpy.einsum('bo,bfi->ofi', dY, Xf) #dW = numpy.einsum('bo,bfi->ofi', dY, Xf)
dW = self.ops.xp.tensordot(dY, Xf, axes=[[0], [0]]) dW = tensordot(dY, Xf, axes=[[0], [0]])
db = dY.sum(axis=0) # ofi -> foi
#dXf = numpy.einsum('bo,ofi->bfi', dY, self.W) self.d_W += dW.transpose((1, 0, 2))
dXf = self.ops.xp.tensordot(dY, self.W, axes=[[1], [0]]) self.d_b += dY.sum(axis=0)
self.d_W += dW
self.d_b += db
if sgd is not None: if sgd is not None:
sgd(self._mem.weights, self._mem.gradient, key=self.id) sgd(self._mem.weights, self._mem.gradient, key=self.id)
@ -144,14 +149,70 @@ def Tok2Vec(width, embed_size, preprocess=None):
return tok2vec return tok2vec
def foreach(layer):
    """Wrap `layer` so that it is applied separately to each item of a sequence.

    layer: Object exposing ``begin_update(X, drop=...)`` that returns an
        ``(output, backprop)`` pair.
    RETURNS: The model produced by ``layerize`` for the wrapping forward
        function, with `layer` appended to its ``_layers`` list.
    """
    def forward(Xs, drop=0.):
        # One (output, backprop) pair per input item, in input order.
        pairs = [layer.begin_update(X, drop=drop) for X in Xs]
        results = [output for output, _ in pairs]
        backprops = [callback for _, callback in pairs]

        def backward(d_results, sgd=None):
            # Route each gradient to the callback of the item it belongs to.
            return [backprop(d_result, sgd)
                    for d_result, backprop in zip(d_results, backprops)]

        return results, backward

    model = layerize(forward)
    model._layers.append(layer)
    return model
def rebatch(size, layer):
    """Wrap `layer` so that large inputs are processed in chunks of `size` rows.

    Inputs with fewer than `size` rows are passed straight to the layer.
    Larger inputs are split with ``_divide_array``, each chunk is run through
    ``layer.begin_update``, and the chunk outputs are joined with
    ``layer.ops.flatten``.

    size: Maximum number of rows handed to `layer` in one call.
    layer: Object exposing ``ops`` and ``begin_update(X, drop=...)`` that
        returns an ``(output, backprop)`` pair.
    RETURNS: The model produced by ``layerize`` for the wrapping forward
        function, with `layer` appended to its ``_layers`` list.
    """
    ops = layer.ops

    def forward(X, drop=0.):
        if X.shape[0] < size:
            # Forward `drop` here as well: the small-batch path previously
            # dropped it, silently disabling dropout for short inputs.
            return layer.begin_update(X, drop=drop)
        parts = _divide_array(X, size)
        results, bp_results = zip(*[layer.begin_update(part, drop=drop)
                                    for part in parts])
        y = ops.flatten(results)

        def backward(dy, sgd=None):
            # Split the incoming gradient the same way the input was split,
            # so every chunk's callback receives its own slice.
            d_parts = [bp(d_chunk, sgd=sgd) for bp, d_chunk in
                       zip(bp_results, _divide_array(dy, size))]
            try:
                return ops.flatten(d_parts)
            except (TypeError, ValueError):
                # Best effort: if the per-chunk gradients cannot be joined,
                # report that there is no input gradient.
                # NOTE(review): presumably happens when the callbacks return
                # None -- confirm.
                return None

        return y, backward

    model = layerize(forward)
    model._layers.append(layer)
    return model
def _divide_array(X, size):
parts = []
index = 0
while index < len(X):
parts.append(X[index : index + size])
index += size
return parts
def get_col(idx):
assert idx >= 0, idx
def forward(X, drop=0.):
assert idx >= 0, idx
if isinstance(X, numpy.ndarray): if isinstance(X, numpy.ndarray):
ops = NumpyOps() ops = NumpyOps()
else: else:
ops = CupyOps() ops = CupyOps()
output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype) output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)
def backward(y, sgd=None): def backward(y, sgd=None):
assert idx >= 0, idx
dX = ops.allocate(X.shape) dX = ops.allocate(X.shape)
dX[:, idx] += y dX[:, idx] += y
return dX return dX
@ -171,12 +232,9 @@ def doc2feats(cols=None):
def forward(docs, drop=0.): def forward(docs, drop=0.):
feats = [] feats = []
for doc in docs: for doc in docs:
if 'cached_feats' not in doc.user_data: feats.append(
doc.user_data['cached_feats'] = model.ops.asarray( model.ops.asarray(doc.to_array(cols),
doc.to_array(cols), dtype='uint64'))
dtype='uint64')
feats.append(doc.user_data['cached_feats'])
assert feats[-1].dtype == 'uint64'
return feats, None return feats, None
model = layerize(forward) model = layerize(forward)
model.cols = cols model.cols = cols

View File

@ -84,7 +84,7 @@ cdef class precompute_hiddens:
we can do all our hard maths up front, packed into large multiplications, we can do all our hard maths up front, packed into large multiplications,
and do the hard-to-program parsing on the CPU. and do the hard-to-program parsing on the CPU.
''' '''
cdef int nF, nO, nP cdef int nF, nO
cdef bint _is_synchronized cdef bint _is_synchronized
cdef public object ops cdef public object ops
cdef np.ndarray _features cdef np.ndarray _features
@ -104,9 +104,8 @@ cdef class precompute_hiddens:
cached = gpu_cached cached = gpu_cached
self.nF = cached.shape[1] self.nF = cached.shape[1]
self.nO = cached.shape[2] self.nO = cached.shape[2]
self.nP = cached.shape[3]
self.ops = lower_model.ops self.ops = lower_model.ops
self._features = numpy.zeros((batch_size, self.nO, self.nP), dtype='f') self._features = numpy.zeros((batch_size, self.nO), dtype='f')
self._is_synchronized = False self._is_synchronized = False
self._cuda_stream = cuda_stream self._cuda_stream = cuda_stream
self._cached = cached self._cached = cached
@ -133,24 +132,15 @@ cdef class precompute_hiddens:
cdef int[:, ::1] ids = token_ids cdef int[:, ::1] ids = token_ids
self._sum_features(<float*>state_vector.data, self._sum_features(<float*>state_vector.data,
<float*>hiddens.data, &ids[0,0], <float*>hiddens.data, &ids[0,0],
token_ids.shape[0], self.nF, self.nO*self.nP) token_ids.shape[0], self.nF, self.nO)
output, bp_output = self._apply_nonlinearity(state_vector) def backward(d_state_vector, sgd=None):
def backward(d_output, sgd=None):
# This will usually be on GPU # This will usually be on GPU
if isinstance(d_output, numpy.ndarray): if isinstance(d_state_vector, numpy.ndarray):
d_output = self.ops.xp.array(d_output) d_state_vector = self.ops.xp.array(d_state_vector)
d_state_vector = bp_output(d_output, sgd)
d_tokens = bp_hiddens((d_state_vector, token_ids), sgd) d_tokens = bp_hiddens((d_state_vector, token_ids), sgd)
return d_tokens return d_tokens
return output, backward return state_vector, backward
def _apply_nonlinearity(self, X):
if self.nP < 2:
return X.reshape(X.shape[:2]), lambda dX, sgd=None: dX.reshape(X.shape)
best, which = self.ops.maxout(X)
return best, lambda dX, sgd=None: self.ops.backprop_maxout(dX, which, self.nP)
cdef void _sum_features(self, float* output, cdef void _sum_features(self, float* output,
const float* cached, const int* token_ids, int B, int F, int O) nogil: const float* cached, const int* token_ids, int B, int F, int O) nogil:
@ -223,11 +213,9 @@ cdef class Parser:
def Model(cls, nr_class, token_vector_width=128, hidden_width=128, **cfg): def Model(cls, nr_class, token_vector_width=128, hidden_width=128, **cfg):
token_vector_width = util.env_opt('token_vector_width', token_vector_width) token_vector_width = util.env_opt('token_vector_width', token_vector_width)
hidden_width = util.env_opt('hidden_width', hidden_width) hidden_width = util.env_opt('hidden_width', hidden_width)
maxout_pieces = util.env_opt('parser_maxout_pieces', 1) lower = PrecomputableAffine(hidden_width,
lower = PrecomputableMaxouts(hidden_width,
nF=cls.nr_feature, nF=cls.nr_feature,
nI=token_vector_width, nI=token_vector_width)
pieces=maxout_pieces)
with Model.use_device('cpu'): with Model.use_device('cpu'):
upper = chain( upper = chain(