Fix bias in parser

Matthew Honnibal 2017-10-19 18:42:11 +02:00
parent b54b4b8a97
commit b00d0a2c97
3 changed files with 22 additions and 13 deletions
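In effect, the lower model computes Y = X·W + b, but the precomputed-hiddens path was dropping b after the feature rows were cached and, in the single-piece case, skipping the ReLU nonlinearity; PrecomputableMaxouts was meanwhile adding its bias before caching, so it would have been summed once per feature. A rough NumPy sketch of the corrected forward pass (names here are illustrative, not the codebase's):

    import numpy

    def lower_layer_output(summed, b, nP):
        # summed: (batch, nO*nP) sum of cached feature rows, no bias yet
        summed = summed + b.ravel()           # add the bias exactly once
        if nP == 1:
            return numpy.maximum(summed, 0.)  # ReLU for the single-piece case
        reshaped = summed.reshape((summed.shape[0], summed.shape[1] // nP, nP))
        return reshaped.max(axis=-1)          # maxout over the nP pieces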

--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -148,6 +148,7 @@ class PrecomputableAffine(Model):
         # W: (i, fo)
         # Yf = numpy.einsum('bi,i_fo->b_fo', X, self.W)
         Yf = einsum('ab,bc->ac', X, self.W).reshape((nN, self.nF, self.nO))
+        #Yf = self.ops.xp.dot(X, self.W).reshape((nN, self.nF, self.nO))
         def backward(dY_ids, sgd=None):
             dY, ids = dY_ids
             nB = ids.shape[0]
@@ -155,12 +156,14 @@ class PrecomputableAffine(Model):
             Xf = Xf.reshape((nB, self.nIF))
             dW_re = self.d_W.reshape((self.nIF, self.nO))
-            W_re = self.d_W.reshape((self.nIF, self.nO))
+            W_re = self.W.reshape((self.nIF, self.nO))
             # bo,if_o->bif
             dXf = einsum('ab,cb->ac', dY, W_re)
+            #dXf = self.ops.xp.dot(dY, W_re.T)
             # b_if,bo->if_o
             einsum('ab,ac->bc', Xf, dY, out=dW_re)
-            # self.d_b += dY.sum(axis=0)
+            #self.ops.xp.dot(Xf.T, dY, out=dW_re)
+            self.d_b += dY.sum(axis=0)
             if sgd is not None:
                 sgd(self._mem.weights, self._mem.gradient, key=self.id)
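Two fixes land in this hunk. In the backward pass of Y = Xf·W + b, the input gradient must be computed against the weights W, not the gradient buffer d_W, and the bias gradient (the column-wise sum of dY) was previously commented out. A minimal NumPy sketch of the corrected gradients (illustrative names):

    import numpy

    def affine_backward(dY, Xf, W):
        # Y = Xf @ W + b, with Xf: (nB, nIF), W: (nIF, nO), dY: (nB, nO)
        dXf = dY @ W.T        # previously read from d_W by mistake
        dW = Xf.T @ dY        # accumulated into d_W
        db = dY.sum(axis=0)   # was commented out, so b never trained
        return dXf, dW, db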
@@ -208,7 +211,6 @@ class PrecomputableMaxouts(Model):
         ascontiguous = self.ops.xp.ascontiguousarray
         Yfp = tensordot(X, self.W, axes=[[1], [3]])
-        Yfp += self.b
         def backward(dYp_ids, sgd=None):
             dYp, ids = dYp_ids
@@ -380,8 +382,6 @@ def reapply(layer, n_times):
     return wrap(reapply_fwd, layer)
 def asarray(ops, dtype):
     def forward(X, drop=0.):
         return ops.asarray(X, dtype=dtype), None

--- a/spacy/syntax/nn_parser.pxd
+++ b/spacy/syntax/nn_parser.pxd
@@ -16,5 +16,6 @@ cdef class Parser:
     cdef public object _multitasks
     cdef void _parseC(self, StateC* state,
-                      const float* feat_weights, const float* hW, const float* hb,
+                      const float* feat_weights, const float* bias,
+                      const float* hW, const float* hb,
                       int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil

--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -101,6 +101,7 @@ cdef class precompute_hiddens:
     cdef public object ops
    cdef np.ndarray _features
     cdef np.ndarray _cached
+    cdef np.ndarray bias
     cdef object _cuda_stream
     cdef object _bp_hiddens
@@ -118,6 +119,7 @@ cdef class precompute_hiddens:
         self.nO = cached.shape[2]
         self.nP = getattr(lower_model, 'nP', 1)
         self.ops = lower_model.ops
+        self.bias = lower_model.b
         self._is_synchronized = False
         self._cuda_stream = cuda_stream
         self._cached = cached
@@ -147,6 +149,7 @@ cdef class precompute_hiddens:
         sum_state_features(<float*>state_vector.data,
             feat_weights, &ids[0,0],
             token_ids.shape[0], self.nF, self.nO*self.nP)
+        state_vector += self.bias.ravel()
         state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
         def backward(d_state_vector, sgd=None):
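The bias has to be added here, after the nF cached rows are summed, rather than folded into the cache itself: each state vector is a sum over nF feature rows, so a bias baked into every row would be counted nF times. A sketch of the per-state computation (illustrative names; cached has shape (n_tokens, nF, nO*nP)):

    import numpy

    def state_vector_for(cached, ids, b):
        # ids: the nF token indices feeding one parser state
        summed = numpy.zeros(cached.shape[2])
        for f, token in enumerate(ids):
            summed += cached[token, f]
        return summed + b.ravel()   # bias added once, not once per feature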
@@ -161,14 +164,15 @@ cdef class precompute_hiddens:
     def _nonlinearity(self, state_vector):
         if self.nP == 1:
-            return state_vector, None
+            mask = state_vector >= 0.
+            return state_vector * mask, lambda dY, sgd=None: dY * mask
         state_vector = state_vector.reshape(
             (state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP))
         best, which = self.ops.maxout(state_vector)
-        def backprop(d_best, sgd=None):
-            return self.ops.backprop_maxout(d_best, which, self.nP)
-        return best, backprop
+        def backprop_maxout(d_best, sgd=None):
+            return self.ops.backprop_maxout(d_best, which, self.nP)
+        return best, backprop_maxout
 cdef void sum_state_features(float* output,
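The nP == 1 branch previously returned the summed state vector untouched, i.e. no nonlinearity at all. The replacement uses the standard ReLU mask trick: the same boolean mask zeroes the forward activations and gates the backward gradient. In isolation (a minimal sketch, not the spaCy API):

    import numpy

    def relu_with_backprop(state_vector):
        mask = state_vector >= 0.
        def backprop_relu(d_output, sgd=None):
            return d_output * mask   # gradient only flows through active units
        return state_vector * mask, backprop_relu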
@@ -425,18 +429,20 @@ cdef class Parser:
         hW = <float*>hidden_weights.data
         hb = <float*>hidden_bias.data
+        bias = <float*>state2vec.bias.data
         cdef int nr_hidden = hidden_weights.shape[0]
         cdef int nr_task = states.size()
         with nogil:
             for i in cython.parallel.prange(nr_task, num_threads=2,
                                             schedule='guided'):
                 self._parseC(states[i],
-                    feat_weights, hW, hb,
+                    feat_weights, bias, hW, hb,
                     nr_class, nr_hidden, nr_feat, nr_piece)
         return state_objs
     cdef void _parseC(self, StateC* state,
-                      const float* feat_weights, const float* hW, const float* hb,
+                      const float* feat_weights, const float* bias,
+                      const float* hW, const float* hb,
                       int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil:
         token_ids = <int*>calloc(nr_feat, sizeof(int))
         is_valid = <int*>calloc(nr_class, sizeof(int))
@@ -449,11 +455,13 @@ cdef class Parser:
         memset(scores, 0, nr_class * sizeof(float))
         sum_state_features(vectors,
             feat_weights, token_ids, 1, nr_feat, nr_hidden * nr_piece)
+        for i in range(nr_hidden * nr_piece):
+            vectors[i] += bias[i]
         V = vectors
         W = hW
         for i in range(nr_hidden):
             if nr_piece == 1:
-                feature = V[0]
+                feature = V[0] if V[0] >= 0. else 0.
             elif nr_piece == 2:
                 feature = V[0] if V[0] >= V[1] else V[1]
             else:
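With this hunk the nogil C path matches the Python path: add the bias to the summed feature vector, then apply ReLU for nr_piece == 1 or maxout otherwise. The equivalent logic in plain Python (a sketch with illustrative names):

    def hidden_features(vectors, bias, nr_hidden, nr_piece):
        # vectors: summed features, length nr_hidden * nr_piece
        v = [x + b for x, b in zip(vectors, bias)]   # vectors[i] += bias[i]
        features = []
        for i in range(nr_hidden):
            pieces = v[i * nr_piece:(i + 1) * nr_piece]
            if nr_piece == 1:
                features.append(pieces[0] if pieces[0] >= 0. else 0.)  # ReLU
            else:
                features.append(max(pieces))   # maxout over the pieces
        return features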