mirror of https://github.com/explosion/spaCy.git

Fix bias in parser

parent b54b4b8a97
commit b00d0a2c97

spacy/_ml.py | 10
@@ -148,6 +148,7 @@ class PrecomputableAffine(Model):
         # W: (i, fo)
         # Yf = numpy.einsum('bi,i_fo->b_fo', X, self.W)
         Yf = einsum('ab,bc->ac', X, self.W).reshape((nN, self.nF, self.nO))
+        #Yf = self.ops.xp.dot(X, self.W).reshape((nN, self.nF, self.nO))
         def backward(dY_ids, sgd=None):
             dY, ids = dY_ids
             nB = ids.shape[0]
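Note on the hunk above: `PrecomputableAffine` exists so the expensive matrix multiply runs once per batch of tokens rather than once per parser state. `Yf` holds, for every token, its contribution to each of the `nF` feature slots; a state later just gathers and sums rows. A minimal NumPy sketch of the idea (shapes follow the diff's naming; the gather helper is illustrative, not spaCy's API):

    import numpy as np

    def precompute(X, W, nF, nO):
        # X: (nN, nI) token vectors; W: (nI, nF * nO) weights.
        # One large matmul up front, reshaped so Yf[t, f] is token t's
        # contribution when it fills feature slot f.
        nN = X.shape[0]
        return np.dot(X, W).reshape((nN, nF, nO))

    def state_vector(Yf, ids, b):
        # ids: the nF token indices one parser state looks at.
        # Summing cached rows replaces a per-state matmul; the bias b
        # is added once, after the sum (the point of this commit).
        nF = len(ids)
        return Yf[ids, np.arange(nF)].sum(axis=0) + b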
@@ -155,12 +156,14 @@ class PrecomputableAffine(Model):
             Xf = Xf.reshape((nB, self.nIF))
 
             dW_re = self.d_W.reshape((self.nIF, self.nO))
-            W_re = self.d_W.reshape((self.nIF, self.nO))
+            W_re = self.W.reshape((self.nIF, self.nO))
             # bo,if_o->bif
             dXf = einsum('ab,cb->ac', dY, W_re)
+            #dXf = self.ops.xp.dot(dY, W_re.T)
             # b_if,bo->if_o
             einsum('ab,ac->bc', Xf, dY, out=dW_re)
-            # self.d_b += dY.sum(axis=0)
+            #self.ops.xp.dot(Xf.T, dY, out=dW_re)
+            self.d_b += dY.sum(axis=0)
 
             if sgd is not None:
                 sgd(self._mem.weights, self._mem.gradient, key=self.id)
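Two bugs are fixed in this backward pass: `W_re` was previously reshaped from the gradient buffer `self.d_W` instead of the weights `self.W`, and the bias gradient was commented out, so the bias never trained. A hedged NumPy restatement of the corrected gradients (buffer names mirror the diff; accumulation details simplified):

    import numpy as np

    def backward(dY, Xf, W, d_W, d_b):
        # dY: (nB, nO) gradient w.r.t. summed outputs; Xf: (nB, nIF) inputs.
        nIF, nO = Xf.shape[1], dY.shape[1]
        W_re = W.reshape((nIF, nO))              # read weights, not d_W
        dXf = np.einsum('ab,cb->ac', dY, W_re)   # == dY @ W_re.T
        d_W.reshape((nIF, nO))[...] = np.einsum('ab,ac->bc', Xf, dY)  # == Xf.T @ dY
        d_b += dY.sum(axis=0)                    # restored: bias now gets a gradient
        return dXf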
@@ -208,7 +211,6 @@ class PrecomputableMaxouts(Model):
         ascontiguous = self.ops.xp.ascontiguousarray
 
         Yfp = tensordot(X, self.W, axes=[[1], [3]])
-        Yfp += self.b
 
         def backward(dYp_ids, sgd=None):
             dYp, ids = dYp_ids
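Why `Yfp += self.b` has to go: the cached tensor is indexed per token and per feature slot, and each state sums `nF` of those rows. A bias baked into every cached row would be counted `nF` times after the sum; the later hunks instead add it exactly once per state vector. A tiny NumPy check of that arithmetic (made-up sizes):

    import numpy as np

    nF, nO = 10, 8
    b = np.ones(nO)
    rows = np.zeros((nF, nO))        # stand-in for one state's cached rows

    wrong = (rows + b).sum(axis=0)   # bias baked in: equals 10 * b
    right = rows.sum(axis=0) + b     # bias added after the sum: equals b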
@@ -380,8 +382,6 @@ def reapply(layer, n_times):
     return wrap(reapply_fwd, layer)
 
 
-
-
 def asarray(ops, dtype):
     def forward(X, drop=0.):
         return ops.asarray(X, dtype=dtype), None
@@ -16,5 +16,6 @@ cdef class Parser:
    cdef public object _multitasks
 
    cdef void _parseC(self, StateC* state,
-           const float* feat_weights, const float* hW, const float* hb,
+           const float* feat_weights, const float* bias,
+           const float* hW, const float* hb,
            int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil
@@ -101,6 +101,7 @@ cdef class precompute_hiddens:
     cdef public object ops
     cdef np.ndarray _features
     cdef np.ndarray _cached
+    cdef np.ndarray bias
     cdef object _cuda_stream
     cdef object _bp_hiddens
 
@@ -118,6 +119,7 @@ cdef class precompute_hiddens:
         self.nO = cached.shape[2]
         self.nP = getattr(lower_model, 'nP', 1)
         self.ops = lower_model.ops
+        self.bias = lower_model.b
         self._is_synchronized = False
         self._cuda_stream = cuda_stream
         self._cached = cached
@@ -147,6 +149,7 @@ cdef class precompute_hiddens:
         sum_state_features(<float*>state_vector.data,
             feat_weights, &ids[0,0],
             token_ids.shape[0], self.nF, self.nO*self.nP)
+        state_vector += self.bias.ravel()
         state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
 
         def backward(d_state_vector, sgd=None):
@@ -161,14 +164,15 @@ cdef class precompute_hiddens:
 
     def _nonlinearity(self, state_vector):
         if self.nP == 1:
-            return state_vector, None
+            mask = state_vector >= 0.
+            return state_vector * mask, lambda dY, sgd=None: dY * mask
         state_vector = state_vector.reshape(
             (state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP))
         best, which = self.ops.maxout(state_vector)
-        def backprop(d_best, sgd=None):
-            return self.ops.backprop_maxout(d_best, which, self.nP)
-        return best, backprop
+        def backprop_maxout(d_best, sgd=None):
+            return self.ops.backprop_maxout(d_best, which, self.nP)
+        return best, backprop_maxout
 
 
 cdef void sum_state_features(float* output,
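The `_nonlinearity` rewrite also fixes the single-piece path: it used to return the raw vector with `None` as its backprop, making the layer purely linear; now it applies a ReLU and returns a matching gradient mask. A sketch of both paths under the same callback contract (plain NumPy stand-ins for `ops.maxout` / `ops.backprop_maxout`; shapes assumed):

    import numpy as np

    def nonlinearity(state_vector, nP):
        if nP == 1:
            # ReLU: zero the negatives, mask the gradient identically.
            mask = state_vector >= 0.
            return state_vector * mask, lambda dY: dY * mask
        # Maxout: keep the best of nP pieces per hidden unit.
        pieces = state_vector.reshape(
            (state_vector.shape[0], state_vector.shape[1] // nP, nP))
        which = pieces.argmax(axis=-1)
        best = pieces.max(axis=-1)
        def backprop_maxout(d_best):
            # Route the gradient only to the winning piece.
            d = np.zeros_like(pieces)
            np.put_along_axis(d, which[..., None], d_best[..., None], axis=-1)
            return d.reshape(state_vector.shape)
        return best, backprop_maxout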
@@ -425,18 +429,20 @@ cdef class Parser:
 
         hW = <float*>hidden_weights.data
         hb = <float*>hidden_bias.data
+        bias = <float*>state2vec.bias.data
         cdef int nr_hidden = hidden_weights.shape[0]
         cdef int nr_task = states.size()
         with nogil:
             for i in cython.parallel.prange(nr_task, num_threads=2,
                                             schedule='guided'):
                 self._parseC(states[i],
-                    feat_weights, hW, hb,
+                    feat_weights, bias, hW, hb,
                     nr_class, nr_hidden, nr_feat, nr_piece)
         return state_objs
 
     cdef void _parseC(self, StateC* state,
-            const float* feat_weights, const float* hW, const float* hb,
+            const float* feat_weights, const float* bias,
+            const float* hW, const float* hb,
             int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil:
         token_ids = <int*>calloc(nr_feat, sizeof(int))
         is_valid = <int*>calloc(nr_class, sizeof(int))
@@ -449,11 +455,13 @@ cdef class Parser:
             memset(scores, 0, nr_class * sizeof(float))
             sum_state_features(vectors,
                 feat_weights, token_ids, 1, nr_feat, nr_hidden * nr_piece)
+            for i in range(nr_hidden * nr_piece):
+                vectors[i] += bias[i]
             V = vectors
             W = hW
             for i in range(nr_hidden):
                 if nr_piece == 1:
-                    feature = V[0]
+                    feature = V[0] if V[0] >= 0. else 0.
                 elif nr_piece == 2:
                     feature = V[0] if V[0] >= V[1] else V[1]
                 else:
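Restating the C loop in Python terms: per state, sum the cached rows, add the bias once, apply ReLU (`nr_piece == 1`) or maxout, then the output layer. A hedged sketch of one state's scoring (pure NumPy; `hW` assumed to be (nr_hidden, nr_class) and `hb` (nr_class,) — that weight layout is inferred, not confirmed by the diff):

    import numpy as np

    def score_state(feat_rows, bias, hW, hb, nr_piece):
        # feat_rows: (nF, nr_hidden * nr_piece) cached rows for one state.
        vector = feat_rows.sum(axis=0) + bias      # bias added once, after the sum
        pieces = vector.reshape((-1, nr_piece))
        if nr_piece == 1:
            hidden = np.maximum(pieces[:, 0], 0.)  # ReLU, as fixed in the loop
        else:
            hidden = pieces.max(axis=1)            # maxout over the pieces
        return hidden.dot(hW) + hb                 # class scores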