diff --git a/spacy/_ml.py b/spacy/_ml.py index ad6ef6361..2b82f3d9b 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -148,6 +148,7 @@ class PrecomputableAffine(Model): # W: (i, fo) # Yf = numpy.einsum('bi,i_fo->b_fo', X, self.W) Yf = einsum('ab,bc->ac', X, self.W).reshape((nN, self.nF, self.nO)) + #Yf = self.ops.xp.dot(X, self.W).reshape((nN, self.nF, self.nO)) def backward(dY_ids, sgd=None): dY, ids = dY_ids nB = ids.shape[0] @@ -155,12 +156,14 @@ class PrecomputableAffine(Model): Xf = Xf.reshape((nB, self.nIF)) dW_re = self.d_W.reshape((self.nIF, self.nO)) - W_re = self.d_W.reshape((self.nIF, self.nO)) + W_re = self.W.reshape((self.nIF, self.nO)) # bo,if_o->bif dXf = einsum('ab,cb->ac', dY, W_re) + #dXf = self.ops.xp.dot(dY, W_re.T) # b_if,bo->if_o einsum('ab,ac->bc', Xf, dY, out=dW_re) - # self.d_b += dY.sum(axis=0) + #self.ops.xp.dot(Xf.T, dY, out=dW_re) + self.d_b += dY.sum(axis=0) if sgd is not None: sgd(self._mem.weights, self._mem.gradient, key=self.id) @@ -208,7 +211,6 @@ class PrecomputableMaxouts(Model): ascontiguous = self.ops.xp.ascontiguousarray Yfp = tensordot(X, self.W, axes=[[1], [3]]) - Yfp += self.b def backward(dYp_ids, sgd=None): dYp, ids = dYp_ids @@ -380,8 +382,6 @@ def reapply(layer, n_times): return wrap(reapply_fwd, layer) - - def asarray(ops, dtype): def forward(X, drop=0.): return ops.asarray(X, dtype=dtype), None diff --git a/spacy/syntax/nn_parser.pxd b/spacy/syntax/nn_parser.pxd index 1d389609b..56615c6f1 100644 --- a/spacy/syntax/nn_parser.pxd +++ b/spacy/syntax/nn_parser.pxd @@ -16,5 +16,6 @@ cdef class Parser: cdef public object _multitasks cdef void _parseC(self, StateC* state, - const float* feat_weights, const float* hW, const float* hb, + const float* feat_weights, const float* bias, + const float* hW, const float* hb, int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 361e61a99..755c87369 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -101,6 +101,7 @@ cdef class precompute_hiddens: cdef public object ops cdef np.ndarray _features cdef np.ndarray _cached + cdef np.ndarray bias cdef object _cuda_stream cdef object _bp_hiddens @@ -118,6 +119,7 @@ cdef class precompute_hiddens: self.nO = cached.shape[2] self.nP = getattr(lower_model, 'nP', 1) self.ops = lower_model.ops + self.bias = lower_model.b self._is_synchronized = False self._cuda_stream = cuda_stream self._cached = cached @@ -147,6 +149,7 @@ cdef class precompute_hiddens: sum_state_features(state_vector.data, feat_weights, &ids[0,0], token_ids.shape[0], self.nF, self.nO*self.nP) + state_vector += self.bias.ravel() state_vector, bp_nonlinearity = self._nonlinearity(state_vector) def backward(d_state_vector, sgd=None): @@ -161,14 +164,15 @@ cdef class precompute_hiddens: def _nonlinearity(self, state_vector): if self.nP == 1: - return state_vector, None + mask = state_vector >= 0. + return state_vector * mask, lambda dY, sgd=None: dY * mask state_vector = state_vector.reshape( (state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP)) best, which = self.ops.maxout(state_vector) - def backprop(d_best, sgd=None): - return self.ops.backprop_maxout(d_best, which, self.nP) - return best, backprop + def backprop_maxout(d_best, sgd=None): + return self.ops.backprop_maxout(d_best, which, self.nP) + return best, backprop_maxout cdef void sum_state_features(float* output, @@ -425,18 +429,20 @@ cdef class Parser: hW = hidden_weights.data hb = hidden_bias.data + bias = state2vec.bias.data cdef int nr_hidden = hidden_weights.shape[0] cdef int nr_task = states.size() with nogil: for i in cython.parallel.prange(nr_task, num_threads=2, schedule='guided'): self._parseC(states[i], - feat_weights, hW, hb, + feat_weights, bias, hW, hb, nr_class, nr_hidden, nr_feat, nr_piece) return state_objs cdef void _parseC(self, StateC* state, - const float* feat_weights, const float* hW, const float* hb, + const float* feat_weights, const float* bias, + const float* hW, const float* hb, int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil: token_ids = calloc(nr_feat, sizeof(int)) is_valid = calloc(nr_class, sizeof(int)) @@ -449,11 +455,13 @@ cdef class Parser: memset(scores, 0, nr_class * sizeof(float)) sum_state_features(vectors, feat_weights, token_ids, 1, nr_feat, nr_hidden * nr_piece) + for i in range(nr_hidden * nr_piece): + vectors[i] += bias[i] V = vectors W = hW for i in range(nr_hidden): if nr_piece == 1: - feature = V[0] + feature = V[0] if V[0] >= 0. else 0. elif nr_piece == 2: feature = V[0] if V[0] >= V[1] else V[1] else: