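Fix bias in parser: add the lower-layer bias to the summed state features, restore the d_b gradient update, backprop through W instead of d_W, and apply ReLU when nP == 1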

2025-11-10 04:47:51 +03:00 · 2017-10-19 18:42:11 +02:00 · 2017-10-19 18:42:11 +02:00 · b00d0a2c97
commit b00d0a2c97
parent b54b4b8a97
3 changed files with 22 additions and 13 deletions
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -148,6 +148,7 @@ class PrecomputableAffine(Model):
        # W: (i, fo)
        # Yf = numpy.einsum('bi,i_fo->b_fo', X, self.W)
        Yf = einsum('ab,bc->ac', X, self.W).reshape((nN, self.nF, self.nO))
+        #Yf = self.ops.xp.dot(X, self.W).reshape((nN, self.nF, self.nO))
        def backward(dY_ids, sgd=None):
            dY, ids = dY_ids
            nB = ids.shape[0]
@@ -155,12 +156,14 @@ class PrecomputableAffine(Model):
            Xf = Xf.reshape((nB, self.nIF))

            dW_re = self.d_W.reshape((self.nIF, self.nO))
-            W_re = self.d_W.reshape((self.nIF, self.nO))
+            W_re = self.W.reshape((self.nIF, self.nO))
            # bo,if_o->bif
            dXf = einsum('ab,cb->ac', dY, W_re)
+            #dXf = self.ops.xp.dot(dY, W_re.T)
            # b_if,bo->if_o
            einsum('ab,ac->bc', Xf, dY, out=dW_re)
-            # self.d_b += dY.sum(axis=0)
+            #self.ops.xp.dot(Xf.T, dY, out=dW_re)
+            self.d_b += dY.sum(axis=0)

            if sgd is not None:
                sgd(self._mem.weights, self._mem.gradient, key=self.id)
@@ -208,7 +211,6 @@ class PrecomputableMaxouts(Model):
        ascontiguous = self.ops.xp.ascontiguousarray

        Yfp = tensordot(X, self.W, axes=[[1], [3]])
-        Yfp += self.b

        def backward(dYp_ids, sgd=None):
            dYp, ids = dYp_ids
@@ -380,8 +382,6 @@ def reapply(layer, n_times):
    return wrap(reapply_fwd, layer)


-
-
 def asarray(ops, dtype):
    def forward(X, drop=0.):
        return ops.asarray(X, dtype=dtype), None
--- a/spacy/syntax/nn_parser.pxd
+++ b/spacy/syntax/nn_parser.pxd
@@ -16,5 +16,6 @@ cdef class Parser:
    cdef public object _multitasks

    cdef void _parseC(self, StateC* state, 
-            const float* feat_weights, const float* hW, const float* hb,
+            const float* feat_weights, const float* bias,
+            const float* hW, const float* hb,
            int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -101,6 +101,7 @@ cdef class precompute_hiddens:
    cdef public object ops
    cdef np.ndarray _features
    cdef np.ndarray _cached
+    cdef np.ndarray bias
    cdef object _cuda_stream
    cdef object _bp_hiddens

@@ -118,6 +119,7 @@ cdef class precompute_hiddens:
        self.nO = cached.shape[2]
        self.nP = getattr(lower_model, 'nP', 1)
        self.ops = lower_model.ops
+        self.bias = lower_model.b
        self._is_synchronized = False
        self._cuda_stream = cuda_stream
        self._cached = cached
@@ -147,6 +149,7 @@ cdef class precompute_hiddens:
        sum_state_features(<float*>state_vector.data,
            feat_weights, &ids[0,0],
            token_ids.shape[0], self.nF, self.nO*self.nP)
+        state_vector += self.bias.ravel()
        state_vector, bp_nonlinearity = self._nonlinearity(state_vector)

        def backward(d_state_vector, sgd=None):
@@ -161,14 +164,15 @@ cdef class precompute_hiddens:

    def _nonlinearity(self, state_vector):
        if self.nP == 1:
-            return state_vector, None
+            mask = state_vector >= 0.
+            return state_vector * mask, lambda dY, sgd=None: dY * mask
        state_vector = state_vector.reshape(
            (state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP))
        best, which = self.ops.maxout(state_vector)
-        def backprop(d_best, sgd=None):
-            return self.ops.backprop_maxout(d_best, which, self.nP)
-        return best, backprop

+        def backprop_maxout(d_best, sgd=None):
+            return self.ops.backprop_maxout(d_best, which, self.nP)
+        return best, backprop_maxout


 cdef void sum_state_features(float* output,
@@ -425,18 +429,20 @@ cdef class Parser:

        hW = <float*>hidden_weights.data
        hb = <float*>hidden_bias.data
+        bias = <float*>state2vec.bias.data
        cdef int nr_hidden = hidden_weights.shape[0]
        cdef int nr_task = states.size()
        with nogil:
            for i in cython.parallel.prange(nr_task, num_threads=2,
                                            schedule='guided'):
                self._parseC(states[i],
-                    feat_weights, hW, hb,
+                    feat_weights, bias, hW, hb,
                    nr_class, nr_hidden, nr_feat, nr_piece)
        return state_objs

    cdef void _parseC(self, StateC* state, 
-            const float* feat_weights, const float* hW, const float* hb,
+            const float* feat_weights, const float* bias,
+            const float* hW, const float* hb,
            int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil:
        token_ids = <int*>calloc(nr_feat, sizeof(int))
        is_valid = <int*>calloc(nr_class, sizeof(int))
@@ -449,11 +455,13 @@ cdef class Parser:
            memset(scores, 0, nr_class * sizeof(float))
            sum_state_features(vectors,
                feat_weights, token_ids, 1, nr_feat, nr_hidden * nr_piece)
+            for i in range(nr_hidden * nr_piece):
+                vectors[i] += bias[i]
            V = vectors
            W = hW
            for i in range(nr_hidden):
                if nr_piece == 1:
-                    feature = V[0]
+                    feature = V[0] if V[0] >= 0. else 0.
                elif nr_piece == 2:
                    feature = V[0] if V[0] >= V[1] else V[1]
                else: