diff --git a/spacy/_ml.py b/spacy/_ml.py
index ad6ef6361..2b82f3d9b 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -148,6 +148,7 @@ class PrecomputableAffine(Model):
         # W: (i, fo)
         # Yf = numpy.einsum('bi,i_fo->b_fo', X, self.W)
         Yf = einsum('ab,bc->ac', X, self.W).reshape((nN, self.nF, self.nO))
+        # Equivalent: Yf = self.ops.xp.dot(X, self.W).reshape((nN, self.nF, self.nO))
         def backward(dY_ids, sgd=None):
             dY, ids = dY_ids
             nB = ids.shape[0]
@@ -155,12 +156,14 @@ class PrecomputableAffine(Model):
             Xf = Xf.reshape((nB, self.nIF))
 
             dW_re = self.d_W.reshape((self.nIF, self.nO))
-            W_re = self.d_W.reshape((self.nIF, self.nO))
+            W_re = self.W.reshape((self.nIF, self.nO))
             # bo,if_o->bif
             dXf = einsum('ab,cb->ac', dY, W_re)
+            # Equivalent: dXf = self.ops.xp.dot(dY, W_re.T)
             # b_if,bo->if_o
             einsum('ab,ac->bc', Xf, dY, out=dW_re)
-            # self.d_b += dY.sum(axis=0)
+            # Equivalent to the einsum above: self.ops.xp.dot(Xf.T, dY, out=dW_re)
+            self.d_b += dY.sum(axis=0)
 
             if sgd is not None:
                 sgd(self._mem.weights, self._mem.gradient, key=self.id)
@@ -208,7 +211,6 @@ class PrecomputableMaxouts(Model):
         ascontiguous = self.ops.xp.ascontiguousarray
 
         Yfp = tensordot(X, self.W, axes=[[1], [3]])
-        Yfp += self.b
 
         def backward(dYp_ids, sgd=None):
             dYp, ids = dYp_ids
@@ -380,8 +382,6 @@ def reapply(layer, n_times):
     return wrap(reapply_fwd, layer)
 
 
-
-
 def asarray(ops, dtype):
     def forward(X, drop=0.):
         return ops.asarray(X, dtype=dtype), None
diff --git a/spacy/syntax/nn_parser.pxd b/spacy/syntax/nn_parser.pxd
index 1d389609b..56615c6f1 100644
--- a/spacy/syntax/nn_parser.pxd
+++ b/spacy/syntax/nn_parser.pxd
@@ -16,5 +16,6 @@ cdef class Parser:
     cdef public object _multitasks
 
     cdef void _parseC(self, StateC* state, 
-            const float* feat_weights, const float* hW, const float* hb,
+            const float* feat_weights, const float* bias,  # bias: added after feature summation
+            const float* hW, const float* hb,  # keep in sync with the definition in nn_parser.pyx
             int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 361e61a99..755c87369 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -101,6 +101,7 @@ cdef class precompute_hiddens:
     cdef public object ops
     cdef np.ndarray _features
     cdef np.ndarray _cached
+    cdef np.ndarray bias
     cdef object _cuda_stream
     cdef object _bp_hiddens
 
@@ -118,6 +119,7 @@ cdef class precompute_hiddens:
         self.nO = cached.shape[2]
         self.nP = getattr(lower_model, 'nP', 1)
         self.ops = lower_model.ops
+        self.bias = lower_model.b
         self._is_synchronized = False
         self._cuda_stream = cuda_stream
         self._cached = cached
@@ -147,6 +149,7 @@ cdef class precompute_hiddens:
         sum_state_features(<float*>state_vector.data,
             feat_weights, &ids[0,0],
             token_ids.shape[0], self.nF, self.nO*self.nP)
+        state_vector += self.bias.ravel()
         state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
 
         def backward(d_state_vector, sgd=None):
@@ -161,14 +164,15 @@ cdef class precompute_hiddens:
 
     def _nonlinearity(self, state_vector):
         if self.nP == 1:
-            return state_vector, None
+            mask = state_vector >= 0.  # ReLU mask: True where the input is non-negative
+            return state_vector * mask, lambda dY, sgd=None: dY * mask
         state_vector = state_vector.reshape(
             (state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP))
         best, which = self.ops.maxout(state_vector)
-        def backprop(d_best, sgd=None):
-            return self.ops.backprop_maxout(d_best, which, self.nP)
-        return best, backprop
 
+        def backprop_maxout(d_best, sgd=None):  # closes over `which` from the maxout call
+            return self.ops.backprop_maxout(d_best, which, self.nP)
+        return best, backprop_maxout  # (output, backward callback) pair
 
 
 cdef void sum_state_features(float* output,
@@ -425,18 +429,20 @@ cdef class Parser:
 
         hW = <float*>hidden_weights.data
         hb = <float*>hidden_bias.data
+        bias = <float*>state2vec.bias.data
         cdef int nr_hidden = hidden_weights.shape[0]
         cdef int nr_task = states.size()
         with nogil:
             for i in cython.parallel.prange(nr_task, num_threads=2,
                                             schedule='guided'):
                 self._parseC(states[i],
-                    feat_weights, hW, hb,
+                    feat_weights, bias, hW, hb,
                     nr_class, nr_hidden, nr_feat, nr_piece)
         return state_objs
 
     cdef void _parseC(self, StateC* state, 
-            const float* feat_weights, const float* hW, const float* hb,
+            const float* feat_weights, const float* bias,
+            const float* hW, const float* hb,
             int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil:
         token_ids = <int*>calloc(nr_feat, sizeof(int))
         is_valid = <int*>calloc(nr_class, sizeof(int))
@@ -449,11 +455,13 @@ cdef class Parser:
             memset(scores, 0, nr_class * sizeof(float))
             sum_state_features(vectors,
                 feat_weights, token_ids, 1, nr_feat, nr_hidden * nr_piece)
+            for i in range(nr_hidden * nr_piece):
+                vectors[i] += bias[i]
             V = vectors
             W = hW
             for i in range(nr_hidden):
                 if nr_piece == 1:
-                    feature = V[0]
+                    feature = V[0] if V[0] >= 0. else 0.
                 elif nr_piece == 2:
                     feature = V[0] if V[0] >= V[1] else V[1]
                 else: