diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index a94b94e83..201b988b9 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -39,6 +39,7 @@ from preshed.maps cimport map_get
 from thinc.api import layerize, chain, noop, clone
 from thinc.neural import Model, Affine, ReLu, Maxout
 from thinc.neural._classes.selu import SELU
+from thinc.neural._classes.layernorm import LayerNorm
 from thinc.neural.ops import NumpyOps, CupyOps
 from thinc.neural.util import get_array_module
 
@@ -467,7 +468,7 @@ cdef class Parser:
             docs = [docs]
             golds = [golds]
         if USE_FINE_TUNE:
-            my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=0.)
+            my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
             my_tokvecs = self.model[0].ops.flatten(my_tokvecs)
             tokvecs += my_tokvecs
 
@@ -496,13 +497,13 @@ cdef class Parser:
             scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
 
             d_scores = self.get_batch_loss(states, golds, scores)
-            d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd)
+            d_vector = bp_scores(d_scores, sgd=sgd)
             if drop != 0:
                 d_vector *= mask
 
             if isinstance(self.model[0].ops, CupyOps) \
             and not isinstance(token_ids, state2vec.ops.xp.ndarray):
-                # Move token_ids and d_vector to CPU, asynchronously
+                # Move token_ids and d_vector to GPU, asynchronously
                 backprops.append((
                     get_async(cuda_stream, token_ids),
                     get_async(cuda_stream, d_vector),