diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx
index b2ecca689..0ff001523 100644
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@@ -213,8 +213,7 @@ class ParserModel(Model):
         self._layers[-1]._layers[-1] = larger
 
     def begin_training(self, X, y=None):
-        for layer in self._layers:
-            layer.begin_training(X, y=y)
+        self.lower.begin_training(X, y=y)
    
     @property
     def tok2vec(self):
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 3d6049513..5449dbbfb 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -525,8 +525,7 @@ cdef class Parser:
         else:
             if sgd is None:
                 sgd = self.create_optimizer()
-            self.model.begin_training(
-                self.model.ops.allocate((5, cfg['token_vector_width'])))
+            self.model.begin_training([])
         self.cfg.update(cfg)
         return sgd