diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index e32828527..191a41f67 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -221,7 +221,7 @@ def train(
                 (nlp.make_doc(rt["text"]) for rt in raw_text), size=8
             )
         words_seen = 0
-        with _create_progress_bar(n_train_words) as pbar:
+        with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
             losses = {}
             for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
                 if not batch:
diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx
index fd87aab2b..657e30f41 100644
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@@ -205,7 +205,9 @@ class ParserModel(Model):
             return
         smaller = self.upper
         larger = Affine(new_output, smaller.nI)
-        larger.W *= 0
+        # Set nan as value for unseen classes, to prevent prediction.
+        larger.W.fill(self.ops.xp.nan)
+        larger.b.fill(self.ops.xp.nan)
         # It seems very unhappy if I pass these as smaller.W?
         # Seems to segfault. Maybe it's a descriptor protocol thing?
         smaller_W = smaller.W
@@ -254,8 +256,23 @@ class ParserStepModel(Model):
         if mask is not None:
             vector *= mask
         scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop)
+        # We can have nans from unseen classes.
+        # For backprop purposes, we want to treat unseen classes as having the
+        # lowest score.
+        # numpy's nan_to_num function doesn't take a value, and nan is replaced
+        # by 0...-inf is replaced by minimum, so we go via that. Ugly to the max.
+        scores[self.ops.xp.isnan(scores)] = -self.ops.xp.inf
+        self.ops.xp.nan_to_num(scores, copy=False)
 
         def backprop_parser_step(d_scores, sgd=None):
+            # If we have a non-zero gradient for a previously unseen class,
+            # replace the weight with 0.
+            new_classes = self.ops.xp.logical_and(
+                self.vec2scores.ops.xp.isnan(self.vec2scores.b),
+                d_scores.any(axis=0)
+            )
+            self.vec2scores.b[new_classes] = 0.
+            self.vec2scores.W[new_classes] = 0.
             d_vector = get_d_vector(d_scores, sgd=sgd)
             if mask is not None:
                 d_vector *= mask
@@ -400,6 +417,8 @@ cdef class precompute_hiddens:
             state_vector, mask = self.ops.maxout(state_vector)
 
         def backprop_nonlinearity(d_best, sgd=None):
+            # Fix nans (which can occur from unseen classes.)
+            d_best[self.ops.xp.isnan(d_best)] = 0.
             if self.nP == 1:
                 d_best *= mask
                 d_best = d_best.reshape((d_best.shape + (1,)))
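
For readers skimming the patch, the core idea is: weights for output classes the model has never seen are initialised to NaN, NaN scores are squashed to the lowest representable value at prediction time, and a class's weights are reset to zero the first time it receives a non-zero gradient. Below is a minimal NumPy sketch of that pattern; the names (resize_output, score, on_gradient, W, b) are illustrative placeholders, not spaCy's actual API.

    # Minimal sketch of the NaN-for-unseen-classes trick in the patch above.
    # All names here are illustrative, not spaCy's API.
    import numpy as np

    def resize_output(W, b, n_new):
        # Grow the output layer; rows for unseen classes are filled with NaN
        # so they can never win an argmax until they receive a gradient.
        n_old, n_in = W.shape
        new_W = np.full((n_new, n_in), np.nan, dtype=W.dtype)
        new_b = np.full((n_new,), np.nan, dtype=b.dtype)
        new_W[:n_old] = W
        new_b[:n_old] = b
        return new_W, new_b

    def score(W, b, X):
        # Map NaN scores (unseen classes) to the lowest finite value,
        # mirroring the isnan -> -inf -> nan_to_num sequence in the diff.
        scores = X @ W.T + b
        scores[np.isnan(scores)] = -np.inf
        return np.nan_to_num(scores, copy=False)

    def on_gradient(W, b, d_scores):
        # The first time a previously unseen class gets a non-zero gradient,
        # replace its NaN weights with zeros so it can start learning.
        new_classes = np.logical_and(np.isnan(b), d_scores.any(axis=0))
        W[new_classes] = 0.0
        b[new_classes] = 0.0
        return W, b

    rng = np.random.default_rng(0)
    W, b = rng.normal(size=(3, 4)), np.zeros(3)
    W, b = resize_output(W, b, 5)           # add two unseen classes
    X = rng.normal(size=(2, 4))
    print(score(W, b, X).argmax(axis=1))    # never selects class 3 or 4

Filling with NaN rather than zero keeps a freshly added class from being predicted before it has received any updates, which is what the "to prevent prediction" comment in the resize hunk is getting at.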