💫 Prevent parser from predicting unseen classes (#3075)

The output layer often assigns negative scores to classes, especially
via the bias terms. This means that when we add a new class, we can't
rely on just zeroing its weights: a zeroed class scores 0.0, which
outranks the negative scores of the seen classes, so the parser ends up
predicting labels it has never been trained on.
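
For intuition, here is a minimal numpy sketch (illustrative only, not
spaCy code) of the failure mode: with the new class's weights and bias
zeroed, it scores exactly 0.0 and wins the argmax over classes whose
learned scores are negative.

import numpy

# Three classes; class 2 was just added, so its weight and bias are zeroed.
W = numpy.array([[-0.3, -0.8, 0.0]])   # shape (nI, nO), with nI == 1
b = numpy.array([-0.5, -0.2, 0.0])
x = numpy.ones((1, 1))                 # a single input feature

scores = x @ W + b                     # [[-0.8, -1.0, 0.0]]
print(scores.argmax())                 # 2 -- the unseen class comes out on top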

To solve this, we use nan values as the initial weights for new labels,
which prevents them from ever coming out on top: during the forward
pass, the resulting nan scores are mapped to the minimum representable
score. During backprop, we replace the nan weights with zeros for any
class that receives a gradient, so that we're still able to learn these
classes.
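
The diff below implements this inside the parser model. As a
self-contained reference, here is a hedged numpy-only sketch of the same
scheme (the names resize_output, predict, and on_gradient are
hypothetical, not spaCy's API):

import numpy

def resize_output(W, b, n_new):
    # Grow the output layer; new classes start as nan rather than zero.
    W2 = numpy.full((W.shape[0] + n_new, W.shape[1]), numpy.nan, dtype=W.dtype)
    b2 = numpy.full((b.shape[0] + n_new,), numpy.nan, dtype=b.dtype)
    W2[: W.shape[0]] = W
    b2[: b.shape[0]] = b
    return W2, b2

def predict(x, W, b):
    scores = x @ W.T + b
    # nan -> -inf -> dtype minimum: unseen classes always rank last.
    scores[numpy.isnan(scores)] = -numpy.inf
    return numpy.nan_to_num(scores)

def on_gradient(W, b, d_scores):
    # A class stops being "unseen" once it gets a nonzero gradient:
    # swap its nan weights for zeros so learning can proceed.
    new_classes = numpy.logical_and(numpy.isnan(b), d_scores.any(axis=0))
    W[new_classes] = 0.0
    b[new_classes] = 0.0
    return W, b

This mirrors the three touch points in the diff: resizing the output
layer, masking scores in the forward pass, and un-nanning weights in the
backprop callback.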
Matthew Honnibal, 2018-12-20 16:12:22 +01:00
parent 9ec9f89b99
commit f57bea8ab6
2 changed files with 21 additions and 2 deletions

@@ -221,7 +221,7 @@ def train(
                 (nlp.make_doc(rt["text"]) for rt in raw_text), size=8
             )
         words_seen = 0
-        with _create_progress_bar(n_train_words) as pbar:
+        with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
             losses = {}
             for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
                 if not batch:

@@ -205,7 +205,9 @@ class ParserModel(Model):
             return
         smaller = self.upper
         larger = Affine(new_output, smaller.nI)
-        larger.W *= 0
+        # Set nan as value for unseen classes, to prevent prediction.
+        larger.W.fill(self.ops.xp.nan)
+        larger.b.fill(self.ops.xp.nan)
         # It seems very unhappy if I pass these as smaller.W?
         # Seems to segfault. Maybe it's a descriptor protocol thing?
         smaller_W = smaller.W
@@ -254,8 +256,23 @@ class ParserStepModel(Model):
         if mask is not None:
             vector *= mask
         scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop)
+        # We can have nans from unseen classes. For backprop purposes, we
+        # want to treat unseen classes as having the lowest score.
+        # numpy's nan_to_num function doesn't take a replacement value: nan
+        # becomes 0, while -inf becomes the dtype minimum, so we go via
+        # -inf. Ugly to the max.
+        scores[self.ops.xp.isnan(scores)] = -self.ops.xp.inf
+        self.ops.xp.nan_to_num(scores, copy=False)

         def backprop_parser_step(d_scores, sgd=None):
+            # If we have a non-zero gradient for a previously unseen class,
+            # replace its nan weights with 0 so the class can be learned.
+            new_classes = self.ops.xp.logical_and(
+                self.vec2scores.ops.xp.isnan(self.vec2scores.b),
+                d_scores.any(axis=0),
+            )
+            self.vec2scores.b[new_classes] = 0.
+            self.vec2scores.W[new_classes] = 0.
             d_vector = get_d_vector(d_scores, sgd=sgd)
             if mask is not None:
                 d_vector *= mask
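
To make the nan_to_num detour above concrete, here is a small
standalone demo (it assumes numpy older than 1.17, which had no nan=
argument for nan_to_num; newer versions can replace nan directly):

import numpy

scores = numpy.array([0.5, numpy.nan, -1.2], dtype="f")
scores[numpy.isnan(scores)] = -numpy.inf  # route nan through -inf...
numpy.nan_to_num(scores, copy=False)      # ...which becomes the dtype minimum
print(scores)  # approx. [0.5, -3.4e+38, -1.2] -- the nan score can never win
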
@@ -400,6 +417,8 @@ cdef class precompute_hiddens:
             state_vector, mask = self.ops.maxout(state_vector)

         def backprop_nonlinearity(d_best, sgd=None):
+            # Fix nans (which can occur from unseen classes).
+            d_best[self.ops.xp.isnan(d_best)] = 0.
             if self.nP == 1:
                 d_best *= mask
                 d_best = d_best.reshape((d_best.shape + (1,)))