From f57bea8ab692a5e028874f9055d225ae13f7b17c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 20 Dec 2018 16:12:22 +0100
Subject: [PATCH] 💫 Prevent parser from predicting unseen classes (#3075)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The output weights often return negative scores for classes, especially
via the bias terms. This means that when we add a new class, we can't
rely on just zeroing the weights, or we'll end up with positive
predictions for those labels. To solve this, we use nan values as the
initial weights for new labels. This prevents them from ever coming out
on top. During backprop, we replace the nan values with the minimum
assigned score, so that we're still able to learn these classes.
---
 spacy/cli/train.py             |  2 +-
 spacy/syntax/_parser_model.pyx | 21 ++++++++++++++++++++-
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index e32828527..191a41f67 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -221,7 +221,7 @@ def train(
                 (nlp.make_doc(rt["text"]) for rt in raw_text), size=8
             )
         words_seen = 0
-        with _create_progress_bar(n_train_words) as pbar:
+        with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
             losses = {}
             for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
                 if not batch:
diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx
index fd87aab2b..657e30f41 100644
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@@ -205,7 +205,9 @@ class ParserModel(Model):
             return
         smaller = self.upper
         larger = Affine(new_output, smaller.nI)
-        larger.W *= 0
+        # Set nan as value for unseen classes, to prevent prediction.
+        larger.W.fill(self.ops.xp.nan)
+        larger.b.fill(self.ops.xp.nan)
         # It seems very unhappy if I pass these as smaller.W?
         # Seems to segfault. Maybe it's a descriptor protocol thing?
         smaller_W = smaller.W
@@ -254,8 +256,23 @@ class ParserStepModel(Model):
         if mask is not None:
             vector *= mask
         scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop)
+        # We can have nans from unseen classes.
+        # For backprop purposes, we want to treat unseen classes as having the
+        # lowest score.
+        # numpy's nan_to_num function doesn't take a value, and nan is replaced
+        # by 0...-inf is replaced by minimum, so we go via that. Ugly to the max.
+        scores[self.ops.xp.isnan(scores)] = -self.ops.xp.inf
+        self.ops.xp.nan_to_num(scores, copy=False)

         def backprop_parser_step(d_scores, sgd=None):
+            # If we have a non-zero gradient for a previously unseen class,
+            # replace the weight with 0.
+            new_classes = self.ops.xp.logical_and(
+                self.vec2scores.ops.xp.isnan(self.vec2scores.b),
+                d_scores.any(axis=0)
+            )
+            self.vec2scores.b[new_classes] = 0.
+            self.vec2scores.W[new_classes] = 0.
             d_vector = get_d_vector(d_scores, sgd=sgd)
             if mask is not None:
                 d_vector *= mask
@@ -400,6 +417,8 @@ cdef class precompute_hiddens:
             state_vector, mask = self.ops.maxout(state_vector)

         def backprop_nonlinearity(d_best, sgd=None):
+            # Fix nans (which can occur from unseen classes.)
+            d_best[self.ops.xp.isnan(d_best)] = 0.
             if self.nP == 1:
                 d_best *= mask
                 d_best = d_best.reshape((d_best.shape + (1,)))
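
To see the trick in isolation, the following is a minimal NumPy sketch of the
scheme the commit message describes, written outside spaCy's internals. The
helper names (resize_output, forward_scores, backprop_scores) and the shapes
used are illustrative assumptions, not spaCy's actual API.

    # Sketch only: illustrates nan-initialised weights for unseen classes.
    import numpy as np


    def resize_output(W, b, new_n_classes):
        """Grow the output layer; rows for unseen classes start out as nan."""
        n_old, n_in = W.shape
        W_new = np.full((new_n_classes, n_in), np.nan, dtype=W.dtype)
        b_new = np.full((new_n_classes,), np.nan, dtype=b.dtype)
        W_new[:n_old] = W
        b_new[:n_old] = b
        return W_new, b_new


    def forward_scores(W, b, X):
        """Compute class scores; unseen (nan) classes can never win the argmax."""
        scores = X @ W.T + b
        # nan_to_num maps -inf to the dtype's minimum, so go nan -> -inf -> min.
        scores[np.isnan(scores)] = -np.inf
        np.nan_to_num(scores, copy=False)
        return scores


    def backprop_scores(W, b, d_scores):
        """If an unseen class received a gradient, zero its weights so it can learn."""
        new_classes = np.logical_and(np.isnan(b), d_scores.any(axis=0))
        W[new_classes] = 0.0
        b[new_classes] = 0.0
        # ...the usual gradient update would follow here.


    # Usage: one trained class (index 0), one freshly added class (index 1).
    W = np.zeros((1, 4))
    b = np.zeros(1)
    W, b = resize_output(W, b, 2)
    X = np.random.default_rng(0).normal(size=(3, 4))
    print(forward_scores(W, b, X).argmax(axis=1))  # always 0: the new class never wins

Using nan rather than a large negative constant also makes unseen classes easy
to identify later: isnan on the bias terms picks out exactly the classes that
have never received an update, which is what the backprop step relies on.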