💫 Prevent parser from predicting unseen classes (#3075)

The output layer often assigns negative scores to classes, especially
via the bias terms. This means that when we add a new class, we can't
rely on just zeroing its weights: a score of zero would outrank the
negative scores of the existing classes, so we'd end up predicting the
new, unseen labels.

To solve this, we use NaN values as the initial weights for new labels.
This prevents them from ever coming out on top. For backprop purposes,
NaN scores are treated as the lowest possible score, and the NaN weights
of a new class are reset to zero once that class receives a non-zero
gradient, so that we're still able to learn these classes.
This commit is contained in:
Matthew Honnibal 2018-12-20 16:12:22 +01:00 committed by GitHub
parent 9ec9f89b99
commit f57bea8ab6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 21 additions and 2 deletions

View File

@ -221,7 +221,7 @@ def train(
(nlp.make_doc(rt["text"]) for rt in raw_text), size=8 (nlp.make_doc(rt["text"]) for rt in raw_text), size=8
) )
words_seen = 0 words_seen = 0
with _create_progress_bar(n_train_words) as pbar: with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
losses = {} losses = {}
for batch in util.minibatch_by_words(train_docs, size=batch_sizes): for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
if not batch: if not batch:

View File

@ -205,7 +205,9 @@ class ParserModel(Model):
return return
smaller = self.upper smaller = self.upper
larger = Affine(new_output, smaller.nI) larger = Affine(new_output, smaller.nI)
larger.W *= 0 # Set nan as value for unseen classes, to prevent prediction.
larger.W.fill(self.ops.xp.nan)
larger.b.fill(self.ops.xp.nan)
# It seems very unhappy if I pass these as smaller.W? # It seems very unhappy if I pass these as smaller.W?
# Seems to segfault. Maybe it's a descriptor protocol thing? # Seems to segfault. Maybe it's a descriptor protocol thing?
smaller_W = smaller.W smaller_W = smaller.W
@ -254,8 +256,23 @@ class ParserStepModel(Model):
if mask is not None: if mask is not None:
vector *= mask vector *= mask
scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop) scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop)
# We can have nans from unseen classes.
# For backprop purposes, we want to treat unseen classes as having the
# lowest score.
# numpy's nan_to_num function doesn't take a value, and nan is replaced
# by 0...-inf is replaced by minimum, so we go via that. Ugly to the max.
scores[self.ops.xp.isnan(scores)] = -self.ops.xp.inf
self.ops.xp.nan_to_num(scores, copy=False)
def backprop_parser_step(d_scores, sgd=None): def backprop_parser_step(d_scores, sgd=None):
# If we have a non-zero gradient for a previously unseen class,
# replace the weight with 0.
new_classes = self.ops.xp.logical_and(
self.vec2scores.ops.xp.isnan(self.vec2scores.b),
d_scores.any(axis=0)
)
self.vec2scores.b[new_classes] = 0.
self.vec2scores.W[new_classes] = 0.
d_vector = get_d_vector(d_scores, sgd=sgd) d_vector = get_d_vector(d_scores, sgd=sgd)
if mask is not None: if mask is not None:
d_vector *= mask d_vector *= mask
@ -400,6 +417,8 @@ cdef class precompute_hiddens:
state_vector, mask = self.ops.maxout(state_vector) state_vector, mask = self.ops.maxout(state_vector)
def backprop_nonlinearity(d_best, sgd=None): def backprop_nonlinearity(d_best, sgd=None):
# Fix nans (which can occur from unseen classes.)
d_best[self.ops.xp.isnan(d_best)] = 0.
if self.nP == 1: if self.nP == 1:
d_best *= mask d_best *= mask
d_best = d_best.reshape((d_best.shape + (1,))) d_best = d_best.reshape((d_best.shape + (1,)))