Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-11 17:56:30 +03:00)
💫 Prevent parser from predicting unseen classes (#3075)
The output weights often return negative scores for classes, especially via the bias terms. This means that when we add a new class, we can't rely on just zeroing the weights, or we'll end up with positive predictions for those labels. To solve this, we use nan values as the initial weights for new labels. This prevents them from ever coming out on top. During backprop, we replace the nan values with the minimum assigned score, so that we're still able to learn these classes.
Parent: 9ec9f89b99
Commit: f57bea8ab6
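To make the idea concrete, here is a minimal numpy-only sketch of the trick described above (the names and shapes are illustrative, not spaCy's internals): rows for newly added classes are initialised to NaN so they can never produce the top score, NaN scores are pushed down to the most negative finite value at prediction time, and the NaN weights are reset to zero the first time the class receives a non-zero gradient.

```python
import numpy as np

# Illustrative numpy sketch of the commit's approach; not spaCy's actual API.
nF = 4                                          # number of input features
W = np.random.randn(3, nF).astype("f")          # weights for 3 seen classes
b = np.array([-1.0, -2.0, -0.5], dtype="f")     # learned biases are often negative

# Grow the output layer by one class. A zero-initialised row would score 0 and
# beat every negative-scoring class, so initialise the new row with NaN instead.
W = np.vstack([W, np.full((1, nF), np.nan, dtype="f")])
b = np.append(b, np.float32(np.nan))

x = np.random.randn(2, nF).astype("f")
scores = x @ W.T + b
# NaN scores must never come out on top: push them below every real score.
scores[np.isnan(scores)] = -np.inf
np.nan_to_num(scores, copy=False)               # -inf -> most negative finite float
assert (scores.argmax(axis=1) != 3).all()       # the unseen class is never predicted

# During backprop: the first time the new class gets a non-zero gradient,
# replace its NaN weights with 0 so it can start learning.
d_scores = np.zeros_like(scores)
d_scores[:, 3] = 1.0
new_classes = np.logical_and(np.isnan(b), d_scores.any(axis=0))
b[new_classes] = 0.0
W[new_classes] = 0.0
```

With zero-initialised rows instead of NaN, the new class would score 0 and beat every class whose learned score is negative, which is exactly the failure mode the commit message describes.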
@@ -221,7 +221,7 @@ def train(
                 (nlp.make_doc(rt["text"]) for rt in raw_text), size=8
             )
         words_seen = 0
-        with _create_progress_bar(n_train_words) as pbar:
+        with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
             losses = {}
             for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
                 if not batch:
@@ -205,7 +205,9 @@ class ParserModel(Model):
             return
         smaller = self.upper
         larger = Affine(new_output, smaller.nI)
-        larger.W *= 0
+        # Set nan as value for unseen classes, to prevent prediction.
+        larger.W.fill(self.ops.xp.nan)
+        larger.b.fill(self.ops.xp.nan)
         # It seems very unhappy if I pass these as smaller.W?
         # Seems to segfault. Maybe it's a descriptor protocol thing?
         smaller_W = smaller.W
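A rough sketch of that resize pattern with plain numpy arrays (the helper name is hypothetical; the real code works on thinc's `Affine` layer via `larger.W`/`larger.b`): allocate the larger weights pre-filled with NaN, then copy the trained rows across, so only genuinely new classes stay NaN.

```python
import numpy as np

def resize_output_weights(W, b, new_n_classes):
    # Hypothetical helper, not spaCy's API: grow an output layer so that rows
    # for brand-new classes are NaN and therefore can never win the argmax.
    old_n_classes, n_in = W.shape
    larger_W = np.full((new_n_classes, n_in), np.nan, dtype=W.dtype)
    larger_b = np.full((new_n_classes,), np.nan, dtype=b.dtype)
    larger_W[:old_n_classes] = W   # keep trained weights for existing classes
    larger_b[:old_n_classes] = b
    return larger_W, larger_b
```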
@@ -254,8 +256,23 @@ class ParserStepModel(Model):
         if mask is not None:
             vector *= mask
         scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop)
+        # We can have nans from unseen classes.
+        # For backprop purposes, we want to treat unseen classes as having the
+        # lowest score.
+        # numpy's nan_to_num function doesn't take a value, and nan is replaced
+        # by 0...-inf is replaced by minimum, so we go via that. Ugly to the max.
+        scores[self.ops.xp.isnan(scores)] = -self.ops.xp.inf
+        self.ops.xp.nan_to_num(scores, copy=False)
 
         def backprop_parser_step(d_scores, sgd=None):
+            # If we have a non-zero gradient for a previously unseen class,
+            # replace the weight with 0.
+            new_classes = self.ops.xp.logical_and(
+                self.vec2scores.ops.xp.isnan(self.vec2scores.b),
+                d_scores.any(axis=0)
+            )
+            self.vec2scores.b[new_classes] = 0.
+            self.vec2scores.W[new_classes] = 0.
             d_vector = get_d_vector(d_scores, sgd=sgd)
             if mask is not None:
                 d_vector *= mask
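The `nan_to_num` comment is the crux of the forward pass: the numpy version targeted here has no argument for choosing the replacement value, so NaN is first forced to `-inf`, which `nan_to_num` then maps to the most negative finite float. A small self-contained check of that behaviour (plain numpy, outside spaCy):

```python
import numpy as np

scores = np.array([[0.5, np.nan, -0.2]], dtype="f")
# Unseen classes produce NaN scores; force them to -inf first...
scores[np.isnan(scores)] = -np.inf
# ...then nan_to_num turns -inf into the most negative finite float32,
# so the unseen class can never win the argmax.
np.nan_to_num(scores, copy=False)
print(scores[0, 1] == np.finfo(np.float32).min)  # True
print(scores.argmax(axis=1))                     # [0]
```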
@@ -400,6 +417,8 @@ cdef class precompute_hiddens:
             state_vector, mask = self.ops.maxout(state_vector)
 
         def backprop_nonlinearity(d_best, sgd=None):
+            # Fix nans (which can occur from unseen classes.)
+            d_best[self.ops.xp.isnan(d_best)] = 0.
             if self.nP == 1:
                 d_best *= mask
                 d_best = d_best.reshape((d_best.shape + (1,)))
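Classes that never received a gradient keep their NaN weights, and `0 * NaN` is still NaN, so NaN entries can appear in the gradient flowing back through the non-linearity; zeroing them keeps the rest of the update finite. A minimal illustration with a plain numpy array (the real code operates on the parser's `d_best`):

```python
import numpy as np

d_best = np.array([[0.1, np.nan, -0.3]], dtype="f")
# Gradient entries coming from NaN-scored (unseen) classes are NaN; zero them
# so they do not propagate into the lower layers' weights.
d_best[np.isnan(d_best)] = 0.0
assert np.isfinite(d_best).all()
```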