mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Don't normalize gradients
This commit is contained in:
parent
8177f25b6c
commit
a4164f67ca
|
@ -334,7 +334,7 @@ class Tagger(Pipe):
|
|||
losses[self.name] += (gradient**2).sum()
|
||||
|
||||
def get_loss(self, examples, scores):
|
||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels)
|
||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
||||
truths = [eg.get_aligned("tag", as_string=True) for eg in examples]
|
||||
d_scores, loss = loss_func(scores, truths)
|
||||
if self.model.ops.xp.isnan(loss):
|
||||
|
|
|
@ -65,7 +65,6 @@ cdef class Parser:
|
|||
self.set_output(self.moves.n_moves)
|
||||
self.cfg = dict(cfg)
|
||||
self.cfg.setdefault("update_with_oracle_cut_size", 100)
|
||||
self.cfg.setdefault("normalize_gradients_with_batch_size", True)
|
||||
self._multitasks = []
|
||||
for multitask in cfg.get("multitasks", []):
|
||||
self.add_multitask_objective(multitask)
|
||||
|
@ -300,17 +299,10 @@ cdef class Parser:
|
|||
states, golds = zip(*states_golds)
|
||||
scores, backprop = model.begin_update(states)
|
||||
d_scores = self.get_batch_loss(states, golds, scores, losses)
|
||||
if self.cfg["normalize_gradients_with_batch_size"]:
|
||||
# We have to be very careful how we do this, because of the way we
|
||||
# cut up the batch. We subdivide long sequences. If we normalize
|
||||
# naively, we end up normalizing by sequence length, which
|
||||
# is bad: that would mean that states in long sequences
|
||||
# consistently get smaller gradients. Imagine if we have two
|
||||
# sequences, one length 1000, one length 20. If we cut up
|
||||
# the 1k sequence so that we have a "batch" of 50 subsequences,
|
||||
# we don't want the gradients to get 50 times smaller!
|
||||
d_scores /= n_examples
|
||||
|
||||
# Note that the gradient isn't normalized by the batch size
|
||||
# here, because our "samples" are really the states...But we
|
||||
# can't normalize by the number of states either, as then we'd
|
||||
# be getting smaller gradients for states in long sequences.
|
||||
backprop(d_scores)
|
||||
# Follow the predicted action
|
||||
self.transition_states(states, scores)
|
||||
|
@ -408,6 +400,7 @@ cdef class Parser:
|
|||
cpu_log_loss(c_d_scores,
|
||||
costs, is_valid, &scores[i, 0], d_scores.shape[1])
|
||||
c_d_scores += d_scores.shape[1]
|
||||
# Note that we don't normalize this. See comment in update() for why.
|
||||
if losses is not None:
|
||||
losses.setdefault(self.name, 0.)
|
||||
losses[self.name] += (d_scores**2).sum()
|
||||
|
|
Loading…
Reference in New Issue
Block a user