mirror of
https://github.com/explosion/spaCy.git
synced 2025-05-31 19:23:05 +03:00
Don't normalize gradients
This commit is contained in:
parent
8177f25b6c
commit
a4164f67ca
|
@ -334,7 +334,7 @@ class Tagger(Pipe):
|
||||||
losses[self.name] += (gradient**2).sum()
|
losses[self.name] += (gradient**2).sum()
|
||||||
|
|
||||||
def get_loss(self, examples, scores):
|
def get_loss(self, examples, scores):
|
||||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels)
|
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
||||||
truths = [eg.get_aligned("tag", as_string=True) for eg in examples]
|
truths = [eg.get_aligned("tag", as_string=True) for eg in examples]
|
||||||
d_scores, loss = loss_func(scores, truths)
|
d_scores, loss = loss_func(scores, truths)
|
||||||
if self.model.ops.xp.isnan(loss):
|
if self.model.ops.xp.isnan(loss):
|
||||||
|
|
|
@ -65,7 +65,6 @@ cdef class Parser:
|
||||||
self.set_output(self.moves.n_moves)
|
self.set_output(self.moves.n_moves)
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
self.cfg.setdefault("update_with_oracle_cut_size", 100)
|
self.cfg.setdefault("update_with_oracle_cut_size", 100)
|
||||||
self.cfg.setdefault("normalize_gradients_with_batch_size", True)
|
|
||||||
self._multitasks = []
|
self._multitasks = []
|
||||||
for multitask in cfg.get("multitasks", []):
|
for multitask in cfg.get("multitasks", []):
|
||||||
self.add_multitask_objective(multitask)
|
self.add_multitask_objective(multitask)
|
||||||
|
@ -300,17 +299,10 @@ cdef class Parser:
|
||||||
states, golds = zip(*states_golds)
|
states, golds = zip(*states_golds)
|
||||||
scores, backprop = model.begin_update(states)
|
scores, backprop = model.begin_update(states)
|
||||||
d_scores = self.get_batch_loss(states, golds, scores, losses)
|
d_scores = self.get_batch_loss(states, golds, scores, losses)
|
||||||
if self.cfg["normalize_gradients_with_batch_size"]:
|
# Note that the gradient isn't normalized by the batch size
|
||||||
# We have to be very careful how we do this, because of the way we
|
# here, because our "samples" are really the states...But we
|
||||||
# cut up the batch. We subdivide long sequences. If we normalize
|
# can't normalize by the number of states either, as then we'd
|
||||||
# naively, we end up normalizing by sequence length, which
|
# be getting smaller gradients for states in long sequences.
|
||||||
# is bad: that would mean that states in long sequences
|
|
||||||
# consistently get smaller gradients. Imagine if we have two
|
|
||||||
# sequences, one length 1000, one length 20. If we cut up
|
|
||||||
# the 1k sequence so that we have a "batch" of 50 subsequences,
|
|
||||||
# we don't want the gradients to get 50 times smaller!
|
|
||||||
d_scores /= n_examples
|
|
||||||
|
|
||||||
backprop(d_scores)
|
backprop(d_scores)
|
||||||
# Follow the predicted action
|
# Follow the predicted action
|
||||||
self.transition_states(states, scores)
|
self.transition_states(states, scores)
|
||||||
|
@ -408,6 +400,7 @@ cdef class Parser:
|
||||||
cpu_log_loss(c_d_scores,
|
cpu_log_loss(c_d_scores,
|
||||||
costs, is_valid, &scores[i, 0], d_scores.shape[1])
|
costs, is_valid, &scores[i, 0], d_scores.shape[1])
|
||||||
c_d_scores += d_scores.shape[1]
|
c_d_scores += d_scores.shape[1]
|
||||||
|
# Note that we don't normalize this. See comment in update() for why.
|
||||||
if losses is not None:
|
if losses is not None:
|
||||||
losses.setdefault(self.name, 0.)
|
losses.setdefault(self.name, 0.)
|
||||||
losses[self.name] += (d_scores**2).sum()
|
losses[self.name] += (d_scores**2).sum()
|
||||||
|
|
Loading…
Reference in New Issue
Block a user