From a4164f67cac6388b16707e6c7dcc9100cd8926e7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 7 Jul 2020 17:21:58 +0200 Subject: [PATCH] Don't normalize gradients --- spacy/pipeline/pipes.pyx | 2 +- spacy/syntax/nn_parser.pyx | 17 +++++------------ 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 61cf155a2..2b147785e 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -334,7 +334,7 @@ class Tagger(Pipe): losses[self.name] += (gradient**2).sum() def get_loss(self, examples, scores): - loss_func = SequenceCategoricalCrossentropy(names=self.labels) + loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False) truths = [eg.get_aligned("tag", as_string=True) for eg in examples] d_scores, loss = loss_func(scores, truths) if self.model.ops.xp.isnan(loss): diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 1732805a9..19d424823 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -65,7 +65,6 @@ cdef class Parser: self.set_output(self.moves.n_moves) self.cfg = dict(cfg) self.cfg.setdefault("update_with_oracle_cut_size", 100) - self.cfg.setdefault("normalize_gradients_with_batch_size", True) self._multitasks = [] for multitask in cfg.get("multitasks", []): self.add_multitask_objective(multitask) @@ -300,17 +299,10 @@ cdef class Parser: states, golds = zip(*states_golds) scores, backprop = model.begin_update(states) d_scores = self.get_batch_loss(states, golds, scores, losses) - if self.cfg["normalize_gradients_with_batch_size"]: - # We have to be very careful how we do this, because of the way we - # cut up the batch. We subdivide long sequences. If we normalize - # naively, we end up normalizing by sequence length, which - # is bad: that would mean that states in long sequences - # consistently get smaller gradients. Imagine if we have two - # sequences, one length 1000, one length 20. 
If we cut up - # the 1k sequence so that we have a "batch" of 50 subsequences, - # we don't want the gradients to get 50 times smaller! - d_scores /= n_examples - + # Note that the gradient isn't normalized by the batch size + # here, because our "samples" are really the states... But we + # can't normalize by the number of states either, as then we'd + # be getting smaller gradients for states in long sequences. backprop(d_scores) # Follow the predicted action self.transition_states(states, scores) @@ -408,6 +400,7 @@ cdef class Parser: cpu_log_loss(c_d_scores, costs, is_valid, &scores[i, 0], d_scores.shape[1]) c_d_scores += d_scores.shape[1] + # Note that we don't normalize this. See comment in update() for why. if losses is not None: losses.setdefault(self.name, 0.) losses[self.name] += (d_scores**2).sum()