From b68216e2631c18cee1baa43c85eaad5c4c925d54 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 10 Jul 2020 22:35:20 +0200
Subject: [PATCH] Explicitly delete objects after parser.update to free GPU
 memory (#5748)

* Try explicitly deleting objects

* Refactor parser model backprop slightly

* Free parser data explicitly after rehearse and update
---
 spacy/syntax/_parser_model.pyx | 31 +++++++++++++++++++++----------
 spacy/syntax/nn_parser.pyx     | 19 +++++++++++++++++--
 2 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx
index 42baa737b..7acee5efd 100644
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@@ -245,6 +245,13 @@ class ParserStepModel(Model):
         for class_ in unseen_classes:
             self._class_mask[class_] = 0.
 
+    def clear_memory(self):
+        del self.tokvecs
+        del self.bp_tokvecs
+        del self.state2vec
+        del self.backprops
+        del self._class_mask
+
     @property
     def nO(self):
         if self.attrs["has_upper"]:
@@ -273,6 +280,19 @@ class ParserStepModel(Model):
                 c_ids += ids.shape[1]
         return ids
 
+    def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
+        if isinstance(self.state2vec.ops, CupyOps) \
+        and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
+            # Move token_ids and d_vector to GPU, asynchronously
+            self.backprops.append((
+                util.get_async(self.cuda_stream, token_ids),
+                util.get_async(self.cuda_stream, d_vector),
+                get_d_tokvecs
+            ))
+        else:
+            self.backprops.append((token_ids, d_vector, get_d_tokvecs))
+
+
     def finish_steps(self, golds):
         # Add a padding vector to the d_tokvecs gradient, so that missing
         # values don't affect the real gradient.
@@ -315,16 +335,7 @@ def step_forward(model: ParserStepModel, states, is_train):
         d_vector = get_d_vector(d_scores)
         if mask is not None:
             d_vector *= mask
-        if isinstance(model.state2vec.ops, CupyOps) \
-        and not isinstance(token_ids, model.state2vec.ops.xp.ndarray):
-            # Move token_ids and d_vector to GPU, asynchronously
-            model.backprops.append((
-                util.get_async(model.cuda_stream, token_ids),
-                util.get_async(model.cuda_stream, d_vector),
-                get_d_tokvecs
-            ))
-        else:
-            model.backprops.append((token_ids, d_vector, get_d_tokvecs))
+        model.backprop_step(token_ids, d_vector, get_d_tokvecs)
         return None
     return scores, backprop_parser_step
 
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 043d8d681..591afe5ab 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -200,6 +200,8 @@ cdef class Parser:
             with nogil:
                 self._parseC(&states[0],
                     weights, sizes)
+        model.clear_memory()
+        del model
         return batch
 
     cdef void _parseC(self, StateC** states,
@@ -312,6 +314,13 @@ cdef class Parser:
         if set_annotations:
            docs = [eg.predicted for eg in examples]
            self.set_annotations(docs, all_states)
+        # Ugh, this is annoying. If we're working on GPU, we want to free the
+        # memory ASAP. It seems that Python doesn't necessarily get around to
+        # removing these in time if we don't explicitly delete? It's confusing.
+        del backprop
+        del backprop_tok2vec
+        model.clear_memory()
+        del model
         return losses
 
     def rehearse(self, examples, sgd=None, losses=None, **cfg):
@@ -335,7 +344,7 @@ cdef class Parser:
         set_dropout_rate(self._rehearsal_model, 0.0)
         set_dropout_rate(self.model, 0.0)
         tutor, _ = self._rehearsal_model.begin_update(docs)
-        model, finish_update = self.model.begin_update(docs)
+        model, backprop_tok2vec = self.model.begin_update(docs)
         n_scores = 0.
         loss = 0.
         while states:
@@ -351,10 +360,16 @@ cdef class Parser:
             states = [state for state in states if not state.is_final()]
             n_scores += d_scores.size
         # Do the backprop
-        finish_update(docs)
+        backprop_tok2vec(docs)
         if sgd is not None:
             self.model.finish_update(sgd)
         losses[self.name] += loss / n_scores
+        del backprop
+        del backprop_tok2vec
+        model.clear_memory()
+        tutor.clear_memory()
+        del model
+        del tutor
         return losses
 
     def get_gradients(self):
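
Note (not part of the commit): a minimal sketch of why the explicit `del`
calls above matter when training on GPU. CuPy arrays return device memory
to CuPy's memory pool only once the Python objects holding them are
deallocated, so dropping references to the step model and backprop closures
right after each update keeps peak GPU usage down. The training-step
wrapper below is hypothetical; `parser.update` and
`cupy.get_default_memory_pool` are the real APIs, and the pool flush is an
optional extra, not something this patch does.

    import cupy

    def training_step(parser, examples, optimizer):
        """Run one parser update, assuming spaCy is in GPU mode."""
        losses = {}
        # parser.update now clears its step model and deletes the backprop
        # closures internally (see nn_parser.pyx above), so the intermediate
        # activations are unreferenced by the time it returns.
        parser.update(examples, sgd=optimizer, losses=losses)
        # Optionally hand cached, now-unused blocks back to the device
        # between steps, e.g. when sharing the GPU with other processes.
        cupy.get_default_memory_pool().free_all_blocks()
        return losses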