Explicitly delete objects after parser.update to free GPU memory (#5748)

* Try explicitly deleting objects
* Refactor parser model backprop slightly
* Free parser data explicitly after rehearse and update
2025-07-16 11:12:25 +03:00 · 2020-07-10 22:35:20 +02:00 · 2020-07-10 22:35:20 +02:00 · b68216e263
commit b68216e263
parent de6a32315c
2 changed files with 38 additions and 12 deletions
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@ -245,6 +245,13 @@ class ParserStepModel(Model):
            for class_ in unseen_classes:
                self._class_mask[class_] = 0.

+    def clear_memory(self):
+        # Remove the `tokvecs`, `bp_tokvecs`, `state2vec`, `backprops` and
+        # `_class_mask` attributes from this instance so that the objects they
+        # reference are no longer kept alive through it. The commit message
+        # gives freeing GPU memory as the motivation.
+        # NOTE(review): `del self.attr` presumably raises AttributeError when
+        # the attribute is already gone, so this looks safe to call only once
+        # per instance - confirm against callers.
+        del self.tokvecs
+        del self.bp_tokvecs
+        del self.state2vec
+        del self.backprops
+        del self._class_mask
+
    @property
    def nO(self):
        if self.attrs["has_upper"]:
@ -273,6 +280,19 @@ class ParserStepModel(Model):
            c_ids += ids.shape[1]
        return ids

+    def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
+        # Queue one (token_ids, d_vector, get_d_tokvecs) tuple on
+        # `self.backprops`. Nothing is returned; the only effect is the append.
+        #
+        # When `self.state2vec.ops` is a CupyOps and `token_ids` is not already
+        # an instance of that ops' `xp.ndarray`, `token_ids` and `d_vector` are
+        # first passed through `util.get_async` with `self.cuda_stream`;
+        # otherwise both are stored unchanged. `get_d_tokvecs` is stored as-is
+        # in either case.
+        if isinstance(self.state2vec.ops, CupyOps) \
+        and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
+            # Move token_ids and d_vector to GPU, asynchronously
+            self.backprops.append((
+                util.get_async(self.cuda_stream, token_ids),
+                util.get_async(self.cuda_stream, d_vector),
+                get_d_tokvecs
+            ))
+        else:
+            self.backprops.append((token_ids, d_vector, get_d_tokvecs))
+
+
    def finish_steps(self, golds):
        # Add a padding vector to the d_tokvecs gradient, so that missing
        # values don't affect the real gradient.
@ -315,16 +335,7 @@ def step_forward(model: ParserStepModel, states, is_train):
        d_vector = get_d_vector(d_scores)
        if mask is not None:
            d_vector *= mask
-        if isinstance(model.state2vec.ops, CupyOps) \
-        and not isinstance(token_ids, model.state2vec.ops.xp.ndarray):
-            # Move token_ids and d_vector to GPU, asynchronously
-            model.backprops.append((
-                util.get_async(model.cuda_stream, token_ids),
-                util.get_async(model.cuda_stream, d_vector),
-                get_d_tokvecs
-            ))
-        else:
-            model.backprops.append((token_ids, d_vector, get_d_tokvecs))
+        model.backprop_step(token_ids, d_vector, get_d_tokvecs)
        return None
    return scores, backprop_parser_step

--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@ -200,6 +200,8 @@ cdef class Parser:
        with nogil:
            self._parseC(&states[0],
                weights, sizes)
+        model.clear_memory()
+        del model
        return batch

    cdef void _parseC(self, StateC** states,
@ -312,6 +314,13 @@ cdef class Parser:
        if set_annotations:
            docs = [eg.predicted for eg in examples]
            self.set_annotations(docs, all_states)
+        # If we're working on GPU, we want to free the memory as soon as
+        # possible. It seems that Python doesn't necessarily release these
+        # objects in time unless we delete them explicitly, so do that here.
+        del backprop
+        del backprop_tok2vec
+        model.clear_memory()
+        del model
        return losses

    def rehearse(self, examples, sgd=None, losses=None, **cfg):
@ -335,7 +344,7 @@ cdef class Parser:
        set_dropout_rate(self._rehearsal_model, 0.0)
        set_dropout_rate(self.model, 0.0)
        tutor, _ = self._rehearsal_model.begin_update(docs)
-        model, finish_update = self.model.begin_update(docs)
+        model, backprop_tok2vec = self.model.begin_update(docs)
        n_scores = 0.
        loss = 0.
        while states:
@ -351,10 +360,16 @@ cdef class Parser:
            states = [state for state in states if not state.is_final()]
            n_scores += d_scores.size
        # Do the backprop
-        finish_update(docs)
+        backprop_tok2vec(docs)
        if sgd is not None:
            self.model.finish_update(sgd)
        losses[self.name] += loss / n_scores
+        del backprop
+        del backprop_tok2vec
+        model.clear_memory()
+        tutor.clear_memory()
+        del model
+        del tutor
        return losses

    def get_gradients(self):