From b68216e2631c18cee1baa43c85eaad5c4c925d54 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 10 Jul 2020 22:35:20 +0200
Subject: [PATCH] Explicitly delete objects after parser.update to free GPU
 memory (#5748)

* Try explicitly deleting objects

* Refactor parser model backprop slightly

* Free parser data explicitly after rehearse and update
---
 spacy/syntax/_parser_model.pyx | 31 +++++++++++++++++++++----------
 spacy/syntax/nn_parser.pyx     | 19 +++++++++++++++++--
 2 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx
index 42baa737b..7acee5efd 100644
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@@ -245,6 +245,13 @@ class ParserStepModel(Model):
         for class_ in unseen_classes:
             self._class_mask[class_] = 0.
 
+    def clear_memory(self):
+        del self.tokvecs
+        del self.bp_tokvecs
+        del self.state2vec
+        del self.backprops
+        del self._class_mask
+
     @property
     def nO(self):
         if self.attrs["has_upper"]:
@@ -273,6 +280,19 @@ class ParserStepModel(Model):
                 c_ids += ids.shape[1]
         return ids
 
+    def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
+        if isinstance(self.state2vec.ops, CupyOps) \
+        and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
+            # Move token_ids and d_vector to GPU, asynchronously
+            self.backprops.append((
+                util.get_async(self.cuda_stream, token_ids),
+                util.get_async(self.cuda_stream, d_vector),
+                get_d_tokvecs
+            ))
+        else:
+            self.backprops.append((token_ids, d_vector, get_d_tokvecs))
+
+
     def finish_steps(self, golds):
         # Add a padding vector to the d_tokvecs gradient, so that missing
         # values don't affect the real gradient.
@@ -315,16 +335,7 @@ def step_forward(model: ParserStepModel, states, is_train):
         d_vector = get_d_vector(d_scores)
         if mask is not None:
             d_vector *= mask
-        if isinstance(model.state2vec.ops, CupyOps) \
-        and not isinstance(token_ids, model.state2vec.ops.xp.ndarray):
-            # Move token_ids and d_vector to GPU, asynchronously
-            model.backprops.append((
-                util.get_async(model.cuda_stream, token_ids),
-                util.get_async(model.cuda_stream, d_vector),
-                get_d_tokvecs
-            ))
-        else:
-            model.backprops.append((token_ids, d_vector, get_d_tokvecs))
+        model.backprop_step(token_ids, d_vector, get_d_tokvecs)
         return None
     return scores, backprop_parser_step
 
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 043d8d681..591afe5ab 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -200,6 +200,8 @@ cdef class Parser:
             with nogil:
                 self._parseC(&states[0],
                     weights, sizes)
+        model.clear_memory()
+        del model
         return batch
 
     cdef void _parseC(self, StateC** states,
@@ -312,6 +314,13 @@ cdef class Parser:
         if set_annotations:
            docs = [eg.predicted for eg in examples]
            self.set_annotations(docs, all_states)
+        # Ugh, this is annoying. If we're working on GPU, we want to free the
+        # memory ASAP. It seems that Python doesn't necessarily get around to
+        # removing these in time if we don't explicitly delete? It's confusing.
+        del backprop
+        del backprop_tok2vec
+        model.clear_memory()
+        del model
         return losses
 
     def rehearse(self, examples, sgd=None, losses=None, **cfg):
@@ -335,7 +344,7 @@ cdef class Parser:
         set_dropout_rate(self._rehearsal_model, 0.0)
         set_dropout_rate(self.model, 0.0)
         tutor, _ = self._rehearsal_model.begin_update(docs)
-        model, finish_update = self.model.begin_update(docs)
+        model, backprop_tok2vec = self.model.begin_update(docs)
         n_scores = 0.
         loss = 0.
         while states:
@@ -351,10 +360,16 @@ cdef class Parser:
             states = [state for state in states if not state.is_final()]
             n_scores += d_scores.size
         # Do the backprop
-        finish_update(docs)
+        backprop_tok2vec(docs)
         if sgd is not None:
             self.model.finish_update(sgd)
         losses[self.name] += loss / n_scores
+        del backprop
+        del backprop_tok2vec
+        model.clear_memory()
+        tutor.clear_memory()
+        del model
+        del tutor
         return losses
 
     def get_gradients(self):
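
Note (not part of the commit): a minimal sketch of why the explicit `del`
calls above matter when training on GPU. CuPy arrays return device memory
to CuPy's memory pool only once the Python objects holding them are
deallocated, so dropping references to the step model and backprop closures
right after each update keeps peak GPU usage down. The training-step
wrapper below is hypothetical; `parser.update` and
`cupy.get_default_memory_pool` are the real APIs, and the pool flush is an
optional extra, not something this patch does.

    import cupy

    def training_step(parser, examples, optimizer):
        """Run one parser update, assuming spaCy is in GPU mode."""
        losses = {}
        # parser.update now clears its step model and deletes the backprop
        # closures internally (see nn_parser.pyx above), so the intermediate
        # activations are unreferenced by the time it returns.
        parser.update(examples, sgd=optimizer, losses=losses)
        # Optionally hand cached, now-unused blocks back to the device
        # between steps, e.g. when sharing the GPU with other processes.
        cupy.get_default_memory_pool().free_all_blocks()
        return losses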