From c6df0eafd0046179c1c9fb7840074edf04e4721d Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 25 Jan 2021 09:50:48 +1100
Subject: [PATCH] Fix set_annotations in parser.update

---
 .../_parser_internals/_beam_utils.pyx        |  6 ++-
 .../pipeline/_parser_internals/arc_eager.pyx |  6 ++--
 .../_parser_internals/transition_system.pyx  | 10 ++++
 spacy/pipeline/transition_parser.pyx         | 52 +++++++++++++------
 4 files changed, 55 insertions(+), 19 deletions(-)

diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pyx b/spacy/pipeline/_parser_internals/_beam_utils.pyx
index fa7df2056..ef4165505 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pyx
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pyx
@@ -193,7 +193,11 @@
     for i, (d_scores, bp_scores) in enumerate(zip(states_d_scores, backprops)):
         loss += (d_scores**2).mean()
         bp_scores(d_scores)
-    return loss
+    # Return the loss and the predicted (1-best) action sequence for each doc.
+    predicted_histories = []
+    for i in range(len(pbeam)):
+        predicted_histories.append(pbeam[i].histories[0])
+    return predicted_histories, loss
 
 
 def collect_states(beams, docs):
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 069b41170..7c3d6d275 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -638,16 +638,16 @@ cdef class ArcEager(TransitionSystem):
         return gold
 
     def init_gold_batch(self, examples):
-        # TODO: Projectivity?
        all_states = self.init_batch([eg.predicted for eg in examples])
         golds = []
         states = []
+        docs = []
         for state, eg in zip(all_states, examples):
             if self.has_gold(eg) and not state.is_final():
                 golds.append(self.init_gold(state, eg))
                 states.append(state)
-        n_steps = sum([len(s.queue) for s in states])
-        return states, golds, n_steps
+                docs.append(eg.x)
+        return states, golds, docs
 
     def _replace_unseen_labels(self, ArcEagerGold gold):
         backoff_label = self.strings["dep"]
diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx
index 9bb4f7f5f..287513a79 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@@ -120,6 +120,16 @@ cdef class TransitionSystem:
             raise ValueError(Errors.E024)
         return history
 
+    def follow_history(self, doc, history):
+        """Get the state that results from following a sequence of actions."""
+        cdef int clas
+        cdef StateClass state
+        state = self.init_batch([doc])[0]
+        for clas in history:
+            action = self.c[clas]
+            action.do(state.c, action.label)
+        return state
+
     def apply_transition(self, StateClass state, name):
         if not self.is_valid(state, name):
             raise ValueError(Errors.E170.format(name=name))
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 15b07e9b1..b93565178 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -337,21 +337,22 @@ cdef class Parser(TrainablePipe):
             # Chop sequences into lengths of this many words, to make the
             # batch uniform length.
             max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
-            states, golds, _ = self._init_gold_batch(
+            states, golds, max_moves, state2doc = self._init_gold_batch(
                 examples,
                 max_length=max_moves
             )
         else:
-            states, golds, _ = self.moves.init_gold_batch(examples)
+            states, golds, state2doc = self.moves.init_gold_batch(examples)
         if not states:
             return losses
         model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
 
+        histories = [[] for _ in examples]
         all_states = list(states)
-        states_golds = list(zip(states, golds))
+        states_golds = list(zip(states, golds, state2doc))
         n_moves = 0
         while states_golds:
-            states, golds = zip(*states_golds)
+            states, golds, state2doc = zip(*states_golds)
             scores, backprop = model.begin_update(states)
             d_scores = self.get_batch_loss(states, golds, scores, losses)
             # Note that the gradient isn't normalized by the batch size
@@ -360,8 +361,13 @@
             # be getting smaller gradients for states in long sequences.
             backprop(d_scores)
             # Follow the predicted action
-            self.transition_states(states, scores)
-            states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()]
+            actions = self.transition_states(states, scores)
+            for i, action in enumerate(actions):
+                histories[state2doc[i]].append(action)
+            states_golds = [
+                (s, g, i) for (s, g, i) in zip(states, golds, state2doc)
+                if not s.is_final()
+            ]
             if max_moves >= 1 and n_moves >= max_moves:
                 break
             n_moves += 1
@@ -370,11 +376,11 @@
         if sgd not in (None, False):
             self.finish_update(sgd)
         docs = [eg.predicted for eg in examples]
-        # TODO: Refactor so we don't have to parse twice like this (ugh)
-        # The issue is that we cut up the gold batch into sub-states, and that
-        # makes it hard to get the actual predicted transition sequence.
-        predicted_states = self.predict(docs)
-        self.set_annotations(docs, predicted_states)
+        states = [
+            self.moves.follow_history(doc, history)
+            for doc, history in zip(docs, histories)
+        ]
+        self.set_annotations(docs, states)
         # Ugh, this is annoying. If we're working on GPU, we want to free the
         # memory ASAP. It seems that Python doesn't necessarily get around to
         # removing these in time if we don't explicitly delete? It's confusing.
@@ -435,13 +441,16 @@
     def update_beam(self, examples, *, beam_width, drop=0., sgd=None,
                     losses=None, beam_density=0.0):
-        states, golds, _ = self.moves.init_gold_batch(examples)
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)
+        states, golds, docs = self.moves.init_gold_batch(examples)
         if not states:
             return losses
         # Prepare the stepwise model, and get the callback for finishing the batch
         model, backprop_tok2vec = self.model.begin_update(
             [eg.predicted for eg in examples])
-        loss = _beam_utils.update_beam(
+        predicted_histories, loss = _beam_utils.update_beam(
             self.moves,
             states,
             golds,
@@ -453,6 +462,12 @@
         backprop_tok2vec(golds)
         if sgd is not None:
             self.finish_update(sgd)
+        states = [
+            self.moves.follow_history(doc, history)
+            for doc, history in zip(docs, predicted_histories)
+        ]
+        self.set_annotations(docs, states)
+        return losses
 
     def get_batch_loss(self, states, golds, float[:, ::1] scores, losses):
         cdef StateClass state
@@ -595,28 +610,35 @@
         states = []
         golds = []
         to_cut = []
+        # For each state we keep, record the index of the doc it was cut
+        # from, so that the predicted histories can be mapped back to their
+        # docs even after the batch has been cut up and filtered.
+ state2doc = [] + doc2i = {eg.x: i for i, eg in enumerate(examples)} for state, eg in zip(all_states, examples): if self.moves.has_gold(eg) and not state.is_final(): gold = self.moves.init_gold(state, eg) if len(eg.x) < max_length: states.append(state) golds.append(gold) + state2doc.append(doc2i[eg.x]) else: oracle_actions = self.moves.get_oracle_sequence_from_state( state.copy(), gold) to_cut.append((eg, state, gold, oracle_actions)) if not to_cut: - return states, golds, 0 + return states, golds, 0, state2doc cdef int clas for eg, state, gold, oracle_actions in to_cut: for i in range(0, len(oracle_actions), max_length): @@ -619,6 +640,7 @@ cdef class Parser(TrainablePipe): if self.moves.has_gold(eg, start_state.B(0), state.B(0)): states.append(start_state) golds.append(gold) + state2doc.append(doc2i[eg.x]) if state.is_final(): break - return states, golds, max_length + return states, golds, max_length, state2doc
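
The key mechanism in this patch is history replay: during the update we record
the id of each action a state actually takes, and the new
TransitionSystem.follow_history rebuilds the final state by replaying that
recorded sequence from a fresh initial state, so we no longer have to parse the
docs a second time just to call set_annotations. Below is a minimal pure-Python
sketch of the replay idea with a toy shift/reduce system; the dict-based state
and the action names are illustrative assumptions, not the Cython API.

    def follow_history(doc, history, actions):
        """Replay a recorded sequence of action ids over a fresh state."""
        state = {"stack": [], "buffer": list(doc)}
        for clas in history:
            actions[clas](state)
        return state

    def shift(state):
        # Move the next buffered token onto the stack.
        state["stack"].append(state["buffer"].pop(0))

    def reduce_(state):
        # Pop the finished token off the stack.
        state["stack"].pop()

    ACTIONS = [shift, reduce_]

    # Two shifts and a reduce leave one token on the stack.
    state = follow_history(["a", "b", "c"], [0, 0, 1], ACTIONS)
    assert state == {"stack": ["a"], "buffer": ["c"]}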
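The state2doc bookkeeping is what keeps those recorded histories aligned:
finished states are filtered out of states_golds on every step, and
_init_gold_batch can cut one doc into several states, so a state's position in
the batch stops matching its doc's position in examples. That is why the update
loop appends via histories[state2doc[i]] rather than histories[i] (this also
assumes transition_states returns the action taken for each state, which the
hunk above relies on). A toy illustration of the alignment problem, with
made-up data:

    docs = ["doc0", "doc1", "doc2"]
    histories = [[] for _ in docs]

    # Live (state, doc_index) pairs; pretend doc1's state reaches a final
    # configuration after the first step and drops out of the batch.
    live = [("s0", 0), ("s1", 1), ("s2", 2)]

    for step in range(2):
        for state, doc_i in live:
            # Index through the stored doc index: once the batch has been
            # filtered, a state's batch position no longer matches its doc.
            histories[doc_i].append(f"act{step}")
        live = [(s, d) for (s, d) in live if not (step == 0 and d == 1)]

    assert histories == [["act0", "act1"], ["act0"], ["act0", "act1"]]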
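On the beam path, update_beam now returns pbeam[i].histories[0] for each doc.
This relies on the beam keeping its candidate histories sorted best-first, so
index 0 is the 1-best action sequence. The sketch below mimics that contract
with a stand-in Beam class; it is illustrative only, not the real beam object.

    class Beam:
        """Stand-in beam: keeps (score, history) candidates sorted best-first."""
        def __init__(self, candidates):
            self.candidates = sorted(candidates, key=lambda c: c[0], reverse=True)

        @property
        def histories(self):
            return [history for _, history in self.candidates]

    beams = [
        Beam([(0.2, [0, 1]), (0.9, [0, 0, 1])]),
        Beam([(0.7, [1]), (0.1, [0])]),
    ]
    # One predicted (1-best) action sequence per doc, as in update_beam above.
    predicted_histories = [beam.histories[0] for beam in beams]
    assert predicted_histories == [[0, 0, 1], [1]]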