diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index 5c4e34e36..329b0a2f2 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -317,106 +317,6 @@ def _forward_fallback(model: Model, moves: TransitionSystem, states: List[StateC
     return (list(batch), all_scores), backprop_parser
 
 
-def _forward_reference(
-    model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: bool
-):
-    """Slow reference implementation, without the precomputation"""
-    nF = model.get_dim("nF")
-    tok2vec = model.get_ref("tok2vec")
-    output = model.get_ref("output")
-    hidden_pad = model.get_param("hidden_pad")
-    hidden_W = model.get_param("hidden_W")
-    hidden_b = model.get_param("hidden_b")
-    nH = model.get_dim("nH")
-    nP = model.get_dim("nP")
-    nO = model.get_dim("nO")
-    nI = model.get_dim("nI")
-
-    ops = model.ops
-    docs, moves = docs_moves
-    states = moves.init_batch(docs)
-    tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
-    tokvecs = model.ops.xp.vstack((tokvecs, hidden_pad))
-    all_ids = []
-    all_which = []
-    all_statevecs = []
-    all_scores = []
-    all_tokfeats = []
-    next_states = [s for s in states if not s.is_final()]
-    seen_mask = _get_seen_mask(model)
-    ids = numpy.zeros((len(states), nF), dtype="i")
-    while next_states:
-        ids = ids[: len(next_states)]
-        for i, state in enumerate(next_states):
-            state.set_context_tokens(ids, i, nF)
-        # Sum the state features, add the bias and apply the activation (maxout)
-        # to create the state vectors.
-        tokfeats3f = tokvecs[ids]
-        tokfeats = model.ops.reshape2f(tokfeats3f, tokfeats3f.shape[0], -1)
-        preacts2f = model.ops.gemm(tokfeats, hidden_W, trans2=True)
-        preacts2f += hidden_b
-        preacts = model.ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
-        statevecs, which = ops.maxout(preacts)
-        # We don't use output's backprop, since we want to backprop for
-        # all states at once, rather than a single state.
-        scores = output.predict(statevecs)
-        scores[:, seen_mask] = model.ops.xp.nanmin(scores)
-        # Transition the states, filtering out any that are finished.
-        next_states = moves.transition_states(next_states, scores)
-        all_scores.append(scores)
-        if is_train:
-            # Remember intermediate results for the backprop.
-            all_tokfeats.append(tokfeats)
-            all_ids.append(ids.copy())
-            all_statevecs.append(statevecs)
-            all_which.append(which)
-
-    nS = sum(len(s.history) for s in states)
-
-    def backprop_parser(d_states_d_scores):
-        d_tokvecs = model.ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
-        ids = model.ops.xp.vstack(all_ids)
-        which = ops.xp.vstack(all_which)
-        statevecs = model.ops.xp.vstack(all_statevecs)
-        tokfeats = model.ops.xp.vstack(all_tokfeats)
-        _, d_scores = d_states_d_scores
-        if model.attrs.get("unseen_classes"):
-            # If we have a negative gradient (i.e. the probability should
-            # increase) on any classes we filtered out as unseen, mark
-            # them as seen.
-            for clas in set(model.attrs["unseen_classes"]):
-                if (d_scores[:, clas] < 0).any():
-                    model.attrs["unseen_classes"].remove(clas)
-        d_scores *= seen_mask == False
-        assert statevecs.shape == (nS, nH), statevecs.shape
-        assert d_scores.shape == (nS, nO), d_scores.shape
-        # Calculate the gradients for the parameters of the output layer.
-        # The weight gemm is (nS, nO).T @ (nS, nH)
-        output.inc_grad("b", d_scores.sum(axis=0))
-        output.inc_grad("W", model.ops.gemm(d_scores, statevecs, trans1=True))
-        # Now calculate d_statevecs, by backpropagating through the output linear layer.
-        # This gemm is (nS, nO) @ (nO, nH)
-        output_W = output.get_param("W")
-        d_statevecs = model.ops.gemm(d_scores, output_W)
-        # Backprop through the maxout activation
-        d_preacts = model.ops.backprop_maxout(d_statevecs, which, nP)
-        d_preacts2f = model.ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
-        # Now increment the gradients for the hidden layer.
-        # The gemm here is (nS, nH*nP).T @ (nS, nF*nI)
-        model.inc_grad("hidden_b", d_preacts2f.sum(axis=0))
-        model.inc_grad("hidden_W", model.ops.gemm(d_preacts2f, tokfeats, trans1=True))
-        # Calculate d_tokfeats
-        # The gemm here is (nS, nH*nP) @ (nH*nP, nF*nI)
-        d_tokfeats = model.ops.gemm(d_preacts2f, hidden_W)
-        # Get the gradients of the tokvecs and the padding
-        d_tokfeats3f = model.ops.reshape3f(d_tokfeats, nS, nF, nI)
-        model.ops.scatter_add(d_tokvecs, ids, d_tokfeats3f)
-        model.inc_grad("hidden_pad", d_tokvecs[-1])
-        return (backprop_tok2vec(d_tokvecs[:-1]), None)
-
-    return (states, all_scores), backprop_parser
-
-
 def _get_seen_mask(model: Model) -> numpy.array[bool, 1]:
     mask = model.ops.xp.zeros(model.get_dim("nO"), dtype="bool")
     for class_ in model.attrs.get("unseen_classes", set()):
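
For readers tracing the linear algebra in the removed reference implementation, the following is a minimal numpy sketch of the per-batch hidden-layer computation and its backward pass. It is illustrative only: it mirrors the gemm shapes documented in the deleted code's comments, but uses made-up dimension values and plain numpy in place of spaCy's thinc model.ops calls (gemm, maxout, backprop_maxout); none of these arrays or names come from the spaCy API.

import numpy as np

# Illustrative sizes only: nS states, nF context features of width nI,
# nH hidden units with nP maxout pieces, nO output classes.
nS, nF, nI, nH, nP, nO = 4, 6, 8, 5, 3, 7
rng = np.random.default_rng(0)

tokfeats = rng.normal(size=(nS, nF * nI))   # gathered token features
hidden_W = rng.normal(size=(nH * nP, nF * nI))
hidden_b = rng.normal(size=(nH * nP,))
output_W = rng.normal(size=(nO, nH))

# Forward: hidden preactivations, then maxout over the nP pieces.
preacts = (tokfeats @ hidden_W.T + hidden_b).reshape(nS, nH, nP)
which = preacts.argmax(axis=-1)             # winning piece per hidden unit
statevecs = preacts.max(axis=-1)            # (nS, nH)
scores = statevecs @ output_W.T             # (nS, nO)

# Backward: the same gemms as backprop_parser, written out explicitly.
d_scores = rng.normal(size=(nS, nO))
d_output_W = d_scores.T @ statevecs         # (nS, nO).T @ (nS, nH)
d_statevecs = d_scores @ output_W           # (nS, nO) @ (nO, nH)
# backprop_maxout: gradient flows only to the winning pieces.
d_preacts = np.zeros_like(preacts)
np.put_along_axis(d_preacts, which[..., None], d_statevecs[..., None], axis=-1)
d_preacts2f = d_preacts.reshape(nS, nH * nP)
d_hidden_W = d_preacts2f.T @ tokfeats       # (nS, nH*nP).T @ (nS, nF*nI)
d_tokfeats = d_preacts2f @ hidden_W         # (nS, nH*nP) @ (nH*nP, nF*nI)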