diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index 5c4e34e36..329b0a2f2 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -317,106 +317,6 @@ def _forward_fallback(model: Model, moves: TransitionSystem, states: List[StateC
     return (list(batch), all_scores), backprop_parser
 
 
-def _forward_reference(
-    model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: bool
-):
-    """Slow reference implementation, without the precomputation"""
-    nF = model.get_dim("nF")
-    tok2vec = model.get_ref("tok2vec")
-    output = model.get_ref("output")
-    hidden_pad = model.get_param("hidden_pad")
-    hidden_W = model.get_param("hidden_W")
-    hidden_b = model.get_param("hidden_b")
-    nH = model.get_dim("nH")
-    nP = model.get_dim("nP")
-    nO = model.get_dim("nO")
-    nI = model.get_dim("nI")
-
-    ops = model.ops
-    docs, moves = docs_moves
-    states = moves.init_batch(docs)
-    tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
-    tokvecs = model.ops.xp.vstack((tokvecs, hidden_pad))
-    all_ids = []
-    all_which = []
-    all_statevecs = []
-    all_scores = []
-    all_tokfeats = []
-    next_states = [s for s in states if not s.is_final()]
-    seen_mask = _get_seen_mask(model)
-    ids = numpy.zeros((len(states), nF), dtype="i")
-    while next_states:
-        ids = ids[: len(next_states)]
-        for i, state in enumerate(next_states):
-            state.set_context_tokens(ids, i, nF)
-        # Sum the state features, add the bias and apply the activation (maxout)
-        # to create the state vectors.
-        tokfeats3f = tokvecs[ids]
-        tokfeats = model.ops.reshape2f(tokfeats3f, tokfeats3f.shape[0], -1)
-        preacts2f = model.ops.gemm(tokfeats, hidden_W, trans2=True)
-        preacts2f += hidden_b
-        preacts = model.ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
-        statevecs, which = ops.maxout(preacts)
-        # We don't use output's backprop, since we want to backprop for
-        # all states at once, rather than a single state.
-        scores = output.predict(statevecs)
-        scores[:, seen_mask] = model.ops.xp.nanmin(scores)
-        # Transition the states, filtering out any that are finished.
-        next_states = moves.transition_states(next_states, scores)
-        all_scores.append(scores)
-        if is_train:
-            # Remember intermediate results for the backprop.
-            all_tokfeats.append(tokfeats)
-            all_ids.append(ids.copy())
-            all_statevecs.append(statevecs)
-            all_which.append(which)
-
-    nS = sum(len(s.history) for s in states)
-
-    def backprop_parser(d_states_d_scores):
-        d_tokvecs = model.ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
-        ids = model.ops.xp.vstack(all_ids)
-        which = ops.xp.vstack(all_which)
-        statevecs = model.ops.xp.vstack(all_statevecs)
-        tokfeats = model.ops.xp.vstack(all_tokfeats)
-        _, d_scores = d_states_d_scores
-        if model.attrs.get("unseen_classes"):
-            # If we have a negative gradient (i.e. the probability should
-            # increase) on any classes we filtered out as unseen, mark
-            # them as seen.
-            for clas in set(model.attrs["unseen_classes"]):
-                if (d_scores[:, clas] < 0).any():
-                    model.attrs["unseen_classes"].remove(clas)
-        d_scores *= seen_mask == False
-        assert statevecs.shape == (nS, nH), statevecs.shape
-        assert d_scores.shape == (nS, nO), d_scores.shape
-        # Calculate the gradients for the parameters of the output layer.
-        # The weight gemm is (nS, nO).T @ (nS, nH)
-        output.inc_grad("b", d_scores.sum(axis=0))
-        output.inc_grad("W", model.ops.gemm(d_scores, statevecs, trans1=True))
-        # Now calculate d_statevecs, by backpropagating through the output linear layer.
-        # This gemm is (nS, nO) @ (nO, nH)
-        output_W = output.get_param("W")
-        d_statevecs = model.ops.gemm(d_scores, output_W)
-        # Backprop through the maxout activation
-        d_preacts = model.ops.backprop_maxout(d_statevecs, which, nP)
-        d_preacts2f = model.ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
-        # Now increment the gradients for the hidden layer.
-        # The gemm here is (nS, nH*nP).T @ (nS, nF*nI)
-        model.inc_grad("hidden_b", d_preacts2f.sum(axis=0))
-        model.inc_grad("hidden_W", model.ops.gemm(d_preacts2f, tokfeats, trans1=True))
-        # Calculate d_tokfeats
-        # The gemm here is (nS, nH*nP) @ (nH*nP, nF*nI)
-        d_tokfeats = model.ops.gemm(d_preacts2f, hidden_W)
-        # Get the gradients of the tokvecs and the padding
-        d_tokfeats3f = model.ops.reshape3f(d_tokfeats, nS, nF, nI)
-        model.ops.scatter_add(d_tokvecs, ids, d_tokfeats3f)
-        model.inc_grad("hidden_pad", d_tokvecs[-1])
-        return (backprop_tok2vec(d_tokvecs[:-1]), None)
-
-    return (states, all_scores), backprop_parser
-
-
 def _get_seen_mask(model: Model) -> numpy.array[bool, 1]:
     mask = model.ops.xp.zeros(model.get_dim("nO"), dtype="bool")
     for class_ in model.attrs.get("unseen_classes", set()):
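
For readers tracing the linear algebra in the removed reference implementation, the following is a minimal numpy sketch of the per-batch hidden-layer computation and its backward pass. It is illustrative only: it mirrors the gemm shapes documented in the deleted code's comments, but uses made-up dimension values and plain numpy in place of spaCy's thinc model.ops calls (gemm, maxout, backprop_maxout); none of these arrays or names come from the spaCy API.

import numpy as np

# Illustrative sizes only: nS states, nF context features of width nI,
# nH hidden units with nP maxout pieces, nO output classes.
nS, nF, nI, nH, nP, nO = 4, 6, 8, 5, 3, 7
rng = np.random.default_rng(0)

tokfeats = rng.normal(size=(nS, nF * nI))   # gathered token features
hidden_W = rng.normal(size=(nH * nP, nF * nI))
hidden_b = rng.normal(size=(nH * nP,))
output_W = rng.normal(size=(nO, nH))

# Forward: hidden preactivations, then maxout over the nP pieces.
preacts = (tokfeats @ hidden_W.T + hidden_b).reshape(nS, nH, nP)
which = preacts.argmax(axis=-1)             # winning piece per hidden unit
statevecs = preacts.max(axis=-1)            # (nS, nH)
scores = statevecs @ output_W.T             # (nS, nO)

# Backward: the same gemms as backprop_parser, written out explicitly.
d_scores = rng.normal(size=(nS, nO))
d_output_W = d_scores.T @ statevecs         # (nS, nO).T @ (nS, nH)
d_statevecs = d_scores @ output_W           # (nS, nO) @ (nO, nH)
# backprop_maxout: gradient flows only to the winning pieces.
d_preacts = np.zeros_like(preacts)
np.put_along_axis(d_preacts, which[..., None], d_statevecs[..., None], axis=-1)
d_preacts2f = d_preacts.reshape(nS, nH * nP)
d_hidden_W = d_preacts2f.T @ tokfeats       # (nS, nH*nP).T @ (nS, nF*nI)
d_tokfeats = d_preacts2f @ hidden_W         # (nS, nH*nP) @ (nH*nP, nF*nI)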