From 4b5d1b53f65980e090e283fdd9db9b38ee8bd0fd Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 25 Oct 2021 22:21:17 +0200
Subject: [PATCH] Support unseen_classes in parser model

---
 spacy/ml/tb_framework.py | 38 +++++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
index 714a4e43e..9cb93c9a2 100644
--- a/spacy/ml/tb_framework.py
+++ b/spacy/ml/tb_framework.py
@@ -122,38 +122,46 @@ def forward(model, docs_moves, is_train):
     states = moves.init_batch(docs)
     tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
     feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train)
-    memory = []
+    all_ids = []
+    all_which = []
+    all_statevecs = []
     all_scores = []
     next_states = list(states)
+    unseen_mask = _get_unseen_mask(model)
     while next_states:
         ids = moves.get_state_ids(states)
+        # Sum the state features, add the bias and apply the activation (maxout)
+        # to create the state vectors.
         preacts = _sum_state_features(feats, lower_pad, ids)
-        # * Add the bias
         preacts += lower_b
-        # * Apply the activation (maxout)
         statevecs, which = ops.maxout(preacts)
-        # * Multiply the state-vector by the scores weights
+        # Multiply the state-vector by the scores weights and add the bias,
+        # to get the logits.
         scores = ops.gemm(statevecs, upper_W, trans2=True)
-        # * Add the bias
         scores += upper_b
+        scores[:, unseen_mask == 0] = model.ops.xp.nanmin(scores)
+        # Transition the states, filtering out any that are finished.
         next_states = moves.transition_states(states, scores)
         all_scores.append(scores)
         if is_train:
-            memory.append((ids, statevecs, which))
+            # Remember intermediate results for the backprop.
+            all_ids.append(ids)
+            all_statevecs.append(statevecs)
+            all_which.append(which)
 
     def backprop_parser(d_states_d_scores):
         _, d_scores = d_states_d_scores
-        ids, statevecs, whiches = [ops.xp.concatenate(*item) for item in zip(*memory)]
-        # TODO: Unseen class masking
+        d_scores *= unseen_mask
+        ids = ops.xp.concatenate(all_ids)
+        statevecs = ops.xp.concatenate(all_statevecs)
+        which = ops.xp.concatenate(all_which)
         # Calculate the gradients for the parameters of the upper layer.
         model.inc_grad("upper_b", d_scores.sum(axis=0))
         model.inc_grad("upper_W", model.ops.gemm(d_scores, statevecs, trans1=True))
         # Now calculate d_statevecs, by backproping through the upper linear layer.
         d_statevecs = model.ops.gemm(d_scores, upper_W)
         # Backprop through the maxout activation
-        d_preacts = model.ops.backprop_maxount(
-            d_statevecs, whiches, model.get_dim("nP")
-        )
+        d_preacts = model.ops.backprop_maxout(d_statevecs, which, model.get_dim("nP"))
         # We don't need to backprop the summation, because we pass back the IDs instead
         d_tokvecs = backprop_feats((d_preacts, ids))
         return (backprop_tok2vec(d_tokvecs), None)
@@ -161,6 +169,14 @@ def forward(model, docs_moves, is_train):
     return (states, all_scores), backprop_parser
 
 
+def _get_unseen_mask(model: Model) -> Floats1d:
+    mask = model.ops.alloc1f(model.get_dim("nO"))
+    mask.fill(1)
+    for class_ in model.attrs.get("unseen_classes", set()):
+        mask[class_] = 0
+    return mask
+
+
 def _sum_state_features(ops: Ops, feats: Floats3d, ids: Ints2d, _arange=[]) -> Floats2d:
     # Here's what we're trying to implement here:
     #
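
A minimal sketch (not from the patch) of how the unseen-class masking behaves,
assuming plain numpy and hypothetical sizes (4 classes, class 2 unseen). In the
forward pass the scores of unseen classes are clamped to the batch minimum, so a
class the model has never been updated on is never preferred; in the backward
pass the gradient for those classes is zeroed, so the upper layer never receives
updates for them.

    import numpy

    def get_unseen_mask(n_classes, unseen_classes):
        # 1 for classes the model has been updated on, 0 for unseen ones.
        mask = numpy.ones((n_classes,), dtype="f")
        for class_ in unseen_classes:
            mask[class_] = 0
        return mask

    mask = get_unseen_mask(4, unseen_classes={2})

    # Forward: clamp unseen classes to the minimum score in the batch.
    scores = numpy.asarray([[1.0, 3.0, 5.0, 2.0]], dtype="f")
    scores[:, mask == 0] = numpy.nanmin(scores)
    print(scores)  # [[1. 3. 1. 2.]]

    # Backward: zero the gradient for unseen classes.
    d_scores = numpy.asarray([[0.5, -0.2, 0.7, 0.1]], dtype="f")
    d_scores *= mask
    print(d_scores)  # [[ 0.5 -0.2  0.   0.1]]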
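
The per-step bookkeeping the patch introduces (all_ids, all_statevecs, all_which)
exists so the maxout activation can be backpropped in one shot after concatenating
across steps. Below is a standalone sketch of the thinc maxout forward/backward
pair the code relies on, with toy dimensions; nothing here is taken from the
patch itself.

    from thinc.api import NumpyOps

    ops = NumpyOps()
    nO, nP = 3, 4  # toy output width and number of maxout pieces

    # Forward: maxout keeps the best piece per output unit and records which
    # piece won, so the backward pass can route gradients to the winners.
    preacts = ops.xp.random.uniform(-1, 1, (2, nO, nP)).astype("f")
    statevecs, which = ops.maxout(preacts)

    # Backward: only the winning pieces receive gradient; the rest stay zero.
    d_statevecs = ops.xp.ones_like(statevecs)
    d_preacts = ops.backprop_maxout(d_statevecs, which, nP)
    assert d_preacts.shape == preacts.shape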
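
Similarly, the upper-layer gradients in backprop_parser follow the standard
linear-layer rules. A plain-numpy sketch of what the gemm and sum calls compute,
again with hypothetical sizes:

    import numpy

    batch, width, n_classes = 5, 6, 4  # hypothetical sizes
    statevecs = numpy.random.uniform(-1, 1, (batch, width)).astype("f")
    upper_W = numpy.random.uniform(-1, 1, (n_classes, width)).astype("f")
    upper_b = numpy.zeros((n_classes,), dtype="f")

    # Forward: scores = statevecs @ upper_W.T + upper_b,
    # i.e. ops.gemm(statevecs, upper_W, trans2=True) plus the bias.
    scores = statevecs @ upper_W.T + upper_b

    d_scores = numpy.ones_like(scores)
    # inc_grad("upper_b", ...): sum the gradient over the batch.
    d_upper_b = d_scores.sum(axis=0)
    # gemm(d_scores, statevecs, trans1=True): gradient of the weights.
    d_upper_W = d_scores.T @ statevecs
    # gemm(d_scores, upper_W): gradient flowing back into the state vectors.
    d_statevecs = d_scores @ upper_W
    assert d_upper_W.shape == upper_W.shape
    assert d_statevecs.shape == statevecs.shape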