From bc36c71982681823b18e141220cec73e81265fe4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?=
Date: Thu, 9 Jun 2022 13:45:30 +0200
Subject: [PATCH] Rename some identifiers in the parser refactor (#10935)

* Rename _parseC to _parse_batch

* tb_framework: prefix many auxiliary functions with underscore

  To clearly state the intent that they are private.

* Rename `lower` to `hidden`, `upper` to `output`
---
 spacy/ml/tb_framework.pyx                      | 210 +++++++++---------
 .../tests/serialize/test_serialize_config.py   |  24 +-
 2 files changed, 117 insertions(+), 117 deletions(-)

diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index 62d876e33..e671cbebd 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -45,28 +45,28 @@ def TransitionModel(
     tok2vec_projected = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))  # type: ignore
     tok2vec_projected.set_dim("nO", hidden_width)
 
-    # FIXME: we use `upper` as a container for the upper layer's
+    # FIXME: we use `output` as a container for the output layer's
     # weights and biases. Thinc optimizers cannot handle resizing
     # of parameters. So, when the parser model is resized, we
-    # construct a new `upper` layer, which has a different key in
+    # construct a new `output` layer, which has a different key in
     # the optimizer. Once the optimizer supports parameter resizing,
-    # we can replace the `upper` layer by `upper_W` and `upper_b`
+    # we can replace the `output` layer by `output_W` and `output_b`
     # parameters in this model.
-    upper = Linear(nO=None, nI=hidden_width, init_W=zero_init)
+    output = Linear(nO=None, nI=hidden_width, init_W=zero_init)
 
     return Model(
         name="parser_model",
         forward=forward,
         init=init,
-        layers=[tok2vec_projected, upper],
+        layers=[tok2vec_projected, output],
         refs={
             "tok2vec": tok2vec_projected,
-            "upper": upper,
+            "output": output,
         },
         params={
-            "lower_W": None,  # Floats2d W for the hidden layer
-            "lower_b": None,  # Floats1d bias for the hidden layer
-            "lower_pad": None,  # Floats1d padding for the hidden layer
+            "hidden_W": None,  # Floats2d W for the hidden layer
+            "hidden_b": None,  # Floats1d bias for the hidden layer
+            "hidden_pad": None,  # Floats1d padding for the hidden layer
         },
         dims={
             "nO": None,  # Output size
@@ -86,28 +86,28 @@ def TransitionModel(
 
 def resize_output(model: Model, new_nO: int) -> Model:
     old_nO = model.maybe_get_dim("nO")
-    upper = model.get_ref("upper")
+    output = model.get_ref("output")
     if old_nO is None:
         model.set_dim("nO", new_nO)
-        upper.set_dim("nO", new_nO)
-        upper.initialize()
+        output.set_dim("nO", new_nO)
+        output.initialize()
         return model
     elif new_nO <= old_nO:
         return model
-    elif upper.has_param("W"):
+    elif output.has_param("W"):
         nH = model.get_dim("nH")
-        new_upper = Linear(nO=new_nO, nI=nH, init_W=zero_init)
-        new_upper.initialize()
-        new_W = new_upper.get_param("W")
-        new_b = new_upper.get_param("b")
-        old_W = upper.get_param("W")
-        old_b = upper.get_param("b")
+        new_output = Linear(nO=new_nO, nI=nH, init_W=zero_init)
+        new_output.initialize()
+        new_W = new_output.get_param("W")
+        new_b = new_output.get_param("b")
+        old_W = output.get_param("W")
+        old_b = output.get_param("b")
         new_W[:old_nO] = old_W  # type: ignore
         new_b[:old_nO] = old_b  # type: ignore
         for i in range(old_nO, new_nO):
             model.attrs["unseen_classes"].add(i)
-        model.layers[-1] = new_upper
-        model.set_ref("upper", new_upper)
+        model.layers[-1] = new_output
+        model.set_ref("output", new_output)
     # TODO: Avoid this private intrusion
     model._dims["nO"] = new_nO
     return model
@@ -141,10 +141,10 @@ def init(
     # Wl = zero_init(ops, Wl.shape)
     Wl = glorot_uniform_init(ops, Wl.shape)
     padl = uniform_init(ops, padl.shape)  # type: ignore
-    # TODO: Experiment with whether better to initialize upper_W
-    model.set_param("lower_W", Wl)
-    model.set_param("lower_b", bl)
-    model.set_param("lower_pad", padl)
+    # TODO: Experiment with whether better to initialize output_W
+    model.set_param("hidden_W", Wl)
+    model.set_param("hidden_b", bl)
+    model.set_param("hidden_pad", padl)
     # model = _lsuv_init(model)
     return model
 
@@ -160,12 +160,12 @@ def forward(model, docs_moves: InT, is_train: bool):
     docs, moves, actions = docs_moves
     beam_width = model.attrs["beam_width"]
-    lower_pad = model.get_param("lower_pad")
+    hidden_pad = model.get_param("hidden_pad")
     tok2vec = model.get_ref("tok2vec")
 
     states = moves.init_batch(docs)
     tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
-    tokvecs = model.ops.xp.vstack((tokvecs, lower_pad))
+    tokvecs = model.ops.xp.vstack((tokvecs, hidden_pad))
 
     feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train)
     seen_mask = _get_seen_mask(model)
@@ -183,23 +183,23 @@ def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[State
     for state in states:
         if not state.is_final():
             c_states.push_back(state.c)
-    weights = get_c_weights(model, feats.data, seen_mask)
+    weights = _get_c_weights(model, feats.data, seen_mask)
     # Precomputed features have rows for each token, plus one for padding.
     cdef int n_tokens = feats.shape[0] - 1
-    sizes = get_c_sizes(model, c_states.size(), n_tokens)
+    sizes = _get_c_sizes(model, c_states.size(), n_tokens)
     cdef CBlas cblas = model.ops.cblas()
-    scores = _parseC(cblas, moves, &c_states[0], weights, sizes, actions=actions)
+    scores = _parse_batch(cblas, moves, &c_states[0], weights, sizes, actions=actions)
 
     def backprop(dY):
         raise ValueError(Errors.E1038)
 
     return (states, scores), backprop
 
-cdef list _parseC(CBlas cblas, TransitionSystem moves, StateC** states,
-        WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
+cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
+        WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
     cdef int i, j
     cdef vector[StateC *] unfinished
-    cdef ActivationsC activations = alloc_activations(sizes)
+    cdef ActivationsC activations = _alloc_activations(sizes)
     cdef np.ndarray step_scores
     cdef np.ndarray step_actions
@@ -208,7 +208,7 @@ cdef list _parseC(CBlas cblas, TransitionSystem moves, StateC** states,
         step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
         step_actions = actions[0] if actions is not None else None
         with nogil:
-            predict_states(cblas, &activations, step_scores.data, states, &weights, sizes)
+            _predict_states(cblas, &activations, step_scores.data, states, &weights, sizes)
             if actions is None:
                 # Validate actions, argmax, take action.
                 c_transition_batch(moves, states, step_scores.data, sizes.classes,
@@ -224,7 +224,7 @@ cdef list _parseC(CBlas cblas, TransitionSystem moves, StateC** states,
         scores.append(step_scores)
         unfinished.clear()
         actions = actions[1:] if actions is not None else None
-    free_activations(&activations)
+    _free_activations(&activations)
 
     return scores
 
@@ -232,8 +232,8 @@ cdef list _parseC(CBlas cblas, TransitionSystem moves, StateC** states,
 def _forward_fallback(model: Model, moves: TransitionSystem, states: List[StateClass],
         tokvecs, backprop_tok2vec, feats, backprop_feats, seen_mask, is_train: bool,
         actions: Optional[List[Ints1d]]=None):
     nF = model.get_dim("nF")
-    upper = model.get_ref("upper")
-    lower_b = model.get_param("lower_b")
+    output = model.get_ref("output")
+    hidden_b = model.get_param("hidden_b")
     nH = model.get_dim("nH")
     nP = model.get_dim("nP")
@@ -260,13 +260,13 @@ def _forward_fallback(model: Model, moves: TransitionSystem, states: List[StateC
         # Sum the state features, add the bias and apply the activation (maxout)
         # to create the state vectors.
         preacts2f = feats[ids, arange].sum(axis=1)  # type: ignore
-        preacts2f += lower_b
+        preacts2f += hidden_b
         preacts = ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
         assert preacts.shape[0] == len(batch.get_unfinished_states()), preacts.shape
         statevecs, which = ops.maxout(preacts)
-        # We don't use upper's backprop, since we want to backprop for
+        # We don't use output's backprop, since we want to backprop for
         # all states at once, rather than a single state.
-        scores = upper.predict(statevecs)
+        scores = output.predict(statevecs)
         scores[:, seen_mask] = ops.xp.nanmin(scores)
         # Transition the states, filtering out any that are finished.
         cpu_scores = ops.to_numpy(scores)
@@ -295,23 +295,23 @@ def _forward_fallback(model: Model, moves: TransitionSystem, states: List[StateC
             if (d_scores[:, clas] < 0).any():
                 model.attrs["unseen_classes"].remove(clas)
         d_scores *= seen_mask == False
-        # Calculate the gradients for the parameters of the upper layer.
+        # Calculate the gradients for the parameters of the output layer.
         # The weight gemm is (nS, nO) @ (nS, nH).T
-        upper.inc_grad("b", d_scores.sum(axis=0))
-        upper.inc_grad("W", ops.gemm(d_scores, statevecs, trans1=True))
-        # Now calculate d_statevecs, by backproping through the upper linear layer.
+        output.inc_grad("b", d_scores.sum(axis=0))
+        output.inc_grad("W", ops.gemm(d_scores, statevecs, trans1=True))
+        # Now calculate d_statevecs, by backproping through the output linear layer.
         # This gemm is (nS, nO) @ (nO, nH)
-        upper_W = upper.get_param("W")
-        d_statevecs = ops.gemm(d_scores, upper_W)
+        output_W = output.get_param("W")
+        d_statevecs = ops.gemm(d_scores, output_W)
         # Backprop through the maxout activation
         d_preacts = ops.backprop_maxout(d_statevecs, which, nP)
         d_preacts2f = ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
-        model.inc_grad("lower_b", d_preacts2f.sum(axis=0))
+        model.inc_grad("hidden_b", d_preacts2f.sum(axis=0))
        # We don't need to backprop the summation, because we pass back the IDs instead
         d_state_features = backprop_feats((d_preacts2f, ids))
         d_tokvecs = ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
         ops.scatter_add(d_tokvecs, ids, d_state_features)
-        model.inc_grad("lower_pad", d_tokvecs[-1])
+        model.inc_grad("hidden_pad", d_tokvecs[-1])
         return (backprop_tok2vec(d_tokvecs[:-1]), None)
 
     return (list(batch), all_scores), backprop_parser
@@ -323,10 +323,10 @@ def _forward_reference(
     """Slow reference implementation, without the precomputation"""
     nF = model.get_dim("nF")
     tok2vec = model.get_ref("tok2vec")
-    upper = model.get_ref("upper")
-    lower_pad = model.get_param("lower_pad")
-    lower_W = model.get_param("lower_W")
-    lower_b = model.get_param("lower_b")
+    output = model.get_ref("output")
+    hidden_pad = model.get_param("hidden_pad")
+    hidden_W = model.get_param("hidden_W")
+    hidden_b = model.get_param("hidden_b")
     nH = model.get_dim("nH")
     nP = model.get_dim("nP")
     nO = model.get_dim("nO")
@@ -336,7 +336,7 @@ def _forward_reference(
     docs, moves = docs_moves
     states = moves.init_batch(docs)
     tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
-    tokvecs = model.ops.xp.vstack((tokvecs, lower_pad))
+    tokvecs = model.ops.xp.vstack((tokvecs, hidden_pad))
     all_ids = []
     all_which = []
     all_statevecs = []
@@ -353,13 +353,13 @@ def _forward_reference(
         # to create the state vectors.
         tokfeats3f = tokvecs[ids]
         tokfeats = model.ops.reshape2f(tokfeats3f, tokfeats3f.shape[0], -1)
-        preacts2f = model.ops.gemm(tokfeats, lower_W, trans2=True)
-        preacts2f += lower_b
+        preacts2f = model.ops.gemm(tokfeats, hidden_W, trans2=True)
+        preacts2f += hidden_b
         preacts = model.ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
         statevecs, which = ops.maxout(preacts)
-        # We don't use upper's backprop, since we want to backprop for
+        # We don't use output's backprop, since we want to backprop for
         # all states at once, rather than a single state.
-        scores = upper.predict(statevecs)
+        scores = output.predict(statevecs)
         scores[:, seen_mask] = model.ops.xp.nanmin(scores)
         # Transition the states, filtering out any that are finished.
         next_states = moves.transition_states(next_states, scores)
@@ -390,28 +390,28 @@ def _forward_reference(
         d_scores *= seen_mask == False
         assert statevecs.shape == (nS, nH), statevecs.shape
         assert d_scores.shape == (nS, nO), d_scores.shape
-        # Calculate the gradients for the parameters of the upper layer.
+        # Calculate the gradients for the parameters of the output layer.
         # The weight gemm is (nS, nO) @ (nS, nH).T
-        upper.inc_grad("b", d_scores.sum(axis=0))
-        upper.inc_grad("W", model.ops.gemm(d_scores, statevecs, trans1=True))
-        # Now calculate d_statevecs, by backproping through the upper linear layer.
+        output.inc_grad("b", d_scores.sum(axis=0))
+        output.inc_grad("W", model.ops.gemm(d_scores, statevecs, trans1=True))
+        # Now calculate d_statevecs, by backproping through the output linear layer.
         # This gemm is (nS, nO) @ (nO, nH)
-        upper_W = upper.get_param("W")
-        d_statevecs = model.ops.gemm(d_scores, upper_W)
+        output_W = output.get_param("W")
+        d_statevecs = model.ops.gemm(d_scores, output_W)
         # Backprop through the maxout activation
         d_preacts = model.ops.backprop_maxout(d_statevecs, which, nP)
         d_preacts2f = model.ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
-        # Now increment the gradients for the lower layer.
+        # Now increment the gradients for the hidden layer.
         # The gemm here is (nS, nH*nP) @ (nS, nF*nI)
-        model.inc_grad("lower_b", d_preacts2f.sum(axis=0))
-        model.inc_grad("lower_W", model.ops.gemm(d_preacts2f, tokfeats, trans1=True))
+        model.inc_grad("hidden_b", d_preacts2f.sum(axis=0))
+        model.inc_grad("hidden_W", model.ops.gemm(d_preacts2f, tokfeats, trans1=True))
         # Caclulate d_tokfeats
         # The gemm here is (nS, nH*nP) @ (nH*nP, nF*nI)
-        d_tokfeats = model.ops.gemm(d_preacts2f, lower_W)
+        d_tokfeats = model.ops.gemm(d_preacts2f, hidden_W)
         # Get the gradients of the tokvecs and the padding
         d_tokfeats3f = model.ops.reshape3f(d_tokfeats, nS, nF, nI)
         model.ops.scatter_add(d_tokvecs, ids, d_tokfeats3f)
-        model.inc_grad("lower_pad", d_tokvecs[-1])
+        model.inc_grad("hidden_pad", d_tokvecs[-1])
         return (backprop_tok2vec(d_tokvecs[:-1]), None)
 
     return (states, all_scores), backprop_parser
@@ -425,7 +425,7 @@ def _get_seen_mask(model: Model) -> numpy.array[bool, 1]:
 
 
 def _forward_precomputable_affine(model, X: Floats2d, is_train: bool):
-    W: Floats2d = model.get_param("lower_W")
+    W: Floats2d = model.get_param("hidden_W")
     nF = model.get_dim("nF")
     nH = model.get_dim("nH")
     nP = model.get_dim("nP")
@@ -456,7 +456,7 @@ def _forward_precomputable_affine(model, X: Floats2d, is_train: bool):
         dXf = model.ops.gemm(dY, W)
         Xf = X[ids].reshape((ids.shape[0], -1))
         dW = model.ops.gemm(dY, Xf, trans1=True)
-        model.inc_grad("lower_W", dW)
+        model.inc_grad("hidden_W", dW)
         return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI)
 
     return Yf, backward
@@ -482,7 +482,7 @@ def _lsuv_init(model: Model):
     we set the maxout weights to values that empirically result in
     whitened outputs given whitened inputs.
     """
-    W = model.maybe_get_param("lower_W")
+    W = model.maybe_get_param("hidden_W")
     if W is not None and W.any():
         return
@@ -523,66 +523,66 @@ def _lsuv_init(model: Model):
     tol_var = 0.01
     tol_mean = 0.01
     t_max = 10
-    W = cast(Floats4d, model.get_param("lower_W").copy())
-    b = cast(Floats2d, model.get_param("lower_b").copy())
+    W = cast(Floats4d, model.get_param("hidden_W").copy())
+    b = cast(Floats2d, model.get_param("hidden_b").copy())
     for t_i in range(t_max):
         acts1 = predict(ids, tokvecs)
         var = model.ops.xp.var(acts1)
         mean = model.ops.xp.mean(acts1)
         if abs(var - 1.0) >= tol_var:
             W /= model.ops.xp.sqrt(var)
-            model.set_param("lower_W", W)
+            model.set_param("hidden_W", W)
         elif abs(mean) >= tol_mean:
             b -= mean
-            model.set_param("lower_b", b)
+            model.set_param("hidden_b", b)
         else:
             break
     return model
 
 
-cdef WeightsC get_c_weights(model, const float* feats, np.ndarray[np.npy_bool, ndim=1] seen_mask) except *:
-    upper = model.get_ref("upper")
-    cdef np.ndarray lower_b = model.get_param("lower_b")
-    cdef np.ndarray upper_W = upper.get_param("W")
-    cdef np.ndarray upper_b = upper.get_param("b")
+cdef WeightsC _get_c_weights(model, const float* feats, np.ndarray[np.npy_bool, ndim=1] seen_mask) except *:
+    output = model.get_ref("output")
+    cdef np.ndarray hidden_b = model.get_param("hidden_b")
+    cdef np.ndarray output_W = output.get_param("W")
+    cdef np.ndarray output_b = output.get_param("b")
 
-    cdef WeightsC output
-    output.feat_weights = feats
-    output.feat_bias = lower_b.data
-    output.hidden_weights = upper_W.data
-    output.hidden_bias = upper_b.data
-    output.seen_mask = seen_mask.data
+    cdef WeightsC weights
+    weights.feat_weights = feats
+    weights.feat_bias = hidden_b.data
+    weights.hidden_weights = output_W.data
+    weights.hidden_bias = output_b.data
+    weights.seen_mask = seen_mask.data
 
-    return output
+    return weights
 
 
-cdef SizesC get_c_sizes(model, int batch_size, int tokens) except *:
-    cdef SizesC output
-    output.states = batch_size
-    output.classes = model.get_dim("nO")
-    output.hiddens = model.get_dim("nH")
-    output.pieces = model.get_dim("nP")
-    output.feats = model.get_dim("nF")
-    output.embed_width = model.get_dim("nI")
-    output.tokens = tokens
-    return output
+cdef SizesC _get_c_sizes(model, int batch_size, int tokens) except *:
+    cdef SizesC sizes
+    sizes.states = batch_size
+    sizes.classes = model.get_dim("nO")
+    sizes.hiddens = model.get_dim("nH")
+    sizes.pieces = model.get_dim("nP")
+    sizes.feats = model.get_dim("nF")
+    sizes.embed_width = model.get_dim("nI")
+    sizes.tokens = tokens
+    return sizes
 
 
-cdef ActivationsC alloc_activations(SizesC n) nogil:
+cdef ActivationsC _alloc_activations(SizesC n) nogil:
     cdef ActivationsC A
     memset(&A, 0, sizeof(A))
-    resize_activations(&A, n)
+    _resize_activations(&A, n)
     return A
 
 
-cdef void free_activations(const ActivationsC* A) nogil:
+cdef void _free_activations(const ActivationsC* A) nogil:
     free(A.token_ids)
     free(A.unmaxed)
     free(A.hiddens)
    free(A.is_valid)
 
 
-cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
+cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
     if n.states <= A._max_size:
         A._curr_size = n.states
         return
@@ -605,12 +605,12 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
     A._curr_size = n.states
 
 
-cdef void predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC** states, const WeightsC* W, SizesC n) nogil:
-    resize_activations(A, n)
+cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC** states, const WeightsC* W, SizesC n) nogil:
+    _resize_activations(A, n)
     for i in range(n.states):
         states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
     memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
-    sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n)
+    _sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n)
     for i in range(n.states):
         VecVec.add_i(&A.unmaxed[i*n.hiddens*n.pieces],
             W.feat_bias, 1., n.hiddens * n.pieces)
@@ -641,7 +641,7 @@ cdef void predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC** s
             scores[i*n.classes+j] = min_
 
 
-cdef void sum_state_features(CBlas cblas, float* output,
+cdef void _sum_state_features(CBlas cblas, float* output,
         const float* cached, const int* token_ids, SizesC n) nogil:
     cdef int idx, b, f, i
     cdef const float* feature
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 95208141d..82f01dcc2 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -263,12 +263,12 @@ def test_serialize_custom_nlp():
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
         assert model.get_ref("tok2vec") is not None
-        assert model.has_param("lower_W")
-        assert model.has_param("lower_b")
-        upper = model.get_ref("upper")
-        assert upper is not None
-        assert upper.has_param("W")
-        assert upper.has_param("b")
+        assert model.has_param("hidden_W")
+        assert model.has_param("hidden_b")
+        output = model.get_ref("output")
+        assert output is not None
+        assert output.has_param("W")
+        assert output.has_param("b")
 
 
 @pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
@@ -285,12 +285,12 @@ def test_serialize_parser(parser_config_string):
         nlp2 = spacy.load(d)
         model = nlp2.get_pipe("parser").model
         assert model.get_ref("tok2vec") is not None
-        assert model.has_param("lower_W")
-        assert model.has_param("lower_b")
-        upper = model.get_ref("upper")
-        assert upper is not None
-        assert upper.has_param("b")
-        assert upper.has_param("W")
+        assert model.has_param("hidden_W")
+        assert model.has_param("hidden_b")
+        output = model.get_ref("output")
+        assert output is not None
+        assert output.has_param("b")
+        assert output.has_param("W")
 
 
 def test_config_nlp_roundtrip():
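
For code outside spaCy that reaches into the parser model's internals, the rename is purely mechanical; no shapes or semantics change. A minimal migration sketch, not part of the patch: `nlp` stands in for any pipeline whose parser was built with this refactored `TransitionModel`, and the lookups use the standard Thinc `Model` API exercised by the tests above.

    model = nlp.get_pipe("parser").model

    output = model.get_ref("output")            # was: model.get_ref("upper")
    hidden_W = model.get_param("hidden_W")      # was: model.get_param("lower_W")
    hidden_b = model.get_param("hidden_b")      # was: model.get_param("lower_b")
    hidden_pad = model.get_param("hidden_pad")  # was: model.get_param("lower_pad")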
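
The FIXME kept at the top of the first hunk still applies under the new names: Thinc optimizers cannot resize parameters in place, so growing the label set swaps in a whole new `output` layer (with a new optimizer key) rather than resizing `output_W`/`output_b`. A hedged sketch of the observable behaviour of `resize_output`, assuming a model whose "nO" is currently 3:

    model = resize_output(model, new_nO=5)  # replaces the `output` layer wholesale
    assert model.get_ref("output").get_dim("nO") == 5
    # The two new weight rows start zero-initialized; their class indices are
    # tracked in model.attrs["unseen_classes"] until they receive real updates.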