WIP refactor parser

2025-07-21 21:49:49 +03:00 · 2021-01-25 23:22:10 +11:00 · 2021-01-25 23:22:10 +11:00 · 267ffb5605
commit 267ffb5605
parent b456929bfd
1 changed files with 71 additions and 133 deletions
--- a/spacy/ml/parser_model.pyx
+++ b/spacy/ml/parser_model.pyx
@ -18,8 +18,9 @@ from ..pipeline._parser_internals.stateclass cimport StateClass
 cdef WeightsC get_c_weights(model) except *:
    cdef WeightsC output
    cdef precompute_hiddens state2vec = model.state2vec
    cdef np.ndarray bias = state2vec.bias
    output.feat_weights = state2vec.get_feat_weights()
-    output.feat_bias = <const float*>state2vec.bias.data
+    output.feat_bias = <const float*>bias.data
    cdef np.ndarray vec2scores_W
    cdef np.ndarray vec2scores_b
    if model.vec2scores is None:
@ -220,27 +221,23 @@ class ParserStepModel(Model):
            activation = None
        else:
            activation = "relu"
-        self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
+        self.state2vec = precompute_hiddens(
-                                            activation=activation, train=train)
+            len(docs),
            self.tokvecs,
            layers[1],
            activation=activation,
            train=train
        )
        if has_upper:
            self.vec2scores = layers[-1]
        else:
            self.vec2scores = None
        self.cuda_stream = util.get_cuda_stream(non_blocking=True)
        self.backprops = []
        self._class_mask = numpy.zeros((self.nO,), dtype='f')
        self._class_mask.fill(1)
        if unseen_classes is not None:
            for class_ in unseen_classes:
                self._class_mask[class_] = 0.
    def clear_memory(self):
        """Delete the per-batch attributes held by this step model.

        Removes `tokvecs`, `bp_tokvecs`, `state2vec`, `backprops` and
        `_class_mask` from the instance. Each `del` raises `AttributeError`
        if the attribute is not currently set, so this is not safe to call
        twice in a row.
        """
        # NOTE(review): presumably called once a batch is finished so the
        # arrays can be freed -- confirm against the caller.
        del self.tokvecs
        del self.bp_tokvecs
        del self.state2vec
        del self.backprops
        del self._class_mask
    @property
    def nO(self):
        if self.attrs["has_upper"]:
@ -248,6 +245,13 @@ class ParserStepModel(Model):
        else:
            return self.state2vec.get_dim("nO")
    def clear_memory(self):
        """Delete the per-batch attributes held by this step model.

        Removes `tokvecs`, `bp_tokvecs`, `state2vec`, `backprops` and
        `_class_mask` from the instance. Each `del` raises `AttributeError`
        if the attribute is not currently set, so this is not safe to call
        twice in a row.
        """
        # NOTE(review): presumably called once a batch is finished so the
        # arrays can be freed -- confirm against the caller.
        del self.tokvecs
        del self.bp_tokvecs
        del self.state2vec
        del self.backprops
        del self._class_mask
    def class_is_unseen(self, class_):
        """Return the `_class_mask` entry for `class_`.

        The value is returned as stored in the mask array; it is not
        converted to a bool.
        """
        # NOTE(review): the constructor fills `_class_mask` with 1 and sets
        # the entries for `unseen_classes` to 0, so this returns a falsy
        # value for an unseen class and a truthy one otherwise -- the
        # opposite of what the method name suggests. Confirm what callers
        # expect before relying on (or changing) the return value.
        return self._class_mask[class_]
@ -269,54 +273,22 @@ class ParserStepModel(Model):
            c_ids += ids.shape[1]
        return ids
    def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
        """Queue the backward-pass inputs of one step on `self.backprops`.

        Appends a `(token_ids, d_vector, get_d_tokvecs)` tuple. Nothing is
        back-propagated here; `finish_steps` later iterates over
        `self.backprops` and invokes each stored callback.

        token_ids: ids that were fed to `state2vec` for this step.
        d_vector: gradient with respect to the `state2vec` output.
        get_d_tokvecs: backprop callback stored alongside the two arrays.
        """
        # The transfer branch is taken only when state2vec runs on CupyOps
        # and token_ids is not already an array of that ops' `xp` module.
        if isinstance(self.state2vec.ops, CupyOps) \
        and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
            # Move token_ids and d_vector to GPU, asynchronously
            self.backprops.append((
                util.get_async(self.cuda_stream, token_ids),
                util.get_async(self.cuda_stream, d_vector),
                get_d_tokvecs
            ))
        else:
            # Store the arrays as given; no transfer is performed.
            self.backprops.append((token_ids, d_vector, get_d_tokvecs))
-
+def step_forward(model: ParserStepModel, token_ids, is_train):
    def finish_steps(self, golds):
        """Run every queued backprop step and pass the summed gradient on.

        For each `(ids, d_vector, bp_vector)` tuple in `self.backprops`, the
        callback is invoked with `(d_vector, ids)` and its result is
        scatter-added into a freshly allocated gradient array. The array,
        minus its final padding row, is then handed to `self.bp_tokvecs`.

        golds: not used anywhere in this method body.
        RETURNS: the full gradient array, including the extra padding row.
        """
        # Add a padding vector to the d_tokvecs gradient, so that missing
        # values don't affect the real gradient.
        d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
        # Tells CUDA to block, so our async copies complete.
        if self.cuda_stream is not None:
            self.cuda_stream.synchronize()
        for ids, d_vector, bp_vector in self.backprops:
            d_state_features = bp_vector((d_vector, ids))
            # Flatten the ids and reshape the features so there is one
            # feature row per id before accumulating.
            ids = ids.flatten()
            d_state_features = d_state_features.reshape(
                (ids.size, d_state_features.shape[2]))
            self.ops.scatter_add(d_tokvecs, ids,
                d_state_features)
        # Padded -- see update()
        self.bp_tokvecs(d_tokvecs[:-1])
        return d_tokvecs
 NUMPY_OPS = NumpyOps()
 def step_forward(model: ParserStepModel, states, is_train):
    token_ids = model.get_token_ids(states)
    vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
    mask = None
    if model.attrs["has_upper"]:
        vec2scores = ensure_same_device(model.ops, model.vec2scores)
        dropout_rate = model.attrs["dropout_rate"]
        if is_train and dropout_rate > 0:
-            mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1)
+            mask = model.ops.get_dropout_mask(vector.shape, dropout_rate)
            vector *= mask
-        scores, get_d_vector = model.vec2scores(vector, is_train)
+        scores, get_d_vector = vec2scores(vector, is_train)
    else:
-        scores = NumpyOps().asarray(vector)
+        scores = vector
        get_d_vector = lambda d_scores: d_scores
    # If the class is unseen, make sure its score is minimum
-    scores[:, model._class_mask == 0] = numpy.nanmin(scores)
+    scores[:, model._class_mask == 0] = model.ops.xp.nanmin(scores)
    def backprop_parser_step(d_scores):
        # Zero vectors for unseen classes
@ -324,11 +296,18 @@ def step_forward(model: ParserStepModel, states, is_train):
        d_vector = get_d_vector(d_scores)
        if mask is not None:
            d_vector *= mask
-        model.backprop_step(token_ids, d_vector, get_d_tokvecs)
+        return get_d_tokvecs(d_vector)
-        return None
+    
    return scores, backprop_parser_step
 def ensure_same_device(ops, model):
    """Ensure a model is on the same device as a given ops.

    If `model.ops` is not an instance of the class of `ops`, the model is
    switched over by calling `model._to_ops(ops)`. Because the check uses
    `isinstance`, a `model.ops` that is an instance of a subclass of
    `ops.__class__` is left unchanged.

    ops: the ops object the model should match.
    model: the model to check; converted in place when needed.
    RETURNS: the same `model` object that was passed in.
    """
    if not isinstance(model.ops, ops.__class__):
        model._to_ops(ops)
    return model
 cdef class precompute_hiddens:
    """Allow a model to be "primed" by pre-computing input features in bulk.
@ -347,31 +326,23 @@ cdef class precompute_hiddens:
    and do the hard-to-program parsing on the CPU.
    """
    cdef readonly int nF, nO, nP
    cdef bint _is_synchronized
    cdef public object ops
-    cdef public object numpy_ops
+    cdef readonly object bias
-    cdef np.ndarray _features
+    cdef readonly object activation
-    cdef np.ndarray _cached
+    cdef readonly object _features
-    cdef np.ndarray bias
+    cdef readonly object _cached
-    cdef object _cuda_stream
+    cdef readonly object _bp_hiddens
    cdef object _bp_hiddens
    cdef object activation
-    def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
+    def __init__(
-                 activation="maxout", train=False):
+        self,
-        gpu_cached, bp_features = lower_model(tokvecs, train)
+        batch_size,
-        cdef np.ndarray cached
+        tokvecs,
-        if not isinstance(gpu_cached, numpy.ndarray):
+        lower_model, 
-            # Note the passing of cuda_stream here: it lets
+        activation="maxout",
-            # cupy make the copy asynchronously.
+        train=False
-            # We then have to block before first use.
+    ):
-            cached = gpu_cached.get(stream=cuda_stream)
+        cached, bp_features = lower_model(tokvecs, train)
-        else:
+        self.bias = lower_model.get_param("b")
            cached = gpu_cached
        if not isinstance(lower_model.get_param("b"), numpy.ndarray):
            self.bias = lower_model.get_param("b").get(stream=cuda_stream)
        else:
            self.bias = lower_model.get_param("b")
        self.nF = cached.shape[1]
        if lower_model.has_dim("nP"):
            self.nP = lower_model.get_dim("nP")
@ -379,19 +350,18 @@ cdef class precompute_hiddens:
            self.nP = 1
        self.nO = cached.shape[2]
        self.ops = lower_model.ops
        self.numpy_ops = NumpyOps()
        assert activation in (None, "relu", "maxout")
        self.activation = activation
        self._is_synchronized = False
        self._cuda_stream = cuda_stream
        self._cached = cached
        self._bp_hiddens = bp_features
    cdef const float* get_feat_weights(self) except NULL:
-        if not self._is_synchronized and self._cuda_stream is not None:
+        cdef np.ndarray cached
-            self._cuda_stream.synchronize()
+        if isinstance(self._cached, numpy.ndarray):
-            self._is_synchronized = True
+            cached = self._cached
-        return <float*>self._cached.data
+        else:
            cached = self._cached.get()
        return <float*>cached.data
    def has_dim(self, name):
        if name == "nF":
@ -433,57 +403,25 @@ cdef class precompute_hiddens:
        return self.begin_update(X)[0]
    def begin_update(self, token_ids):
-        cdef np.ndarray state_vector = numpy.zeros(
+        nO = self.nO
-            (token_ids.shape[0], self.nO, self.nP), dtype='f')
+        nP = self.nP
-        # This is tricky, but (assuming GPU available);
+        hidden = self.model.ops.alloc2f(
-        # - Input to forward on CPU
+            token_ids.shape[0],
-        # - Output from forward on CPU
+            nO * nP
-        # - Input to backward on GPU!
+        ) 
        # - Output from backward on GPU
        bp_hiddens = self._bp_hiddens
        feat_weights = self.cached
        self.ops.scatter_add(
            hidden,
            feat_weights,
            token_ids
        )
        hidden += self.bias
        statevec, mask = self.ops.maxout(hidden.reshape((-1, nO, nP)))
-        feat_weights = self.get_feat_weights()
+        def backward(d_statevec):
-        cdef int[:, ::1] ids = token_ids
+            return bp_hiddens(
-        sum_state_features(<float*>state_vector.data,
+                self.ops.backprop_maxout(d_statevec, mask, nP)
-            feat_weights, &ids[0,0],
+            )
            token_ids.shape[0], self.nF, self.nO*self.nP)
        state_vector += self.bias
        state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
        def backward(d_state_vector_ids):
            d_state_vector, token_ids = d_state_vector_ids
            d_state_vector = bp_nonlinearity(d_state_vector)
            d_tokens = bp_hiddens((d_state_vector, token_ids))
            return d_tokens
        return state_vector, backward
    def _nonlinearity(self, state_vector):
        if self.activation == "maxout":
            return self._maxout_nonlinearity(state_vector)
        else:
            return self._relu_nonlinearity(state_vector)
    def _maxout_nonlinearity(self, state_vector):
        state_vector, mask = self.numpy_ops.maxout(state_vector)
        # We're outputting to CPU, but we need this variable on GPU for the
        # backward pass.
        mask = self.ops.asarray(mask)
        def backprop_maxout(d_best):
            return self.ops.backprop_maxout(d_best, mask, self.nP)
-        return state_vector, backprop_maxout
+        return statevec, backward
    def _relu_nonlinearity(self, state_vector):
        state_vector = state_vector.reshape((state_vector.shape[0], -1))
        mask = state_vector >= 0.
        state_vector *= mask
        # We're outputting to CPU, but we need this variable on GPU for the
        # backward pass.
        mask = self.ops.asarray(mask)
        def backprop_relu(d_best):
            d_best *= mask
            return d_best.reshape((d_best.shape + (1,)))
        return state_vector, backprop_relu