Work on refactoring greedy parser

2025-07-13 01:32:32 +03:00 · 2018-05-07 15:45:52 +02:00 · 2018-05-07 15:45:52 +02:00 · 7f163442e6
commit 7f163442e6
parent a8e70a4187
4 changed files with 569 additions and 755 deletions
--- a/spacy/syntax/_parser_model.pxd
+++ b/spacy/syntax/_parser_model.pxd
@ -0,0 +1,47 @@
+from libc.string cimport memset, memcpy
+from libc.stdlib cimport calloc, free, realloc
+from thinc.typedefs cimport weight_t, class_t, hash_t
+
+from ._state cimport StateC
+
+
+# Dimensions handed to the C-level parser routines (resize_activations,
+# predict_states).
+cdef struct SizesC:
+    int states  # number of StateC entries processed per call
+    int classes  # number of scores produced per state
+    int hiddens  # hidden units per state, after the max over pieces
+    int pieces  # candidates per hidden unit; predict_states keeps the max
+    int feats  # context token ids written per state
+    int embed_width  # NOTE(review): presumably the input embedding width; confirm
+
+
+# Raw pointers to the weight buffers read by predict_states().
+cdef struct WeightsC:
+    float* feat_weights  # passed to sum_state_features() as the precomputed feature table
+    float* feat_bias  # added to the summed features; hiddens * pieces values
+    float* hidden_bias  # added to every row of scores; `classes` values
+    float* hidden_weights  # hidden-to-output operand of the gemm (hiddens x classes)
+    float* vectors  # NOTE(review): not read by the routines visible here; confirm intent
+
+
+# Scratch buffers used by predict_states() and sized by resize_activations().
+cdef struct ActivationsC:
+    int* token_ids  # context token ids filled in by StateC.set_context_tokens()
+    float* vectors  # per-state vectors after the max over pieces; input to the gemm
+    float* unmaxed  # summed features plus bias, before the max over pieces
+    float* scores  # per-state class scores, `classes` values per row
+    float* hiddens  # NOTE(review): declared but not allocated by resize_activations(); confirm
+    int* is_valid  # states * classes ints; presumably per-class validity flags -- confirm
+    int _curr_size  # number of states requested by the latest resize
+    int _max_size  # intended capacity of the buffers, counted in states
+
+
+# Return a WeightsC for `model`.
+cdef WeightsC get_c_weights(model)
+
+# Make sure the buffers in A can hold n.states states.
+cdef void resize_activations(ActivationsC* A, SizesC n) nogil
+
+# Compute class scores for the given states and leave them in A.scores.
+cdef void predict_states(ActivationsC* A, StateC** states,
+        const WeightsC* W, SizesC n) nogil
+ 
+# Index of the highest score among entries with is_valid[i] >= 1, or -1 if
+# there is none.
+cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
+
+# Write the gradient of a multi-label log loss over O classes into d_scores.
+cdef void cpu_log_loss(float* d_scores,
+        const float* costs, const int* is_valid, const float* scores, int O) nogil
+ 
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@ -0,0 +1,352 @@
+# cython: infer_types=True
+# cython: cdivision=True
+# cython: boundscheck=False
+# coding: utf-8
+from __future__ import unicode_literals, print_function
+
+from collections import OrderedDict
+import ujson
+import json
+import numpy
+cimport cython.parallel
+import cytoolz
+import numpy.random
+cimport numpy as np
+from libc.math cimport exp
+from libcpp.vector cimport vector
+from libc.string cimport memset, memcpy
+from libc.stdlib cimport calloc, free, realloc
+from cymem.cymem cimport Pool
+from thinc.typedefs cimport weight_t, class_t, hash_t
+from thinc.extra.search cimport Beam
+from thinc.api import chain, clone
+from thinc.v2v import Model, Maxout, Affine
+from thinc.misc import LayerNorm
+from thinc.neural.ops import CupyOps
+from thinc.neural.util import get_array_module
+from thinc.linalg cimport Vec, VecVec
+from thinc cimport openblas
+
+
+from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
+from .._ml import link_vectors_to_models, create_default_optimizer
+from ..compat import json_dumps, copy_array
+from ..tokens.doc cimport Doc
+from ..gold cimport GoldParse
+from ..errors import Errors, TempErrors
+from .. import util
+from .stateclass cimport StateClass
+from .transition_system cimport Transition
+from . import _beam_utils, nonproj
+
+
+cdef WeightsC get_c_weights(model):
+    """Return the C weight pointers for `model`.
+
+    TODO: still a stub -- nothing is read from `model` yet. The struct is
+    zeroed so callers see NULL pointers rather than whatever happened to be
+    on the stack.
+    """
+    cdef WeightsC output
+    memset(&output, 0, sizeof(output))
+    return output
+
+cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
+    """Make sure the buffers in `A` can hold `n.states` states.
+
+    The buffers only ever grow. If the current capacity already covers
+    `n.states`, only `_curr_size` is updated. Otherwise every buffer is
+    allocated (first use, zero-filled) or re-allocated (later growth, new
+    tail not zeroed) and `_max_size` records the new capacity.
+
+    Per-state sizes, matching how predict_states() indexes the buffers:
+    token_ids: feats; vectors: hiddens; unmaxed: hiddens * pieces;
+    scores and is_valid: classes.
+
+    NOTE(review): A.hiddens is not allocated here, and allocation failure
+    (NULL from calloc/realloc) is not checked.
+    """
+    if n.states <= A._max_size:
+        A._curr_size = n.states
+        return
+    if A._max_size == 0:
+        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
+        A.vectors = <float*>calloc(n.states * n.hiddens, sizeof(A.vectors[0]))
+        A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
+        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
+        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
+    else:
+        A.token_ids = <int*>realloc(A.token_ids,
+            n.states * n.feats * sizeof(A.token_ids[0]))
+        # Each buffer must be re-allocated from its own pointer.
+        A.vectors = <float*>realloc(A.vectors,
+            n.states * n.hiddens * sizeof(A.vectors[0]))
+        A.scores = <float*>realloc(A.scores,
+            n.states * n.classes * sizeof(A.scores[0]))
+        A.unmaxed = <float*>realloc(A.unmaxed,
+            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
+        A.is_valid = <int*>realloc(A.is_valid,
+            n.states * n.classes * sizeof(A.is_valid[0]))
+    # Record the capacity on both paths; otherwise the first allocation is
+    # repeated (and leaked) on every call.
+    A._max_size = n.states
+    A._curr_size = n.states
+
+
+cdef void predict_states(ActivationsC* A, StateC** states,
+        const WeightsC* W, SizesC n) nogil:
+    """Score every state in `states`, leaving the result in `A.scores`.
+
+    For each state: collect its context token ids, sum the matching rows of
+    `W.feat_weights`, add `W.feat_bias`, and keep the max over each group of
+    `n.pieces` values as that state's row of `A.vectors`. Once all rows are
+    filled, one matrix product with `W.hidden_weights` plus `W.hidden_bias`
+    gives the class scores.
+
+    A: scratch buffers; resized here as needed.
+    states: `n.states` state pointers.
+    W: weight pointers.
+    n: problem dimensions.
+    """
+    resize_activations(A, n)
+    if n.states <= 0:
+        # Nothing to score; skip the gemm as well.
+        return
+    for i in range(n.states):
+        state = states[i]
+        # token_ids and unmaxed are reused as single-state scratch space, so
+        # both are always addressed from their start.
+        state.set_context_tokens(A.token_ids, n.feats)
+        memset(A.unmaxed, 0, n.hiddens * n.pieces * sizeof(float))
+        sum_state_features(A.unmaxed,
+            W.feat_weights, A.token_ids, 1, n.feats, n.hiddens * n.pieces)
+        VecVec.add_i(A.unmaxed,
+            W.feat_bias, 1., n.hiddens * n.pieces)
+        state_vector = &A.vectors[i*n.hiddens]
+        for j in range(n.hiddens):
+            index = j * n.pieces
+            which = Vec.arg_max(&A.unmaxed[index], n.pieces)
+            state_vector[j] = A.unmaxed[index + which]
+    # Compute hidden-to-output. This runs once, after every row of A.vectors
+    # has been filled, rather than once per state.
+    openblas.simple_gemm(A.scores, n.states, n.classes,
+        A.vectors, n.states, n.hiddens,
+        W.hidden_weights, n.hiddens, n.classes, 0, 0)
+    # Add bias
+    for i in range(n.states):
+        VecVec.add_i(&A.scores[i*n.classes],
+            W.hidden_bias, 1., n.classes)
+
+            
+cdef void sum_state_features(float* output,
+        const float* cached, const int* token_ids, int B, int F, int O) nogil:
+    # Accumulate, for each of B rows, the F feature vectors (length O) picked
+    # by token_ids into output[b*O : (b+1)*O].
+    #
+    # `cached` starts with one block of F*O padding values, used for negative
+    # token ids; the per-token blocks of F*O values follow it.
+    cdef int row, slot, offset
+    cdef const float* source
+    cdef const float* pad_rows = cached
+    cdef const float* token_rows = cached + F * O
+    cdef int token_stride = F * O
+    cdef float alpha = 1.
+    for row in range(B):
+        for slot in range(F):
+            if token_ids[slot] >= 0:
+                offset = token_ids[slot] * token_stride + slot * O
+                source = &token_rows[offset]
+            else:
+                source = &pad_rows[slot * O]
+            openblas.simple_axpy(&output[row * O], O, source, alpha)
+        # Step to the ids of the next row.
+        token_ids += F
+
+
+cdef void cpu_log_loss(float* d_scores,
+        const float* costs, const int* is_valid, const float* scores,
+        int O) nogil:
+    """Do multi-label log loss.
+
+    Writes into `d_scores` (length O) the softmax over the valid classes
+    minus the softmax restricted to the classes whose cost is no higher than
+    that of the best-scoring gold class. Invalid classes get a gradient of 0.
+
+    If there is no valid class, or arg_max_if_gold() finds no gold class
+    (it returns -1), the whole gradient is set to 0 instead of indexing the
+    arrays with -1.
+    """
+    cdef double max_, gmax, Z, gZ
+    cdef float best_cost
+    best = arg_max_if_gold(scores, costs, is_valid, O)
+    guess = arg_max_if_valid(scores, is_valid, O)
+    if best < 0 or guess < 0:
+        for i in range(O):
+            d_scores[i] = 0.
+        return
+    best_cost = costs[best]
+    # Small epsilons keep the divisions below well-defined.
+    Z = 1e-10
+    gZ = 1e-10
+    # Subtract the respective maxima before exponentiating.
+    max_ = scores[guess]
+    gmax = scores[best]
+    for i in range(O):
+        if is_valid[i]:
+            Z += exp(scores[i] - max_)
+            if costs[i] <= best_cost:
+                gZ += exp(scores[i] - gmax)
+    for i in range(O):
+        if not is_valid[i]:
+            d_scores[i] = 0.
+        elif costs[i] <= best_cost:
+            d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ)
+        else:
+            d_scores[i] = exp(scores[i]-max_) / Z
+
+ 
+cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
+        const int* is_valid, int n) nogil:
+    # Return the index of the best-scoring valid class whose cost does not
+    # exceed the lowest cost seen among valid classes (capped at 1), or -1
+    # when no class qualifies.
+    cdef int k
+    cdef int winner = -1
+    cdef float lowest = 1
+    # First pass: lowest cost among the valid classes.
+    for k in range(n):
+        if is_valid[k] and costs[k] < lowest:
+            lowest = costs[k]
+    # Second pass: highest score among valid classes at that cost.
+    for k in range(n):
+        if not is_valid[k]:
+            continue
+        if costs[k] <= lowest and (winner == -1 or scores[k] > scores[winner]):
+            winner = k
+    return winner
+
+
+cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
+    # Return the index of the highest score among classes with
+    # is_valid[i] >= 1, or -1 when there is none. Ties keep the first index.
+    cdef int k
+    cdef int winner = -1
+    for k in range(n):
+        if is_valid[k] < 1:
+            continue
+        if winner == -1 or scores[k] > scores[winner]:
+            winner = k
+    return winner
+
+
+class ParserModel(Model):
+    """Container for the three parser layers: tok2vec, lower and upper.
+
+    begin_update() does not score anything itself; it builds a
+    ParserStepModel for the batch of docs and returns it together with a
+    callback that applies the updates queued on that step model.
+    """
+    def __init__(self, tok2vec, lower_model, upper_model):
+        Model.__init__(self)
+        self._layers = [tok2vec, lower_model, upper_model]
+
+    def begin_update(self, docs, drop=0.):
+        """Return (step_model, finish_update) for a batch of docs.
+
+        finish_update(golds, sgd=None) calls step_model.make_updates(sgd)
+        and returns None; `golds` is accepted but not used.
+        """
+        # Read the same attribute that __init__ sets and the properties use.
+        step_model = ParserStepModel(docs, self._layers, drop=drop)
+        def finish_parser_update(golds, sgd=None):
+            step_model.make_updates(sgd)
+            return None
+        return step_model, finish_parser_update
+
+    @property
+    def tok2vec(self):
+        return self._layers[0]
+
+    @property
+    def lower(self):
+        return self._layers[1]
+
+    @property
+    def upper(self):
+        return self._layers[2]
+
+
+class ParserStepModel(Model):
+    """Model for stepping through parser states for one batch of docs.
+
+    On construction the first layer is run over `docs` and the second layer
+    is precomputed via precompute_hiddens. begin_update() then scores batches
+    of states. Its backprop callback only queues what it needs;
+    make_updates() later pushes the queued gradients back through the
+    precomputed layer and the first layer.
+    """
+    def __init__(self, docs, layers, drop=0.):
+        self.tokvecs, self.bp_tokvecs = layers[0].begin_update(docs, drop=drop)
+        self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
+                                            drop=drop)
+        self.vec2scores = layers[-1]
+        self.cuda_stream = util.get_cuda_stream()
+        self.backprops = []
+
+    def begin_update(self, states, drop=0.):
+        """Score `states`; return (scores, backprop_parser_step)."""
+        token_ids = self.get_token_ids(states)
+        # Dropout is applied to the state vector below, not inside state2vec.
+        vector, get_d_tokvecs = self.state2vec.begin_update(token_ids, drop=0.0)
+        vector, bp_dropout = self.ops.dropout(vector, drop)
+        scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop)
+
+        def backprop_parser_step(d_scores, sgd=None):
+            d_vector = bp_dropout(get_d_vector(d_scores, sgd=sgd))
+            # Check the ops of state2vec, the same object the second half of
+            # this condition consults.
+            if isinstance(self.state2vec.ops, CupyOps) \
+            and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
+                # Move token_ids and d_vector to GPU, asynchronously
+                self.backprops.append((
+                    util.get_async(self.cuda_stream, token_ids),
+                    util.get_async(self.cuda_stream, d_vector),
+                    get_d_tokvecs
+                ))
+            else:
+                self.backprops.append((token_ids, d_vector, get_d_tokvecs))
+            return None
+        return scores, backprop_parser_step
+
+    def get_token_ids(self, states):
+        """Return an int array with one row of context token ids per state.
+
+        Rows belonging to final states are left as zeros.
+        """
+        cdef StateClass state
+        # NOTE(review): nr_feature is not set anywhere in this class; confirm
+        # where it is expected to come from.
+        cdef int n_tokens = self.nr_feature
+        cdef np.ndarray ids = numpy.zeros((len(states), n_tokens),
+                                          dtype='i', order='C')
+        c_ids = <int*>ids.data
+        for i, state in enumerate(states):
+            if not state.is_final():
+                state.c.set_context_tokens(c_ids, n_tokens)
+            # Advance to the next row of `ids`.
+            c_ids += ids.shape[1]
+        return ids
+
+    def make_updates(self, sgd):
+        """Apply all queued backprop steps and return the tokvecs gradient.
+
+        The returned array has one more row than self.tokvecs (the padding
+        row); the first layer is updated with that row dropped.
+        """
+        # Tells CUDA to block, so our async copies complete.
+        if self.cuda_stream is not None:
+            self.cuda_stream.synchronize()
+        # Add a padding vector to the d_tokvecs gradient, so that missing
+        # values don't affect the real gradient.
+        d_tokvecs = self.ops.allocate((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
+        for ids, d_vector, bp_vector in self.backprops:
+            d_state_features = bp_vector((d_vector, ids), sgd=sgd)
+            ids = ids.flatten()
+            d_state_features = d_state_features.reshape(
+                (ids.size, d_state_features.shape[2]))
+            self.ops.scatter_add(d_tokvecs, ids,
+                d_state_features)
+        # Padded -- see update()
+        self.bp_tokvecs(d_tokvecs[:-1], sgd=sgd)
+        return d_tokvecs
+
+
+cdef class precompute_hiddens:
+    """Allow a model to be "primed" by pre-computing input features in bulk.
+
+    This is used for the parser, where we want to take a batch of documents,
+    and compute vectors for each (token, position) pair. These vectors can then
+    be reused, especially for beam-search.
+
+    Let's say we're using 12 features for each state, e.g. word at start of
+    buffer, three words on stack, their children, etc. In the normal arc-eager
+    system, a document of length N is processed in 2*N states. This means we'll
+    create 2*N*12 feature vectors --- but if we pre-compute, we only need
+    N*12 vector computations. The saving for beam-search is much better:
+    if we have a beam of k, we'll normally make 2*N*12*K computations --
+    so we can save the factor k. This also gives a nice CPU/GPU division:
+    we can do all our hard maths up front, packed into large multiplications,
+    and do the hard-to-program parsing on the CPU.
+    """
+    # nF and nO are read from the cached array's shape in __init__; nP comes
+    # from lower_model.nP (default 1).
+    cdef int nF, nO, nP
+    # Set once the CUDA stream has been synchronized in get_feat_weights().
+    cdef bint _is_synchronized
+    cdef public object ops
+    # NOTE(review): _features is declared but never assigned in this class.
+    cdef np.ndarray _features
+    # Output of lower_model.begin_update(); a numpy array after __init__.
+    cdef np.ndarray _cached
+    cdef np.ndarray bias
+    cdef object _cuda_stream
+    # Backprop callback returned by lower_model.begin_update().
+    cdef object _bp_hiddens
+
+    def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
+                 drop=0.):
+        """Run `lower_model` over `tokvecs` once and cache the result.
+
+        `batch_size` is accepted but not used here.
+        """
+        gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop)
+        cdef np.ndarray cached
+        if not isinstance(gpu_cached, numpy.ndarray):
+            # Note the passing of cuda_stream here: it lets
+            # cupy make the copy asynchronously.
+            # We then have to block before first use.
+            cached = gpu_cached.get(stream=cuda_stream)
+        else:
+            cached = gpu_cached
+        # The bias is needed as a numpy array as well.
+        if not isinstance(lower_model.b, numpy.ndarray):
+            self.bias = lower_model.b.get()
+        else:
+            self.bias = lower_model.b
+        self.nF = cached.shape[1]
+        self.nP = getattr(lower_model, 'nP', 1)
+        self.nO = cached.shape[2]
+        self.ops = lower_model.ops
+        self._is_synchronized = False
+        self._cuda_stream = cuda_stream
+        self._cached = cached
+        self._bp_hiddens = bp_features
+
+    # Return a pointer to the cached data, synchronizing the CUDA stream
+    # first if that has not happened yet.
+    cdef const float* get_feat_weights(self) except NULL:
+        if not self._is_synchronized and self._cuda_stream is not None:
+            self._cuda_stream.synchronize()
+            self._is_synchronized = True
+        return <float*>self._cached.data
+
+    def __call__(self, X):
+        """Forward pass only: the first element of begin_update(X)."""
+        return self.begin_update(X)[0]
+
+    def begin_update(self, token_ids, drop=0.):
+        """Return (state_vector, backward) for a 2-d int array of token ids.
+
+        `drop` is accepted but not used here.
+        """
+        cdef np.ndarray state_vector = numpy.zeros(
+            (token_ids.shape[0], self.nO, self.nP), dtype='f')
+        # This is tricky, but (assuming GPU available);
+        # - Input to forward on CPU
+        # - Output from forward on CPU
+        # - Input to backward on GPU!
+        # - Output from backward on GPU
+        bp_hiddens = self._bp_hiddens
+
+        feat_weights = self.get_feat_weights()
+        cdef int[:, ::1] ids = token_ids
+        # Sum the cached feature rows selected by each row of ids.
+        sum_state_features(<float*>state_vector.data,
+            feat_weights, &ids[0,0],
+            token_ids.shape[0], self.nF, self.nO*self.nP)
+        state_vector += self.bias
+        state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
+
+        def backward(d_state_vector_ids, sgd=None):
+            # Takes a (gradient, token_ids) pair.
+            d_state_vector, token_ids = d_state_vector_ids
+            d_state_vector = bp_nonlinearity(d_state_vector, sgd)
+            # This will usually be on GPU
+            if not isinstance(d_state_vector, self.ops.xp.ndarray):
+                d_state_vector = self.ops.xp.array(d_state_vector)
+            d_tokens = bp_hiddens((d_state_vector, token_ids), sgd)
+            return d_tokens
+        return state_vector, backward
+
+    def _nonlinearity(self, state_vector):
+        """Apply the nonlinearity; return (output, backprop_nonlinearity).
+
+        With nP == 1 the last axis is dropped and negative values are zeroed;
+        otherwise self.ops.maxout is used.
+        """
+        if self.nP == 1:
+            state_vector = state_vector.reshape(state_vector.shape[:-1])
+            mask = state_vector >= 0.
+            state_vector *= mask
+        else:
+            state_vector, mask = self.ops.maxout(state_vector)
+
+        def backprop_nonlinearity(d_best, sgd=None):
+            if self.nP == 1:
+                d_best *= mask
+                # Restore the trailing axis dropped in the forward pass.
+                d_best = d_best.reshape((d_best.shape + (1,)))
+                return d_best
+            else:
+                return self.ops.backprop_maxout(d_best, mask, self.nP)
+        return state_vector, backprop_nonlinearity
+
--- a/spacy/syntax/nn_parser.pxd
+++ b/spacy/syntax/nn_parser.pxd
@ -6,6 +6,7 @@ from ..vocab cimport Vocab
 from ..tokens.doc cimport Doc
 from ..structs cimport TokenC
 from ._state cimport StateC
+from ._parser_model cimport WeightsC, ActivationsC, SizesC


 cdef class Parser:
@ -14,8 +15,10 @@ cdef class Parser:
    cdef readonly TransitionSystem moves
    cdef readonly object cfg
    cdef public object _multitasks
-
-    cdef void _parseC(self, StateC** states, int nr_task, 
-            const float* feat_weights, const float* bias,
-            const float* hW, const float* hb,
-            int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil
+    
+    cdef void _parseC(self, StateC** states,
+            WeightsC weights, SizesC sizes) nogil
+ 
+    cdef void c_transition_batch(self, StateC** states, const float* scores,
+            int nr_class, int batch_size) nogil
+ 
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx