From f46b22879b4a9eeea07fd369ea84e5b5938a7894 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 21 Oct 2018 16:15:53 +0200 Subject: [PATCH] Tmp: Committing work that was sitting around --- spacy/syntax/_parse_features.pxd | 259 +++++++++++++++++++ spacy/syntax/_parse_features.pyx | 419 +++++++++++++++++++++++++++++++ spacy/syntax/nn_parser.pyx | 240 ++---------------- 3 files changed, 699 insertions(+), 219 deletions(-) create mode 100644 spacy/syntax/_parse_features.pxd create mode 100644 spacy/syntax/_parse_features.pyx diff --git a/spacy/syntax/_parse_features.pxd b/spacy/syntax/_parse_features.pxd new file mode 100644 index 000000000..0842e3504 --- /dev/null +++ b/spacy/syntax/_parse_features.pxd @@ -0,0 +1,259 @@ +from thinc.typedefs cimport atom_t + +from .stateclass cimport StateClass +from ._state cimport StateC + + +cdef int fill_context(atom_t* context, const StateC* state) nogil +# Context elements + +# Ensure each token's attributes are listed: w, p, c, c6, c4. The order +# is referenced by incrementing the enum... + +# Tokens are listed in left-to-right order. +#cdef size_t* SLOTS = [ +# S2w, S1w, +# S0l0w, S0l2w, S0lw, +# S0w, +# S0r0w, S0r2w, S0rw, +# N0l0w, N0l2w, N0lw, +# P2w, P1w, +# N0w, N1w, N2w, N3w, 0 +#] + +# NB: The order of the enum is _NOT_ arbitrary!! 
+cpdef enum: + S2w + S2W + S2p + S2c + S2c4 + S2c6 + S2L + S2_prefix + S2_suffix + S2_shape + S2_ne_iob + S2_ne_type + + S1w + S1W + S1p + S1c + S1c4 + S1c6 + S1L + S1_prefix + S1_suffix + S1_shape + S1_ne_iob + S1_ne_type + + S1rw + S1rW + S1rp + S1rc + S1rc4 + S1rc6 + S1rL + S1r_prefix + S1r_suffix + S1r_shape + S1r_ne_iob + S1r_ne_type + + S0lw + S0lW + S0lp + S0lc + S0lc4 + S0lc6 + S0lL + S0l_prefix + S0l_suffix + S0l_shape + S0l_ne_iob + S0l_ne_type + + S0l2w + S0l2W + S0l2p + S0l2c + S0l2c4 + S0l2c6 + S0l2L + S0l2_prefix + S0l2_suffix + S0l2_shape + S0l2_ne_iob + S0l2_ne_type + + S0w + S0W + S0p + S0c + S0c4 + S0c6 + S0L + S0_prefix + S0_suffix + S0_shape + S0_ne_iob + S0_ne_type + + S0r2w + S0r2W + S0r2p + S0r2c + S0r2c4 + S0r2c6 + S0r2L + S0r2_prefix + S0r2_suffix + S0r2_shape + S0r2_ne_iob + S0r2_ne_type + + S0rw + S0rW + S0rp + S0rc + S0rc4 + S0rc6 + S0rL + S0r_prefix + S0r_suffix + S0r_shape + S0r_ne_iob + S0r_ne_type + + N0l2w + N0l2W + N0l2p + N0l2c + N0l2c4 + N0l2c6 + N0l2L + N0l2_prefix + N0l2_suffix + N0l2_shape + N0l2_ne_iob + N0l2_ne_type + + N0lw + N0lW + N0lp + N0lc + N0lc4 + N0lc6 + N0lL + N0l_prefix + N0l_suffix + N0l_shape + N0l_ne_iob + N0l_ne_type + + N0w + N0W + N0p + N0c + N0c4 + N0c6 + N0L + N0_prefix + N0_suffix + N0_shape + N0_ne_iob + N0_ne_type + + N1w + N1W + N1p + N1c + N1c4 + N1c6 + N1L + N1_prefix + N1_suffix + N1_shape + N1_ne_iob + N1_ne_type + + N2w + N2W + N2p + N2c + N2c4 + N2c6 + N2L + N2_prefix + N2_suffix + N2_shape + N2_ne_iob + N2_ne_type + + P1w + P1W + P1p + P1c + P1c4 + P1c6 + P1L + P1_prefix + P1_suffix + P1_shape + P1_ne_iob + P1_ne_type + + P2w + P2W + P2p + P2c + P2c4 + P2c6 + P2L + P2_prefix + P2_suffix + P2_shape + P2_ne_iob + P2_ne_type + + E0w + E0W + E0p + E0c + E0c4 + E0c6 + E0L + E0_prefix + E0_suffix + E0_shape + E0_ne_iob + E0_ne_type + + E1w + E1W + E1p + E1c + E1c4 + E1c6 + E1L + E1_prefix + E1_suffix + E1_shape + E1_ne_iob + E1_ne_type + + # Misc features at the end + dist + N0lv + S0lv + S0rv + S1lv 
+ S1rv + + S0_has_head + S1_has_head + S2_has_head + + CONTEXT_SIZE diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx new file mode 100644 index 000000000..2e0db4877 --- /dev/null +++ b/spacy/syntax/_parse_features.pyx @@ -0,0 +1,419 @@ +""" +Fill an array, context, with every _atomic_ value our features reference. +We then write the _actual features_ as tuples of the atoms. The machinery +that translates from the tuples to feature-extractors (which pick the values +out of "context") is in features/extractor.pyx + +The atomic feature names are listed in a big enum, so that the feature tuples +can refer to them. +""" +# coding: utf-8 +from __future__ import unicode_literals + +from libc.string cimport memset +from itertools import combinations +from cymem.cymem cimport Pool + +from ..structs cimport TokenC +from .stateclass cimport StateClass +from ._state cimport StateC + + +cdef inline void fill_token(atom_t* context, const TokenC* token) nogil: + if token is NULL: + context[0] = 0 + context[1] = 0 + context[2] = 0 + context[3] = 0 + context[4] = 0 + context[5] = 0 + context[6] = 0 + context[7] = 0 + context[8] = 0 + context[9] = 0 + context[10] = 0 + context[11] = 0 + else: + context[0] = token.lex.orth + context[1] = token.lemma + context[2] = token.tag + context[3] = token.lex.cluster + # We've read in the string little-endian, so now we can take & (2**n)-1 + # to get the first n bits of the cluster. + # e.g. s = "1110010101" + # s = ''.join(reversed(s)) + # first_4_bits = int(s, 2) + # print first_4_bits + # 5 + # print "{0:b}".format(prefix).ljust(4, '0') + # 1110 + # What we're doing here is picking a number where all bits are 1, e.g. + # 15 is 1111, 63 is 111111 and doing bitwise AND, so getting all bits in + # the source that are set to 1. 
+ context[4] = token.lex.cluster & 15 + context[5] = token.lex.cluster & 63 + context[6] = token.dep if token.head != 0 else 0 + context[7] = token.lex.prefix + context[8] = token.lex.suffix + context[9] = token.lex.shape + context[10] = token.ent_iob + context[11] = token.ent_type + +cdef int fill_context(atom_t* ctxt, const StateC* st) nogil: + # Take care to fill every element of context! + # We could memset, but this makes it very easy to have broken features that + # make almost no impact on accuracy. If instead they're unset, the impact + # tends to be dramatic, so we get an obvious regression to fix... + fill_token(&ctxt[S2w], st.S_(2)) + fill_token(&ctxt[S1w], st.S_(1)) + fill_token(&ctxt[S1rw], st.R_(st.S(1), 1)) + fill_token(&ctxt[S0lw], st.L_(st.S(0), 1)) + fill_token(&ctxt[S0l2w], st.L_(st.S(0), 2)) + fill_token(&ctxt[S0w], st.S_(0)) + fill_token(&ctxt[S0r2w], st.R_(st.S(0), 2)) + fill_token(&ctxt[S0rw], st.R_(st.S(0), 1)) + fill_token(&ctxt[N0lw], st.L_(st.B(0), 1)) + fill_token(&ctxt[N0l2w], st.L_(st.B(0), 2)) + fill_token(&ctxt[N0w], st.B_(0)) + fill_token(&ctxt[N1w], st.B_(1)) + fill_token(&ctxt[N2w], st.B_(2)) + fill_token(&ctxt[P1w], st.safe_get(st.B(0)-1)) + fill_token(&ctxt[P2w], st.safe_get(st.B(0)-2)) + + fill_token(&ctxt[E0w], st.E_(0)) + fill_token(&ctxt[E1w], st.E_(1)) + + if st.stack_depth() >= 1 and not st.eol(): + ctxt[dist] = min_(st.B(0) - st.E(0), 5) + else: + ctxt[dist] = 0 + ctxt[N0lv] = min_(st.n_L(st.B(0)), 5) + ctxt[S0lv] = min_(st.n_L(st.S(0)), 5) + ctxt[S0rv] = min_(st.n_R(st.S(0)), 5) + ctxt[S1lv] = min_(st.n_L(st.S(1)), 5) + ctxt[S1rv] = min_(st.n_R(st.S(1)), 5) + + ctxt[S0_has_head] = 0 + ctxt[S1_has_head] = 0 + ctxt[S2_has_head] = 0 + if st.stack_depth() >= 1: + ctxt[S0_has_head] = st.has_head(st.S(0)) + 1 + if st.stack_depth() >= 2: + ctxt[S1_has_head] = st.has_head(st.S(1)) + 1 + if st.stack_depth() >= 3: + ctxt[S2_has_head] = st.has_head(st.S(2)) + 1 + + +cdef inline int min_(int a, int b) nogil: + return a if a < b else 
b + + +ner = ( + (N0W,), + (P1W,), + (N1W,), + (P2W,), + (N2W,), + + (P1W, N0W,), + (N0W, N1W), + + (N0_prefix,), + (N0_suffix,), + + (P1_shape,), + (N0_shape,), + (N1_shape,), + (P1_shape, N0_shape,), + (N0_shape, P1_shape,), + (P1_shape, N0_shape, N1_shape), + (N2_shape,), + (P2_shape,), + + #(P2_norm, P1_norm, W_norm), + #(P1_norm, W_norm, N1_norm), + #(W_norm, N1_norm, N2_norm) + + (P2p,), + (P1p,), + (N0p,), + (N1p,), + (N2p,), + + (P1p, N0p), + (N0p, N1p), + (P2p, P1p, N0p), + (P1p, N0p, N1p), + (N0p, N1p, N2p), + + (P2c,), + (P1c,), + (N0c,), + (N1c,), + (N2c,), + + (P1c, N0c), + (N0c, N1c), + + (E0W,), + (E0c,), + (E0p,), + + (E0W, N0W), + (E0c, N0W), + (E0p, N0W), + + (E0p, P1p, N0p), + (E0c, P1c, N0c), + + (E0w, P1c), + (E0p, P1p), + (E0c, P1c), + (E0p, E1p), + (E0c, P1p), + + (E1W,), + (E1c,), + (E1p,), + + (E0W, E1W), + (E0W, E1p,), + (E0p, E1W,), + (E0p, E1W), + + (P1_ne_iob,), + (P1_ne_iob, P1_ne_type), + (N0w, P1_ne_iob, P1_ne_type), + + (N0_shape,), + (N1_shape,), + (N2_shape,), + (P1_shape,), + (P2_shape,), + + (N0_prefix,), + (N0_suffix,), + + (P1_ne_iob,), + (P2_ne_iob,), + (P1_ne_iob, P2_ne_iob), + (P1_ne_iob, P1_ne_type), + (P2_ne_iob, P2_ne_type), + (N0w, P1_ne_iob, P1_ne_type), + + (N0w, N1w), +) + + +unigrams = ( + (S2W, S2p), + (S2c6, S2p), + + (S1W, S1p), + (S1c6, S1p), + + (S0W, S0p), + (S0c6, S0p), + + (N0W, N0p), + (N0p,), + (N0c,), + (N0c6, N0p), + (N0L,), + + (N1W, N1p), + (N1c6, N1p), + + (N2W, N2p), + (N2c6, N2p), + + (S0r2W, S0r2p), + (S0r2c6, S0r2p), + (S0r2L,), + + (S0rW, S0rp), + (S0rc6, S0rp), + (S0rL,), + + (S0l2W, S0l2p), + (S0l2c6, S0l2p), + (S0l2L,), + + (S0lW, S0lp), + (S0lc6, S0lp), + (S0lL,), + + (N0l2W, N0l2p), + (N0l2c6, N0l2p), + (N0l2L,), + + (N0lW, N0lp), + (N0lc6, N0lp), + (N0lL,), +) + + +s0_n0 = ( + (S0W, S0p, N0W, N0p), + (S0c, S0p, N0c, N0p), + (S0c6, S0p, N0c6, N0p), + (S0c4, S0p, N0c4, N0p), + (S0p, N0p), + (S0W, N0p), + (S0p, N0W), + (S0W, N0c), + (S0c, N0W), + (S0p, N0c), + (S0c, N0p), + (S0W, S0rp, N0p), + 
(S0p, S0rp, N0p), + (S0p, N0lp, N0W), + (S0p, N0lp, N0p), + (S0L, N0p), + (S0p, S0rL, N0p), + (S0p, N0lL, N0p), + (S0p, S0rv, N0p), + (S0p, N0lv, N0p), + (S0c6, S0rL, S0r2L, N0p), + (S0p, N0lL, N0l2L, N0p), +) + + +s1_s0 = ( + (S1p, S0p), + (S1p, S0p, S0_has_head), + (S1W, S0p), + (S1W, S0p, S0_has_head), + (S1c, S0p), + (S1c, S0p, S0_has_head), + (S1p, S1rL, S0p), + (S1p, S1rL, S0p, S0_has_head), + (S1p, S0lL, S0p), + (S1p, S0lL, S0p, S0_has_head), + (S1p, S0lL, S0l2L, S0p), + (S1p, S0lL, S0l2L, S0p, S0_has_head), + (S1L, S0L, S0W), + (S1L, S0L, S0p), + (S1p, S1L, S0L, S0p), + (S1p, S0p), +) + + +s1_n0 = ( + (S1p, N0p), + (S1c, N0c), + (S1c, N0p), + (S1p, N0c), + (S1W, S1p, N0p), + (S1p, N0W, N0p), + (S1c6, S1p, N0c6, N0p), + (S1L, N0p), + (S1p, S1rL, N0p), + (S1p, S1rp, N0p), +) + + +s0_n1 = ( + (S0p, N1p), + (S0c, N1c), + (S0c, N1p), + (S0p, N1c), + (S0W, S0p, N1p), + (S0p, N1W, N1p), + (S0c6, S0p, N1c6, N1p), + (S0L, N1p), + (S0p, S0rL, N1p), +) + + +n0_n1 = ( + (N0W, N0p, N1W, N1p), + (N0W, N0p, N1p), + (N0p, N1W, N1p), + (N0c, N0p, N1c, N1p), + (N0c6, N0p, N1c6, N1p), + (N0c, N1c), + (N0p, N1c), +) + +tree_shape = ( + (dist,), + (S0p, S0_has_head, S1_has_head, S2_has_head), + (S0p, S0lv, S0rv), + (N0p, N0lv), +) + +trigrams = ( + (N0p, N1p, N2p), + (S0p, S0lp, S0l2p), + (S0p, S0rp, S0r2p), + (S0p, S1p, S2p), + (S1p, S0p, N0p), + (S0p, S0lp, N0p), + (S0p, N0p, N0lp), + (N0p, N0lp, N0l2p), + + (S0W, S0p, S0rL, S0r2L), + (S0p, S0rL, S0r2L), + + (S0W, S0p, S0lL, S0l2L), + (S0p, S0lL, S0l2L), + + (N0W, N0p, N0lL, N0l2L), + (N0p, N0lL, N0l2L), +) + + +words = ( + S2w, + S1w, + S1rw, + S0lw, + S0l2w, + S0w, + S0r2w, + S0rw, + N0lw, + N0l2w, + N0w, + N1w, + N2w, + P1w, + P2w +) + +tags = ( + S2p, + S1p, + S1rp, + S0lp, + S0l2p, + S0p, + S0r2p, + S0rp, + N0lp, + N0l2p, + N0p, + N1p, + N2p, + P1p, + P2p +) + +labels = ( + S2L, + S1L, + S1rL, + S0lL, + S0l2L, + S0L, + S0r2L, + S0rL, + N0lL, + N0l2L, + N0L, + N1L, + N2L, + P1L, + P2L +) diff --git 
a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index bb01cecf1..c3abadee8 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -41,188 +41,6 @@ from .transition_system cimport Transition from . import _beam_utils, nonproj -def get_templates(*args, **kwargs): - return [] - - -DEBUG = False - - -def set_debug(val): - global DEBUG - DEBUG = val - - -cdef class precompute_hiddens: - """Allow a model to be "primed" by pre-computing input features in bulk. - - This is used for the parser, where we want to take a batch of documents, - and compute vectors for each (token, position) pair. These vectors can then - be reused, especially for beam-search. - - Let's say we're using 12 features for each state, e.g. word at start of - buffer, three words on stack, their children, etc. In the normal arc-eager - system, a document of length N is processed in 2*N states. This means we'll - create 2*N*12 feature vectors --- but if we pre-compute, we only need - N*12 vector computations. The saving for beam-search is much better: - if we have a beam of k, we'll normally make 2*N*12*K computations -- - so we can save the factor k. This also gives a nice CPU/GPU division: - we can do all our hard maths up front, packed into large multiplications, - and do the hard-to-program parsing on the CPU. - """ - cdef int nF, nO, nP - cdef bint _is_synchronized - cdef public object ops - cdef np.ndarray _features - cdef np.ndarray _cached - cdef np.ndarray bias - cdef object _cuda_stream - cdef object _bp_hiddens - - def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None, - drop=0.): - gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop) - cdef np.ndarray cached - if not isinstance(gpu_cached, numpy.ndarray): - # Note the passing of cuda_stream here: it lets - # cupy make the copy asynchronously. - # We then have to block before first use. 
- cached = gpu_cached.get(stream=cuda_stream) - else: - cached = gpu_cached - if not isinstance(lower_model.b, numpy.ndarray): - self.bias = lower_model.b.get() - else: - self.bias = lower_model.b - self.nF = cached.shape[1] - self.nP = getattr(lower_model, 'nP', 1) - self.nO = cached.shape[2] - self.ops = lower_model.ops - self._is_synchronized = False - self._cuda_stream = cuda_stream - self._cached = cached - self._bp_hiddens = bp_features - - cdef const float* get_feat_weights(self) except NULL: - if not self._is_synchronized and self._cuda_stream is not None: - self._cuda_stream.synchronize() - self._is_synchronized = True - return self._cached.data - - def __call__(self, X): - return self.begin_update(X)[0] - - def begin_update(self, token_ids, drop=0.): - cdef np.ndarray state_vector = numpy.zeros( - (token_ids.shape[0], self.nO, self.nP), dtype='f') - # This is tricky, but (assuming GPU available); - # - Input to forward on CPU - # - Output from forward on CPU - # - Input to backward on GPU! - # - Output from backward on GPU - bp_hiddens = self._bp_hiddens - - feat_weights = self.get_feat_weights() - cdef int[:, ::1] ids = token_ids - sum_state_features(state_vector.data, - feat_weights, &ids[0,0], - token_ids.shape[0], self.nF, self.nO*self.nP) - state_vector += self.bias - state_vector, bp_nonlinearity = self._nonlinearity(state_vector) - - def backward(d_state_vector_ids, sgd=None): - d_state_vector, token_ids = d_state_vector_ids - d_state_vector = bp_nonlinearity(d_state_vector, sgd) - # This will usually be on GPU - if not isinstance(d_state_vector, self.ops.xp.ndarray): - d_state_vector = self.ops.xp.array(d_state_vector) - d_tokens = bp_hiddens((d_state_vector, token_ids), sgd) - return d_tokens - return state_vector, backward - - def _nonlinearity(self, state_vector): - if self.nP == 1: - state_vector = state_vector.reshape(state_vector.shape[:-1]) - mask = state_vector >= 0. 
- state_vector *= mask - else: - state_vector, mask = self.ops.maxout(state_vector) - - def backprop_nonlinearity(d_best, sgd=None): - if self.nP == 1: - d_best *= mask - d_best = d_best.reshape((d_best.shape + (1,))) - return d_best - else: - return self.ops.backprop_maxout(d_best, mask, self.nP) - return state_vector, backprop_nonlinearity - - -cdef void sum_state_features(float* output, - const float* cached, const int* token_ids, int B, int F, int O) nogil: - cdef int idx, b, f, i - cdef const float* feature - padding = cached - cached += F * O - for b in range(B): - for f in range(F): - if token_ids[f] < 0: - feature = &padding[f*O] - else: - idx = token_ids[f] * F * O + f*O - feature = &cached[idx] - for i in range(O): - output[i] += feature[i] - output += O - token_ids += F - - -cdef void cpu_log_loss(float* d_scores, - const float* costs, const int* is_valid, const float* scores, - int O) nogil: - """Do multi-label log loss""" - cdef double max_, gmax, Z, gZ - best = arg_max_if_gold(scores, costs, is_valid, O) - guess = arg_max_if_valid(scores, is_valid, O) - Z = 1e-10 - gZ = 1e-10 - max_ = scores[guess] - gmax = scores[best] - for i in range(O): - if is_valid[i]: - Z += exp(scores[i] - max_) - if costs[i] <= costs[best]: - gZ += exp(scores[i] - gmax) - for i in range(O): - if not is_valid[i]: - d_scores[i] = 0. - elif costs[i] <= costs[best]: - d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ) - else: - d_scores[i] = exp(scores[i]-max_) / Z - - -cdef void cpu_regression_loss(float* d_scores, - const float* costs, const int* is_valid, const float* scores, - int O) nogil: - cdef float eps = 2. - best = arg_max_if_gold(scores, costs, is_valid, O) - for i in range(O): - if not is_valid[i]: - d_scores[i] = 0. - elif scores[i] < scores[best]: - d_scores[i] = 0. - else: - # I doubt this is correct? 
- # Looking for something like Huber loss - diff = scores[i] - -costs[i] - if diff > eps: - d_scores[i] = eps - elif diff < -eps: - d_scores[i] = -eps - else: - d_scores[i] = diff - def _collect_states(beams): cdef StateClass state @@ -545,25 +363,26 @@ cdef class Parser: def update(self, docs, golds, drop=0., sgd=None, losses=None): if not any(self.moves.has_gold(gold) for gold in golds): return None - if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.0: - return self.update_beam(docs, golds, - self.cfg['beam_width'], self.cfg['beam_density'], - drop=drop, sgd=sgd, losses=losses) if losses is not None and self.name not in losses: losses[self.name] = 0. if isinstance(docs, Doc) and isinstance(golds, GoldParse): docs = [docs] golds = [golds] - cuda_stream = util.get_cuda_stream() - states, golds, max_steps = self._init_gold_batch(docs, golds) - (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, - drop) - todo = [(s, g) for (s, g) in zip(states, golds) - if not s.is_final() and g is not None] - if not todo: - return None + if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.0: + return self.update_beam(docs, golds, + self.cfg['beam_width'], self.cfg['beam_density'], + drop=drop, sgd=sgd, losses=losses) + else: + return self.update_greedy(docs, golds, drop=drop, sgd=sgd, losses=losses) + + def update_greedy(self, docs, golds, drop=0., sgd=None, losses=None): + tokvecs, bp_tokvecs = self.model.tok2vec(docs) + states = self.init_states(docs, tokvecs) + histories, get_costs = self.model.predict_histories(states) + costs = get_costs(golds) + d_tokens = self.model.update(states, histories, costs) + return bp_tokvecs(d_tokens) - backprops = [] # Add a padding vector to the d_tokvecs gradient, so that missing # values don't affect the real gradient. 
d_tokvecs = state2vec.ops.allocate((tokvecs.shape[0]+1, tokvecs.shape[1])) @@ -571,32 +390,11 @@ cdef class Parser: n_steps = 0 while todo: states, golds = zip(*todo) - token_ids = self.get_token_ids(states) - vector, bp_vector = state2vec.begin_update(token_ids, drop=0.0) - if drop != 0: - mask = vec2scores.ops.get_dropout_mask(vector.shape, drop) - vector *= mask - hists = numpy.asarray([st.history for st in states], dtype='i') - if self.cfg.get('hist_size', 0): - scores, bp_scores = vec2scores.begin_update((vector, hists), drop=drop) - else: - scores, bp_scores = vec2scores.begin_update(vector, drop=drop) - + vector, bp_vector = state2vec.begin_update(states, drop=0.0) + scores, bp_scores = vec2scores.begin_update(vector, drop=drop) d_scores = self.get_batch_loss(states, golds, scores) - d_scores /= len(docs) d_vector = bp_scores(d_scores, sgd=sgd) - if drop != 0: - d_vector *= mask - if isinstance(self.model[0].ops, CupyOps) \ - and not isinstance(token_ids, state2vec.ops.xp.ndarray): - # Move token_ids and d_vector to GPU, asynchronously - backprops.append(( - util.get_async(cuda_stream, token_ids), - util.get_async(cuda_stream, d_vector), - bp_vector - )) - else: backprops.append((token_ids, d_vector, bp_vector)) self.transition_batch(states, scores) todo = [(st, gold) for (st, gold) in todo @@ -658,7 +456,6 @@ cdef class Parser: for beam in beams: _cleanup(beam) - def _init_gold_batch(self, whole_docs, whole_golds): """Make a square batch, of length equal to the shortest doc. A long doc will get multiple states. Let's say we have a doc of length 2*N, @@ -719,6 +516,11 @@ cdef class Parser: names.append(name) return names + @property + def labels(self): + return [label.split('-')[1] for label in self.move_names + if '-' in label] + def get_batch_model(self, docs, stream, dropout): tok2vec, lower, upper = self.model tokvecs, bp_tokvecs = tok2vec.begin_update(docs, drop=dropout)