diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx
index ee5e9d4c7..ea292752e 100644
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@@ -42,11 +42,17 @@ cdef WeightsC get_c_weights(model) except *:
     cdef precompute_hiddens state2vec = model.state2vec
     output.feat_weights = state2vec.get_feat_weights()
     output.feat_bias = state2vec.bias.data
-    cdef np.ndarray vec2scores_W = model.vec2scores.W
-    cdef np.ndarray vec2scores_b = model.vec2scores.b
+    cdef np.ndarray vec2scores_W
+    cdef np.ndarray vec2scores_b
+    if model.vec2scores is None:
+        output.hidden_weights = NULL
+        output.hidden_bias = NULL
+    else:
+        vec2scores_W = model.vec2scores.W
+        vec2scores_b = model.vec2scores.b
+        output.hidden_weights = vec2scores_W.data
+        output.hidden_bias = vec2scores_b.data
     cdef np.ndarray class_mask = model._class_mask
-    output.hidden_weights = vec2scores_W.data
-    output.hidden_bias = vec2scores_b.data
     output.seen_classes = class_mask.data
     return output
 
@@ -54,7 +60,10 @@ cdef WeightsC get_c_weights(model) except *:
 cdef SizesC get_c_sizes(model, int batch_size) except *:
     cdef SizesC output
     output.states = batch_size
-    output.classes = model.vec2scores.nO
+    if model.vec2scores is None:
+        output.classes = model.state2vec.nO
+    else:
+        output.classes = model.vec2scores.nO
     output.hiddens = model.state2vec.nO
     output.pieces = model.state2vec.nP
     output.feats = model.state2vec.nF
@@ -90,11 +99,12 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
 
 cdef void predict_states(ActivationsC* A, StateC** states,
         const WeightsC* W, SizesC n) nogil:
+    cdef double one = 1.0
     resize_activations(A, n)
-    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
-    memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
     for i in range(n.states):
         states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
+    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
+    memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
     sum_state_features(A.unmaxed,
         W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
     for i in range(n.states):
@@ -105,18 +115,20 @@ cdef void predict_states(ActivationsC* A, StateC** states,
             which = Vec.arg_max(&A.unmaxed[index], n.pieces)
             A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
     memset(A.scores, 0, n.states * n.classes * sizeof(float))
-    cdef double one = 1.0
-    # Compute hidden-to-output
-    blis.cy.gemm(blis.cy.NO_TRANSPOSE, blis.cy.TRANSPOSE,
-        n.states, n.classes, n.hiddens, one,
-        A.hiddens, n.hiddens, 1,
-        W.hidden_weights, n.hiddens, 1,
-        one,
-        A.scores, n.classes, 1)
-    # Add bias
-    for i in range(n.states):
-        VecVec.add_i(&A.scores[i*n.classes],
-            W.hidden_bias, 1., n.classes)
+    if W.hidden_weights == NULL:
+        memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
+    else:
+        # Compute hidden-to-output
+        blis.cy.gemm(blis.cy.NO_TRANSPOSE, blis.cy.TRANSPOSE,
+            n.states, n.classes, n.hiddens, one,
+            A.hiddens, n.hiddens, 1,
+            W.hidden_weights, n.hiddens, 1,
+            one,
+            A.scores, n.classes, 1)
+        # Add bias
+        for i in range(n.states):
+            VecVec.add_i(&A.scores[i*n.classes],
+                W.hidden_bias, 1., n.classes)
     # Set unseen classes to minimum value
     i = 0
     min_ = A.scores[0]
@@ -204,7 +216,9 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
 class ParserModel(Model):
     def __init__(self, tok2vec, lower_model, upper_model, unseen_classes=None):
         Model.__init__(self)
-        self._layers = [tok2vec, lower_model, upper_model]
+        self._layers = [tok2vec, lower_model]
+        if upper_model is not None:
+            self._layers.append(upper_model)
         self.unseen_classes = set()
         if unseen_classes:
             for class_ in unseen_classes:
@@ -219,6 +233,8 @@ class ParserModel(Model):
         return step_model, finish_parser_update
 
     def resize_output(self, new_output):
+        if len(self._layers) == 2:
+            return
         if new_output == self.upper.nO:
             return
         smaller = self.upper
@@ -260,12 +276,24 @@ class ParserModel(Model):
 class ParserStepModel(Model):
     def __init__(self, docs, layers, unseen_classes=None, drop=0.):
         self.tokvecs, self.bp_tokvecs = layers[0].begin_update(docs, drop=drop)
+        if layers[1].nP >= 2:
+            activation = "maxout"
+        elif len(layers) == 2:
+            activation = None
+        else:
+            activation = "relu"
         self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
-                                            drop=drop)
-        self.vec2scores = layers[-1]
+                                            activation=activation, drop=drop)
+        if len(layers) == 3:
+            self.vec2scores = layers[-1]
+        else:
+            self.vec2scores = None
         self.cuda_stream = util.get_cuda_stream(non_blocking=True)
         self.backprops = []
-        self._class_mask = numpy.zeros((self.vec2scores.nO,), dtype='f')
+        if self.vec2scores is None:
+            self._class_mask = numpy.zeros((self.state2vec.nO,), dtype='f')
+        else:
+            self._class_mask = numpy.zeros((self.vec2scores.nO,), dtype='f')
         self._class_mask.fill(1)
         if unseen_classes is not None:
             for class_ in unseen_classes:
@@ -287,10 +315,15 @@ class ParserStepModel(Model):
     def begin_update(self, states, drop=0.):
         token_ids = self.get_token_ids(states)
         vector, get_d_tokvecs = self.state2vec.begin_update(token_ids, drop=0.0)
-        mask = self.vec2scores.ops.get_dropout_mask(vector.shape, drop)
-        if mask is not None:
-            vector *= mask
-        scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop)
+        if self.vec2scores is not None:
+            mask = self.vec2scores.ops.get_dropout_mask(vector.shape, drop)
+            if mask is not None:
+                vector *= mask
+            scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop)
+        else:
+            scores = NumpyOps().asarray(vector)
+            get_d_vector = lambda d_scores, sgd=None: d_scores
+            mask = None
         # If the class is unseen, make sure its score is minimum
         scores[:, self._class_mask == 0] = numpy.nanmin(scores)
 
@@ -327,12 +360,12 @@ class ParserStepModel(Model):
         return ids
 
     def make_updates(self, sgd):
-        # Tells CUDA to block, so our async copies complete.
-        if self.cuda_stream is not None:
-            self.cuda_stream.synchronize()
         # Add a padding vector to the d_tokvecs gradient, so that missing
         # values don't affect the real gradient.
         d_tokvecs = self.ops.allocate((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
+        # Tells CUDA to block, so our async copies complete.
+        if self.cuda_stream is not None:
+            self.cuda_stream.synchronize()
         for ids, d_vector, bp_vector in self.backprops:
             d_state_features = bp_vector((d_vector, ids), sgd=sgd)
             ids = ids.flatten()
@@ -370,9 +403,10 @@ cdef class precompute_hiddens:
     cdef np.ndarray bias
     cdef object _cuda_stream
    cdef object _bp_hiddens
+    cdef object activation
 
     def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
-                 drop=0.):
+                 activation="maxout", drop=0.):
         gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop)
         cdef np.ndarray cached
         if not isinstance(gpu_cached, numpy.ndarray):
@@ -390,6 +424,8 @@ cdef class precompute_hiddens:
         self.nP = getattr(lower_model, 'nP', 1)
         self.nO = cached.shape[2]
         self.ops = lower_model.ops
+        assert activation in (None, "relu", "maxout")
+        self.activation = activation
         self._is_synchronized = False
         self._cuda_stream = cuda_stream
         self._cached = cached
@@ -402,7 +438,7 @@ cdef class precompute_hiddens:
         return self._cached.data
 
     def __call__(self, X):
-        return self.begin_update(X)[0]
+        return self.begin_update(X, drop=None)[0]
 
     def begin_update(self, token_ids, drop=0.):
         cdef np.ndarray state_vector = numpy.zeros(
@@ -435,28 +471,31 @@ cdef class precompute_hiddens:
         else:
             ops = CupyOps()
 
-        if self.nP == 1:
-            state_vector = state_vector.reshape(state_vector.shape[:-1])
-            mask = state_vector >= 0.
-            state_vector *= mask
-        else:
+        if self.activation == "maxout":
             state_vector, mask = ops.maxout(state_vector)
+        else:
+            state_vector = state_vector.reshape(state_vector.shape[:-1])
+            if self.activation == "relu":
+                mask = state_vector >= 0.
+                state_vector *= mask
 
         def backprop_nonlinearity(d_best, sgd=None):
             if isinstance(d_best, numpy.ndarray):
                 ops = NumpyOps()
             else:
                 ops = CupyOps()
-            mask_ = ops.asarray(mask)
-            # This will usually be on GPU
             d_best = ops.asarray(d_best)
             # Fix nans (which can occur from unseen classes.)
             d_best[ops.xp.isnan(d_best)] = 0.
-            if self.nP == 1:
+            if self.activation == "maxout":
+                mask_ = ops.asarray(mask)
+                return ops.backprop_maxout(d_best, mask_, self.nP)
+            elif self.activation == "relu":
+                mask_ = ops.asarray(mask)
                 d_best *= mask_
                 d_best = d_best.reshape((d_best.shape + (1,)))
                 return d_best
             else:
-                return ops.backprop_maxout(d_best, mask_, self.nP)
+                return d_best.reshape((d_best.shape + (1,)))
         return state_vector, backprop_nonlinearity
 
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 3d0fb63cb..a3041c945 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -22,7 +22,7 @@ from thinc.extra.search cimport Beam
 from thinc.api import chain, clone
 from thinc.v2v import Model, Maxout, Affine
 from thinc.misc import LayerNorm
-from thinc.neural.ops import CupyOps
+from thinc.neural.ops import NumpyOps, CupyOps
 from thinc.neural.util import get_array_module
 from thinc.linalg cimport Vec, VecVec
 import srsly
@@ -56,19 +56,24 @@ cdef class Parser:
         subword_features = util.env_opt('subword_features',
                                         cfg.get('subword_features', True))
         conv_depth = util.env_opt('conv_depth', cfg.get('conv_depth', 4))
+        conv_window = util.env_opt('conv_window', cfg.get('conv_window', 1))
         bilstm_depth = util.env_opt('bilstm_depth', cfg.get('bilstm_depth', 0))
         self_attn_depth = util.env_opt('self_attn_depth', cfg.get('self_attn_depth', 0))
-        if depth != 1:
+        if depth not in (0, 1):
             raise ValueError(TempErrors.T004.format(value=depth))
         parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
                                             cfg.get('maxout_pieces', 2))
         token_vector_width = util.env_opt('token_vector_width',
                                           cfg.get('token_vector_width', 96))
         hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 64))
+        if depth == 0:
+            hidden_width = nr_class
+            parser_maxout_pieces = 1
         embed_size = util.env_opt('embed_size', cfg.get('embed_size', 2000))
         pretrained_vectors = cfg.get('pretrained_vectors', None)
         tok2vec = Tok2Vec(token_vector_width, embed_size,
                           conv_depth=conv_depth,
+                          conv_window=conv_window,
                           subword_features=subword_features,
                           pretrained_vectors=pretrained_vectors,
                           bilstm_depth=bilstm_depth,
@@ -79,10 +84,12 @@ cdef class Parser:
                                     nF=cls.nr_feature, nI=token_vector_width,
                                     nP=parser_maxout_pieces)
         lower.nP = parser_maxout_pieces
-
-        with Model.use_device('cpu'):
-            upper = Affine(nr_class, hidden_width, drop_factor=0.0)
-            upper.W *= 0
+        if depth == 1:
+            with Model.use_device('cpu'):
+                upper = Affine(nr_class, hidden_width, drop_factor=0.0)
+                upper.W *= 0
+        else:
+            upper = None
 
         cfg = {
             'nr_class': nr_class,
@@ -94,6 +101,7 @@ cdef class Parser:
             'bilstm_depth': bilstm_depth,
             'self_attn_depth': self_attn_depth,
             'conv_depth': conv_depth,
+            'conv_window': conv_window,
             'embed_size': embed_size
         }
         return ParserModel(tok2vec, lower, upper), cfg
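
Note (not part of the patch): after this change `predict_states` has two scoring paths, and the sketch below is only a NumPy illustration of them; the names `predict_scores`, `hiddens`, `hidden_weights`, and `hidden_bias` are made up for the example, not spaCy API. With an upper layer present, scores come from the hidden-to-output `blis.cy.gemm` call plus the bias add; with `hidden_weights == NULL` (the new depth-0 configuration, where the lower layer's output width is set to the number of classes), the hidden vector is copied straight into the scores.

```python
import numpy as np

def predict_scores(hiddens, hidden_weights=None, hidden_bias=None):
    """Illustrative mirror of the two branches in predict_states.

    hiddens        -- (n_states, n_hiddens) array of maxed hidden values
    hidden_weights -- (n_classes, n_hiddens) upper-layer weights, or None
    hidden_bias    -- (n_classes,) upper-layer bias, or None
    """
    if hidden_weights is None:
        # Depth-0 path: n_hiddens == n_classes, so the hidden values are the
        # scores (this is what the memcpy into A.scores does).
        return hiddens.copy()
    # Depth-1 path: hidden-to-output affine layer, matching the gemm call
    # followed by the per-state bias add.
    return hiddens @ hidden_weights.T + hidden_bias
```

The same split appears on the Python side in `ParserStepModel.begin_update`, where `vec2scores is None` makes the state vector itself the score matrix.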