diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 1efdc4474..2277e568e 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -51,6 +51,7 @@ from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
 from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
 from .._ml import Residual, drop_layer, flatten
 from .._ml import link_vectors_to_models
+from .._ml import HistoryFeatures
 from ..compat import json_dumps

 from . import _parse_features
@@ -68,7 +69,7 @@ from ..gold cimport GoldParse
 from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG
 from . import _beam_utils

-USE_FINE_TUNE = True
+USE_HISTORY = True

 def get_templates(*args, **kwargs):
     return []
@@ -261,18 +262,35 @@ cdef class Parser:
         with Model.use_device('cpu'):
             if depth == 0:
-                upper = chain()
-                upper.is_noop = True
-            else:
+                hist_size = 8
+                nr_dim = 8
+                if USE_HISTORY:
+                    upper = chain(
+                        HistoryFeatures(nr_class=nr_class, hist_size=hist_size,
+                                        nr_dim=nr_dim),
+                        zero_init(Affine(nr_class, nr_class+hist_size*nr_dim,
+                                         drop_factor=0.0)))
+                    upper.is_noop = False
+                else:
+                    upper = chain()
+                    upper.is_noop = True
+            elif USE_HISTORY:
                 upper = chain(
-                    clone(Maxout(hidden_width), depth-1),
+                    HistoryFeatures(nr_class=nr_class, hist_size=8, nr_dim=8),
+                    Maxout(hidden_width, hidden_width+8*8),
                     zero_init(Affine(nr_class, hidden_width, drop_factor=0.0))
                 )
                 upper.is_noop = False
+            else:
+                upper = chain(
+                    Maxout(hidden_width, hidden_width),
+                    zero_init(Affine(nr_class, hidden_width, drop_factor=0.0))
+                )
+                upper.is_noop = False
+
         # TODO: This is an unfortunate hack atm!
         # Used to set input dimensions in network.
         lower.begin_training(lower.ops.allocate((500, token_vector_width)))
-        upper.begin_training(upper.ops.allocate((500, hidden_width)))

         cfg = {
             'nr_class': nr_class,
             'depth': depth,
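
The `HistoryFeatures` layer imported above lives in `_ml.py` and is not part of this diff. Judging from the wiring, it embeds the last `hist_size` action IDs into `nr_dim` dimensions each and concatenates them onto the incoming state vector, which is why the widened `Maxout` is built with `hidden_width+8*8` inputs. A hypothetical pure-NumPy stand-in (class name and details assumed, not taken from `_ml.py`) would be:

    import numpy

    class HistoryFeaturesSketch(object):
        """Hypothetical stand-in for _ml.HistoryFeatures, for illustration only."""
        def __init__(self, nr_class, hist_size=8, nr_dim=8):
            # One learned nr_dim embedding row per action class.
            self.embed = 0.05 * numpy.random.randn(nr_class, nr_dim)

        def __call__(self, vectors_hists):
            vectors, hists = vectors_hists  # hists: (batch, hist_size) action IDs
            flat = self.embed[hists].reshape((hists.shape[0], -1))
            # Output width grows by hist_size * nr_dim, matching the Maxout above.
            return numpy.hstack((vectors, flat))
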
@@ -428,12 +446,18 @@ cdef class Parser:
                     self._parse_step(next_step[i],
                         feat_weights, nr_class, nr_feat, nr_piece)
             else:
+                hists = []
                 for i in range(nr_step):
                     st = next_step[i]
                     st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
                     self.moves.set_valid(&c_is_valid[i*nr_class], st)
+                    hists.append([st.get_hist(j+1) for j in range(8)])
+                hists = numpy.asarray(hists)
                 vectors = state2vec(token_ids[:next_step.size()])
-                scores = vec2scores(vectors)
+                if USE_HISTORY:
+                    scores = vec2scores((vectors, hists))
+                else:
+                    scores = vec2scores(vectors)
                 c_scores = <float*>scores.data
                 for i in range(nr_step):
                     st = next_step[i]
@@ -441,6 +465,7 @@ cdef class Parser:
                         &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
                     action = self.moves.c[guess]
                     action.do(st, action.label)
+                    st.push_hist(guess)
             this_step, next_step = next_step, this_step
             next_step.clear()
             for st in this_step:
@@ -551,7 +576,11 @@ cdef class Parser:
             if drop != 0:
                 mask = vec2scores.ops.get_dropout_mask(vector.shape, drop)
                 vector *= mask
-            scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
+            hists = numpy.asarray([st.history for st in states], dtype='i')
+            if USE_HISTORY:
+                scores, bp_scores = vec2scores.begin_update((vector, hists), drop=drop)
+            else:
+                scores, bp_scores = vec2scores.begin_update(vector, drop=drop)

             d_scores = self.get_batch_loss(states, golds, scores)
             d_scores /= len(docs)
@@ -570,7 +599,8 @@ cdef class Parser:
             else:
                 backprops.append((token_ids, d_vector, bp_vector))
             self.transition_batch(states, scores)
-            todo = [st for st in todo if not st[0].is_final()]
+            todo = [(st, gold) for (st, gold) in todo
+                    if not st.is_final()]
             if losses is not None:
                 losses[self.name] += (d_scores**2).sum()
             n_steps += 1
@@ -706,12 +736,15 @@ cdef class Parser:
         cdef StateClass state
         cdef int[500] is_valid # TODO: Unhack
         cdef float* c_scores = &scores[0, 0]
+        hists = []
        for state in states:
             self.moves.set_valid(is_valid, state.c)
             guess = arg_max_if_valid(c_scores, is_valid, scores.shape[1])
             action = self.moves.c[guess]
             action.do(state.c, action.label)
             c_scores += scores.shape[1]
+            hists.append(guess)
+        return hists

     def get_batch_loss(self, states, golds, float[:, ::1] scores):
         cdef StateClass state
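
For reference, the `push_hist`/`get_hist` pair used in `parse_batch` is provided by the underlying parser state and is not shown in this diff. The behaviour the calls rely on, where `st.get_hist(j+1)` walks back from the most recently applied action, can be mirrored in pure Python roughly as below; the 1-based indexing and the 0 fallback for short histories are assumptions inferred from the call sites, not taken from the state implementation:

    from collections import deque

    class HistorySketch(object):
        """Hypothetical mirror of the state's action-history ring buffer."""
        def __init__(self, maxlen=8):
            self._hist = deque(maxlen=maxlen)

        def push_hist(self, action):
            # Record the transition chosen for this state.
            self._hist.append(action)

        def get_hist(self, i):
            # i counts back from the most recent action (i=1 is the last one).
            return self._hist[-i] if i <= len(self._hist) else 0
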