From ef4fa594aa5f6710171a04d69bbde930134df537 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 5 May 2017 19:20:39 +0200 Subject: [PATCH] Draft of NN parser, to be tested --- spacy/_ml.py | 129 ++++++++++++++----------------- spacy/syntax/parser.pyx | 165 +++++++++++++++++----------------------- 2 files changed, 124 insertions(+), 170 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 4c1a190b4..da624c8d6 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -4,22 +4,6 @@ from thinc.neural._classes.static_vectors import StaticVectors from thinc.neural._classes.hash_embed import HashEmbed from thinc.neural._classes.convolution import ExtractWindow -from .attrs import ID, PREFIX, SUFFIX, SHAPE, TAG, DEP - - -@layerize -def get_contexts(states, drop=0.): - ops = Model.ops - context = ops.allocate((len(states), 7), dtype='uint64') - for i, state in enumerate(states): - context[i, 0] = state.B(0) - context[i, 1] = state.S(0) - context[i, 2] = state.S(1) - context[i, 3] = state.L(state.S(0), 1) - context[i, 4] = state.L(state.S(0), 2) - context[i, 5] = state.R(state.S(0), 1) - context[i, 6] = state.R(state.S(0), 2) - return (context, states), None def get_col(idx): def forward(X, drop=0.): @@ -27,69 +11,68 @@ def get_col(idx): return layerize(forward) -def extract_features(attrs): - ops = Model.ops - def forward(contexts_states, drop=0.): - contexts, states = contexts_states - output = ops.allocate((len(states), contexts.shape[1], len(attrs)), - dtype='uint64') +def build_model(state2vec, width, depth, nr_class): + with Model.define_operators({'>>': chain, '**': clone}): + model = state2vec >> Maxout(width) ** depth >> Softmax(nr_class) + return model + + +def build_parser_state2vec(tag_vectors, dep_vectors, **cfg): + embed_tags = _reshape(chain(get_col(0), tag_vectors)) + embed_deps = _reshape(chain(get_col(1), dep_vectors)) + attr_names = ops.asarray([TAG, DEP], dtype='i') + def forward(states, drop=0.): + n_tokens = state.nr_context_tokens(nF, nB, nS, nL, nR) for i, state in enumerate(states): - for j, tok_i in enumerate(contexts[i]): - token = state.get_token(tok_i) - for k, attr in enumerate(attrs): - output[i, j, k] = getattr(token, attr) - return output, None - return layerize(forward) + state.set_context_tokens(tokens[i], nF, nB, nS, nL, nR) + state.set_attributes(features[i], tokens[i], attr_names) + state.set_token_vectors(token_vectors[i], tokens[i]) + + tagvecs, bp_tag_vecs = embed_deps.begin_update(attr_vals, drop=drop) + depvecs, bp_dep_vecs = embed_tags.begin_update(attr_vals, drop=drop) + vector = ops.concatenate((tagvecs, depvecs, tokvecs)) -def build_tok2vec(lang, width, depth, embed_size): - cols = [ID, PREFIX, SUFFIX, SHAPE] - - with Model.define_operators({'>>': chain, '|': concatenate, '**': clone}): - static = get_col(cols.index(ID)) >> StaticVectors('en', width) - prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size) - suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size) - shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size) - tok2vec = ( - extract_features(cols) - >> (static | prefix | suffix | shape) - >> (ExtractWindow(nW=1) >> Maxout(width)) ** depth - ) - return tok2vec - - -def build_parse2vec(width, embed_size): - cols = [TAG, DEP] - with Model.define_operators({'>>': chain, '|': concatenate}): - tag_vector = get_col(cols.index(TAG)) >> HashEmbed(width, 1000) - dep_vector = get_col(cols.index(DEP)) >> HashEmbed(width, 1000) - model = ( - extract_features([TAG, DEP]) - >> (tag_vector | dep_vector) - ) - return model - - -def build_model(state2context, tok2vec, parse2vec, width, depth, nr_class): - with Model.define_operators({'>>': chain, '**': clone, '|': concatenate}): - model = ( - state2context - >> (tok2vec | parse2vec) - >> Maxout(width) ** depth - >> Softmax(nr_class) - ) + shapes = (tagvecs.shape, depvecs.shape, tokvecs.shape) + def backward(d_vector, sgd=None): + d_depvecs, d_tagvecs, d_tokvecs = ops.backprop_concatenate(d_vector, shapes) + bp_tagvecs(d_tagvecs) + bp_depvecs(d_depvecs) + return (d_tokvecs, tokens) + return vector, backward + model = layerize(forward) + model._layers = [embed_tags, embed_deps] return model -def test_build_model(width=100, depth=2, nr_class=10): - model = build_model( - get_contexts, - build_tok2vec('en', width=100, depth=2, embed_size=1000), - build_parse2vec(width=100, embed_size=1000), - width, - depth, - nr_class) - assert model is not None +def _reshape(layer): + def forward(X, drop=0.): + Xh = X.reshape((X.shape[0] * X.shape[1], X.shape[2])) + yh, bp_yh = layer.begin_update(Xh, drop=drop) + n = X.shape[0] + def backward(d_y, sgd=None): + d_yh = d_y.reshape((n, d_y.size / n)) + d_Xh = bp_yh(d_yh, sgd) + return d_Xh.reshape(old_shape) + return yh.reshape((n, yh.shape / n)), backward + model = layerize(forward) + model._layers.append(layer) + return model + + + +#def build_tok2vec(lang, width, depth, embed_size, cols): +# with Model.define_operators({'>>': chain, '|': concatenate, '**': clone}): +# static = get_col(cols.index(ID)) >> StaticVectors(lang, width) +# prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size) +# suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size) +# shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size) +# tok2vec = ( +# (static | prefix | suffix | shape) +# >> Maxout(width, width*4) +# >> (ExtractWindow(nW=1) >> Maxout(width, width*3)) ** depth +# ) +# return tok2vec if __name__ == '__main__': diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index c61834760..d97a2f519 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -49,67 +49,8 @@ def set_debug(val): DEBUG = val -@layerize -def get_context_tokens(states, drop=0.): - for state in states: - context[i, 0] = state.B(0) - context[i, 1] = state.S(0) - context[i, 2] = state.S(1) - context[i, 3] = state.L(state.S(0), 1) - context[i, 4] = state.L(state.S(0), 2) - context[i, 5] = state.R(state.S(0), 1) - context[i, 6] = state.R(state.S(0), 2) - return (context, states), None - - -def extract_features(attrs): - def forward(contexts_states, drop=0.): - contexts, states = contexts_states - for i, state in enumerate(states): - for j, tok_i in enumerate(contexts[i]): - token = state.get_token(tok_i) - for k, attr in enumerate(attrs): - output[i, j, k] = getattr(token, attr) - return output, None - return layerize(forward) - - -def build_tok2vec(lang, width, depth, embed_size): - cols = [LEX_ID, PREFIX, SUFFIX, SHAPE] - static = StaticVectors('en', width, column=cols.index(LEX_ID)) - prefix = HashEmbed(width, embed_size, column=cols.index(PREFIX)) - suffix = HashEmbed(width, embed_size, column=cols.index(SUFFIX)) - shape = HashEmbed(width, embed_size, column=cols.index(SHAPE)) - with Model.overload_operaters('>>': chain, '|': concatenate, '+': add): - tok2vec = ( - extract_features(cols) - >> (static | prefix | suffix | shape) - >> (ExtractWindow(nW=1) >> Maxout(width)) ** depth - ) - return tok2vec - - -def build_parse2vec(width, embed_size): - cols = [TAG, DEP] - tag_vector = HashEmbed(width, 1000, column=cols.index(TAG)) - dep_vector = HashEmbed(width, 1000, column=cols.index(DEP)) - with Model.overload_operaters('>>': chain): - model = ( - extract_features([TAG, DEP]) - >> (tag_vector | dep_vector) - ) - return model - - -def build_model(get_contexts, tok2vec, parse2vec, width, depth, nr_class): - with Model.overload_operaters('>>': chain): - model = ( - get_contexts - >> (tok2vec | parse2vec) - >> Maxout(width) ** depth - >> Softmax(nr_class) - ) - return model +def get_templates(*args, **kwargs): + return [] cdef class Parser: @@ -162,7 +103,7 @@ cdef class Parser: model = self.build_model(**cfg) self.model = model self.cfg = cfg - + def __reduce__(self): return (Parser, (self.vocab, self.moves, self.model), None, None) @@ -180,17 +121,21 @@ cdef class Parser: def parse_batch(self, docs): states = self._init_states(docs) - todo = list(states) nr_class = self.moves.n_moves + cdef StateClass state + cdef int guess + is_valid = self.model.ops.allocate((len(docs), nr_class), dtype='i') + todo = list(states) while todo: scores = self.model.predict(todo) - self._validate_batch(is_valid, scores, states) + self._validate_batch(is_valid, states) + scores *= is_valid for state, guess in zip(todo, scores.argmax(axis=1)): action = self.moves.c[guess] - action.do(state, action.label) + action.do(state.c, action.label) todo = [state for state in todo if not state.is_final()] for state, doc in zip(states, docs): - self.moves.finalize_state(state, doc) + self.moves.finalize_state(state.c) def pipe(self, stream, int batch_size=1000, int n_threads=2): """ @@ -212,8 +157,6 @@ cdef class Parser: cdef int status queue = [] for doc in stream: - doc_ptr[len(queue)] = doc.c - lengths[len(queue)] = doc.length queue.append(doc) if len(queue) == batch_size: self.parse_batch(queue) @@ -231,48 +174,76 @@ cdef class Parser: if isinstance(docs, Doc) and isinstance(golds, GoldParse): return self.update([docs], [golds], drop=drop) states = self._init_states(docs) + d_tokens = [self.model.ops.allocate(d.tensor.shape) for d in docs] nr_class = self.moves.n_moves + costs = self.model.ops.allocate((len(docs), nr_class), dtype='f') + is_valid = self.model.ops.allocate((len(docs), nr_class), dtype='i') + + todo = zip(states, golds, d_tokens) while states: + states, golds, d_tokens = zip(*todo) scores, finish_update = self.model.begin_update(states, drop=drop) - self._validate_batch(is_valid, scores, states) - for i, state in enumerate(states): - self.moves.set_costs(costs[i], is_valid, state, golds[i]) - - self._transition_batch(states, scores) + + self._cost_batch(is_valid, costs, states, golds) + scores *= is_valid self._set_gradient(gradients, scores, costs) - finish_update(gradients, sgd=sgd) + + token_ids, batch_token_grads = finish_update(gradients, sgd=sgd) + for i, tok_i in enumerate(token_ids): + d_tokens[tok_i] += batch_token_grads[i] + + self._transition_batch(states, scores) + + # Get unfinished states (and their matching gold and token gradients) + todo = zip(states, golds, d_tokens) + todo = filter(todo, lambda sp: sp[0].is_final) + + gradients = gradients[:len(todo)] + costs = costs[:len(todo)] + is_valid = is_valid[:len(todo)] + gradients.fill(0) - - states = [state for state in states if not state.is_final()] - gradients = gradients[:len(states)] - costs = costs[:len(states)] + costs.fill(0) + is_valid.fill(1) return 0 - def _validate_batch(self, is_valid, scores, states): - for i, state in enumerate(states): - self.moves.set_valid(is_valid, state) - for j in range(self.moves.n_moves): - if not is_valid[j]: - scores[i, j] = 0 - - def _transition_batch(self, states, scores): - for state, guess in zip(states, scores.argmax(axis=1)): - action = self.moves.c[guess] - action.do(state, action.label) - def _init_states(self, docs): states = [] cdef Doc doc + cdef StateClass state for i, doc in enumerate(docs): - state = StateClass.init(doc) - self.moves.initialize_state(state) + state = StateClass(doc) + self.moves.initialize_state(state.c) + states.append(state) return states + def _validate_batch(self, int[:, ::1] is_valid, states): + cdef StateClass state + cdef int i + for i, state in enumerate(states): + self.moves.set_valid(&is_valid[i, 0], state.c) + + def _cost_batch(self, weight_t[:, ::1] costs, int[:, ::1] is_valid, + states, golds): + cdef int i + cdef StateClass state + cdef GoldParse gold + for i, (state, gold) in enumerate(zip(states, golds)): + self.moves.set_costs(&is_valid[i, 0], &costs[i, 0], state, gold) + + def _transition_batch(self, states, scores): + cdef StateClass state + cdef int guess + for state, guess in zip(states, scores.argmax(axis=1)): + action = self.moves.c[guess] + action.do(state.c, action.label) + def _set_gradient(self, gradients, scores, costs): """Do multi-label log loss""" cdef double Z, gZ, max_, g_max + g_scores = scores * (costs <= 0) maxes = scores.max(axis=1) - g_maxes = (scores * costs <= 0).max(axis=1) + g_maxes = g_scores.max(axis=1) exps = (scores-maxes).exp() g_exps = (g_scores-g_maxes).exp() @@ -398,11 +369,11 @@ cdef class StepwiseState: def predict(self): self.eg.reset() - self.eg.c.nr_feat = self.parser.model.set_featuresC(self.eg.c.atoms, self.eg.c.features, - self.stcls.c) + #self.eg.c.nr_feat = self.parser.model.set_featuresC(self.eg.c.atoms, self.eg.c.features, + # self.stcls.c) self.parser.moves.set_valid(self.eg.c.is_valid, self.stcls.c) - self.parser.model.set_scoresC(self.eg.c.scores, - self.eg.c.features, self.eg.c.nr_feat) + #self.parser.model.set_scoresC(self.eg.c.scores, + # self.eg.c.features, self.eg.c.nr_feat) cdef Transition action = self.parser.moves.c[self.eg.guess] return self.parser.moves.move_name(action.move, action.label)