From a4e9bdf4c171acead99a6c55ac8113194abbb4c8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 22:55:58 +0200 Subject: [PATCH 01/30] * Work on a theano-driven model for the parser --- spacy/_theano.pyx | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 spacy/_theano.pyx diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx new file mode 100644 index 000000000..1a1224596 --- /dev/null +++ b/spacy/_theano.pyx @@ -0,0 +1,44 @@ +from thinc.example cimport Example + + +cdef class TheanoModel(Model): + def __init__(self, n_classes, input_layer, train_func, predict_func, model_loc=None): + if model_loc is not None and path.isdir(model_loc): + model_loc = path.join(model_loc, 'model') + self.n_classes = n_classes + + tables = [] + lengths = [] + for window_size, n_dims, vocab_size in input_structure: + tables.append(EmbeddingTable(n_dims, vocab_size, initializer)) + lengths.append(window_size) + + self.input_layer = InputLayer(lengths, tables) + + self.train_func = train_func + self.predict_func = predict_func + + self.model_loc = model_loc + if self.model_loc and path.exists(self.model_loc): + self._model.load(self.model_loc, freq_thresh=0) + + def train(self, Instance eg): + pass + + def predict(self, Instance eg): + + cdef const weight_t* score(self, atom_t* context) except NULL: + self.set_scores(self._scores, context) + return self._scores + + cdef int set_scores(self, weight_t* scores, atom_t* context) except -1: + # TODO f(context) --> Values + self._input_layer.fill(self._x, self._values, use_avg=False) + theano_scores = self._predict(self._x) + for i in range(self.n_classes): + output[i] = theano_scores[i] + + cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1: + # TODO f(context) --> Values + self._input_layer.fill(self._x, self._values, use_avg=False) + From 886100e1a2f64cb805ebd8ca0934fefa2794ec06 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 24 Jun 2015 04:51:38 +0200 Subject: [PATCH 02/30] * Increment version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 76615e141..24b88dea9 100644 --- a/setup.py +++ b/setup.py @@ -130,7 +130,7 @@ def run_setup(exts): headers_workaround.install_headers('numpy') -VERSION = '0.85' +VERSION = '0.86' def main(modules, is_pypy): language = "cpp" includes = ['.', path.join(sys.prefix, 'include')] From 6896455884879f2845fce63097ce9fdf5856bf60 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 26 Jun 2015 06:25:36 +0200 Subject: [PATCH 03/30] * Rejig parser interface to use new thinc.api.Example class, in prep of theano model. 
Comment out beam search --- spacy/_ml.pyx | 36 ++++++ spacy/syntax/arc_eager.pyx | 29 ++--- spacy/syntax/ner.pyx | 21 ---- spacy/syntax/parser.pxd | 3 - spacy/syntax/parser.pyx | 178 ++++++++++++----------------- spacy/syntax/transition_system.pxd | 8 +- spacy/syntax/transition_system.pyx | 35 ++---- 7 files changed, 132 insertions(+), 178 deletions(-) diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx index be647c2dd..df66a1791 100644 --- a/spacy/_ml.pyx +++ b/spacy/_ml.pyx @@ -10,6 +10,7 @@ import cython import numpy.random from thinc.features cimport Feature, count_feats +from thinc.api cimport Example cdef int arg_max(const weight_t* scores, const int n_classes) nogil: @@ -23,6 +24,30 @@ cdef int arg_max(const weight_t* scores, const int n_classes) nogil: return best +cdef int arg_max_if_true(const weight_t* scores, const bint* is_valid, + const int n_classes) nogil: + cdef int i + cdef int best = 0 + cdef weight_t mode = -900000 + for i in range(n_classes): + if is_valid[i] and scores[i] > mode: + mode = scores[i] + best = i + return best + + +cdef int arg_max_if_zero(const weight_t* scores, const int* costs, + const int n_classes) nogil: + cdef int i + cdef int best = 0 + cdef weight_t mode = -900000 + for i in range(n_classes): + if costs[i] == 0 and scores[i] > mode: + mode = scores[i] + best = i + return best + + cdef class Model: def __init__(self, n_classes, templates, model_loc=None): if model_loc is not None and path.isdir(model_loc): @@ -34,6 +59,17 @@ cdef class Model: if self.model_loc and path.exists(self.model_loc): self._model.load(self.model_loc, freq_thresh=0) + def predict(self, Example eg): + self.set_scores(eg.scores, eg.atoms) + eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.n_classes) + + def train(self, Example eg): + self.set_scores(eg.scores, eg.atoms) + eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.n_classes) + eg.best = arg_max_if_zero(eg.scores, eg.costs, self.n_classes) + eg.cost = eg.costs[eg.guess] + self.update(eg.atoms, eg.guess, eg.best, eg.cost) + cdef const weight_t* score(self, atom_t* context) except NULL: cdef int n_feats feats = self._extractor.get_feats(context, &n_feats) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 29e62cb4e..a83e19ec2 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -398,7 +398,8 @@ cdef class ArcEager(TransitionSystem): n_valid += output[i] assert n_valid >= 1 - cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1: + cdef int set_costs(self, bint* is_valid, int* costs, + StateClass stcls, GoldParse gold) except -1: cdef int i, move, label cdef label_cost_func_t[N_MOVES] label_cost_funcs cdef move_cost_func_t[N_MOVES] move_cost_funcs @@ -423,30 +424,14 @@ cdef class ArcEager(TransitionSystem): n_gold = 0 for i in range(self.n_moves): if self.c[i].is_valid(stcls, self.c[i].label): + is_valid[i] = True move = self.c[i].move label = self.c[i].label if move_costs[move] == -1: move_costs[move] = move_cost_funcs[move](stcls, &gold.c) - output[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label) - n_gold += output[i] == 0 + costs[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label) + n_gold += costs[i] == 0 else: - output[i] = 9000 + is_valid[i] = False + costs[i] = 9000 assert n_gold >= 1 - - cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *: - cdef bint[N_MOVES] is_valid - is_valid[SHIFT] = Shift.is_valid(stcls, -1) - is_valid[REDUCE] = Reduce.is_valid(stcls, -1) - 
is_valid[LEFT] = LeftArc.is_valid(stcls, -1) - is_valid[RIGHT] = RightArc.is_valid(stcls, -1) - is_valid[BREAK] = Break.is_valid(stcls, -1) - cdef Transition best - cdef weight_t score = MIN_SCORE - cdef int i - for i in range(self.n_moves): - if scores[i] > score and is_valid[self.c[i].move]: - best = self.c[i] - score = scores[i] - assert best.clas < self.n_moves - assert score > MIN_SCORE, (stcls.stack_depth(), stcls.buffer_length(), stcls.is_final(), stcls._b_i, stcls.length) - return best diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 4a47a20a8..b145df7ac 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -128,27 +128,6 @@ cdef class BiluoPushDown(TransitionSystem): raise Exception(move) return t - cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *: - cdef int best = -1 - cdef weight_t score = -90000 - cdef const Transition* m - cdef int i - for i in range(self.n_moves): - m = &self.c[i] - if m.is_valid(stcls, m.label) and scores[i] > score: - best = i - score = scores[i] - assert best >= 0 - cdef Transition t = self.c[best] - t.score = score - return t - - cdef int set_valid(self, bint* output, StateClass stcls) except -1: - cdef int i - for i in range(self.n_moves): - m = &self.c[i] - output[i] = m.is_valid(stcls, m.label) - cdef class Missing: @staticmethod diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd index 103ff9c02..11ac6bbb8 100644 --- a/spacy/syntax/parser.pxd +++ b/spacy/syntax/parser.pxd @@ -11,6 +11,3 @@ cdef class Parser: cdef readonly object cfg cdef readonly Model model cdef readonly TransitionSystem moves - - cdef int _greedy_parse(self, Tokens tokens) except -1 - cdef int _beam_parse(self, Tokens tokens) except -1 diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 740e86025..4bfb0eeb1 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -20,17 +20,10 @@ from cymem.cymem cimport Pool, Address from murmurhash.mrmr cimport hash64 from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t - from util import Config -from thinc.features cimport Extractor -from thinc.features cimport Feature -from thinc.features cimport count_feats +from thinc.api cimport Example -from thinc.learner cimport LinearModel - -from thinc.search cimport Beam -from thinc.search cimport MaxViolation from ..tokens cimport Tokens, TokenC from ..strings cimport StringStore @@ -73,35 +66,86 @@ cdef class Parser: self.model = Model(self.moves.n_moves, templates, model_dir) def __call__(self, Tokens tokens): - if self.cfg.get('beam_width', 1) < 1: - self._greedy_parse(tokens) - else: - self._beam_parse(tokens) + cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) + self.moves.initialize_state(stcls) + + cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE) + while not stcls.is_final(): + eg.wipe() + fill_context(eg.atoms, stcls) + self.moves.set_valid(eg.is_valid, stcls) + + self.model.predict(eg) + + self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label) + self.moves.finalize_state(stcls) + tokens.set_parse(stcls._sent) def train(self, Tokens tokens, GoldParse gold): self.moves.preprocess_gold(gold) - if self.cfg.beam_width < 1: - return self._greedy_train(tokens, gold) - else: - return self._beam_train(tokens, gold) - - cdef int _greedy_parse(self, Tokens tokens) except -1: - cdef atom_t[CONTEXT_SIZE] context - cdef int n_feats - cdef Pool mem = Pool() cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) 
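        # The rewritten train() below mirrors __call__: fill an Example with
        # context atoms and per-move costs, let the model score and update
        # itself, then apply the predicted transition to the state.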
self.moves.initialize_state(stcls) - cdef Transition guess - words = [w.orth_ for w in tokens] + cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE) + cdef int cost = 0 while not stcls.is_final(): - fill_context(context, stcls) - scores = self.model.score(context) - guess = self.moves.best_valid(scores, stcls) - #print self.moves.move_name(guess.move, guess.label), stcls.print_state(words) - guess.do(stcls, guess.label) - assert stcls._s_i >= 0 - self.moves.finalize_state(stcls) - tokens.set_parse(stcls._sent) + eg.wipe() + fill_context(eg.atoms, stcls) + self.moves.set_costs(eg.is_valid, eg.costs, stcls, gold) + + self.model.train(eg) + + self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label) + cost += eg.cost + return cost + + +# These are passed as callbacks to thinc.search.Beam +""" +cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: + dest = _dest + src = _src + moves = _moves + dest.clone(src) + moves[clas].do(dest, moves[clas].label) + + +cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: + cdef StateClass st = StateClass.init(tokens, length) + st.fast_forward() + Py_INCREF(st) + return st + + +cdef int _check_final_state(void* _state, void* extra_args) except -1: + return (_state).is_final() + + +def _cleanup(Beam beam): + for i in range(beam.width): + Py_XDECREF(beam._states[i].content) + Py_XDECREF(beam._parents[i].content) + +cdef hash_t _hash_state(void* _state, void* _) except 0: + return _state + + #state = _state + #cdef atom_t[10] rep + + #rep[0] = state.stack[0] if state.stack_len >= 1 else 0 + #rep[1] = state.stack[-1] if state.stack_len >= 2 else 0 + #rep[2] = state.stack[-2] if state.stack_len >= 3 else 0 + #rep[3] = state.i + #rep[4] = state.sent[state.stack[0]].l_kids if state.stack_len >= 1 else 0 + #rep[5] = state.sent[state.stack[0]].r_kids if state.stack_len >= 1 else 0 + #rep[6] = state.sent[state.stack[0]].dep if state.stack_len >= 1 else 0 + #rep[7] = state.sent[state.stack[-1]].dep if state.stack_len >= 2 else 0 + #if get_left(state, get_n0(state), 1) != NULL: + # rep[8] = get_left(state, get_n0(state), 1).dep + #else: + # rep[8] = 0 + #rep[9] = state.sent[state.i].l_kids + #return hash64(rep, sizeof(atom_t) * 10, 0) + cdef int _beam_parse(self, Tokens tokens) except -1: cdef Beam beam = Beam(self.moves.n_moves, self.cfg.beam_width) @@ -115,30 +159,6 @@ cdef class Parser: tokens.set_parse(state._sent) _cleanup(beam) - def _greedy_train(self, Tokens tokens, GoldParse gold): - cdef Pool mem = Pool() - cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) - self.moves.initialize_state(stcls) - - cdef int cost - cdef const Feature* feats - cdef const weight_t* scores - cdef Transition guess - cdef Transition best - cdef atom_t[CONTEXT_SIZE] context - loss = 0 - words = [w.orth_ for w in tokens] - history = [] - while not stcls.is_final(): - fill_context(context, stcls) - scores = self.model.score(context) - guess = self.moves.best_valid(scores, stcls) - best = self.moves.best_gold(scores, stcls, gold) - cost = guess.get_cost(stcls, &gold.c, guess.label) - self.model.update(context, guess.clas, best.clas, cost) - guess.do(stcls, guess.label) - loss += cost - return loss def _beam_train(self, Tokens tokens, GoldParse gold_parse): cdef Beam pred = Beam(self.moves.n_moves, self.cfg.beam_width) @@ -201,50 +221,4 @@ cdef class Parser: count_feats(counts[clas], feats, n_feats, inc) self.moves.c[clas].do(stcls, self.moves.c[clas].label) - -# These are passed as callbacks to 
thinc.search.Beam - -cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: - dest = _dest - src = _src - moves = _moves - dest.clone(src) - moves[clas].do(dest, moves[clas].label) - - -cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: - cdef StateClass st = StateClass.init(tokens, length) - st.fast_forward() - Py_INCREF(st) - return st - - -cdef int _check_final_state(void* _state, void* extra_args) except -1: - return (_state).is_final() - - -def _cleanup(Beam beam): - for i in range(beam.width): - Py_XDECREF(beam._states[i].content) - Py_XDECREF(beam._parents[i].content) - -cdef hash_t _hash_state(void* _state, void* _) except 0: - return _state - - #state = _state - #cdef atom_t[10] rep - - #rep[0] = state.stack[0] if state.stack_len >= 1 else 0 - #rep[1] = state.stack[-1] if state.stack_len >= 2 else 0 - #rep[2] = state.stack[-2] if state.stack_len >= 3 else 0 - #rep[3] = state.i - #rep[4] = state.sent[state.stack[0]].l_kids if state.stack_len >= 1 else 0 - #rep[5] = state.sent[state.stack[0]].r_kids if state.stack_len >= 1 else 0 - #rep[6] = state.sent[state.stack[0]].dep if state.stack_len >= 1 else 0 - #rep[7] = state.sent[state.stack[-1]].dep if state.stack_len >= 2 else 0 - #if get_left(state, get_n0(state), 1) != NULL: - # rep[8] = get_left(state, get_n0(state), 1).dep - #else: - # rep[8] = 0 - #rep[9] = state.sent[state.i].l_kids - #return hash64(rep, sizeof(atom_t) * 10, 0) +""" diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index d9bd2b3e6..35f0ada30 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -46,9 +46,5 @@ cdef class TransitionSystem: cdef int set_valid(self, bint* output, StateClass state) except -1 - cdef int set_costs(self, int* output, StateClass state, GoldParse gold) except -1 - - cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except * - - cdef Transition best_gold(self, const weight_t* scores, StateClass state, - GoldParse gold) except * + cdef int set_costs(self, bint* is_valid, int* costs, + StateClass state, GoldParse gold) except -1 diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 927498cba..b13c75ba3 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -43,30 +43,17 @@ cdef class TransitionSystem: cdef Transition init_transition(self, int clas, int move, int label) except *: raise NotImplementedError - cdef Transition best_valid(self, const weight_t* scores, StateClass s) except *: - raise NotImplementedError - - cdef int set_valid(self, bint* output, StateClass state) except -1: - raise NotImplementedError - - cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1: + cdef int set_valid(self, bint* is_valid, StateClass stcls) except -1: cdef int i for i in range(self.n_moves): - if self.c[i].is_valid(stcls, self.c[i].label): - output[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label) + is_valid[i] = self.c[i].is_valid(stcls, self.c[i].label) + + cdef int set_costs(self, bint* is_valid, int* costs, + StateClass stcls, GoldParse gold) except -1: + cdef int i + self.set_valid(is_valid, stcls) + for i in range(self.n_moves): + if is_valid[i]: + costs[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label) else: - output[i] = 9000 - - cdef Transition best_gold(self, const weight_t* scores, StateClass stcls, - GoldParse gold) except *: - cdef Transition best - cdef weight_t score = 
MIN_SCORE - cdef int i - for i in range(self.n_moves): - if self.c[i].is_valid(stcls, self.c[i].label): - cost = self.c[i].get_cost(stcls, &gold.c, self.c[i].label) - if scores[i] > score and cost == 0: - best = self.c[i] - score = scores[i] - assert score > MIN_SCORE - return best + costs[i] = 9000 From 2fe98b8a9a75ff3613b71f850d4e0772674df82e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 26 Jun 2015 13:51:39 +0200 Subject: [PATCH 04/30] * Prepare for new models to be plugged in by using Example class --- setup.py | 3 +- spacy/_ml.pxd | 5 ++++ spacy/_ml.pyx | 18 +++++++----- spacy/_theano.pyx | 64 ++++++++++++++++++++--------------------- spacy/syntax/parser.pyx | 12 ++++---- 5 files changed, 56 insertions(+), 46 deletions(-) diff --git a/setup.py b/setup.py index 24b88dea9..a86e0f98d 100644 --- a/setup.py +++ b/setup.py @@ -151,7 +151,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', 'spacy.lexeme', 'spacy.vocab', 'spacy.tokens', 'spacy.spans', 'spacy.morphology', 'spacy.syntax.stateclass', - 'spacy._ml', 'spacy.tokenizer', 'spacy.en.attrs', + 'spacy._ml', 'spacy._theano', + 'spacy.tokenizer', 'spacy.en.attrs', 'spacy.en.pos', 'spacy.syntax.parser', 'spacy.syntax.transition_system', 'spacy.syntax.arc_eager', diff --git a/spacy/_ml.pxd b/spacy/_ml.pxd index add162e69..3562b4a32 100644 --- a/spacy/_ml.pxd +++ b/spacy/_ml.pxd @@ -14,9 +14,14 @@ from .tokens cimport Tokens cdef int arg_max(const weight_t* scores, const int n_classes) nogil +cdef int arg_max_if_true(const weight_t* scores, const int* is_valid, int n_classes) nogil + +cdef int arg_max_if_zero(const weight_t* scores, const int* costs, int n_classes) nogil + cdef class Model: cdef int n_classes + cdef int n_feats cdef const weight_t* score(self, atom_t* context) except NULL cdef int set_scores(self, weight_t* scores, atom_t* context) except -1 diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx index df66a1791..993d1a8ac 100644 --- a/spacy/_ml.pyx +++ b/spacy/_ml.pyx @@ -24,7 +24,7 @@ cdef int arg_max(const weight_t* scores, const int n_classes) nogil: return best -cdef int arg_max_if_true(const weight_t* scores, const bint* is_valid, +cdef int arg_max_if_true(const weight_t* scores, const int* is_valid, const int n_classes) nogil: cdef int i cdef int best = 0 @@ -54,21 +54,25 @@ cdef class Model: model_loc = path.join(model_loc, 'model') self.n_classes = n_classes self._extractor = Extractor(templates) + self.n_feats = self._extractor.n_templ self._model = LinearModel(n_classes, self._extractor.n_templ) self.model_loc = model_loc if self.model_loc and path.exists(self.model_loc): self._model.load(self.model_loc, freq_thresh=0) def predict(self, Example eg): - self.set_scores(eg.scores, eg.atoms) - eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.n_classes) + self.set_scores(eg.scores.data, eg.atoms.data) + eg.guess = arg_max_if_true(eg.scores.data, eg.is_valid.data, + self.n_classes) def train(self, Example eg): - self.set_scores(eg.scores, eg.atoms) - eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.n_classes) - eg.best = arg_max_if_zero(eg.scores, eg.costs, self.n_classes) + self.set_scores(eg.scores.data, eg.atoms.data) + eg.guess = arg_max_if_true(eg.scores.data, + eg.is_valid.data, self.n_classes) + eg.best = arg_max_if_zero(eg.scores.data, eg.costs.data, + self.n_classes) eg.cost = eg.costs[eg.guess] - self.update(eg.atoms, eg.guess, eg.best, eg.cost) + self.update(eg.atoms.data, eg.guess, eg.best, eg.cost) cdef const weight_t* score(self, atom_t* context) except NULL: cdef int n_feats diff 
--git a/spacy/_theano.pyx b/spacy/_theano.pyx index 1a1224596..702208d18 100644 --- a/spacy/_theano.pyx +++ b/spacy/_theano.pyx @@ -1,44 +1,44 @@ -from thinc.example cimport Example +from thinc.api cimport Example +from thinc.typedefs cimport weight_t + +from ._ml cimport arg_max_if_true +from ._ml cimport arg_max_if_zero + +import numpy +from os import path cdef class TheanoModel(Model): - def __init__(self, n_classes, input_layer, train_func, predict_func, model_loc=None): + def __init__(self, n_classes, input_spec, train_func, predict_func, model_loc=None): if model_loc is not None and path.isdir(model_loc): model_loc = path.join(model_loc, 'model') - self.n_classes = n_classes - - tables = [] - lengths = [] - for window_size, n_dims, vocab_size in input_structure: - tables.append(EmbeddingTable(n_dims, vocab_size, initializer)) - lengths.append(window_size) - - self.input_layer = InputLayer(lengths, tables) + self.eta = 0.001 + self.mu = 0.9 + self.t = 1 + initializer = lambda: 0.2 * numpy.random.uniform(-1.0, 1.0) + self.input_layer = InputLayer(input_spec, initializer) self.train_func = train_func self.predict_func = predict_func + self.n_classes = n_classes + self.n_feats = len(self.input_layer) self.model_loc = model_loc - if self.model_loc and path.exists(self.model_loc): - self._model.load(self.model_loc, freq_thresh=0) - - def train(self, Instance eg): - pass - - def predict(self, Instance eg): - - cdef const weight_t* score(self, atom_t* context) except NULL: - self.set_scores(self._scores, context) - return self._scores - - cdef int set_scores(self, weight_t* scores, atom_t* context) except -1: - # TODO f(context) --> Values - self._input_layer.fill(self._x, self._values, use_avg=False) - theano_scores = self._predict(self._x) + + def predict(self, Example eg): + self.input_layer.fill(eg.embeddings, eg.atoms) + theano_scores = self.predict_func(eg.embeddings) + cdef int i for i in range(self.n_classes): - output[i] = theano_scores[i] + eg.scores[i] = theano_scores[i] + eg.guess = arg_max_if_true(eg.scores.data, eg.is_valid.data, + self.n_classes) - cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1: - # TODO f(context) --> Values - self._input_layer.fill(self._x, self._values, use_avg=False) - + def train(self, Example eg): + self.predict(eg) + update, t, eta, mu = self.train_func(eg.embeddings, eg.scores, eg.costs) + self.input_layer.update(eg.atoms, update, self.t, self.eta, self.mu) + eg.best = arg_max_if_zero(eg.scores.data, eg.costs.data, + self.n_classes) + eg.cost = eg.costs[eg.guess] + self.t += 1 diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 4bfb0eeb1..33ae5b497 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -69,11 +69,11 @@ cdef class Parser: cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) self.moves.initialize_state(stcls) - cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE) + cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, self.model.n_feats) while not stcls.is_final(): eg.wipe() - fill_context(eg.atoms, stcls) - self.moves.set_valid(eg.is_valid, stcls) + fill_context(eg.atoms.data, stcls) + self.moves.set_valid(eg.is_valid.data, stcls) self.model.predict(eg) @@ -85,12 +85,12 @@ cdef class Parser: self.moves.preprocess_gold(gold) cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) self.moves.initialize_state(stcls) - cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE) + cdef Example eg = 
Example(self.model.n_classes, CONTEXT_SIZE, self.model.n_feats) cdef int cost = 0 while not stcls.is_final(): eg.wipe() - fill_context(eg.atoms, stcls) - self.moves.set_costs(eg.is_valid, eg.costs, stcls, gold) + fill_context(eg.atoms.data, stcls) + self.moves.set_costs(eg.is_valid.data, eg.costs.data, stcls, gold) self.model.train(eg) From f8bb43475e449db5fdc4eb74809b947fbca0f3a9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 Jun 2015 02:38:51 +0200 Subject: [PATCH 05/30] * Bridge to Theano working. Very disorganised. Using thinc adb60aba966ed2 --- bin/parser/nn_train.py | 255 +++++++++++++++++++++++++++++++ spacy/_theano.pxd | 13 ++ spacy/_theano.pyx | 19 ++- spacy/syntax/_parse_features.pyx | 4 + spacy/syntax/parser.pyx | 7 +- 5 files changed, 291 insertions(+), 7 deletions(-) create mode 100755 bin/parser/nn_train.py create mode 100644 spacy/_theano.pxd diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py new file mode 100755 index 000000000..375996f4f --- /dev/null +++ b/bin/parser/nn_train.py @@ -0,0 +1,255 @@ +#!/usr/bin/env python +from __future__ import division +from __future__ import unicode_literals + +import os +from os import path +import shutil +import codecs +import random + +import plac +import cProfile +import pstats +import re + +import spacy.util +from spacy.en import English +from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir + +from spacy.syntax.util import Config +from spacy.gold import read_json_file +from spacy.gold import GoldParse + +from spacy.scorer import Scorer + +from thinc.theano_nn import compile_theano_model + +from spacy.syntax.parser import Parser +from spacy._theano import TheanoModel + + +def _corrupt(c, noise_level): + if random.random() >= noise_level: + return c + elif c == ' ': + return '\n' + elif c == '\n': + return ' ' + elif c in ['.', "'", "!", "?"]: + return '' + else: + return c.lower() + + +def add_noise(orig, noise_level): + if random.random() >= noise_level: + return orig + elif type(orig) == list: + corrupted = [_corrupt(word, noise_level) for word in orig] + corrupted = [w for w in corrupted if w] + return corrupted + else: + return ''.join(_corrupt(c, noise_level) for c in orig) + + +def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False): + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + else: + tokens = nlp.tokenizer(raw_text) + nlp.tagger(tokens) + nlp.entity(tokens) + nlp.parser(tokens) + gold = GoldParse(tokens, annot_tuples) + scorer.score(tokens, gold, verbose=verbose) + + +def _merge_sents(sents): + m_deps = [[], [], [], [], [], []] + m_brackets = [] + i = 0 + for (ids, words, tags, heads, labels, ner), brackets in sents: + m_deps[0].extend(id_ + i for id_ in ids) + m_deps[1].extend(words) + m_deps[2].extend(tags) + m_deps[3].extend(head + i for head in heads) + m_deps[4].extend(labels) + m_deps[5].extend(ner) + m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets) + i += len(ids) + return [(m_deps, m_brackets)] + + +def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', + seed=0, gold_preproc=False, n_sents=0, corruption_level=0, + verbose=False, + eta=0.01, mu=0.9, n_hidden=100, word_vec_len=10, pos_vec_len=10): + dep_model_dir = path.join(model_dir, 'deps') + pos_model_dir = path.join(model_dir, 'pos') + ner_model_dir = path.join(model_dir, 'ner') + if path.exists(dep_model_dir): + shutil.rmtree(dep_model_dir) + if path.exists(pos_model_dir): + shutil.rmtree(pos_model_dir) + if 
path.exists(ner_model_dir): + shutil.rmtree(ner_model_dir) + os.mkdir(dep_model_dir) + os.mkdir(pos_model_dir) + os.mkdir(ner_model_dir) + setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir) + + Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, + labels=Language.ParserTransitionSystem.get_labels(gold_tuples)) + Config.write(ner_model_dir, 'config', features='ner', seed=seed, + labels=Language.EntityTransitionSystem.get_labels(gold_tuples), + beam_width=0) + + if n_sents > 0: + gold_tuples = gold_tuples[:n_sents] + + nlp = Language(data_dir=model_dir) + + def make_model(n_classes, input_spec, model_dir): + print input_spec + n_in = sum(n_cols * len(fields) for (n_cols, fields) in input_spec) + print 'Compiling' + debug, train_func, predict_func = compile_theano_model(n_classes, n_hidden, + n_in, 0.0, 0.0) + print 'Done' + return TheanoModel( + n_classes, + input_spec, + train_func, + predict_func, + model_loc=model_dir, + debug=debug) + + nlp._parser = Parser(nlp.vocab.strings, dep_model_dir, nlp.ParserTransitionSystem, + make_model) + + print "Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %" + for itn in range(n_iter): + scorer = Scorer() + loss = 0 + for raw_text, sents in gold_tuples: + if gold_preproc: + raw_text = None + else: + sents = _merge_sents(sents) + for annot_tuples, ctnt in sents: + if len(annot_tuples[1]) == 1: + continue + score_model(scorer, nlp, raw_text, annot_tuples, + verbose=verbose if itn >= 2 else False) + if raw_text is None: + words = add_noise(annot_tuples[1], corruption_level) + tokens = nlp.tokenizer.tokens_from_list(words) + else: + raw_text = add_noise(raw_text, corruption_level) + tokens = nlp.tokenizer(raw_text) + nlp.tagger(tokens) + gold = GoldParse(tokens, annot_tuples, make_projective=True) + if not gold.is_projective: + raise Exception( + "Non-projective sentence in training, after we should " + "have enforced projectivity: %s" % annot_tuples + ) + loss += nlp.parser.train(tokens, gold) + nlp.entity.train(tokens, gold) + nlp.tagger.train(tokens, gold.tags) + random.shuffle(gold_tuples) + print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f, + scorer.tags_acc, + scorer.token_acc) + nlp.parser.model.end_training() + nlp.entity.model.end_training() + nlp.tagger.model.end_training() + nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt')) + return nlp + + +def evaluate(nlp, gold_tuples, gold_preproc=True): + scorer = Scorer() + for raw_text, sents in gold_tuples: + if gold_preproc: + raw_text = None + else: + sents = _merge_sents(sents) + for annot_tuples, brackets in sents: + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + nlp.tagger(tokens) + nlp.entity(tokens) + nlp.parser(tokens) + else: + tokens = nlp(raw_text, merge_mwes=False) + gold = GoldParse(tokens, annot_tuples) + scorer.score(tokens, gold) + return scorer + + +def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None): + nlp = Language(data_dir=model_dir) + if beam_width is not None: + nlp.parser.cfg.beam_width = beam_width + gold_tuples = read_json_file(dev_loc) + scorer = Scorer() + out_file = codecs.open(out_loc, 'w', 'utf8') + for raw_text, sents in gold_tuples: + sents = _merge_sents(sents) + for annot_tuples, brackets in sents: + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + nlp.tagger(tokens) + nlp.entity(tokens) + nlp.parser(tokens) + else: + tokens = nlp(raw_text, merge_mwes=False) + gold = GoldParse(tokens, annot_tuples) 
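                # Score each sentence against its gold annotations before
                # writing the orth/tag/head/dep columns below.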
+ scorer.score(tokens, gold, verbose=False) + for t in tokens: + out_file.write( + '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_) + ) + return scorer + + +@plac.annotations( + train_loc=("Location of training file or directory"), + dev_loc=("Location of development file or directory"), + model_dir=("Location of output model directory",), + eval_only=("Skip training, and only evaluate", "flag", "e", bool), + corruption_level=("Amount of noise to add to training data", "option", "c", float), + gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool), + out_loc=("Out location", "option", "o", str), + n_sents=("Number of training sentences", "option", "n", int), + n_iter=("Number of training iterations", "option", "i", int), + verbose=("Verbose error reporting", "flag", "v", bool), + debug=("Debug mode", "flag", "d", bool), +) +def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, + debug=False, corruption_level=0.0, gold_preproc=False, beam_width=1, + eval_only=False): + gold_train = list(read_json_file(train_loc)) + nlp = train(English, gold_train, model_dir, + feat_set='embed', + gold_preproc=gold_preproc, n_sents=n_sents, + corruption_level=corruption_level, n_iter=n_iter, + verbose=verbose) + #if out_loc: + # write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width) + scorer = evaluate(nlp, list(read_json_file(dev_loc)), gold_preproc=gold_preproc) + + print 'TOK', 100-scorer.token_acc + print 'POS', scorer.tags_acc + print 'UAS', scorer.uas + print 'LAS', scorer.las + + print 'NER P', scorer.ents_p + print 'NER R', scorer.ents_r + print 'NER F', scorer.ents_f + + +if __name__ == '__main__': + plac.call(main) diff --git a/spacy/_theano.pxd b/spacy/_theano.pxd new file mode 100644 index 000000000..cad0736c2 --- /dev/null +++ b/spacy/_theano.pxd @@ -0,0 +1,13 @@ +from ._ml cimport Model +from thinc.nn cimport InputLayer + + +cdef class TheanoModel(Model): + cdef InputLayer input_layer + cdef object train_func + cdef object predict_func + cdef object debug + + cdef public float eta + cdef public float mu + cdef public float t diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx index 702208d18..b791c4f42 100644 --- a/spacy/_theano.pyx +++ b/spacy/_theano.pyx @@ -9,7 +9,8 @@ from os import path cdef class TheanoModel(Model): - def __init__(self, n_classes, input_spec, train_func, predict_func, model_loc=None): + def __init__(self, n_classes, input_spec, train_func, predict_func, model_loc=None, + debug=None): if model_loc is not None and path.isdir(model_loc): model_loc = path.join(model_loc, 'model') @@ -20,6 +21,7 @@ cdef class TheanoModel(Model): self.input_layer = InputLayer(input_spec, initializer) self.train_func = train_func self.predict_func = predict_func + self.debug = debug self.n_classes = n_classes self.n_feats = len(self.input_layer) @@ -27,7 +29,7 @@ cdef class TheanoModel(Model): def predict(self, Example eg): self.input_layer.fill(eg.embeddings, eg.atoms) - theano_scores = self.predict_func(eg.embeddings) + theano_scores = self.predict_func(eg.embeddings)[0] cdef int i for i in range(self.n_classes): eg.scores[i] = theano_scores[i] @@ -35,10 +37,17 @@ cdef class TheanoModel(Model): self.n_classes) def train(self, Example eg): - self.predict(eg) - update, t, eta, mu = self.train_func(eg.embeddings, eg.scores, eg.costs) - self.input_layer.update(eg.atoms, update, self.t, self.eta, self.mu) + self.input_layer.fill(eg.embeddings, eg.atoms) + theano_scores, update, y = 
self.train_func(eg.embeddings, eg.costs, self.eta) + self.input_layer.update(update, eg.atoms, self.t, self.eta, self.mu) + for i in range(self.n_classes): + eg.scores[i] = theano_scores[i] + eg.guess = arg_max_if_true(eg.scores.data, eg.is_valid.data, + self.n_classes) eg.best = arg_max_if_zero(eg.scores.data, eg.costs.data, self.n_classes) eg.cost = eg.costs[eg.guess] self.t += 1 + + def end_training(self): + pass diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx index efefc7273..1adeaef83 100644 --- a/spacy/syntax/_parse_features.pyx +++ b/spacy/syntax/_parse_features.pyx @@ -355,3 +355,7 @@ trigrams = ( (N0W, N0p, N0lL, N0l2L), (N0p, N0lL, N0l2L), ) + +words = (S0w, N0w, S1w, N1w) +tags = (S0p, N0p, S1p, N1p) +labels = (S0L, N0L, S1L, S2L) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 33ae5b497..66d598b88 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -52,18 +52,21 @@ def get_templates(name): return pf.ner elif name == 'debug': return pf.unigrams + elif name.startswith('embed'): + return ((10, pf.words), (10, pf.tags), (10, pf.labels)) else: return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \ pf.tree_shape + pf.trigrams) cdef class Parser: - def __init__(self, StringStore strings, model_dir, transition_system): + def __init__(self, StringStore strings, model_dir, transition_system, + get_model=Model): assert os.path.exists(model_dir) and os.path.isdir(model_dir) self.cfg = Config.read(model_dir, 'config') self.moves = transition_system(strings, self.cfg.labels) templates = get_templates(self.cfg.features) - self.model = Model(self.moves.n_moves, templates, model_dir) + self.model = get_model(self.moves.n_moves, templates, model_dir) def __call__(self, Tokens tokens): cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) From ebe630cc8dc4f8affd8cd27fd3d4be113e669b59 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 Jun 2015 04:17:29 +0200 Subject: [PATCH 06/30] * Enable more features for NN --- spacy/syntax/_parse_features.pyx | 57 ++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx index 1adeaef83..40c1818c5 100644 --- a/spacy/syntax/_parse_features.pyx +++ b/spacy/syntax/_parse_features.pyx @@ -356,6 +356,57 @@ trigrams = ( (N0p, N0lL, N0l2L), ) -words = (S0w, N0w, S1w, N1w) -tags = (S0p, N0p, S1p, N1p) -labels = (S0L, N0L, S1L, S2L) + +words = ( + S2w, + S1w, + S1rw, + S0lw, + S0l2w, + S0w, + S0r2w, + S0rw, + N0lw, + N0l2w, + N0w, + N1w, + N2w, + P1w, + P2w +) + +tags = ( + S2p, + S1p, + S1rp, + S0lp, + S0l2p, + S0p, + S0r2p, + S0rp, + N0lp, + N0l2p, + N0p, + N1p, + N2p, + P1p, + P2p +) + +labels = ( + S2L, + S1L, + S1rL, + S0lL, + S0l2L, + S0L, + S0r2L, + S0rL, + N0lL, + N0l2L, + N0L, + N1L, + N2L, + P1L, + P2L +) From da793073d0cbd83ad1464c4aba94dba5d1fe1fde Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 Jun 2015 04:18:01 +0200 Subject: [PATCH 07/30] * Wire hyperparameters to script interface --- bin/parser/nn_train.py | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py index 375996f4f..e0ae846b5 100755 --- a/bin/parser/nn_train.py +++ b/bin/parser/nn_train.py @@ -84,7 +84,8 @@ def _merge_sents(sents): def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0, gold_preproc=False, n_sents=0, corruption_level=0, 
verbose=False, - eta=0.01, mu=0.9, n_hidden=100, word_vec_len=10, pos_vec_len=10): + eta=0.01, mu=0.9, n_hidden=100, + nv_word=10, nv_tag=10, nv_label=10): dep_model_dir = path.join(model_dir, 'deps') pos_model_dir = path.join(model_dir, 'pos') ner_model_dir = path.join(model_dir, 'ner') @@ -99,8 +100,15 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', os.mkdir(ner_model_dir) setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir) - Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, - labels=Language.ParserTransitionSystem.get_labels(gold_tuples)) + Config.write(dep_model_dir, 'config', + seed=seed, + features=feat_set, + labels=Language.ParserTransitionSystem.get_labels(gold_tuples), + vector_lengths=(nv_word, nv_tag, nv_label), + hidden_nodes=n_hidden, + eta=eta, + mu=mu + ) Config.write(ner_model_dir, 'config', features='ner', seed=seed, labels=Language.EntityTransitionSystem.get_labels(gold_tuples), beam_width=0) @@ -110,16 +118,17 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', nlp = Language(data_dir=model_dir) - def make_model(n_classes, input_spec, model_dir): - print input_spec - n_in = sum(n_cols * len(fields) for (n_cols, fields) in input_spec) + def make_model(n_classes, (words, tags, labels), model_dir): + n_in = (nv_word * len(words)) + \ + (nv_tag * len(tags)) + \ + (nv_label * len(labels)) print 'Compiling' debug, train_func, predict_func = compile_theano_model(n_classes, n_hidden, n_in, 0.0, 0.0) print 'Done' return TheanoModel( n_classes, - input_spec, + ((nv_word, words), (nv_tag, tags), (nv_label, labels)), train_func, predict_func, model_loc=model_dir, @@ -226,14 +235,23 @@ def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None): n_sents=("Number of training sentences", "option", "n", int), n_iter=("Number of training iterations", "option", "i", int), verbose=("Verbose error reporting", "flag", "v", bool), - debug=("Debug mode", "flag", "d", bool), + + nv_word=("Word vector length", "option", "W", int), + nv_tag=("Tag vector length", "option", "T", int), + nv_label=("Label vector length", "option", "L", int), + nv_hidden=("Hidden nodes length", "option", "H", int), + eta=("Learning rate", "option", "E", float), + mu=("Momentum", "option", "M", float), ) def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, - debug=False, corruption_level=0.0, gold_preproc=False, beam_width=1, + corruption_level=0.0, gold_preproc=False, + nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10, + eta=0.1, mu=0.9, eval_only=False): gold_train = list(read_json_file(train_loc)) nlp = train(English, gold_train, model_dir, feat_set='embed', + nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, gold_preproc=gold_preproc, n_sents=n_sents, corruption_level=corruption_level, n_iter=n_iter, verbose=verbose) From ed40a8380ee4289eadae9b98da4e61c337b6ab01 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 Jun 2015 04:18:47 +0200 Subject: [PATCH 08/30] * Remove hard-coding of vector lengths --- spacy/_theano.pyx | 4 ++-- spacy/syntax/parser.pyx | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx index b791c4f42..965ee84d7 100644 --- a/spacy/_theano.pyx +++ b/spacy/_theano.pyx @@ -28,7 +28,7 @@ cdef class TheanoModel(Model): self.model_loc = model_loc def predict(self, Example eg): - self.input_layer.fill(eg.embeddings, eg.atoms) + self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=True) 
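        # use_avg=True selects the averaged embedding weights for prediction,
        # while train() below passes use_avg=False so updates apply to the
        # raw parameters.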
theano_scores = self.predict_func(eg.embeddings)[0] cdef int i for i in range(self.n_classes): @@ -37,7 +37,7 @@ cdef class TheanoModel(Model): self.n_classes) def train(self, Example eg): - self.input_layer.fill(eg.embeddings, eg.atoms) + self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=False) theano_scores, update, y = self.train_func(eg.embeddings, eg.costs, self.eta) self.input_layer.update(update, eg.atoms, self.t, self.eta, self.mu) for i in range(self.n_classes): diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 66d598b88..797ee1e56 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -53,7 +53,7 @@ def get_templates(name): elif name == 'debug': return pf.unigrams elif name.startswith('embed'): - return ((10, pf.words), (10, pf.tags), (10, pf.labels)) + return (pf.words, pf.tags, pf.labels) else: return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \ pf.tree_shape + pf.trigrams) From 65ac38919135612b319de1d6f183558d13a0f52c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 28 Jun 2015 01:29:37 +0200 Subject: [PATCH 09/30] * whitespace --- bin/parser/nn_train.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py index e0ae846b5..33ad8a8a9 100755 --- a/bin/parser/nn_train.py +++ b/bin/parser/nn_train.py @@ -84,7 +84,7 @@ def _merge_sents(sents): def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0, gold_preproc=False, n_sents=0, corruption_level=0, verbose=False, - eta=0.01, mu=0.9, n_hidden=100, + eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10): dep_model_dir = path.join(model_dir, 'deps') pos_model_dir = path.join(model_dir, 'pos') @@ -105,7 +105,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', features=feat_set, labels=Language.ParserTransitionSystem.get_labels(gold_tuples), vector_lengths=(nv_word, nv_tag, nv_label), - hidden_nodes=n_hidden, + hidden_nodes=nv_hidden, eta=eta, mu=mu ) @@ -123,7 +123,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', (nv_tag * len(tags)) + \ (nv_label * len(labels)) print 'Compiling' - debug, train_func, predict_func = compile_theano_model(n_classes, n_hidden, + debug, train_func, predict_func = compile_theano_model(n_classes, nv_hidden, n_in, 0.0, 0.0) print 'Done' return TheanoModel( @@ -251,7 +251,7 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbos gold_train = list(read_json_file(train_loc)) nlp = train(English, gold_train, model_dir, feat_set='embed', - nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, + nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, nv_hidden=nv_hidden, gold_preproc=gold_preproc, n_sents=n_sents, corruption_level=corruption_level, n_iter=n_iter, verbose=verbose) From bf33598b34d18f2766290cb1163a5c362b63cc9d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 22:55:58 +0200 Subject: [PATCH 10/30] * Work on a theano-driven model for the parser --- spacy/_theano.pyx | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 spacy/_theano.pyx diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx new file mode 100644 index 000000000..1a1224596 --- /dev/null +++ b/spacy/_theano.pyx @@ -0,0 +1,44 @@ +from thinc.example cimport Example + + +cdef class TheanoModel(Model): + def __init__(self, n_classes, input_layer, train_func, predict_func, model_loc=None): + if model_loc is not None and 
path.isdir(model_loc): + model_loc = path.join(model_loc, 'model') + self.n_classes = n_classes + + tables = [] + lengths = [] + for window_size, n_dims, vocab_size in input_structure: + tables.append(EmbeddingTable(n_dims, vocab_size, initializer)) + lengths.append(window_size) + + self.input_layer = InputLayer(lengths, tables) + + self.train_func = train_func + self.predict_func = predict_func + + self.model_loc = model_loc + if self.model_loc and path.exists(self.model_loc): + self._model.load(self.model_loc, freq_thresh=0) + + def train(self, Instance eg): + pass + + def predict(self, Instance eg): + + cdef const weight_t* score(self, atom_t* context) except NULL: + self.set_scores(self._scores, context) + return self._scores + + cdef int set_scores(self, weight_t* scores, atom_t* context) except -1: + # TODO f(context) --> Values + self._input_layer.fill(self._x, self._values, use_avg=False) + theano_scores = self._predict(self._x) + for i in range(self.n_classes): + output[i] = theano_scores[i] + + cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1: + # TODO f(context) --> Values + self._input_layer.fill(self._x, self._values, use_avg=False) + From 75aeccc0644c09dcd59d126cc1f627c2823bbce0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 26 Jun 2015 06:25:36 +0200 Subject: [PATCH 11/30] * Rejig parser interface to use new thinc.api.Example class, in prep of theano model. Comment out beam search --- spacy/_ml.pyx | 36 ++++++ spacy/syntax/arc_eager.pyx | 29 ++--- spacy/syntax/ner.pyx | 21 ---- spacy/syntax/parser.pxd | 3 - spacy/syntax/parser.pyx | 178 ++++++++++++----------------- spacy/syntax/transition_system.pxd | 8 +- spacy/syntax/transition_system.pyx | 35 ++---- 7 files changed, 132 insertions(+), 178 deletions(-) diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx index be647c2dd..df66a1791 100644 --- a/spacy/_ml.pyx +++ b/spacy/_ml.pyx @@ -10,6 +10,7 @@ import cython import numpy.random from thinc.features cimport Feature, count_feats +from thinc.api cimport Example cdef int arg_max(const weight_t* scores, const int n_classes) nogil: @@ -23,6 +24,30 @@ cdef int arg_max(const weight_t* scores, const int n_classes) nogil: return best +cdef int arg_max_if_true(const weight_t* scores, const bint* is_valid, + const int n_classes) nogil: + cdef int i + cdef int best = 0 + cdef weight_t mode = -900000 + for i in range(n_classes): + if is_valid[i] and scores[i] > mode: + mode = scores[i] + best = i + return best + + +cdef int arg_max_if_zero(const weight_t* scores, const int* costs, + const int n_classes) nogil: + cdef int i + cdef int best = 0 + cdef weight_t mode = -900000 + for i in range(n_classes): + if costs[i] == 0 and scores[i] > mode: + mode = scores[i] + best = i + return best + + cdef class Model: def __init__(self, n_classes, templates, model_loc=None): if model_loc is not None and path.isdir(model_loc): @@ -34,6 +59,17 @@ cdef class Model: if self.model_loc and path.exists(self.model_loc): self._model.load(self.model_loc, freq_thresh=0) + def predict(self, Example eg): + self.set_scores(eg.scores, eg.atoms) + eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.n_classes) + + def train(self, Example eg): + self.set_scores(eg.scores, eg.atoms) + eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.n_classes) + eg.best = arg_max_if_zero(eg.scores, eg.costs, self.n_classes) + eg.cost = eg.costs[eg.guess] + self.update(eg.atoms, eg.guess, eg.best, eg.cost) + cdef const weight_t* score(self, atom_t* context) except NULL: cdef int 
n_feats feats = self._extractor.get_feats(context, &n_feats) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 29e62cb4e..a83e19ec2 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -398,7 +398,8 @@ cdef class ArcEager(TransitionSystem): n_valid += output[i] assert n_valid >= 1 - cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1: + cdef int set_costs(self, bint* is_valid, int* costs, + StateClass stcls, GoldParse gold) except -1: cdef int i, move, label cdef label_cost_func_t[N_MOVES] label_cost_funcs cdef move_cost_func_t[N_MOVES] move_cost_funcs @@ -423,30 +424,14 @@ cdef class ArcEager(TransitionSystem): n_gold = 0 for i in range(self.n_moves): if self.c[i].is_valid(stcls, self.c[i].label): + is_valid[i] = True move = self.c[i].move label = self.c[i].label if move_costs[move] == -1: move_costs[move] = move_cost_funcs[move](stcls, &gold.c) - output[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label) - n_gold += output[i] == 0 + costs[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label) + n_gold += costs[i] == 0 else: - output[i] = 9000 + is_valid[i] = False + costs[i] = 9000 assert n_gold >= 1 - - cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *: - cdef bint[N_MOVES] is_valid - is_valid[SHIFT] = Shift.is_valid(stcls, -1) - is_valid[REDUCE] = Reduce.is_valid(stcls, -1) - is_valid[LEFT] = LeftArc.is_valid(stcls, -1) - is_valid[RIGHT] = RightArc.is_valid(stcls, -1) - is_valid[BREAK] = Break.is_valid(stcls, -1) - cdef Transition best - cdef weight_t score = MIN_SCORE - cdef int i - for i in range(self.n_moves): - if scores[i] > score and is_valid[self.c[i].move]: - best = self.c[i] - score = scores[i] - assert best.clas < self.n_moves - assert score > MIN_SCORE, (stcls.stack_depth(), stcls.buffer_length(), stcls.is_final(), stcls._b_i, stcls.length) - return best diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 4a47a20a8..b145df7ac 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -128,27 +128,6 @@ cdef class BiluoPushDown(TransitionSystem): raise Exception(move) return t - cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *: - cdef int best = -1 - cdef weight_t score = -90000 - cdef const Transition* m - cdef int i - for i in range(self.n_moves): - m = &self.c[i] - if m.is_valid(stcls, m.label) and scores[i] > score: - best = i - score = scores[i] - assert best >= 0 - cdef Transition t = self.c[best] - t.score = score - return t - - cdef int set_valid(self, bint* output, StateClass stcls) except -1: - cdef int i - for i in range(self.n_moves): - m = &self.c[i] - output[i] = m.is_valid(stcls, m.label) - cdef class Missing: @staticmethod diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd index 103ff9c02..11ac6bbb8 100644 --- a/spacy/syntax/parser.pxd +++ b/spacy/syntax/parser.pxd @@ -11,6 +11,3 @@ cdef class Parser: cdef readonly object cfg cdef readonly Model model cdef readonly TransitionSystem moves - - cdef int _greedy_parse(self, Tokens tokens) except -1 - cdef int _beam_parse(self, Tokens tokens) except -1 diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index e36d10a38..2f6c3cd98 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -19,17 +19,10 @@ from cymem.cymem cimport Pool, Address from murmurhash.mrmr cimport hash64 from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t - from util import Config -from 
thinc.features cimport Extractor -from thinc.features cimport Feature -from thinc.features cimport count_feats +from thinc.api cimport Example -from thinc.learner cimport LinearModel - -from thinc.search cimport Beam -from thinc.search cimport MaxViolation from ..tokens cimport Tokens, TokenC from ..strings cimport StringStore @@ -72,35 +65,86 @@ cdef class Parser: self.model = Model(self.moves.n_moves, templates, model_dir) def __call__(self, Tokens tokens): - if self.cfg.get('beam_width', 1) < 1: - self._greedy_parse(tokens) - else: - self._beam_parse(tokens) + cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) + self.moves.initialize_state(stcls) + + cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE) + while not stcls.is_final(): + eg.wipe() + fill_context(eg.atoms, stcls) + self.moves.set_valid(eg.is_valid, stcls) + + self.model.predict(eg) + + self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label) + self.moves.finalize_state(stcls) + tokens.set_parse(stcls._sent) def train(self, Tokens tokens, GoldParse gold): self.moves.preprocess_gold(gold) - if self.cfg.beam_width < 1: - return self._greedy_train(tokens, gold) - else: - return self._beam_train(tokens, gold) - - cdef int _greedy_parse(self, Tokens tokens) except -1: - cdef atom_t[CONTEXT_SIZE] context - cdef int n_feats - cdef Pool mem = Pool() cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) self.moves.initialize_state(stcls) - cdef Transition guess - words = [w.orth_ for w in tokens] + cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE) + cdef int cost = 0 while not stcls.is_final(): - fill_context(context, stcls) - scores = self.model.score(context) - guess = self.moves.best_valid(scores, stcls) - #print self.moves.move_name(guess.move, guess.label), stcls.print_state(words) - guess.do(stcls, guess.label) - assert stcls._s_i >= 0 - self.moves.finalize_state(stcls) - tokens.set_parse(stcls._sent) + eg.wipe() + fill_context(eg.atoms, stcls) + self.moves.set_costs(eg.is_valid, eg.costs, stcls, gold) + + self.model.train(eg) + + self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label) + cost += eg.cost + return cost + + +# These are passed as callbacks to thinc.search.Beam +""" +cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: + dest = _dest + src = _src + moves = _moves + dest.clone(src) + moves[clas].do(dest, moves[clas].label) + + +cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: + cdef StateClass st = StateClass.init(tokens, length) + st.fast_forward() + Py_INCREF(st) + return st + + +cdef int _check_final_state(void* _state, void* extra_args) except -1: + return (_state).is_final() + + +def _cleanup(Beam beam): + for i in range(beam.width): + Py_XDECREF(beam._states[i].content) + Py_XDECREF(beam._parents[i].content) + +cdef hash_t _hash_state(void* _state, void* _) except 0: + return _state + + #state = _state + #cdef atom_t[10] rep + + #rep[0] = state.stack[0] if state.stack_len >= 1 else 0 + #rep[1] = state.stack[-1] if state.stack_len >= 2 else 0 + #rep[2] = state.stack[-2] if state.stack_len >= 3 else 0 + #rep[3] = state.i + #rep[4] = state.sent[state.stack[0]].l_kids if state.stack_len >= 1 else 0 + #rep[5] = state.sent[state.stack[0]].r_kids if state.stack_len >= 1 else 0 + #rep[6] = state.sent[state.stack[0]].dep if state.stack_len >= 1 else 0 + #rep[7] = state.sent[state.stack[-1]].dep if state.stack_len >= 2 else 0 + #if get_left(state, get_n0(state), 1) != NULL: + # rep[8] = 
get_left(state, get_n0(state), 1).dep + #else: + # rep[8] = 0 + #rep[9] = state.sent[state.i].l_kids + #return hash64(rep, sizeof(atom_t) * 10, 0) + cdef int _beam_parse(self, Tokens tokens) except -1: cdef Beam beam = Beam(self.moves.n_moves, self.cfg.beam_width) @@ -114,30 +158,6 @@ cdef class Parser: tokens.set_parse(state._sent) _cleanup(beam) - def _greedy_train(self, Tokens tokens, GoldParse gold): - cdef Pool mem = Pool() - cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) - self.moves.initialize_state(stcls) - - cdef int cost - cdef const Feature* feats - cdef const weight_t* scores - cdef Transition guess - cdef Transition best - cdef atom_t[CONTEXT_SIZE] context - loss = 0 - words = [w.orth_ for w in tokens] - history = [] - while not stcls.is_final(): - fill_context(context, stcls) - scores = self.model.score(context) - guess = self.moves.best_valid(scores, stcls) - best = self.moves.best_gold(scores, stcls, gold) - cost = guess.get_cost(stcls, &gold.c, guess.label) - self.model.update(context, guess.clas, best.clas, cost) - guess.do(stcls, guess.label) - loss += cost - return loss def _beam_train(self, Tokens tokens, GoldParse gold_parse): cdef Beam pred = Beam(self.moves.n_moves, self.cfg.beam_width) @@ -200,50 +220,4 @@ cdef class Parser: count_feats(counts[clas], feats, n_feats, inc) self.moves.c[clas].do(stcls, self.moves.c[clas].label) - -# These are passed as callbacks to thinc.search.Beam - -cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: - dest = _dest - src = _src - moves = _moves - dest.clone(src) - moves[clas].do(dest, moves[clas].label) - - -cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: - cdef StateClass st = StateClass.init(tokens, length) - st.fast_forward() - Py_INCREF(st) - return st - - -cdef int _check_final_state(void* _state, void* extra_args) except -1: - return (_state).is_final() - - -def _cleanup(Beam beam): - for i in range(beam.width): - Py_XDECREF(beam._states[i].content) - Py_XDECREF(beam._parents[i].content) - -cdef hash_t _hash_state(void* _state, void* _) except 0: - return _state - - #state = _state - #cdef atom_t[10] rep - - #rep[0] = state.stack[0] if state.stack_len >= 1 else 0 - #rep[1] = state.stack[-1] if state.stack_len >= 2 else 0 - #rep[2] = state.stack[-2] if state.stack_len >= 3 else 0 - #rep[3] = state.i - #rep[4] = state.sent[state.stack[0]].l_kids if state.stack_len >= 1 else 0 - #rep[5] = state.sent[state.stack[0]].r_kids if state.stack_len >= 1 else 0 - #rep[6] = state.sent[state.stack[0]].dep if state.stack_len >= 1 else 0 - #rep[7] = state.sent[state.stack[-1]].dep if state.stack_len >= 2 else 0 - #if get_left(state, get_n0(state), 1) != NULL: - # rep[8] = get_left(state, get_n0(state), 1).dep - #else: - # rep[8] = 0 - #rep[9] = state.sent[state.i].l_kids - #return hash64(rep, sizeof(atom_t) * 10, 0) +""" diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index d9bd2b3e6..35f0ada30 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -46,9 +46,5 @@ cdef class TransitionSystem: cdef int set_valid(self, bint* output, StateClass state) except -1 - cdef int set_costs(self, int* output, StateClass state, GoldParse gold) except -1 - - cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except * - - cdef Transition best_gold(self, const weight_t* scores, StateClass state, - GoldParse gold) except * + cdef int set_costs(self, bint* is_valid, int* costs, + 
StateClass state, GoldParse gold) except -1 diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 927498cba..b13c75ba3 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -43,30 +43,17 @@ cdef class TransitionSystem: cdef Transition init_transition(self, int clas, int move, int label) except *: raise NotImplementedError - cdef Transition best_valid(self, const weight_t* scores, StateClass s) except *: - raise NotImplementedError - - cdef int set_valid(self, bint* output, StateClass state) except -1: - raise NotImplementedError - - cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1: + cdef int set_valid(self, bint* is_valid, StateClass stcls) except -1: cdef int i for i in range(self.n_moves): - if self.c[i].is_valid(stcls, self.c[i].label): - output[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label) + is_valid[i] = self.c[i].is_valid(stcls, self.c[i].label) + + cdef int set_costs(self, bint* is_valid, int* costs, + StateClass stcls, GoldParse gold) except -1: + cdef int i + self.set_valid(is_valid, stcls) + for i in range(self.n_moves): + if is_valid[i]: + costs[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label) else: - output[i] = 9000 - - cdef Transition best_gold(self, const weight_t* scores, StateClass stcls, - GoldParse gold) except *: - cdef Transition best - cdef weight_t score = MIN_SCORE - cdef int i - for i in range(self.n_moves): - if self.c[i].is_valid(stcls, self.c[i].label): - cost = self.c[i].get_cost(stcls, &gold.c, self.c[i].label) - if scores[i] > score and cost == 0: - best = self.c[i] - score = scores[i] - assert score > MIN_SCORE - return best + costs[i] = 9000 From 9282a8e72c567946657cf484710f60470d6e0914 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 26 Jun 2015 13:51:39 +0200 Subject: [PATCH 12/30] * Prepare for new models to be plugged in by using Example class --- setup.py | 3 +- spacy/_ml.pxd | 5 ++++ spacy/_ml.pyx | 18 +++++++----- spacy/_theano.pyx | 64 ++++++++++++++++++++--------------------- spacy/syntax/parser.pyx | 12 ++++---- 5 files changed, 56 insertions(+), 46 deletions(-) diff --git a/setup.py b/setup.py index 5e2c9f480..48e2dfe25 100644 --- a/setup.py +++ b/setup.py @@ -151,7 +151,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', 'spacy.lexeme', 'spacy.vocab', 'spacy.tokens', 'spacy.spans', 'spacy.morphology', 'spacy.syntax.stateclass', - 'spacy._ml', 'spacy.tokenizer', 'spacy.en.attrs', + 'spacy._ml', 'spacy._theano', + 'spacy.tokenizer', 'spacy.en.attrs', 'spacy.en.pos', 'spacy.syntax.parser', 'spacy.syntax.transition_system', 'spacy.syntax.arc_eager', diff --git a/spacy/_ml.pxd b/spacy/_ml.pxd index add162e69..3562b4a32 100644 --- a/spacy/_ml.pxd +++ b/spacy/_ml.pxd @@ -14,9 +14,14 @@ from .tokens cimport Tokens cdef int arg_max(const weight_t* scores, const int n_classes) nogil +cdef int arg_max_if_true(const weight_t* scores, const int* is_valid, int n_classes) nogil + +cdef int arg_max_if_zero(const weight_t* scores, const int* costs, int n_classes) nogil + cdef class Model: cdef int n_classes + cdef int n_feats cdef const weight_t* score(self, atom_t* context) except NULL cdef int set_scores(self, weight_t* scores, atom_t* context) except -1 diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx index df66a1791..993d1a8ac 100644 --- a/spacy/_ml.pyx +++ b/spacy/_ml.pyx @@ -24,7 +24,7 @@ cdef int arg_max(const weight_t* scores, const int n_classes) nogil: return best -cdef int arg_max_if_true(const weight_t* scores, 
const bint* is_valid, +cdef int arg_max_if_true(const weight_t* scores, const int* is_valid, const int n_classes) nogil: cdef int i cdef int best = 0 @@ -54,21 +54,25 @@ cdef class Model: model_loc = path.join(model_loc, 'model') self.n_classes = n_classes self._extractor = Extractor(templates) + self.n_feats = self._extractor.n_templ self._model = LinearModel(n_classes, self._extractor.n_templ) self.model_loc = model_loc if self.model_loc and path.exists(self.model_loc): self._model.load(self.model_loc, freq_thresh=0) def predict(self, Example eg): - self.set_scores(eg.scores, eg.atoms) - eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.n_classes) + self.set_scores(eg.scores.data, eg.atoms.data) + eg.guess = arg_max_if_true(eg.scores.data, eg.is_valid.data, + self.n_classes) def train(self, Example eg): - self.set_scores(eg.scores, eg.atoms) - eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.n_classes) - eg.best = arg_max_if_zero(eg.scores, eg.costs, self.n_classes) + self.set_scores(eg.scores.data, eg.atoms.data) + eg.guess = arg_max_if_true(eg.scores.data, + eg.is_valid.data, self.n_classes) + eg.best = arg_max_if_zero(eg.scores.data, eg.costs.data, + self.n_classes) eg.cost = eg.costs[eg.guess] - self.update(eg.atoms, eg.guess, eg.best, eg.cost) + self.update(eg.atoms.data, eg.guess, eg.best, eg.cost) cdef const weight_t* score(self, atom_t* context) except NULL: cdef int n_feats diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx index 1a1224596..702208d18 100644 --- a/spacy/_theano.pyx +++ b/spacy/_theano.pyx @@ -1,44 +1,44 @@ -from thinc.example cimport Example +from thinc.api cimport Example +from thinc.typedefs cimport weight_t + +from ._ml cimport arg_max_if_true +from ._ml cimport arg_max_if_zero + +import numpy +from os import path cdef class TheanoModel(Model): - def __init__(self, n_classes, input_layer, train_func, predict_func, model_loc=None): + def __init__(self, n_classes, input_spec, train_func, predict_func, model_loc=None): if model_loc is not None and path.isdir(model_loc): model_loc = path.join(model_loc, 'model') - self.n_classes = n_classes - - tables = [] - lengths = [] - for window_size, n_dims, vocab_size in input_structure: - tables.append(EmbeddingTable(n_dims, vocab_size, initializer)) - lengths.append(window_size) - - self.input_layer = InputLayer(lengths, tables) + self.eta = 0.001 + self.mu = 0.9 + self.t = 1 + initializer = lambda: 0.2 * numpy.random.uniform(-1.0, 1.0) + self.input_layer = InputLayer(input_spec, initializer) self.train_func = train_func self.predict_func = predict_func + self.n_classes = n_classes + self.n_feats = len(self.input_layer) self.model_loc = model_loc - if self.model_loc and path.exists(self.model_loc): - self._model.load(self.model_loc, freq_thresh=0) - - def train(self, Instance eg): - pass - - def predict(self, Instance eg): - - cdef const weight_t* score(self, atom_t* context) except NULL: - self.set_scores(self._scores, context) - return self._scores - - cdef int set_scores(self, weight_t* scores, atom_t* context) except -1: - # TODO f(context) --> Values - self._input_layer.fill(self._x, self._values, use_avg=False) - theano_scores = self._predict(self._x) + + def predict(self, Example eg): + self.input_layer.fill(eg.embeddings, eg.atoms) + theano_scores = self.predict_func(eg.embeddings) + cdef int i for i in range(self.n_classes): - output[i] = theano_scores[i] + eg.scores[i] = theano_scores[i] + eg.guess = arg_max_if_true(eg.scores.data, eg.is_valid.data, + self.n_classes) - cdef int update(self, atom_t* 
context, class_t guess, class_t gold, int cost) except -1: - # TODO f(context) --> Values - self._input_layer.fill(self._x, self._values, use_avg=False) - + def train(self, Example eg): + self.predict(eg) + update, t, eta, mu = self.train_func(eg.embeddings, eg.scores, eg.costs) + self.input_layer.update(eg.atoms, update, self.t, self.eta, self.mu) + eg.best = arg_max_if_zero(eg.scores.data, eg.costs.data, + self.n_classes) + eg.cost = eg.costs[eg.guess] + self.t += 1 diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 2f6c3cd98..8af7ab25d 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -68,11 +68,11 @@ cdef class Parser: cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) self.moves.initialize_state(stcls) - cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE) + cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, self.model.n_feats) while not stcls.is_final(): eg.wipe() - fill_context(eg.atoms, stcls) - self.moves.set_valid(eg.is_valid, stcls) + fill_context(eg.atoms.data, stcls) + self.moves.set_valid(eg.is_valid.data, stcls) self.model.predict(eg) @@ -84,12 +84,12 @@ cdef class Parser: self.moves.preprocess_gold(gold) cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) self.moves.initialize_state(stcls) - cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE) + cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, self.model.n_feats) cdef int cost = 0 while not stcls.is_final(): eg.wipe() - fill_context(eg.atoms, stcls) - self.moves.set_costs(eg.is_valid, eg.costs, stcls, gold) + fill_context(eg.atoms.data, stcls) + self.moves.set_costs(eg.is_valid.data, eg.costs.data, stcls, gold) self.model.train(eg) From 897dd0dd0b4c1f662c36960d3367787fd658ce1d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 28 Jun 2015 11:36:11 +0200 Subject: [PATCH 13/30] * Merge changes, and adjust Example to use memoryview --- bin/parser/nn_train.py | 255 +++++++++++++++++++++ spacy/_bu_nn.pyx | 490 ++++++++++++++++++++++++++++++++++++++++ spacy/_ml.pyx | 14 +- spacy/_nn.py | 3 + spacy/_nn.pyx | 146 ++++++++++++ spacy/_theano.pxd | 13 ++ spacy/_theano.pyx | 23 +- spacy/syntax/joint.pxd | 17 ++ spacy/syntax/joint.pyx | 452 ++++++++++++++++++++++++++++++++++++ spacy/syntax/parser.pyx | 15 +- 10 files changed, 1408 insertions(+), 20 deletions(-) create mode 100755 bin/parser/nn_train.py create mode 100644 spacy/_bu_nn.pyx create mode 100644 spacy/_nn.py create mode 100644 spacy/_nn.pyx create mode 100644 spacy/_theano.pxd create mode 100644 spacy/syntax/joint.pxd create mode 100644 spacy/syntax/joint.pyx diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py new file mode 100755 index 000000000..375996f4f --- /dev/null +++ b/bin/parser/nn_train.py @@ -0,0 +1,255 @@ +#!/usr/bin/env python +from __future__ import division +from __future__ import unicode_literals + +import os +from os import path +import shutil +import codecs +import random + +import plac +import cProfile +import pstats +import re + +import spacy.util +from spacy.en import English +from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir + +from spacy.syntax.util import Config +from spacy.gold import read_json_file +from spacy.gold import GoldParse + +from spacy.scorer import Scorer + +from thinc.theano_nn import compile_theano_model + +from spacy.syntax.parser import Parser +from spacy._theano import TheanoModel + + +def _corrupt(c, noise_level): + if random.random() >= noise_level: + return c + elif c == ' ': + return 
'\n' + elif c == '\n': + return ' ' + elif c in ['.', "'", "!", "?"]: + return '' + else: + return c.lower() + + +def add_noise(orig, noise_level): + if random.random() >= noise_level: + return orig + elif type(orig) == list: + corrupted = [_corrupt(word, noise_level) for word in orig] + corrupted = [w for w in corrupted if w] + return corrupted + else: + return ''.join(_corrupt(c, noise_level) for c in orig) + + +def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False): + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + else: + tokens = nlp.tokenizer(raw_text) + nlp.tagger(tokens) + nlp.entity(tokens) + nlp.parser(tokens) + gold = GoldParse(tokens, annot_tuples) + scorer.score(tokens, gold, verbose=verbose) + + +def _merge_sents(sents): + m_deps = [[], [], [], [], [], []] + m_brackets = [] + i = 0 + for (ids, words, tags, heads, labels, ner), brackets in sents: + m_deps[0].extend(id_ + i for id_ in ids) + m_deps[1].extend(words) + m_deps[2].extend(tags) + m_deps[3].extend(head + i for head in heads) + m_deps[4].extend(labels) + m_deps[5].extend(ner) + m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets) + i += len(ids) + return [(m_deps, m_brackets)] + + +def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', + seed=0, gold_preproc=False, n_sents=0, corruption_level=0, + verbose=False, + eta=0.01, mu=0.9, n_hidden=100, word_vec_len=10, pos_vec_len=10): + dep_model_dir = path.join(model_dir, 'deps') + pos_model_dir = path.join(model_dir, 'pos') + ner_model_dir = path.join(model_dir, 'ner') + if path.exists(dep_model_dir): + shutil.rmtree(dep_model_dir) + if path.exists(pos_model_dir): + shutil.rmtree(pos_model_dir) + if path.exists(ner_model_dir): + shutil.rmtree(ner_model_dir) + os.mkdir(dep_model_dir) + os.mkdir(pos_model_dir) + os.mkdir(ner_model_dir) + setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir) + + Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, + labels=Language.ParserTransitionSystem.get_labels(gold_tuples)) + Config.write(ner_model_dir, 'config', features='ner', seed=seed, + labels=Language.EntityTransitionSystem.get_labels(gold_tuples), + beam_width=0) + + if n_sents > 0: + gold_tuples = gold_tuples[:n_sents] + + nlp = Language(data_dir=model_dir) + + def make_model(n_classes, input_spec, model_dir): + print input_spec + n_in = sum(n_cols * len(fields) for (n_cols, fields) in input_spec) + print 'Compiling' + debug, train_func, predict_func = compile_theano_model(n_classes, n_hidden, + n_in, 0.0, 0.0) + print 'Done' + return TheanoModel( + n_classes, + input_spec, + train_func, + predict_func, + model_loc=model_dir, + debug=debug) + + nlp._parser = Parser(nlp.vocab.strings, dep_model_dir, nlp.ParserTransitionSystem, + make_model) + + print "Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %" + for itn in range(n_iter): + scorer = Scorer() + loss = 0 + for raw_text, sents in gold_tuples: + if gold_preproc: + raw_text = None + else: + sents = _merge_sents(sents) + for annot_tuples, ctnt in sents: + if len(annot_tuples[1]) == 1: + continue + score_model(scorer, nlp, raw_text, annot_tuples, + verbose=verbose if itn >= 2 else False) + if raw_text is None: + words = add_noise(annot_tuples[1], corruption_level) + tokens = nlp.tokenizer.tokens_from_list(words) + else: + raw_text = add_noise(raw_text, corruption_level) + tokens = nlp.tokenizer(raw_text) + nlp.tagger(tokens) + gold = GoldParse(tokens, annot_tuples, make_projective=True) + if not 
gold.is_projective: + raise Exception( + "Non-projective sentence in training, after we should " + "have enforced projectivity: %s" % annot_tuples + ) + loss += nlp.parser.train(tokens, gold) + nlp.entity.train(tokens, gold) + nlp.tagger.train(tokens, gold.tags) + random.shuffle(gold_tuples) + print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f, + scorer.tags_acc, + scorer.token_acc) + nlp.parser.model.end_training() + nlp.entity.model.end_training() + nlp.tagger.model.end_training() + nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt')) + return nlp + + +def evaluate(nlp, gold_tuples, gold_preproc=True): + scorer = Scorer() + for raw_text, sents in gold_tuples: + if gold_preproc: + raw_text = None + else: + sents = _merge_sents(sents) + for annot_tuples, brackets in sents: + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + nlp.tagger(tokens) + nlp.entity(tokens) + nlp.parser(tokens) + else: + tokens = nlp(raw_text, merge_mwes=False) + gold = GoldParse(tokens, annot_tuples) + scorer.score(tokens, gold) + return scorer + + +def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None): + nlp = Language(data_dir=model_dir) + if beam_width is not None: + nlp.parser.cfg.beam_width = beam_width + gold_tuples = read_json_file(dev_loc) + scorer = Scorer() + out_file = codecs.open(out_loc, 'w', 'utf8') + for raw_text, sents in gold_tuples: + sents = _merge_sents(sents) + for annot_tuples, brackets in sents: + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + nlp.tagger(tokens) + nlp.entity(tokens) + nlp.parser(tokens) + else: + tokens = nlp(raw_text, merge_mwes=False) + gold = GoldParse(tokens, annot_tuples) + scorer.score(tokens, gold, verbose=False) + for t in tokens: + out_file.write( + '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_) + ) + return scorer + + +@plac.annotations( + train_loc=("Location of training file or directory"), + dev_loc=("Location of development file or directory"), + model_dir=("Location of output model directory",), + eval_only=("Skip training, and only evaluate", "flag", "e", bool), + corruption_level=("Amount of noise to add to training data", "option", "c", float), + gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool), + out_loc=("Out location", "option", "o", str), + n_sents=("Number of training sentences", "option", "n", int), + n_iter=("Number of training iterations", "option", "i", int), + verbose=("Verbose error reporting", "flag", "v", bool), + debug=("Debug mode", "flag", "d", bool), +) +def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, + debug=False, corruption_level=0.0, gold_preproc=False, beam_width=1, + eval_only=False): + gold_train = list(read_json_file(train_loc)) + nlp = train(English, gold_train, model_dir, + feat_set='embed', + gold_preproc=gold_preproc, n_sents=n_sents, + corruption_level=corruption_level, n_iter=n_iter, + verbose=verbose) + #if out_loc: + # write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width) + scorer = evaluate(nlp, list(read_json_file(dev_loc)), gold_preproc=gold_preproc) + + print 'TOK', 100-scorer.token_acc + print 'POS', scorer.tags_acc + print 'UAS', scorer.uas + print 'LAS', scorer.las + + print 'NER P', scorer.ents_p + print 'NER R', scorer.ents_r + print 'NER F', scorer.ents_f + + +if __name__ == '__main__': + plac.call(main) diff --git a/spacy/_bu_nn.pyx b/spacy/_bu_nn.pyx new file mode 100644 
index 000000000..ae875b235
--- /dev/null
+++ b/spacy/_bu_nn.pyx
@@ -0,0 +1,490 @@
+"""Feed-forward neural network, using Theano."""
+
+import os
+import sys
+import time
+
+import numpy
+
+import theano
+import theano.tensor as T
+import gzip
+import cPickle
+
+
+def load_data(dataset):
+    ''' Loads the dataset
+
+    :type dataset: string
+    :param dataset: the path to the dataset (here MNIST)
+    '''
+
+    #############
+    # LOAD DATA #
+    #############
+
+    # Download the MNIST dataset if it is not present
+    data_dir, data_file = os.path.split(dataset)
+    if data_dir == "" and not os.path.isfile(dataset):
+        # Check if dataset is in the data directory.
+        new_path = os.path.join(
+            os.path.split(__file__)[0],
+            "..",
+            "data",
+            dataset
+        )
+        if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
+            dataset = new_path
+
+    if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':
+        import urllib
+        origin = (
+            'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
+        )
+        print 'Downloading data from %s' % origin
+        urllib.urlretrieve(origin, dataset)
+
+    print '... loading data'
+
+    # Load the dataset
+    f = gzip.open(dataset, 'rb')
+    train_set, valid_set, test_set = cPickle.load(f)
+    f.close()
+    # train_set, valid_set, test_set format: tuple(input, target)
+    # input is a numpy.ndarray of 2 dimensions (a matrix), each row
+    # corresponding to an example. target is a numpy.ndarray of 1 dimension
+    # (a vector) that has the same length as the number of rows in the input.
+    # It should give the target to the example with the same index in the
+    # input.
+
+    def shared_dataset(data_xy, borrow=True):
+        """ Function that loads the dataset into shared variables
+
+        The reason we store our dataset in shared variables is to allow
+        Theano to copy it into the GPU memory (when code is run on GPU).
+        Since copying data into the GPU is slow, copying a minibatch every
+        time one is needed (the default behaviour if the data is not in a
+        shared variable) would lead to a large decrease in performance.
+        """
+        data_x, data_y = data_xy
+        shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX),
+                                 borrow=borrow)
+        shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX),
+                                 borrow=borrow)
+        # When storing data on the GPU it has to be stored as floats,
+        # therefore we will store the labels as ``floatX`` as well
+        # (``shared_y`` does exactly that). But during our computations
+        # we need them as ints (we use labels as indices, and if they are
+        # floats it doesn't make sense), therefore instead of returning
+        # ``shared_y`` we will have to cast it to int. This little hack
+        # lets us get around this issue.
+        return shared_x, T.cast(shared_y, 'int32')
+
+    test_set_x, test_set_y = shared_dataset(test_set)
+    valid_set_x, valid_set_y = shared_dataset(valid_set)
+    train_set_x, train_set_y = shared_dataset(train_set)
+
+    rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y),
+            (test_set_x, test_set_y)]
+    return rval
+
+
+class LogisticRegression(object):
+    """Multi-class Logistic Regression Class
+
+    The logistic regression is fully described by a weight matrix :math:`W`
+    and bias vector :math:`b`. Classification is done by projecting data
+    points onto a set of hyperplanes, the distance to which is used to
+    determine a class membership probability.
+ """ + + def __init__(self, input, n_in, n_out): + """ Initialize the parameters of the logistic regression + + :type input: theano.tensor.TensorType + :param input: symbolic variable that describes the input of the + architecture (one minibatch) + + :type n_in: int + :param n_in: number of input units, the dimension of the space in + which the datapoints lie + + :type n_out: int + :param n_out: number of output units, the dimension of the space in + which the labels lie + + """ + # start-snippet-1 + # initialize with 0 the weights W as a matrix of shape (n_in, n_out) + self.W = theano.shared( + value=numpy.zeros((n_in, n_out), + dtype=theano.config.floatX + ), + name='W', + borrow=True + ) + # initialize the baises b as a vector of n_out 0s + self.b = theano.shared( + value=numpy.zeros( + (n_out,), + dtype=theano.config.floatX + ), + name='b', + borrow=True + ) + + # symbolic expression for computing the matrix of class-membership + # probabilities + # Where: + # W is a matrix where column-k represent the separation hyper plain for + # class-k + # x is a matrix where row-j represents input training sample-j + # b is a vector where element-k represent the free parameter of hyper + # plain-k + self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b) + + # symbolic description of how to compute prediction as class whose + # probability is maximal + self.y_pred = T.argmax(self.p_y_given_x, axis=1) + # end-snippet-1 + + # parameters of the model + self.params = [self.W, self.b] + + def neg_ll(self, y): + """Return the mean of the negative log-likelihood of the prediction + of this model under a given target distribution. + + .. math:: + + \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = + \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} + \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ + \ell (\theta=\{W,b\}, \mathcal{D}) + + :type y: theano.tensor.TensorType + :param y: corresponds to a vector that gives for each example the + correct label + + Note: we use the mean instead of the sum so that + the learning rate is less dependent on the batch size + """ + # start-snippet-2 + # y.shape[0] is (symbolically) the number of rows in y, i.e., + # number of examples (call it n) in the minibatch + # T.arange(y.shape[0]) is a symbolic vector which will contain + # [0,1,2,... n-1] T.log(self.p_y_given_x) is a matrix of + # Log-Probabilities (call it LP) with one row per example and + # one column per class LP[T.arange(y.shape[0]),y] is a vector + # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., + # LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is + # the mean (across minibatch examples) of the elements in v, + # i.e., the mean log-likelihood across the minibatch. 
+        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
+        # end-snippet-2
+
+    def errors(self, y):
+        """Return a float representing the number of errors in the minibatch
+        over the total number of examples of the minibatch; i.e. the zero-one
+        loss over the size of the minibatch
+
+        :type y: theano.tensor.TensorType
+        :param y: corresponds to a vector that gives for each example the
+                  correct label
+        """
+
+        # check if y has the same dimension as y_pred
+        if y.ndim != self.y_pred.ndim:
+            raise TypeError(
+                'y should have the same shape as self.y_pred',
+                ('y', y.type, 'y_pred', self.y_pred.type)
+            )
+        # check if y is of the correct datatype
+        if y.dtype.startswith('int'):
+            # the T.neq operator returns a vector of 0s and 1s, where 1
+            # represents a mistake in prediction
+            return T.mean(T.neq(self.y_pred, y))
+        else:
+            raise NotImplementedError()
+
+
+# start-snippet-1
+class HiddenLayer(object):
+    def __init__(self, rng, input, n_in, n_out, W=None, b=None,
+                 activation=T.tanh):
+        """
+        Typical hidden layer of an MLP: units are fully-connected and have
+        a sigmoidal activation function. Weight matrix W is of shape
+        (n_in, n_out) and the bias vector b is of shape (n_out,).
+
+        NOTE: The nonlinearity used here is tanh
+
+        Hidden unit activation is given by: tanh(dot(input, W) + b)
+
+        :type rng: numpy.random.RandomState
+        :param rng: a random number generator used to initialize weights
+
+        :type input: theano.tensor.dmatrix
+        :param input: a symbolic tensor of shape (n_examples, n_in)
+
+        :type n_in: int
+        :param n_in: dimensionality of input
+
+        :type n_out: int
+        :param n_out: number of hidden units
+
+        :type activation: theano.Op or function
+        :param activation: Non-linearity to be applied in the hidden
+                           layer
+        """
+        self.input = input
+        # end-snippet-1
+
+        # `W` is initialized with `W_values`, which is uniformly sampled
+        # from -sqrt(6./(n_in+n_out)) to sqrt(6./(n_in+n_out))
+        # for the tanh activation function.
+        # The output of uniform is converted using asarray to dtype
+        # theano.config.floatX so that the code is runnable on GPU.
+        # Note: optimal initialization of weights is dependent on the
+        # activation function used (among other things).
+        # For example, results presented in [Xavier10] suggest that you
+        # should use 4 times larger initial weights for sigmoid
+        # compared to tanh.
+        # We have no info for other functions, so we use the same as
+        # tanh.
+        if W is None:
+            W_values = numpy.asarray(
+                rng.uniform(
+                    low=-numpy.sqrt(6. / (n_in + n_out)),
+                    high=numpy.sqrt(6. / (n_in + n_out)),
+                    size=(n_in, n_out)
+                ),
+                dtype=theano.config.floatX
+            )
+            if activation == theano.tensor.nnet.sigmoid:
+                W_values *= 4
+
+            W = theano.shared(value=W_values, name='W', borrow=True)
+
+        if b is None:
+            b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
+            b = theano.shared(value=b_values, name='b', borrow=True)
+
+        self.W = W
+        self.b = b
+
+        lin_output = T.dot(input, self.W) + self.b
+        self.output = (
+            lin_output if activation is None
+            else activation(lin_output)
+        )
+        # parameters of the model
+        self.params = [self.W, self.b]
+
+
+# start-snippet-2
+class MLP(object):
+    """Multi-Layer Perceptron Class
+
+    A multilayer perceptron is a feedforward artificial neural network model
+    that has one or more layers of hidden units and nonlinear activations.
+    Intermediate layers usually have tanh or the sigmoid function as their
+    activation (defined here by a ``HiddenLayer`` class) while the
+    top layer is a softmax layer (defined here by a ``LogisticRegression``
+    class).
+ """ + + def __init__(self, rng, input, n_in, n_hidden, n_out): + """Initialize the parameters for the multilayer perceptron + + :type rng: numpy.random.RandomState + :param rng: a random number generator used to initialize weights + + :type input: theano.tensor.TensorType + :param input: symbolic variable that describes the input of the + architecture (one minibatch) + + :type n_in: int + :param n_in: number of input units, the dimension of the space in + which the datapoints lie + + :type n_hidden: int + :param n_hidden: number of hidden units + + :type n_out: int + :param n_out: number of output units, the dimension of the space in + which the labels lie + + """ + + # Since we are dealing with a one hidden layer MLP, this will translate + # into a HiddenLayer with a tanh activation function connected to the + # LogisticRegression layer; the activation function can be replaced by + # sigmoid or any other nonlinear function + self.hidden = HiddenLayer( + rng=rng, + input=input, + n_in=n_in, + n_out=n_hidden, + activation=T.tanh + ) + + # The logistic regression layer gets as input the hidden units + # of the hidden layer + self.maxent = LogisticRegression( + input=self.hidden.output, + n_in=n_hidden, + n_out=n_out + ) + # L1 norm ; one regularization option is to enforce L1 norm to + # be small + self.L1 = abs(self.hidden.W).sum() + abs(self.maxent.W).sum() + + # square of L2 norm ; one regularization option is to enforce + # square of L2 norm to be small + self.L2_sqr = (self.hidden.W ** 2).sum() + (self.maxent.W ** 2).sum() + + # negative log likelihood of the MLP is given by the negative + # log likelihood of the output of the model, computed in the + # logistic regression layer + self.neg_ll = self.maxent.neg_ll + # same holds for the function computing the number of errors + self.errors = self.maxent.errors + + # the parameters of the model are the parameters of the two layer it is + # made out of + self.params = self.hidden.params + self.maxent.params + + + + +def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, + dataset='mnist.pkl.gz', batch_size=1, n_hidden=500): + """ + Demonstrate stochastic gradient descent optimization for a multilayer + perceptron + + This is demonstrated on MNIST. + + :type learning_rate: float + :param learning_rate: learning rate used (factor for the stochastic + gradient + + :type L1_reg: float + :param L1_reg: L1-norm's weight when added to the cost (see + regularization) + + :type L2_reg: float + :param L2_reg: L2-norm's weight when added to the cost (see + regularization) + + :type n_epochs: int + :param n_epochs: maximal number of epochs to run the optimizer + + :type dataset: string + :param dataset: the path of the MNIST dataset file from + http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz + """ + datasets = load_data(dataset) + + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + + ###################### + # BUILD ACTUAL MODEL # + ###################### + print '... 
+
+    # allocate symbolic variables for the data
+    index = T.lscalar()    # index to a [mini]batch
+    x = T.matrix('x')      # the data is presented as rasterized images
+    y = T.ivector('y')     # the labels are presented as a 1D vector of
+                           # [int] labels
+
+    rng = numpy.random.RandomState(1234)
+
+    # construct the MLP class
+    mlp = MLP(
+        rng=rng,
+        input=x,
+        n_in=28 * 28,
+        n_hidden=n_hidden,
+        n_out=10
+    )
+
+    # the cost we minimize during training is the negative log likelihood of
+    # the model plus the regularization terms (L1 and L2); cost is expressed
+    # here symbolically
+
+    # compiling a Theano function that computes the mistakes that are made
+    # by the model on a minibatch
+    test_model = theano.function(
+        inputs=[index],
+        outputs=mlp.maxent.errors(y),
+        givens={
+            x: test_set_x[index:index+1],
+            y: test_set_y[index:index+1]
+        }
+    )
+
+    validate_model = theano.function(
+        inputs=[index],
+        outputs=mlp.maxent.errors(y),
+        givens={
+            x: valid_set_x[index:index+1],
+            y: valid_set_y[index:index+1]
+        }
+    )
+
+    # compute the gradient of cost with respect to theta (stored in params);
+    # the resulting gradients will be stored in a list gparams
+    cost = mlp.neg_ll(y) + L1_reg * mlp.L1 + L2_reg * mlp.L2_sqr
+    gparams = [T.grad(cost, param) for param in mlp.params]
+
+    # specify how to update the parameters of the model as a list of
+    # (variable, update expression) pairs
+
+    updates = [(mlp.params[i], mlp.params[i] - (learning_rate * gparams[i]))
+               for i in xrange(len(gparams))]
+
+    # compiling a Theano function `train_model` that returns the cost and,
+    # at the same time, updates the parameters of the model based on the
+    # rules defined in `updates`
+    train_model = theano.function(
+        inputs=[index],
+        outputs=cost,
+        updates=updates,
+        givens={
+            x: train_set_x[index:index+1],
+            y: train_set_y[index:index+1]
+        }
+    )
+    # end-snippet-5
+
+    ###############
+    # TRAIN MODEL #
+    ###############
+    print '... training'
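+    # With batch_size=1 and the `index:index+1` slices in the `givens`
+    # above, each call to train_model/validate_model sees exactly one
+    # example, so the epoch loop below is pure online (per-example) SGD.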
+
+    start_time = time.clock()
+
+    n_examples = train_set_x.get_value(borrow=True).shape[0]
+    n_dev_examples = valid_set_x.get_value(borrow=True).shape[0]
+    n_test_examples = test_set_x.get_value(borrow=True).shape[0]
+
+    for epoch in range(1, n_epochs+1):
+        for idx in xrange(n_examples):
+            train_model(idx)
+        # compute zero-one loss on validation set
+        error = numpy.mean(map(validate_model, xrange(n_dev_examples)))
+        print('epoch %i, validation error %f %%' % (epoch, error * 100))
+
+    end_time = time.clock()
+    print >> sys.stderr, ('The code for file ' +
+                          os.path.split(__file__)[1] +
+                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
+
+
+if __name__ == '__main__':
+    test_mlp()
diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx
index 993d1a8ac..cabc4318a 100644
--- a/spacy/_ml.pyx
+++ b/spacy/_ml.pyx
@@ -61,18 +61,18 @@ cdef class Model:
             self._model.load(self.model_loc, freq_thresh=0)
 
     def predict(self, Example eg):
-        self.set_scores(eg.scores.data, eg.atoms.data)
-        eg.guess = arg_max_if_true(eg.scores.data, eg.is_valid.data,
+        self.set_scores(&eg.scores[0], &eg.atoms[0])
+        eg.guess = arg_max_if_true(&eg.scores[0], &eg.is_valid[0],
                                    self.n_classes)
 
     def train(self, Example eg):
-        self.set_scores(eg.scores.data, eg.atoms.data)
-        eg.guess = arg_max_if_true(eg.scores.data,
-                                   eg.is_valid.data, self.n_classes)
-        eg.best = arg_max_if_zero(eg.scores.data, eg.costs.data,
+        self.set_scores(&eg.scores[0], &eg.atoms[0])
+        eg.guess = arg_max_if_true(&eg.scores[0],
+                                   &eg.is_valid[0], self.n_classes)
+        eg.best = arg_max_if_zero(&eg.scores[0], &eg.costs[0],
                                   self.n_classes)
         eg.cost = eg.costs[eg.guess]
-        self.update(eg.atoms.data, eg.guess, eg.best, eg.cost)
+        self.update(&eg.atoms[0], eg.guess, eg.best, eg.cost)
 
     cdef const weight_t* score(self, atom_t* context) except NULL:
         cdef int n_feats
diff --git a/spacy/_nn.py b/spacy/_nn.py
new file mode 100644
index 000000000..48dca390c
--- /dev/null
+++ b/spacy/_nn.py
@@ -0,0 +1,3 @@
+"""Feed-forward neural network, using Theano."""
+
+
diff --git a/spacy/_nn.pyx b/spacy/_nn.pyx
new file mode 100644
index 000000000..c47be1f49
--- /dev/null
+++ b/spacy/_nn.pyx
@@ -0,0 +1,146 @@
+"""Feed-forward neural network, using Theano."""
+
+import os
+import sys
+import time
+import shutil
+
+from os import path
+
+import numpy
+
+import theano
+import theano.tensor as T
+import plac
+
+from spacy.gold import read_json_file
+from spacy.gold import GoldParse
+from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir
+
+
+def build_model(n_classes, n_vocab, n_hidden, n_word_embed, n_tag_embed):
+    # allocate symbolic variables for the data
+    words = T.ivector('words')
+    tags = T.ivector('tags')
+    y = T.iscalar('y')  # assumed: the gold class index
+
+    word_e = _init_embedding(n_words, n_word_embed)
+    tag_e = _init_embedding(n_tags, n_tag_embed)
+    label_e = _init_embedding(n_labels, n_label_embed)
+    maxent_W, maxent_b = _init_maxent_weights(n_hidden, n_classes)
+    hidden_W, hidden_b = _init_hidden_weights(28*28, n_hidden, T.tanh)
+    params = [hidden_W, hidden_b, maxent_W, maxent_b, word_e, tag_e, label_e]
+
+    x = T.concatenate([
+        T.flatten(word_e[words], outdim=1),
+        T.flatten(tag_e[tags], outdim=1)])
+
+    p_y_given_x = feed_layer(
+        T.nnet.softmax,
+        maxent_W,
+        maxent_b,
+        feed_layer(
+            T.tanh,
+            hidden_W,
+            hidden_b,
+            x))[0]
+
+    guess = T.argmax(p_y_given_x)
+
+    cost = (
+        -T.log(p_y_given_x[y])
+        + L1(L1_reg, maxent_W, hidden_W, word_e, tag_e)
+        + L2(L2_reg, maxent_W, hidden_W, word_e, tag_e)
+    )
+
+    train_model = theano.function(
+        inputs=[words, tags, y],
+        outputs=guess,
+        updates=[update(learning_rate, param, cost) for param in params]
+    )
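+    # Each `update(learning_rate, param, cost)` pair above expands to
+    # (param, param - learning_rate * T.grad(cost, param)), i.e. one plain
+    # SGD step per parameter; see the `update` helper defined below.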
+
+    evaluate_model = theano.function(
+        inputs=[words, tags, y],
+        outputs=T.neq(y, guess),
+    )
+    return train_model, evaluate_model
+
+
+def _init_embedding(vocab_size, n_dim):
+    embedding = 0.2 * numpy.random.uniform(-1.0, 1.0, (vocab_size+1, n_dim))
+    return theano.shared(embedding.astype(theano.config.floatX))
+
+
+def _init_maxent_weights(n_hidden, n_out):
+    weights = numpy.zeros((n_hidden, n_out), dtype=theano.config.floatX)
+    bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
+    return (
+        theano.shared(name='W', borrow=True, value=weights),
+        theano.shared(name='b', borrow=True, value=bias)
+    )
+
+
+def _init_hidden_weights(n_in, n_out, activation=T.tanh):
+    rng = numpy.random.RandomState(1234)
+    weights = numpy.asarray(
+        rng.uniform(
+            low=-numpy.sqrt(6. / (n_in + n_out)),
+            high=numpy.sqrt(6. / (n_in + n_out)),
+            size=(n_in, n_out)
+        ),
+        dtype=theano.config.floatX
+    )
+
+    bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
+    return (
+        theano.shared(value=weights, name='W', borrow=True),
+        theano.shared(value=bias, name='b', borrow=True)
+    )
+
+
+def feed_layer(activation, weights, bias, input):
+    return activation(T.dot(input, weights) + bias)
+
+
+def L1(L1_reg, *weights):
+    return L1_reg * sum(abs(w).sum() for w in weights)
+
+
+def L2(L2_reg, *weights):
+    return L2_reg * sum((w ** 2).sum() for w in weights)
+
+
+def update(eta, param, cost):
+    return (param, param - (eta * T.grad(cost, param)))
+
+
+def main(train_loc, eval_loc, model_dir):
+    learning_rate = 0.01
+    L1_reg = 0.00
+    L2_reg = 0.0001
+
+    print "... reading the data"
+    gold_train = list(read_json_file(train_loc))
+    print '... building the model'
+    pos_model_dir = path.join(model_dir, 'pos')
+    if path.exists(pos_model_dir):
+        shutil.rmtree(pos_model_dir)
+    os.mkdir(pos_model_dir)
+
+    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)
+
+    train_model, evaluate_model = build_model(n_hidden, len(POS_TAGS), learning_rate,
+                                              L1_reg, L2_reg)
+
+    print '... training'
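+    # Online training sketch: train_model returns the argmax tag id, so
+    # `loss` below counts zero-one tagging errors per epoch rather than
+    # the differentiable cost being minimised.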
+    for epoch in range(1, n_epochs+1):
+        loss = 0
+        for raw_text, sents in gold_train:
+            for (ids, words, tags, ner, heads, deps), _ in sents:
+                tokens = nlp.tokenizer.tokens_from_list(words)
+                for t in tokens:
+                    guess = train_model([t.orth], [t.tag])
+                    loss += guess != t.tag
+        print loss
+        # compute zero-one loss on validation set
+        #error = numpy.mean([evaluate_model(x, y) for x, y in dev_examples])
+        #print('epoch %i, validation error %f %%' % (epoch, error * 100))
+
+
+if __name__ == '__main__':
+    plac.call(main)
diff --git a/spacy/_theano.pxd b/spacy/_theano.pxd
new file mode 100644
index 000000000..cad0736c2
--- /dev/null
+++ b/spacy/_theano.pxd
@@ -0,0 +1,13 @@
+from ._ml cimport Model
+from thinc.nn cimport InputLayer
+
+
+cdef class TheanoModel(Model):
+    cdef InputLayer input_layer
+    cdef object train_func
+    cdef object predict_func
+    cdef object debug
+
+    cdef public float eta
+    cdef public float mu
+    cdef public float t
diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx
index 702208d18..08c49ce71 100644
--- a/spacy/_theano.pyx
+++ b/spacy/_theano.pyx
@@ -9,7 +9,8 @@
 
 
 cdef class TheanoModel(Model):
-    def __init__(self, n_classes, input_spec, train_func, predict_func, model_loc=None):
+    def __init__(self, n_classes, input_spec, train_func, predict_func, model_loc=None,
+                 debug=None):
         if model_loc is not None and path.isdir(model_loc):
             model_loc = path.join(model_loc, 'model')
 
@@ -20,6 +21,7 @@ cdef class TheanoModel(Model):
         self.input_layer = InputLayer(input_spec, initializer)
         self.train_func = train_func
         self.predict_func = predict_func
+        self.debug = debug
 
         self.n_classes = n_classes
         self.n_feats = len(self.input_layer)
@@ -27,18 +29,25 @@ cdef class TheanoModel(Model):
 
     def predict(self, Example eg):
         self.input_layer.fill(eg.embeddings, eg.atoms)
-        theano_scores = self.predict_func(eg.embeddings)
+        theano_scores = self.predict_func(eg.embeddings)[0]
         cdef int i
         for i in range(self.n_classes):
             eg.scores[i] = theano_scores[i]
-        eg.guess = arg_max_if_true(eg.scores.data, eg.is_valid.data,
+        eg.guess = arg_max_if_true(&eg.scores[0], eg.is_valid[0],
                                    self.n_classes)
 
     def train(self, Example eg):
-        self.predict(eg)
-        update, t, eta, mu = self.train_func(eg.embeddings, eg.scores, eg.costs)
-        self.input_layer.update(eg.atoms, update, self.t, self.eta, self.mu)
-        eg.best = arg_max_if_zero(eg.scores.data, eg.costs.data,
+        self.input_layer.fill(eg.embeddings, eg.atoms)
+        theano_scores, update, y = self.train_func(eg.embeddings, eg.costs, self.eta)
+        self.input_layer.update(update, eg.atoms, self.t, self.eta, self.mu)
+        for i in range(self.n_classes):
+            eg.scores[i] = theano_scores[i]
+        eg.guess = arg_max_if_true(&eg.scores[0], eg.is_valid[0],
+                                   self.n_classes)
+        eg.best = arg_max_if_zero(&eg.scores[0], eg.costs[0],
                                   self.n_classes)
         eg.cost = eg.costs[eg.guess]
         self.t += 1
+
+    def end_training(self):
+        pass
diff --git a/spacy/syntax/joint.pxd b/spacy/syntax/joint.pxd
new file mode 100644
index 000000000..5b7a6e3db
--- /dev/null
+++ b/spacy/syntax/joint.pxd
@@ -0,0 +1,17 @@
+from cymem.cymem cimport Pool
+
+from thinc.typedefs cimport weight_t
+
+from .stateclass cimport StateClass
+
+from .transition_system cimport TransitionSystem, Transition
+from ..gold cimport GoldParseC
+
+
+cdef class ArcEager(TransitionSystem):
+    pass
+
+
+cdef int push_cost(StateClass stcls, const GoldParseC* gold, int target) nogil
+cdef int arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) nogil
+
diff --git a/spacy/syntax/joint.pyx b/spacy/syntax/joint.pyx
new file mode 100644 
index 000000000..29e62cb4e --- /dev/null +++ b/spacy/syntax/joint.pyx @@ -0,0 +1,452 @@ +# cython: profile=True +from __future__ import unicode_literals + +import ctypes +import os + +from ..structs cimport TokenC + +from .transition_system cimport do_func_t, get_cost_func_t +from .transition_system cimport move_cost_func_t, label_cost_func_t +from ..gold cimport GoldParse +from ..gold cimport GoldParseC + +from libc.stdint cimport uint32_t +from libc.string cimport memcpy + +from cymem.cymem cimport Pool +from .stateclass cimport StateClass + + +DEF NON_MONOTONIC = True +DEF USE_BREAK = True +DEF USE_ROOT_ARC_SEGMENT = True + +cdef weight_t MIN_SCORE = -90000 + +# Break transition from here +# http://www.aclweb.org/anthology/P13-1074 +cdef enum: + SHIFT + REDUCE + LEFT + RIGHT + + BREAK + + N_MOVES + + +MOVE_NAMES = [None] * N_MOVES +MOVE_NAMES[SHIFT] = 'S' +MOVE_NAMES[REDUCE] = 'D' +MOVE_NAMES[LEFT] = 'L' +MOVE_NAMES[RIGHT] = 'R' +MOVE_NAMES[BREAK] = 'B' + + +# Helper functions for the arc-eager oracle + +cdef int push_cost(StateClass stcls, const GoldParseC* gold, int target) nogil: + cdef int cost = 0 + cdef int i, S_i + for i in range(stcls.stack_depth()): + S_i = stcls.S(i) + if gold.heads[target] == S_i: + cost += 1 + if gold.heads[S_i] == target and (NON_MONOTONIC or not stcls.has_head(S_i)): + cost += 1 + cost += Break.is_valid(stcls, -1) and Break.move_cost(stcls, gold) == 0 + return cost + + +cdef int pop_cost(StateClass stcls, const GoldParseC* gold, int target) nogil: + cdef int cost = 0 + cdef int i, B_i + for i in range(stcls.buffer_length()): + B_i = stcls.B(i) + cost += gold.heads[B_i] == target + cost += gold.heads[target] == B_i + if gold.heads[B_i] == B_i or gold.heads[B_i] < target: + break + cost += Break.is_valid(stcls, -1) and Break.move_cost(stcls, gold) == 0 + return cost + + +cdef int arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) nogil: + if arc_is_gold(gold, head, child): + return 0 + elif stcls.H(child) == gold.heads[child]: + return 1 + # Head in buffer + elif gold.heads[child] >= stcls.B(0) and stcls.B(1) != -1: + return 1 + else: + return 0 + + +cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil: + if gold.labels[child] == -1: + return True + elif USE_ROOT_ARC_SEGMENT and _is_gold_root(gold, head) and _is_gold_root(gold, child): + return True + elif gold.heads[child] == head: + return True + else: + return False + + +cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) nogil: + if gold.labels[child] == -1: + return True + elif label == -1: + return True + elif gold.labels[child] == label: + return True + else: + return False + + +cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil: + return gold.labels[word] == -1 or gold.heads[word] == word + + +cdef class Shift: + @staticmethod + cdef bint is_valid(StateClass st, int label) nogil: + return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start + + @staticmethod + cdef int transition(StateClass st, int label) nogil: + st.push() + st.fast_forward() + + @staticmethod + cdef int cost(StateClass st, const GoldParseC* gold, int label) nogil: + return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label) + + @staticmethod + cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: + return push_cost(s, gold, s.B(0)) + + @staticmethod + cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: + return 0 + + +cdef class Reduce: + @staticmethod + cdef bint 
is_valid(StateClass st, int label) nogil: + return st.stack_depth() >= 2 + + @staticmethod + cdef int transition(StateClass st, int label) nogil: + if st.has_head(st.S(0)): + st.pop() + else: + st.unshift() + st.fast_forward() + + @staticmethod + cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: + return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label) + + @staticmethod + cdef inline int move_cost(StateClass st, const GoldParseC* gold) nogil: + return pop_cost(st, gold, st.S(0)) + + @staticmethod + cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: + return 0 + + +cdef class LeftArc: + @staticmethod + cdef bint is_valid(StateClass st, int label) nogil: + return not st.B_(0).sent_start + + @staticmethod + cdef int transition(StateClass st, int label) nogil: + st.add_arc(st.B(0), st.S(0), label) + st.pop() + st.fast_forward() + + @staticmethod + cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: + return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label) + + @staticmethod + cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: + cdef int cost = 0 + if arc_is_gold(gold, s.B(0), s.S(0)): + return 0 + else: + # Account for deps we might lose between S0 and stack + if not s.has_head(s.S(0)): + for i in range(1, s.stack_depth()): + cost += gold.heads[s.S(i)] == s.S(0) + cost += gold.heads[s.S(0)] == s.S(i) + return pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0)) + + @staticmethod + cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: + return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label) + + +cdef class RightArc: + @staticmethod + cdef bint is_valid(StateClass st, int label) nogil: + return not st.B_(0).sent_start + + @staticmethod + cdef int transition(StateClass st, int label) nogil: + st.add_arc(st.S(0), st.B(0), label) + st.push() + st.fast_forward() + + @staticmethod + cdef inline int cost(StateClass s, const GoldParseC* gold, int label) nogil: + return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label) + + @staticmethod + cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: + if arc_is_gold(gold, s.S(0), s.B(0)): + return 0 + elif s.shifted[s.B(0)]: + return push_cost(s, gold, s.B(0)) + else: + return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0)) + + @staticmethod + cdef int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: + return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label) + + +cdef class Break: + @staticmethod + cdef bint is_valid(StateClass st, int label) nogil: + cdef int i + if not USE_BREAK: + return False + elif st.at_break(): + return False + elif st.B(0) == 0: + return False + elif st.stack_depth() < 1: + return False + elif (st.S(0) + 1) != st.B(0): + # Must break at the token boundary + return False + else: + return True + + @staticmethod + cdef int transition(StateClass st, int label) nogil: + st.set_break(st.B(0)) + st.fast_forward() + + @staticmethod + cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: + return Break.move_cost(s, gold) + Break.label_cost(s, gold, label) + + @staticmethod + cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: + cdef int cost = 0 + cdef int i, j, S_i, B_i + for i in range(s.stack_depth()): + S_i = s.S(i) + for j in range(s.buffer_length()): + B_i = s.B(j) + cost += gold.heads[S_i] == B_i + cost += 
gold.heads[B_i] == S_i
+        # Check for sentence boundary --- if it's here, we can't have any deps
+        # between stack and buffer, so rest of action is irrelevant.
+        s0_root = _get_root(s.S(0), gold)
+        b0_root = _get_root(s.B(0), gold)
+        if s0_root != b0_root or s0_root == -1 or b0_root == -1:
+            return cost
+        else:
+            return cost + 1
+
+    @staticmethod
+    cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
+        return 0
+
+
+cdef int _get_root(int word, const GoldParseC* gold) nogil:
+    while gold.heads[word] != word and gold.labels[word] != -1 and word >= 0:
+        word = gold.heads[word]
+    if gold.labels[word] == -1:
+        return -1
+    else:
+        return word
+
+
+cdef class ArcEager(TransitionSystem):
+    @classmethod
+    def get_labels(cls, gold_parses):
+        move_labels = {SHIFT: {'': True}, REDUCE: {'': True}, RIGHT: {'ROOT': True},
+                       LEFT: {'ROOT': True}, BREAK: {'ROOT': True}}
+        for raw_text, sents in gold_parses:
+            for (ids, words, tags, heads, labels, iob), ctnts in sents:
+                for child, head, label in zip(ids, heads, labels):
+                    if label.upper() == 'ROOT':
+                        label = 'ROOT'
+                    if label != 'ROOT':
+                        if head < child:
+                            move_labels[RIGHT][label] = True
+                        elif head > child:
+                            move_labels[LEFT][label] = True
+        return move_labels
+
+    cdef int preprocess_gold(self, GoldParse gold) except -1:
+        for i in range(gold.length):
+            if gold.heads[i] is None:  # Missing values
+                gold.c.heads[i] = i
+                gold.c.labels[i] = -1
+            else:
+                label = gold.labels[i]
+                if label.upper() == 'ROOT':
+                    label = 'ROOT'
+                gold.c.heads[i] = gold.heads[i]
+                gold.c.labels[i] = self.strings[label]
+        for end, brackets in gold.brackets.items():
+            for start, label_strs in brackets.items():
+                gold.c.brackets[start][end] = 1
+                for label_str in label_strs:
+                    # Add the encoded label to the set
+                    gold.brackets[end][start].add(self.strings[label_str])
+
+    cdef Transition lookup_transition(self, object name) except *:
+        if '-' in name:
+            move_str, label_str = name.split('-', 1)
+            label = self.label_ids[label_str]
+        else:
+            move_str = name
+            label = 0
+        move = MOVE_NAMES.index(move_str)
+        for i in range(self.n_moves):
+            if self.c[i].move == move and self.c[i].label == label:
+                return self.c[i]
+
+    def move_name(self, int move, int label):
+        label_str = self.strings[label]
+        if label_str:
+            return MOVE_NAMES[move] + '-' + label_str
+        else:
+            return MOVE_NAMES[move]
+
+    cdef Transition init_transition(self, int clas, int move, int label) except *:
+        # TODO: Apparent Cython bug here when we try to use the Transition()
+        # constructor with the function pointers
+        cdef Transition t
+        t.score = 0
+        t.clas = clas
+        t.move = move
+        t.label = label
+        if move == SHIFT:
+            t.is_valid = Shift.is_valid
+            t.do = Shift.transition
+            t.get_cost = Shift.cost
+        elif move == REDUCE:
+            t.is_valid = Reduce.is_valid
+            t.do = Reduce.transition
+            t.get_cost = Reduce.cost
+        elif move == LEFT:
+            t.is_valid = LeftArc.is_valid
+            t.do = LeftArc.transition
+            t.get_cost = LeftArc.cost
+        elif move == RIGHT:
+            t.is_valid = RightArc.is_valid
+            t.do = RightArc.transition
+            t.get_cost = RightArc.cost
+        elif move == BREAK:
+            t.is_valid = Break.is_valid
+            t.do = Break.transition
+            t.get_cost = Break.cost
+        else:
+            raise Exception(move)
+        return t
+
+    cdef int initialize_state(self, StateClass st) except -1:
+        # Ensure sent_start is set to 0 throughout
+        for i in range(st.length):
+            st._sent[i].sent_start = False
+            st._sent[i].l_edge = i
+            st._sent[i].r_edge = i
+        st.fast_forward()
+
+    cdef int finalize_state(self, StateClass st) except -1:
+        cdef int root_label = self.strings['ROOT']
+        for i in 
range(st.length): + if st._sent[i].head == 0 and st._sent[i].dep == 0: + st._sent[i].dep = root_label + # If we're not using the Break transition, we segment via root-labelled + # arcs between the root words. + elif USE_ROOT_ARC_SEGMENT and st._sent[i].dep == root_label: + st._sent[i].head = 0 + + cdef int set_valid(self, bint* output, StateClass stcls) except -1: + cdef bint[N_MOVES] is_valid + is_valid[SHIFT] = Shift.is_valid(stcls, -1) + is_valid[REDUCE] = Reduce.is_valid(stcls, -1) + is_valid[LEFT] = LeftArc.is_valid(stcls, -1) + is_valid[RIGHT] = RightArc.is_valid(stcls, -1) + is_valid[BREAK] = Break.is_valid(stcls, -1) + cdef int i + n_valid = 0 + for i in range(self.n_moves): + output[i] = is_valid[self.c[i].move] + n_valid += output[i] + assert n_valid >= 1 + + cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1: + cdef int i, move, label + cdef label_cost_func_t[N_MOVES] label_cost_funcs + cdef move_cost_func_t[N_MOVES] move_cost_funcs + cdef int[N_MOVES] move_costs + for i in range(N_MOVES): + move_costs[i] = -1 + move_cost_funcs[SHIFT] = Shift.move_cost + move_cost_funcs[REDUCE] = Reduce.move_cost + move_cost_funcs[LEFT] = LeftArc.move_cost + move_cost_funcs[RIGHT] = RightArc.move_cost + move_cost_funcs[BREAK] = Break.move_cost + + label_cost_funcs[SHIFT] = Shift.label_cost + label_cost_funcs[REDUCE] = Reduce.label_cost + label_cost_funcs[LEFT] = LeftArc.label_cost + label_cost_funcs[RIGHT] = RightArc.label_cost + label_cost_funcs[BREAK] = Break.label_cost + + cdef int* labels = gold.c.labels + cdef int* heads = gold.c.heads + + n_gold = 0 + for i in range(self.n_moves): + if self.c[i].is_valid(stcls, self.c[i].label): + move = self.c[i].move + label = self.c[i].label + if move_costs[move] == -1: + move_costs[move] = move_cost_funcs[move](stcls, &gold.c) + output[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label) + n_gold += output[i] == 0 + else: + output[i] = 9000 + assert n_gold >= 1 + + cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *: + cdef bint[N_MOVES] is_valid + is_valid[SHIFT] = Shift.is_valid(stcls, -1) + is_valid[REDUCE] = Reduce.is_valid(stcls, -1) + is_valid[LEFT] = LeftArc.is_valid(stcls, -1) + is_valid[RIGHT] = RightArc.is_valid(stcls, -1) + is_valid[BREAK] = Break.is_valid(stcls, -1) + cdef Transition best + cdef weight_t score = MIN_SCORE + cdef int i + for i in range(self.n_moves): + if scores[i] > score and is_valid[self.c[i].move]: + best = self.c[i] + score = scores[i] + assert best.clas < self.n_moves + assert score > MIN_SCORE, (stcls.stack_depth(), stcls.buffer_length(), stcls.is_final(), stcls._b_i, stcls.length) + return best diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 8af7ab25d..f2fa63da5 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -51,18 +51,21 @@ def get_templates(name): return pf.ner elif name == 'debug': return pf.unigrams + elif name.startswith('embed'): + return ((10, pf.words), (10, pf.tags), (10, pf.labels)) else: return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \ pf.tree_shape + pf.trigrams) cdef class Parser: - def __init__(self, StringStore strings, model_dir, transition_system): + def __init__(self, StringStore strings, model_dir, transition_system, + get_model=Model): assert os.path.exists(model_dir) and os.path.isdir(model_dir) self.cfg = Config.read(model_dir, 'config') self.moves = transition_system(strings, self.cfg.labels) templates = get_templates(self.cfg.features) - 
self.model = Model(self.moves.n_moves, templates, model_dir) + self.model = get_model(self.moves.n_moves, templates, model_dir) def __call__(self, Tokens tokens): cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) @@ -71,8 +74,8 @@ cdef class Parser: cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, self.model.n_feats) while not stcls.is_final(): eg.wipe() - fill_context(eg.atoms.data, stcls) - self.moves.set_valid(eg.is_valid.data, stcls) + fill_context(&eg.atoms[0], stcls) + self.moves.set_valid(&eg.is_valid[0], stcls) self.model.predict(eg) @@ -88,8 +91,8 @@ cdef class Parser: cdef int cost = 0 while not stcls.is_final(): eg.wipe() - fill_context(eg.atoms.data, stcls) - self.moves.set_costs(eg.is_valid.data, eg.costs.data, stcls, gold) + fill_context(&eg.atoms[0], stcls) + self.moves.set_costs(&eg.is_valid[0], &eg.costs[0], stcls, gold) self.model.train(eg) From 7b8275fcc4fa815323729ed49fac90775a23780f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 Jun 2015 04:18:01 +0200 Subject: [PATCH 14/30] * Wire hyperparameters to script interface --- bin/parser/nn_train.py | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py index 375996f4f..e0ae846b5 100755 --- a/bin/parser/nn_train.py +++ b/bin/parser/nn_train.py @@ -84,7 +84,8 @@ def _merge_sents(sents): def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0, gold_preproc=False, n_sents=0, corruption_level=0, verbose=False, - eta=0.01, mu=0.9, n_hidden=100, word_vec_len=10, pos_vec_len=10): + eta=0.01, mu=0.9, n_hidden=100, + nv_word=10, nv_tag=10, nv_label=10): dep_model_dir = path.join(model_dir, 'deps') pos_model_dir = path.join(model_dir, 'pos') ner_model_dir = path.join(model_dir, 'ner') @@ -99,8 +100,15 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', os.mkdir(ner_model_dir) setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir) - Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, - labels=Language.ParserTransitionSystem.get_labels(gold_tuples)) + Config.write(dep_model_dir, 'config', + seed=seed, + features=feat_set, + labels=Language.ParserTransitionSystem.get_labels(gold_tuples), + vector_lengths=(nv_word, nv_tag, nv_label), + hidden_nodes=n_hidden, + eta=eta, + mu=mu + ) Config.write(ner_model_dir, 'config', features='ner', seed=seed, labels=Language.EntityTransitionSystem.get_labels(gold_tuples), beam_width=0) @@ -110,16 +118,17 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', nlp = Language(data_dir=model_dir) - def make_model(n_classes, input_spec, model_dir): - print input_spec - n_in = sum(n_cols * len(fields) for (n_cols, fields) in input_spec) + def make_model(n_classes, (words, tags, labels), model_dir): + n_in = (nv_word * len(words)) + \ + (nv_tag * len(tags)) + \ + (nv_label * len(labels)) print 'Compiling' debug, train_func, predict_func = compile_theano_model(n_classes, n_hidden, n_in, 0.0, 0.0) print 'Done' return TheanoModel( n_classes, - input_spec, + ((nv_word, words), (nv_tag, tags), (nv_label, labels)), train_func, predict_func, model_loc=model_dir, @@ -226,14 +235,23 @@ def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None): n_sents=("Number of training sentences", "option", "n", int), n_iter=("Number of training iterations", "option", "i", int), verbose=("Verbose error reporting", "flag", "v", bool), - debug=("Debug mode", "flag", 
"d", bool), + + nv_word=("Word vector length", "option", "W", int), + nv_tag=("Tag vector length", "option", "T", int), + nv_label=("Label vector length", "option", "L", int), + nv_hidden=("Hidden nodes length", "option", "H", int), + eta=("Learning rate", "option", "E", float), + mu=("Momentum", "option", "M", float), ) def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, - debug=False, corruption_level=0.0, gold_preproc=False, beam_width=1, + corruption_level=0.0, gold_preproc=False, + nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10, + eta=0.1, mu=0.9, eval_only=False): gold_train = list(read_json_file(train_loc)) nlp = train(English, gold_train, model_dir, feat_set='embed', + nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, gold_preproc=gold_preproc, n_sents=n_sents, corruption_level=corruption_level, n_iter=n_iter, verbose=verbose) From e7003f1cf3e48dd83bca48542af0dca35a1410cc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 Jun 2015 04:18:47 +0200 Subject: [PATCH 15/30] * Remove hard-coding of vector lengths --- spacy/_theano.pyx | 4 ++-- spacy/syntax/parser.pyx | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx index 08c49ce71..ab6e0b089 100644 --- a/spacy/_theano.pyx +++ b/spacy/_theano.pyx @@ -28,7 +28,7 @@ cdef class TheanoModel(Model): self.model_loc = model_loc def predict(self, Example eg): - self.input_layer.fill(eg.embeddings, eg.atoms) + self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=True) theano_scores = self.predict_func(eg.embeddings)[0] cdef int i for i in range(self.n_classes): @@ -37,7 +37,7 @@ cdef class TheanoModel(Model): self.n_classes) def train(self, Example eg): - self.input_layer.fill(eg.embeddings, eg.atoms) + self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=False) theano_scores, update, y = self.train_func(eg.embeddings, eg.costs, self.eta) self.input_layer.update(update, eg.atoms, self.t, self.eta, self.mu) for i in range(self.n_classes): diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index f2fa63da5..d65bf1f33 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -52,7 +52,7 @@ def get_templates(name): elif name == 'debug': return pf.unigrams elif name.startswith('embed'): - return ((10, pf.words), (10, pf.tags), (10, pf.labels)) + return (pf.words, pf.tags, pf.labels) else: return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \ pf.tree_shape + pf.trigrams) From fe7b24eceffda3c282b01b9ff8750d70f40ef622 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 28 Jun 2015 01:29:37 +0200 Subject: [PATCH 16/30] * whitespace --- bin/parser/nn_train.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py index e0ae846b5..33ad8a8a9 100755 --- a/bin/parser/nn_train.py +++ b/bin/parser/nn_train.py @@ -84,7 +84,7 @@ def _merge_sents(sents): def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0, gold_preproc=False, n_sents=0, corruption_level=0, verbose=False, - eta=0.01, mu=0.9, n_hidden=100, + eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10): dep_model_dir = path.join(model_dir, 'deps') pos_model_dir = path.join(model_dir, 'pos') @@ -105,7 +105,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', features=feat_set, labels=Language.ParserTransitionSystem.get_labels(gold_tuples), vector_lengths=(nv_word, nv_tag, nv_label), - hidden_nodes=n_hidden, + 
hidden_nodes=nv_hidden, eta=eta, mu=mu ) @@ -123,7 +123,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', (nv_tag * len(tags)) + \ (nv_label * len(labels)) print 'Compiling' - debug, train_func, predict_func = compile_theano_model(n_classes, n_hidden, + debug, train_func, predict_func = compile_theano_model(n_classes, nv_hidden, n_in, 0.0, 0.0) print 'Done' return TheanoModel( @@ -251,7 +251,7 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbos gold_train = list(read_json_file(train_loc)) nlp = train(English, gold_train, model_dir, feat_set='embed', - nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, + nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, nv_hidden=nv_hidden, gold_preproc=gold_preproc, n_sents=n_sents, corruption_level=corruption_level, n_iter=n_iter, verbose=verbose) From f4986d5d3cd5f565d4dc613c5199d6439f343b65 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 28 Jun 2015 22:36:03 +0200 Subject: [PATCH 17/30] * Use new Example class --- spacy/_ml.pxd | 1 + spacy/_ml.pyx | 16 ++++++---------- spacy/_theano.pyx | 15 ++++++--------- spacy/syntax/parser.pyx | 29 ++++++++++++++++++----------- 4 files changed, 31 insertions(+), 30 deletions(-) diff --git a/spacy/_ml.pxd b/spacy/_ml.pxd index 3562b4a32..40281cad2 100644 --- a/spacy/_ml.pxd +++ b/spacy/_ml.pxd @@ -5,6 +5,7 @@ from cymem.cymem cimport Pool from thinc.learner cimport LinearModel from thinc.features cimport Extractor, Feature from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t +from thinc.api cimport ExampleC from preshed.maps cimport PreshMapArray diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx index cabc4318a..f84068778 100644 --- a/spacy/_ml.pyx +++ b/spacy/_ml.pyx @@ -61,18 +61,14 @@ cdef class Model: self._model.load(self.model_loc, freq_thresh=0) def predict(self, Example eg): - self.set_scores(&eg.scores[0], &eg.atoms[0]) - eg.guess = arg_max_if_true(&eg.scores[0], &eg.is_valid[0], - self.n_classes) + self.set_scores(eg.c.scores, eg.c.atoms) + eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes) def train(self, Example eg): - self.set_scores(&eg.scores[0], &eg.atoms[0]) - eg.guess = arg_max_if_true(&eg.scores[0], - &eg.is_valid[0], self.n_classes) - eg.best = arg_max_if_zero(&eg.scores[0], &eg.costs[0], - self.n_classes) - eg.cost = eg.costs[eg.guess] - self.update(&eg.atoms[0], eg.guess, eg.best, eg.cost) + self.predict(eg) + eg.c.best = arg_max_if_zero(eg.c.scores, eg.c.costs, self.n_classes) + eg.c.cost = eg.c.costs[eg.c.guess] + self.update(eg.c.atoms, eg.c.guess, eg.c.best, eg.c.cost) cdef const weight_t* score(self, atom_t* context) except NULL: cdef int n_feats diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx index ab6e0b089..69896e72a 100644 --- a/spacy/_theano.pyx +++ b/spacy/_theano.pyx @@ -1,4 +1,4 @@ -from thinc.api cimport Example +from thinc.api cimport Example, ExampleC from thinc.typedefs cimport weight_t from ._ml cimport arg_max_if_true @@ -33,20 +33,17 @@ cdef class TheanoModel(Model): cdef int i for i in range(self.n_classes): eg.scores[i] = theano_scores[i] - eg.guess = arg_max_if_true(&eg.scores[0], eg.is_valid[0], - self.n_classes) + eg.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes) def train(self, Example eg): self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=False) theano_scores, update, y = self.train_func(eg.embeddings, eg.costs, self.eta) self.input_layer.update(update, eg.atoms, self.t, self.eta, self.mu) for i in range(self.n_classes): - eg.scores[i] = 
theano_scores[i]
-        eg.guess = arg_max_if_true(&eg.scores[0], eg.is_valid[0],
-                                   self.n_classes)
-        eg.best = arg_max_if_zero(&eg.scores[0], eg.costs[0],
-                                  self.n_classes)
-        eg.cost = eg.costs[eg.guess]
+            eg.c.scores[i] = theano_scores[i]
+        eg.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes)
+        eg.best = arg_max_if_zero(eg.c.scores, eg.c.costs, self.n_classes)
+        eg.cost = eg.c.costs[eg.guess]
         self.t += 1
 
     def end_training(self):
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index cf9d71736..2ea60b149 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -71,14 +71,17 @@ cdef class Parser:
         cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
         self.moves.initialize_state(stcls)
 
-        cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, self.model.n_feats)
+        cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE,
+                                  self.model.n_feats, self.model.n_feats)
         while not stcls.is_final():
-            eg.wipe()
-            fill_context(&eg.atoms[0], stcls)
-            self.moves.set_valid(&eg.is_valid[0], stcls)
+            memset(eg.c.scores, 0, eg.c.nr_class * sizeof(weight_t))
+
+            self.moves.set_valid(eg.c.is_valid, stcls)
+            fill_context(eg.c.atoms, stcls)
+
             self.model.predict(eg)
 
-            self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label)
+            self.moves.c[eg.c.guess].do(stcls, self.moves.c[eg.c.guess].label)
 
         self.moves.finalize_state(stcls)
         tokens.set_parse(stcls._sent)
@@ -86,20 +89,24 @@ cdef class Parser:
         self.moves.preprocess_gold(gold)
         cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
         self.moves.initialize_state(stcls)
-        cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, self.model.n_feats)
+        cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE,
+                                  self.model.n_feats, self.model.n_feats)
         cdef int cost = 0
         while not stcls.is_final():
-            eg.wipe()
-            fill_context(&eg.atoms[0], stcls)
-            self.moves.set_costs(&eg.is_valid[0], &eg.costs[0], stcls, gold)
+            memset(eg.c.scores, 0, eg.c.nr_class * sizeof(weight_t))
+
+            self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
+
+            fill_context(eg.c.atoms, stcls)
 
             self.model.train(eg)
 
-            self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label)
-            cost += eg.cost
+            self.moves.c[eg.c.guess].do(stcls, self.moves.c[eg.c.guess].label)
+            cost += eg.c.cost
         return cost
+
+
     # These are passed as callbacks to thinc.search.Beam
     """
     cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:

From 5d870720bcb8fddda4153e8c27cce0b14b5791d6 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 29 Jun 2015 00:17:29 +0200
Subject: [PATCH 18/30] * Check valency in L and R feature methods, to make
 feature calculation faster

---
 spacy/syntax/stateclass.pyx | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx
index 0112a89f5..0708a00cf 100644
--- a/spacy/syntax/stateclass.pyx
+++ b/spacy/syntax/stateclass.pyx
@@ -52,6 +52,8 @@ cdef class StateClass:
         if i < 0 or i >= self.length:
             return -1
         cdef const TokenC* target = &self._sent[i]
+        if target.l_kids < idx:
+            return -1
         cdef const TokenC* ptr = self._sent
 
         while ptr < target:
@@ -75,8 +77,10 @@ cdef class StateClass:
             return -1
         if i < 0 or i >= self.length:
             return -1
-        cdef const TokenC* ptr = self._sent + (self.length - 1)
         cdef const TokenC* target = &self._sent[i]
+        if target.r_kids < idx:
+            return -1
+        cdef const TokenC* ptr = self._sent + (self.length - 1)
 
         while ptr > target:
             # If this head is still to the right of us, we can skip to it
             # No 
token that's between this token and this head could be our From 313a7f87b376ce8c7def32908ca1682d3fde6982 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 29 Jun 2015 01:06:28 +0200 Subject: [PATCH 19/30] * Inline methods in StateClass --- spacy/syntax/stateclass.pxd | 85 ++++++++++++++++++++++++++----------- spacy/syntax/stateclass.pyx | 67 ----------------------------- 2 files changed, 60 insertions(+), 92 deletions(-) diff --git a/spacy/syntax/stateclass.pxd b/spacy/syntax/stateclass.pxd index e3c36751e..905d8cdde 100644 --- a/spacy/syntax/stateclass.pxd +++ b/spacy/syntax/stateclass.pxd @@ -41,45 +41,80 @@ cdef class StateClass: if (i + self._b_i) >= self.length: return -1 return self._buffer[self._b_i + i] - - cdef int H(self, int i) nogil + + cdef inline const TokenC* S_(self, int i) nogil: + return self.safe_get(self.S(i)) + + cdef inline const TokenC* B_(self, int i) nogil: + return self.safe_get(self.B(i)) + + cdef inline const TokenC* H_(self, int i) nogil: + return self.safe_get(self.H(i)) + + cdef inline const TokenC* E_(self, int i) nogil: + return self.safe_get(self.E(i)) + + cdef inline const TokenC* L_(self, int i, int idx) nogil: + return self.safe_get(self.L(i, idx)) + + cdef inline const TokenC* R_(self, int i, int idx) nogil: + return self.safe_get(self.R(i, idx)) + + cdef inline const TokenC* safe_get(self, int i) nogil: + if i < 0 or i >= self.length: + return &self._empty_token + else: + return &self._sent[i] + + cdef inline int H(self, int i) nogil: + if i < 0 or i >= self.length: + return -1 + return self._sent[i].head + i + + cdef int E(self, int i) nogil + + cdef int R(self, int i, int idx) nogil cdef int L(self, int i, int idx) nogil - cdef int R(self, int i, int idx) nogil - cdef const TokenC* S_(self, int i) nogil - cdef const TokenC* B_(self, int i) nogil + cdef inline bint empty(self) nogil: + return self._s_i <= 0 - cdef const TokenC* H_(self, int i) nogil - cdef const TokenC* E_(self, int i) nogil + cdef inline bint eol(self) nogil: + return self.buffer_length() == 0 - cdef const TokenC* L_(self, int i, int idx) nogil - cdef const TokenC* R_(self, int i, int idx) nogil + cdef inline bint at_break(self) nogil: + return self._break != -1 - cdef const TokenC* safe_get(self, int i) nogil + cdef inline bint is_final(self) nogil: + return self.stack_depth() <= 0 and self._b_i >= self.length - cdef bint empty(self) nogil - - cdef bint entity_is_open(self) nogil + cdef inline bint has_head(self, int i) nogil: + return self.safe_get(i).head != 0 - cdef bint eol(self) nogil - - cdef bint at_break(self) nogil + cdef inline int n_L(self, int i) nogil: + return self.safe_get(i).l_kids - cdef bint is_final(self) nogil + cdef inline int n_R(self, int i) nogil: + return self.safe_get(i).r_kids - cdef bint has_head(self, int i) nogil + cdef inline bint stack_is_connected(self) nogil: + return False - cdef int n_L(self, int i) nogil + cdef inline bint entity_is_open(self) nogil: + if self._e_i < 1: + return False + return self._ents[self._e_i-1].end == -1 - cdef int n_R(self, int i) nogil + cdef inline int stack_depth(self) nogil: + return self._s_i - cdef bint stack_is_connected(self) nogil - - cdef int stack_depth(self) nogil - - cdef int buffer_length(self) nogil + cdef inline int buffer_length(self) nogil: + if self._break != -1: + return self._break - self._b_i + else: + return self.length - self._b_i cdef void push(self) nogil diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index 0708a00cf..038059f94 100644 --- 
a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -34,11 +34,6 @@ cdef class StateClass: self._buffer[i] = i self._empty_token.lex = &EMPTY_LEXEME - cdef int H(self, int i) nogil: - if i < 0 or i >= self.length: - return -1 - return self._sent[i].head + i - cdef int E(self, int i) nogil: if self._e_i <= 0 or self._e_i >= self.length: return 0 @@ -96,68 +91,6 @@ cdef class StateClass: ptr -= 1 return -1 - cdef const TokenC* S_(self, int i) nogil: - return self.safe_get(self.S(i)) - - cdef const TokenC* B_(self, int i) nogil: - return self.safe_get(self.B(i)) - - cdef const TokenC* H_(self, int i) nogil: - return self.safe_get(self.H(i)) - - cdef const TokenC* E_(self, int i) nogil: - return self.safe_get(self.E(i)) - - cdef const TokenC* L_(self, int i, int idx) nogil: - return self.safe_get(self.L(i, idx)) - - cdef const TokenC* R_(self, int i, int idx) nogil: - return self.safe_get(self.R(i, idx)) - - cdef const TokenC* safe_get(self, int i) nogil: - if i < 0 or i >= self.length: - return &self._empty_token - else: - return &self._sent[i] - - cdef bint empty(self) nogil: - return self._s_i <= 0 - - cdef bint eol(self) nogil: - return self.buffer_length() == 0 - - cdef bint at_break(self) nogil: - return self._break != -1 - - cdef bint is_final(self) nogil: - return self.stack_depth() <= 0 and self._b_i >= self.length - - cdef bint has_head(self, int i) nogil: - return self.safe_get(i).head != 0 - - cdef int n_L(self, int i) nogil: - return self.safe_get(i).l_kids - - cdef int n_R(self, int i) nogil: - return self.safe_get(i).r_kids - - cdef bint stack_is_connected(self) nogil: - return False - - cdef bint entity_is_open(self) nogil: - if self._e_i < 1: - return False - return self._ents[self._e_i-1].end == -1 - - cdef int stack_depth(self) nogil: - return self._s_i - - cdef int buffer_length(self) nogil: - if self._break != -1: - return self._break - self._b_i - else: - return self.length - self._b_i - cdef void push(self) nogil: if self.B(0) != -1: self._stack[self._s_i] = self.B(0) From 8e7ffd2cddd884494ab5496d269a3173cca841ad Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 29 Jun 2015 02:13:23 +0200 Subject: [PATCH 20/30] * Use thinc 3.1 --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index bb5531f14..b18948e50 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ cython cymem == 1.11 pathlib preshed == 0.37 -thinc == 3.0 +thinc == 3.1 murmurhash == 0.24 unidecode numpy diff --git a/setup.py b/setup.py index 48e2dfe25..f75445cdf 100644 --- a/setup.py +++ b/setup.py @@ -118,7 +118,7 @@ def run_setup(exts): ext_modules=exts, license="Dual: Commercial or AGPL", install_requires=['numpy', 'murmurhash', 'cymem >= 1.11', 'preshed == 0.37', - 'thinc == 3.0', "unidecode", 'wget', 'plac', 'six', + 'thinc == 3.1', "unidecode", 'wget', 'plac', 'six', 'ujson'], setup_requires=["headers_workaround"], ) From fc34e1b6e425420e6230a41f859dadeff0a70ae5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 29 Jun 2015 07:09:16 +0200 Subject: [PATCH 21/30] * Move Theano functions into nn_train.py script --- bin/parser/nn_train.py | 284 ++++++++++++++++++++++------------------- 1 file changed, 156 insertions(+), 128 deletions(-) diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py index 33ad8a8a9..a686df9e0 100755 --- a/bin/parser/nn_train.py +++ b/bin/parser/nn_train.py @@ -23,81 +23,163 @@ from spacy.gold import GoldParse from spacy.scorer import Scorer 
-from thinc.theano_nn import compile_theano_model - from spacy.syntax.parser import Parser from spacy._theano import TheanoModel +import theano +import theano.tensor as T -def _corrupt(c, noise_level): - if random.random() >= noise_level: - return c - elif c == ' ': - return '\n' - elif c == '\n': - return ' ' - elif c in ['.', "'", "!", "?"]: - return '' - else: - return c.lower() +from theano.printing import Print + +import numpy +from collections import OrderedDict, defaultdict -def add_noise(orig, noise_level): - if random.random() >= noise_level: - return orig - elif type(orig) == list: - corrupted = [_corrupt(word, noise_level) for word in orig] - corrupted = [w for w in corrupted if w] - return corrupted - else: - return ''.join(_corrupt(c, noise_level) for c in orig) +theano.config.floatX = 'float32' +floatX = theano.config.floatX -def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False): - if raw_text is None: - tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) - else: - tokens = nlp.tokenizer(raw_text) +def th_share(w, name=''): + return theano.shared(value=w, borrow=True, name=name) + + +class AvgParam(object): + def __init__(self, numpy_data, name='?', wrapper=th_share): + self.curr = wrapper(numpy_data, name=name+'_curr') + self.avg = self.curr + self.avg = wrapper(numpy_data.copy(), name=name+'_avg') + self.step = wrapper(numpy.zeros(numpy_data.shape, numpy_data.dtype), + name=name+'_step') + + def updates(self, cost, timestep, eta=0.001, mu=0.9): + step = (mu * self.step) - T.grad(cost, self.curr) + curr = self.curr + (eta * step) + alpha = (1 / timestep).clip(0.001, 0.9).astype(floatX) + avg = ((1 - alpha) * self.avg) + (alpha * curr) + return [(self.curr, curr), (self.step, step), (self.avg, avg)] + + +def feed_layer(activation, weights, bias, input_): + return activation(T.dot(input_, weights) + bias) + + +def L2(L2_reg, *weights): + return L2_reg * sum((w ** 2).sum() for w in weights) + + +def L1(L1_reg, *weights): + return L1_reg * sum(abs(w).sum() for w in weights) + + +def relu(x): + return x * (x > 0) + + +def _init_weights(n_in, n_out): + rng = numpy.random.RandomState(1234) + weights = numpy.asarray( + numpy.random.normal( + loc=0.0, + scale=0.0001, + size=(n_in, n_out)), + dtype=theano.config.floatX + ) + bias = 0.2 * numpy.ones((n_out,), dtype=theano.config.floatX) + return [AvgParam(weights, name='W'), AvgParam(bias, name='b')] + + +def compile_theano_model(n_classes, n_hidden, n_in, L1_reg, L2_reg): + costs = T.ivector('costs') + is_gold = T.ivector('is_gold') + x = T.vector('x') + y = T.scalar('y') + timestep = theano.shared(1) + eta = T.scalar('eta').astype(floatX) + mu = T.scalar('mu').astype(floatX) + + maxent_W, maxent_b = _init_weights(n_hidden, n_classes) + hidden_W, hidden_b = _init_weights(n_in, n_hidden) + + # Feed the inputs forward through the network + p_y_given_x = feed_layer( + T.nnet.softmax, + maxent_W.curr, + maxent_b.curr, + feed_layer( + relu, + hidden_W.curr, + hidden_b.curr, + x)) + stabilizer = 1e-8 + + cost = ( + -T.log(T.sum((p_y_given_x[0] + stabilizer) * T.eq(costs, 0))) + + L1(L1_reg, hidden_W.curr, hidden_b.curr) + + L2(L2_reg, hidden_W.curr, hidden_b.curr) + ) + + debug = theano.function( + name='debug', + inputs=[x, costs], + outputs=[p_y_given_x, T.eq(costs, 0), p_y_given_x[0] * T.eq(costs, 0)], + ) + + train_model = theano.function( + name='train_model', + inputs=[x, costs, eta, mu], + outputs=[p_y_given_x[0], T.grad(cost, x), T.argmax(p_y_given_x, axis=1), + cost], + updates=( + [(timestep, timestep + 1)] + + 
maxent_W.updates(cost, timestep, eta=eta, mu=mu) + + maxent_b.updates(cost, timestep, eta=eta, mu=mu) + + hidden_W.updates(cost, timestep, eta=eta, mu=mu) + + hidden_b.updates(cost, timestep, eta=eta, mu=mu) + ), + on_unused_input='warn' + ) + + evaluate_model = theano.function( + name='evaluate_model', + inputs=[x], + outputs=[ + feed_layer( + T.nnet.softmax, + maxent_W.curr, + maxent_b.curr, + feed_layer( + relu, + hidden_W.curr, + hidden_b.curr, + x + ) + )[0] + ] + ) + return debug, train_model, evaluate_model + + +def score_model(scorer, nlp, annot_tuples, verbose=False): + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) nlp.tagger(tokens) - nlp.entity(tokens) nlp.parser(tokens) gold = GoldParse(tokens, annot_tuples) scorer.score(tokens, gold, verbose=verbose) -def _merge_sents(sents): - m_deps = [[], [], [], [], [], []] - m_brackets = [] - i = 0 - for (ids, words, tags, heads, labels, ner), brackets in sents: - m_deps[0].extend(id_ + i for id_ in ids) - m_deps[1].extend(words) - m_deps[2].extend(tags) - m_deps[3].extend(head + i for head in heads) - m_deps[4].extend(labels) - m_deps[5].extend(ner) - m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets) - i += len(ids) - return [(m_deps, m_brackets)] - - def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', - seed=0, gold_preproc=False, n_sents=0, corruption_level=0, + seed=0, n_sents=0, verbose=False, eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10): dep_model_dir = path.join(model_dir, 'deps') pos_model_dir = path.join(model_dir, 'pos') - ner_model_dir = path.join(model_dir, 'ner') if path.exists(dep_model_dir): shutil.rmtree(dep_model_dir) if path.exists(pos_model_dir): shutil.rmtree(pos_model_dir) - if path.exists(ner_model_dir): - shutil.rmtree(ner_model_dir) os.mkdir(dep_model_dir) os.mkdir(pos_model_dir) - os.mkdir(ner_model_dir) setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir) Config.write(dep_model_dir, 'config', @@ -109,9 +191,6 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', eta=eta, mu=mu ) - Config.write(ner_model_dir, 'config', features='ner', seed=seed, - labels=Language.EntityTransitionSystem.get_labels(gold_tuples), - beam_width=0) if n_sents > 0: gold_tuples = gold_tuples[:n_sents] @@ -122,57 +201,44 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', n_in = (nv_word * len(words)) + \ (nv_tag * len(tags)) + \ (nv_label * len(labels)) - print 'Compiling' debug, train_func, predict_func = compile_theano_model(n_classes, nv_hidden, - n_in, 0.0, 0.0) - print 'Done' + n_in, 0.0, 0.0001) return TheanoModel( n_classes, ((nv_word, words), (nv_tag, tags), (nv_label, labels)), train_func, predict_func, model_loc=model_dir, + eta=eta, mu=mu, debug=debug) nlp._parser = Parser(nlp.vocab.strings, dep_model_dir, nlp.ParserTransitionSystem, make_model) print "Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %" + log_loc = path.join(model_dir, 'job.log') for itn in range(n_iter): scorer = Scorer() loss = 0 - for raw_text, sents in gold_tuples: - if gold_preproc: - raw_text = None - else: - sents = _merge_sents(sents) + for _, sents in gold_tuples: for annot_tuples, ctnt in sents: if len(annot_tuples[1]) == 1: continue - score_model(scorer, nlp, raw_text, annot_tuples, - verbose=verbose if itn >= 2 else False) - if raw_text is None: - words = add_noise(annot_tuples[1], corruption_level) - tokens = nlp.tokenizer.tokens_from_list(words) - else: - raw_text = add_noise(raw_text, 
corruption_level)
-                    tokens = nlp.tokenizer(raw_text)
+                score_model(scorer, nlp, annot_tuples)
+                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                 nlp.tagger(tokens)
                 gold = GoldParse(tokens, annot_tuples, make_projective=True)
-                if not gold.is_projective:
-                    raise Exception(
-                        "Non-projective sentence in training, after we should "
-                        "have enforced projectivity: %s" % annot_tuples
-                    )
+                assert gold.is_projective
                 loss += nlp.parser.train(tokens, gold)
-                nlp.entity.train(tokens, gold)
                 nlp.tagger.train(tokens, gold.tags)
         random.shuffle(gold_tuples)
-        print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
-                                                   scorer.tags_acc,
-                                                   scorer.token_acc)
+        logline = '%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
+                                                 scorer.tags_acc,
+                                                 scorer.token_acc)
+        print logline
+        with open(log_loc, 'a') as file_:
+            file_.write(logline + '\n')
     nlp.parser.model.end_training()
-    nlp.entity.model.end_training()
     nlp.tagger.model.end_training()
     nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
     return nlp
@@ -181,57 +247,20 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
 
 def evaluate(nlp, gold_tuples, gold_preproc=True):
     scorer = Scorer()
     for raw_text, sents in gold_tuples:
-        if gold_preproc:
-            raw_text = None
-        else:
-            sents = _merge_sents(sents)
         for annot_tuples, brackets in sents:
-            if raw_text is None:
-                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-                nlp.tagger(tokens)
-                nlp.entity(tokens)
-                nlp.parser(tokens)
-            else:
-                tokens = nlp(raw_text, merge_mwes=False)
+            tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
+            nlp.tagger(tokens)
+            nlp.parser(tokens)
             gold = GoldParse(tokens, annot_tuples)
             scorer.score(tokens, gold)
     return scorer
 
 
-def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
-    nlp = Language(data_dir=model_dir)
-    if beam_width is not None:
-        nlp.parser.cfg.beam_width = beam_width
-    gold_tuples = read_json_file(dev_loc)
-    scorer = Scorer()
-    out_file = codecs.open(out_loc, 'w', 'utf8')
-    for raw_text, sents in gold_tuples:
-        sents = _merge_sents(sents)
-        for annot_tuples, brackets in sents:
-            if raw_text is None:
-                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-                nlp.tagger(tokens)
-                nlp.entity(tokens)
-                nlp.parser(tokens)
-            else:
-                tokens = nlp(raw_text, merge_mwes=False)
-            gold = GoldParse(tokens, annot_tuples)
-            scorer.score(tokens, gold, verbose=False)
-            for t in tokens:
-                out_file.write(
-                    '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)
-                )
-    return scorer
-
-
 @plac.annotations(
     train_loc=("Location of training file or directory"),
     dev_loc=("Location of development file or directory"),
     model_dir=("Location of output model directory",),
     eval_only=("Skip training, and only evaluate", "flag", "e", bool),
-    corruption_level=("Amount of noise to add to training data", "option", "c", float),
-    gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
-    out_loc=("Out location", "option", "o", str),
     n_sents=("Number of training sentences", "option", "n", int),
     n_iter=("Number of training iterations", "option", "i", int),
     verbose=("Verbose error reporting", "flag", "v", bool),
@@ -243,21 +272,20 @@ def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
     eta=("Learning rate", "option", "E", float),
     mu=("Momentum", "option", "M", float),
 )
-def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
-         corruption_level=0.0, gold_preproc=False,
+def main(train_loc, dev_loc, model_dir, n_sents=0, 
n_iter=15, verbose=False, nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10, - eta=0.1, mu=0.9, - eval_only=False): - gold_train = list(read_json_file(train_loc)) + eta=0.1, mu=0.9, eval_only=False): + + gold_train = list(read_json_file(train_loc, lambda doc: 'wsj' in doc['id'])) + nlp = train(English, gold_train, model_dir, feat_set='embed', + eta=eta, mu=mu, nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, nv_hidden=nv_hidden, - gold_preproc=gold_preproc, n_sents=n_sents, - corruption_level=corruption_level, n_iter=n_iter, + n_sents=n_sents, n_iter=n_iter, verbose=verbose) - #if out_loc: - # write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width) - scorer = evaluate(nlp, list(read_json_file(dev_loc)), gold_preproc=gold_preproc) + + scorer = evaluate(nlp, list(read_json_file(dev_loc))) print 'TOK', 100-scorer.token_acc print 'POS', scorer.tags_acc From 894cbef8ba04222a420f312fc7f9023eed35429e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 29 Jun 2015 07:10:33 +0200 Subject: [PATCH 22/30] * Wire eta and mu parameters up for neural net --- spacy/_theano.pyx | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx index 69896e72a..1d7744322 100644 --- a/spacy/_theano.pyx +++ b/spacy/_theano.pyx @@ -10,12 +10,13 @@ from os import path cdef class TheanoModel(Model): def __init__(self, n_classes, input_spec, train_func, predict_func, model_loc=None, + eta=0.001, mu=0.9, debug=None): if model_loc is not None and path.isdir(model_loc): model_loc = path.join(model_loc, 'model') - self.eta = 0.001 - self.mu = 0.9 + self.eta = eta + self.mu = mu self.t = 1 initializer = lambda: 0.2 * numpy.random.uniform(-1.0, 1.0) self.input_layer = InputLayer(input_spec, initializer) @@ -28,22 +29,24 @@ cdef class TheanoModel(Model): self.model_loc = model_loc def predict(self, Example eg): - self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=True) + self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=False) theano_scores = self.predict_func(eg.embeddings)[0] cdef int i for i in range(self.n_classes): - eg.scores[i] = theano_scores[i] - eg.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes) + eg.c.scores[i] = theano_scores[i] + eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes) def train(self, Example eg): self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=False) - theano_scores, update, y = self.train_func(eg.embeddings, eg.costs, self.eta) + theano_scores, update, y, loss = self.train_func(eg.embeddings, eg.costs, + self.eta, self.mu) self.input_layer.update(update, eg.atoms, self.t, self.eta, self.mu) for i in range(self.n_classes): eg.c.scores[i] = theano_scores[i] - eg.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes) - eg.best = arg_max_if_zero(eg.c.scores, eg.c.costs, self.n_classes) - eg.cost = eg.c.costs[eg.guess] + eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes) + eg.c.best = arg_max_if_zero(eg.c.scores, eg.c.costs, self.n_classes) + eg.c.cost = eg.c.costs[eg.c.guess] + eg.c.loss = loss self.t += 1 def end_training(self): From ca30fe15826d634f1e1029578185de11a1b16f7d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 29 Jun 2015 10:56:02 +0200 Subject: [PATCH 23/30] * Use He initialization trick --- bin/parser/nn_train.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py index a686df9e0..398578037 100755 --- a/bin/parser/nn_train.py +++ 
b/bin/parser/nn_train.py @@ -77,14 +77,12 @@ def relu(x): def _init_weights(n_in, n_out): rng = numpy.random.RandomState(1234) + weights = numpy.asarray( - numpy.random.normal( - loc=0.0, - scale=0.0001, - size=(n_in, n_out)), + rng.standard_normal(size=(n_in, n_out)) * numpy.sqrt(2.0 / n_in), dtype=theano.config.floatX ) - bias = 0.2 * numpy.ones((n_out,), dtype=theano.config.floatX) + bias = numpy.zeros((n_out,), dtype=theano.config.floatX) return [AvgParam(weights, name='W'), AvgParam(bias, name='b')] From 1dff04acb531df3f375db40b7fd7c3300678a889 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 29 Jun 2015 11:45:38 +0200 Subject: [PATCH 24/30] * Apply regularization to the softmax, not the bias --- bin/parser/nn_train.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py index 398578037..89013fe2b 100755 --- a/bin/parser/nn_train.py +++ b/bin/parser/nn_train.py @@ -35,6 +35,7 @@ import numpy from collections import OrderedDict, defaultdict +theano.config.profile = False theano.config.floatX = 'float32' floatX = theano.config.floatX @@ -112,8 +113,7 @@ def compile_theano_model(n_classes, n_hidden, n_in, L1_reg, L2_reg): cost = ( -T.log(T.sum((p_y_given_x[0] + stabilizer) * T.eq(costs, 0))) - + L1(L1_reg, hidden_W.curr, hidden_b.curr) - + L2(L2_reg, hidden_W.curr, hidden_b.curr) + + L2(L2_reg, maxent_W.curr, hidden_W.curr) ) debug = theano.function( @@ -143,12 +143,12 @@ def compile_theano_model(n_classes, n_hidden, n_in, L1_reg, L2_reg): outputs=[ feed_layer( T.nnet.softmax, - maxent_W.curr, - maxent_b.curr, + maxent_W.avg, + maxent_b.avg, feed_layer( relu, - hidden_W.curr, - hidden_b.curr, + hidden_W.avg, + hidden_b.avg, x ) )[0] @@ -200,7 +200,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', (nv_tag * len(tags)) + \ (nv_label * len(labels)) debug, train_func, predict_func = compile_theano_model(n_classes, nv_hidden, - n_in, 0.0, 0.0001) + n_in, 0.0, 0.00) return TheanoModel( n_classes, ((nv_word, words), (nv_tag, tags), (nv_label, labels)), @@ -213,7 +213,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', nlp._parser = Parser(nlp.vocab.strings, dep_model_dir, nlp.ParserTransitionSystem, make_model) - print "Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %" + print "Itn.\tP.Loss\tUAS\tTag %\tToken %" log_loc = path.join(model_dir, 'job.log') for itn in range(n_iter): scorer = Scorer() @@ -274,6 +274,9 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, verbose=False, nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10, eta=0.1, mu=0.9, eval_only=False): + + + gold_train = list(read_json_file(train_loc, lambda doc: 'wsj' in doc['id'])) nlp = train(English, gold_train, model_dir, From df8179ca4f7aeee920a835155ddbf846506f83b8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 29 Jun 2015 16:39:16 +0200 Subject: [PATCH 25/30] * Add separate Param and AdadeltaParam classes. AdadeltaParam seems broken. 
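
For reference, the update rule the new AdadeltaParam class encodes is
Zeiler's Adadelta: scale each raw gradient by the ratio of the running
RMS of past updates to the running RMS of past gradients. In plain
numpy, one step looks like this (an illustrative sketch with made-up
names and toy values, not code from this patch):

    import numpy

    def adadelta_step(param, grad, accu, delta_accu, rho=0.9, eps=1e-6):
        # Decaying average of squared gradients (as in rmsprop).
        accu = rho * accu + (1 - rho) * grad ** 2
        # Scale the gradient by RMS(past updates) / RMS(gradients).
        update = grad * numpy.sqrt(delta_accu + eps) / numpy.sqrt(accu + eps)
        # Decaying average of squared updates, feeding the next numerator.
        delta_accu = rho * delta_accu + (1 - rho) * update ** 2
        return param - update, accu, delta_accu

    # Both accumulators start at zero, as in AdadeltaParam.__init__.
    w = numpy.zeros(10)
    accu = numpy.zeros_like(w)
    delta_accu = numpy.zeros_like(w)
    w, accu, delta_accu = adadelta_step(w, numpy.ones(10), accu, delta_accu)

Because eps sits under both square roots, the first steps for
unit-scale gradients move by only about sqrt(eps / (1 - rho)), which
may be part of why the class seems broken at these learning scales.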
--- bin/parser/nn_train.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py index 89013fe2b..f7b442faf 100755 --- a/bin/parser/nn_train.py +++ b/bin/parser/nn_train.py @@ -43,6 +43,39 @@ floatX = theano.config.floatX def th_share(w, name=''): return theano.shared(value=w, borrow=True, name=name) +class Param(object): + def __init__(self, numpy_data, name='?', wrapper=th_share): + self.curr = wrapper(numpy_data, name=name+'_curr') + self.step = wrapper(numpy.zeros(numpy_data.shape, numpy_data.dtype), + name=name+'_step') + + def updates(self, cost, timestep, eta, mu): + step = (mu * self.step) - T.grad(cost, self.curr) + curr = self.curr + (eta * step) + return [(self.curr, curr), (self.step, step)] + + +class AdadeltaParam(object): + def __init__(self, numpy_data, name='?', wrapper=th_share): + self.curr = wrapper(numpy_data, name=name+'_curr') + # accu: accumulate gradient magnitudes + self.accu = wrapper(numpy.zeros(numpy_data.shape, dtype=numpy_data.dtype)) + # delta_accu: accumulate update magnitudes (recursively!) + self.delta_accu = wrapper(numpy.zeros(numpy_data.shape, dtype=numpy_data.dtype)) + + def updates(self, cost, timestep, eps, rho): + # update accu (as in rmsprop) + grad = T.grad(cost, self.curr) + accu_new = rho * self.accu + (1 - rho) * grad ** 2 + + # compute parameter update, using the 'old' delta_accu + update = (grad * T.sqrt(self.delta_accu + eps) / + T.sqrt(accu_new + eps)) + # update delta_accu (as accu, but accumulating updates) + delta_accu_new = rho * self.delta_accu + (1 - rho) * update ** 2 + return [(self.curr, self.curr - update), (self.accu, accu_new), + (self.delta_accu, delta_accu_new)] + class AvgParam(object): def __init__(self, numpy_data, name='?', wrapper=th_share): From 5cd3ed42d4fea1c7cd539ad4b5e1ef6fabaa5b9a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 29 Jun 2015 16:44:42 +0200 Subject: [PATCH 26/30] * Reenable averaging --- spacy/_theano.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx index 1d7744322..4231266c3 100644 --- a/spacy/_theano.pyx +++ b/spacy/_theano.pyx @@ -29,7 +29,7 @@ cdef class TheanoModel(Model): self.model_loc = model_loc def predict(self, Example eg): - self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=False) + self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=True) theano_scores = self.predict_func(eg.embeddings)[0] cdef int i for i in range(self.n_classes): From 1135cfe50a25bbad91c5cb3b684fa85eea5be151 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 29 Jun 2015 16:45:14 +0200 Subject: [PATCH 27/30] * Tidy nn_train a bit --- bin/parser/nn_train.py | 63 +++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py index f7b442faf..2fc1958ed 100755 --- a/bin/parser/nn_train.py +++ b/bin/parser/nn_train.py @@ -85,7 +85,7 @@ class AvgParam(object): self.step = wrapper(numpy.zeros(numpy_data.shape, numpy_data.dtype), name=name+'_step') - def updates(self, cost, timestep, eta=0.001, mu=0.9): + def updates(self, cost, timestep, eta, mu): step = (mu * self.step) - T.grad(cost, self.curr) curr = self.curr + (eta * step) alpha = (1 / timestep).clip(0.001, 0.9).astype(floatX) @@ -110,7 +110,7 @@ def relu(x): def _init_weights(n_in, n_out): - rng = numpy.random.RandomState(1234) + rng = numpy.random.RandomState(1235) weights = numpy.asarray( 
rng.standard_normal(size=(n_in, n_out)) * numpy.sqrt(2.0 / n_in), @@ -125,6 +125,8 @@ def compile_theano_model(n_classes, n_hidden, n_in, L1_reg, L2_reg): is_gold = T.ivector('is_gold') x = T.vector('x') y = T.scalar('y') + y_cost = T.scalar('y_cost') + loss = T.scalar('cost') timestep = theano.shared(1) eta = T.scalar('eta').astype(floatX) mu = T.scalar('mu').astype(floatX) @@ -144,10 +146,9 @@ def compile_theano_model(n_classes, n_hidden, n_in, L1_reg, L2_reg): x)) stabilizer = 1e-8 - cost = ( - -T.log(T.sum((p_y_given_x[0] + stabilizer) * T.eq(costs, 0))) - + L2(L2_reg, maxent_W.curr, hidden_W.curr) - ) + y_cost = costs[T.argmax(p_y_given_x[0])] + + loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + stabilizer) debug = theano.function( name='debug', @@ -158,14 +159,14 @@ def compile_theano_model(n_classes, n_hidden, n_in, L1_reg, L2_reg): train_model = theano.function( name='train_model', inputs=[x, costs, eta, mu], - outputs=[p_y_given_x[0], T.grad(cost, x), T.argmax(p_y_given_x, axis=1), - cost], + outputs=[p_y_given_x[0], T.grad(loss, x), T.argmax(p_y_given_x, axis=1), + loss], updates=( [(timestep, timestep + 1)] + - maxent_W.updates(cost, timestep, eta=eta, mu=mu) + - maxent_b.updates(cost, timestep, eta=eta, mu=mu) + - hidden_W.updates(cost, timestep, eta=eta, mu=mu) + - hidden_b.updates(cost, timestep, eta=eta, mu=mu) + maxent_W.updates(loss, timestep, eta, mu) + + maxent_b.updates(loss, timestep, eta, mu) + + hidden_W.updates(loss, timestep, eta, mu) + + hidden_b.updates(loss, timestep, eta, mu) ), on_unused_input='warn' ) @@ -199,10 +200,24 @@ def score_model(scorer, nlp, annot_tuples, verbose=False): def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', - seed=0, n_sents=0, - verbose=False, - eta=0.01, mu=0.9, nv_hidden=100, - nv_word=10, nv_tag=10, nv_label=10): + eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10, + seed=0, n_sents=0, verbose=False): + def make_model(n_classes, (words, tags, labels), model_dir): + n_in = (nv_word * len(words)) + \ + (nv_tag * len(tags)) + \ + (nv_label * len(labels)) + debug, train_func, predict_func = compile_theano_model(n_classes, nv_hidden, + n_in, 0.0, 0.0) + return TheanoModel( + n_classes, + ((nv_word, words), (nv_tag, tags), (nv_label, labels)), + train_func, + predict_func, + model_loc=model_dir, + eta=eta, mu=mu, + debug=debug) + + dep_model_dir = path.join(model_dir, 'deps') pos_model_dir = path.join(model_dir, 'pos') if path.exists(dep_model_dir): @@ -227,22 +242,6 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', gold_tuples = gold_tuples[:n_sents] nlp = Language(data_dir=model_dir) - - def make_model(n_classes, (words, tags, labels), model_dir): - n_in = (nv_word * len(words)) + \ - (nv_tag * len(tags)) + \ - (nv_label * len(labels)) - debug, train_func, predict_func = compile_theano_model(n_classes, nv_hidden, - n_in, 0.0, 0.00) - return TheanoModel( - n_classes, - ((nv_word, words), (nv_tag, tags), (nv_label, labels)), - train_func, - predict_func, - model_loc=model_dir, - eta=eta, mu=mu, - debug=debug) - nlp._parser = Parser(nlp.vocab.strings, dep_model_dir, nlp.ParserTransitionSystem, make_model) From e20106fdff782e1c0de8e777af2f7e6df1ba63db Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 30 Jun 2015 14:26:32 +0200 Subject: [PATCH 28/30] * Begin reorganizing neuralnet work --- spacy/_ml.pxd | 4 ++-- spacy/_theano.pyx | 3 +-- spacy/syntax/parser.pxd | 6 +++--- spacy/syntax/parser.pyx | 20 +++++++++++++------- 4 files changed, 19 insertions(+), 14 
deletions(-) diff --git a/spacy/_ml.pxd b/spacy/_ml.pxd index 40281cad2..c695e48aa 100644 --- a/spacy/_ml.pxd +++ b/spacy/_ml.pxd @@ -21,8 +21,8 @@ cdef int arg_max_if_zero(const weight_t* scores, const int* costs, int n_classes cdef class Model: - cdef int n_classes - cdef int n_feats + cdef readonly int n_classes + cdef readonly int n_feats cdef const weight_t* score(self, atom_t* context) except NULL cdef int set_scores(self, weight_t* scores, atom_t* context) except -1 diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx index 4231266c3..cc6886321 100644 --- a/spacy/_theano.pyx +++ b/spacy/_theano.pyx @@ -10,8 +10,7 @@ from os import path cdef class TheanoModel(Model): def __init__(self, n_classes, input_spec, train_func, predict_func, model_loc=None, - eta=0.001, mu=0.9, - debug=None): + eta=0.001, mu=0.9, debug=None): if model_loc is not None and path.isdir(model_loc): model_loc = path.join(model_loc, 'model') diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd index 11ac6bbb8..7e2b3b083 100644 --- a/spacy/syntax/parser.pxd +++ b/spacy/syntax/parser.pxd @@ -8,6 +8,6 @@ from ..tokens cimport Tokens, TokenC cdef class Parser: - cdef readonly object cfg - cdef readonly Model model - cdef readonly TransitionSystem moves + cdef public object cfg + cdef public Model model + cdef public TransitionSystem moves diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 2ea60b149..c3ed0b464 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -59,13 +59,11 @@ def get_templates(name): cdef class Parser: - def __init__(self, StringStore strings, model_dir, transition_system, - get_model=Model): + def __init__(self, StringStore strings, model_dir, transition_system): assert os.path.exists(model_dir) and os.path.isdir(model_dir) self.cfg = Config.read(model_dir, 'config') self.moves = transition_system(strings, self.cfg.labels) - templates = get_templates(self.cfg.features) - self.model = get_model(self.moves.n_moves, templates, model_dir) + self.model = Model(self.moves.n_moves, self.cfg.templates, model_dir) def __call__(self, Tokens tokens): cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) @@ -73,6 +71,8 @@ cdef class Parser: cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, self.model.n_feats, self.model.n_feats) + eg.scores[0] = 10 + assert eg.c.scores[0] == 10 while not stcls.is_final(): memset(eg.c.scores, 0, eg.c.nr_class * sizeof(weight_t)) @@ -91,7 +91,9 @@ cdef class Parser: self.moves.initialize_state(stcls) cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, self.model.n_feats, self.model.n_feats) - cdef int cost = 0 + cdef weight_t loss = 0 + words = [w.orth_ for w in tokens] + cdef Transition G while not stcls.is_final(): memset(eg.c.scores, 0, eg.c.nr_class * sizeof(weight_t)) @@ -101,9 +103,13 @@ cdef class Parser: self.model.train(eg) + G = self.moves.c[eg.c.guess] + + #if eg.c.cost != 0: + # print self.moves.move_name(G.move, G.label), stcls.print_state(words) self.moves.c[eg.c.guess].do(stcls, self.moves.c[eg.c.guess].label) - cost += eg.c.cost - return cost + loss += eg.c.loss + return loss From 31b5e58aebe34e7bc2ac1b615bb99324e0304637 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 30 Jun 2015 14:26:53 +0200 Subject: [PATCH 29/30] * Begin reorganizing neuralnet work --- bin/parser/nn_train.py | 171 ++++++++++++----------------------------- 1 file changed, 49 insertions(+), 122 deletions(-) diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py index 2fc1958ed..72c9e04f1 100755 --- 
a/bin/parser/nn_train.py
+++ b/bin/parser/nn_train.py
@@ -23,7 +23,7 @@
 from spacy.gold import GoldParse
 from spacy.scorer import Scorer
 
-from spacy.syntax.parser import Parser
+from spacy.syntax.parser import Parser, get_templates
 from spacy._theano import TheanoModel
 
 import theano
@@ -40,76 +40,37 @@ theano.config.floatX = 'float32'
 floatX = theano.config.floatX
 
 
-def th_share(w, name=''):
-    return theano.shared(value=w, borrow=True, name=name)
-
-class Param(object):
-    def __init__(self, numpy_data, name='?', wrapper=th_share):
-        self.curr = wrapper(numpy_data, name=name+'_curr')
-        self.step = wrapper(numpy.zeros(numpy_data.shape, numpy_data.dtype),
-                            name=name+'_step')
-
-    def updates(self, cost, timestep, eta, mu):
-        step = (mu * self.step) - T.grad(cost, self.curr)
-        curr = self.curr + (eta * step)
-        return [(self.curr, curr), (self.step, step)]
-
-
-class AdadeltaParam(object):
-    def __init__(self, numpy_data, name='?', wrapper=th_share):
-        self.curr = wrapper(numpy_data, name=name+'_curr')
-        # accu: accumulate gradient magnitudes
-        self.accu = wrapper(numpy.zeros(numpy_data.shape, dtype=numpy_data.dtype))
-        # delta_accu: accumulate update magnitudes (recursively!)
-        self.delta_accu = wrapper(numpy.zeros(numpy_data.shape, dtype=numpy_data.dtype))
-
-    def updates(self, cost, timestep, eps, rho):
-        # update accu (as in rmsprop)
-        grad = T.grad(cost, self.curr)
-        accu_new = rho * self.accu + (1 - rho) * grad ** 2
-
-        # compute parameter update, using the 'old' delta_accu
-        update = (grad * T.sqrt(self.delta_accu + eps) /
-                  T.sqrt(accu_new + eps))
-        # update delta_accu (as accu, but accumulating updates)
-        delta_accu_new = rho * self.delta_accu + (1 - rho) * update ** 2
-        return [(self.curr, self.curr - update), (self.accu, accu_new),
-                (self.delta_accu, delta_accu_new)]
-
-
-class AvgParam(object):
-    def __init__(self, numpy_data, name='?', wrapper=th_share):
-        self.curr = wrapper(numpy_data, name=name+'_curr')
-        self.avg = self.curr
-        self.avg = wrapper(numpy_data.copy(), name=name+'_avg')
-        self.step = wrapper(numpy.zeros(numpy_data.shape, numpy_data.dtype),
-                            name=name+'_step')
-
-    def updates(self, cost, timestep, eta, mu):
-        step = (mu * self.step) - T.grad(cost, self.curr)
-        curr = self.curr + (eta * step)
-        alpha = (1 / timestep).clip(0.001, 0.9).astype(floatX)
-        avg = ((1 - alpha) * self.avg) + (alpha * curr)
-        return [(self.curr, curr), (self.step, step), (self.avg, avg)]
-
-
-def feed_layer(activation, weights, bias, input_):
-    return activation(T.dot(input_, weights) + bias)
+def L1(L1_reg, *weights):
+    return L1_reg * sum(abs(w).sum() for w in weights)
 
 
 def L2(L2_reg, *weights):
     return L2_reg * sum((w ** 2).sum() for w in weights)
 
 
-def L1(L1_reg, *weights):
-    return L1_reg * sum(abs(w).sum() for w in weights)
+def rms_prop(loss, params, eta=1.0, rho=0.9, eps=1e-6):
+    updates = OrderedDict()
+    for param in params:
+        value = param.get_value(borrow=True)
+        accu = theano.shared(numpy.zeros(value.shape, dtype=value.dtype),
+                             broadcastable=param.broadcastable)
+
+        grad = T.grad(loss, param)
+        accu_new = rho * accu + (1 - rho) * grad ** 2
+        updates[accu] = accu_new
+        updates[param] = param - (eta * grad / T.sqrt(accu_new + eps))
+    return updates
 
 
 def relu(x):
     return x * (x > 0)
 
 
-def _init_weights(n_in, n_out):
+def feed_layer(activation, weights, bias, input_):
+    return activation(T.dot(input_, weights) + bias)
+
+
+def init_weights(n_in, n_out):
     rng = numpy.random.RandomState(1235)
 
     weights = numpy.asarray(
@@ -117,57 +78,35 @@ def _init_weights(n_in, n_out):
         dtype=theano.config.floatX
    
) bias = numpy.zeros((n_out,), dtype=theano.config.floatX) - return [AvgParam(weights, name='W'), AvgParam(bias, name='b')] + return [wrapper(weights, name='W'), wrapper(bias, name='b')] -def compile_theano_model(n_classes, n_hidden, n_in, L1_reg, L2_reg): - costs = T.ivector('costs') - is_gold = T.ivector('is_gold') +def compile_model(n_classes, n_hidden, n_in, optimizer): x = T.vector('x') - y = T.scalar('y') - y_cost = T.scalar('y_cost') - loss = T.scalar('cost') - timestep = theano.shared(1) - eta = T.scalar('eta').astype(floatX) - mu = T.scalar('mu').astype(floatX) + costs = T.ivector('costs') + loss = T.scalar('loss') - maxent_W, maxent_b = _init_weights(n_hidden, n_classes) - hidden_W, hidden_b = _init_weights(n_in, n_hidden) + maxent_W, maxent_b = init_weights(n_hidden, n_classes) + hidden_W, hidden_b = init_weights(n_in, n_hidden) # Feed the inputs forward through the network p_y_given_x = feed_layer( T.nnet.softmax, - maxent_W.curr, - maxent_b.curr, + maxent_W, + maxent_b, feed_layer( relu, - hidden_W.curr, - hidden_b.curr, + hidden_W, + hidden_b, x)) - stabilizer = 1e-8 - y_cost = costs[T.argmax(p_y_given_x[0])] - - loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + stabilizer) - - debug = theano.function( - name='debug', - inputs=[x, costs], - outputs=[p_y_given_x, T.eq(costs, 0), p_y_given_x[0] * T.eq(costs, 0)], - ) + loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + 1e-8) train_model = theano.function( name='train_model', - inputs=[x, costs, eta, mu], - outputs=[p_y_given_x[0], T.grad(loss, x), T.argmax(p_y_given_x, axis=1), - loss], - updates=( - [(timestep, timestep + 1)] + - maxent_W.updates(loss, timestep, eta, mu) + - maxent_b.updates(loss, timestep, eta, mu) + - hidden_W.updates(loss, timestep, eta, mu) + - hidden_b.updates(loss, timestep, eta, mu) - ), + inputs=[x, costs], + outputs=[p_y_given_x[0], T.grad(loss, x), loss], + updates=optimizer(loss, [maxent_W, maxent_b, hidden_W, hidden_b]), on_unused_input='warn' ) @@ -177,18 +116,18 @@ def compile_theano_model(n_classes, n_hidden, n_in, L1_reg, L2_reg): outputs=[ feed_layer( T.nnet.softmax, - maxent_W.avg, - maxent_b.avg, + maxent_W, + maxent_b, feed_layer( relu, - hidden_W.avg, - hidden_b.avg, + hidden_W, + hidden_b, x ) )[0] ] ) - return debug, train_model, evaluate_model + return train_model, evaluate_model def score_model(scorer, nlp, annot_tuples, verbose=False): @@ -202,21 +141,6 @@ def score_model(scorer, nlp, annot_tuples, verbose=False): def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10, seed=0, n_sents=0, verbose=False): - def make_model(n_classes, (words, tags, labels), model_dir): - n_in = (nv_word * len(words)) + \ - (nv_tag * len(tags)) + \ - (nv_label * len(labels)) - debug, train_func, predict_func = compile_theano_model(n_classes, nv_hidden, - n_in, 0.0, 0.0) - return TheanoModel( - n_classes, - ((nv_word, words), (nv_tag, tags), (nv_label, labels)), - train_func, - predict_func, - model_loc=model_dir, - eta=eta, mu=mu, - debug=debug) - dep_model_dir = path.join(model_dir, 'deps') pos_model_dir = path.join(model_dir, 'pos') @@ -230,21 +154,24 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', Config.write(dep_model_dir, 'config', seed=seed, - features=feat_set, + templates=tuple(), labels=Language.ParserTransitionSystem.get_labels(gold_tuples), vector_lengths=(nv_word, nv_tag, nv_label), hidden_nodes=nv_hidden, eta=eta, mu=mu ) - + + # Bake-in hyper-parameters + optimizer = 
lambda loss, params: rms_prop(loss, params, eta=eta, rho=rho, eps=eps) + nlp = Language(data_dir=model_dir) + n_classes = nlp.parser.model.n_classes + train, predict = compile_model(n_classes, nv_hidden, n_in, optimizer) + nlp.parser.model = TheanoModel(n_classes, input_spec, train, + predict, model_loc) + if n_sents > 0: gold_tuples = gold_tuples[:n_sents] - - nlp = Language(data_dir=model_dir) - nlp._parser = Parser(nlp.vocab.strings, dep_model_dir, nlp.ParserTransitionSystem, - make_model) - print "Itn.\tP.Loss\tUAS\tTag %\tToken %" log_loc = path.join(model_dir, 'job.log') for itn in range(n_iter): From 341cd0c99f379d2002295c529f6da375be6fd5da Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 30 Jun 2015 14:27:11 +0200 Subject: [PATCH 30/30] * Require thinc==3.2 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b18948e50..0721a1b00 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ cython cymem == 1.11 pathlib preshed == 0.37 -thinc == 3.1 +thinc == 3.2 murmurhash == 0.24 unidecode numpy
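
For reference, the rms_prop function that patch 29 moves the training
script onto implements the standard RMSProp recurrence: keep a decaying
average of squared gradients per parameter, and divide each step by its
square root. The script is still mid-refactor at this point in the
series (init_weights, for instance, still calls a wrapper that the same
patch deletes), so here is a self-contained plain-numpy sketch of the
rule (the function name and toy values are illustrative, not code from
the patches):

    import numpy

    def rms_prop_step(param, grad, accu, eta=1.0, rho=0.9, eps=1e-6):
        # Decaying average of squared gradients, one accumulator per weight.
        accu = rho * accu + (1 - rho) * grad ** 2
        # Divide the step by the root of the accumulated magnitude.
        return param - eta * grad / numpy.sqrt(accu + eps), accu

    w = numpy.zeros(5)
    accu = numpy.zeros_like(w)
    for _ in range(3):
        grad = 0.1 * numpy.ones(5)
        w, accu = rms_prop_step(w, grad, accu)

The defaults eta=1.0, rho=0.9, eps=1e-6 mirror the Theano version's
signature; unlike the earlier momentum updates, the effective step size
here adapts per weight.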