From a4e9bdf4c171acead99a6c55ac8113194abbb4c8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 22:55:58 +0200 Subject: [PATCH 1/9] * Work on a theano-driven model for the parser --- spacy/_theano.pyx | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 spacy/_theano.pyx diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx new file mode 100644 index 000000000..1a1224596 --- /dev/null +++ b/spacy/_theano.pyx @@ -0,0 +1,44 @@ +from thinc.example cimport Example + + +cdef class TheanoModel(Model): + def __init__(self, n_classes, input_layer, train_func, predict_func, model_loc=None): + if model_loc is not None and path.isdir(model_loc): + model_loc = path.join(model_loc, 'model') + self.n_classes = n_classes + + tables = [] + lengths = [] + for window_size, n_dims, vocab_size in input_structure: + tables.append(EmbeddingTable(n_dims, vocab_size, initializer)) + lengths.append(window_size) + + self.input_layer = InputLayer(lengths, tables) + + self.train_func = train_func + self.predict_func = predict_func + + self.model_loc = model_loc + if self.model_loc and path.exists(self.model_loc): + self._model.load(self.model_loc, freq_thresh=0) + + def train(self, Instance eg): + pass + + def predict(self, Instance eg): + + cdef const weight_t* score(self, atom_t* context) except NULL: + self.set_scores(self._scores, context) + return self._scores + + cdef int set_scores(self, weight_t* scores, atom_t* context) except -1: + # TODO f(context) --> Values + self._input_layer.fill(self._x, self._values, use_avg=False) + theano_scores = self._predict(self._x) + for i in range(self.n_classes): + output[i] = theano_scores[i] + + cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1: + # TODO f(context) --> Values + self._input_layer.fill(self._x, self._values, use_avg=False) + From 886100e1a2f64cb805ebd8ca0934fefa2794ec06 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 24 Jun 2015 04:51:38 +0200 Subject: [PATCH 2/9] * Increment version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 76615e141..24b88dea9 100644 --- a/setup.py +++ b/setup.py @@ -130,7 +130,7 @@ def run_setup(exts): headers_workaround.install_headers('numpy') -VERSION = '0.85' +VERSION = '0.86' def main(modules, is_pypy): language = "cpp" includes = ['.', path.join(sys.prefix, 'include')] From 6896455884879f2845fce63097ce9fdf5856bf60 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 26 Jun 2015 06:25:36 +0200 Subject: [PATCH 3/9] * Rejig parser interface to use new thinc.api.Example class, in prep of theano model. Comment out beam search --- spacy/_ml.pyx | 36 ++++++ spacy/syntax/arc_eager.pyx | 29 ++--- spacy/syntax/ner.pyx | 21 ---- spacy/syntax/parser.pxd | 3 - spacy/syntax/parser.pyx | 178 ++++++++++++----------------- spacy/syntax/transition_system.pxd | 8 +- spacy/syntax/transition_system.pyx | 35 ++---- 7 files changed, 132 insertions(+), 178 deletions(-) diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx index be647c2dd..df66a1791 100644 --- a/spacy/_ml.pyx +++ b/spacy/_ml.pyx @@ -10,6 +10,7 @@ import cython import numpy.random from thinc.features cimport Feature, count_feats +from thinc.api cimport Example cdef int arg_max(const weight_t* scores, const int n_classes) nogil: @@ -23,6 +24,30 @@ cdef int arg_max(const weight_t* scores, const int n_classes) nogil: return best +cdef int arg_max_if_true(const weight_t* scores, const bint* is_valid, + const int n_classes) nogil: + cdef int i + cdef int best = 0 + cdef weight_t mode = -900000 + for i in range(n_classes): + if is_valid[i] and scores[i] > mode: + mode = scores[i] + best = i + return best + + +cdef int arg_max_if_zero(const weight_t* scores, const int* costs, + const int n_classes) nogil: + cdef int i + cdef int best = 0 + cdef weight_t mode = -900000 + for i in range(n_classes): + if costs[i] == 0 and scores[i] > mode: + mode = scores[i] + best = i + return best + + cdef class Model: def __init__(self, n_classes, templates, model_loc=None): if model_loc is not None and path.isdir(model_loc): @@ -34,6 +59,17 @@ cdef class Model: if self.model_loc and path.exists(self.model_loc): self._model.load(self.model_loc, freq_thresh=0) + def predict(self, Example eg): + self.set_scores(eg.scores, eg.atoms) + eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.n_classes) + + def train(self, Example eg): + self.set_scores(eg.scores, eg.atoms) + eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.n_classes) + eg.best = arg_max_if_zero(eg.scores, eg.costs, self.n_classes) + eg.cost = eg.costs[eg.guess] + self.update(eg.atoms, eg.guess, eg.best, eg.cost) + cdef const weight_t* score(self, atom_t* context) except NULL: cdef int n_feats feats = self._extractor.get_feats(context, &n_feats) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 29e62cb4e..a83e19ec2 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -398,7 +398,8 @@ cdef class ArcEager(TransitionSystem): n_valid += output[i] assert n_valid >= 1 - cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1: + cdef int set_costs(self, bint* is_valid, int* costs, + StateClass stcls, GoldParse gold) except -1: cdef int i, move, label cdef label_cost_func_t[N_MOVES] label_cost_funcs cdef move_cost_func_t[N_MOVES] move_cost_funcs @@ -423,30 +424,14 @@ cdef class ArcEager(TransitionSystem): n_gold = 0 for i in range(self.n_moves): if self.c[i].is_valid(stcls, self.c[i].label): + is_valid[i] = True move = self.c[i].move label = self.c[i].label if move_costs[move] == -1: move_costs[move] = move_cost_funcs[move](stcls, &gold.c) - output[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label) - n_gold += output[i] == 0 + costs[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label) + n_gold += costs[i] == 0 else: - output[i] = 9000 + is_valid[i] = False + costs[i] = 9000 assert n_gold >= 1 - - cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *: - cdef bint[N_MOVES] is_valid - is_valid[SHIFT] = Shift.is_valid(stcls, -1) - is_valid[REDUCE] = Reduce.is_valid(stcls, -1) - is_valid[LEFT] = LeftArc.is_valid(stcls, -1) - is_valid[RIGHT] = RightArc.is_valid(stcls, -1) - is_valid[BREAK] = Break.is_valid(stcls, -1) - cdef Transition best - cdef weight_t score = MIN_SCORE - cdef int i - for i in range(self.n_moves): - if scores[i] > score and is_valid[self.c[i].move]: - best = self.c[i] - score = scores[i] - assert best.clas < self.n_moves - assert score > MIN_SCORE, (stcls.stack_depth(), stcls.buffer_length(), stcls.is_final(), stcls._b_i, stcls.length) - return best diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 4a47a20a8..b145df7ac 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -128,27 +128,6 @@ cdef class BiluoPushDown(TransitionSystem): raise Exception(move) return t - cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *: - cdef int best = -1 - cdef weight_t score = -90000 - cdef const Transition* m - cdef int i - for i in range(self.n_moves): - m = &self.c[i] - if m.is_valid(stcls, m.label) and scores[i] > score: - best = i - score = scores[i] - assert best >= 0 - cdef Transition t = self.c[best] - t.score = score - return t - - cdef int set_valid(self, bint* output, StateClass stcls) except -1: - cdef int i - for i in range(self.n_moves): - m = &self.c[i] - output[i] = m.is_valid(stcls, m.label) - cdef class Missing: @staticmethod diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd index 103ff9c02..11ac6bbb8 100644 --- a/spacy/syntax/parser.pxd +++ b/spacy/syntax/parser.pxd @@ -11,6 +11,3 @@ cdef class Parser: cdef readonly object cfg cdef readonly Model model cdef readonly TransitionSystem moves - - cdef int _greedy_parse(self, Tokens tokens) except -1 - cdef int _beam_parse(self, Tokens tokens) except -1 diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 740e86025..4bfb0eeb1 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -20,17 +20,10 @@ from cymem.cymem cimport Pool, Address from murmurhash.mrmr cimport hash64 from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t - from util import Config -from thinc.features cimport Extractor -from thinc.features cimport Feature -from thinc.features cimport count_feats +from thinc.api cimport Example -from thinc.learner cimport LinearModel - -from thinc.search cimport Beam -from thinc.search cimport MaxViolation from ..tokens cimport Tokens, TokenC from ..strings cimport StringStore @@ -73,35 +66,86 @@ cdef class Parser: self.model = Model(self.moves.n_moves, templates, model_dir) def __call__(self, Tokens tokens): - if self.cfg.get('beam_width', 1) < 1: - self._greedy_parse(tokens) - else: - self._beam_parse(tokens) + cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) + self.moves.initialize_state(stcls) + + cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE) + while not stcls.is_final(): + eg.wipe() + fill_context(eg.atoms, stcls) + self.moves.set_valid(eg.is_valid, stcls) + + self.model.predict(eg) + + self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label) + self.moves.finalize_state(stcls) + tokens.set_parse(stcls._sent) def train(self, Tokens tokens, GoldParse gold): self.moves.preprocess_gold(gold) - if self.cfg.beam_width < 1: - return self._greedy_train(tokens, gold) - else: - return self._beam_train(tokens, gold) - - cdef int _greedy_parse(self, Tokens tokens) except -1: - cdef atom_t[CONTEXT_SIZE] context - cdef int n_feats - cdef Pool mem = Pool() cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) self.moves.initialize_state(stcls) - cdef Transition guess - words = [w.orth_ for w in tokens] + cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE) + cdef int cost = 0 while not stcls.is_final(): - fill_context(context, stcls) - scores = self.model.score(context) - guess = self.moves.best_valid(scores, stcls) - #print self.moves.move_name(guess.move, guess.label), stcls.print_state(words) - guess.do(stcls, guess.label) - assert stcls._s_i >= 0 - self.moves.finalize_state(stcls) - tokens.set_parse(stcls._sent) + eg.wipe() + fill_context(eg.atoms, stcls) + self.moves.set_costs(eg.is_valid, eg.costs, stcls, gold) + + self.model.train(eg) + + self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label) + cost += eg.cost + return cost + + +# These are passed as callbacks to thinc.search.Beam +""" +cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: + dest = _dest + src = _src + moves = _moves + dest.clone(src) + moves[clas].do(dest, moves[clas].label) + + +cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: + cdef StateClass st = StateClass.init(tokens, length) + st.fast_forward() + Py_INCREF(st) + return st + + +cdef int _check_final_state(void* _state, void* extra_args) except -1: + return (_state).is_final() + + +def _cleanup(Beam beam): + for i in range(beam.width): + Py_XDECREF(beam._states[i].content) + Py_XDECREF(beam._parents[i].content) + +cdef hash_t _hash_state(void* _state, void* _) except 0: + return _state + + #state = _state + #cdef atom_t[10] rep + + #rep[0] = state.stack[0] if state.stack_len >= 1 else 0 + #rep[1] = state.stack[-1] if state.stack_len >= 2 else 0 + #rep[2] = state.stack[-2] if state.stack_len >= 3 else 0 + #rep[3] = state.i + #rep[4] = state.sent[state.stack[0]].l_kids if state.stack_len >= 1 else 0 + #rep[5] = state.sent[state.stack[0]].r_kids if state.stack_len >= 1 else 0 + #rep[6] = state.sent[state.stack[0]].dep if state.stack_len >= 1 else 0 + #rep[7] = state.sent[state.stack[-1]].dep if state.stack_len >= 2 else 0 + #if get_left(state, get_n0(state), 1) != NULL: + # rep[8] = get_left(state, get_n0(state), 1).dep + #else: + # rep[8] = 0 + #rep[9] = state.sent[state.i].l_kids + #return hash64(rep, sizeof(atom_t) * 10, 0) + cdef int _beam_parse(self, Tokens tokens) except -1: cdef Beam beam = Beam(self.moves.n_moves, self.cfg.beam_width) @@ -115,30 +159,6 @@ cdef class Parser: tokens.set_parse(state._sent) _cleanup(beam) - def _greedy_train(self, Tokens tokens, GoldParse gold): - cdef Pool mem = Pool() - cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) - self.moves.initialize_state(stcls) - - cdef int cost - cdef const Feature* feats - cdef const weight_t* scores - cdef Transition guess - cdef Transition best - cdef atom_t[CONTEXT_SIZE] context - loss = 0 - words = [w.orth_ for w in tokens] - history = [] - while not stcls.is_final(): - fill_context(context, stcls) - scores = self.model.score(context) - guess = self.moves.best_valid(scores, stcls) - best = self.moves.best_gold(scores, stcls, gold) - cost = guess.get_cost(stcls, &gold.c, guess.label) - self.model.update(context, guess.clas, best.clas, cost) - guess.do(stcls, guess.label) - loss += cost - return loss def _beam_train(self, Tokens tokens, GoldParse gold_parse): cdef Beam pred = Beam(self.moves.n_moves, self.cfg.beam_width) @@ -201,50 +221,4 @@ cdef class Parser: count_feats(counts[clas], feats, n_feats, inc) self.moves.c[clas].do(stcls, self.moves.c[clas].label) - -# These are passed as callbacks to thinc.search.Beam - -cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: - dest = _dest - src = _src - moves = _moves - dest.clone(src) - moves[clas].do(dest, moves[clas].label) - - -cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: - cdef StateClass st = StateClass.init(tokens, length) - st.fast_forward() - Py_INCREF(st) - return st - - -cdef int _check_final_state(void* _state, void* extra_args) except -1: - return (_state).is_final() - - -def _cleanup(Beam beam): - for i in range(beam.width): - Py_XDECREF(beam._states[i].content) - Py_XDECREF(beam._parents[i].content) - -cdef hash_t _hash_state(void* _state, void* _) except 0: - return _state - - #state = _state - #cdef atom_t[10] rep - - #rep[0] = state.stack[0] if state.stack_len >= 1 else 0 - #rep[1] = state.stack[-1] if state.stack_len >= 2 else 0 - #rep[2] = state.stack[-2] if state.stack_len >= 3 else 0 - #rep[3] = state.i - #rep[4] = state.sent[state.stack[0]].l_kids if state.stack_len >= 1 else 0 - #rep[5] = state.sent[state.stack[0]].r_kids if state.stack_len >= 1 else 0 - #rep[6] = state.sent[state.stack[0]].dep if state.stack_len >= 1 else 0 - #rep[7] = state.sent[state.stack[-1]].dep if state.stack_len >= 2 else 0 - #if get_left(state, get_n0(state), 1) != NULL: - # rep[8] = get_left(state, get_n0(state), 1).dep - #else: - # rep[8] = 0 - #rep[9] = state.sent[state.i].l_kids - #return hash64(rep, sizeof(atom_t) * 10, 0) +""" diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index d9bd2b3e6..35f0ada30 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -46,9 +46,5 @@ cdef class TransitionSystem: cdef int set_valid(self, bint* output, StateClass state) except -1 - cdef int set_costs(self, int* output, StateClass state, GoldParse gold) except -1 - - cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except * - - cdef Transition best_gold(self, const weight_t* scores, StateClass state, - GoldParse gold) except * + cdef int set_costs(self, bint* is_valid, int* costs, + StateClass state, GoldParse gold) except -1 diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 927498cba..b13c75ba3 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -43,30 +43,17 @@ cdef class TransitionSystem: cdef Transition init_transition(self, int clas, int move, int label) except *: raise NotImplementedError - cdef Transition best_valid(self, const weight_t* scores, StateClass s) except *: - raise NotImplementedError - - cdef int set_valid(self, bint* output, StateClass state) except -1: - raise NotImplementedError - - cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1: + cdef int set_valid(self, bint* is_valid, StateClass stcls) except -1: cdef int i for i in range(self.n_moves): - if self.c[i].is_valid(stcls, self.c[i].label): - output[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label) + is_valid[i] = self.c[i].is_valid(stcls, self.c[i].label) + + cdef int set_costs(self, bint* is_valid, int* costs, + StateClass stcls, GoldParse gold) except -1: + cdef int i + self.set_valid(is_valid, stcls) + for i in range(self.n_moves): + if is_valid[i]: + costs[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label) else: - output[i] = 9000 - - cdef Transition best_gold(self, const weight_t* scores, StateClass stcls, - GoldParse gold) except *: - cdef Transition best - cdef weight_t score = MIN_SCORE - cdef int i - for i in range(self.n_moves): - if self.c[i].is_valid(stcls, self.c[i].label): - cost = self.c[i].get_cost(stcls, &gold.c, self.c[i].label) - if scores[i] > score and cost == 0: - best = self.c[i] - score = scores[i] - assert score > MIN_SCORE - return best + costs[i] = 9000 From 2fe98b8a9a75ff3613b71f850d4e0772674df82e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 26 Jun 2015 13:51:39 +0200 Subject: [PATCH 4/9] * Prepare for new models to be plugged in by using Example class --- setup.py | 3 +- spacy/_ml.pxd | 5 ++++ spacy/_ml.pyx | 18 +++++++----- spacy/_theano.pyx | 64 ++++++++++++++++++++--------------------- spacy/syntax/parser.pyx | 12 ++++---- 5 files changed, 56 insertions(+), 46 deletions(-) diff --git a/setup.py b/setup.py index 24b88dea9..a86e0f98d 100644 --- a/setup.py +++ b/setup.py @@ -151,7 +151,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', 'spacy.lexeme', 'spacy.vocab', 'spacy.tokens', 'spacy.spans', 'spacy.morphology', 'spacy.syntax.stateclass', - 'spacy._ml', 'spacy.tokenizer', 'spacy.en.attrs', + 'spacy._ml', 'spacy._theano', + 'spacy.tokenizer', 'spacy.en.attrs', 'spacy.en.pos', 'spacy.syntax.parser', 'spacy.syntax.transition_system', 'spacy.syntax.arc_eager', diff --git a/spacy/_ml.pxd b/spacy/_ml.pxd index add162e69..3562b4a32 100644 --- a/spacy/_ml.pxd +++ b/spacy/_ml.pxd @@ -14,9 +14,14 @@ from .tokens cimport Tokens cdef int arg_max(const weight_t* scores, const int n_classes) nogil +cdef int arg_max_if_true(const weight_t* scores, const int* is_valid, int n_classes) nogil + +cdef int arg_max_if_zero(const weight_t* scores, const int* costs, int n_classes) nogil + cdef class Model: cdef int n_classes + cdef int n_feats cdef const weight_t* score(self, atom_t* context) except NULL cdef int set_scores(self, weight_t* scores, atom_t* context) except -1 diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx index df66a1791..993d1a8ac 100644 --- a/spacy/_ml.pyx +++ b/spacy/_ml.pyx @@ -24,7 +24,7 @@ cdef int arg_max(const weight_t* scores, const int n_classes) nogil: return best -cdef int arg_max_if_true(const weight_t* scores, const bint* is_valid, +cdef int arg_max_if_true(const weight_t* scores, const int* is_valid, const int n_classes) nogil: cdef int i cdef int best = 0 @@ -54,21 +54,25 @@ cdef class Model: model_loc = path.join(model_loc, 'model') self.n_classes = n_classes self._extractor = Extractor(templates) + self.n_feats = self._extractor.n_templ self._model = LinearModel(n_classes, self._extractor.n_templ) self.model_loc = model_loc if self.model_loc and path.exists(self.model_loc): self._model.load(self.model_loc, freq_thresh=0) def predict(self, Example eg): - self.set_scores(eg.scores, eg.atoms) - eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.n_classes) + self.set_scores(eg.scores.data, eg.atoms.data) + eg.guess = arg_max_if_true(eg.scores.data, eg.is_valid.data, + self.n_classes) def train(self, Example eg): - self.set_scores(eg.scores, eg.atoms) - eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.n_classes) - eg.best = arg_max_if_zero(eg.scores, eg.costs, self.n_classes) + self.set_scores(eg.scores.data, eg.atoms.data) + eg.guess = arg_max_if_true(eg.scores.data, + eg.is_valid.data, self.n_classes) + eg.best = arg_max_if_zero(eg.scores.data, eg.costs.data, + self.n_classes) eg.cost = eg.costs[eg.guess] - self.update(eg.atoms, eg.guess, eg.best, eg.cost) + self.update(eg.atoms.data, eg.guess, eg.best, eg.cost) cdef const weight_t* score(self, atom_t* context) except NULL: cdef int n_feats diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx index 1a1224596..702208d18 100644 --- a/spacy/_theano.pyx +++ b/spacy/_theano.pyx @@ -1,44 +1,44 @@ -from thinc.example cimport Example +from thinc.api cimport Example +from thinc.typedefs cimport weight_t + +from ._ml cimport arg_max_if_true +from ._ml cimport arg_max_if_zero + +import numpy +from os import path cdef class TheanoModel(Model): - def __init__(self, n_classes, input_layer, train_func, predict_func, model_loc=None): + def __init__(self, n_classes, input_spec, train_func, predict_func, model_loc=None): if model_loc is not None and path.isdir(model_loc): model_loc = path.join(model_loc, 'model') - self.n_classes = n_classes - - tables = [] - lengths = [] - for window_size, n_dims, vocab_size in input_structure: - tables.append(EmbeddingTable(n_dims, vocab_size, initializer)) - lengths.append(window_size) - - self.input_layer = InputLayer(lengths, tables) + self.eta = 0.001 + self.mu = 0.9 + self.t = 1 + initializer = lambda: 0.2 * numpy.random.uniform(-1.0, 1.0) + self.input_layer = InputLayer(input_spec, initializer) self.train_func = train_func self.predict_func = predict_func + self.n_classes = n_classes + self.n_feats = len(self.input_layer) self.model_loc = model_loc - if self.model_loc and path.exists(self.model_loc): - self._model.load(self.model_loc, freq_thresh=0) - - def train(self, Instance eg): - pass - - def predict(self, Instance eg): - - cdef const weight_t* score(self, atom_t* context) except NULL: - self.set_scores(self._scores, context) - return self._scores - - cdef int set_scores(self, weight_t* scores, atom_t* context) except -1: - # TODO f(context) --> Values - self._input_layer.fill(self._x, self._values, use_avg=False) - theano_scores = self._predict(self._x) + + def predict(self, Example eg): + self.input_layer.fill(eg.embeddings, eg.atoms) + theano_scores = self.predict_func(eg.embeddings) + cdef int i for i in range(self.n_classes): - output[i] = theano_scores[i] + eg.scores[i] = theano_scores[i] + eg.guess = arg_max_if_true(eg.scores.data, eg.is_valid.data, + self.n_classes) - cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1: - # TODO f(context) --> Values - self._input_layer.fill(self._x, self._values, use_avg=False) - + def train(self, Example eg): + self.predict(eg) + update, t, eta, mu = self.train_func(eg.embeddings, eg.scores, eg.costs) + self.input_layer.update(eg.atoms, update, self.t, self.eta, self.mu) + eg.best = arg_max_if_zero(eg.scores.data, eg.costs.data, + self.n_classes) + eg.cost = eg.costs[eg.guess] + self.t += 1 diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 4bfb0eeb1..33ae5b497 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -69,11 +69,11 @@ cdef class Parser: cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) self.moves.initialize_state(stcls) - cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE) + cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, self.model.n_feats) while not stcls.is_final(): eg.wipe() - fill_context(eg.atoms, stcls) - self.moves.set_valid(eg.is_valid, stcls) + fill_context(eg.atoms.data, stcls) + self.moves.set_valid(eg.is_valid.data, stcls) self.model.predict(eg) @@ -85,12 +85,12 @@ cdef class Parser: self.moves.preprocess_gold(gold) cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) self.moves.initialize_state(stcls) - cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE) + cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, self.model.n_feats) cdef int cost = 0 while not stcls.is_final(): eg.wipe() - fill_context(eg.atoms, stcls) - self.moves.set_costs(eg.is_valid, eg.costs, stcls, gold) + fill_context(eg.atoms.data, stcls) + self.moves.set_costs(eg.is_valid.data, eg.costs.data, stcls, gold) self.model.train(eg) From f8bb43475e449db5fdc4eb74809b947fbca0f3a9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 Jun 2015 02:38:51 +0200 Subject: [PATCH 5/9] * Bridge to Theano working. Very disorganised. Using thinc adb60aba966ed2 --- bin/parser/nn_train.py | 255 +++++++++++++++++++++++++++++++ spacy/_theano.pxd | 13 ++ spacy/_theano.pyx | 19 ++- spacy/syntax/_parse_features.pyx | 4 + spacy/syntax/parser.pyx | 7 +- 5 files changed, 291 insertions(+), 7 deletions(-) create mode 100755 bin/parser/nn_train.py create mode 100644 spacy/_theano.pxd diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py new file mode 100755 index 000000000..375996f4f --- /dev/null +++ b/bin/parser/nn_train.py @@ -0,0 +1,255 @@ +#!/usr/bin/env python +from __future__ import division +from __future__ import unicode_literals + +import os +from os import path +import shutil +import codecs +import random + +import plac +import cProfile +import pstats +import re + +import spacy.util +from spacy.en import English +from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir + +from spacy.syntax.util import Config +from spacy.gold import read_json_file +from spacy.gold import GoldParse + +from spacy.scorer import Scorer + +from thinc.theano_nn import compile_theano_model + +from spacy.syntax.parser import Parser +from spacy._theano import TheanoModel + + +def _corrupt(c, noise_level): + if random.random() >= noise_level: + return c + elif c == ' ': + return '\n' + elif c == '\n': + return ' ' + elif c in ['.', "'", "!", "?"]: + return '' + else: + return c.lower() + + +def add_noise(orig, noise_level): + if random.random() >= noise_level: + return orig + elif type(orig) == list: + corrupted = [_corrupt(word, noise_level) for word in orig] + corrupted = [w for w in corrupted if w] + return corrupted + else: + return ''.join(_corrupt(c, noise_level) for c in orig) + + +def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False): + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + else: + tokens = nlp.tokenizer(raw_text) + nlp.tagger(tokens) + nlp.entity(tokens) + nlp.parser(tokens) + gold = GoldParse(tokens, annot_tuples) + scorer.score(tokens, gold, verbose=verbose) + + +def _merge_sents(sents): + m_deps = [[], [], [], [], [], []] + m_brackets = [] + i = 0 + for (ids, words, tags, heads, labels, ner), brackets in sents: + m_deps[0].extend(id_ + i for id_ in ids) + m_deps[1].extend(words) + m_deps[2].extend(tags) + m_deps[3].extend(head + i for head in heads) + m_deps[4].extend(labels) + m_deps[5].extend(ner) + m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets) + i += len(ids) + return [(m_deps, m_brackets)] + + +def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', + seed=0, gold_preproc=False, n_sents=0, corruption_level=0, + verbose=False, + eta=0.01, mu=0.9, n_hidden=100, word_vec_len=10, pos_vec_len=10): + dep_model_dir = path.join(model_dir, 'deps') + pos_model_dir = path.join(model_dir, 'pos') + ner_model_dir = path.join(model_dir, 'ner') + if path.exists(dep_model_dir): + shutil.rmtree(dep_model_dir) + if path.exists(pos_model_dir): + shutil.rmtree(pos_model_dir) + if path.exists(ner_model_dir): + shutil.rmtree(ner_model_dir) + os.mkdir(dep_model_dir) + os.mkdir(pos_model_dir) + os.mkdir(ner_model_dir) + setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir) + + Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, + labels=Language.ParserTransitionSystem.get_labels(gold_tuples)) + Config.write(ner_model_dir, 'config', features='ner', seed=seed, + labels=Language.EntityTransitionSystem.get_labels(gold_tuples), + beam_width=0) + + if n_sents > 0: + gold_tuples = gold_tuples[:n_sents] + + nlp = Language(data_dir=model_dir) + + def make_model(n_classes, input_spec, model_dir): + print input_spec + n_in = sum(n_cols * len(fields) for (n_cols, fields) in input_spec) + print 'Compiling' + debug, train_func, predict_func = compile_theano_model(n_classes, n_hidden, + n_in, 0.0, 0.0) + print 'Done' + return TheanoModel( + n_classes, + input_spec, + train_func, + predict_func, + model_loc=model_dir, + debug=debug) + + nlp._parser = Parser(nlp.vocab.strings, dep_model_dir, nlp.ParserTransitionSystem, + make_model) + + print "Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %" + for itn in range(n_iter): + scorer = Scorer() + loss = 0 + for raw_text, sents in gold_tuples: + if gold_preproc: + raw_text = None + else: + sents = _merge_sents(sents) + for annot_tuples, ctnt in sents: + if len(annot_tuples[1]) == 1: + continue + score_model(scorer, nlp, raw_text, annot_tuples, + verbose=verbose if itn >= 2 else False) + if raw_text is None: + words = add_noise(annot_tuples[1], corruption_level) + tokens = nlp.tokenizer.tokens_from_list(words) + else: + raw_text = add_noise(raw_text, corruption_level) + tokens = nlp.tokenizer(raw_text) + nlp.tagger(tokens) + gold = GoldParse(tokens, annot_tuples, make_projective=True) + if not gold.is_projective: + raise Exception( + "Non-projective sentence in training, after we should " + "have enforced projectivity: %s" % annot_tuples + ) + loss += nlp.parser.train(tokens, gold) + nlp.entity.train(tokens, gold) + nlp.tagger.train(tokens, gold.tags) + random.shuffle(gold_tuples) + print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f, + scorer.tags_acc, + scorer.token_acc) + nlp.parser.model.end_training() + nlp.entity.model.end_training() + nlp.tagger.model.end_training() + nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt')) + return nlp + + +def evaluate(nlp, gold_tuples, gold_preproc=True): + scorer = Scorer() + for raw_text, sents in gold_tuples: + if gold_preproc: + raw_text = None + else: + sents = _merge_sents(sents) + for annot_tuples, brackets in sents: + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + nlp.tagger(tokens) + nlp.entity(tokens) + nlp.parser(tokens) + else: + tokens = nlp(raw_text, merge_mwes=False) + gold = GoldParse(tokens, annot_tuples) + scorer.score(tokens, gold) + return scorer + + +def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None): + nlp = Language(data_dir=model_dir) + if beam_width is not None: + nlp.parser.cfg.beam_width = beam_width + gold_tuples = read_json_file(dev_loc) + scorer = Scorer() + out_file = codecs.open(out_loc, 'w', 'utf8') + for raw_text, sents in gold_tuples: + sents = _merge_sents(sents) + for annot_tuples, brackets in sents: + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + nlp.tagger(tokens) + nlp.entity(tokens) + nlp.parser(tokens) + else: + tokens = nlp(raw_text, merge_mwes=False) + gold = GoldParse(tokens, annot_tuples) + scorer.score(tokens, gold, verbose=False) + for t in tokens: + out_file.write( + '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_) + ) + return scorer + + +@plac.annotations( + train_loc=("Location of training file or directory"), + dev_loc=("Location of development file or directory"), + model_dir=("Location of output model directory",), + eval_only=("Skip training, and only evaluate", "flag", "e", bool), + corruption_level=("Amount of noise to add to training data", "option", "c", float), + gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool), + out_loc=("Out location", "option", "o", str), + n_sents=("Number of training sentences", "option", "n", int), + n_iter=("Number of training iterations", "option", "i", int), + verbose=("Verbose error reporting", "flag", "v", bool), + debug=("Debug mode", "flag", "d", bool), +) +def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, + debug=False, corruption_level=0.0, gold_preproc=False, beam_width=1, + eval_only=False): + gold_train = list(read_json_file(train_loc)) + nlp = train(English, gold_train, model_dir, + feat_set='embed', + gold_preproc=gold_preproc, n_sents=n_sents, + corruption_level=corruption_level, n_iter=n_iter, + verbose=verbose) + #if out_loc: + # write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width) + scorer = evaluate(nlp, list(read_json_file(dev_loc)), gold_preproc=gold_preproc) + + print 'TOK', 100-scorer.token_acc + print 'POS', scorer.tags_acc + print 'UAS', scorer.uas + print 'LAS', scorer.las + + print 'NER P', scorer.ents_p + print 'NER R', scorer.ents_r + print 'NER F', scorer.ents_f + + +if __name__ == '__main__': + plac.call(main) diff --git a/spacy/_theano.pxd b/spacy/_theano.pxd new file mode 100644 index 000000000..cad0736c2 --- /dev/null +++ b/spacy/_theano.pxd @@ -0,0 +1,13 @@ +from ._ml cimport Model +from thinc.nn cimport InputLayer + + +cdef class TheanoModel(Model): + cdef InputLayer input_layer + cdef object train_func + cdef object predict_func + cdef object debug + + cdef public float eta + cdef public float mu + cdef public float t diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx index 702208d18..b791c4f42 100644 --- a/spacy/_theano.pyx +++ b/spacy/_theano.pyx @@ -9,7 +9,8 @@ from os import path cdef class TheanoModel(Model): - def __init__(self, n_classes, input_spec, train_func, predict_func, model_loc=None): + def __init__(self, n_classes, input_spec, train_func, predict_func, model_loc=None, + debug=None): if model_loc is not None and path.isdir(model_loc): model_loc = path.join(model_loc, 'model') @@ -20,6 +21,7 @@ cdef class TheanoModel(Model): self.input_layer = InputLayer(input_spec, initializer) self.train_func = train_func self.predict_func = predict_func + self.debug = debug self.n_classes = n_classes self.n_feats = len(self.input_layer) @@ -27,7 +29,7 @@ cdef class TheanoModel(Model): def predict(self, Example eg): self.input_layer.fill(eg.embeddings, eg.atoms) - theano_scores = self.predict_func(eg.embeddings) + theano_scores = self.predict_func(eg.embeddings)[0] cdef int i for i in range(self.n_classes): eg.scores[i] = theano_scores[i] @@ -35,10 +37,17 @@ cdef class TheanoModel(Model): self.n_classes) def train(self, Example eg): - self.predict(eg) - update, t, eta, mu = self.train_func(eg.embeddings, eg.scores, eg.costs) - self.input_layer.update(eg.atoms, update, self.t, self.eta, self.mu) + self.input_layer.fill(eg.embeddings, eg.atoms) + theano_scores, update, y = self.train_func(eg.embeddings, eg.costs, self.eta) + self.input_layer.update(update, eg.atoms, self.t, self.eta, self.mu) + for i in range(self.n_classes): + eg.scores[i] = theano_scores[i] + eg.guess = arg_max_if_true(eg.scores.data, eg.is_valid.data, + self.n_classes) eg.best = arg_max_if_zero(eg.scores.data, eg.costs.data, self.n_classes) eg.cost = eg.costs[eg.guess] self.t += 1 + + def end_training(self): + pass diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx index efefc7273..1adeaef83 100644 --- a/spacy/syntax/_parse_features.pyx +++ b/spacy/syntax/_parse_features.pyx @@ -355,3 +355,7 @@ trigrams = ( (N0W, N0p, N0lL, N0l2L), (N0p, N0lL, N0l2L), ) + +words = (S0w, N0w, S1w, N1w) +tags = (S0p, N0p, S1p, N1p) +labels = (S0L, N0L, S1L, S2L) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 33ae5b497..66d598b88 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -52,18 +52,21 @@ def get_templates(name): return pf.ner elif name == 'debug': return pf.unigrams + elif name.startswith('embed'): + return ((10, pf.words), (10, pf.tags), (10, pf.labels)) else: return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \ pf.tree_shape + pf.trigrams) cdef class Parser: - def __init__(self, StringStore strings, model_dir, transition_system): + def __init__(self, StringStore strings, model_dir, transition_system, + get_model=Model): assert os.path.exists(model_dir) and os.path.isdir(model_dir) self.cfg = Config.read(model_dir, 'config') self.moves = transition_system(strings, self.cfg.labels) templates = get_templates(self.cfg.features) - self.model = Model(self.moves.n_moves, templates, model_dir) + self.model = get_model(self.moves.n_moves, templates, model_dir) def __call__(self, Tokens tokens): cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) From ebe630cc8dc4f8affd8cd27fd3d4be113e669b59 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 Jun 2015 04:17:29 +0200 Subject: [PATCH 6/9] * Enable more features for NN --- spacy/syntax/_parse_features.pyx | 57 ++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx index 1adeaef83..40c1818c5 100644 --- a/spacy/syntax/_parse_features.pyx +++ b/spacy/syntax/_parse_features.pyx @@ -356,6 +356,57 @@ trigrams = ( (N0p, N0lL, N0l2L), ) -words = (S0w, N0w, S1w, N1w) -tags = (S0p, N0p, S1p, N1p) -labels = (S0L, N0L, S1L, S2L) + +words = ( + S2w, + S1w, + S1rw, + S0lw, + S0l2w, + S0w, + S0r2w, + S0rw, + N0lw, + N0l2w, + N0w, + N1w, + N2w, + P1w, + P2w +) + +tags = ( + S2p, + S1p, + S1rp, + S0lp, + S0l2p, + S0p, + S0r2p, + S0rp, + N0lp, + N0l2p, + N0p, + N1p, + N2p, + P1p, + P2p +) + +labels = ( + S2L, + S1L, + S1rL, + S0lL, + S0l2L, + S0L, + S0r2L, + S0rL, + N0lL, + N0l2L, + N0L, + N1L, + N2L, + P1L, + P2L +) From da793073d0cbd83ad1464c4aba94dba5d1fe1fde Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 Jun 2015 04:18:01 +0200 Subject: [PATCH 7/9] * Wire hyperparameters to script interface --- bin/parser/nn_train.py | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py index 375996f4f..e0ae846b5 100755 --- a/bin/parser/nn_train.py +++ b/bin/parser/nn_train.py @@ -84,7 +84,8 @@ def _merge_sents(sents): def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0, gold_preproc=False, n_sents=0, corruption_level=0, verbose=False, - eta=0.01, mu=0.9, n_hidden=100, word_vec_len=10, pos_vec_len=10): + eta=0.01, mu=0.9, n_hidden=100, + nv_word=10, nv_tag=10, nv_label=10): dep_model_dir = path.join(model_dir, 'deps') pos_model_dir = path.join(model_dir, 'pos') ner_model_dir = path.join(model_dir, 'ner') @@ -99,8 +100,15 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', os.mkdir(ner_model_dir) setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir) - Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, - labels=Language.ParserTransitionSystem.get_labels(gold_tuples)) + Config.write(dep_model_dir, 'config', + seed=seed, + features=feat_set, + labels=Language.ParserTransitionSystem.get_labels(gold_tuples), + vector_lengths=(nv_word, nv_tag, nv_label), + hidden_nodes=n_hidden, + eta=eta, + mu=mu + ) Config.write(ner_model_dir, 'config', features='ner', seed=seed, labels=Language.EntityTransitionSystem.get_labels(gold_tuples), beam_width=0) @@ -110,16 +118,17 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', nlp = Language(data_dir=model_dir) - def make_model(n_classes, input_spec, model_dir): - print input_spec - n_in = sum(n_cols * len(fields) for (n_cols, fields) in input_spec) + def make_model(n_classes, (words, tags, labels), model_dir): + n_in = (nv_word * len(words)) + \ + (nv_tag * len(tags)) + \ + (nv_label * len(labels)) print 'Compiling' debug, train_func, predict_func = compile_theano_model(n_classes, n_hidden, n_in, 0.0, 0.0) print 'Done' return TheanoModel( n_classes, - input_spec, + ((nv_word, words), (nv_tag, tags), (nv_label, labels)), train_func, predict_func, model_loc=model_dir, @@ -226,14 +235,23 @@ def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None): n_sents=("Number of training sentences", "option", "n", int), n_iter=("Number of training iterations", "option", "i", int), verbose=("Verbose error reporting", "flag", "v", bool), - debug=("Debug mode", "flag", "d", bool), + + nv_word=("Word vector length", "option", "W", int), + nv_tag=("Tag vector length", "option", "T", int), + nv_label=("Label vector length", "option", "L", int), + nv_hidden=("Hidden nodes length", "option", "H", int), + eta=("Learning rate", "option", "E", float), + mu=("Momentum", "option", "M", float), ) def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, - debug=False, corruption_level=0.0, gold_preproc=False, beam_width=1, + corruption_level=0.0, gold_preproc=False, + nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10, + eta=0.1, mu=0.9, eval_only=False): gold_train = list(read_json_file(train_loc)) nlp = train(English, gold_train, model_dir, feat_set='embed', + nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, gold_preproc=gold_preproc, n_sents=n_sents, corruption_level=corruption_level, n_iter=n_iter, verbose=verbose) From ed40a8380ee4289eadae9b98da4e61c337b6ab01 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 Jun 2015 04:18:47 +0200 Subject: [PATCH 8/9] * Remove hard-coding of vector lengths --- spacy/_theano.pyx | 4 ++-- spacy/syntax/parser.pyx | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx index b791c4f42..965ee84d7 100644 --- a/spacy/_theano.pyx +++ b/spacy/_theano.pyx @@ -28,7 +28,7 @@ cdef class TheanoModel(Model): self.model_loc = model_loc def predict(self, Example eg): - self.input_layer.fill(eg.embeddings, eg.atoms) + self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=True) theano_scores = self.predict_func(eg.embeddings)[0] cdef int i for i in range(self.n_classes): @@ -37,7 +37,7 @@ cdef class TheanoModel(Model): self.n_classes) def train(self, Example eg): - self.input_layer.fill(eg.embeddings, eg.atoms) + self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=False) theano_scores, update, y = self.train_func(eg.embeddings, eg.costs, self.eta) self.input_layer.update(update, eg.atoms, self.t, self.eta, self.mu) for i in range(self.n_classes): diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 66d598b88..797ee1e56 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -53,7 +53,7 @@ def get_templates(name): elif name == 'debug': return pf.unigrams elif name.startswith('embed'): - return ((10, pf.words), (10, pf.tags), (10, pf.labels)) + return (pf.words, pf.tags, pf.labels) else: return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \ pf.tree_shape + pf.trigrams) From 65ac38919135612b319de1d6f183558d13a0f52c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 28 Jun 2015 01:29:37 +0200 Subject: [PATCH 9/9] * whitespace --- bin/parser/nn_train.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py index e0ae846b5..33ad8a8a9 100755 --- a/bin/parser/nn_train.py +++ b/bin/parser/nn_train.py @@ -84,7 +84,7 @@ def _merge_sents(sents): def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0, gold_preproc=False, n_sents=0, corruption_level=0, verbose=False, - eta=0.01, mu=0.9, n_hidden=100, + eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10): dep_model_dir = path.join(model_dir, 'deps') pos_model_dir = path.join(model_dir, 'pos') @@ -105,7 +105,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', features=feat_set, labels=Language.ParserTransitionSystem.get_labels(gold_tuples), vector_lengths=(nv_word, nv_tag, nv_label), - hidden_nodes=n_hidden, + hidden_nodes=nv_hidden, eta=eta, mu=mu ) @@ -123,7 +123,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', (nv_tag * len(tags)) + \ (nv_label * len(labels)) print 'Compiling' - debug, train_func, predict_func = compile_theano_model(n_classes, n_hidden, + debug, train_func, predict_func = compile_theano_model(n_classes, nv_hidden, n_in, 0.0, 0.0) print 'Done' return TheanoModel( @@ -251,7 +251,7 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbos gold_train = list(read_json_file(train_loc)) nlp = train(English, gold_train, model_dir, feat_set='embed', - nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, + nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, nv_hidden=nv_hidden, gold_preproc=gold_preproc, n_sents=n_sents, corruption_level=corruption_level, n_iter=n_iter, verbose=verbose)