From a4e9bdf4c171acead99a6c55ac8113194abbb4c8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 22:55:58 +0200 Subject: [PATCH 01/30] * Work on a theano-driven model for the parser --- spacy/_theano.pyx | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 spacy/_theano.pyx diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx new file mode 100644 index 000000000..1a1224596 --- /dev/null +++ b/spacy/_theano.pyx @@ -0,0 +1,44 @@ +from thinc.example cimport Example + + +cdef class TheanoModel(Model): + def __init__(self, n_classes, input_layer, train_func, predict_func, model_loc=None): + if model_loc is not None and path.isdir(model_loc): + model_loc = path.join(model_loc, 'model') + self.n_classes = n_classes + + tables = [] + lengths = [] + for window_size, n_dims, vocab_size in input_structure: + tables.append(EmbeddingTable(n_dims, vocab_size, initializer)) + lengths.append(window_size) + + self.input_layer = InputLayer(lengths, tables) + + self.train_func = train_func + self.predict_func = predict_func + + self.model_loc = model_loc + if self.model_loc and path.exists(self.model_loc): + self._model.load(self.model_loc, freq_thresh=0) + + def train(self, Instance eg): + pass + + def predict(self, Instance eg): + + cdef const weight_t* score(self, atom_t* context) except NULL: + self.set_scores(self._scores, context) + return self._scores + + cdef int set_scores(self, weight_t* scores, atom_t* context) except -1: + # TODO f(context) --> Values + self._input_layer.fill(self._x, self._values, use_avg=False) + theano_scores = self._predict(self._x) + for i in range(self.n_classes): + output[i] = theano_scores[i] + + cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1: + # TODO f(context) --> Values + self._input_layer.fill(self._x, self._values, use_avg=False) + From 886100e1a2f64cb805ebd8ca0934fefa2794ec06 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 24 Jun 2015 04:51:38 +0200 Subject: [PATCH 02/30] * Increment version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 76615e141..24b88dea9 100644 --- a/setup.py +++ b/setup.py @@ -130,7 +130,7 @@ def run_setup(exts): headers_workaround.install_headers('numpy') -VERSION = '0.85' +VERSION = '0.86' def main(modules, is_pypy): language = "cpp" includes = ['.', path.join(sys.prefix, 'include')] From 6896455884879f2845fce63097ce9fdf5856bf60 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 26 Jun 2015 06:25:36 +0200 Subject: [PATCH 03/30] * Rejig parser interface to use new thinc.api.Example class, in prep of theano model. 
Comment out beam search --- spacy/_ml.pyx | 36 ++++++ spacy/syntax/arc_eager.pyx | 29 ++--- spacy/syntax/ner.pyx | 21 ---- spacy/syntax/parser.pxd | 3 - spacy/syntax/parser.pyx | 178 ++++++++++++----------------- spacy/syntax/transition_system.pxd | 8 +- spacy/syntax/transition_system.pyx | 35 ++---- 7 files changed, 132 insertions(+), 178 deletions(-) diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx index be647c2dd..df66a1791 100644 --- a/spacy/_ml.pyx +++ b/spacy/_ml.pyx @@ -10,6 +10,7 @@ import cython import numpy.random from thinc.features cimport Feature, count_feats +from thinc.api cimport Example cdef int arg_max(const weight_t* scores, const int n_classes) nogil: @@ -23,6 +24,30 @@ cdef int arg_max(const weight_t* scores, const int n_classes) nogil: return best +cdef int arg_max_if_true(const weight_t* scores, const bint* is_valid, + const int n_classes) nogil: + cdef int i + cdef int best = 0 + cdef weight_t mode = -900000 + for i in range(n_classes): + if is_valid[i] and scores[i] > mode: + mode = scores[i] + best = i + return best + + +cdef int arg_max_if_zero(const weight_t* scores, const int* costs, + const int n_classes) nogil: + cdef int i + cdef int best = 0 + cdef weight_t mode = -900000 + for i in range(n_classes): + if costs[i] == 0 and scores[i] > mode: + mode = scores[i] + best = i + return best + + cdef class Model: def __init__(self, n_classes, templates, model_loc=None): if model_loc is not None and path.isdir(model_loc): @@ -34,6 +59,17 @@ cdef class Model: if self.model_loc and path.exists(self.model_loc): self._model.load(self.model_loc, freq_thresh=0) + def predict(self, Example eg): + self.set_scores(eg.scores, eg.atoms) + eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.n_classes) + + def train(self, Example eg): + self.set_scores(eg.scores, eg.atoms) + eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.n_classes) + eg.best = arg_max_if_zero(eg.scores, eg.costs, self.n_classes) + eg.cost = eg.costs[eg.guess] + self.update(eg.atoms, eg.guess, eg.best, eg.cost) + cdef const weight_t* score(self, atom_t* context) except NULL: cdef int n_feats feats = self._extractor.get_feats(context, &n_feats) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 29e62cb4e..a83e19ec2 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -398,7 +398,8 @@ cdef class ArcEager(TransitionSystem): n_valid += output[i] assert n_valid >= 1 - cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1: + cdef int set_costs(self, bint* is_valid, int* costs, + StateClass stcls, GoldParse gold) except -1: cdef int i, move, label cdef label_cost_func_t[N_MOVES] label_cost_funcs cdef move_cost_func_t[N_MOVES] move_cost_funcs @@ -423,30 +424,14 @@ cdef class ArcEager(TransitionSystem): n_gold = 0 for i in range(self.n_moves): if self.c[i].is_valid(stcls, self.c[i].label): + is_valid[i] = True move = self.c[i].move label = self.c[i].label if move_costs[move] == -1: move_costs[move] = move_cost_funcs[move](stcls, &gold.c) - output[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label) - n_gold += output[i] == 0 + costs[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label) + n_gold += costs[i] == 0 else: - output[i] = 9000 + is_valid[i] = False + costs[i] = 9000 assert n_gold >= 1 - - cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *: - cdef bint[N_MOVES] is_valid - is_valid[SHIFT] = Shift.is_valid(stcls, -1) - is_valid[REDUCE] = Reduce.is_valid(stcls, -1) - 
is_valid[LEFT] = LeftArc.is_valid(stcls, -1) - is_valid[RIGHT] = RightArc.is_valid(stcls, -1) - is_valid[BREAK] = Break.is_valid(stcls, -1) - cdef Transition best - cdef weight_t score = MIN_SCORE - cdef int i - for i in range(self.n_moves): - if scores[i] > score and is_valid[self.c[i].move]: - best = self.c[i] - score = scores[i] - assert best.clas < self.n_moves - assert score > MIN_SCORE, (stcls.stack_depth(), stcls.buffer_length(), stcls.is_final(), stcls._b_i, stcls.length) - return best diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 4a47a20a8..b145df7ac 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -128,27 +128,6 @@ cdef class BiluoPushDown(TransitionSystem): raise Exception(move) return t - cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *: - cdef int best = -1 - cdef weight_t score = -90000 - cdef const Transition* m - cdef int i - for i in range(self.n_moves): - m = &self.c[i] - if m.is_valid(stcls, m.label) and scores[i] > score: - best = i - score = scores[i] - assert best >= 0 - cdef Transition t = self.c[best] - t.score = score - return t - - cdef int set_valid(self, bint* output, StateClass stcls) except -1: - cdef int i - for i in range(self.n_moves): - m = &self.c[i] - output[i] = m.is_valid(stcls, m.label) - cdef class Missing: @staticmethod diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd index 103ff9c02..11ac6bbb8 100644 --- a/spacy/syntax/parser.pxd +++ b/spacy/syntax/parser.pxd @@ -11,6 +11,3 @@ cdef class Parser: cdef readonly object cfg cdef readonly Model model cdef readonly TransitionSystem moves - - cdef int _greedy_parse(self, Tokens tokens) except -1 - cdef int _beam_parse(self, Tokens tokens) except -1 diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 740e86025..4bfb0eeb1 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -20,17 +20,10 @@ from cymem.cymem cimport Pool, Address from murmurhash.mrmr cimport hash64 from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t - from util import Config -from thinc.features cimport Extractor -from thinc.features cimport Feature -from thinc.features cimport count_feats +from thinc.api cimport Example -from thinc.learner cimport LinearModel - -from thinc.search cimport Beam -from thinc.search cimport MaxViolation from ..tokens cimport Tokens, TokenC from ..strings cimport StringStore @@ -73,35 +66,86 @@ cdef class Parser: self.model = Model(self.moves.n_moves, templates, model_dir) def __call__(self, Tokens tokens): - if self.cfg.get('beam_width', 1) < 1: - self._greedy_parse(tokens) - else: - self._beam_parse(tokens) + cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) + self.moves.initialize_state(stcls) + + cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE) + while not stcls.is_final(): + eg.wipe() + fill_context(eg.atoms, stcls) + self.moves.set_valid(eg.is_valid, stcls) + + self.model.predict(eg) + + self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label) + self.moves.finalize_state(stcls) + tokens.set_parse(stcls._sent) def train(self, Tokens tokens, GoldParse gold): self.moves.preprocess_gold(gold) - if self.cfg.beam_width < 1: - return self._greedy_train(tokens, gold) - else: - return self._beam_train(tokens, gold) - - cdef int _greedy_parse(self, Tokens tokens) except -1: - cdef atom_t[CONTEXT_SIZE] context - cdef int n_feats - cdef Pool mem = Pool() cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) 
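        # The rewritten train() below mirrors __call__: fill an Example with
        # context atoms and per-move costs, let the model score and update
        # itself, then apply the predicted transition to the state.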
self.moves.initialize_state(stcls) - cdef Transition guess - words = [w.orth_ for w in tokens] + cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE) + cdef int cost = 0 while not stcls.is_final(): - fill_context(context, stcls) - scores = self.model.score(context) - guess = self.moves.best_valid(scores, stcls) - #print self.moves.move_name(guess.move, guess.label), stcls.print_state(words) - guess.do(stcls, guess.label) - assert stcls._s_i >= 0 - self.moves.finalize_state(stcls) - tokens.set_parse(stcls._sent) + eg.wipe() + fill_context(eg.atoms, stcls) + self.moves.set_costs(eg.is_valid, eg.costs, stcls, gold) + + self.model.train(eg) + + self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label) + cost += eg.cost + return cost + + +# These are passed as callbacks to thinc.search.Beam +""" +cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: + dest = _dest + src = _src + moves = _moves + dest.clone(src) + moves[clas].do(dest, moves[clas].label) + + +cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: + cdef StateClass st = StateClass.init(tokens, length) + st.fast_forward() + Py_INCREF(st) + return st + + +cdef int _check_final_state(void* _state, void* extra_args) except -1: + return (_state).is_final() + + +def _cleanup(Beam beam): + for i in range(beam.width): + Py_XDECREF(beam._states[i].content) + Py_XDECREF(beam._parents[i].content) + +cdef hash_t _hash_state(void* _state, void* _) except 0: + return _state + + #state = _state + #cdef atom_t[10] rep + + #rep[0] = state.stack[0] if state.stack_len >= 1 else 0 + #rep[1] = state.stack[-1] if state.stack_len >= 2 else 0 + #rep[2] = state.stack[-2] if state.stack_len >= 3 else 0 + #rep[3] = state.i + #rep[4] = state.sent[state.stack[0]].l_kids if state.stack_len >= 1 else 0 + #rep[5] = state.sent[state.stack[0]].r_kids if state.stack_len >= 1 else 0 + #rep[6] = state.sent[state.stack[0]].dep if state.stack_len >= 1 else 0 + #rep[7] = state.sent[state.stack[-1]].dep if state.stack_len >= 2 else 0 + #if get_left(state, get_n0(state), 1) != NULL: + # rep[8] = get_left(state, get_n0(state), 1).dep + #else: + # rep[8] = 0 + #rep[9] = state.sent[state.i].l_kids + #return hash64(rep, sizeof(atom_t) * 10, 0) + cdef int _beam_parse(self, Tokens tokens) except -1: cdef Beam beam = Beam(self.moves.n_moves, self.cfg.beam_width) @@ -115,30 +159,6 @@ cdef class Parser: tokens.set_parse(state._sent) _cleanup(beam) - def _greedy_train(self, Tokens tokens, GoldParse gold): - cdef Pool mem = Pool() - cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) - self.moves.initialize_state(stcls) - - cdef int cost - cdef const Feature* feats - cdef const weight_t* scores - cdef Transition guess - cdef Transition best - cdef atom_t[CONTEXT_SIZE] context - loss = 0 - words = [w.orth_ for w in tokens] - history = [] - while not stcls.is_final(): - fill_context(context, stcls) - scores = self.model.score(context) - guess = self.moves.best_valid(scores, stcls) - best = self.moves.best_gold(scores, stcls, gold) - cost = guess.get_cost(stcls, &gold.c, guess.label) - self.model.update(context, guess.clas, best.clas, cost) - guess.do(stcls, guess.label) - loss += cost - return loss def _beam_train(self, Tokens tokens, GoldParse gold_parse): cdef Beam pred = Beam(self.moves.n_moves, self.cfg.beam_width) @@ -201,50 +221,4 @@ cdef class Parser: count_feats(counts[clas], feats, n_feats, inc) self.moves.c[clas].do(stcls, self.moves.c[clas].label) - -# These are passed as callbacks to 
thinc.search.Beam - -cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: - dest = _dest - src = _src - moves = _moves - dest.clone(src) - moves[clas].do(dest, moves[clas].label) - - -cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: - cdef StateClass st = StateClass.init(tokens, length) - st.fast_forward() - Py_INCREF(st) - return st - - -cdef int _check_final_state(void* _state, void* extra_args) except -1: - return (_state).is_final() - - -def _cleanup(Beam beam): - for i in range(beam.width): - Py_XDECREF(beam._states[i].content) - Py_XDECREF(beam._parents[i].content) - -cdef hash_t _hash_state(void* _state, void* _) except 0: - return _state - - #state = _state - #cdef atom_t[10] rep - - #rep[0] = state.stack[0] if state.stack_len >= 1 else 0 - #rep[1] = state.stack[-1] if state.stack_len >= 2 else 0 - #rep[2] = state.stack[-2] if state.stack_len >= 3 else 0 - #rep[3] = state.i - #rep[4] = state.sent[state.stack[0]].l_kids if state.stack_len >= 1 else 0 - #rep[5] = state.sent[state.stack[0]].r_kids if state.stack_len >= 1 else 0 - #rep[6] = state.sent[state.stack[0]].dep if state.stack_len >= 1 else 0 - #rep[7] = state.sent[state.stack[-1]].dep if state.stack_len >= 2 else 0 - #if get_left(state, get_n0(state), 1) != NULL: - # rep[8] = get_left(state, get_n0(state), 1).dep - #else: - # rep[8] = 0 - #rep[9] = state.sent[state.i].l_kids - #return hash64(rep, sizeof(atom_t) * 10, 0) +""" diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index d9bd2b3e6..35f0ada30 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -46,9 +46,5 @@ cdef class TransitionSystem: cdef int set_valid(self, bint* output, StateClass state) except -1 - cdef int set_costs(self, int* output, StateClass state, GoldParse gold) except -1 - - cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except * - - cdef Transition best_gold(self, const weight_t* scores, StateClass state, - GoldParse gold) except * + cdef int set_costs(self, bint* is_valid, int* costs, + StateClass state, GoldParse gold) except -1 diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 927498cba..b13c75ba3 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -43,30 +43,17 @@ cdef class TransitionSystem: cdef Transition init_transition(self, int clas, int move, int label) except *: raise NotImplementedError - cdef Transition best_valid(self, const weight_t* scores, StateClass s) except *: - raise NotImplementedError - - cdef int set_valid(self, bint* output, StateClass state) except -1: - raise NotImplementedError - - cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1: + cdef int set_valid(self, bint* is_valid, StateClass stcls) except -1: cdef int i for i in range(self.n_moves): - if self.c[i].is_valid(stcls, self.c[i].label): - output[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label) + is_valid[i] = self.c[i].is_valid(stcls, self.c[i].label) + + cdef int set_costs(self, bint* is_valid, int* costs, + StateClass stcls, GoldParse gold) except -1: + cdef int i + self.set_valid(is_valid, stcls) + for i in range(self.n_moves): + if is_valid[i]: + costs[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label) else: - output[i] = 9000 - - cdef Transition best_gold(self, const weight_t* scores, StateClass stcls, - GoldParse gold) except *: - cdef Transition best - cdef weight_t score = 
MIN_SCORE - cdef int i - for i in range(self.n_moves): - if self.c[i].is_valid(stcls, self.c[i].label): - cost = self.c[i].get_cost(stcls, &gold.c, self.c[i].label) - if scores[i] > score and cost == 0: - best = self.c[i] - score = scores[i] - assert score > MIN_SCORE - return best + costs[i] = 9000 From 2fe98b8a9a75ff3613b71f850d4e0772674df82e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 26 Jun 2015 13:51:39 +0200 Subject: [PATCH 04/30] * Prepare for new models to be plugged in by using Example class --- setup.py | 3 +- spacy/_ml.pxd | 5 ++++ spacy/_ml.pyx | 18 +++++++----- spacy/_theano.pyx | 64 ++++++++++++++++++++--------------------- spacy/syntax/parser.pyx | 12 ++++---- 5 files changed, 56 insertions(+), 46 deletions(-) diff --git a/setup.py b/setup.py index 24b88dea9..a86e0f98d 100644 --- a/setup.py +++ b/setup.py @@ -151,7 +151,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', 'spacy.lexeme', 'spacy.vocab', 'spacy.tokens', 'spacy.spans', 'spacy.morphology', 'spacy.syntax.stateclass', - 'spacy._ml', 'spacy.tokenizer', 'spacy.en.attrs', + 'spacy._ml', 'spacy._theano', + 'spacy.tokenizer', 'spacy.en.attrs', 'spacy.en.pos', 'spacy.syntax.parser', 'spacy.syntax.transition_system', 'spacy.syntax.arc_eager', diff --git a/spacy/_ml.pxd b/spacy/_ml.pxd index add162e69..3562b4a32 100644 --- a/spacy/_ml.pxd +++ b/spacy/_ml.pxd @@ -14,9 +14,14 @@ from .tokens cimport Tokens cdef int arg_max(const weight_t* scores, const int n_classes) nogil +cdef int arg_max_if_true(const weight_t* scores, const int* is_valid, int n_classes) nogil + +cdef int arg_max_if_zero(const weight_t* scores, const int* costs, int n_classes) nogil + cdef class Model: cdef int n_classes + cdef int n_feats cdef const weight_t* score(self, atom_t* context) except NULL cdef int set_scores(self, weight_t* scores, atom_t* context) except -1 diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx index df66a1791..993d1a8ac 100644 --- a/spacy/_ml.pyx +++ b/spacy/_ml.pyx @@ -24,7 +24,7 @@ cdef int arg_max(const weight_t* scores, const int n_classes) nogil: return best -cdef int arg_max_if_true(const weight_t* scores, const bint* is_valid, +cdef int arg_max_if_true(const weight_t* scores, const int* is_valid, const int n_classes) nogil: cdef int i cdef int best = 0 @@ -54,21 +54,25 @@ cdef class Model: model_loc = path.join(model_loc, 'model') self.n_classes = n_classes self._extractor = Extractor(templates) + self.n_feats = self._extractor.n_templ self._model = LinearModel(n_classes, self._extractor.n_templ) self.model_loc = model_loc if self.model_loc and path.exists(self.model_loc): self._model.load(self.model_loc, freq_thresh=0) def predict(self, Example eg): - self.set_scores(eg.scores, eg.atoms) - eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.n_classes) + self.set_scores(eg.scores.data, eg.atoms.data) + eg.guess = arg_max_if_true(eg.scores.data, eg.is_valid.data, + self.n_classes) def train(self, Example eg): - self.set_scores(eg.scores, eg.atoms) - eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.n_classes) - eg.best = arg_max_if_zero(eg.scores, eg.costs, self.n_classes) + self.set_scores(eg.scores.data, eg.atoms.data) + eg.guess = arg_max_if_true(eg.scores.data, + eg.is_valid.data, self.n_classes) + eg.best = arg_max_if_zero(eg.scores.data, eg.costs.data, + self.n_classes) eg.cost = eg.costs[eg.guess] - self.update(eg.atoms, eg.guess, eg.best, eg.cost) + self.update(eg.atoms.data, eg.guess, eg.best, eg.cost) cdef const weight_t* score(self, atom_t* context) except NULL: cdef int n_feats diff 
--git a/spacy/_theano.pyx b/spacy/_theano.pyx index 1a1224596..702208d18 100644 --- a/spacy/_theano.pyx +++ b/spacy/_theano.pyx @@ -1,44 +1,44 @@ -from thinc.example cimport Example +from thinc.api cimport Example +from thinc.typedefs cimport weight_t + +from ._ml cimport arg_max_if_true +from ._ml cimport arg_max_if_zero + +import numpy +from os import path cdef class TheanoModel(Model): - def __init__(self, n_classes, input_layer, train_func, predict_func, model_loc=None): + def __init__(self, n_classes, input_spec, train_func, predict_func, model_loc=None): if model_loc is not None and path.isdir(model_loc): model_loc = path.join(model_loc, 'model') - self.n_classes = n_classes - - tables = [] - lengths = [] - for window_size, n_dims, vocab_size in input_structure: - tables.append(EmbeddingTable(n_dims, vocab_size, initializer)) - lengths.append(window_size) - - self.input_layer = InputLayer(lengths, tables) + self.eta = 0.001 + self.mu = 0.9 + self.t = 1 + initializer = lambda: 0.2 * numpy.random.uniform(-1.0, 1.0) + self.input_layer = InputLayer(input_spec, initializer) self.train_func = train_func self.predict_func = predict_func + self.n_classes = n_classes + self.n_feats = len(self.input_layer) self.model_loc = model_loc - if self.model_loc and path.exists(self.model_loc): - self._model.load(self.model_loc, freq_thresh=0) - - def train(self, Instance eg): - pass - - def predict(self, Instance eg): - - cdef const weight_t* score(self, atom_t* context) except NULL: - self.set_scores(self._scores, context) - return self._scores - - cdef int set_scores(self, weight_t* scores, atom_t* context) except -1: - # TODO f(context) --> Values - self._input_layer.fill(self._x, self._values, use_avg=False) - theano_scores = self._predict(self._x) + + def predict(self, Example eg): + self.input_layer.fill(eg.embeddings, eg.atoms) + theano_scores = self.predict_func(eg.embeddings) + cdef int i for i in range(self.n_classes): - output[i] = theano_scores[i] + eg.scores[i] = theano_scores[i] + eg.guess = arg_max_if_true(eg.scores.data, eg.is_valid.data, + self.n_classes) - cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1: - # TODO f(context) --> Values - self._input_layer.fill(self._x, self._values, use_avg=False) - + def train(self, Example eg): + self.predict(eg) + update, t, eta, mu = self.train_func(eg.embeddings, eg.scores, eg.costs) + self.input_layer.update(eg.atoms, update, self.t, self.eta, self.mu) + eg.best = arg_max_if_zero(eg.scores.data, eg.costs.data, + self.n_classes) + eg.cost = eg.costs[eg.guess] + self.t += 1 diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 4bfb0eeb1..33ae5b497 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -69,11 +69,11 @@ cdef class Parser: cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) self.moves.initialize_state(stcls) - cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE) + cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, self.model.n_feats) while not stcls.is_final(): eg.wipe() - fill_context(eg.atoms, stcls) - self.moves.set_valid(eg.is_valid, stcls) + fill_context(eg.atoms.data, stcls) + self.moves.set_valid(eg.is_valid.data, stcls) self.model.predict(eg) @@ -85,12 +85,12 @@ cdef class Parser: self.moves.preprocess_gold(gold) cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) self.moves.initialize_state(stcls) - cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE) + cdef Example eg = 
Example(self.model.n_classes, CONTEXT_SIZE, self.model.n_feats) cdef int cost = 0 while not stcls.is_final(): eg.wipe() - fill_context(eg.atoms, stcls) - self.moves.set_costs(eg.is_valid, eg.costs, stcls, gold) + fill_context(eg.atoms.data, stcls) + self.moves.set_costs(eg.is_valid.data, eg.costs.data, stcls, gold) self.model.train(eg) From f8bb43475e449db5fdc4eb74809b947fbca0f3a9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 Jun 2015 02:38:51 +0200 Subject: [PATCH 05/30] * Bridge to Theano working. Very disorganised. Using thinc adb60aba966ed2 --- bin/parser/nn_train.py | 255 +++++++++++++++++++++++++++++++ spacy/_theano.pxd | 13 ++ spacy/_theano.pyx | 19 ++- spacy/syntax/_parse_features.pyx | 4 + spacy/syntax/parser.pyx | 7 +- 5 files changed, 291 insertions(+), 7 deletions(-) create mode 100755 bin/parser/nn_train.py create mode 100644 spacy/_theano.pxd diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py new file mode 100755 index 000000000..375996f4f --- /dev/null +++ b/bin/parser/nn_train.py @@ -0,0 +1,255 @@ +#!/usr/bin/env python +from __future__ import division +from __future__ import unicode_literals + +import os +from os import path +import shutil +import codecs +import random + +import plac +import cProfile +import pstats +import re + +import spacy.util +from spacy.en import English +from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir + +from spacy.syntax.util import Config +from spacy.gold import read_json_file +from spacy.gold import GoldParse + +from spacy.scorer import Scorer + +from thinc.theano_nn import compile_theano_model + +from spacy.syntax.parser import Parser +from spacy._theano import TheanoModel + + +def _corrupt(c, noise_level): + if random.random() >= noise_level: + return c + elif c == ' ': + return '\n' + elif c == '\n': + return ' ' + elif c in ['.', "'", "!", "?"]: + return '' + else: + return c.lower() + + +def add_noise(orig, noise_level): + if random.random() >= noise_level: + return orig + elif type(orig) == list: + corrupted = [_corrupt(word, noise_level) for word in orig] + corrupted = [w for w in corrupted if w] + return corrupted + else: + return ''.join(_corrupt(c, noise_level) for c in orig) + + +def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False): + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + else: + tokens = nlp.tokenizer(raw_text) + nlp.tagger(tokens) + nlp.entity(tokens) + nlp.parser(tokens) + gold = GoldParse(tokens, annot_tuples) + scorer.score(tokens, gold, verbose=verbose) + + +def _merge_sents(sents): + m_deps = [[], [], [], [], [], []] + m_brackets = [] + i = 0 + for (ids, words, tags, heads, labels, ner), brackets in sents: + m_deps[0].extend(id_ + i for id_ in ids) + m_deps[1].extend(words) + m_deps[2].extend(tags) + m_deps[3].extend(head + i for head in heads) + m_deps[4].extend(labels) + m_deps[5].extend(ner) + m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets) + i += len(ids) + return [(m_deps, m_brackets)] + + +def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', + seed=0, gold_preproc=False, n_sents=0, corruption_level=0, + verbose=False, + eta=0.01, mu=0.9, n_hidden=100, word_vec_len=10, pos_vec_len=10): + dep_model_dir = path.join(model_dir, 'deps') + pos_model_dir = path.join(model_dir, 'pos') + ner_model_dir = path.join(model_dir, 'ner') + if path.exists(dep_model_dir): + shutil.rmtree(dep_model_dir) + if path.exists(pos_model_dir): + shutil.rmtree(pos_model_dir) + if 
path.exists(ner_model_dir): + shutil.rmtree(ner_model_dir) + os.mkdir(dep_model_dir) + os.mkdir(pos_model_dir) + os.mkdir(ner_model_dir) + setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir) + + Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, + labels=Language.ParserTransitionSystem.get_labels(gold_tuples)) + Config.write(ner_model_dir, 'config', features='ner', seed=seed, + labels=Language.EntityTransitionSystem.get_labels(gold_tuples), + beam_width=0) + + if n_sents > 0: + gold_tuples = gold_tuples[:n_sents] + + nlp = Language(data_dir=model_dir) + + def make_model(n_classes, input_spec, model_dir): + print input_spec + n_in = sum(n_cols * len(fields) for (n_cols, fields) in input_spec) + print 'Compiling' + debug, train_func, predict_func = compile_theano_model(n_classes, n_hidden, + n_in, 0.0, 0.0) + print 'Done' + return TheanoModel( + n_classes, + input_spec, + train_func, + predict_func, + model_loc=model_dir, + debug=debug) + + nlp._parser = Parser(nlp.vocab.strings, dep_model_dir, nlp.ParserTransitionSystem, + make_model) + + print "Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %" + for itn in range(n_iter): + scorer = Scorer() + loss = 0 + for raw_text, sents in gold_tuples: + if gold_preproc: + raw_text = None + else: + sents = _merge_sents(sents) + for annot_tuples, ctnt in sents: + if len(annot_tuples[1]) == 1: + continue + score_model(scorer, nlp, raw_text, annot_tuples, + verbose=verbose if itn >= 2 else False) + if raw_text is None: + words = add_noise(annot_tuples[1], corruption_level) + tokens = nlp.tokenizer.tokens_from_list(words) + else: + raw_text = add_noise(raw_text, corruption_level) + tokens = nlp.tokenizer(raw_text) + nlp.tagger(tokens) + gold = GoldParse(tokens, annot_tuples, make_projective=True) + if not gold.is_projective: + raise Exception( + "Non-projective sentence in training, after we should " + "have enforced projectivity: %s" % annot_tuples + ) + loss += nlp.parser.train(tokens, gold) + nlp.entity.train(tokens, gold) + nlp.tagger.train(tokens, gold.tags) + random.shuffle(gold_tuples) + print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f, + scorer.tags_acc, + scorer.token_acc) + nlp.parser.model.end_training() + nlp.entity.model.end_training() + nlp.tagger.model.end_training() + nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt')) + return nlp + + +def evaluate(nlp, gold_tuples, gold_preproc=True): + scorer = Scorer() + for raw_text, sents in gold_tuples: + if gold_preproc: + raw_text = None + else: + sents = _merge_sents(sents) + for annot_tuples, brackets in sents: + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + nlp.tagger(tokens) + nlp.entity(tokens) + nlp.parser(tokens) + else: + tokens = nlp(raw_text, merge_mwes=False) + gold = GoldParse(tokens, annot_tuples) + scorer.score(tokens, gold) + return scorer + + +def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None): + nlp = Language(data_dir=model_dir) + if beam_width is not None: + nlp.parser.cfg.beam_width = beam_width + gold_tuples = read_json_file(dev_loc) + scorer = Scorer() + out_file = codecs.open(out_loc, 'w', 'utf8') + for raw_text, sents in gold_tuples: + sents = _merge_sents(sents) + for annot_tuples, brackets in sents: + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + nlp.tagger(tokens) + nlp.entity(tokens) + nlp.parser(tokens) + else: + tokens = nlp(raw_text, merge_mwes=False) + gold = GoldParse(tokens, annot_tuples) 
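                # Score each sentence against its gold annotations before
                # writing the orth/tag/head/dep columns below.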
+ scorer.score(tokens, gold, verbose=False) + for t in tokens: + out_file.write( + '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_) + ) + return scorer + + +@plac.annotations( + train_loc=("Location of training file or directory"), + dev_loc=("Location of development file or directory"), + model_dir=("Location of output model directory",), + eval_only=("Skip training, and only evaluate", "flag", "e", bool), + corruption_level=("Amount of noise to add to training data", "option", "c", float), + gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool), + out_loc=("Out location", "option", "o", str), + n_sents=("Number of training sentences", "option", "n", int), + n_iter=("Number of training iterations", "option", "i", int), + verbose=("Verbose error reporting", "flag", "v", bool), + debug=("Debug mode", "flag", "d", bool), +) +def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, + debug=False, corruption_level=0.0, gold_preproc=False, beam_width=1, + eval_only=False): + gold_train = list(read_json_file(train_loc)) + nlp = train(English, gold_train, model_dir, + feat_set='embed', + gold_preproc=gold_preproc, n_sents=n_sents, + corruption_level=corruption_level, n_iter=n_iter, + verbose=verbose) + #if out_loc: + # write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width) + scorer = evaluate(nlp, list(read_json_file(dev_loc)), gold_preproc=gold_preproc) + + print 'TOK', 100-scorer.token_acc + print 'POS', scorer.tags_acc + print 'UAS', scorer.uas + print 'LAS', scorer.las + + print 'NER P', scorer.ents_p + print 'NER R', scorer.ents_r + print 'NER F', scorer.ents_f + + +if __name__ == '__main__': + plac.call(main) diff --git a/spacy/_theano.pxd b/spacy/_theano.pxd new file mode 100644 index 000000000..cad0736c2 --- /dev/null +++ b/spacy/_theano.pxd @@ -0,0 +1,13 @@ +from ._ml cimport Model +from thinc.nn cimport InputLayer + + +cdef class TheanoModel(Model): + cdef InputLayer input_layer + cdef object train_func + cdef object predict_func + cdef object debug + + cdef public float eta + cdef public float mu + cdef public float t diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx index 702208d18..b791c4f42 100644 --- a/spacy/_theano.pyx +++ b/spacy/_theano.pyx @@ -9,7 +9,8 @@ from os import path cdef class TheanoModel(Model): - def __init__(self, n_classes, input_spec, train_func, predict_func, model_loc=None): + def __init__(self, n_classes, input_spec, train_func, predict_func, model_loc=None, + debug=None): if model_loc is not None and path.isdir(model_loc): model_loc = path.join(model_loc, 'model') @@ -20,6 +21,7 @@ cdef class TheanoModel(Model): self.input_layer = InputLayer(input_spec, initializer) self.train_func = train_func self.predict_func = predict_func + self.debug = debug self.n_classes = n_classes self.n_feats = len(self.input_layer) @@ -27,7 +29,7 @@ cdef class TheanoModel(Model): def predict(self, Example eg): self.input_layer.fill(eg.embeddings, eg.atoms) - theano_scores = self.predict_func(eg.embeddings) + theano_scores = self.predict_func(eg.embeddings)[0] cdef int i for i in range(self.n_classes): eg.scores[i] = theano_scores[i] @@ -35,10 +37,17 @@ cdef class TheanoModel(Model): self.n_classes) def train(self, Example eg): - self.predict(eg) - update, t, eta, mu = self.train_func(eg.embeddings, eg.scores, eg.costs) - self.input_layer.update(eg.atoms, update, self.t, self.eta, self.mu) + self.input_layer.fill(eg.embeddings, eg.atoms) + theano_scores, update, y = 
self.train_func(eg.embeddings, eg.costs, self.eta) + self.input_layer.update(update, eg.atoms, self.t, self.eta, self.mu) + for i in range(self.n_classes): + eg.scores[i] = theano_scores[i] + eg.guess = arg_max_if_true(eg.scores.data, eg.is_valid.data, + self.n_classes) eg.best = arg_max_if_zero(eg.scores.data, eg.costs.data, self.n_classes) eg.cost = eg.costs[eg.guess] self.t += 1 + + def end_training(self): + pass diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx index efefc7273..1adeaef83 100644 --- a/spacy/syntax/_parse_features.pyx +++ b/spacy/syntax/_parse_features.pyx @@ -355,3 +355,7 @@ trigrams = ( (N0W, N0p, N0lL, N0l2L), (N0p, N0lL, N0l2L), ) + +words = (S0w, N0w, S1w, N1w) +tags = (S0p, N0p, S1p, N1p) +labels = (S0L, N0L, S1L, S2L) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 33ae5b497..66d598b88 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -52,18 +52,21 @@ def get_templates(name): return pf.ner elif name == 'debug': return pf.unigrams + elif name.startswith('embed'): + return ((10, pf.words), (10, pf.tags), (10, pf.labels)) else: return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \ pf.tree_shape + pf.trigrams) cdef class Parser: - def __init__(self, StringStore strings, model_dir, transition_system): + def __init__(self, StringStore strings, model_dir, transition_system, + get_model=Model): assert os.path.exists(model_dir) and os.path.isdir(model_dir) self.cfg = Config.read(model_dir, 'config') self.moves = transition_system(strings, self.cfg.labels) templates = get_templates(self.cfg.features) - self.model = Model(self.moves.n_moves, templates, model_dir) + self.model = get_model(self.moves.n_moves, templates, model_dir) def __call__(self, Tokens tokens): cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) From ebe630cc8dc4f8affd8cd27fd3d4be113e669b59 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 Jun 2015 04:17:29 +0200 Subject: [PATCH 06/30] * Enable more features for NN --- spacy/syntax/_parse_features.pyx | 57 ++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx index 1adeaef83..40c1818c5 100644 --- a/spacy/syntax/_parse_features.pyx +++ b/spacy/syntax/_parse_features.pyx @@ -356,6 +356,57 @@ trigrams = ( (N0p, N0lL, N0l2L), ) -words = (S0w, N0w, S1w, N1w) -tags = (S0p, N0p, S1p, N1p) -labels = (S0L, N0L, S1L, S2L) + +words = ( + S2w, + S1w, + S1rw, + S0lw, + S0l2w, + S0w, + S0r2w, + S0rw, + N0lw, + N0l2w, + N0w, + N1w, + N2w, + P1w, + P2w +) + +tags = ( + S2p, + S1p, + S1rp, + S0lp, + S0l2p, + S0p, + S0r2p, + S0rp, + N0lp, + N0l2p, + N0p, + N1p, + N2p, + P1p, + P2p +) + +labels = ( + S2L, + S1L, + S1rL, + S0lL, + S0l2L, + S0L, + S0r2L, + S0rL, + N0lL, + N0l2L, + N0L, + N1L, + N2L, + P1L, + P2L +) From da793073d0cbd83ad1464c4aba94dba5d1fe1fde Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 Jun 2015 04:18:01 +0200 Subject: [PATCH 07/30] * Wire hyperparameters to script interface --- bin/parser/nn_train.py | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py index 375996f4f..e0ae846b5 100755 --- a/bin/parser/nn_train.py +++ b/bin/parser/nn_train.py @@ -84,7 +84,8 @@ def _merge_sents(sents): def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0, gold_preproc=False, n_sents=0, corruption_level=0, 
verbose=False, - eta=0.01, mu=0.9, n_hidden=100, word_vec_len=10, pos_vec_len=10): + eta=0.01, mu=0.9, n_hidden=100, + nv_word=10, nv_tag=10, nv_label=10): dep_model_dir = path.join(model_dir, 'deps') pos_model_dir = path.join(model_dir, 'pos') ner_model_dir = path.join(model_dir, 'ner') @@ -99,8 +100,15 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', os.mkdir(ner_model_dir) setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir) - Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, - labels=Language.ParserTransitionSystem.get_labels(gold_tuples)) + Config.write(dep_model_dir, 'config', + seed=seed, + features=feat_set, + labels=Language.ParserTransitionSystem.get_labels(gold_tuples), + vector_lengths=(nv_word, nv_tag, nv_label), + hidden_nodes=n_hidden, + eta=eta, + mu=mu + ) Config.write(ner_model_dir, 'config', features='ner', seed=seed, labels=Language.EntityTransitionSystem.get_labels(gold_tuples), beam_width=0) @@ -110,16 +118,17 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', nlp = Language(data_dir=model_dir) - def make_model(n_classes, input_spec, model_dir): - print input_spec - n_in = sum(n_cols * len(fields) for (n_cols, fields) in input_spec) + def make_model(n_classes, (words, tags, labels), model_dir): + n_in = (nv_word * len(words)) + \ + (nv_tag * len(tags)) + \ + (nv_label * len(labels)) print 'Compiling' debug, train_func, predict_func = compile_theano_model(n_classes, n_hidden, n_in, 0.0, 0.0) print 'Done' return TheanoModel( n_classes, - input_spec, + ((nv_word, words), (nv_tag, tags), (nv_label, labels)), train_func, predict_func, model_loc=model_dir, @@ -226,14 +235,23 @@ def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None): n_sents=("Number of training sentences", "option", "n", int), n_iter=("Number of training iterations", "option", "i", int), verbose=("Verbose error reporting", "flag", "v", bool), - debug=("Debug mode", "flag", "d", bool), + + nv_word=("Word vector length", "option", "W", int), + nv_tag=("Tag vector length", "option", "T", int), + nv_label=("Label vector length", "option", "L", int), + nv_hidden=("Hidden nodes length", "option", "H", int), + eta=("Learning rate", "option", "E", float), + mu=("Momentum", "option", "M", float), ) def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, - debug=False, corruption_level=0.0, gold_preproc=False, beam_width=1, + corruption_level=0.0, gold_preproc=False, + nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10, + eta=0.1, mu=0.9, eval_only=False): gold_train = list(read_json_file(train_loc)) nlp = train(English, gold_train, model_dir, feat_set='embed', + nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, gold_preproc=gold_preproc, n_sents=n_sents, corruption_level=corruption_level, n_iter=n_iter, verbose=verbose) From ed40a8380ee4289eadae9b98da4e61c337b6ab01 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 Jun 2015 04:18:47 +0200 Subject: [PATCH 08/30] * Remove hard-coding of vector lengths --- spacy/_theano.pyx | 4 ++-- spacy/syntax/parser.pyx | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx index b791c4f42..965ee84d7 100644 --- a/spacy/_theano.pyx +++ b/spacy/_theano.pyx @@ -28,7 +28,7 @@ cdef class TheanoModel(Model): self.model_loc = model_loc def predict(self, Example eg): - self.input_layer.fill(eg.embeddings, eg.atoms) + self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=True) 
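        # use_avg=True selects the averaged embedding weights for prediction,
        # while train() below passes use_avg=False so updates apply to the
        # raw parameters.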
theano_scores = self.predict_func(eg.embeddings)[0] cdef int i for i in range(self.n_classes): @@ -37,7 +37,7 @@ cdef class TheanoModel(Model): self.n_classes) def train(self, Example eg): - self.input_layer.fill(eg.embeddings, eg.atoms) + self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=False) theano_scores, update, y = self.train_func(eg.embeddings, eg.costs, self.eta) self.input_layer.update(update, eg.atoms, self.t, self.eta, self.mu) for i in range(self.n_classes): diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 66d598b88..797ee1e56 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -53,7 +53,7 @@ def get_templates(name): elif name == 'debug': return pf.unigrams elif name.startswith('embed'): - return ((10, pf.words), (10, pf.tags), (10, pf.labels)) + return (pf.words, pf.tags, pf.labels) else: return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \ pf.tree_shape + pf.trigrams) From 65ac38919135612b319de1d6f183558d13a0f52c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 28 Jun 2015 01:29:37 +0200 Subject: [PATCH 09/30] * whitespace --- bin/parser/nn_train.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py index e0ae846b5..33ad8a8a9 100755 --- a/bin/parser/nn_train.py +++ b/bin/parser/nn_train.py @@ -84,7 +84,7 @@ def _merge_sents(sents): def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0, gold_preproc=False, n_sents=0, corruption_level=0, verbose=False, - eta=0.01, mu=0.9, n_hidden=100, + eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10): dep_model_dir = path.join(model_dir, 'deps') pos_model_dir = path.join(model_dir, 'pos') @@ -105,7 +105,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', features=feat_set, labels=Language.ParserTransitionSystem.get_labels(gold_tuples), vector_lengths=(nv_word, nv_tag, nv_label), - hidden_nodes=n_hidden, + hidden_nodes=nv_hidden, eta=eta, mu=mu ) @@ -123,7 +123,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', (nv_tag * len(tags)) + \ (nv_label * len(labels)) print 'Compiling' - debug, train_func, predict_func = compile_theano_model(n_classes, n_hidden, + debug, train_func, predict_func = compile_theano_model(n_classes, nv_hidden, n_in, 0.0, 0.0) print 'Done' return TheanoModel( @@ -251,7 +251,7 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbos gold_train = list(read_json_file(train_loc)) nlp = train(English, gold_train, model_dir, feat_set='embed', - nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, + nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, nv_hidden=nv_hidden, gold_preproc=gold_preproc, n_sents=n_sents, corruption_level=corruption_level, n_iter=n_iter, verbose=verbose) From bf33598b34d18f2766290cb1163a5c362b63cc9d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2015 22:55:58 +0200 Subject: [PATCH 10/30] * Work on a theano-driven model for the parser --- spacy/_theano.pyx | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 spacy/_theano.pyx diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx new file mode 100644 index 000000000..1a1224596 --- /dev/null +++ b/spacy/_theano.pyx @@ -0,0 +1,44 @@ +from thinc.example cimport Example + + +cdef class TheanoModel(Model): + def __init__(self, n_classes, input_layer, train_func, predict_func, model_loc=None): + if model_loc is not None and 
path.isdir(model_loc): + model_loc = path.join(model_loc, 'model') + self.n_classes = n_classes + + tables = [] + lengths = [] + for window_size, n_dims, vocab_size in input_structure: + tables.append(EmbeddingTable(n_dims, vocab_size, initializer)) + lengths.append(window_size) + + self.input_layer = InputLayer(lengths, tables) + + self.train_func = train_func + self.predict_func = predict_func + + self.model_loc = model_loc + if self.model_loc and path.exists(self.model_loc): + self._model.load(self.model_loc, freq_thresh=0) + + def train(self, Instance eg): + pass + + def predict(self, Instance eg): + + cdef const weight_t* score(self, atom_t* context) except NULL: + self.set_scores(self._scores, context) + return self._scores + + cdef int set_scores(self, weight_t* scores, atom_t* context) except -1: + # TODO f(context) --> Values + self._input_layer.fill(self._x, self._values, use_avg=False) + theano_scores = self._predict(self._x) + for i in range(self.n_classes): + output[i] = theano_scores[i] + + cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1: + # TODO f(context) --> Values + self._input_layer.fill(self._x, self._values, use_avg=False) + From 75aeccc0644c09dcd59d126cc1f627c2823bbce0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 26 Jun 2015 06:25:36 +0200 Subject: [PATCH 11/30] * Rejig parser interface to use new thinc.api.Example class, in prep of theano model. Comment out beam search --- spacy/_ml.pyx | 36 ++++++ spacy/syntax/arc_eager.pyx | 29 ++--- spacy/syntax/ner.pyx | 21 ---- spacy/syntax/parser.pxd | 3 - spacy/syntax/parser.pyx | 178 ++++++++++++----------------- spacy/syntax/transition_system.pxd | 8 +- spacy/syntax/transition_system.pyx | 35 ++---- 7 files changed, 132 insertions(+), 178 deletions(-) diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx index be647c2dd..df66a1791 100644 --- a/spacy/_ml.pyx +++ b/spacy/_ml.pyx @@ -10,6 +10,7 @@ import cython import numpy.random from thinc.features cimport Feature, count_feats +from thinc.api cimport Example cdef int arg_max(const weight_t* scores, const int n_classes) nogil: @@ -23,6 +24,30 @@ cdef int arg_max(const weight_t* scores, const int n_classes) nogil: return best +cdef int arg_max_if_true(const weight_t* scores, const bint* is_valid, + const int n_classes) nogil: + cdef int i + cdef int best = 0 + cdef weight_t mode = -900000 + for i in range(n_classes): + if is_valid[i] and scores[i] > mode: + mode = scores[i] + best = i + return best + + +cdef int arg_max_if_zero(const weight_t* scores, const int* costs, + const int n_classes) nogil: + cdef int i + cdef int best = 0 + cdef weight_t mode = -900000 + for i in range(n_classes): + if costs[i] == 0 and scores[i] > mode: + mode = scores[i] + best = i + return best + + cdef class Model: def __init__(self, n_classes, templates, model_loc=None): if model_loc is not None and path.isdir(model_loc): @@ -34,6 +59,17 @@ cdef class Model: if self.model_loc and path.exists(self.model_loc): self._model.load(self.model_loc, freq_thresh=0) + def predict(self, Example eg): + self.set_scores(eg.scores, eg.atoms) + eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.n_classes) + + def train(self, Example eg): + self.set_scores(eg.scores, eg.atoms) + eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.n_classes) + eg.best = arg_max_if_zero(eg.scores, eg.costs, self.n_classes) + eg.cost = eg.costs[eg.guess] + self.update(eg.atoms, eg.guess, eg.best, eg.cost) + cdef const weight_t* score(self, atom_t* context) except NULL: cdef int 
n_feats feats = self._extractor.get_feats(context, &n_feats) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 29e62cb4e..a83e19ec2 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -398,7 +398,8 @@ cdef class ArcEager(TransitionSystem): n_valid += output[i] assert n_valid >= 1 - cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1: + cdef int set_costs(self, bint* is_valid, int* costs, + StateClass stcls, GoldParse gold) except -1: cdef int i, move, label cdef label_cost_func_t[N_MOVES] label_cost_funcs cdef move_cost_func_t[N_MOVES] move_cost_funcs @@ -423,30 +424,14 @@ cdef class ArcEager(TransitionSystem): n_gold = 0 for i in range(self.n_moves): if self.c[i].is_valid(stcls, self.c[i].label): + is_valid[i] = True move = self.c[i].move label = self.c[i].label if move_costs[move] == -1: move_costs[move] = move_cost_funcs[move](stcls, &gold.c) - output[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label) - n_gold += output[i] == 0 + costs[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label) + n_gold += costs[i] == 0 else: - output[i] = 9000 + is_valid[i] = False + costs[i] = 9000 assert n_gold >= 1 - - cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *: - cdef bint[N_MOVES] is_valid - is_valid[SHIFT] = Shift.is_valid(stcls, -1) - is_valid[REDUCE] = Reduce.is_valid(stcls, -1) - is_valid[LEFT] = LeftArc.is_valid(stcls, -1) - is_valid[RIGHT] = RightArc.is_valid(stcls, -1) - is_valid[BREAK] = Break.is_valid(stcls, -1) - cdef Transition best - cdef weight_t score = MIN_SCORE - cdef int i - for i in range(self.n_moves): - if scores[i] > score and is_valid[self.c[i].move]: - best = self.c[i] - score = scores[i] - assert best.clas < self.n_moves - assert score > MIN_SCORE, (stcls.stack_depth(), stcls.buffer_length(), stcls.is_final(), stcls._b_i, stcls.length) - return best diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 4a47a20a8..b145df7ac 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -128,27 +128,6 @@ cdef class BiluoPushDown(TransitionSystem): raise Exception(move) return t - cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *: - cdef int best = -1 - cdef weight_t score = -90000 - cdef const Transition* m - cdef int i - for i in range(self.n_moves): - m = &self.c[i] - if m.is_valid(stcls, m.label) and scores[i] > score: - best = i - score = scores[i] - assert best >= 0 - cdef Transition t = self.c[best] - t.score = score - return t - - cdef int set_valid(self, bint* output, StateClass stcls) except -1: - cdef int i - for i in range(self.n_moves): - m = &self.c[i] - output[i] = m.is_valid(stcls, m.label) - cdef class Missing: @staticmethod diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd index 103ff9c02..11ac6bbb8 100644 --- a/spacy/syntax/parser.pxd +++ b/spacy/syntax/parser.pxd @@ -11,6 +11,3 @@ cdef class Parser: cdef readonly object cfg cdef readonly Model model cdef readonly TransitionSystem moves - - cdef int _greedy_parse(self, Tokens tokens) except -1 - cdef int _beam_parse(self, Tokens tokens) except -1 diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index e36d10a38..2f6c3cd98 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -19,17 +19,10 @@ from cymem.cymem cimport Pool, Address from murmurhash.mrmr cimport hash64 from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t - from util import Config -from 
thinc.features cimport Extractor -from thinc.features cimport Feature -from thinc.features cimport count_feats +from thinc.api cimport Example -from thinc.learner cimport LinearModel - -from thinc.search cimport Beam -from thinc.search cimport MaxViolation from ..tokens cimport Tokens, TokenC from ..strings cimport StringStore @@ -72,35 +65,86 @@ cdef class Parser: self.model = Model(self.moves.n_moves, templates, model_dir) def __call__(self, Tokens tokens): - if self.cfg.get('beam_width', 1) < 1: - self._greedy_parse(tokens) - else: - self._beam_parse(tokens) + cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) + self.moves.initialize_state(stcls) + + cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE) + while not stcls.is_final(): + eg.wipe() + fill_context(eg.atoms, stcls) + self.moves.set_valid(eg.is_valid, stcls) + + self.model.predict(eg) + + self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label) + self.moves.finalize_state(stcls) + tokens.set_parse(stcls._sent) def train(self, Tokens tokens, GoldParse gold): self.moves.preprocess_gold(gold) - if self.cfg.beam_width < 1: - return self._greedy_train(tokens, gold) - else: - return self._beam_train(tokens, gold) - - cdef int _greedy_parse(self, Tokens tokens) except -1: - cdef atom_t[CONTEXT_SIZE] context - cdef int n_feats - cdef Pool mem = Pool() cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) self.moves.initialize_state(stcls) - cdef Transition guess - words = [w.orth_ for w in tokens] + cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE) + cdef int cost = 0 while not stcls.is_final(): - fill_context(context, stcls) - scores = self.model.score(context) - guess = self.moves.best_valid(scores, stcls) - #print self.moves.move_name(guess.move, guess.label), stcls.print_state(words) - guess.do(stcls, guess.label) - assert stcls._s_i >= 0 - self.moves.finalize_state(stcls) - tokens.set_parse(stcls._sent) + eg.wipe() + fill_context(eg.atoms, stcls) + self.moves.set_costs(eg.is_valid, eg.costs, stcls, gold) + + self.model.train(eg) + + self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label) + cost += eg.cost + return cost + + +# These are passed as callbacks to thinc.search.Beam +""" +cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: + dest = _dest + src = _src + moves = _moves + dest.clone(src) + moves[clas].do(dest, moves[clas].label) + + +cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: + cdef StateClass st = StateClass.init(tokens, length) + st.fast_forward() + Py_INCREF(st) + return st + + +cdef int _check_final_state(void* _state, void* extra_args) except -1: + return (_state).is_final() + + +def _cleanup(Beam beam): + for i in range(beam.width): + Py_XDECREF(beam._states[i].content) + Py_XDECREF(beam._parents[i].content) + +cdef hash_t _hash_state(void* _state, void* _) except 0: + return _state + + #state = _state + #cdef atom_t[10] rep + + #rep[0] = state.stack[0] if state.stack_len >= 1 else 0 + #rep[1] = state.stack[-1] if state.stack_len >= 2 else 0 + #rep[2] = state.stack[-2] if state.stack_len >= 3 else 0 + #rep[3] = state.i + #rep[4] = state.sent[state.stack[0]].l_kids if state.stack_len >= 1 else 0 + #rep[5] = state.sent[state.stack[0]].r_kids if state.stack_len >= 1 else 0 + #rep[6] = state.sent[state.stack[0]].dep if state.stack_len >= 1 else 0 + #rep[7] = state.sent[state.stack[-1]].dep if state.stack_len >= 2 else 0 + #if get_left(state, get_n0(state), 1) != NULL: + # rep[8] = 
get_left(state, get_n0(state), 1).dep + #else: + # rep[8] = 0 + #rep[9] = state.sent[state.i].l_kids + #return hash64(rep, sizeof(atom_t) * 10, 0) + cdef int _beam_parse(self, Tokens tokens) except -1: cdef Beam beam = Beam(self.moves.n_moves, self.cfg.beam_width) @@ -114,30 +158,6 @@ cdef class Parser: tokens.set_parse(state._sent) _cleanup(beam) - def _greedy_train(self, Tokens tokens, GoldParse gold): - cdef Pool mem = Pool() - cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) - self.moves.initialize_state(stcls) - - cdef int cost - cdef const Feature* feats - cdef const weight_t* scores - cdef Transition guess - cdef Transition best - cdef atom_t[CONTEXT_SIZE] context - loss = 0 - words = [w.orth_ for w in tokens] - history = [] - while not stcls.is_final(): - fill_context(context, stcls) - scores = self.model.score(context) - guess = self.moves.best_valid(scores, stcls) - best = self.moves.best_gold(scores, stcls, gold) - cost = guess.get_cost(stcls, &gold.c, guess.label) - self.model.update(context, guess.clas, best.clas, cost) - guess.do(stcls, guess.label) - loss += cost - return loss def _beam_train(self, Tokens tokens, GoldParse gold_parse): cdef Beam pred = Beam(self.moves.n_moves, self.cfg.beam_width) @@ -200,50 +220,4 @@ cdef class Parser: count_feats(counts[clas], feats, n_feats, inc) self.moves.c[clas].do(stcls, self.moves.c[clas].label) - -# These are passed as callbacks to thinc.search.Beam - -cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: - dest = _dest - src = _src - moves = _moves - dest.clone(src) - moves[clas].do(dest, moves[clas].label) - - -cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: - cdef StateClass st = StateClass.init(tokens, length) - st.fast_forward() - Py_INCREF(st) - return st - - -cdef int _check_final_state(void* _state, void* extra_args) except -1: - return (_state).is_final() - - -def _cleanup(Beam beam): - for i in range(beam.width): - Py_XDECREF(beam._states[i].content) - Py_XDECREF(beam._parents[i].content) - -cdef hash_t _hash_state(void* _state, void* _) except 0: - return _state - - #state = _state - #cdef atom_t[10] rep - - #rep[0] = state.stack[0] if state.stack_len >= 1 else 0 - #rep[1] = state.stack[-1] if state.stack_len >= 2 else 0 - #rep[2] = state.stack[-2] if state.stack_len >= 3 else 0 - #rep[3] = state.i - #rep[4] = state.sent[state.stack[0]].l_kids if state.stack_len >= 1 else 0 - #rep[5] = state.sent[state.stack[0]].r_kids if state.stack_len >= 1 else 0 - #rep[6] = state.sent[state.stack[0]].dep if state.stack_len >= 1 else 0 - #rep[7] = state.sent[state.stack[-1]].dep if state.stack_len >= 2 else 0 - #if get_left(state, get_n0(state), 1) != NULL: - # rep[8] = get_left(state, get_n0(state), 1).dep - #else: - # rep[8] = 0 - #rep[9] = state.sent[state.i].l_kids - #return hash64(rep, sizeof(atom_t) * 10, 0) +""" diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index d9bd2b3e6..35f0ada30 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -46,9 +46,5 @@ cdef class TransitionSystem: cdef int set_valid(self, bint* output, StateClass state) except -1 - cdef int set_costs(self, int* output, StateClass state, GoldParse gold) except -1 - - cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except * - - cdef Transition best_gold(self, const weight_t* scores, StateClass state, - GoldParse gold) except * + cdef int set_costs(self, bint* is_valid, int* costs, + 
StateClass state, GoldParse gold) except -1 diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 927498cba..b13c75ba3 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -43,30 +43,17 @@ cdef class TransitionSystem: cdef Transition init_transition(self, int clas, int move, int label) except *: raise NotImplementedError - cdef Transition best_valid(self, const weight_t* scores, StateClass s) except *: - raise NotImplementedError - - cdef int set_valid(self, bint* output, StateClass state) except -1: - raise NotImplementedError - - cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1: + cdef int set_valid(self, bint* is_valid, StateClass stcls) except -1: cdef int i for i in range(self.n_moves): - if self.c[i].is_valid(stcls, self.c[i].label): - output[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label) + is_valid[i] = self.c[i].is_valid(stcls, self.c[i].label) + + cdef int set_costs(self, bint* is_valid, int* costs, + StateClass stcls, GoldParse gold) except -1: + cdef int i + self.set_valid(is_valid, stcls) + for i in range(self.n_moves): + if is_valid[i]: + costs[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label) else: - output[i] = 9000 - - cdef Transition best_gold(self, const weight_t* scores, StateClass stcls, - GoldParse gold) except *: - cdef Transition best - cdef weight_t score = MIN_SCORE - cdef int i - for i in range(self.n_moves): - if self.c[i].is_valid(stcls, self.c[i].label): - cost = self.c[i].get_cost(stcls, &gold.c, self.c[i].label) - if scores[i] > score and cost == 0: - best = self.c[i] - score = scores[i] - assert score > MIN_SCORE - return best + costs[i] = 9000 From 9282a8e72c567946657cf484710f60470d6e0914 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 26 Jun 2015 13:51:39 +0200 Subject: [PATCH 12/30] * Prepare for new models to be plugged in by using Example class --- setup.py | 3 +- spacy/_ml.pxd | 5 ++++ spacy/_ml.pyx | 18 +++++++----- spacy/_theano.pyx | 64 ++++++++++++++++++++--------------------- spacy/syntax/parser.pyx | 12 ++++---- 5 files changed, 56 insertions(+), 46 deletions(-) diff --git a/setup.py b/setup.py index 5e2c9f480..48e2dfe25 100644 --- a/setup.py +++ b/setup.py @@ -151,7 +151,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', 'spacy.lexeme', 'spacy.vocab', 'spacy.tokens', 'spacy.spans', 'spacy.morphology', 'spacy.syntax.stateclass', - 'spacy._ml', 'spacy.tokenizer', 'spacy.en.attrs', + 'spacy._ml', 'spacy._theano', + 'spacy.tokenizer', 'spacy.en.attrs', 'spacy.en.pos', 'spacy.syntax.parser', 'spacy.syntax.transition_system', 'spacy.syntax.arc_eager', diff --git a/spacy/_ml.pxd b/spacy/_ml.pxd index add162e69..3562b4a32 100644 --- a/spacy/_ml.pxd +++ b/spacy/_ml.pxd @@ -14,9 +14,14 @@ from .tokens cimport Tokens cdef int arg_max(const weight_t* scores, const int n_classes) nogil +cdef int arg_max_if_true(const weight_t* scores, const int* is_valid, int n_classes) nogil + +cdef int arg_max_if_zero(const weight_t* scores, const int* costs, int n_classes) nogil + cdef class Model: cdef int n_classes + cdef int n_feats cdef const weight_t* score(self, atom_t* context) except NULL cdef int set_scores(self, weight_t* scores, atom_t* context) except -1 diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx index df66a1791..993d1a8ac 100644 --- a/spacy/_ml.pyx +++ b/spacy/_ml.pyx @@ -24,7 +24,7 @@ cdef int arg_max(const weight_t* scores, const int n_classes) nogil: return best -cdef int arg_max_if_true(const weight_t* scores, 
const bint* is_valid, +cdef int arg_max_if_true(const weight_t* scores, const int* is_valid, const int n_classes) nogil: cdef int i cdef int best = 0 @@ -54,21 +54,25 @@ cdef class Model: model_loc = path.join(model_loc, 'model') self.n_classes = n_classes self._extractor = Extractor(templates) + self.n_feats = self._extractor.n_templ self._model = LinearModel(n_classes, self._extractor.n_templ) self.model_loc = model_loc if self.model_loc and path.exists(self.model_loc): self._model.load(self.model_loc, freq_thresh=0) def predict(self, Example eg): - self.set_scores(eg.scores, eg.atoms) - eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.n_classes) + self.set_scores(eg.scores.data, eg.atoms.data) + eg.guess = arg_max_if_true(eg.scores.data, eg.is_valid.data, + self.n_classes) def train(self, Example eg): - self.set_scores(eg.scores, eg.atoms) - eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.n_classes) - eg.best = arg_max_if_zero(eg.scores, eg.costs, self.n_classes) + self.set_scores(eg.scores.data, eg.atoms.data) + eg.guess = arg_max_if_true(eg.scores.data, + eg.is_valid.data, self.n_classes) + eg.best = arg_max_if_zero(eg.scores.data, eg.costs.data, + self.n_classes) eg.cost = eg.costs[eg.guess] - self.update(eg.atoms, eg.guess, eg.best, eg.cost) + self.update(eg.atoms.data, eg.guess, eg.best, eg.cost) cdef const weight_t* score(self, atom_t* context) except NULL: cdef int n_feats diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx index 1a1224596..702208d18 100644 --- a/spacy/_theano.pyx +++ b/spacy/_theano.pyx @@ -1,44 +1,44 @@ -from thinc.example cimport Example +from thinc.api cimport Example +from thinc.typedefs cimport weight_t + +from ._ml cimport arg_max_if_true +from ._ml cimport arg_max_if_zero + +import numpy +from os import path cdef class TheanoModel(Model): - def __init__(self, n_classes, input_layer, train_func, predict_func, model_loc=None): + def __init__(self, n_classes, input_spec, train_func, predict_func, model_loc=None): if model_loc is not None and path.isdir(model_loc): model_loc = path.join(model_loc, 'model') - self.n_classes = n_classes - - tables = [] - lengths = [] - for window_size, n_dims, vocab_size in input_structure: - tables.append(EmbeddingTable(n_dims, vocab_size, initializer)) - lengths.append(window_size) - - self.input_layer = InputLayer(lengths, tables) + self.eta = 0.001 + self.mu = 0.9 + self.t = 1 + initializer = lambda: 0.2 * numpy.random.uniform(-1.0, 1.0) + self.input_layer = InputLayer(input_spec, initializer) self.train_func = train_func self.predict_func = predict_func + self.n_classes = n_classes + self.n_feats = len(self.input_layer) self.model_loc = model_loc - if self.model_loc and path.exists(self.model_loc): - self._model.load(self.model_loc, freq_thresh=0) - - def train(self, Instance eg): - pass - - def predict(self, Instance eg): - - cdef const weight_t* score(self, atom_t* context) except NULL: - self.set_scores(self._scores, context) - return self._scores - - cdef int set_scores(self, weight_t* scores, atom_t* context) except -1: - # TODO f(context) --> Values - self._input_layer.fill(self._x, self._values, use_avg=False) - theano_scores = self._predict(self._x) + + def predict(self, Example eg): + self.input_layer.fill(eg.embeddings, eg.atoms) + theano_scores = self.predict_func(eg.embeddings) + cdef int i for i in range(self.n_classes): - output[i] = theano_scores[i] + eg.scores[i] = theano_scores[i] + eg.guess = arg_max_if_true(eg.scores.data, eg.is_valid.data, + self.n_classes) - cdef int update(self, atom_t* 
context, class_t guess, class_t gold, int cost) except -1: - # TODO f(context) --> Values - self._input_layer.fill(self._x, self._values, use_avg=False) - + def train(self, Example eg): + self.predict(eg) + update, t, eta, mu = self.train_func(eg.embeddings, eg.scores, eg.costs) + self.input_layer.update(eg.atoms, update, self.t, self.eta, self.mu) + eg.best = arg_max_if_zero(eg.scores.data, eg.costs.data, + self.n_classes) + eg.cost = eg.costs[eg.guess] + self.t += 1 diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 2f6c3cd98..8af7ab25d 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -68,11 +68,11 @@ cdef class Parser: cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) self.moves.initialize_state(stcls) - cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE) + cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, self.model.n_feats) while not stcls.is_final(): eg.wipe() - fill_context(eg.atoms, stcls) - self.moves.set_valid(eg.is_valid, stcls) + fill_context(eg.atoms.data, stcls) + self.moves.set_valid(eg.is_valid.data, stcls) self.model.predict(eg) @@ -84,12 +84,12 @@ cdef class Parser: self.moves.preprocess_gold(gold) cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) self.moves.initialize_state(stcls) - cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE) + cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, self.model.n_feats) cdef int cost = 0 while not stcls.is_final(): eg.wipe() - fill_context(eg.atoms, stcls) - self.moves.set_costs(eg.is_valid, eg.costs, stcls, gold) + fill_context(eg.atoms.data, stcls) + self.moves.set_costs(eg.is_valid.data, eg.costs.data, stcls, gold) self.model.train(eg) From 897dd0dd0b4c1f662c36960d3367787fd658ce1d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 28 Jun 2015 11:36:11 +0200 Subject: [PATCH 13/30] * Merge changes, and adjust Example to use memoryview --- bin/parser/nn_train.py | 255 +++++++++++++++++++++ spacy/_bu_nn.pyx | 490 ++++++++++++++++++++++++++++++++++++++++ spacy/_ml.pyx | 14 +- spacy/_nn.py | 3 + spacy/_nn.pyx | 146 ++++++++++++ spacy/_theano.pxd | 13 ++ spacy/_theano.pyx | 23 +- spacy/syntax/joint.pxd | 17 ++ spacy/syntax/joint.pyx | 452 ++++++++++++++++++++++++++++++++++++ spacy/syntax/parser.pyx | 15 +- 10 files changed, 1408 insertions(+), 20 deletions(-) create mode 100755 bin/parser/nn_train.py create mode 100644 spacy/_bu_nn.pyx create mode 100644 spacy/_nn.py create mode 100644 spacy/_nn.pyx create mode 100644 spacy/_theano.pxd create mode 100644 spacy/syntax/joint.pxd create mode 100644 spacy/syntax/joint.pyx diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py new file mode 100755 index 000000000..375996f4f --- /dev/null +++ b/bin/parser/nn_train.py @@ -0,0 +1,255 @@ +#!/usr/bin/env python +from __future__ import division +from __future__ import unicode_literals + +import os +from os import path +import shutil +import codecs +import random + +import plac +import cProfile +import pstats +import re + +import spacy.util +from spacy.en import English +from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir + +from spacy.syntax.util import Config +from spacy.gold import read_json_file +from spacy.gold import GoldParse + +from spacy.scorer import Scorer + +from thinc.theano_nn import compile_theano_model + +from spacy.syntax.parser import Parser +from spacy._theano import TheanoModel + + +def _corrupt(c, noise_level): + if random.random() >= noise_level: + return c + elif c == ' ': + return 
'\n' + elif c == '\n': + return ' ' + elif c in ['.', "'", "!", "?"]: + return '' + else: + return c.lower() + + +def add_noise(orig, noise_level): + if random.random() >= noise_level: + return orig + elif type(orig) == list: + corrupted = [_corrupt(word, noise_level) for word in orig] + corrupted = [w for w in corrupted if w] + return corrupted + else: + return ''.join(_corrupt(c, noise_level) for c in orig) + + +def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False): + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + else: + tokens = nlp.tokenizer(raw_text) + nlp.tagger(tokens) + nlp.entity(tokens) + nlp.parser(tokens) + gold = GoldParse(tokens, annot_tuples) + scorer.score(tokens, gold, verbose=verbose) + + +def _merge_sents(sents): + m_deps = [[], [], [], [], [], []] + m_brackets = [] + i = 0 + for (ids, words, tags, heads, labels, ner), brackets in sents: + m_deps[0].extend(id_ + i for id_ in ids) + m_deps[1].extend(words) + m_deps[2].extend(tags) + m_deps[3].extend(head + i for head in heads) + m_deps[4].extend(labels) + m_deps[5].extend(ner) + m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets) + i += len(ids) + return [(m_deps, m_brackets)] + + +def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', + seed=0, gold_preproc=False, n_sents=0, corruption_level=0, + verbose=False, + eta=0.01, mu=0.9, n_hidden=100, word_vec_len=10, pos_vec_len=10): + dep_model_dir = path.join(model_dir, 'deps') + pos_model_dir = path.join(model_dir, 'pos') + ner_model_dir = path.join(model_dir, 'ner') + if path.exists(dep_model_dir): + shutil.rmtree(dep_model_dir) + if path.exists(pos_model_dir): + shutil.rmtree(pos_model_dir) + if path.exists(ner_model_dir): + shutil.rmtree(ner_model_dir) + os.mkdir(dep_model_dir) + os.mkdir(pos_model_dir) + os.mkdir(ner_model_dir) + setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir) + + Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, + labels=Language.ParserTransitionSystem.get_labels(gold_tuples)) + Config.write(ner_model_dir, 'config', features='ner', seed=seed, + labels=Language.EntityTransitionSystem.get_labels(gold_tuples), + beam_width=0) + + if n_sents > 0: + gold_tuples = gold_tuples[:n_sents] + + nlp = Language(data_dir=model_dir) + + def make_model(n_classes, input_spec, model_dir): + print input_spec + n_in = sum(n_cols * len(fields) for (n_cols, fields) in input_spec) + print 'Compiling' + debug, train_func, predict_func = compile_theano_model(n_classes, n_hidden, + n_in, 0.0, 0.0) + print 'Done' + return TheanoModel( + n_classes, + input_spec, + train_func, + predict_func, + model_loc=model_dir, + debug=debug) + + nlp._parser = Parser(nlp.vocab.strings, dep_model_dir, nlp.ParserTransitionSystem, + make_model) + + print "Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %" + for itn in range(n_iter): + scorer = Scorer() + loss = 0 + for raw_text, sents in gold_tuples: + if gold_preproc: + raw_text = None + else: + sents = _merge_sents(sents) + for annot_tuples, ctnt in sents: + if len(annot_tuples[1]) == 1: + continue + score_model(scorer, nlp, raw_text, annot_tuples, + verbose=verbose if itn >= 2 else False) + if raw_text is None: + words = add_noise(annot_tuples[1], corruption_level) + tokens = nlp.tokenizer.tokens_from_list(words) + else: + raw_text = add_noise(raw_text, corruption_level) + tokens = nlp.tokenizer(raw_text) + nlp.tagger(tokens) + gold = GoldParse(tokens, annot_tuples, make_projective=True) + if not 
gold.is_projective: + raise Exception( + "Non-projective sentence in training, after we should " + "have enforced projectivity: %s" % annot_tuples + ) + loss += nlp.parser.train(tokens, gold) + nlp.entity.train(tokens, gold) + nlp.tagger.train(tokens, gold.tags) + random.shuffle(gold_tuples) + print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f, + scorer.tags_acc, + scorer.token_acc) + nlp.parser.model.end_training() + nlp.entity.model.end_training() + nlp.tagger.model.end_training() + nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt')) + return nlp + + +def evaluate(nlp, gold_tuples, gold_preproc=True): + scorer = Scorer() + for raw_text, sents in gold_tuples: + if gold_preproc: + raw_text = None + else: + sents = _merge_sents(sents) + for annot_tuples, brackets in sents: + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + nlp.tagger(tokens) + nlp.entity(tokens) + nlp.parser(tokens) + else: + tokens = nlp(raw_text, merge_mwes=False) + gold = GoldParse(tokens, annot_tuples) + scorer.score(tokens, gold) + return scorer + + +def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None): + nlp = Language(data_dir=model_dir) + if beam_width is not None: + nlp.parser.cfg.beam_width = beam_width + gold_tuples = read_json_file(dev_loc) + scorer = Scorer() + out_file = codecs.open(out_loc, 'w', 'utf8') + for raw_text, sents in gold_tuples: + sents = _merge_sents(sents) + for annot_tuples, brackets in sents: + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + nlp.tagger(tokens) + nlp.entity(tokens) + nlp.parser(tokens) + else: + tokens = nlp(raw_text, merge_mwes=False) + gold = GoldParse(tokens, annot_tuples) + scorer.score(tokens, gold, verbose=False) + for t in tokens: + out_file.write( + '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_) + ) + return scorer + + +@plac.annotations( + train_loc=("Location of training file or directory"), + dev_loc=("Location of development file or directory"), + model_dir=("Location of output model directory",), + eval_only=("Skip training, and only evaluate", "flag", "e", bool), + corruption_level=("Amount of noise to add to training data", "option", "c", float), + gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool), + out_loc=("Out location", "option", "o", str), + n_sents=("Number of training sentences", "option", "n", int), + n_iter=("Number of training iterations", "option", "i", int), + verbose=("Verbose error reporting", "flag", "v", bool), + debug=("Debug mode", "flag", "d", bool), +) +def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, + debug=False, corruption_level=0.0, gold_preproc=False, beam_width=1, + eval_only=False): + gold_train = list(read_json_file(train_loc)) + nlp = train(English, gold_train, model_dir, + feat_set='embed', + gold_preproc=gold_preproc, n_sents=n_sents, + corruption_level=corruption_level, n_iter=n_iter, + verbose=verbose) + #if out_loc: + # write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width) + scorer = evaluate(nlp, list(read_json_file(dev_loc)), gold_preproc=gold_preproc) + + print 'TOK', 100-scorer.token_acc + print 'POS', scorer.tags_acc + print 'UAS', scorer.uas + print 'LAS', scorer.las + + print 'NER P', scorer.ents_p + print 'NER R', scorer.ents_r + print 'NER F', scorer.ents_f + + +if __name__ == '__main__': + plac.call(main) diff --git a/spacy/_bu_nn.pyx b/spacy/_bu_nn.pyx new file mode 100644 
index 000000000..ae875b235
--- /dev/null
+++ b/spacy/_bu_nn.pyx
@@ -0,0 +1,490 @@
+"""Feed-forward neural network, using Theano."""
+
+import os
+import sys
+import time
+
+import numpy
+
+import theano
+import theano.tensor as T
+import gzip
+import cPickle
+
+
+def load_data(dataset):
+    ''' Loads the dataset
+
+    :type dataset: string
+    :param dataset: the path to the dataset (here MNIST)
+    '''
+
+    #############
+    # LOAD DATA #
+    #############
+
+    # Download the MNIST dataset if it is not present
+    data_dir, data_file = os.path.split(dataset)
+    if data_dir == "" and not os.path.isfile(dataset):
+        # Check if dataset is in the data directory.
+        new_path = os.path.join(
+            os.path.split(__file__)[0],
+            "..",
+            "data",
+            dataset
+        )
+        if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
+            dataset = new_path
+
+    if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':
+        import urllib
+        origin = (
+            'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
+        )
+        print 'Downloading data from %s' % origin
+        urllib.urlretrieve(origin, dataset)
+
+    print '... loading data'
+
+    # Load the dataset
+    f = gzip.open(dataset, 'rb')
+    train_set, valid_set, test_set = cPickle.load(f)
+    f.close()
+    # train_set, valid_set, test_set format: tuple(input, target)
+    # input is a numpy.ndarray of 2 dimensions (a matrix), each row
+    # corresponding to an example. target is a numpy.ndarray of 1 dimension
+    # (a vector) that has the same length as the number of rows in the input.
+    # It should give the target to the example with the same index in the
+    # input.
+
+    def shared_dataset(data_xy, borrow=True):
+        """ Function that loads the dataset into shared variables
+
+        The reason we store our dataset in shared variables is to allow
+        Theano to copy it into the GPU memory (when code is run on GPU).
+        Since copying data into the GPU is slow, copying a minibatch every
+        time one is needed (the default behaviour if the data is not in a
+        shared variable) would lead to a large decrease in performance.
+        """
+        data_x, data_y = data_xy
+        shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX),
+                                 borrow=borrow)
+        shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX),
+                                 borrow=borrow)
+        # When storing data on the GPU it has to be stored as floats,
+        # therefore we will store the labels as ``floatX`` as well
+        # (``shared_y`` does exactly that). But during our computations
+        # we need them as ints (we use labels as indices, and if they are
+        # floats it doesn't make sense), therefore instead of returning
+        # ``shared_y`` we will have to cast it to int. This little hack
+        # lets us get around this issue.
+        return shared_x, T.cast(shared_y, 'int32')
+
+    test_set_x, test_set_y = shared_dataset(test_set)
+    valid_set_x, valid_set_y = shared_dataset(valid_set)
+    train_set_x, train_set_y = shared_dataset(train_set)
+
+    rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y),
+            (test_set_x, test_set_y)]
+    return rval
+
+
+class LogisticRegression(object):
+    """Multi-class Logistic Regression Class
+
+    The logistic regression is fully described by a weight matrix :math:`W`
+    and bias vector :math:`b`. Classification is done by projecting data
+    points onto a set of hyperplanes, the distance to which is used to
+    determine a class membership probability.
+ """ + + def __init__(self, input, n_in, n_out): + """ Initialize the parameters of the logistic regression + + :type input: theano.tensor.TensorType + :param input: symbolic variable that describes the input of the + architecture (one minibatch) + + :type n_in: int + :param n_in: number of input units, the dimension of the space in + which the datapoints lie + + :type n_out: int + :param n_out: number of output units, the dimension of the space in + which the labels lie + + """ + # start-snippet-1 + # initialize with 0 the weights W as a matrix of shape (n_in, n_out) + self.W = theano.shared( + value=numpy.zeros((n_in, n_out), + dtype=theano.config.floatX + ), + name='W', + borrow=True + ) + # initialize the baises b as a vector of n_out 0s + self.b = theano.shared( + value=numpy.zeros( + (n_out,), + dtype=theano.config.floatX + ), + name='b', + borrow=True + ) + + # symbolic expression for computing the matrix of class-membership + # probabilities + # Where: + # W is a matrix where column-k represent the separation hyper plain for + # class-k + # x is a matrix where row-j represents input training sample-j + # b is a vector where element-k represent the free parameter of hyper + # plain-k + self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b) + + # symbolic description of how to compute prediction as class whose + # probability is maximal + self.y_pred = T.argmax(self.p_y_given_x, axis=1) + # end-snippet-1 + + # parameters of the model + self.params = [self.W, self.b] + + def neg_ll(self, y): + """Return the mean of the negative log-likelihood of the prediction + of this model under a given target distribution. + + .. math:: + + \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = + \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} + \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ + \ell (\theta=\{W,b\}, \mathcal{D}) + + :type y: theano.tensor.TensorType + :param y: corresponds to a vector that gives for each example the + correct label + + Note: we use the mean instead of the sum so that + the learning rate is less dependent on the batch size + """ + # start-snippet-2 + # y.shape[0] is (symbolically) the number of rows in y, i.e., + # number of examples (call it n) in the minibatch + # T.arange(y.shape[0]) is a symbolic vector which will contain + # [0,1,2,... n-1] T.log(self.p_y_given_x) is a matrix of + # Log-Probabilities (call it LP) with one row per example and + # one column per class LP[T.arange(y.shape[0]),y] is a vector + # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., + # LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is + # the mean (across minibatch examples) of the elements in v, + # i.e., the mean log-likelihood across the minibatch. 
+        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
+        # end-snippet-2
+
+    def errors(self, y):
+        """Return a float representing the number of errors in the minibatch
+        over the total number of examples of the minibatch; i.e. the zero-one
+        loss over the size of the minibatch
+
+        :type y: theano.tensor.TensorType
+        :param y: corresponds to a vector that gives for each example the
+                  correct label
+        """
+
+        # check if y has the same dimension as y_pred
+        if y.ndim != self.y_pred.ndim:
+            raise TypeError(
+                'y should have the same shape as self.y_pred',
+                ('y', y.type, 'y_pred', self.y_pred.type)
+            )
+        # check if y is of the correct datatype
+        if y.dtype.startswith('int'):
+            # the T.neq operator returns a vector of 0s and 1s, where 1
+            # represents a mistake in prediction
+            return T.mean(T.neq(self.y_pred, y))
+        else:
+            raise NotImplementedError()
+
+
+# start-snippet-1
+class HiddenLayer(object):
+    def __init__(self, rng, input, n_in, n_out, W=None, b=None,
+                 activation=T.tanh):
+        """
+        Typical hidden layer of an MLP: units are fully-connected and have
+        a sigmoidal activation function. Weight matrix W is of shape
+        (n_in, n_out) and the bias vector b is of shape (n_out,).
+
+        NOTE: The nonlinearity used here is tanh
+
+        Hidden unit activation is given by: tanh(dot(input, W) + b)
+
+        :type rng: numpy.random.RandomState
+        :param rng: a random number generator used to initialize weights
+
+        :type input: theano.tensor.dmatrix
+        :param input: a symbolic tensor of shape (n_examples, n_in)
+
+        :type n_in: int
+        :param n_in: dimensionality of input
+
+        :type n_out: int
+        :param n_out: number of hidden units
+
+        :type activation: theano.Op or function
+        :param activation: Non-linearity to be applied in the hidden
+                           layer
+        """
+        self.input = input
+        # end-snippet-1
+
+        # `W` is initialized with `W_values`, which is uniformly sampled
+        # from -sqrt(6./(n_in+n_out)) to sqrt(6./(n_in+n_out))
+        # for the tanh activation function.
+        # The output of uniform is converted using asarray to dtype
+        # theano.config.floatX so that the code is runnable on GPU.
+        # Note: optimal initialization of weights is dependent on the
+        # activation function used (among other things).
+        # For example, results presented in [Xavier10] suggest that you
+        # should use 4 times larger initial weights for sigmoid
+        # compared to tanh.
+        # We have no info for other functions, so we use the same as
+        # tanh.
+        if W is None:
+            W_values = numpy.asarray(
+                rng.uniform(
+                    low=-numpy.sqrt(6. / (n_in + n_out)),
+                    high=numpy.sqrt(6. / (n_in + n_out)),
+                    size=(n_in, n_out)
+                ),
+                dtype=theano.config.floatX
+            )
+            if activation == theano.tensor.nnet.sigmoid:
+                W_values *= 4
+
+            W = theano.shared(value=W_values, name='W', borrow=True)
+
+        if b is None:
+            b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
+            b = theano.shared(value=b_values, name='b', borrow=True)
+
+        self.W = W
+        self.b = b
+
+        lin_output = T.dot(input, self.W) + self.b
+        self.output = (
+            lin_output if activation is None
+            else activation(lin_output)
+        )
+        # parameters of the model
+        self.params = [self.W, self.b]
+
+
+# start-snippet-2
+class MLP(object):
+    """Multi-Layer Perceptron Class
+
+    A multilayer perceptron is a feedforward artificial neural network model
+    that has one or more layers of hidden units and nonlinear activations.
+    Intermediate layers usually have tanh or the sigmoid function as their
+    activation (defined here by a ``HiddenLayer`` class) while the
+    top layer is a softmax layer (defined here by a ``LogisticRegression``
+    class).
+ """ + + def __init__(self, rng, input, n_in, n_hidden, n_out): + """Initialize the parameters for the multilayer perceptron + + :type rng: numpy.random.RandomState + :param rng: a random number generator used to initialize weights + + :type input: theano.tensor.TensorType + :param input: symbolic variable that describes the input of the + architecture (one minibatch) + + :type n_in: int + :param n_in: number of input units, the dimension of the space in + which the datapoints lie + + :type n_hidden: int + :param n_hidden: number of hidden units + + :type n_out: int + :param n_out: number of output units, the dimension of the space in + which the labels lie + + """ + + # Since we are dealing with a one hidden layer MLP, this will translate + # into a HiddenLayer with a tanh activation function connected to the + # LogisticRegression layer; the activation function can be replaced by + # sigmoid or any other nonlinear function + self.hidden = HiddenLayer( + rng=rng, + input=input, + n_in=n_in, + n_out=n_hidden, + activation=T.tanh + ) + + # The logistic regression layer gets as input the hidden units + # of the hidden layer + self.maxent = LogisticRegression( + input=self.hidden.output, + n_in=n_hidden, + n_out=n_out + ) + # L1 norm ; one regularization option is to enforce L1 norm to + # be small + self.L1 = abs(self.hidden.W).sum() + abs(self.maxent.W).sum() + + # square of L2 norm ; one regularization option is to enforce + # square of L2 norm to be small + self.L2_sqr = (self.hidden.W ** 2).sum() + (self.maxent.W ** 2).sum() + + # negative log likelihood of the MLP is given by the negative + # log likelihood of the output of the model, computed in the + # logistic regression layer + self.neg_ll = self.maxent.neg_ll + # same holds for the function computing the number of errors + self.errors = self.maxent.errors + + # the parameters of the model are the parameters of the two layer it is + # made out of + self.params = self.hidden.params + self.maxent.params + + + + +def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, + dataset='mnist.pkl.gz', batch_size=1, n_hidden=500): + """ + Demonstrate stochastic gradient descent optimization for a multilayer + perceptron + + This is demonstrated on MNIST. + + :type learning_rate: float + :param learning_rate: learning rate used (factor for the stochastic + gradient + + :type L1_reg: float + :param L1_reg: L1-norm's weight when added to the cost (see + regularization) + + :type L2_reg: float + :param L2_reg: L2-norm's weight when added to the cost (see + regularization) + + :type n_epochs: int + :param n_epochs: maximal number of epochs to run the optimizer + + :type dataset: string + :param dataset: the path of the MNIST dataset file from + http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz + """ + datasets = load_data(dataset) + + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x, test_set_y = datasets[2] + + ###################### + # BUILD ACTUAL MODEL # + ###################### + print '... 
+
+    # allocate symbolic variables for the data
+    index = T.lscalar()    # index to a [mini]batch
+    x = T.matrix('x')      # the data is presented as rasterized images
+    y = T.ivector('y')     # the labels are presented as a 1D vector of
+                           # [int] labels
+
+    rng = numpy.random.RandomState(1234)
+
+    # construct the MLP class
+    mlp = MLP(
+        rng=rng,
+        input=x,
+        n_in=28 * 28,
+        n_hidden=n_hidden,
+        n_out=10
+    )
+
+    # the cost we minimize during training is the negative log likelihood of
+    # the model plus the regularization terms (L1 and L2); cost is expressed
+    # here symbolically
+
+    # compiling a Theano function that computes the mistakes that are made
+    # by the model on a minibatch
+    test_model = theano.function(
+        inputs=[index],
+        outputs=mlp.maxent.errors(y),
+        givens={
+            x: test_set_x[index:index+1],
+            y: test_set_y[index:index+1]
+        }
+    )
+
+    validate_model = theano.function(
+        inputs=[index],
+        outputs=mlp.maxent.errors(y),
+        givens={
+            x: valid_set_x[index:index+1],
+            y: valid_set_y[index:index+1]
+        }
+    )
+
+    # compute the gradient of cost with respect to theta (stored in params);
+    # the resulting gradients will be stored in a list gparams
+    cost = mlp.neg_ll(y) + L1_reg * mlp.L1 + L2_reg * mlp.L2_sqr
+    gparams = [T.grad(cost, param) for param in mlp.params]
+
+    # specify how to update the parameters of the model as a list of
+    # (variable, update expression) pairs
+
+    updates = [(mlp.params[i], mlp.params[i] - (learning_rate * gparams[i]))
+               for i in xrange(len(gparams))]
+
+    # compiling a Theano function `train_model` that returns the cost and,
+    # at the same time, updates the parameters of the model based on the
+    # rules defined in `updates`
+    train_model = theano.function(
+        inputs=[index],
+        outputs=cost,
+        updates=updates,
+        givens={
+            x: train_set_x[index:index+1],
+            y: train_set_y[index:index+1]
+        }
+    )
+    # end-snippet-5
+
+    ###############
+    # TRAIN MODEL #
+    ###############
+    print '... training'
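+    # With batch_size=1 and the `index:index+1` slices in the `givens`
+    # above, each call to train_model/validate_model sees exactly one
+    # example, so the epoch loop below is pure online (per-example) SGD.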
+
+    start_time = time.clock()
+
+    n_examples = train_set_x.get_value(borrow=True).shape[0]
+    n_dev_examples = valid_set_x.get_value(borrow=True).shape[0]
+    n_test_examples = test_set_x.get_value(borrow=True).shape[0]
+
+    for epoch in range(1, n_epochs+1):
+        for idx in xrange(n_examples):
+            train_model(idx)
+        # compute zero-one loss on validation set
+        error = numpy.mean(map(validate_model, xrange(n_dev_examples)))
+        print('epoch %i, validation error %f %%' % (epoch, error * 100))
+
+    end_time = time.clock()
+    print >> sys.stderr, ('The code for file ' +
+                          os.path.split(__file__)[1] +
+                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
+
+
+if __name__ == '__main__':
+    test_mlp()
diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx
index 993d1a8ac..cabc4318a 100644
--- a/spacy/_ml.pyx
+++ b/spacy/_ml.pyx
@@ -61,18 +61,18 @@ cdef class Model:
             self._model.load(self.model_loc, freq_thresh=0)
 
     def predict(self, Example eg):
-        self.set_scores(eg.scores.data, eg.atoms.data)
-        eg.guess = arg_max_if_true(eg.scores.data, eg.is_valid.data,
+        self.set_scores(&eg.scores[0], &eg.atoms[0])
+        eg.guess = arg_max_if_true(&eg.scores[0], &eg.is_valid[0],
                                    self.n_classes)
 
     def train(self, Example eg):
-        self.set_scores(eg.scores.data, eg.atoms.data)
-        eg.guess = arg_max_if_true(eg.scores.data,
-                                   eg.is_valid.data, self.n_classes)
-        eg.best = arg_max_if_zero(eg.scores.data, eg.costs.data,
+        self.set_scores(&eg.scores[0], &eg.atoms[0])
+        eg.guess = arg_max_if_true(&eg.scores[0],
+                                   &eg.is_valid[0], self.n_classes)
+        eg.best = arg_max_if_zero(&eg.scores[0], &eg.costs[0],
                                   self.n_classes)
         eg.cost = eg.costs[eg.guess]
-        self.update(eg.atoms.data, eg.guess, eg.best, eg.cost)
+        self.update(&eg.atoms[0], eg.guess, eg.best, eg.cost)
 
     cdef const weight_t* score(self, atom_t* context) except NULL:
         cdef int n_feats
diff --git a/spacy/_nn.py b/spacy/_nn.py
new file mode 100644
index 000000000..48dca390c
--- /dev/null
+++ b/spacy/_nn.py
@@ -0,0 +1,3 @@
+"""Feed-forward neural network, using Theano."""
+
+
diff --git a/spacy/_nn.pyx b/spacy/_nn.pyx
new file mode 100644
index 000000000..c47be1f49
--- /dev/null
+++ b/spacy/_nn.pyx
@@ -0,0 +1,146 @@
+"""Feed-forward neural network, using Theano."""
+
+import os
+import sys
+import time
+import shutil
+
+from os import path
+
+import numpy
+
+import theano
+import theano.tensor as T
+import plac
+
+from spacy.gold import read_json_file
+from spacy.gold import GoldParse
+from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir
+
+
+def build_model(n_classes, n_vocab, n_hidden, n_word_embed, n_tag_embed):
+    # allocate symbolic variables for the data
+    words = T.ivector('words')
+    tags = T.ivector('tags')
+    y = T.iscalar('y')  # assumed: the gold class index
+
+    word_e = _init_embedding(n_words, n_word_embed)
+    tag_e = _init_embedding(n_tags, n_tag_embed)
+    label_e = _init_embedding(n_labels, n_label_embed)
+    maxent_W, maxent_b = _init_maxent_weights(n_hidden, n_classes)
+    hidden_W, hidden_b = _init_hidden_weights(28*28, n_hidden, T.tanh)
+    params = [hidden_W, hidden_b, maxent_W, maxent_b, word_e, tag_e, label_e]
+
+    x = T.concatenate([
+        T.flatten(word_e[words], outdim=1),
+        T.flatten(tag_e[tags], outdim=1)])
+
+    p_y_given_x = feed_layer(
+        T.nnet.softmax,
+        maxent_W,
+        maxent_b,
+        feed_layer(
+            T.tanh,
+            hidden_W,
+            hidden_b,
+            x))[0]
+
+    guess = T.argmax(p_y_given_x)
+
+    cost = (
+        -T.log(p_y_given_x[y])
+        + L1(L1_reg, maxent_W, hidden_W, word_e, tag_e)
+        + L2(L2_reg, maxent_W, hidden_W, word_e, tag_e)
+    )
+
+    train_model = theano.function(
+        inputs=[words, tags, y],
+        outputs=guess,
+        updates=[update(learning_rate, param, cost) for param in params]
+    )
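+    # Each `update(learning_rate, param, cost)` pair above expands to
+    # (param, param - learning_rate * T.grad(cost, param)), i.e. one plain
+    # SGD step per parameter; see the `update` helper defined below.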
+
+    evaluate_model = theano.function(
+        inputs=[words, tags, y],
+        outputs=T.neq(y, guess),
+    )
+    return train_model, evaluate_model
+
+
+def _init_embedding(vocab_size, n_dim):
+    embedding = 0.2 * numpy.random.uniform(-1.0, 1.0, (vocab_size+1, n_dim))
+    return theano.shared(embedding.astype(theano.config.floatX))
+
+
+def _init_maxent_weights(n_hidden, n_out):
+    weights = numpy.zeros((n_hidden, n_out), dtype=theano.config.floatX)
+    bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
+    return (
+        theano.shared(name='W', borrow=True, value=weights),
+        theano.shared(name='b', borrow=True, value=bias)
+    )
+
+
+def _init_hidden_weights(n_in, n_out, activation=T.tanh):
+    rng = numpy.random.RandomState(1234)
+    weights = numpy.asarray(
+        rng.uniform(
+            low=-numpy.sqrt(6. / (n_in + n_out)),
+            high=numpy.sqrt(6. / (n_in + n_out)),
+            size=(n_in, n_out)
+        ),
+        dtype=theano.config.floatX
+    )
+
+    bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
+    return (
+        theano.shared(value=weights, name='W', borrow=True),
+        theano.shared(value=bias, name='b', borrow=True)
+    )
+
+
+def feed_layer(activation, weights, bias, input):
+    return activation(T.dot(input, weights) + bias)
+
+
+def L1(L1_reg, *weights):
+    return L1_reg * sum(abs(w).sum() for w in weights)
+
+
+def L2(L2_reg, *weights):
+    return L2_reg * sum((w ** 2).sum() for w in weights)
+
+
+def update(eta, param, cost):
+    return (param, param - (eta * T.grad(cost, param)))
+
+
+def main(train_loc, eval_loc, model_dir):
+    learning_rate = 0.01
+    L1_reg = 0.00
+    L2_reg = 0.0001
+
+    print "... reading the data"
+    gold_train = list(read_json_file(train_loc))
+    print '... building the model'
+    pos_model_dir = path.join(model_dir, 'pos')
+    if path.exists(pos_model_dir):
+        shutil.rmtree(pos_model_dir)
+    os.mkdir(pos_model_dir)
+
+    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)
+
+    train_model, evaluate_model = build_model(n_hidden, len(POS_TAGS), learning_rate,
+                                              L1_reg, L2_reg)
+
+    print '... training'
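+    # Online training sketch: train_model returns the argmax tag id, so
+    # `loss` below counts zero-one tagging errors per epoch rather than
+    # the differentiable cost being minimised.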
+    for epoch in range(1, n_epochs+1):
+        loss = 0
+        for raw_text, sents in gold_train:
+            for (ids, words, tags, ner, heads, deps), _ in sents:
+                tokens = nlp.tokenizer.tokens_from_list(words)
+                for t in tokens:
+                    guess = train_model([t.orth], [t.tag])
+                    loss += guess != t.tag
+        print loss
+        # compute zero-one loss on validation set
+        #error = numpy.mean([evaluate_model(x, y) for x, y in dev_examples])
+        #print('epoch %i, validation error %f %%' % (epoch, error * 100))
+
+
+if __name__ == '__main__':
+    plac.call(main)
diff --git a/spacy/_theano.pxd b/spacy/_theano.pxd
new file mode 100644
index 000000000..cad0736c2
--- /dev/null
+++ b/spacy/_theano.pxd
@@ -0,0 +1,13 @@
+from ._ml cimport Model
+from thinc.nn cimport InputLayer
+
+
+cdef class TheanoModel(Model):
+    cdef InputLayer input_layer
+    cdef object train_func
+    cdef object predict_func
+    cdef object debug
+
+    cdef public float eta
+    cdef public float mu
+    cdef public float t
diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx
index 702208d18..08c49ce71 100644
--- a/spacy/_theano.pyx
+++ b/spacy/_theano.pyx
@@ -9,7 +9,8 @@
 
 
 cdef class TheanoModel(Model):
-    def __init__(self, n_classes, input_spec, train_func, predict_func, model_loc=None):
+    def __init__(self, n_classes, input_spec, train_func, predict_func, model_loc=None,
+                 debug=None):
         if model_loc is not None and path.isdir(model_loc):
             model_loc = path.join(model_loc, 'model')
 
@@ -20,6 +21,7 @@ cdef class TheanoModel(Model):
         self.input_layer = InputLayer(input_spec, initializer)
         self.train_func = train_func
         self.predict_func = predict_func
+        self.debug = debug
 
         self.n_classes = n_classes
         self.n_feats = len(self.input_layer)
@@ -27,18 +29,25 @@ cdef class TheanoModel(Model):
 
     def predict(self, Example eg):
         self.input_layer.fill(eg.embeddings, eg.atoms)
-        theano_scores = self.predict_func(eg.embeddings)
+        theano_scores = self.predict_func(eg.embeddings)[0]
         cdef int i
         for i in range(self.n_classes):
             eg.scores[i] = theano_scores[i]
-        eg.guess = arg_max_if_true(eg.scores.data, eg.is_valid.data,
+        eg.guess = arg_max_if_true(&eg.scores[0], eg.is_valid[0],
                                    self.n_classes)
 
     def train(self, Example eg):
-        self.predict(eg)
-        update, t, eta, mu = self.train_func(eg.embeddings, eg.scores, eg.costs)
-        self.input_layer.update(eg.atoms, update, self.t, self.eta, self.mu)
-        eg.best = arg_max_if_zero(eg.scores.data, eg.costs.data,
+        self.input_layer.fill(eg.embeddings, eg.atoms)
+        theano_scores, update, y = self.train_func(eg.embeddings, eg.costs, self.eta)
+        self.input_layer.update(update, eg.atoms, self.t, self.eta, self.mu)
+        for i in range(self.n_classes):
+            eg.scores[i] = theano_scores[i]
+        eg.guess = arg_max_if_true(&eg.scores[0], eg.is_valid[0],
+                                   self.n_classes)
+        eg.best = arg_max_if_zero(&eg.scores[0], eg.costs[0],
                                   self.n_classes)
         eg.cost = eg.costs[eg.guess]
         self.t += 1
+
+    def end_training(self):
+        pass
diff --git a/spacy/syntax/joint.pxd b/spacy/syntax/joint.pxd
new file mode 100644
index 000000000..5b7a6e3db
--- /dev/null
+++ b/spacy/syntax/joint.pxd
@@ -0,0 +1,17 @@
+from cymem.cymem cimport Pool
+
+from thinc.typedefs cimport weight_t
+
+from .stateclass cimport StateClass
+
+from .transition_system cimport TransitionSystem, Transition
+from ..gold cimport GoldParseC
+
+
+cdef class ArcEager(TransitionSystem):
+    pass
+
+
+cdef int push_cost(StateClass stcls, const GoldParseC* gold, int target) nogil
+cdef int arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) nogil
+
diff --git a/spacy/syntax/joint.pyx b/spacy/syntax/joint.pyx
new file mode 100644 
index 000000000..29e62cb4e --- /dev/null +++ b/spacy/syntax/joint.pyx @@ -0,0 +1,452 @@ +# cython: profile=True +from __future__ import unicode_literals + +import ctypes +import os + +from ..structs cimport TokenC + +from .transition_system cimport do_func_t, get_cost_func_t +from .transition_system cimport move_cost_func_t, label_cost_func_t +from ..gold cimport GoldParse +from ..gold cimport GoldParseC + +from libc.stdint cimport uint32_t +from libc.string cimport memcpy + +from cymem.cymem cimport Pool +from .stateclass cimport StateClass + + +DEF NON_MONOTONIC = True +DEF USE_BREAK = True +DEF USE_ROOT_ARC_SEGMENT = True + +cdef weight_t MIN_SCORE = -90000 + +# Break transition from here +# http://www.aclweb.org/anthology/P13-1074 +cdef enum: + SHIFT + REDUCE + LEFT + RIGHT + + BREAK + + N_MOVES + + +MOVE_NAMES = [None] * N_MOVES +MOVE_NAMES[SHIFT] = 'S' +MOVE_NAMES[REDUCE] = 'D' +MOVE_NAMES[LEFT] = 'L' +MOVE_NAMES[RIGHT] = 'R' +MOVE_NAMES[BREAK] = 'B' + + +# Helper functions for the arc-eager oracle + +cdef int push_cost(StateClass stcls, const GoldParseC* gold, int target) nogil: + cdef int cost = 0 + cdef int i, S_i + for i in range(stcls.stack_depth()): + S_i = stcls.S(i) + if gold.heads[target] == S_i: + cost += 1 + if gold.heads[S_i] == target and (NON_MONOTONIC or not stcls.has_head(S_i)): + cost += 1 + cost += Break.is_valid(stcls, -1) and Break.move_cost(stcls, gold) == 0 + return cost + + +cdef int pop_cost(StateClass stcls, const GoldParseC* gold, int target) nogil: + cdef int cost = 0 + cdef int i, B_i + for i in range(stcls.buffer_length()): + B_i = stcls.B(i) + cost += gold.heads[B_i] == target + cost += gold.heads[target] == B_i + if gold.heads[B_i] == B_i or gold.heads[B_i] < target: + break + cost += Break.is_valid(stcls, -1) and Break.move_cost(stcls, gold) == 0 + return cost + + +cdef int arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) nogil: + if arc_is_gold(gold, head, child): + return 0 + elif stcls.H(child) == gold.heads[child]: + return 1 + # Head in buffer + elif gold.heads[child] >= stcls.B(0) and stcls.B(1) != -1: + return 1 + else: + return 0 + + +cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil: + if gold.labels[child] == -1: + return True + elif USE_ROOT_ARC_SEGMENT and _is_gold_root(gold, head) and _is_gold_root(gold, child): + return True + elif gold.heads[child] == head: + return True + else: + return False + + +cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) nogil: + if gold.labels[child] == -1: + return True + elif label == -1: + return True + elif gold.labels[child] == label: + return True + else: + return False + + +cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil: + return gold.labels[word] == -1 or gold.heads[word] == word + + +cdef class Shift: + @staticmethod + cdef bint is_valid(StateClass st, int label) nogil: + return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start + + @staticmethod + cdef int transition(StateClass st, int label) nogil: + st.push() + st.fast_forward() + + @staticmethod + cdef int cost(StateClass st, const GoldParseC* gold, int label) nogil: + return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label) + + @staticmethod + cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: + return push_cost(s, gold, s.B(0)) + + @staticmethod + cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: + return 0 + + +cdef class Reduce: + @staticmethod + cdef bint 
is_valid(StateClass st, int label) nogil: + return st.stack_depth() >= 2 + + @staticmethod + cdef int transition(StateClass st, int label) nogil: + if st.has_head(st.S(0)): + st.pop() + else: + st.unshift() + st.fast_forward() + + @staticmethod + cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: + return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label) + + @staticmethod + cdef inline int move_cost(StateClass st, const GoldParseC* gold) nogil: + return pop_cost(st, gold, st.S(0)) + + @staticmethod + cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: + return 0 + + +cdef class LeftArc: + @staticmethod + cdef bint is_valid(StateClass st, int label) nogil: + return not st.B_(0).sent_start + + @staticmethod + cdef int transition(StateClass st, int label) nogil: + st.add_arc(st.B(0), st.S(0), label) + st.pop() + st.fast_forward() + + @staticmethod + cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: + return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label) + + @staticmethod + cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: + cdef int cost = 0 + if arc_is_gold(gold, s.B(0), s.S(0)): + return 0 + else: + # Account for deps we might lose between S0 and stack + if not s.has_head(s.S(0)): + for i in range(1, s.stack_depth()): + cost += gold.heads[s.S(i)] == s.S(0) + cost += gold.heads[s.S(0)] == s.S(i) + return pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0)) + + @staticmethod + cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: + return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label) + + +cdef class RightArc: + @staticmethod + cdef bint is_valid(StateClass st, int label) nogil: + return not st.B_(0).sent_start + + @staticmethod + cdef int transition(StateClass st, int label) nogil: + st.add_arc(st.S(0), st.B(0), label) + st.push() + st.fast_forward() + + @staticmethod + cdef inline int cost(StateClass s, const GoldParseC* gold, int label) nogil: + return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label) + + @staticmethod + cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: + if arc_is_gold(gold, s.S(0), s.B(0)): + return 0 + elif s.shifted[s.B(0)]: + return push_cost(s, gold, s.B(0)) + else: + return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0)) + + @staticmethod + cdef int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: + return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label) + + +cdef class Break: + @staticmethod + cdef bint is_valid(StateClass st, int label) nogil: + cdef int i + if not USE_BREAK: + return False + elif st.at_break(): + return False + elif st.B(0) == 0: + return False + elif st.stack_depth() < 1: + return False + elif (st.S(0) + 1) != st.B(0): + # Must break at the token boundary + return False + else: + return True + + @staticmethod + cdef int transition(StateClass st, int label) nogil: + st.set_break(st.B(0)) + st.fast_forward() + + @staticmethod + cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil: + return Break.move_cost(s, gold) + Break.label_cost(s, gold, label) + + @staticmethod + cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil: + cdef int cost = 0 + cdef int i, j, S_i, B_i + for i in range(s.stack_depth()): + S_i = s.S(i) + for j in range(s.buffer_length()): + B_i = s.B(j) + cost += gold.heads[S_i] == B_i + cost += 
gold.heads[B_i] == S_i
+        # Check for sentence boundary --- if it's here, we can't have any deps
+        # between stack and buffer, so rest of action is irrelevant.
+        s0_root = _get_root(s.S(0), gold)
+        b0_root = _get_root(s.B(0), gold)
+        if s0_root != b0_root or s0_root == -1 or b0_root == -1:
+            return cost
+        else:
+            return cost + 1
+
+    @staticmethod
+    cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
+        return 0
+
+
+cdef int _get_root(int word, const GoldParseC* gold) nogil:
+    while gold.heads[word] != word and gold.labels[word] != -1 and word >= 0:
+        word = gold.heads[word]
+    if gold.labels[word] == -1:
+        return -1
+    else:
+        return word
+
+
+cdef class ArcEager(TransitionSystem):
+    @classmethod
+    def get_labels(cls, gold_parses):
+        move_labels = {SHIFT: {'': True}, REDUCE: {'': True}, RIGHT: {'ROOT': True},
+                       LEFT: {'ROOT': True}, BREAK: {'ROOT': True}}
+        for raw_text, sents in gold_parses:
+            for (ids, words, tags, heads, labels, iob), ctnts in sents:
+                for child, head, label in zip(ids, heads, labels):
+                    if label.upper() == 'ROOT':
+                        label = 'ROOT'
+                    if label != 'ROOT':
+                        if head < child:
+                            move_labels[RIGHT][label] = True
+                        elif head > child:
+                            move_labels[LEFT][label] = True
+        return move_labels
+
+    cdef int preprocess_gold(self, GoldParse gold) except -1:
+        for i in range(gold.length):
+            if gold.heads[i] is None:  # Missing values
+                gold.c.heads[i] = i
+                gold.c.labels[i] = -1
+            else:
+                label = gold.labels[i]
+                if label.upper() == 'ROOT':
+                    label = 'ROOT'
+                gold.c.heads[i] = gold.heads[i]
+                gold.c.labels[i] = self.strings[label]
+        for end, brackets in gold.brackets.items():
+            for start, label_strs in brackets.items():
+                gold.c.brackets[start][end] = 1
+                for label_str in label_strs:
+                    # Add the encoded label to the set
+                    gold.brackets[end][start].add(self.strings[label_str])
+
+    cdef Transition lookup_transition(self, object name) except *:
+        if '-' in name:
+            move_str, label_str = name.split('-', 1)
+            label = self.label_ids[label_str]
+        else:
+            move_str = name
+            label = 0
+        move = MOVE_NAMES.index(move_str)
+        for i in range(self.n_moves):
+            if self.c[i].move == move and self.c[i].label == label:
+                return self.c[i]
+
+    def move_name(self, int move, int label):
+        label_str = self.strings[label]
+        if label_str:
+            return MOVE_NAMES[move] + '-' + label_str
+        else:
+            return MOVE_NAMES[move]
+
+    cdef Transition init_transition(self, int clas, int move, int label) except *:
+        # TODO: Apparent Cython bug here when we try to use the Transition()
+        # constructor with the function pointers
+        cdef Transition t
+        t.score = 0
+        t.clas = clas
+        t.move = move
+        t.label = label
+        if move == SHIFT:
+            t.is_valid = Shift.is_valid
+            t.do = Shift.transition
+            t.get_cost = Shift.cost
+        elif move == REDUCE:
+            t.is_valid = Reduce.is_valid
+            t.do = Reduce.transition
+            t.get_cost = Reduce.cost
+        elif move == LEFT:
+            t.is_valid = LeftArc.is_valid
+            t.do = LeftArc.transition
+            t.get_cost = LeftArc.cost
+        elif move == RIGHT:
+            t.is_valid = RightArc.is_valid
+            t.do = RightArc.transition
+            t.get_cost = RightArc.cost
+        elif move == BREAK:
+            t.is_valid = Break.is_valid
+            t.do = Break.transition
+            t.get_cost = Break.cost
+        else:
+            raise Exception(move)
+        return t
+
+    cdef int initialize_state(self, StateClass st) except -1:
+        # Ensure sent_start is set to 0 throughout
+        for i in range(st.length):
+            st._sent[i].sent_start = False
+            st._sent[i].l_edge = i
+            st._sent[i].r_edge = i
+        st.fast_forward()
+
+    cdef int finalize_state(self, StateClass st) except -1:
+        cdef int root_label = self.strings['ROOT']
+        for i in 
range(st.length): + if st._sent[i].head == 0 and st._sent[i].dep == 0: + st._sent[i].dep = root_label + # If we're not using the Break transition, we segment via root-labelled + # arcs between the root words. + elif USE_ROOT_ARC_SEGMENT and st._sent[i].dep == root_label: + st._sent[i].head = 0 + + cdef int set_valid(self, bint* output, StateClass stcls) except -1: + cdef bint[N_MOVES] is_valid + is_valid[SHIFT] = Shift.is_valid(stcls, -1) + is_valid[REDUCE] = Reduce.is_valid(stcls, -1) + is_valid[LEFT] = LeftArc.is_valid(stcls, -1) + is_valid[RIGHT] = RightArc.is_valid(stcls, -1) + is_valid[BREAK] = Break.is_valid(stcls, -1) + cdef int i + n_valid = 0 + for i in range(self.n_moves): + output[i] = is_valid[self.c[i].move] + n_valid += output[i] + assert n_valid >= 1 + + cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1: + cdef int i, move, label + cdef label_cost_func_t[N_MOVES] label_cost_funcs + cdef move_cost_func_t[N_MOVES] move_cost_funcs + cdef int[N_MOVES] move_costs + for i in range(N_MOVES): + move_costs[i] = -1 + move_cost_funcs[SHIFT] = Shift.move_cost + move_cost_funcs[REDUCE] = Reduce.move_cost + move_cost_funcs[LEFT] = LeftArc.move_cost + move_cost_funcs[RIGHT] = RightArc.move_cost + move_cost_funcs[BREAK] = Break.move_cost + + label_cost_funcs[SHIFT] = Shift.label_cost + label_cost_funcs[REDUCE] = Reduce.label_cost + label_cost_funcs[LEFT] = LeftArc.label_cost + label_cost_funcs[RIGHT] = RightArc.label_cost + label_cost_funcs[BREAK] = Break.label_cost + + cdef int* labels = gold.c.labels + cdef int* heads = gold.c.heads + + n_gold = 0 + for i in range(self.n_moves): + if self.c[i].is_valid(stcls, self.c[i].label): + move = self.c[i].move + label = self.c[i].label + if move_costs[move] == -1: + move_costs[move] = move_cost_funcs[move](stcls, &gold.c) + output[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label) + n_gold += output[i] == 0 + else: + output[i] = 9000 + assert n_gold >= 1 + + cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *: + cdef bint[N_MOVES] is_valid + is_valid[SHIFT] = Shift.is_valid(stcls, -1) + is_valid[REDUCE] = Reduce.is_valid(stcls, -1) + is_valid[LEFT] = LeftArc.is_valid(stcls, -1) + is_valid[RIGHT] = RightArc.is_valid(stcls, -1) + is_valid[BREAK] = Break.is_valid(stcls, -1) + cdef Transition best + cdef weight_t score = MIN_SCORE + cdef int i + for i in range(self.n_moves): + if scores[i] > score and is_valid[self.c[i].move]: + best = self.c[i] + score = scores[i] + assert best.clas < self.n_moves + assert score > MIN_SCORE, (stcls.stack_depth(), stcls.buffer_length(), stcls.is_final(), stcls._b_i, stcls.length) + return best diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 8af7ab25d..f2fa63da5 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -51,18 +51,21 @@ def get_templates(name): return pf.ner elif name == 'debug': return pf.unigrams + elif name.startswith('embed'): + return ((10, pf.words), (10, pf.tags), (10, pf.labels)) else: return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \ pf.tree_shape + pf.trigrams) cdef class Parser: - def __init__(self, StringStore strings, model_dir, transition_system): + def __init__(self, StringStore strings, model_dir, transition_system, + get_model=Model): assert os.path.exists(model_dir) and os.path.isdir(model_dir) self.cfg = Config.read(model_dir, 'config') self.moves = transition_system(strings, self.cfg.labels) templates = get_templates(self.cfg.features) - 
self.model = Model(self.moves.n_moves, templates, model_dir) + self.model = get_model(self.moves.n_moves, templates, model_dir) def __call__(self, Tokens tokens): cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) @@ -71,8 +74,8 @@ cdef class Parser: cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, self.model.n_feats) while not stcls.is_final(): eg.wipe() - fill_context(eg.atoms.data, stcls) - self.moves.set_valid(eg.is_valid.data, stcls) + fill_context(&eg.atoms[0], stcls) + self.moves.set_valid(&eg.is_valid[0], stcls) self.model.predict(eg) @@ -88,8 +91,8 @@ cdef class Parser: cdef int cost = 0 while not stcls.is_final(): eg.wipe() - fill_context(eg.atoms.data, stcls) - self.moves.set_costs(eg.is_valid.data, eg.costs.data, stcls, gold) + fill_context(&eg.atoms[0], stcls) + self.moves.set_costs(&eg.is_valid[0], &eg.costs[0], stcls, gold) self.model.train(eg) From 7b8275fcc4fa815323729ed49fac90775a23780f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 Jun 2015 04:18:01 +0200 Subject: [PATCH 14/30] * Wire hyperparameters to script interface --- bin/parser/nn_train.py | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py index 375996f4f..e0ae846b5 100755 --- a/bin/parser/nn_train.py +++ b/bin/parser/nn_train.py @@ -84,7 +84,8 @@ def _merge_sents(sents): def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0, gold_preproc=False, n_sents=0, corruption_level=0, verbose=False, - eta=0.01, mu=0.9, n_hidden=100, word_vec_len=10, pos_vec_len=10): + eta=0.01, mu=0.9, n_hidden=100, + nv_word=10, nv_tag=10, nv_label=10): dep_model_dir = path.join(model_dir, 'deps') pos_model_dir = path.join(model_dir, 'pos') ner_model_dir = path.join(model_dir, 'ner') @@ -99,8 +100,15 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', os.mkdir(ner_model_dir) setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir) - Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, - labels=Language.ParserTransitionSystem.get_labels(gold_tuples)) + Config.write(dep_model_dir, 'config', + seed=seed, + features=feat_set, + labels=Language.ParserTransitionSystem.get_labels(gold_tuples), + vector_lengths=(nv_word, nv_tag, nv_label), + hidden_nodes=n_hidden, + eta=eta, + mu=mu + ) Config.write(ner_model_dir, 'config', features='ner', seed=seed, labels=Language.EntityTransitionSystem.get_labels(gold_tuples), beam_width=0) @@ -110,16 +118,17 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', nlp = Language(data_dir=model_dir) - def make_model(n_classes, input_spec, model_dir): - print input_spec - n_in = sum(n_cols * len(fields) for (n_cols, fields) in input_spec) + def make_model(n_classes, (words, tags, labels), model_dir): + n_in = (nv_word * len(words)) + \ + (nv_tag * len(tags)) + \ + (nv_label * len(labels)) print 'Compiling' debug, train_func, predict_func = compile_theano_model(n_classes, n_hidden, n_in, 0.0, 0.0) print 'Done' return TheanoModel( n_classes, - input_spec, + ((nv_word, words), (nv_tag, tags), (nv_label, labels)), train_func, predict_func, model_loc=model_dir, @@ -226,14 +235,23 @@ def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None): n_sents=("Number of training sentences", "option", "n", int), n_iter=("Number of training iterations", "option", "i", int), verbose=("Verbose error reporting", "flag", "v", bool), - debug=("Debug mode", "flag", 
"d", bool), + + nv_word=("Word vector length", "option", "W", int), + nv_tag=("Tag vector length", "option", "T", int), + nv_label=("Label vector length", "option", "L", int), + nv_hidden=("Hidden nodes length", "option", "H", int), + eta=("Learning rate", "option", "E", float), + mu=("Momentum", "option", "M", float), ) def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, - debug=False, corruption_level=0.0, gold_preproc=False, beam_width=1, + corruption_level=0.0, gold_preproc=False, + nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10, + eta=0.1, mu=0.9, eval_only=False): gold_train = list(read_json_file(train_loc)) nlp = train(English, gold_train, model_dir, feat_set='embed', + nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, gold_preproc=gold_preproc, n_sents=n_sents, corruption_level=corruption_level, n_iter=n_iter, verbose=verbose) From e7003f1cf3e48dd83bca48542af0dca35a1410cc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 Jun 2015 04:18:47 +0200 Subject: [PATCH 15/30] * Remove hard-coding of vector lengths --- spacy/_theano.pyx | 4 ++-- spacy/syntax/parser.pyx | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx index 08c49ce71..ab6e0b089 100644 --- a/spacy/_theano.pyx +++ b/spacy/_theano.pyx @@ -28,7 +28,7 @@ cdef class TheanoModel(Model): self.model_loc = model_loc def predict(self, Example eg): - self.input_layer.fill(eg.embeddings, eg.atoms) + self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=True) theano_scores = self.predict_func(eg.embeddings)[0] cdef int i for i in range(self.n_classes): @@ -37,7 +37,7 @@ cdef class TheanoModel(Model): self.n_classes) def train(self, Example eg): - self.input_layer.fill(eg.embeddings, eg.atoms) + self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=False) theano_scores, update, y = self.train_func(eg.embeddings, eg.costs, self.eta) self.input_layer.update(update, eg.atoms, self.t, self.eta, self.mu) for i in range(self.n_classes): diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index f2fa63da5..d65bf1f33 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -52,7 +52,7 @@ def get_templates(name): elif name == 'debug': return pf.unigrams elif name.startswith('embed'): - return ((10, pf.words), (10, pf.tags), (10, pf.labels)) + return (pf.words, pf.tags, pf.labels) else: return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \ pf.tree_shape + pf.trigrams) From fe7b24eceffda3c282b01b9ff8750d70f40ef622 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 28 Jun 2015 01:29:37 +0200 Subject: [PATCH 16/30] * whitespace --- bin/parser/nn_train.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py index e0ae846b5..33ad8a8a9 100755 --- a/bin/parser/nn_train.py +++ b/bin/parser/nn_train.py @@ -84,7 +84,7 @@ def _merge_sents(sents): def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0, gold_preproc=False, n_sents=0, corruption_level=0, verbose=False, - eta=0.01, mu=0.9, n_hidden=100, + eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10): dep_model_dir = path.join(model_dir, 'deps') pos_model_dir = path.join(model_dir, 'pos') @@ -105,7 +105,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', features=feat_set, labels=Language.ParserTransitionSystem.get_labels(gold_tuples), vector_lengths=(nv_word, nv_tag, nv_label), - hidden_nodes=n_hidden, + 
hidden_nodes=nv_hidden, eta=eta, mu=mu ) @@ -123,7 +123,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', (nv_tag * len(tags)) + \ (nv_label * len(labels)) print 'Compiling' - debug, train_func, predict_func = compile_theano_model(n_classes, n_hidden, + debug, train_func, predict_func = compile_theano_model(n_classes, nv_hidden, n_in, 0.0, 0.0) print 'Done' return TheanoModel( @@ -251,7 +251,7 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbos gold_train = list(read_json_file(train_loc)) nlp = train(English, gold_train, model_dir, feat_set='embed', - nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, + nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, nv_hidden=nv_hidden, gold_preproc=gold_preproc, n_sents=n_sents, corruption_level=corruption_level, n_iter=n_iter, verbose=verbose) From f4986d5d3cd5f565d4dc613c5199d6439f343b65 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 28 Jun 2015 22:36:03 +0200 Subject: [PATCH 17/30] * Use new Example class --- spacy/_ml.pxd | 1 + spacy/_ml.pyx | 16 ++++++---------- spacy/_theano.pyx | 15 ++++++--------- spacy/syntax/parser.pyx | 29 ++++++++++++++++++----------- 4 files changed, 31 insertions(+), 30 deletions(-) diff --git a/spacy/_ml.pxd b/spacy/_ml.pxd index 3562b4a32..40281cad2 100644 --- a/spacy/_ml.pxd +++ b/spacy/_ml.pxd @@ -5,6 +5,7 @@ from cymem.cymem cimport Pool from thinc.learner cimport LinearModel from thinc.features cimport Extractor, Feature from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t +from thinc.api cimport ExampleC from preshed.maps cimport PreshMapArray diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx index cabc4318a..f84068778 100644 --- a/spacy/_ml.pyx +++ b/spacy/_ml.pyx @@ -61,18 +61,14 @@ cdef class Model: self._model.load(self.model_loc, freq_thresh=0) def predict(self, Example eg): - self.set_scores(&eg.scores[0], &eg.atoms[0]) - eg.guess = arg_max_if_true(&eg.scores[0], &eg.is_valid[0], - self.n_classes) + self.set_scores(eg.c.scores, eg.c.atoms) + eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes) def train(self, Example eg): - self.set_scores(&eg.scores[0], &eg.atoms[0]) - eg.guess = arg_max_if_true(&eg.scores[0], - &eg.is_valid[0], self.n_classes) - eg.best = arg_max_if_zero(&eg.scores[0], &eg.costs[0], - self.n_classes) - eg.cost = eg.costs[eg.guess] - self.update(&eg.atoms[0], eg.guess, eg.best, eg.cost) + self.predict(eg) + eg.c.best = arg_max_if_zero(eg.c.scores, eg.c.costs, self.n_classes) + eg.c.cost = eg.c.costs[eg.c.guess] + self.update(eg.c.atoms, eg.c.guess, eg.c.best, eg.c.cost) cdef const weight_t* score(self, atom_t* context) except NULL: cdef int n_feats diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx index ab6e0b089..69896e72a 100644 --- a/spacy/_theano.pyx +++ b/spacy/_theano.pyx @@ -1,4 +1,4 @@ -from thinc.api cimport Example +from thinc.api cimport Example, ExampleC from thinc.typedefs cimport weight_t from ._ml cimport arg_max_if_true @@ -33,20 +33,17 @@ cdef class TheanoModel(Model): cdef int i for i in range(self.n_classes): eg.scores[i] = theano_scores[i] - eg.guess = arg_max_if_true(&eg.scores[0], eg.is_valid[0], - self.n_classes) + eg.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes) def train(self, Example eg): self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=False) theano_scores, update, y = self.train_func(eg.embeddings, eg.costs, self.eta) self.input_layer.update(update, eg.atoms, self.t, self.eta, self.mu) for i in range(self.n_classes): - eg.scores[i] = 
theano_scores[i]
-        eg.guess = arg_max_if_true(&eg.scores[0], eg.is_valid[0],
-                                   self.n_classes)
-        eg.best = arg_max_if_zero(&eg.scores[0], eg.costs[0],
-                                  self.n_classes)
-        eg.cost = eg.costs[eg.guess]
+            eg.c.scores[i] = theano_scores[i]
+        eg.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes)
+        eg.best = arg_max_if_zero(eg.c.scores, eg.c.costs, self.n_classes)
+        eg.cost = eg.c.costs[eg.guess]
         self.t += 1
 
     def end_training(self):
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index cf9d71736..2ea60b149 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -71,14 +71,17 @@ cdef class Parser:
         cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
         self.moves.initialize_state(stcls)
 
-        cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, self.model.n_feats)
+        cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE,
+                                  self.model.n_feats, self.model.n_feats)
         while not stcls.is_final():
-            eg.wipe()
-            fill_context(&eg.atoms[0], stcls)
-            self.moves.set_valid(&eg.is_valid[0], stcls)
+            memset(eg.c.scores, 0, eg.c.nr_class * sizeof(weight_t))
+
+            self.moves.set_valid(eg.c.is_valid, stcls)
+            fill_context(eg.c.atoms, stcls)
+
             self.model.predict(eg)
 
-            self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label)
+            self.moves.c[eg.c.guess].do(stcls, self.moves.c[eg.c.guess].label)
 
         self.moves.finalize_state(stcls)
         tokens.set_parse(stcls._sent)
@@ -86,20 +89,24 @@ cdef class Parser:
         self.moves.preprocess_gold(gold)
         cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
         self.moves.initialize_state(stcls)
-        cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, self.model.n_feats)
+        cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE,
+                                  self.model.n_feats, self.model.n_feats)
         cdef int cost = 0
         while not stcls.is_final():
-            eg.wipe()
-            fill_context(&eg.atoms[0], stcls)
-            self.moves.set_costs(&eg.is_valid[0], &eg.costs[0], stcls, gold)
+            memset(eg.c.scores, 0, eg.c.nr_class * sizeof(weight_t))
+
+            self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
+
+            fill_context(eg.c.atoms, stcls)
 
             self.model.train(eg)
 
-            self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label)
-            cost += eg.cost
+            self.moves.c[eg.c.guess].do(stcls, self.moves.c[eg.c.guess].label)
+            cost += eg.c.cost
         return cost
+
+
     # These are passed as callbacks to thinc.search.Beam
     """
     cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:

From 5d870720bcb8fddda4153e8c27cce0b14b5791d6 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 29 Jun 2015 00:17:29 +0200
Subject: [PATCH 18/30] * Check valency in L and R feature methods, to make
 feature calculation faster

---
 spacy/syntax/stateclass.pyx | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx
index 0112a89f5..0708a00cf 100644
--- a/spacy/syntax/stateclass.pyx
+++ b/spacy/syntax/stateclass.pyx
@@ -52,6 +52,8 @@ cdef class StateClass:
         if i < 0 or i >= self.length:
             return -1
         cdef const TokenC* target = &self._sent[i]
+        if target.l_kids < idx:
+            return -1
         cdef const TokenC* ptr = self._sent
 
         while ptr < target:
@@ -75,8 +77,10 @@ cdef class StateClass:
             return -1
         if i < 0 or i >= self.length:
             return -1
-        cdef const TokenC* ptr = self._sent + (self.length - 1)
         cdef const TokenC* target = &self._sent[i]
+        if target.r_kids < idx:
+            return -1
+        cdef const TokenC* ptr = self._sent + (self.length - 1)
 
         while ptr > target:
             # If this head is still to the right of us, we can skip to it
             # No 
token that's between this token and this head could be our From 313a7f87b376ce8c7def32908ca1682d3fde6982 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 29 Jun 2015 01:06:28 +0200 Subject: [PATCH 19/30] * Inline methods in StateClass --- spacy/syntax/stateclass.pxd | 85 ++++++++++++++++++++++++++----------- spacy/syntax/stateclass.pyx | 67 ----------------------------- 2 files changed, 60 insertions(+), 92 deletions(-) diff --git a/spacy/syntax/stateclass.pxd b/spacy/syntax/stateclass.pxd index e3c36751e..905d8cdde 100644 --- a/spacy/syntax/stateclass.pxd +++ b/spacy/syntax/stateclass.pxd @@ -41,45 +41,80 @@ cdef class StateClass: if (i + self._b_i) >= self.length: return -1 return self._buffer[self._b_i + i] - - cdef int H(self, int i) nogil + + cdef inline const TokenC* S_(self, int i) nogil: + return self.safe_get(self.S(i)) + + cdef inline const TokenC* B_(self, int i) nogil: + return self.safe_get(self.B(i)) + + cdef inline const TokenC* H_(self, int i) nogil: + return self.safe_get(self.H(i)) + + cdef inline const TokenC* E_(self, int i) nogil: + return self.safe_get(self.E(i)) + + cdef inline const TokenC* L_(self, int i, int idx) nogil: + return self.safe_get(self.L(i, idx)) + + cdef inline const TokenC* R_(self, int i, int idx) nogil: + return self.safe_get(self.R(i, idx)) + + cdef inline const TokenC* safe_get(self, int i) nogil: + if i < 0 or i >= self.length: + return &self._empty_token + else: + return &self._sent[i] + + cdef inline int H(self, int i) nogil: + if i < 0 or i >= self.length: + return -1 + return self._sent[i].head + i + + cdef int E(self, int i) nogil + + cdef int R(self, int i, int idx) nogil cdef int L(self, int i, int idx) nogil - cdef int R(self, int i, int idx) nogil - cdef const TokenC* S_(self, int i) nogil - cdef const TokenC* B_(self, int i) nogil + cdef inline bint empty(self) nogil: + return self._s_i <= 0 - cdef const TokenC* H_(self, int i) nogil - cdef const TokenC* E_(self, int i) nogil + cdef inline bint eol(self) nogil: + return self.buffer_length() == 0 - cdef const TokenC* L_(self, int i, int idx) nogil - cdef const TokenC* R_(self, int i, int idx) nogil + cdef inline bint at_break(self) nogil: + return self._break != -1 - cdef const TokenC* safe_get(self, int i) nogil + cdef inline bint is_final(self) nogil: + return self.stack_depth() <= 0 and self._b_i >= self.length - cdef bint empty(self) nogil - - cdef bint entity_is_open(self) nogil + cdef inline bint has_head(self, int i) nogil: + return self.safe_get(i).head != 0 - cdef bint eol(self) nogil - - cdef bint at_break(self) nogil + cdef inline int n_L(self, int i) nogil: + return self.safe_get(i).l_kids - cdef bint is_final(self) nogil + cdef inline int n_R(self, int i) nogil: + return self.safe_get(i).r_kids - cdef bint has_head(self, int i) nogil + cdef inline bint stack_is_connected(self) nogil: + return False - cdef int n_L(self, int i) nogil + cdef inline bint entity_is_open(self) nogil: + if self._e_i < 1: + return False + return self._ents[self._e_i-1].end == -1 - cdef int n_R(self, int i) nogil + cdef inline int stack_depth(self) nogil: + return self._s_i - cdef bint stack_is_connected(self) nogil - - cdef int stack_depth(self) nogil - - cdef int buffer_length(self) nogil + cdef inline int buffer_length(self) nogil: + if self._break != -1: + return self._break - self._b_i + else: + return self.length - self._b_i cdef void push(self) nogil diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index 0708a00cf..038059f94 100644 --- 
a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -34,11 +34,6 @@ cdef class StateClass: self._buffer[i] = i self._empty_token.lex = &EMPTY_LEXEME - cdef int H(self, int i) nogil: - if i < 0 or i >= self.length: - return -1 - return self._sent[i].head + i - cdef int E(self, int i) nogil: if self._e_i <= 0 or self._e_i >= self.length: return 0 @@ -96,68 +91,6 @@ cdef class StateClass: ptr -= 1 return -1 - cdef const TokenC* S_(self, int i) nogil: - return self.safe_get(self.S(i)) - - cdef const TokenC* B_(self, int i) nogil: - return self.safe_get(self.B(i)) - - cdef const TokenC* H_(self, int i) nogil: - return self.safe_get(self.H(i)) - - cdef const TokenC* E_(self, int i) nogil: - return self.safe_get(self.E(i)) - - cdef const TokenC* L_(self, int i, int idx) nogil: - return self.safe_get(self.L(i, idx)) - - cdef const TokenC* R_(self, int i, int idx) nogil: - return self.safe_get(self.R(i, idx)) - - cdef const TokenC* safe_get(self, int i) nogil: - if i < 0 or i >= self.length: - return &self._empty_token - else: - return &self._sent[i] - - cdef bint empty(self) nogil: - return self._s_i <= 0 - - cdef bint eol(self) nogil: - return self.buffer_length() == 0 - - cdef bint at_break(self) nogil: - return self._break != -1 - - cdef bint is_final(self) nogil: - return self.stack_depth() <= 0 and self._b_i >= self.length - - cdef bint has_head(self, int i) nogil: - return self.safe_get(i).head != 0 - - cdef int n_L(self, int i) nogil: - return self.safe_get(i).l_kids - - cdef int n_R(self, int i) nogil: - return self.safe_get(i).r_kids - - cdef bint stack_is_connected(self) nogil: - return False - - cdef bint entity_is_open(self) nogil: - if self._e_i < 1: - return False - return self._ents[self._e_i-1].end == -1 - - cdef int stack_depth(self) nogil: - return self._s_i - - cdef int buffer_length(self) nogil: - if self._break != -1: - return self._break - self._b_i - else: - return self.length - self._b_i - cdef void push(self) nogil: if self.B(0) != -1: self._stack[self._s_i] = self.B(0) From 8e7ffd2cddd884494ab5496d269a3173cca841ad Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 29 Jun 2015 02:13:23 +0200 Subject: [PATCH 20/30] * Use thinc 3.1 --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index bb5531f14..b18948e50 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ cython cymem == 1.11 pathlib preshed == 0.37 -thinc == 3.0 +thinc == 3.1 murmurhash == 0.24 unidecode numpy diff --git a/setup.py b/setup.py index 48e2dfe25..f75445cdf 100644 --- a/setup.py +++ b/setup.py @@ -118,7 +118,7 @@ def run_setup(exts): ext_modules=exts, license="Dual: Commercial or AGPL", install_requires=['numpy', 'murmurhash', 'cymem >= 1.11', 'preshed == 0.37', - 'thinc == 3.0', "unidecode", 'wget', 'plac', 'six', + 'thinc == 3.1', "unidecode", 'wget', 'plac', 'six', 'ujson'], setup_requires=["headers_workaround"], ) From fc34e1b6e425420e6230a41f859dadeff0a70ae5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 29 Jun 2015 07:09:16 +0200 Subject: [PATCH 21/30] * Move Theano functions into nn_train.py script --- bin/parser/nn_train.py | 284 ++++++++++++++++++++++------------------- 1 file changed, 156 insertions(+), 128 deletions(-) diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py index 33ad8a8a9..a686df9e0 100755 --- a/bin/parser/nn_train.py +++ b/bin/parser/nn_train.py @@ -23,81 +23,163 @@ from spacy.gold import GoldParse from spacy.scorer import Scorer 
-from thinc.theano_nn import compile_theano_model - from spacy.syntax.parser import Parser from spacy._theano import TheanoModel +import theano +import theano.tensor as T -def _corrupt(c, noise_level): - if random.random() >= noise_level: - return c - elif c == ' ': - return '\n' - elif c == '\n': - return ' ' - elif c in ['.', "'", "!", "?"]: - return '' - else: - return c.lower() +from theano.printing import Print + +import numpy +from collections import OrderedDict, defaultdict -def add_noise(orig, noise_level): - if random.random() >= noise_level: - return orig - elif type(orig) == list: - corrupted = [_corrupt(word, noise_level) for word in orig] - corrupted = [w for w in corrupted if w] - return corrupted - else: - return ''.join(_corrupt(c, noise_level) for c in orig) +theano.config.floatX = 'float32' +floatX = theano.config.floatX -def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False): - if raw_text is None: - tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) - else: - tokens = nlp.tokenizer(raw_text) +def th_share(w, name=''): + return theano.shared(value=w, borrow=True, name=name) + + +class AvgParam(object): + def __init__(self, numpy_data, name='?', wrapper=th_share): + self.curr = wrapper(numpy_data, name=name+'_curr') + self.avg = self.curr + self.avg = wrapper(numpy_data.copy(), name=name+'_avg') + self.step = wrapper(numpy.zeros(numpy_data.shape, numpy_data.dtype), + name=name+'_step') + + def updates(self, cost, timestep, eta=0.001, mu=0.9): + step = (mu * self.step) - T.grad(cost, self.curr) + curr = self.curr + (eta * step) + alpha = (1 / timestep).clip(0.001, 0.9).astype(floatX) + avg = ((1 - alpha) * self.avg) + (alpha * curr) + return [(self.curr, curr), (self.step, step), (self.avg, avg)] + + +def feed_layer(activation, weights, bias, input_): + return activation(T.dot(input_, weights) + bias) + + +def L2(L2_reg, *weights): + return L2_reg * sum((w ** 2).sum() for w in weights) + + +def L1(L1_reg, *weights): + return L1_reg * sum(abs(w).sum() for w in weights) + + +def relu(x): + return x * (x > 0) + + +def _init_weights(n_in, n_out): + rng = numpy.random.RandomState(1234) + weights = numpy.asarray( + numpy.random.normal( + loc=0.0, + scale=0.0001, + size=(n_in, n_out)), + dtype=theano.config.floatX + ) + bias = 0.2 * numpy.ones((n_out,), dtype=theano.config.floatX) + return [AvgParam(weights, name='W'), AvgParam(bias, name='b')] + + +def compile_theano_model(n_classes, n_hidden, n_in, L1_reg, L2_reg): + costs = T.ivector('costs') + is_gold = T.ivector('is_gold') + x = T.vector('x') + y = T.scalar('y') + timestep = theano.shared(1) + eta = T.scalar('eta').astype(floatX) + mu = T.scalar('mu').astype(floatX) + + maxent_W, maxent_b = _init_weights(n_hidden, n_classes) + hidden_W, hidden_b = _init_weights(n_in, n_hidden) + + # Feed the inputs forward through the network + p_y_given_x = feed_layer( + T.nnet.softmax, + maxent_W.curr, + maxent_b.curr, + feed_layer( + relu, + hidden_W.curr, + hidden_b.curr, + x)) + stabilizer = 1e-8 + + cost = ( + -T.log(T.sum((p_y_given_x[0] + stabilizer) * T.eq(costs, 0))) + + L1(L1_reg, hidden_W.curr, hidden_b.curr) + + L2(L2_reg, hidden_W.curr, hidden_b.curr) + ) + + debug = theano.function( + name='debug', + inputs=[x, costs], + outputs=[p_y_given_x, T.eq(costs, 0), p_y_given_x[0] * T.eq(costs, 0)], + ) + + train_model = theano.function( + name='train_model', + inputs=[x, costs, eta, mu], + outputs=[p_y_given_x[0], T.grad(cost, x), T.argmax(p_y_given_x, axis=1), + cost], + updates=( + [(timestep, timestep + 1)] + + 
maxent_W.updates(cost, timestep, eta=eta, mu=mu) + + maxent_b.updates(cost, timestep, eta=eta, mu=mu) + + hidden_W.updates(cost, timestep, eta=eta, mu=mu) + + hidden_b.updates(cost, timestep, eta=eta, mu=mu) + ), + on_unused_input='warn' + ) + + evaluate_model = theano.function( + name='evaluate_model', + inputs=[x], + outputs=[ + feed_layer( + T.nnet.softmax, + maxent_W.curr, + maxent_b.curr, + feed_layer( + relu, + hidden_W.curr, + hidden_b.curr, + x + ) + )[0] + ] + ) + return debug, train_model, evaluate_model + + +def score_model(scorer, nlp, annot_tuples, verbose=False): + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) nlp.tagger(tokens) - nlp.entity(tokens) nlp.parser(tokens) gold = GoldParse(tokens, annot_tuples) scorer.score(tokens, gold, verbose=verbose) -def _merge_sents(sents): - m_deps = [[], [], [], [], [], []] - m_brackets = [] - i = 0 - for (ids, words, tags, heads, labels, ner), brackets in sents: - m_deps[0].extend(id_ + i for id_ in ids) - m_deps[1].extend(words) - m_deps[2].extend(tags) - m_deps[3].extend(head + i for head in heads) - m_deps[4].extend(labels) - m_deps[5].extend(ner) - m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets) - i += len(ids) - return [(m_deps, m_brackets)] - - def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', - seed=0, gold_preproc=False, n_sents=0, corruption_level=0, + seed=0, n_sents=0, verbose=False, eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10): dep_model_dir = path.join(model_dir, 'deps') pos_model_dir = path.join(model_dir, 'pos') - ner_model_dir = path.join(model_dir, 'ner') if path.exists(dep_model_dir): shutil.rmtree(dep_model_dir) if path.exists(pos_model_dir): shutil.rmtree(pos_model_dir) - if path.exists(ner_model_dir): - shutil.rmtree(ner_model_dir) os.mkdir(dep_model_dir) os.mkdir(pos_model_dir) - os.mkdir(ner_model_dir) setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir) Config.write(dep_model_dir, 'config', @@ -109,9 +191,6 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', eta=eta, mu=mu ) - Config.write(ner_model_dir, 'config', features='ner', seed=seed, - labels=Language.EntityTransitionSystem.get_labels(gold_tuples), - beam_width=0) if n_sents > 0: gold_tuples = gold_tuples[:n_sents] @@ -122,57 +201,44 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', n_in = (nv_word * len(words)) + \ (nv_tag * len(tags)) + \ (nv_label * len(labels)) - print 'Compiling' debug, train_func, predict_func = compile_theano_model(n_classes, nv_hidden, - n_in, 0.0, 0.0) - print 'Done' + n_in, 0.0, 0.0001) return TheanoModel( n_classes, ((nv_word, words), (nv_tag, tags), (nv_label, labels)), train_func, predict_func, model_loc=model_dir, + eta=eta, mu=mu, debug=debug) nlp._parser = Parser(nlp.vocab.strings, dep_model_dir, nlp.ParserTransitionSystem, make_model) print "Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %" + log_loc = path.join(model_dir, 'job.log') for itn in range(n_iter): scorer = Scorer() loss = 0 - for raw_text, sents in gold_tuples: - if gold_preproc: - raw_text = None - else: - sents = _merge_sents(sents) + for _, sents in gold_tuples: for annot_tuples, ctnt in sents: if len(annot_tuples[1]) == 1: continue - score_model(scorer, nlp, raw_text, annot_tuples, - verbose=verbose if itn >= 2 else False) - if raw_text is None: - words = add_noise(annot_tuples[1], corruption_level) - tokens = nlp.tokenizer.tokens_from_list(words) - else: - raw_text = add_noise(raw_text, 
corruption_level)
-                    tokens = nlp.tokenizer(raw_text)
+                score_model(scorer, nlp, annot_tuples)
+                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                 nlp.tagger(tokens)
                 gold = GoldParse(tokens, annot_tuples, make_projective=True)
-                if not gold.is_projective:
-                    raise Exception(
-                        "Non-projective sentence in training, after we should "
-                        "have enforced projectivity: %s" % annot_tuples
-                    )
+                assert gold.is_projective
                 loss += nlp.parser.train(tokens, gold)
-                nlp.entity.train(tokens, gold)
                 nlp.tagger.train(tokens, gold.tags)
         random.shuffle(gold_tuples)
-        print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
-                                                   scorer.tags_acc,
-                                                   scorer.token_acc)
+        logline = '%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
+                                                 scorer.tags_acc,
+                                                 scorer.token_acc)
+        print logline
+        with open(log_loc, 'a') as file_:
+            file_.write(logline + '\n')
     nlp.parser.model.end_training()
-    nlp.entity.model.end_training()
     nlp.tagger.model.end_training()
     nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
     return nlp
@@ -181,57 +247,20 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
 
 def evaluate(nlp, gold_tuples, gold_preproc=True):
     scorer = Scorer()
     for raw_text, sents in gold_tuples:
-        if gold_preproc:
-            raw_text = None
-        else:
-            sents = _merge_sents(sents)
         for annot_tuples, brackets in sents:
-            if raw_text is None:
-                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-                nlp.tagger(tokens)
-                nlp.entity(tokens)
-                nlp.parser(tokens)
-            else:
-                tokens = nlp(raw_text, merge_mwes=False)
+            tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
+            nlp.tagger(tokens)
+            nlp.parser(tokens)
             gold = GoldParse(tokens, annot_tuples)
             scorer.score(tokens, gold)
     return scorer
 
 
-def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
-    nlp = Language(data_dir=model_dir)
-    if beam_width is not None:
-        nlp.parser.cfg.beam_width = beam_width
-    gold_tuples = read_json_file(dev_loc)
-    scorer = Scorer()
-    out_file = codecs.open(out_loc, 'w', 'utf8')
-    for raw_text, sents in gold_tuples:
-        sents = _merge_sents(sents)
-        for annot_tuples, brackets in sents:
-            if raw_text is None:
-                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-                nlp.tagger(tokens)
-                nlp.entity(tokens)
-                nlp.parser(tokens)
-            else:
-                tokens = nlp(raw_text, merge_mwes=False)
-            gold = GoldParse(tokens, annot_tuples)
-            scorer.score(tokens, gold, verbose=False)
-            for t in tokens:
-                out_file.write(
-                    '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)
-                )
-    return scorer
-
-
 @plac.annotations(
     train_loc=("Location of training file or directory"),
     dev_loc=("Location of development file or directory"),
     model_dir=("Location of output model directory",),
     eval_only=("Skip training, and only evaluate", "flag", "e", bool),
-    corruption_level=("Amount of noise to add to training data", "option", "c", float),
-    gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
-    out_loc=("Out location", "option", "o", str),
     n_sents=("Number of training sentences", "option", "n", int),
     n_iter=("Number of training iterations", "option", "i", int),
     verbose=("Verbose error reporting", "flag", "v", bool),
@@ -243,21 +272,20 @@ def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
     eta=("Learning rate", "option", "E", float),
     mu=("Momentum", "option", "M", float),
 )
-def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
-         corruption_level=0.0, gold_preproc=False,
+def main(train_loc, dev_loc, model_dir, n_sents=0, 
n_iter=15, verbose=False, nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10, - eta=0.1, mu=0.9, - eval_only=False): - gold_train = list(read_json_file(train_loc)) + eta=0.1, mu=0.9, eval_only=False): + + gold_train = list(read_json_file(train_loc, lambda doc: 'wsj' in doc['id'])) + nlp = train(English, gold_train, model_dir, feat_set='embed', + eta=eta, mu=mu, nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, nv_hidden=nv_hidden, - gold_preproc=gold_preproc, n_sents=n_sents, - corruption_level=corruption_level, n_iter=n_iter, + n_sents=n_sents, n_iter=n_iter, verbose=verbose) - #if out_loc: - # write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width) - scorer = evaluate(nlp, list(read_json_file(dev_loc)), gold_preproc=gold_preproc) + + scorer = evaluate(nlp, list(read_json_file(dev_loc))) print 'TOK', 100-scorer.token_acc print 'POS', scorer.tags_acc From 894cbef8ba04222a420f312fc7f9023eed35429e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 29 Jun 2015 07:10:33 +0200 Subject: [PATCH 22/30] * Wire eta and mu parameters up for neural net --- spacy/_theano.pyx | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx index 69896e72a..1d7744322 100644 --- a/spacy/_theano.pyx +++ b/spacy/_theano.pyx @@ -10,12 +10,13 @@ from os import path cdef class TheanoModel(Model): def __init__(self, n_classes, input_spec, train_func, predict_func, model_loc=None, + eta=0.001, mu=0.9, debug=None): if model_loc is not None and path.isdir(model_loc): model_loc = path.join(model_loc, 'model') - self.eta = 0.001 - self.mu = 0.9 + self.eta = eta + self.mu = mu self.t = 1 initializer = lambda: 0.2 * numpy.random.uniform(-1.0, 1.0) self.input_layer = InputLayer(input_spec, initializer) @@ -28,22 +29,24 @@ cdef class TheanoModel(Model): self.model_loc = model_loc def predict(self, Example eg): - self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=True) + self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=False) theano_scores = self.predict_func(eg.embeddings)[0] cdef int i for i in range(self.n_classes): - eg.scores[i] = theano_scores[i] - eg.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes) + eg.c.scores[i] = theano_scores[i] + eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes) def train(self, Example eg): self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=False) - theano_scores, update, y = self.train_func(eg.embeddings, eg.costs, self.eta) + theano_scores, update, y, loss = self.train_func(eg.embeddings, eg.costs, + self.eta, self.mu) self.input_layer.update(update, eg.atoms, self.t, self.eta, self.mu) for i in range(self.n_classes): eg.c.scores[i] = theano_scores[i] - eg.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes) - eg.best = arg_max_if_zero(eg.c.scores, eg.c.costs, self.n_classes) - eg.cost = eg.c.costs[eg.guess] + eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes) + eg.c.best = arg_max_if_zero(eg.c.scores, eg.c.costs, self.n_classes) + eg.c.cost = eg.c.costs[eg.c.guess] + eg.c.loss = loss self.t += 1 def end_training(self): From ca30fe15826d634f1e1029578185de11a1b16f7d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 29 Jun 2015 10:56:02 +0200 Subject: [PATCH 23/30] * Use He initialization trick --- bin/parser/nn_train.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py index a686df9e0..398578037 100755 --- a/bin/parser/nn_train.py +++ 
b/bin/parser/nn_train.py @@ -77,14 +77,12 @@ def relu(x): def _init_weights(n_in, n_out): rng = numpy.random.RandomState(1234) + weights = numpy.asarray( - numpy.random.normal( - loc=0.0, - scale=0.0001, - size=(n_in, n_out)), + rng.standard_normal(size=(n_in, n_out)) * numpy.sqrt(2.0 / n_in), dtype=theano.config.floatX ) - bias = 0.2 * numpy.ones((n_out,), dtype=theano.config.floatX) + bias = numpy.zeros((n_out,), dtype=theano.config.floatX) return [AvgParam(weights, name='W'), AvgParam(bias, name='b')] From 1dff04acb531df3f375db40b7fd7c3300678a889 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 29 Jun 2015 11:45:38 +0200 Subject: [PATCH 24/30] * Apply regularization to the softmax, not the bias --- bin/parser/nn_train.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py index 398578037..89013fe2b 100755 --- a/bin/parser/nn_train.py +++ b/bin/parser/nn_train.py @@ -35,6 +35,7 @@ import numpy from collections import OrderedDict, defaultdict +theano.config.profile = False theano.config.floatX = 'float32' floatX = theano.config.floatX @@ -112,8 +113,7 @@ def compile_theano_model(n_classes, n_hidden, n_in, L1_reg, L2_reg): cost = ( -T.log(T.sum((p_y_given_x[0] + stabilizer) * T.eq(costs, 0))) - + L1(L1_reg, hidden_W.curr, hidden_b.curr) - + L2(L2_reg, hidden_W.curr, hidden_b.curr) + + L2(L2_reg, maxent_W.curr, hidden_W.curr) ) debug = theano.function( @@ -143,12 +143,12 @@ def compile_theano_model(n_classes, n_hidden, n_in, L1_reg, L2_reg): outputs=[ feed_layer( T.nnet.softmax, - maxent_W.curr, - maxent_b.curr, + maxent_W.avg, + maxent_b.avg, feed_layer( relu, - hidden_W.curr, - hidden_b.curr, + hidden_W.avg, + hidden_b.avg, x ) )[0] @@ -200,7 +200,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', (nv_tag * len(tags)) + \ (nv_label * len(labels)) debug, train_func, predict_func = compile_theano_model(n_classes, nv_hidden, - n_in, 0.0, 0.0001) + n_in, 0.0, 0.00) return TheanoModel( n_classes, ((nv_word, words), (nv_tag, tags), (nv_label, labels)), @@ -213,7 +213,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', nlp._parser = Parser(nlp.vocab.strings, dep_model_dir, nlp.ParserTransitionSystem, make_model) - print "Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %" + print "Itn.\tP.Loss\tUAS\tTag %\tToken %" log_loc = path.join(model_dir, 'job.log') for itn in range(n_iter): scorer = Scorer() @@ -274,6 +274,9 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, verbose=False, nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10, eta=0.1, mu=0.9, eval_only=False): + + + gold_train = list(read_json_file(train_loc, lambda doc: 'wsj' in doc['id'])) nlp = train(English, gold_train, model_dir, From df8179ca4f7aeee920a835155ddbf846506f83b8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 29 Jun 2015 16:39:16 +0200 Subject: [PATCH 25/30] * Add separate Param and AdadeltaParam classes. AdadeltaParam seems broken. 
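
For reference, the update rule the new AdadeltaParam class encodes is
Zeiler's Adadelta: scale each raw gradient by the ratio of the running
RMS of past updates to the running RMS of past gradients. In plain
numpy, one step looks like this (an illustrative sketch with made-up
names and toy values, not code from this patch):

    import numpy

    def adadelta_step(param, grad, accu, delta_accu, rho=0.9, eps=1e-6):
        # Decaying average of squared gradients (as in rmsprop).
        accu = rho * accu + (1 - rho) * grad ** 2
        # Scale the gradient by RMS(past updates) / RMS(gradients).
        update = grad * numpy.sqrt(delta_accu + eps) / numpy.sqrt(accu + eps)
        # Decaying average of squared updates, feeding the next numerator.
        delta_accu = rho * delta_accu + (1 - rho) * update ** 2
        return param - update, accu, delta_accu

    # Both accumulators start at zero, as in AdadeltaParam.__init__.
    w = numpy.zeros(10)
    accu = numpy.zeros_like(w)
    delta_accu = numpy.zeros_like(w)
    w, accu, delta_accu = adadelta_step(w, numpy.ones(10), accu, delta_accu)

Because eps sits under both square roots, the first steps for
unit-scale gradients move by only about sqrt(eps / (1 - rho)), which
may be part of why the class seems broken at these learning scales.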
--- bin/parser/nn_train.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py index 89013fe2b..f7b442faf 100755 --- a/bin/parser/nn_train.py +++ b/bin/parser/nn_train.py @@ -43,6 +43,39 @@ floatX = theano.config.floatX def th_share(w, name=''): return theano.shared(value=w, borrow=True, name=name) +class Param(object): + def __init__(self, numpy_data, name='?', wrapper=th_share): + self.curr = wrapper(numpy_data, name=name+'_curr') + self.step = wrapper(numpy.zeros(numpy_data.shape, numpy_data.dtype), + name=name+'_step') + + def updates(self, cost, timestep, eta, mu): + step = (mu * self.step) - T.grad(cost, self.curr) + curr = self.curr + (eta * step) + return [(self.curr, curr), (self.step, step)] + + +class AdadeltaParam(object): + def __init__(self, numpy_data, name='?', wrapper=th_share): + self.curr = wrapper(numpy_data, name=name+'_curr') + # accu: accumulate gradient magnitudes + self.accu = wrapper(numpy.zeros(numpy_data.shape, dtype=numpy_data.dtype)) + # delta_accu: accumulate update magnitudes (recursively!) + self.delta_accu = wrapper(numpy.zeros(numpy_data.shape, dtype=numpy_data.dtype)) + + def updates(self, cost, timestep, eps, rho): + # update accu (as in rmsprop) + grad = T.grad(cost, self.curr) + accu_new = rho * self.accu + (1 - rho) * grad ** 2 + + # compute parameter update, using the 'old' delta_accu + update = (grad * T.sqrt(self.delta_accu + eps) / + T.sqrt(accu_new + eps)) + # update delta_accu (as accu, but accumulating updates) + delta_accu_new = rho * self.delta_accu + (1 - rho) * update ** 2 + return [(self.curr, self.curr - update), (self.accu, accu_new), + (self.delta_accu, delta_accu_new)] + class AvgParam(object): def __init__(self, numpy_data, name='?', wrapper=th_share): From 5cd3ed42d4fea1c7cd539ad4b5e1ef6fabaa5b9a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 29 Jun 2015 16:44:42 +0200 Subject: [PATCH 26/30] * Reenable averaging --- spacy/_theano.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx index 1d7744322..4231266c3 100644 --- a/spacy/_theano.pyx +++ b/spacy/_theano.pyx @@ -29,7 +29,7 @@ cdef class TheanoModel(Model): self.model_loc = model_loc def predict(self, Example eg): - self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=False) + self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=True) theano_scores = self.predict_func(eg.embeddings)[0] cdef int i for i in range(self.n_classes): From 1135cfe50a25bbad91c5cb3b684fa85eea5be151 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 29 Jun 2015 16:45:14 +0200 Subject: [PATCH 27/30] * Tidy nn_train a bit --- bin/parser/nn_train.py | 63 +++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py index f7b442faf..2fc1958ed 100755 --- a/bin/parser/nn_train.py +++ b/bin/parser/nn_train.py @@ -85,7 +85,7 @@ class AvgParam(object): self.step = wrapper(numpy.zeros(numpy_data.shape, numpy_data.dtype), name=name+'_step') - def updates(self, cost, timestep, eta=0.001, mu=0.9): + def updates(self, cost, timestep, eta, mu): step = (mu * self.step) - T.grad(cost, self.curr) curr = self.curr + (eta * step) alpha = (1 / timestep).clip(0.001, 0.9).astype(floatX) @@ -110,7 +110,7 @@ def relu(x): def _init_weights(n_in, n_out): - rng = numpy.random.RandomState(1234) + rng = numpy.random.RandomState(1235) weights = numpy.asarray( 
rng.standard_normal(size=(n_in, n_out)) * numpy.sqrt(2.0 / n_in), @@ -125,6 +125,8 @@ def compile_theano_model(n_classes, n_hidden, n_in, L1_reg, L2_reg): is_gold = T.ivector('is_gold') x = T.vector('x') y = T.scalar('y') + y_cost = T.scalar('y_cost') + loss = T.scalar('cost') timestep = theano.shared(1) eta = T.scalar('eta').astype(floatX) mu = T.scalar('mu').astype(floatX) @@ -144,10 +146,9 @@ def compile_theano_model(n_classes, n_hidden, n_in, L1_reg, L2_reg): x)) stabilizer = 1e-8 - cost = ( - -T.log(T.sum((p_y_given_x[0] + stabilizer) * T.eq(costs, 0))) - + L2(L2_reg, maxent_W.curr, hidden_W.curr) - ) + y_cost = costs[T.argmax(p_y_given_x[0])] + + loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + stabilizer) debug = theano.function( name='debug', @@ -158,14 +159,14 @@ def compile_theano_model(n_classes, n_hidden, n_in, L1_reg, L2_reg): train_model = theano.function( name='train_model', inputs=[x, costs, eta, mu], - outputs=[p_y_given_x[0], T.grad(cost, x), T.argmax(p_y_given_x, axis=1), - cost], + outputs=[p_y_given_x[0], T.grad(loss, x), T.argmax(p_y_given_x, axis=1), + loss], updates=( [(timestep, timestep + 1)] + - maxent_W.updates(cost, timestep, eta=eta, mu=mu) + - maxent_b.updates(cost, timestep, eta=eta, mu=mu) + - hidden_W.updates(cost, timestep, eta=eta, mu=mu) + - hidden_b.updates(cost, timestep, eta=eta, mu=mu) + maxent_W.updates(loss, timestep, eta, mu) + + maxent_b.updates(loss, timestep, eta, mu) + + hidden_W.updates(loss, timestep, eta, mu) + + hidden_b.updates(loss, timestep, eta, mu) ), on_unused_input='warn' ) @@ -199,10 +200,24 @@ def score_model(scorer, nlp, annot_tuples, verbose=False): def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', - seed=0, n_sents=0, - verbose=False, - eta=0.01, mu=0.9, nv_hidden=100, - nv_word=10, nv_tag=10, nv_label=10): + eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10, + seed=0, n_sents=0, verbose=False): + def make_model(n_classes, (words, tags, labels), model_dir): + n_in = (nv_word * len(words)) + \ + (nv_tag * len(tags)) + \ + (nv_label * len(labels)) + debug, train_func, predict_func = compile_theano_model(n_classes, nv_hidden, + n_in, 0.0, 0.0) + return TheanoModel( + n_classes, + ((nv_word, words), (nv_tag, tags), (nv_label, labels)), + train_func, + predict_func, + model_loc=model_dir, + eta=eta, mu=mu, + debug=debug) + + dep_model_dir = path.join(model_dir, 'deps') pos_model_dir = path.join(model_dir, 'pos') if path.exists(dep_model_dir): @@ -227,22 +242,6 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', gold_tuples = gold_tuples[:n_sents] nlp = Language(data_dir=model_dir) - - def make_model(n_classes, (words, tags, labels), model_dir): - n_in = (nv_word * len(words)) + \ - (nv_tag * len(tags)) + \ - (nv_label * len(labels)) - debug, train_func, predict_func = compile_theano_model(n_classes, nv_hidden, - n_in, 0.0, 0.00) - return TheanoModel( - n_classes, - ((nv_word, words), (nv_tag, tags), (nv_label, labels)), - train_func, - predict_func, - model_loc=model_dir, - eta=eta, mu=mu, - debug=debug) - nlp._parser = Parser(nlp.vocab.strings, dep_model_dir, nlp.ParserTransitionSystem, make_model) From e20106fdff782e1c0de8e777af2f7e6df1ba63db Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 30 Jun 2015 14:26:32 +0200 Subject: [PATCH 28/30] * Begin reorganizing neuralnet work --- spacy/_ml.pxd | 4 ++-- spacy/_theano.pyx | 3 +-- spacy/syntax/parser.pxd | 6 +++--- spacy/syntax/parser.pyx | 20 +++++++++++++------- 4 files changed, 19 insertions(+), 14 
deletions(-) diff --git a/spacy/_ml.pxd b/spacy/_ml.pxd index 40281cad2..c695e48aa 100644 --- a/spacy/_ml.pxd +++ b/spacy/_ml.pxd @@ -21,8 +21,8 @@ cdef int arg_max_if_zero(const weight_t* scores, const int* costs, int n_classes cdef class Model: - cdef int n_classes - cdef int n_feats + cdef readonly int n_classes + cdef readonly int n_feats cdef const weight_t* score(self, atom_t* context) except NULL cdef int set_scores(self, weight_t* scores, atom_t* context) except -1 diff --git a/spacy/_theano.pyx b/spacy/_theano.pyx index 4231266c3..cc6886321 100644 --- a/spacy/_theano.pyx +++ b/spacy/_theano.pyx @@ -10,8 +10,7 @@ from os import path cdef class TheanoModel(Model): def __init__(self, n_classes, input_spec, train_func, predict_func, model_loc=None, - eta=0.001, mu=0.9, - debug=None): + eta=0.001, mu=0.9, debug=None): if model_loc is not None and path.isdir(model_loc): model_loc = path.join(model_loc, 'model') diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd index 11ac6bbb8..7e2b3b083 100644 --- a/spacy/syntax/parser.pxd +++ b/spacy/syntax/parser.pxd @@ -8,6 +8,6 @@ from ..tokens cimport Tokens, TokenC cdef class Parser: - cdef readonly object cfg - cdef readonly Model model - cdef readonly TransitionSystem moves + cdef public object cfg + cdef public Model model + cdef public TransitionSystem moves diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 2ea60b149..c3ed0b464 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -59,13 +59,11 @@ def get_templates(name): cdef class Parser: - def __init__(self, StringStore strings, model_dir, transition_system, - get_model=Model): + def __init__(self, StringStore strings, model_dir, transition_system): assert os.path.exists(model_dir) and os.path.isdir(model_dir) self.cfg = Config.read(model_dir, 'config') self.moves = transition_system(strings, self.cfg.labels) - templates = get_templates(self.cfg.features) - self.model = get_model(self.moves.n_moves, templates, model_dir) + self.model = Model(self.moves.n_moves, self.cfg.templates, model_dir) def __call__(self, Tokens tokens): cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) @@ -73,6 +71,8 @@ cdef class Parser: cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, self.model.n_feats, self.model.n_feats) + eg.scores[0] = 10 + assert eg.c.scores[0] == 10 while not stcls.is_final(): memset(eg.c.scores, 0, eg.c.nr_class * sizeof(weight_t)) @@ -91,7 +91,9 @@ cdef class Parser: self.moves.initialize_state(stcls) cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, self.model.n_feats, self.model.n_feats) - cdef int cost = 0 + cdef weight_t loss = 0 + words = [w.orth_ for w in tokens] + cdef Transition G while not stcls.is_final(): memset(eg.c.scores, 0, eg.c.nr_class * sizeof(weight_t)) @@ -101,9 +103,13 @@ cdef class Parser: self.model.train(eg) + G = self.moves.c[eg.c.guess] + + #if eg.c.cost != 0: + # print self.moves.move_name(G.move, G.label), stcls.print_state(words) self.moves.c[eg.c.guess].do(stcls, self.moves.c[eg.c.guess].label) - cost += eg.c.cost - return cost + loss += eg.c.loss + return loss From 31b5e58aebe34e7bc2ac1b615bb99324e0304637 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 30 Jun 2015 14:26:53 +0200 Subject: [PATCH 29/30] * Begin reorganizing neuralnet work --- bin/parser/nn_train.py | 171 ++++++++++++----------------------------- 1 file changed, 49 insertions(+), 122 deletions(-) diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py index 2fc1958ed..72c9e04f1 100755 --- 
a/bin/parser/nn_train.py
+++ b/bin/parser/nn_train.py
@@ -23,7 +23,7 @@
 from spacy.gold import GoldParse
 from spacy.scorer import Scorer
 
-from spacy.syntax.parser import Parser
+from spacy.syntax.parser import Parser, get_templates
 from spacy._theano import TheanoModel
 
 import theano
@@ -40,76 +40,37 @@ theano.config.floatX = 'float32'
 floatX = theano.config.floatX
 
 
-def th_share(w, name=''):
-    return theano.shared(value=w, borrow=True, name=name)
-
-class Param(object):
-    def __init__(self, numpy_data, name='?', wrapper=th_share):
-        self.curr = wrapper(numpy_data, name=name+'_curr')
-        self.step = wrapper(numpy.zeros(numpy_data.shape, numpy_data.dtype),
-                            name=name+'_step')
-
-    def updates(self, cost, timestep, eta, mu):
-        step = (mu * self.step) - T.grad(cost, self.curr)
-        curr = self.curr + (eta * step)
-        return [(self.curr, curr), (self.step, step)]
-
-
-class AdadeltaParam(object):
-    def __init__(self, numpy_data, name='?', wrapper=th_share):
-        self.curr = wrapper(numpy_data, name=name+'_curr')
-        # accu: accumulate gradient magnitudes
-        self.accu = wrapper(numpy.zeros(numpy_data.shape, dtype=numpy_data.dtype))
-        # delta_accu: accumulate update magnitudes (recursively!)
-        self.delta_accu = wrapper(numpy.zeros(numpy_data.shape, dtype=numpy_data.dtype))
-
-    def updates(self, cost, timestep, eps, rho):
-        # update accu (as in rmsprop)
-        grad = T.grad(cost, self.curr)
-        accu_new = rho * self.accu + (1 - rho) * grad ** 2
-
-        # compute parameter update, using the 'old' delta_accu
-        update = (grad * T.sqrt(self.delta_accu + eps) /
-                  T.sqrt(accu_new + eps))
-        # update delta_accu (as accu, but accumulating updates)
-        delta_accu_new = rho * self.delta_accu + (1 - rho) * update ** 2
-        return [(self.curr, self.curr - update), (self.accu, accu_new),
-                (self.delta_accu, delta_accu_new)]
-
-
-class AvgParam(object):
-    def __init__(self, numpy_data, name='?', wrapper=th_share):
-        self.curr = wrapper(numpy_data, name=name+'_curr')
-        self.avg = self.curr
-        self.avg = wrapper(numpy_data.copy(), name=name+'_avg')
-        self.step = wrapper(numpy.zeros(numpy_data.shape, numpy_data.dtype),
-                            name=name+'_step')
-
-    def updates(self, cost, timestep, eta, mu):
-        step = (mu * self.step) - T.grad(cost, self.curr)
-        curr = self.curr + (eta * step)
-        alpha = (1 / timestep).clip(0.001, 0.9).astype(floatX)
-        avg = ((1 - alpha) * self.avg) + (alpha * curr)
-        return [(self.curr, curr), (self.step, step), (self.avg, avg)]
-
-
-def feed_layer(activation, weights, bias, input_):
-    return activation(T.dot(input_, weights) + bias)
+def L1(L1_reg, *weights):
+    return L1_reg * sum(abs(w).sum() for w in weights)
 
 
 def L2(L2_reg, *weights):
     return L2_reg * sum((w ** 2).sum() for w in weights)
 
 
-def L1(L1_reg, *weights):
-    return L1_reg * sum(abs(w).sum() for w in weights)
+def rms_prop(loss, params, eta=1.0, rho=0.9, eps=1e-6):
+    updates = OrderedDict()
+    for param in params:
+        value = param.get_value(borrow=True)
+        accu = theano.shared(numpy.zeros(value.shape, dtype=value.dtype),
+                             broadcastable=param.broadcastable)
+
+        grad = T.grad(loss, param)
+        accu_new = rho * accu + (1 - rho) * grad ** 2
+        updates[accu] = accu_new
+        updates[param] = param - (eta * grad / T.sqrt(accu_new + eps))
+    return updates
 
 
 def relu(x):
     return x * (x > 0)
 
 
-def _init_weights(n_in, n_out):
+def feed_layer(activation, weights, bias, input_):
+    return activation(T.dot(input_, weights) + bias)
+
+
+def init_weights(n_in, n_out):
     rng = numpy.random.RandomState(1235)
 
     weights = numpy.asarray(
@@ -117,57 +78,35 @@ def _init_weights(n_in, n_out):
         dtype=theano.config.floatX
    
) bias = numpy.zeros((n_out,), dtype=theano.config.floatX) - return [AvgParam(weights, name='W'), AvgParam(bias, name='b')] + return [wrapper(weights, name='W'), wrapper(bias, name='b')] -def compile_theano_model(n_classes, n_hidden, n_in, L1_reg, L2_reg): - costs = T.ivector('costs') - is_gold = T.ivector('is_gold') +def compile_model(n_classes, n_hidden, n_in, optimizer): x = T.vector('x') - y = T.scalar('y') - y_cost = T.scalar('y_cost') - loss = T.scalar('cost') - timestep = theano.shared(1) - eta = T.scalar('eta').astype(floatX) - mu = T.scalar('mu').astype(floatX) + costs = T.ivector('costs') + loss = T.scalar('loss') - maxent_W, maxent_b = _init_weights(n_hidden, n_classes) - hidden_W, hidden_b = _init_weights(n_in, n_hidden) + maxent_W, maxent_b = init_weights(n_hidden, n_classes) + hidden_W, hidden_b = init_weights(n_in, n_hidden) # Feed the inputs forward through the network p_y_given_x = feed_layer( T.nnet.softmax, - maxent_W.curr, - maxent_b.curr, + maxent_W, + maxent_b, feed_layer( relu, - hidden_W.curr, - hidden_b.curr, + hidden_W, + hidden_b, x)) - stabilizer = 1e-8 - y_cost = costs[T.argmax(p_y_given_x[0])] - - loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + stabilizer) - - debug = theano.function( - name='debug', - inputs=[x, costs], - outputs=[p_y_given_x, T.eq(costs, 0), p_y_given_x[0] * T.eq(costs, 0)], - ) + loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + 1e-8) train_model = theano.function( name='train_model', - inputs=[x, costs, eta, mu], - outputs=[p_y_given_x[0], T.grad(loss, x), T.argmax(p_y_given_x, axis=1), - loss], - updates=( - [(timestep, timestep + 1)] + - maxent_W.updates(loss, timestep, eta, mu) + - maxent_b.updates(loss, timestep, eta, mu) + - hidden_W.updates(loss, timestep, eta, mu) + - hidden_b.updates(loss, timestep, eta, mu) - ), + inputs=[x, costs], + outputs=[p_y_given_x[0], T.grad(loss, x), loss], + updates=optimizer(loss, [maxent_W, maxent_b, hidden_W, hidden_b]), on_unused_input='warn' ) @@ -177,18 +116,18 @@ def compile_theano_model(n_classes, n_hidden, n_in, L1_reg, L2_reg): outputs=[ feed_layer( T.nnet.softmax, - maxent_W.avg, - maxent_b.avg, + maxent_W, + maxent_b, feed_layer( relu, - hidden_W.avg, - hidden_b.avg, + hidden_W, + hidden_b, x ) )[0] ] ) - return debug, train_model, evaluate_model + return train_model, evaluate_model def score_model(scorer, nlp, annot_tuples, verbose=False): @@ -202,21 +141,6 @@ def score_model(scorer, nlp, annot_tuples, verbose=False): def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10, seed=0, n_sents=0, verbose=False): - def make_model(n_classes, (words, tags, labels), model_dir): - n_in = (nv_word * len(words)) + \ - (nv_tag * len(tags)) + \ - (nv_label * len(labels)) - debug, train_func, predict_func = compile_theano_model(n_classes, nv_hidden, - n_in, 0.0, 0.0) - return TheanoModel( - n_classes, - ((nv_word, words), (nv_tag, tags), (nv_label, labels)), - train_func, - predict_func, - model_loc=model_dir, - eta=eta, mu=mu, - debug=debug) - dep_model_dir = path.join(model_dir, 'deps') pos_model_dir = path.join(model_dir, 'pos') @@ -230,21 +154,24 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', Config.write(dep_model_dir, 'config', seed=seed, - features=feat_set, + templates=tuple(), labels=Language.ParserTransitionSystem.get_labels(gold_tuples), vector_lengths=(nv_word, nv_tag, nv_label), hidden_nodes=nv_hidden, eta=eta, mu=mu ) - + + # Bake-in hyper-parameters + optimizer = 
lambda loss, params: rms_prop(loss, params, eta=eta, rho=rho, eps=eps) + nlp = Language(data_dir=model_dir) + n_classes = nlp.parser.model.n_classes + train, predict = compile_model(n_classes, nv_hidden, n_in, optimizer) + nlp.parser.model = TheanoModel(n_classes, input_spec, train, + predict, model_loc) + if n_sents > 0: gold_tuples = gold_tuples[:n_sents] - - nlp = Language(data_dir=model_dir) - nlp._parser = Parser(nlp.vocab.strings, dep_model_dir, nlp.ParserTransitionSystem, - make_model) - print "Itn.\tP.Loss\tUAS\tTag %\tToken %" log_loc = path.join(model_dir, 'job.log') for itn in range(n_iter): From 341cd0c99f379d2002295c529f6da375be6fd5da Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 30 Jun 2015 14:27:11 +0200 Subject: [PATCH 30/30] * Require thinc==3.2 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b18948e50..0721a1b00 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ cython cymem == 1.11 pathlib preshed == 0.37 -thinc == 3.1 +thinc == 3.2 murmurhash == 0.24 unidecode numpy
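
For reference, the rms_prop function that patch 29 moves the training
script onto implements the standard RMSProp recurrence: keep a decaying
average of squared gradients per parameter, and divide each step by its
square root. The script is still mid-refactor at this point in the
series (init_weights, for instance, still calls a wrapper that the same
patch deletes), so here is a self-contained plain-numpy sketch of the
rule (the function name and toy values are illustrative, not code from
the patches):

    import numpy

    def rms_prop_step(param, grad, accu, eta=1.0, rho=0.9, eps=1e-6):
        # Decaying average of squared gradients, one accumulator per weight.
        accu = rho * accu + (1 - rho) * grad ** 2
        # Divide the step by the root of the accumulated magnitude.
        return param - eta * grad / numpy.sqrt(accu + eps), accu

    w = numpy.zeros(5)
    accu = numpy.zeros_like(w)
    for _ in range(3):
        grad = 0.1 * numpy.ones(5)
        w, accu = rms_prop_step(w, grad, accu)

The defaults eta=1.0, rho=0.9, eps=1e-6 mirror the Theano version's
signature; unlike the earlier momentum updates, the effective step size
here adapts per weight.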