Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-11 17:56:30 +03:00)

Commit 041908a272: Merge neuralnet branch into sense-tagger

261  bin/parser/nn_train.py  (new executable file)
@@ -0,0 +1,261 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals

import os
from os import path
import shutil
import codecs
import random

import plac
import cProfile
import pstats
import re

import spacy.util
from spacy.en import English
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir

from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse

from spacy.scorer import Scorer

from spacy.syntax.parser import Parser, get_templates
from spacy._theano import TheanoModel

import theano
import theano.tensor as T

from theano.printing import Print

import numpy
from collections import OrderedDict, defaultdict


theano.config.profile = False
theano.config.floatX = 'float32'
floatX = theano.config.floatX


def L1(L1_reg, *weights):
    return L1_reg * sum(abs(w).sum() for w in weights)


def L2(L2_reg, *weights):
    return L2_reg * sum((w ** 2).sum() for w in weights)


def rms_prop(loss, params, eta=1.0, rho=0.9, eps=1e-6):
    updates = OrderedDict()
    for param in params:
        value = param.get_value(borrow=True)
        accu = theano.shared(numpy.zeros(value.shape, dtype=value.dtype),
                             broadcastable=param.broadcastable)

        grad = T.grad(loss, param)
        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates[accu] = accu_new
        updates[param] = param - (eta * grad / T.sqrt(accu_new + eps))
    return updates
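The update that rms_prop builds symbolically is ordinary RMSProp: keep a running average of squared gradients, then scale each step by its square root. A minimal NumPy-only sketch of one such step, as a reading aid (the values are made up for illustration):

import numpy

rho, eta, eps = 0.9, 1.0, 1e-6
param = numpy.array([0.5, -0.3])
grad = numpy.array([0.2, 0.1])
accu = numpy.zeros_like(param)

# running average of squared gradients
accu = rho * accu + (1 - rho) * grad ** 2
# per-parameter scaled gradient step, mirroring updates[param] above
param = param - eta * grad / numpy.sqrt(accu + eps)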
def relu(x):
    return x * (x > 0)


def feed_layer(activation, weights, bias, input_):
    return activation(T.dot(input_, weights) + bias)


def init_weights(n_in, n_out):
    rng = numpy.random.RandomState(1235)

    weights = numpy.asarray(
        rng.standard_normal(size=(n_in, n_out)) * numpy.sqrt(2.0 / n_in),
        dtype=theano.config.floatX
    )
    bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
    return [theano.shared(weights, name='W'), theano.shared(bias, name='b')]


def compile_model(n_classes, n_hidden, n_in, optimizer):
    x = T.vector('x')
    costs = T.ivector('costs')
    loss = T.scalar('loss')

    maxent_W, maxent_b = init_weights(n_hidden, n_classes)
    hidden_W, hidden_b = init_weights(n_in, n_hidden)

    # Feed the inputs forward through the network
    p_y_given_x = feed_layer(
        T.nnet.softmax,
        maxent_W,
        maxent_b,
        feed_layer(
            relu,
            hidden_W,
            hidden_b,
            x))

    loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + 1e-8)

    train_model = theano.function(
        name='train_model',
        inputs=[x, costs],
        outputs=[p_y_given_x[0], T.grad(loss, x), loss],
        updates=optimizer(loss, [maxent_W, maxent_b, hidden_W, hidden_b]),
        on_unused_input='warn'
    )

    evaluate_model = theano.function(
        name='evaluate_model',
        inputs=[x],
        outputs=[
            feed_layer(
                T.nnet.softmax,
                maxent_W,
                maxent_b,
                feed_layer(
                    relu,
                    hidden_W,
                    hidden_b,
                    x
                )
            )[0]
        ]
    )
    return train_model, evaluate_model
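As a reading aid (not part of the commit): the network compile_model assembles is a single hidden layer, softmax(relu(x.W1 + b1).W2 + b2). A NumPy sketch of the same forward pass, with made-up sizes:

import numpy

n_in, n_hidden, n_classes = 4, 8, 3
rng = numpy.random.RandomState(0)
W1, b1 = rng.randn(n_in, n_hidden), numpy.zeros(n_hidden)
W2, b2 = rng.randn(n_hidden, n_classes), numpy.zeros(n_classes)

x = rng.randn(n_in)
hidden = numpy.maximum(0, x.dot(W1) + b1)    # relu(x.W1 + b1)
scores = hidden.dot(W2) + b2
p_y_given_x = numpy.exp(scores - scores.max())
p_y_given_x /= p_y_given_x.sum()             # softmax over class scores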
def score_model(scorer, nlp, annot_tuples, verbose=False):
    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
    nlp.tagger(tokens)
    nlp.parser(tokens)
    gold = GoldParse(tokens, annot_tuples)
    scorer.score(tokens, gold, verbose=verbose)


def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10,
          seed=0, n_sents=0, verbose=False):

    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)
    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)

    Config.write(dep_model_dir, 'config',
                 seed=seed,
                 templates=tuple(),
                 labels=Language.ParserTransitionSystem.get_labels(gold_tuples),
                 vector_lengths=(nv_word, nv_tag, nv_label),
                 hidden_nodes=nv_hidden,
                 eta=eta,
                 mu=mu)

    # Bake-in hyper-parameters
    # NB: rho, eps, n_in, input_spec and model_loc are not defined anywhere
    # in this draft of the script.
    optimizer = lambda loss, params: rms_prop(loss, params, eta=eta, rho=rho, eps=eps)
    nlp = Language(data_dir=model_dir)
    n_classes = nlp.parser.model.n_classes
    train, predict = compile_model(n_classes, nv_hidden, n_in, optimizer)
    nlp.parser.model = TheanoModel(n_classes, input_spec, train,
                                   predict, model_loc)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]
    print "Itn.\tP.Loss\tUAS\tTag %\tToken %"
    log_loc = path.join(model_dir, 'job.log')
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for _, sents in gold_tuples:
            for annot_tuples, ctnt in sents:
                if len(annot_tuples[1]) == 1:
                    continue
                score_model(scorer, nlp, annot_tuples)
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                assert gold.is_projective
                loss += nlp.parser.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        logline = '%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
                                                 scorer.tags_acc,
                                                 scorer.token_acc)
        print logline
        with open(log_loc, 'a') as file_:
            file_.write(logline + '\n')
    nlp.parser.model.end_training()
    nlp.tagger.model.end_training()
    nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
    return nlp
def evaluate(nlp, gold_tuples, gold_preproc=True):
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        for annot_tuples, brackets in sents:
            tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
            nlp.tagger(tokens)
            nlp.parser(tokens)
            gold = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, gold)
    return scorer


@plac.annotations(
    train_loc=("Location of training file or directory"),
    dev_loc=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    n_sents=("Number of training sentences", "option", "n", int),
    n_iter=("Number of training iterations", "option", "i", int),
    verbose=("Verbose error reporting", "flag", "v", bool),
    nv_word=("Word vector length", "option", "W", int),
    nv_tag=("Tag vector length", "option", "T", int),
    nv_label=("Label vector length", "option", "L", int),
    nv_hidden=("Hidden nodes length", "option", "H", int),
    eta=("Learning rate", "option", "E", float),
    mu=("Momentum", "option", "M", float),
)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, verbose=False,
         nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10,
         eta=0.1, mu=0.9, eval_only=False):

    gold_train = list(read_json_file(train_loc, lambda doc: 'wsj' in doc['id']))

    nlp = train(English, gold_train, model_dir,
                feat_set='embed',
                eta=eta, mu=mu,
                nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, nv_hidden=nv_hidden,
                n_sents=n_sents, n_iter=n_iter,
                verbose=verbose)

    scorer = evaluate(nlp, list(read_json_file(dev_loc)))

    print 'TOK', 100 - scorer.token_acc
    print 'POS', scorer.tags_acc
    print 'UAS', scorer.uas
    print 'LAS', scorer.las

    print 'NER P', scorer.ents_p
    print 'NER R', scorer.ents_r
    print 'NER F', scorer.ents_f


if __name__ == '__main__':
    plac.call(main)
3  setup.py

@@ -151,7 +151,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
              'spacy.lexeme', 'spacy.vocab', 'spacy.tokens', 'spacy.spans',
              'spacy.morphology',
              'spacy.syntax.stateclass',
-             'spacy._ml', 'spacy.tokenizer', 'spacy.en.attrs',
+             'spacy._ml', 'spacy._theano',
+             'spacy.tokenizer', 'spacy.en.attrs',
              'spacy.en.pos', 'spacy.syntax.parser',
              'spacy.syntax.transition_system',
              'spacy.syntax.arc_eager',
490  spacy/_bu_nn.pyx  (new file)

@@ -0,0 +1,490 @@
"""Feed-forward neural network, using Theano."""

import os
import sys
import time

import numpy

import theano
import theano.tensor as T
import gzip
import cPickle


def load_data(dataset):
    ''' Loads the dataset

    :type dataset: string
    :param dataset: the path to the dataset (here MNIST)
    '''

    #############
    # LOAD DATA #
    #############

    # Download the MNIST dataset if it is not present
    data_dir, data_file = os.path.split(dataset)
    if data_dir == "" and not os.path.isfile(dataset):
        # Check if dataset is in the data directory.
        new_path = os.path.join(
            os.path.split(__file__)[0],
            "..",
            "data",
            dataset
        )
        if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
            dataset = new_path

    if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':
        import urllib
        origin = (
            'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
        )
        print 'Downloading data from %s' % origin
        urllib.urlretrieve(origin, dataset)

    print '... loading data'

    # Load the dataset
    f = gzip.open(dataset, 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()
    # train_set, valid_set, test_set format: tuple(input, target).
    # input is a numpy.ndarray of 2 dimensions (a matrix), with one row
    # per example. target is a numpy.ndarray of 1 dimension (a vector)
    # that has the same length as the number of rows in the input. It
    # gives the target for the example with the same index in the input.

    def shared_dataset(data_xy, borrow=True):
        """ Function that loads the dataset into shared variables

        The reason we store our dataset in shared variables is to allow
        Theano to copy it into the GPU memory (when code is run on GPU).
        Since copying data into the GPU is slow, copying a minibatch every
        time one is needed (the default behaviour if the data is not in a
        shared variable) would lead to a large decrease in performance.
        """
        data_x, data_y = data_xy
        shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX),
                                 borrow=borrow)
        shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX),
                                 borrow=borrow)
        # When storing data on the GPU it has to be stored as floats,
        # therefore we will store the labels as ``floatX`` as well
        # (``shared_y`` does exactly that). But during our computations
        # we need them as ints (we use labels as indices, and if they are
        # floats it doesn't make sense), therefore instead of returning
        # ``shared_y`` we will have to cast it to int. This little hack
        # lets us get around this issue.
        return shared_x, T.cast(shared_y, 'int32')

    test_set_x, test_set_y = shared_dataset(test_set)
    valid_set_x, valid_set_y = shared_dataset(valid_set)
    train_set_x, train_set_y = shared_dataset(train_set)

    rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y),
            (test_set_x, test_set_y)]
    return rval
class LogisticRegression(object):
    """Multi-class Logistic Regression Class

    The logistic regression is fully described by a weight matrix :math:`W`
    and bias vector :math:`b`. Classification is done by projecting data
    points onto a set of hyperplanes, the distance to which is used to
    determine a class membership probability.
    """

    def __init__(self, input, n_in, n_out):
        """ Initialize the parameters of the logistic regression

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie

        """
        # start-snippet-1
        # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
        self.W = theano.shared(
            value=numpy.zeros(
                (n_in, n_out),
                dtype=theano.config.floatX
            ),
            name='W',
            borrow=True
        )
        # initialize the biases b as a vector of n_out 0s
        self.b = theano.shared(
            value=numpy.zeros(
                (n_out,),
                dtype=theano.config.floatX
            ),
            name='b',
            borrow=True
        )

        # symbolic expression for computing the matrix of class-membership
        # probabilities
        # Where:
        # W is a matrix where column-k represents the separating hyperplane
        # for class-k
        # x is a matrix where row-j represents input training sample-j
        # b is a vector where element-k represents the free parameter of
        # hyperplane-k
        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)

        # symbolic description of how to compute prediction as class whose
        # probability is maximal
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
        # end-snippet-1

        # parameters of the model
        self.params = [self.W, self.b]

    def neg_ll(self, y):
        """Return the mean of the negative log-likelihood of the prediction
        of this model under a given target distribution.

        .. math::

            \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
            \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
                \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
            \ell (\theta=\{W,b\}, \mathcal{D})

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label

        Note: we use the mean instead of the sum so that
        the learning rate is less dependent on the batch size
        """
        # start-snippet-2
        # y.shape[0] is (symbolically) the number of rows in y, i.e.,
        # number of examples (call it n) in the minibatch
        # T.arange(y.shape[0]) is a symbolic vector which will contain
        # [0,1,2,... n-1] T.log(self.p_y_given_x) is a matrix of
        # Log-Probabilities (call it LP) with one row per example and
        # one column per class LP[T.arange(y.shape[0]),y] is a vector
        # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ...,
        # LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is
        # the mean (across minibatch examples) of the elements in v,
        # i.e., the mean log-likelihood across the minibatch.
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
        # end-snippet-2

    def errors(self, y):
        """Return a float representing the number of errors in the minibatch
        over the total number of examples of the minibatch; zero-one
        loss over the size of the minibatch

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label
        """

        # check if y has same dimension of y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError(
                'y should have the same shape as self.y_pred',
                ('y', y.type, 'y_pred', self.y_pred.type)
            )
        # check if y is of the correct datatype
        if y.dtype.startswith('int'):
            # the T.neq operator returns a vector of 0s and 1s, where 1
            # represents a mistake in prediction
            return T.mean(T.neq(self.y_pred, y))
        else:
            raise NotImplementedError()
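The fancy-indexing trick the neg_ll comments describe can be checked in plain NumPy (illustrative numbers only, not from the commit):

import numpy

LP = numpy.log(numpy.array([[0.7, 0.2, 0.1],
                            [0.1, 0.8, 0.1]]))   # log-probs, one row per example
y = numpy.array([0, 1])                          # correct label per example
v = LP[numpy.arange(y.shape[0]), y]              # [LP[0, y[0]], LP[1, y[1]]]
nll = -v.mean()                                  # mean negative log-likelihood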
# start-snippet-1
class HiddenLayer(object):
    def __init__(self, rng, input, n_in, n_out, W=None, b=None,
                 activation=T.tanh):
        """
        Typical hidden layer of an MLP: units are fully-connected and have
        a sigmoidal activation function. Weight matrix W is of shape
        (n_in, n_out) and the bias vector b is of shape (n_out,).

        NOTE : The nonlinearity used here is tanh

        Hidden unit activation is given by: tanh(dot(input,W) + b)

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.dmatrix
        :param input: a symbolic tensor of shape (n_examples, n_in)

        :type n_in: int
        :param n_in: dimensionality of input

        :type n_out: int
        :param n_out: number of hidden units

        :type activation: theano.Op or function
        :param activation: Non linearity to be applied in the hidden
                           layer
        """
        self.input = input
        # end-snippet-1

        # `W` is initialized with `W_values` which is uniformly sampled
        # from sqrt(-6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden))
        # for the tanh activation function
        # the output of uniform is converted using asarray to dtype
        # theano.config.floatX so that the code is runnable on GPU
        # Note : optimal initialization of weights is dependent on the
        #        activation function used (among other things).
        #        For example, results presented in [Xavier10] suggest that
        #        you should use 4 times larger initial weights for sigmoid
        #        compared to tanh
        #        We have no info for other functions, so we use the same
        #        as tanh.
        if W is None:
            W_values = numpy.asarray(
                rng.uniform(
                    low=-numpy.sqrt(6. / (n_in + n_out)),
                    high=numpy.sqrt(6. / (n_in + n_out)),
                    size=(n_in, n_out)
                ),
                dtype=theano.config.floatX
            )
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4

            W = theano.shared(value=W_values, name='W', borrow=True)

        if b is None:
            b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, name='b', borrow=True)

        self.W = W
        self.b = b

        lin_output = T.dot(input, self.W) + self.b
        self.output = (
            lin_output if activation is None
            else activation(lin_output)
        )
        # parameters of the model
        self.params = [self.W, self.b]


# start-snippet-2
class MLP(object):
    """Multi-Layer Perceptron Class

    A multilayer perceptron is a feedforward artificial neural network model
    that has one layer or more of hidden units and nonlinear activations.
    Intermediate layers usually have as activation function tanh or the
    sigmoid function (defined here by a ``HiddenLayer`` class) while the
    top layer is a softmax layer (defined here by a ``LogisticRegression``
    class).
    """

    def __init__(self, rng, input, n_in, n_hidden, n_out):
        """Initialize the parameters for the multilayer perceptron

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_hidden: int
        :param n_hidden: number of hidden units

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie

        """

        # Since we are dealing with a one hidden layer MLP, this will
        # translate into a HiddenLayer with a tanh activation function
        # connected to the LogisticRegression layer; the activation function
        # can be replaced by sigmoid or any other nonlinear function
        self.hidden = HiddenLayer(
            rng=rng,
            input=input,
            n_in=n_in,
            n_out=n_hidden,
            activation=T.tanh
        )

        # The logistic regression layer gets as input the hidden units
        # of the hidden layer
        self.maxent = LogisticRegression(
            input=self.hidden.output,
            n_in=n_hidden,
            n_out=n_out
        )
        # L1 norm ; one regularization option is to enforce L1 norm to
        # be small
        self.L1 = abs(self.hidden.W).sum() + abs(self.maxent.W).sum()

        # square of L2 norm ; one regularization option is to enforce
        # square of L2 norm to be small
        self.L2_sqr = (self.hidden.W ** 2).sum() + (self.maxent.W ** 2).sum()

        # negative log likelihood of the MLP is given by the negative
        # log likelihood of the output of the model, computed in the
        # logistic regression layer
        self.neg_ll = self.maxent.neg_ll
        # same holds for the function computing the number of errors
        self.errors = self.maxent.errors

        # the parameters of the model are the parameters of the two layers
        # it is made of
        self.params = self.hidden.params + self.maxent.params
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='mnist.pkl.gz', batch_size=1, n_hidden=500):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
                   regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
                   regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                    http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as a 1D vector of
                         # [int] labels

    rng = numpy.random.RandomState(1234)

    # construct the MLP class
    mlp = MLP(
        rng=rng,
        input=x,
        n_in=28 * 28,
        n_hidden=n_hidden,
        n_out=10
    )

    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=mlp.maxent.errors(y),
        givens={
            x: test_set_x[index:index+1],
            y: test_set_y[index:index+1]
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=mlp.maxent.errors(y),
        givens={
            x: valid_set_x[index:index+1],
            y: valid_set_y[index:index+1]
        }
    )

    # compute the gradient of cost with respect to theta (stored in params)
    # the resulting gradients will be stored in a list gparams
    cost = mlp.neg_ll(y) + L1_reg * mlp.L1 + L2_reg * mlp.L2_sqr
    gparams = [T.grad(cost, param) for param in mlp.params]

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs
    updates = [(mlp.params[i], mlp.params[i] - (learning_rate * gparams[i]))
               for i in xrange(len(gparams))]

    # compiling a Theano function `train_model` that returns the cost and
    # at the same time updates the parameters of the model based on the
    # rules defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index:index+1],
            y: train_set_y[index:index+1]
        }
    )
    # end-snippet-5

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    start_time = time.clock()

    n_examples = train_set_x.get_value(borrow=True).shape[0]
    n_dev_examples = valid_set_x.get_value(borrow=True).shape[0]
    n_test_examples = test_set_x.get_value(borrow=True).shape[0]

    for epoch in range(1, n_epochs+1):
        for idx in xrange(n_examples):
            train_model(idx)
        # compute zero-one loss on validation set
        error = numpy.mean(map(validate_model, xrange(n_dev_examples)))
        print('epoch %i, validation error %f %%' % (epoch, error * 100))

    end_time = time.clock()
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))


if __name__ == '__main__':
    test_mlp()
@@ -5,6 +5,7 @@ from cymem.cymem cimport Pool
 from thinc.learner cimport LinearModel
 from thinc.features cimport Extractor, Feature
 from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
+from thinc.api cimport ExampleC

 from preshed.maps cimport PreshMapArray


@@ -14,9 +15,14 @@ from .tokens cimport Tokens

 cdef int arg_max(const weight_t* scores, const int n_classes) nogil

+cdef int arg_max_if_true(const weight_t* scores, const int* is_valid, int n_classes) nogil
+
+cdef int arg_max_if_zero(const weight_t* scores, const int* costs, int n_classes) nogil


 cdef class Model:
-    cdef int n_classes
+    cdef readonly int n_classes
+    cdef readonly int n_feats

     cdef const weight_t* score(self, atom_t* context) except NULL
     cdef int set_scores(self, weight_t* scores, atom_t* context) except -1
@@ -10,6 +10,7 @@ import cython
 import numpy.random

 from thinc.features cimport Feature, count_feats
+from thinc.api cimport Example


 cdef int arg_max(const weight_t* scores, const int n_classes) nogil:

@@ -23,17 +24,52 @@ cdef int arg_max(const weight_t* scores, const int n_classes) nogil:
     return best


+cdef int arg_max_if_true(const weight_t* scores, const int* is_valid,
+                         const int n_classes) nogil:
+    cdef int i
+    cdef int best = 0
+    cdef weight_t mode = -900000
+    for i in range(n_classes):
+        if is_valid[i] and scores[i] > mode:
+            mode = scores[i]
+            best = i
+    return best
+
+
+cdef int arg_max_if_zero(const weight_t* scores, const int* costs,
+                         const int n_classes) nogil:
+    cdef int i
+    cdef int best = 0
+    cdef weight_t mode = -900000
+    for i in range(n_classes):
+        if costs[i] == 0 and scores[i] > mode:
+            mode = scores[i]
+            best = i
+    return best
+
+
 cdef class Model:
     def __init__(self, n_classes, templates, model_loc=None):
         if model_loc is not None and path.isdir(model_loc):
             model_loc = path.join(model_loc, 'model')
         self.n_classes = n_classes
         self._extractor = Extractor(templates)
+        self.n_feats = self._extractor.n_templ
         self._model = LinearModel(n_classes, self._extractor.n_templ)
         self.model_loc = model_loc
         if self.model_loc and path.exists(self.model_loc):
             self._model.load(self.model_loc, freq_thresh=0)

+    def predict(self, Example eg):
+        self.set_scores(eg.c.scores, eg.c.atoms)
+        eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes)
+
+    def train(self, Example eg):
+        self.predict(eg)
+        eg.c.best = arg_max_if_zero(eg.c.scores, eg.c.costs, self.n_classes)
+        eg.c.cost = eg.c.costs[eg.c.guess]
+        self.update(eg.c.atoms, eg.c.guess, eg.c.best, eg.c.cost)
+
     cdef const weight_t* score(self, atom_t* context) except NULL:
         cdef int n_feats
         feats = self._extractor.get_feats(context, &n_feats)
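As a reading aid (not from the commit): arg_max_if_zero is a masked argmax, picking the highest-scoring class among those the oracle says are cost-free. An equivalent pure-Python sketch:

def masked_argmax_zero_cost(scores, costs):
    # highest-scoring class whose oracle cost is zero
    best, mode = 0, float('-inf')
    for i, (score, cost) in enumerate(zip(scores, costs)):
        if cost == 0 and score > mode:
            mode, best = score, i
    return best

assert masked_argmax_zero_cost([3.0, 9.0, 5.0], [0, 1, 0]) == 2  # class 1 is masked out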
3  spacy/_nn.py  (new file)

@@ -0,0 +1,3 @@
"""Feed-forward neural network, using Theano."""
146  spacy/_nn.pyx  (new file)

@@ -0,0 +1,146 @@
"""Feed-forward neural network, using Theano."""

import os
import sys
import time

import numpy

import theano
import theano.tensor as T
import plac

from spacy.gold import read_json_file
from spacy.gold import GoldParse
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir


def build_model(n_classes, n_vocab, n_hidden, n_word_embed, n_tag_embed):
    # allocate symbolic variables for the data
    words = T.vector('words')
    tags = T.vector('tags')

    # NB: this draft references several names that are not defined in this
    # commit (n_words, n_tags, n_labels, n_label_embed, word_indices,
    # tag_indices, y, L1_reg, L2_reg, learning_rate), and it passes more
    # weights to the L1/L2 helpers below than they accept.
    word_e = _init_embedding(n_words, n_word_embed)
    tag_e = _init_embedding(n_tags, n_tag_embed)
    label_e = _init_embedding(n_labels, n_label_embed)
    maxent_W, maxent_b = _init_maxent_weights(n_hidden, n_classes)
    hidden_W, hidden_b = _init_hidden_weights(28*28, n_hidden, T.tanh)
    params = [hidden_W, hidden_b, maxent_W, maxent_b, word_e, tag_e, label_e]

    x = T.concatenate([
        T.flatten(word_e[word_indices], outdim=1),
        T.flatten(tag_e[tag_indices], outdim=1)])

    p_y_given_x = feed_layer(
        T.nnet.softmax,
        maxent_W,
        maxent_b,
        feed_layer(
            T.tanh,
            hidden_W,
            hidden_b,
            x))[0]

    guess = T.argmax(p_y_given_x)

    cost = (
        -T.log(p_y_given_x[y])
        + L1(L1_reg, maxent_W, hidden_W, word_e, tag_e)
        + L2(L2_reg, maxent_W, hidden_W, word_e, tag_e)
    )

    train_model = theano.function(
        inputs=[words, tags, y],
        outputs=guess,
        updates=[update(learning_rate, param, cost) for param in params]
    )

    evaluate_model = theano.function(
        inputs=[x, y],
        outputs=T.neq(y, T.argmax(p_y_given_x[0])),
    )
    return train_model, evaluate_model
def _init_embedding(vocab_size, n_dim):
    embedding = 0.2 * numpy.random.uniform(-1.0, 1.0, (vocab_size+1, n_dim))
    return theano.shared(embedding.astype(theano.config.floatX))


def _init_maxent_weights(n_hidden, n_out):
    weights = numpy.zeros((n_hidden, 10), dtype=theano.config.floatX)
    bias = numpy.zeros((10,), dtype=theano.config.floatX)
    return (
        theano.shared(name='W', borrow=True, value=weights),
        theano.shared(name='b', borrow=True, value=bias)
    )


def _init_hidden_weights(n_in, n_out, activation=T.tanh):
    rng = numpy.random.RandomState(1234)
    weights = numpy.asarray(
        rng.uniform(
            low=-numpy.sqrt(6. / (n_in + n_out)),
            high=numpy.sqrt(6. / (n_in + n_out)),
            size=(n_in, n_out)
        ),
        dtype=theano.config.floatX
    )

    bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
    return (
        theano.shared(value=weights, name='W', borrow=True),
        theano.shared(value=bias, name='b', borrow=True)
    )


def feed_layer(activation, weights, bias, input):
    return activation(T.dot(input, weights) + bias)


def L1(L1_reg, w1, w2):
    return L1_reg * (abs(w1).sum() + abs(w2).sum())


def L2(L2_reg, w1, w2):
    return L2_reg * ((w1 ** 2).sum() + (w2 ** 2).sum())


def update(eta, param, cost):
    return (param, param - (eta * T.grad(cost, param)))
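The input this draft feeds the hidden layer is the concatenation of flattened embedding rows for the word and tag ids. A NumPy sketch of that lookup-and-concatenate step, with sizes invented for illustration:

import numpy

rng = numpy.random.RandomState(0)
word_e = rng.uniform(-0.2, 0.2, (100, 10))   # vocab of 100, 10-dim vectors
tag_e = rng.uniform(-0.2, 0.2, (20, 5))      # 20 tags, 5-dim vectors

word_indices = [3, 17, 42]
tag_indices = [1, 4, 7]
x = numpy.concatenate([word_e[word_indices].ravel(),
                       tag_e[tag_indices].ravel()])
assert x.shape == (3 * 10 + 3 * 5,)          # one long feature vector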
def main(train_loc, eval_loc, model_dir):
    # NB: n_hidden, n_epochs, gold_tuples, nlp and loss are referenced below
    # but never defined in this draft, and path/shutil are not imported
    # above.
    learning_rate = 0.01
    L1_reg = 0.00
    L2_reg = 0.0001

    print "... reading the data"
    gold_train = list(read_json_file(train_loc))
    print '... building the model'
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(pos_model_dir)

    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)

    train_model, evaluate_model = build_model(n_hidden, len(POS_TAGS), learning_rate,
                                              L1_reg, L2_reg)

    print '... training'
    for epoch in range(1, n_epochs+1):
        for raw_text, sents in gold_tuples:
            for (ids, words, tags, ner, heads, deps), _ in sents:
                tokens = nlp.tokenizer.tokens_from_list(words)
                for t in tokens:
                    guess = train_model([t.orth], [t.tag])
                    loss += guess != t.tag
        print loss
        # compute zero-one loss on validation set
        #error = numpy.mean([evaluate_model(x, y) for x, y in dev_examples])
        #print('epoch %i, validation error %f %%' % (epoch, error * 100))


if __name__ == '__main__':
    plac.call(main)
13  spacy/_theano.pxd  (new file)

@@ -0,0 +1,13 @@
from ._ml cimport Model
from thinc.nn cimport InputLayer


cdef class TheanoModel(Model):
    cdef InputLayer input_layer
    cdef object train_func
    cdef object predict_func
    cdef object debug

    cdef public float eta
    cdef public float mu
    cdef public float t
52  spacy/_theano.pyx  (new file)

@@ -0,0 +1,52 @@
from thinc.api cimport Example, ExampleC
from thinc.typedefs cimport weight_t

from ._ml cimport arg_max_if_true
from ._ml cimport arg_max_if_zero

import numpy
from os import path


cdef class TheanoModel(Model):
    def __init__(self, n_classes, input_spec, train_func, predict_func, model_loc=None,
                 eta=0.001, mu=0.9, debug=None):
        if model_loc is not None and path.isdir(model_loc):
            model_loc = path.join(model_loc, 'model')

        self.eta = eta
        self.mu = mu
        self.t = 1
        initializer = lambda: 0.2 * numpy.random.uniform(-1.0, 1.0)
        self.input_layer = InputLayer(input_spec, initializer)
        self.train_func = train_func
        self.predict_func = predict_func
        self.debug = debug

        self.n_classes = n_classes
        self.n_feats = len(self.input_layer)
        self.model_loc = model_loc

    def predict(self, Example eg):
        self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=True)
        theano_scores = self.predict_func(eg.embeddings)[0]
        cdef int i
        for i in range(self.n_classes):
            eg.c.scores[i] = theano_scores[i]
        eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes)

    def train(self, Example eg):
        self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=False)
        theano_scores, update, y, loss = self.train_func(eg.embeddings, eg.costs,
                                                         self.eta, self.mu)
        self.input_layer.update(update, eg.atoms, self.t, self.eta, self.mu)
        for i in range(self.n_classes):
            eg.c.scores[i] = theano_scores[i]
        eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes)
        eg.c.best = arg_max_if_zero(eg.c.scores, eg.c.costs, self.n_classes)
        eg.c.cost = eg.c.costs[eg.c.guess]
        eg.c.loss = loss
        self.t += 1

    def end_training(self):
        pass
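The pattern here, a thin class that satisfies the parser's Model interface while delegating the arithmetic to compiled Theano callables, can be sketched in plain Python (hypothetical names, not the commit's API):

class CallableBackedModel(object):
    # Minimal mirror of the TheanoModel idea: hold two callables and do
    # the masked argmax over their output scores.
    def __init__(self, n_classes, predict_func, train_func):
        self.n_classes = n_classes
        self.predict_func = predict_func
        self.train_func = train_func

    def predict(self, features, is_valid):
        scores = self.predict_func(features)
        valid = [(s, i) for i, s in enumerate(scores) if is_valid[i]]
        return max(valid)[1]   # index of the best valid class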
@@ -398,7 +398,8 @@ cdef class ArcEager(TransitionSystem):
             n_valid += output[i]
         assert n_valid >= 1

-    cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1:
+    cdef int set_costs(self, bint* is_valid, int* costs,
+                       StateClass stcls, GoldParse gold) except -1:
         cdef int i, move, label
         cdef label_cost_func_t[N_MOVES] label_cost_funcs
         cdef move_cost_func_t[N_MOVES] move_cost_funcs

@@ -423,30 +424,14 @@ cdef class ArcEager(TransitionSystem):
         n_gold = 0
         for i in range(self.n_moves):
             if self.c[i].is_valid(stcls, self.c[i].label):
+                is_valid[i] = True
                 move = self.c[i].move
                 label = self.c[i].label
                 if move_costs[move] == -1:
                     move_costs[move] = move_cost_funcs[move](stcls, &gold.c)
-                output[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label)
-                n_gold += output[i] == 0
+                costs[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label)
+                n_gold += costs[i] == 0
             else:
-                output[i] = 9000
+                is_valid[i] = False
+                costs[i] = 9000
         assert n_gold >= 1

-    cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *:
-        cdef bint[N_MOVES] is_valid
-        is_valid[SHIFT] = Shift.is_valid(stcls, -1)
-        is_valid[REDUCE] = Reduce.is_valid(stcls, -1)
-        is_valid[LEFT] = LeftArc.is_valid(stcls, -1)
-        is_valid[RIGHT] = RightArc.is_valid(stcls, -1)
-        is_valid[BREAK] = Break.is_valid(stcls, -1)
-        cdef Transition best
-        cdef weight_t score = MIN_SCORE
-        cdef int i
-        for i in range(self.n_moves):
-            if scores[i] > score and is_valid[self.c[i].move]:
-                best = self.c[i]
-                score = scores[i]
-        assert best.clas < self.n_moves
-        assert score > MIN_SCORE, (stcls.stack_depth(), stcls.buffer_length(), stcls.is_final(), stcls._b_i, stcls.length)
-        return best
17  spacy/syntax/joint.pxd  (new file)

@@ -0,0 +1,17 @@
from cymem.cymem cimport Pool

from thinc.typedefs cimport weight_t

from .stateclass cimport StateClass

from .transition_system cimport TransitionSystem, Transition
from ..gold cimport GoldParseC


cdef class ArcEager(TransitionSystem):
    pass


cdef int push_cost(StateClass stcls, const GoldParseC* gold, int target) nogil
cdef int arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) nogil
452  spacy/syntax/joint.pyx  (new file)

@@ -0,0 +1,452 @@
# cython: profile=True
from __future__ import unicode_literals

import ctypes
import os

from ..structs cimport TokenC

from .transition_system cimport do_func_t, get_cost_func_t
from .transition_system cimport move_cost_func_t, label_cost_func_t
from ..gold cimport GoldParse
from ..gold cimport GoldParseC

from libc.stdint cimport uint32_t
from libc.string cimport memcpy

from cymem.cymem cimport Pool
from .stateclass cimport StateClass


DEF NON_MONOTONIC = True
DEF USE_BREAK = True
DEF USE_ROOT_ARC_SEGMENT = True

cdef weight_t MIN_SCORE = -90000

# Break transition from here
# http://www.aclweb.org/anthology/P13-1074
cdef enum:
    SHIFT
    REDUCE
    LEFT
    RIGHT

    BREAK

    N_MOVES


MOVE_NAMES = [None] * N_MOVES
MOVE_NAMES[SHIFT] = 'S'
MOVE_NAMES[REDUCE] = 'D'
MOVE_NAMES[LEFT] = 'L'
MOVE_NAMES[RIGHT] = 'R'
MOVE_NAMES[BREAK] = 'B'


# Helper functions for the arc-eager oracle

cdef int push_cost(StateClass stcls, const GoldParseC* gold, int target) nogil:
    cdef int cost = 0
    cdef int i, S_i
    for i in range(stcls.stack_depth()):
        S_i = stcls.S(i)
        if gold.heads[target] == S_i:
            cost += 1
        if gold.heads[S_i] == target and (NON_MONOTONIC or not stcls.has_head(S_i)):
            cost += 1
    cost += Break.is_valid(stcls, -1) and Break.move_cost(stcls, gold) == 0
    return cost
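push_cost asks: if we push the first buffer word onto the stack, how many gold arcs between that word and the words already stacked become unreachable? A simplified pure-Python mirror, as a reading aid (it omits the NON_MONOTONIC and Break terms of the real oracle):

def push_cost(stack, b0, gold_heads):
    cost = 0
    for s_i in stack:
        if gold_heads[b0] == s_i:   # pushed word's gold head is on the stack
            cost += 1
        if gold_heads[s_i] == b0:   # a stacked word is gold-headed by b0
            cost += 1
    return cost

# token 2's gold head is token 1, which is on the stack: cost 1
assert push_cost(stack=[1, 0], b0=2, gold_heads=[0, 0, 1]) == 1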
cdef int pop_cost(StateClass stcls, const GoldParseC* gold, int target) nogil:
    cdef int cost = 0
    cdef int i, B_i
    for i in range(stcls.buffer_length()):
        B_i = stcls.B(i)
        cost += gold.heads[B_i] == target
        cost += gold.heads[target] == B_i
        if gold.heads[B_i] == B_i or gold.heads[B_i] < target:
            break
    cost += Break.is_valid(stcls, -1) and Break.move_cost(stcls, gold) == 0
    return cost


cdef int arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) nogil:
    if arc_is_gold(gold, head, child):
        return 0
    elif stcls.H(child) == gold.heads[child]:
        return 1
    # Head in buffer
    elif gold.heads[child] >= stcls.B(0) and stcls.B(1) != -1:
        return 1
    else:
        return 0


cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil:
    if gold.labels[child] == -1:
        return True
    elif USE_ROOT_ARC_SEGMENT and _is_gold_root(gold, head) and _is_gold_root(gold, child):
        return True
    elif gold.heads[child] == head:
        return True
    else:
        return False


cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) nogil:
    if gold.labels[child] == -1:
        return True
    elif label == -1:
        return True
    elif gold.labels[child] == label:
        return True
    else:
        return False


cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
    return gold.labels[word] == -1 or gold.heads[word] == word
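arc_is_gold and label_is_gold treat missing annotation (label -1) as compatible with anything. A simplified Python mirror of the head check, as a reading aid, leaving out the USE_ROOT_ARC_SEGMENT special case:

def arc_is_gold(gold_heads, gold_labels, head, child):
    if gold_labels[child] == -1:      # unannotated: any arc is acceptable
        return True
    return gold_heads[child] == head

assert arc_is_gold([1, 1], [0, -1], head=0, child=1)   # -1 label: always gold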
cdef class Shift:
    @staticmethod
    cdef bint is_valid(StateClass st, int label) nogil:
        return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start

    @staticmethod
    cdef int transition(StateClass st, int label) nogil:
        st.push()
        st.fast_forward()

    @staticmethod
    cdef int cost(StateClass st, const GoldParseC* gold, int label) nogil:
        return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label)

    @staticmethod
    cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil:
        return push_cost(s, gold, s.B(0))

    @staticmethod
    cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
        return 0


cdef class Reduce:
    @staticmethod
    cdef bint is_valid(StateClass st, int label) nogil:
        return st.stack_depth() >= 2

    @staticmethod
    cdef int transition(StateClass st, int label) nogil:
        if st.has_head(st.S(0)):
            st.pop()
        else:
            st.unshift()
        st.fast_forward()

    @staticmethod
    cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil:
        return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label)

    @staticmethod
    cdef inline int move_cost(StateClass st, const GoldParseC* gold) nogil:
        return pop_cost(st, gold, st.S(0))

    @staticmethod
    cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
        return 0


cdef class LeftArc:
    @staticmethod
    cdef bint is_valid(StateClass st, int label) nogil:
        return not st.B_(0).sent_start

    @staticmethod
    cdef int transition(StateClass st, int label) nogil:
        st.add_arc(st.B(0), st.S(0), label)
        st.pop()
        st.fast_forward()

    @staticmethod
    cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil:
        return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label)

    @staticmethod
    cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil:
        cdef int cost = 0
        if arc_is_gold(gold, s.B(0), s.S(0)):
            return 0
        else:
            # Account for deps we might lose between S0 and stack
            if not s.has_head(s.S(0)):
                for i in range(1, s.stack_depth()):
                    cost += gold.heads[s.S(i)] == s.S(0)
                    cost += gold.heads[s.S(0)] == s.S(i)
            return cost + pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0))

    @staticmethod
    cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
        return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label)


cdef class RightArc:
    @staticmethod
    cdef bint is_valid(StateClass st, int label) nogil:
        return not st.B_(0).sent_start

    @staticmethod
    cdef int transition(StateClass st, int label) nogil:
        st.add_arc(st.S(0), st.B(0), label)
        st.push()
        st.fast_forward()

    @staticmethod
    cdef inline int cost(StateClass s, const GoldParseC* gold, int label) nogil:
        return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label)

    @staticmethod
    cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil:
        if arc_is_gold(gold, s.S(0), s.B(0)):
            return 0
        elif s.shifted[s.B(0)]:
            return push_cost(s, gold, s.B(0))
        else:
            return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0))

    @staticmethod
    cdef int label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
        return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label)


cdef class Break:
    @staticmethod
    cdef bint is_valid(StateClass st, int label) nogil:
        cdef int i
        if not USE_BREAK:
            return False
        elif st.at_break():
            return False
        elif st.B(0) == 0:
            return False
        elif st.stack_depth() < 1:
            return False
        elif (st.S(0) + 1) != st.B(0):
            # Must break at the token boundary
            return False
        else:
            return True

    @staticmethod
    cdef int transition(StateClass st, int label) nogil:
        st.set_break(st.B(0))
        st.fast_forward()

    @staticmethod
    cdef int cost(StateClass s, const GoldParseC* gold, int label) nogil:
        return Break.move_cost(s, gold) + Break.label_cost(s, gold, label)

    @staticmethod
    cdef inline int move_cost(StateClass s, const GoldParseC* gold) nogil:
        cdef int cost = 0
        cdef int i, j, S_i, B_i
        for i in range(s.stack_depth()):
            S_i = s.S(i)
            for j in range(s.buffer_length()):
                B_i = s.B(j)
                cost += gold.heads[S_i] == B_i
                cost += gold.heads[B_i] == S_i
        # Check for sentence boundary --- if it's here, we can't have any deps
        # between stack and buffer, so rest of action is irrelevant.
        s0_root = _get_root(s.S(0), gold)
        b0_root = _get_root(s.B(0), gold)
        if s0_root != b0_root or s0_root == -1 or b0_root == -1:
            return cost
        else:
            return cost + 1

    @staticmethod
    cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
        return 0

cdef int _get_root(int word, const GoldParseC* gold) nogil:
    while gold.heads[word] != word and gold.labels[word] != -1 and word >= 0:
        word = gold.heads[word]
    if gold.labels[word] == -1:
        return -1
    else:
        return word
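_get_root climbs the gold head chain until it reaches a self-attached root, returning -1 if it runs into missing annotation. The same logic in plain Python, with toy data for illustration:

def get_root(word, gold_heads, gold_labels):
    while gold_heads[word] != word and gold_labels[word] != -1 and word >= 0:
        word = gold_heads[word]
    return -1 if gold_labels[word] == -1 else word

# token 2 attaches to token 1, and token 1 is its own head (the root)
assert get_root(2, gold_heads=[1, 1, 1], gold_labels=[3, 3, 3]) == 1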
cdef class ArcEager(TransitionSystem):
    @classmethod
    def get_labels(cls, gold_parses):
        move_labels = {SHIFT: {'': True}, REDUCE: {'': True}, RIGHT: {'ROOT': True},
                       LEFT: {'ROOT': True}, BREAK: {'ROOT': True}}
        for raw_text, sents in gold_parses:
            for (ids, words, tags, heads, labels, iob), ctnts in sents:
                for child, head, label in zip(ids, heads, labels):
                    if label.upper() == 'ROOT':
                        label = 'ROOT'
                    if label != 'ROOT':
                        if head < child:
                            move_labels[RIGHT][label] = True
                        elif head > child:
                            move_labels[LEFT][label] = True
        return move_labels
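Concretely (an illustrative run, not from the commit): for a gold sentence with heads [1, 1, 1] and labels ['nsubj', 'ROOT', 'dobj'], the loop files 'nsubj' under LEFT (the head comes after the child) and 'dobj' under RIGHT (the head comes before the child), on top of the seeded 'ROOT' and '' entries:

ids, heads, labels = [0, 1, 2], [1, 1, 1], ['nsubj', 'ROOT', 'dobj']
move_labels = {'RIGHT': {'ROOT': True}, 'LEFT': {'ROOT': True}}
for child, head, label in zip(ids, heads, labels):
    if label != 'ROOT':
        if head < child:
            move_labels['RIGHT'][label] = True
        elif head > child:
            move_labels['LEFT'][label] = True
assert 'nsubj' in move_labels['LEFT'] and 'dobj' in move_labels['RIGHT']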
    cdef int preprocess_gold(self, GoldParse gold) except -1:
        for i in range(gold.length):
            if gold.heads[i] is None:  # Missing values
                gold.c.heads[i] = i
                gold.c.labels[i] = -1
            else:
                label = gold.labels[i]
                if label.upper() == 'ROOT':
                    label = 'ROOT'
                gold.c.heads[i] = gold.heads[i]
                gold.c.labels[i] = self.strings[label]
        for end, brackets in gold.brackets.items():
            for start, label_strs in brackets.items():
                gold.c.brackets[start][end] = 1
                for label_str in label_strs:
                    # Add the encoded label to the set
                    gold.brackets[end][start].add(self.strings[label_str])

    cdef Transition lookup_transition(self, object name) except *:
        # NB: move_str is only bound when the name contains a '-'; the
        # bare-move branch of this draft leaves it undefined.
        if '-' in name:
            move_str, label_str = name.split('-', 1)
            label = self.label_ids[label_str]
        else:
            label = 0
        move = MOVE_NAMES.index(move_str)
        for i in range(self.n_moves):
            if self.c[i].move == move and self.c[i].label == label:
                return self.c[i]

    def move_name(self, int move, int label):
        label_str = self.strings[label]
        if label_str:
            return MOVE_NAMES[move] + '-' + label_str
        else:
            return MOVE_NAMES[move]

    cdef Transition init_transition(self, int clas, int move, int label) except *:
        # TODO: Apparent Cython bug here when we try to use the Transition()
        # constructor with the function pointers
        cdef Transition t
        t.score = 0
        t.clas = clas
        t.move = move
        t.label = label
        if move == SHIFT:
            t.is_valid = Shift.is_valid
            t.do = Shift.transition
            t.get_cost = Shift.cost
        elif move == REDUCE:
            t.is_valid = Reduce.is_valid
            t.do = Reduce.transition
            t.get_cost = Reduce.cost
        elif move == LEFT:
            t.is_valid = LeftArc.is_valid
            t.do = LeftArc.transition
            t.get_cost = LeftArc.cost
        elif move == RIGHT:
            t.is_valid = RightArc.is_valid
            t.do = RightArc.transition
            t.get_cost = RightArc.cost
        elif move == BREAK:
            t.is_valid = Break.is_valid
            t.do = Break.transition
            t.get_cost = Break.cost
        else:
            raise Exception(move)
        return t

    cdef int initialize_state(self, StateClass st) except -1:
        # Ensure sent_start is set to 0 throughout
        for i in range(st.length):
            st._sent[i].sent_start = False
            st._sent[i].l_edge = i
            st._sent[i].r_edge = i
        st.fast_forward()

    cdef int finalize_state(self, StateClass st) except -1:
        cdef int root_label = self.strings['ROOT']
        for i in range(st.length):
            if st._sent[i].head == 0 and st._sent[i].dep == 0:
                st._sent[i].dep = root_label
            # If we're not using the Break transition, we segment via
            # root-labelled arcs between the root words.
            elif USE_ROOT_ARC_SEGMENT and st._sent[i].dep == root_label:
                st._sent[i].head = 0

    cdef int set_valid(self, bint* output, StateClass stcls) except -1:
        cdef bint[N_MOVES] is_valid
        is_valid[SHIFT] = Shift.is_valid(stcls, -1)
        is_valid[REDUCE] = Reduce.is_valid(stcls, -1)
        is_valid[LEFT] = LeftArc.is_valid(stcls, -1)
        is_valid[RIGHT] = RightArc.is_valid(stcls, -1)
        is_valid[BREAK] = Break.is_valid(stcls, -1)
        cdef int i
        n_valid = 0
        for i in range(self.n_moves):
            output[i] = is_valid[self.c[i].move]
            n_valid += output[i]
        assert n_valid >= 1

    cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1:
        cdef int i, move, label
        cdef label_cost_func_t[N_MOVES] label_cost_funcs
        cdef move_cost_func_t[N_MOVES] move_cost_funcs
        cdef int[N_MOVES] move_costs
        for i in range(N_MOVES):
            move_costs[i] = -1
        move_cost_funcs[SHIFT] = Shift.move_cost
        move_cost_funcs[REDUCE] = Reduce.move_cost
        move_cost_funcs[LEFT] = LeftArc.move_cost
        move_cost_funcs[RIGHT] = RightArc.move_cost
        move_cost_funcs[BREAK] = Break.move_cost

        label_cost_funcs[SHIFT] = Shift.label_cost
        label_cost_funcs[REDUCE] = Reduce.label_cost
        label_cost_funcs[LEFT] = LeftArc.label_cost
        label_cost_funcs[RIGHT] = RightArc.label_cost
        label_cost_funcs[BREAK] = Break.label_cost

        cdef int* labels = gold.c.labels
        cdef int* heads = gold.c.heads

        n_gold = 0
        for i in range(self.n_moves):
            if self.c[i].is_valid(stcls, self.c[i].label):
                move = self.c[i].move
                label = self.c[i].label
                if move_costs[move] == -1:
                    move_costs[move] = move_cost_funcs[move](stcls, &gold.c)
                output[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label)
                n_gold += output[i] == 0
            else:
                output[i] = 9000
        assert n_gold >= 1

    cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *:
        cdef bint[N_MOVES] is_valid
        is_valid[SHIFT] = Shift.is_valid(stcls, -1)
        is_valid[REDUCE] = Reduce.is_valid(stcls, -1)
        is_valid[LEFT] = LeftArc.is_valid(stcls, -1)
        is_valid[RIGHT] = RightArc.is_valid(stcls, -1)
        is_valid[BREAK] = Break.is_valid(stcls, -1)
        cdef Transition best
        cdef weight_t score = MIN_SCORE
        cdef int i
        for i in range(self.n_moves):
            if scores[i] > score and is_valid[self.c[i].move]:
                best = self.c[i]
                score = scores[i]
        assert best.clas < self.n_moves
        assert score > MIN_SCORE, (stcls.stack_depth(), stcls.buffer_length(), stcls.is_final(), stcls._b_i, stcls.length)
        return best
@@ -128,27 +128,6 @@ cdef class BiluoPushDown(TransitionSystem):
             raise Exception(move)
         return t

-    cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *:
-        cdef int best = -1
-        cdef weight_t score = -90000
-        cdef const Transition* m
-        cdef int i
-        for i in range(self.n_moves):
-            m = &self.c[i]
-            if m.is_valid(stcls, m.label) and scores[i] > score:
-                best = i
-                score = scores[i]
-        assert best >= 0
-        cdef Transition t = self.c[best]
-        t.score = score
-        return t
-
-    cdef int set_valid(self, bint* output, StateClass stcls) except -1:
-        cdef int i
-        for i in range(self.n_moves):
-            m = &self.c[i]
-            output[i] = m.is_valid(stcls, m.label)
-

 cdef class Missing:
     @staticmethod
@@ -8,9 +8,6 @@ from ..tokens cimport Tokens, TokenC


 cdef class Parser:
-    cdef readonly object cfg
-    cdef readonly Model model
-    cdef readonly TransitionSystem moves
-
-    cdef int _greedy_parse(self, Tokens tokens) except -1
-    cdef int _beam_parse(self, Tokens tokens) except -1
+    cdef public object cfg
+    cdef public Model model
+    cdef public TransitionSystem moves
@@ -19,17 +19,10 @@ from cymem.cymem cimport Pool, Address
from murmurhash.mrmr cimport hash64
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t


from util import Config

from thinc.features cimport Extractor
from thinc.features cimport Feature
from thinc.features cimport count_feats
from thinc.api cimport Example

from thinc.learner cimport LinearModel

from thinc.search cimport Beam
from thinc.search cimport MaxViolation

from ..tokens cimport Tokens, TokenC
from ..strings cimport StringStore

@@ -58,6 +51,8 @@ def get_templates(name):
        return pf.ner
    elif name == 'debug':
        return pf.unigrams
    elif name.startswith('embed'):
        return (pf.words, pf.tags, pf.labels)
    else:
        return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \
                pf.tree_shape + pf.trigrams)

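get_templates selects tuples of feature templates by name; each template names the context slots whose atoms are conjoined into one hashable feature key. A sketch of that extraction step, with hypothetical slot names and templates (not spaCy's real pf.* definitions):

# Illustrative only: templates are tuples of context-slot indices; an
# extractor conjoins the atoms at those slots into one hashable feature.
S0_WORD, S0_TAG, N0_WORD, N0_TAG = range(4)

TEMPLATES = (
    (S0_WORD,), (S0_TAG,),          # unigram templates
    (S0_WORD, N0_WORD),             # a conjunction template
)

def extract(context, templates=TEMPLATES):
    """Turn a filled context array into hashable feature keys."""
    return [tuple(context[slot] for slot in t) for t in templates]

# Example: extract([12, 3, 45, 7]) -> [(12,), (3,), (12, 45)]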
@@ -68,39 +63,103 @@ cdef class Parser:
        assert os.path.exists(model_dir) and os.path.isdir(model_dir)
        self.cfg = Config.read(model_dir, 'config')
        self.moves = transition_system(strings, self.cfg.labels)
        templates = get_templates(self.cfg.features)
        self.model = Model(self.moves.n_moves, templates, model_dir)
        self.model = Model(self.moves.n_moves, self.cfg.templates, model_dir)

    def __call__(self, Tokens tokens):
        if self.cfg.get('beam_width', 1) < 1:
            self._greedy_parse(tokens)
        else:
            self._beam_parse(tokens)
        cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
        self.moves.initialize_state(stcls)

        cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE,
                                  self.model.n_feats, self.model.n_feats)
        eg.scores[0] = 10
        assert eg.c.scores[0] == 10
        while not stcls.is_final():
            memset(eg.c.scores, 0, eg.c.nr_class * sizeof(weight_t))

            self.moves.set_valid(<bint*>eg.c.is_valid, stcls)
            fill_context(eg.c.atoms, stcls)

            self.model.predict(eg)

            self.moves.c[eg.c.guess].do(stcls, self.moves.c[eg.c.guess].label)
        self.moves.finalize_state(stcls)
        tokens.set_parse(stcls._sent)

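The new __call__ body above is a greedy transition loop: zero the scores, mask invalid moves, fill the context, score it, and apply the best-scoring valid transition until the state is final. A pure-Python sketch of that loop, with hypothetical model/moves/state stand-ins (transitions here expose clas/label/is_valid/do):

# Illustrative only: the prediction loop of __call__ above, in plain Python.
def parse(state, model, moves):
    """Greedy transition-based parsing: repeatedly score the current
    state, restrict to valid transitions, and apply the best one."""
    while not state.is_final():
        scores = model.score(state)                       # one score per class
        valid = [t for t in moves if t.is_valid(state, t.label)]
        guess = max(valid, key=lambda t: scores[t.clas])  # best valid move
        guess.do(state, guess.label)                      # advance the state
    return state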
    def train(self, Tokens tokens, GoldParse gold):
        self.moves.preprocess_gold(gold)
        if self.cfg.beam_width < 1:
            return self._greedy_train(tokens, gold)
        else:
            return self._beam_train(tokens, gold)

    cdef int _greedy_parse(self, Tokens tokens) except -1:
        cdef atom_t[CONTEXT_SIZE] context
        cdef int n_feats
        cdef Pool mem = Pool()
        cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
        self.moves.initialize_state(stcls)
        cdef Transition guess
        cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE,
                                  self.model.n_feats, self.model.n_feats)
        cdef weight_t loss = 0
        words = [w.orth_ for w in tokens]
        cdef Transition G
        while not stcls.is_final():
            fill_context(context, stcls)
            scores = self.model.score(context)
            guess = self.moves.best_valid(scores, stcls)
            #print self.moves.move_name(guess.move, guess.label), stcls.print_state(words)
            guess.do(stcls, guess.label)
            assert stcls._s_i >= 0
        self.moves.finalize_state(stcls)
        tokens.set_parse(stcls._sent)
            memset(eg.c.scores, 0, eg.c.nr_class * sizeof(weight_t))

            self.moves.set_costs(<bint*>eg.c.is_valid, eg.c.costs, stcls, gold)

            fill_context(eg.c.atoms, stcls)

            self.model.train(eg)

            G = self.moves.c[eg.c.guess]

            #if eg.c.cost != 0:
            #    print self.moves.move_name(G.move, G.label), stcls.print_state(words)
            self.moves.c[eg.c.guess].do(stcls, self.moves.c[eg.c.guess].label)
            loss += eg.c.loss
        return loss

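The training loop added here is cost-sensitive rather than gold-constrained: every valid transition gets a cost via set_costs, the model trains on the full cost vector, and the parser then follows its own (valid) guess so training visits the same states it will see at test time. A sketch under the same hypothetical stand-ins as before (not thinc's real Example API):

# Illustrative only: one cost-sensitive training pass, mirroring the
# loop that calls set_costs / model.train above.
def train_pass(state, model, moves, gold):
    loss = 0.0
    while not state.is_final():
        costs = [t.get_cost(state, gold, t.label)
                 if t.is_valid(state, t.label) else 9000
                 for t in moves]
        scores = model.score(state)
        model.update(state, scores, costs)   # e.g. push up zero-cost moves
        guess = max((i for i, c in enumerate(costs) if c < 9000),
                    key=lambda i: scores[i])
        moves[guess].do(state, moves[guess].label)
        loss += costs[guess]                 # accumulated oracle cost
    return loss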
# These are passed as callbacks to thinc.search.Beam
"""
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
    dest = <StateClass>_dest
    src = <StateClass>_src
    moves = <const Transition*>_moves
    dest.clone(src)
    moves[clas].do(dest, moves[clas].label)


cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
    cdef StateClass st = StateClass.init(<const TokenC*>tokens, length)
    st.fast_forward()
    Py_INCREF(st)
    return <void*>st


cdef int _check_final_state(void* _state, void* extra_args) except -1:
    return (<StateClass>_state).is_final()


def _cleanup(Beam beam):
    for i in range(beam.width):
        Py_XDECREF(<PyObject*>beam._states[i].content)
        Py_XDECREF(<PyObject*>beam._parents[i].content)

cdef hash_t _hash_state(void* _state, void* _) except 0:
    return <hash_t>_state

    #state = <const State*>_state
    #cdef atom_t[10] rep

    #rep[0] = state.stack[0] if state.stack_len >= 1 else 0
    #rep[1] = state.stack[-1] if state.stack_len >= 2 else 0
    #rep[2] = state.stack[-2] if state.stack_len >= 3 else 0
    #rep[3] = state.i
    #rep[4] = state.sent[state.stack[0]].l_kids if state.stack_len >= 1 else 0
    #rep[5] = state.sent[state.stack[0]].r_kids if state.stack_len >= 1 else 0
    #rep[6] = state.sent[state.stack[0]].dep if state.stack_len >= 1 else 0
    #rep[7] = state.sent[state.stack[-1]].dep if state.stack_len >= 2 else 0
    #if get_left(state, get_n0(state), 1) != NULL:
    #    rep[8] = get_left(state, get_n0(state), 1).dep
    #else:
    #    rep[8] = 0
    #rep[9] = state.sent[state.i].l_kids
    #return hash64(rep, sizeof(atom_t) * 10, 0)


    cdef int _beam_parse(self, Tokens tokens) except -1:
        cdef Beam beam = Beam(self.moves.n_moves, self.cfg.beam_width)

@@ -114,30 +173,6 @@ cdef class Parser:
        tokens.set_parse(state._sent)
        _cleanup(beam)

    def _greedy_train(self, Tokens tokens, GoldParse gold):
        cdef Pool mem = Pool()
        cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
        self.moves.initialize_state(stcls)

        cdef int cost
        cdef const Feature* feats
        cdef const weight_t* scores
        cdef Transition guess
        cdef Transition best
        cdef atom_t[CONTEXT_SIZE] context
        loss = 0
        words = [w.orth_ for w in tokens]
        history = []
        while not stcls.is_final():
            fill_context(context, stcls)
            scores = self.model.score(context)
            guess = self.moves.best_valid(scores, stcls)
            best = self.moves.best_gold(scores, stcls, gold)
            cost = guess.get_cost(stcls, &gold.c, guess.label)
            self.model.update(context, guess.clas, best.clas, cost)
            guess.do(stcls, guess.label)
            loss += cost
        return loss

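_greedy_train above uses the classic error-driven update: when the guess incurs cost, reward the oracle class and penalise the guessed class for every active feature. A dict-based sketch of that rule (the feature keys and weight layout are hypothetical, not thinc's LinearModel internals):

# Illustrative only: a perceptron-style update mirroring
# self.model.update(context, guess.clas, best.clas, cost) above.
from collections import defaultdict

def update(weights, features, guess_clas, best_clas, cost):
    """weights maps class -> {feature: weight}; features is the list of
    feature keys extracted from the current state."""
    if cost == 0:
        return                    # the guess was already oracle-optimal
    for f in features:
        weights[best_clas][f] = weights[best_clas].get(f, 0.0) + 1.0
        weights[guess_clas][f] = weights[guess_clas].get(f, 0.0) - 1.0

# Usage: weights = defaultdict(dict); update(weights, [("s0w", 12)], 3, 1, 2)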
    def _beam_train(self, Tokens tokens, GoldParse gold_parse):
        cdef Beam pred = Beam(self.moves.n_moves, self.cfg.beam_width)

@@ -200,50 +235,4 @@ cdef class Parser:
            count_feats(counts[clas], feats, n_feats, inc)
            self.moves.c[clas].do(stcls, self.moves.c[clas].label)


# These are passed as callbacks to thinc.search.Beam

cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
    dest = <StateClass>_dest
    src = <StateClass>_src
    moves = <const Transition*>_moves
    dest.clone(src)
    moves[clas].do(dest, moves[clas].label)


cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
    cdef StateClass st = StateClass.init(<const TokenC*>tokens, length)
    st.fast_forward()
    Py_INCREF(st)
    return <void*>st


cdef int _check_final_state(void* _state, void* extra_args) except -1:
    return (<StateClass>_state).is_final()


def _cleanup(Beam beam):
    for i in range(beam.width):
        Py_XDECREF(<PyObject*>beam._states[i].content)
        Py_XDECREF(<PyObject*>beam._parents[i].content)

cdef hash_t _hash_state(void* _state, void* _) except 0:
    return <hash_t>_state

    #state = <const State*>_state
    #cdef atom_t[10] rep

    #rep[0] = state.stack[0] if state.stack_len >= 1 else 0
    #rep[1] = state.stack[-1] if state.stack_len >= 2 else 0
    #rep[2] = state.stack[-2] if state.stack_len >= 3 else 0
    #rep[3] = state.i
    #rep[4] = state.sent[state.stack[0]].l_kids if state.stack_len >= 1 else 0
    #rep[5] = state.sent[state.stack[0]].r_kids if state.stack_len >= 1 else 0
    #rep[6] = state.sent[state.stack[0]].dep if state.stack_len >= 1 else 0
    #rep[7] = state.sent[state.stack[-1]].dep if state.stack_len >= 2 else 0
    #if get_left(state, get_n0(state), 1) != NULL:
    #    rep[8] = get_left(state, get_n0(state), 1).dep
    #else:
    #    rep[8] = 0
    #rep[9] = state.sent[state.i].l_kids
    #return hash64(rep, sizeof(atom_t) * 10, 0)
"""

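The commented-out block wires StateClass into a generic beam via init / transition / is-final callbacks. The sketch below is not thinc's actual Beam API; it only illustrates how such callbacks drive a width-limited search over cloned parser states:

# Illustrative only: a schematic beam search in plain Python, assuming
# hypothetical state/transition stand-ins like the sketches above.
import copy

def beam_search(init_state, moves, score, width):
    beam = [(0.0, init_state())]                  # (cumulative score, state)
    while not all(s.is_final() for _, s in beam):
        candidates = []
        for total, state in beam:
            if state.is_final():
                candidates.append((total, state))
                continue
            scores = score(state)
            for t in moves:
                if t.is_valid(state, t.label):
                    new_state = copy.deepcopy(state)   # like dest.clone(src)
                    t.do(new_state, t.label)
                    candidates.append((total + scores[t.clas], new_state))
        candidates.sort(key=lambda c: c[0], reverse=True)
        beam = candidates[:width]                 # keep the top `width` states
    return beam[0][1]                             # highest-scoring final state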
@@ -46,9 +46,5 @@ cdef class TransitionSystem:

    cdef int set_valid(self, bint* output, StateClass state) except -1

    cdef int set_costs(self, int* output, StateClass state, GoldParse gold) except -1

    cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *

    cdef Transition best_gold(self, const weight_t* scores, StateClass state,
                              GoldParse gold) except *
    cdef int set_costs(self, bint* is_valid, int* costs,
                       StateClass state, GoldParse gold) except -1

@@ -43,30 +43,17 @@ cdef class TransitionSystem:
    cdef Transition init_transition(self, int clas, int move, int label) except *:
        raise NotImplementedError

    cdef Transition best_valid(self, const weight_t* scores, StateClass s) except *:
        raise NotImplementedError

    cdef int set_valid(self, bint* output, StateClass state) except -1:
        raise NotImplementedError

    cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1:
    cdef int set_valid(self, bint* is_valid, StateClass stcls) except -1:
        cdef int i
        for i in range(self.n_moves):
            if self.c[i].is_valid(stcls, self.c[i].label):
                output[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label)
            is_valid[i] = self.c[i].is_valid(stcls, self.c[i].label)

    cdef int set_costs(self, bint* is_valid, int* costs,
                       StateClass stcls, GoldParse gold) except -1:
        cdef int i
        self.set_valid(is_valid, stcls)
        for i in range(self.n_moves):
            if is_valid[i]:
                costs[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label)
            else:
                costs[i] = 9000

    cdef Transition best_gold(self, const weight_t* scores, StateClass stcls,
                              GoldParse gold) except *:
        cdef Transition best
        cdef weight_t score = MIN_SCORE
        cdef int i
        for i in range(self.n_moves):
            if self.c[i].is_valid(stcls, self.c[i].label):
                cost = self.c[i].get_cost(stcls, &gold.c, self.c[i].label)
                if scores[i] > score and cost == 0:
                    best = self.c[i]
                    score = scores[i]
        assert score > MIN_SCORE
        return best

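The refactor in this final hunk splits validity from cost: set_valid fills a boolean mask, and set_costs reuses that mask, assigning the 9000 sentinel to invalid moves. A pure-Python sketch of the new contract, with the same hypothetical stand-ins as above:

# Illustrative only: the new set_valid / set_costs split in plain Python.
def set_valid(moves, state):
    return [t.is_valid(state, t.label) for t in moves]

def set_costs(moves, state, gold):
    is_valid = set_valid(moves, state)
    costs = [t.get_cost(state, gold, t.label) if ok else 9000
             for t, ok in zip(moves, is_valid)]
    return is_valid, costs

# Usage: zero-cost transitions are the oracle-optimal ones.
# is_valid, costs = set_costs(moves, state, gold)
# gold_moves = [t for t, c in zip(moves, costs) if c == 0]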