diff --git a/bin/parser/conll_train.py b/bin/parser/conll_train.py
index 8075dcd8a..e55215585 100755
--- a/bin/parser/conll_train.py
+++ b/bin/parser/conll_train.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+from __future__ import print_function
 from __future__ import division
 from __future__ import unicode_literals
@@ -9,6 +10,8 @@ import io
 import random
 import time
 import gzip
+import re
+import numpy

 import plac
 import cProfile
@@ -20,23 +23,29 @@ from spacy.gold import GoldParse
 from spacy.syntax.util import Config
 from spacy.syntax.arc_eager import ArcEager
-from spacy.syntax.parser import Parser
+from spacy.syntax.parser import Parser, get_templates
+from spacy.syntax.beam_parser import BeamParser
 from spacy.scorer import Scorer
 from spacy.tagger import Tagger
+from spacy.syntax.nonproj import PseudoProjectivity
+from spacy.syntax import _parse_features as pf

 # Last updated for spaCy v0.97


-def read_conll(file_):
+def read_conll(file_, n=0):
     """Read a standard CoNLL/MALT-style format"""
-    sents = []
-    for sent_str in file_.read().strip().split('\n\n'):
+    text = file_.read().strip()
+    sent_strs = re.split(r'\n\s*\n', text)
+    for sent_id, sent_str in enumerate(sent_strs):
+        if not sent_str.strip():
+            continue
         ids = []
         words = []
         heads = []
         labels = []
         tags = []
-        for i, line in enumerate(sent_str.split('\n')):
+        for i, line in enumerate(sent_str.strip().split('\n')):
             word, pos_string, head_idx, label = _parse_line(line)
             words.append(word)
             if head_idx < 0:
@@ -45,10 +54,10 @@ def read_conll(file_):
             heads.append(head_idx)
             labels.append(label)
             tags.append(pos_string)
-        text = ' '.join(words)
         annot = (ids, words, tags, heads, labels, ['O'] * len(ids))
-        sents.append((None, [(annot, [])]))
-    return sents
+        yield (None, [(annot, None)])
+        if n and sent_id >= n:
+            break


 def _parse_line(line):
@@ -68,21 +77,33 @@ def _parse_line(line):
         pos = pieces[4]
         head_idx = int(pieces[6])-1
         label = pieces[7]
-        if head_idx == 0:
+        if head_idx < 0:
             label = 'ROOT'
         return word, pos, head_idx, label


+def print_words(strings, words, embeddings):
+    ids = {strings[word]: word for word in words}
+    vectors = {}
+    for key, values in embeddings[5]:
+        if key in ids:
+            vectors[strings[key]] = values
+    for word in words:
+        if word in vectors:
+            print(word, vectors[word])
+
+
 def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
     tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-    nlp.tagger(tokens)
+    nlp.tagger.tag_from_strings(tokens, annot_tuples[2])
     nlp.parser(tokens)
     gold = GoldParse(tokens, annot_tuples, make_projective=False)
     scorer.score(tokens, gold, verbose=verbose, punct_labels=('--', 'p', 'punct'))


-def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,
-          gold_preproc=False, force_gold=False):
+def train(Language, gold_tuples, model_dir, dev_loc, n_iter=15, feat_set=u'basic',
+          learn_rate=0.001, update_step='sgd_cm',
+          batch_norm=False, seed=0, gold_preproc=False, force_gold=False):
     dep_model_dir = path.join(model_dir, 'deps')
     pos_model_dir = path.join(model_dir, 'pos')
     if path.exists(dep_model_dir):
@@ -92,66 +113,141 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0
     os.mkdir(dep_model_dir)
     os.mkdir(pos_model_dir)

-    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
-                 labels=ArcEager.get_labels(gold_tuples))
+    if feat_set != 'neural':
+        Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
+                     labels=ArcEager.get_labels(gold_tuples))
+
+    else:
+        feat_groups = [
+            (pf.core_words, 8),
+            (pf.core_tags, 4),
+            (pf.core_labels, 4),
+            (pf.core_shapes, 4),
+            ([f[0] for f in pf.valencies], 2)
+        ]
+        slots = []
+        vector_widths = []
+        feat_set = []
+        input_length = 0
+        for i, (feat_group, width) in enumerate(feat_groups):
+            feat_set.extend((f,) for f in feat_group)
+            slots += [i] * len(feat_group)
+            vector_widths.append(width)
+            input_length += width * len(feat_group)
+        hidden_layers = [128] * 5
+        rho = 1e-4
+        Config.write(dep_model_dir, 'config',
+            model='neural',
+            seed=seed,
+            labels=ArcEager.get_labels(gold_tuples),
+            feat_set=feat_set,
+            vector_widths=vector_widths,
+            slots=slots,
+            hidden_layers=hidden_layers,
+            update_step=update_step,
+            batch_norm=batch_norm,
+            eta=learn_rate,
+            mu=0.9,
+            ensemble_size=1,
+            rho=rho)

     nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
     nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
     nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)

+    for word in nlp.vocab:
+        word.norm = word.orth
+    words = list(nlp.vocab)
+    top5k = numpy.ndarray(shape=(10000, len(word.vector)), dtype='float32')
+    norms = numpy.ndarray(shape=(10000,), dtype='float32')
+    for i in range(10000):
+        if i >= 400 and words[i].has_vector:
+            top5k[i] = words[i].vector
+            norms[i] = numpy.sqrt(sum(top5k[i] ** 2))
+        else:
+            # Make these way off values, to make big distance.
+            top5k[i] = 100.0
+            norms[i] = 100.0
+    print("Setting vectors")
+    for word in words[10000:]:
+        if word.has_vector:
+            cosines = numpy.dot(top5k, word.vector)
+            cosines /= norms * numpy.sqrt(sum(word.vector ** 2))
+            most_similar = words[numpy.argmax(cosines)]
+            word.norm = most_similar.norm
+        else:
+            word.norm = word.shape
+
+    print(nlp.parser.model.widths)

-    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
+    print("Itn.\tP.Loss\tPruned\tTrain\tDev\tSize")
+    last_score = 0.0
+    nr_trimmed = 0
+    eg_seen = 0
+    loss = 0
     for itn in range(n_iter):
-        scorer = Scorer()
-        loss = 0
+        random.shuffle(gold_tuples)
         for _, sents in gold_tuples:
             for annot_tuples, _ in sents:
-                if len(annot_tuples[1]) == 1:
-                    continue
-
-                score_model(scorer, nlp, None, annot_tuples, verbose=False)
-
                 tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-                nlp.tagger(tokens)
-                gold = GoldParse(tokens, annot_tuples, make_projective=True)
-                if not gold.is_projective:
-                    raise Exception(
-                        "Non-projective sentence in training, after we should "
-                        "have enforced projectivity: %s" % annot_tuples
-                    )
-
+                nlp.tagger.tag_from_strings(tokens, annot_tuples[2])
+                gold = GoldParse(tokens, annot_tuples)
                 loss += nlp.parser.train(tokens, gold)
-                nlp.tagger.train(tokens, gold.tags)
-        random.shuffle(gold_tuples)
-        print('%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
-                                             scorer.tags_acc, scorer.token_acc))
-    print('end training')
+                eg_seen += 1
+                if eg_seen % 10000 == 0:
+                    scorer = Scorer()
+                    with io.open(dev_loc, 'r', encoding='utf8') as file_:
+                        for _, sents in read_conll(file_):
+                            for annot_tuples, _ in sents:
+                                score_model(scorer, nlp, None, annot_tuples)
+                    train_scorer = Scorer()
+                    for _, sents in gold_tuples[:1000]:
+                        for annot_tuples, _ in sents:
+                            score_model(train_scorer, nlp, None, annot_tuples)
+                    print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%d' % (itn, int(loss), nr_trimmed,
+                                                             train_scorer.uas, scorer.uas,
+                                                             nlp.parser.model.mem.size))
+                    loss = 0
+        if feat_set != 'basic':
+            nlp.parser.model.eta *= 0.99
+            threshold = 0.05 * (1.05 ** itn)
+            nr_trimmed = nlp.parser.model.sparsify_embeddings(threshold, True)
     nlp.end_training(model_dir)
-    print('done')
+    return nlp


 @plac.annotations(
     train_loc=("Location of CoNLL 09 formatted training file"),
     dev_loc=("Location of CoNLL 09 formatted development file"),
     model_dir=("Location of output model directory"),
-    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
     n_iter=("Number of training iterations", "option", "i", int),
+    batch_norm=("Use batch normalization and residual connections", "flag", "b"),
+    update_step=("Update step", "option", "u", str),
+    learn_rate=("Learn rate", "option", "e", float),
+    neural=("Use neural network?", "flag", "N")
 )
-def main(train_loc, dev_loc, model_dir, n_iter=15):
+def main(train_loc, dev_loc, model_dir, n_iter=15, neural=False, batch_norm=False,
+         learn_rate=0.001, update_step='sgd_cm'):
     with io.open(train_loc, 'r', encoding='utf8') as file_:
-        train_sents = read_conll(file_)
-    if not eval_only:
-        train(English, train_sents, model_dir, n_iter=n_iter)
-        nlp = English(data_dir=model_dir)
-    dev_sents = read_conll(io.open(dev_loc, 'r', encoding='utf8'))
+        train_sents = list(read_conll(file_))
+    # preprocess training data here before ArcEager.get_labels() is called
+    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)
+
+    nlp = train(English, train_sents, model_dir, dev_loc, n_iter=n_iter,
+                feat_set='neural' if neural else 'basic',
+                batch_norm=batch_norm,
+                learn_rate=learn_rate,
+                update_step=update_step)
     scorer = Scorer()
-    for _, sents in dev_sents:
-        for annot_tuples, _ in sents:
-            score_model(scorer, nlp, None, annot_tuples)
-    print('TOK', 100-scorer.token_acc)
+    with io.open(dev_loc, 'r', encoding='utf8') as file_:
+        for _, sents in read_conll(file_):
+            for annot_tuples, _ in sents:
+                score_model(scorer, nlp, None, annot_tuples)
+    print('TOK', scorer.token_acc)
     print('POS', scorer.tags_acc)
     print('UAS', scorer.uas)
     print('LAS', scorer.las)

+
 if __name__ == '__main__':
     plac.call(main)
diff --git a/bin/parser/train.py b/bin/parser/train.py
index 372c7932e..0a86bf933 100755
--- a/bin/parser/train.py
+++ b/bin/parser/train.py
@@ -23,7 +23,8 @@ from spacy.scorer import Scorer
 from spacy.syntax.arc_eager import ArcEager
 from spacy.syntax.ner import BiluoPushDown
 from spacy.tagger import Tagger
-from spacy.syntax.parser import Parser
+from spacy.syntax.parser import Parser, get_templates
+from spacy.syntax.beam_parser import BeamParser
 from spacy.syntax.nonproj import PseudoProjectivity
@@ -103,6 +104,23 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
     Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                  labels=ArcEager.get_labels(gold_tuples),
                  beam_width=beam_width,projectivize=pseudoprojective)
+    #feat_set, slots = get_templates('neural')
+    #vector_widths = [10, 10, 10]
+    #hidden_layers = [100, 100, 100]
+    #update_step = 'adam'
+    #eta = 0.001
+    #rho = 1e-4
+    #Config.write(dep_model_dir, 'config', model='neural',
+    #    seed=seed, labels=ArcEager.get_labels(gold_tuples),
+    #    feat_set=feat_set,
+    #    vector_widths=vector_widths,
+    #    slots=slots,
+    #    hidden_layers=hidden_layers,
+    #    update_step=update_step,
+    #    eta=eta,
+    #    rho=rho)
+
+
     Config.write(ner_model_dir, 'config', features='ner', seed=seed,
                  labels=BiluoPushDown.get_labels(gold_tuples),
                  beam_width=0)
@@ -112,8 +130,13 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',

     nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
     nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
-    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
-    nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
+    nlp.parser = BeamParser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
+    nlp.entity = BeamParser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
+    print(nlp.parser.model.widths)
+    for raw_text, sents in gold_tuples:
+        for annot_tuples, ctnt in sents:
+            for word in annot_tuples[1]:
+                _ = nlp.vocab[word]
     print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
     for itn in range(n_iter):
         scorer = Scorer()
@@ -224,12 +247,13 @@ def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc=
     if not eval_only:
         gold_train = list(read_json_file(train_loc))
         train(lang, gold_train, model_dir,
-              feat_set='basic' if not debug else 'debug',
+              feat_set='neural' if not debug else 'debug',
              gold_preproc=gold_preproc, n_sents=n_sents,
              corruption_level=corruption_level, n_iter=n_iter,
              verbose=verbose,pseudoprojective=pseudoprojective)
     if out_loc:
         write_parses(lang, dev_loc, model_dir, out_loc)
+    print(model_dir)
     scorer = evaluate(lang, list(read_json_file(dev_loc)), model_dir,
                       gold_preproc=gold_preproc, verbose=verbose)
     print('TOK', scorer.token_acc)
diff --git a/bin/parser/train_ud.py b/bin/parser/train_ud.py
index 213591804..108be6192 100644
--- a/bin/parser/train_ud.py
+++ b/bin/parser/train_ud.py
@@ -16,24 +16,86 @@ from spacy.syntax.arc_eager import ArcEager
 from spacy.syntax.parser import get_templates
 from spacy.scorer import Scorer
 import spacy.attrs
+from spacy.syntax.nonproj import PseudoProjectivity
+
+from spacy.syntax._parse_features import *

 from spacy.language import Language

-from spacy.tagger import W_orth
-
-TAGGER_TEMPLATES = (
-    (W_orth,),
-)
-
 try:
     from codecs import open
 except ImportError:
     pass

+features = [
+    (S2W,),
+    (S1W, ),
+    (S1rW,),
+    (S0lW, ),
+    (S0l2W, ),
+    (S0W, ),
+    (S0r2W, ),
+    (S0rW, ),
+    (N0l2W, ),
+    (N0lW, ),
+    (N0W, ),
+    (N1W, ),
+    (N2W, )
+]
+
+slots = [0] * len(features)
+
+features += [
+    (S2p,),
+    (S1p, ),
+    (S1rp,),
+    (S0lp,),
+    (S0l2p,),
+    (S0p, ),
+    (S0r2p, ),
+    (S0rp, ),
+    (N0l2p, ),
+    (N0lp, ),
+    (N0p, ),
+    (N1p, ),
+    (N2p, )
+]
+
+slots += [1] * (len(features) - len(slots))
+
+features += [
+    (S2L,),
+    (S1L,),
+    (S1rL,),
+    (S0lL,),
+    (S0l2L,),
+    (S0L,),
+    (S0rL,),
+    (S0r2L,),
+    (N0l2L,),
+    (N0lL,),
+]
+slots += [2] * (len(features) - len(slots))
+#
+#features += [(S2p, S1p), (S1p, S0p)]
+#slots += [3, 3]
+#features += [(S0p, N0p)]
+#slots += [4]
+#    (S0l2p, S0l2L, S0lp, S0l2L),
+#    (N0l2p, N0l2L, N0lp, N0lL),
+#    (S1p, S1rp, S1rL),
+#    (S0p, S0rp, S0rL),
+#)
+
+
+

 class TreebankParser(object):
     @staticmethod
-    def setup_model_dir(model_dir, labels, templates, feat_set='basic', seed=0):
+    def setup_model_dir(model_dir, labels, vector_widths=(300,), slots=(0,),
+                        hidden_layers=(300, 300),
+                        feat_set='basic', seed=0, update_step='sgd', eta=0.005, rho=0.0):
         dep_model_dir = path.join(model_dir, 'deps')
         pos_model_dir = path.join(model_dir, 'pos')
         if path.exists(dep_model_dir):
@@ -43,15 +105,16 @@ class TreebankParser(object):
         os.mkdir(dep_model_dir)
         os.mkdir(pos_model_dir)

-        Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
-                     labels=labels)
+        Config.write(dep_model_dir, 'config', model='neural', feat_set=feat_set,
+                     seed=seed, labels=labels, vector_widths=vector_widths, slots=slots,
+                     hidden_layers=hidden_layers, update_step=update_step, eta=eta, rho=rho)

     @classmethod
     def from_dir(cls, tag_map, model_dir):
-        vocab = Vocab(tag_map=tag_map, get_lex_attr=Language.default_lex_attrs())
+        vocab = Vocab.load(model_dir, get_lex_attr=Language.default_lex_attrs())
         vocab.get_lex_attr[spacy.attrs.LANG] = lambda _: 0
         tokenizer = Tokenizer(vocab, {}, None, None, None)
-        tagger = Tagger.blank(vocab, TAGGER_TEMPLATES)
+        tagger = Tagger.blank(vocab, Tagger.default_templates())

         cfg = Config.read(path.join(model_dir, 'deps'), 'config')
         parser = Parser.from_dir(path.join(model_dir, 'deps'), vocab.strings, ArcEager)
@@ -64,22 +127,14 @@ class TreebankParser(object):
         self.parser = parser

     def train(self, words, tags, heads, deps):
-        tokens = self.tokenizer.tokens_from_list(list(words))
-        self.tagger.train(tokens, tags)
-
         tokens = self.tokenizer.tokens_from_list(list(words))
         ids = range(len(words))
         ner = ['O'] * len(words)
-        gold = GoldParse(tokens, ((ids, words, tags, heads, deps, ner)),
-                         make_projective=False)
-        self.tagger(tokens)
-        if gold.is_projective:
-            try:
-                self.parser.train(tokens, gold)
-            except:
-                for id_, word, head, dep in zip(ids, words, heads, deps):
-                    print(id_, word, head, dep)
-                raise
+        gold = GoldParse(tokens, ((ids, words, tags, heads, deps, ner)))
+        self.tagger.tag_from_strings(tokens, tags)
+        loss = self.parser.train(tokens, gold)
+        PseudoProjectivity.deprojectivize(tokens)
+        return loss

     def __call__(self, words, tags=None):
         tokens = self.tokenizer.tokens_from_list(list(words))
@@ -88,6 +143,7 @@ class TreebankParser(object):
         else:
             self.tagger.tag_from_strings(tokens, tags)
         self.parser(tokens)
+        PseudoProjectivity.deprojectivize(tokens)
         return tokens

     def end_training(self, data_dir):
@@ -101,8 +157,6 @@ class TreebankParser(object):
         self.vocab.dump(path.join(data_dir, 'vocab', 'lexemes.bin'))


-
-
 def read_conllx(loc):
     with open(loc, 'r', 'utf8') as file_:
         text = file_.read()
@@ -119,8 +173,8 @@ def read_conllx(loc):
                 id_ = int(id_) - 1
                 head = (int(head) - 1) if head != '0' else id_
                 dep = 'ROOT' if dep == 'root' else dep
-                tokens.append((id_, word, tag, head, dep, 'O'))
-            tuples = zip(*tokens)
+                tokens.append([id_, word, tag, head, dep, 'O'])
+            tuples = [list(el) for el in zip(*tokens)]
             yield (None, [(tuples, [])])


@@ -134,27 +188,38 @@ def score_model(nlp, gold_docs, verbose=False):
     return scorer


-def main(train_loc, dev_loc, model_dir, tag_map_loc):
+@plac.annotations(
+    n_iter=("Number of training iterations", "option", "i", int),
+)
+def main(train_loc, dev_loc, model_dir, tag_map_loc, n_iter=10):
     with open(tag_map_loc) as file_:
         tag_map = json.loads(file_.read())
     train_sents = list(read_conllx(train_loc))
-    labels = ArcEager.get_labels(train_sents)
-    templates = get_templates('basic')
+    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)
+    dev_sents = list(read_conllx(dev_loc))

-    TreebankParser.setup_model_dir(model_dir, labels, templates)
+    labels = ArcEager.get_labels(train_sents)
+
+    TreebankParser.setup_model_dir(model_dir, labels,
+        feat_set=features, vector_widths=(10,10,10,30,30), slots=slots,
+        hidden_layers=(100,100,100), update_step='adam')

     nlp = TreebankParser.from_dir(tag_map, model_dir)
+    nlp.parser.model.rho = 1e-4
+    print(nlp.parser.model.widths)

-    for itn in range(15):
+    for itn in range(n_iter):
+        loss = 0.0
         for _, doc_sents in train_sents:
             for (ids, words, tags, heads, deps, ner), _ in doc_sents:
-                nlp.train(words, tags, heads, deps)
+                loss += nlp.train(words, tags, heads, deps)
         random.shuffle(train_sents)
-        scorer = score_model(nlp, read_conllx(dev_loc))
-        print('%d:\t%.3f\t%.3f' % (itn, scorer.uas, scorer.tags_acc))
+        scorer = score_model(nlp, dev_sents)
+        print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
+        print(nlp.parser.model.mem.size)
     nlp.end_training(model_dir)
     scorer = score_model(nlp, read_conllx(dev_loc))
-    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
+    print('Dev: %.3f\t%.3f\t%.3f' % (scorer.uas, scorer.las, scorer.tags_acc))


 if __name__ == '__main__':
diff --git a/setup.py b/setup.py
index 2098fb377..3871432cc 100644
--- a/setup.py
+++ b/setup.py
@@ -51,6 +51,7 @@ MOD_NAMES = [
     'spacy.syntax._state',
     'spacy.tokenizer',
     'spacy.syntax.parser',
+    'spacy.syntax.beam_parser',
     'spacy.syntax.nonproj',
     'spacy.syntax.transition_system',
     'spacy.syntax.arc_eager',
@@ -73,7 +74,8 @@ MOD_NAMES = [
 compile_options = {
     'msvc': ['/Ox', '/EHsc'],
     'mingw32' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function'],
-    'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function']
+    'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function',
+               '-I/Users/matt/blis/include/blis']
 }
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index c3badc60d..de5b129fd 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -1,3 +1,4 @@
+# cython: profile=True
 import numpy
 import io
 import json
@@ -264,13 +265,3 @@ cdef class GoldParse:

 def is_punct_label(label):
     return label == 'P' or label.lower() == 'punct'
-
-
-
-
-
-
-
-
-
-
diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx
index bc54e0c9d..4a17a0d61 100644
--- a/spacy/syntax/_parse_features.pyx
+++ b/spacy/syntax/_parse_features.pyx
@@ -35,8 +35,8 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
         context[11] = 0
         context[12] = 0
     else:
-        context[0] = token.lex.orth
-        context[1] = token.lemma
+        context[0] = token.lex.norm
+        context[1] = token.lex.norm
         context[2] = token.tag
         context[3] = token.lex.cluster
         # We've read in the string little-endian, so now we can take & (2**n)-1
@@ -366,27 +366,26 @@ trigrams = (


 words = (
-    S2w,
-    S1w,
-    S1rw,
-    S0lw,
-    S0l2w,
-    S0w,
-    S0r2w,
-    S0rw,
-    N0lw,
-    N0l2w,
-    N0w,
-    N1w,
-    N2w,
-    P1w,
-    P2w
+    S2W,
+    S1W,
+    S1rW,
+    S0lW,
+    S0l2W,
+    S0W,
+    S0r2W,
+    S0rW,
+    N0lW,
+    N0l2W,
+    N0W,
+    N1W,
+    N2W,
+    P1W,
+    P2W
 )

 tags = (
     S2p,
     S1p,
-    S1rp,
     S0lp,
     S0l2p,
     S0p,
@@ -404,7 +403,6 @@ tags = (
 labels = (
     S2L,
     S1L,
-    S1rL,
     S0lL,
     S0l2L,
     S0L,
@@ -412,9 +410,88 @@ labels = (
     S0rL,
     N0lL,
     N0l2L,
-    N0L,
-    N1L,
-    N2L,
-    P1L,
-    P2L
 )
+
+core_words = (
+    S2w,
+    S1w,
+    S0lw,
+    S0l2w,
+    S0w,
+    S0rw,
+    S0r2w,
+    N0lw,
+    N0l2w,
+    N0w,
+    N1w,
+    N2w,
+)
+
+
+core_shapes = (
+    S2_shape,
+    S1_shape,
+    S0l_shape,
+    S0l2_shape,
+    S0_shape,
+    S0r_shape,
+    S0r2_shape,
+    N0l_shape,
+    N0l2_shape,
+    N0_shape,
+    N1_shape,
+    N2_shape,
+)
+
+
+core_clusters = (
+    S2c,
+    S1c,
+    S0lc,
+    S0l2c,
+    S0c,
+    S0rc,
+    S0r2c,
+    N0lc,
+    N0l2c,
+    N0c,
+    N1c,
+    N2c,
+)
+
+
+
+core_tags = (
+    S2p,
+    S1p,
+    S0lp,
+    S0l2p,
+    S0p,
+    S0r2p,
+    S0rp,
+    N0lp,
+    N0l2p,
+    N0p,
+    N1p,
+    N2p,
+)
+
+core_labels = (
+    S2L,
+    S1L,
+    S0lL,
+    S0l2L,
+    S0L,
+    S0r2L,
+    S0rL,
+    N0lL,
+    N0l2L,
+)
+
+valencies = (
+    (N0lv,),
+    (S0lv,),
+    (S0rv,),
+    (S1lv,),
+    (S1rv,),
+)
diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd
index bf1dbf90f..ad8dc2ef2 100644
--- a/spacy/syntax/_state.pxd
+++ b/spacy/syntax/_state.pxd
@@ -1,6 +1,9 @@
 from libc.string cimport memcpy, memset
 from libc.stdlib cimport malloc, calloc, free
-from libc.stdint cimport uint32_t
+from libc.stdint cimport uint32_t, uint64_t
+
+from murmurhash.mrmr cimport hash64
+
 from ..vocab cimport EMPTY_LEXEME
 from ..structs cimport TokenC, Entity
 from ..lexeme cimport Lexeme
@@ -201,6 +204,21 @@ cdef cppclass StateC:
         else:
             return this.length - this._b_i

+    uint64_t hash() nogil const:
+        cdef TokenC[11] sig
+        sig[0] = this.S_(2)[0]
+        sig[1] = this.S_(1)[0]
+        sig[2] = this.R_(this.S(1), 1)[0]
+        sig[3] = this.L_(this.S(0), 1)[0]
+        sig[4] = this.L_(this.S(0), 2)[0]
+        sig[5] = this.S_(0)[0]
+        sig[6] = this.R_(this.S(0), 2)[0]
+        sig[7] = this.R_(this.S(0), 1)[0]
+        sig[8] = this.B_(0)[0]
+        sig[9] = this.E_(0)[0]
+        sig[10] = this.E_(1)[0]
+        return hash64(sig, sizeof(sig), this._s_i)
+
     void push() nogil:
         if this.B(0) != -1:
             this._stack[this._s_i] = this.B(0)
@@ -290,6 +308,8 @@ cdef cppclass StateC:
         memcpy(this._stack, src._stack, this.length * sizeof(int))
         memcpy(this._buffer, src._buffer, this.length * sizeof(int))
         memcpy(this._ents, src._ents, this.length * sizeof(Entity))
+        memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0]))
+        this.length = src.length
         this._b_i = src._b_i
         this._s_i = src._s_i
         this._e_i = src._e_i
diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index 4e2590734..409676c55 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -436,4 +436,11 @@ cdef class ArcEager(TransitionSystem):
             else:
                 is_valid[i] = False
                 costs[i] = 9000
-        assert n_gold >= 1
+        if n_gold < 1:
+            for annot in gold.orig_annot:
+                print(annot)
+            print([move_costs[i] for i in range(N_MOVES)])
+            print(gold.orig_annot[stcls.S(0)][1], gold.orig_annot[stcls.B(0)][1])
+            print(gold.heads[stcls.S(0)], gold.heads[stcls.B(0)])
+            print(gold.labels[stcls.S(0)], gold.labels[stcls.B(0)])
+            raise Exception("No gold moves")
diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx
index 9ceb0c0bf..cec0ea57f 100644
--- a/spacy/syntax/iterators.pyx
+++ b/spacy/syntax/iterators.pyx
@@ -10,7 +10,7 @@ def english_noun_chunks(doc):
     for i, word in enumerate(doc):
         if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
             yield word.left_edge.i, word.i+1, np_label
-        elif word.pos == NOUN and word.dep == conj:
+        elif word.pos in (NOUN, PROPN, PRON) and word.dep == conj:
             head = word.head
             while head.dep == conj and head.head.i < head.i:
                 head = head.head
diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd
index c22254c66..956c178a7 100644
--- a/spacy/syntax/parser.pxd
+++ b/spacy/syntax/parser.pxd
@@ -1,25 +1,37 @@
 from thinc.linear.avgtron cimport AveragedPerceptron
 from thinc.neural.nn cimport NeuralNet
+from thinc.linear.features cimport ConjunctionExtracter
 from thinc.base cimport Model
 from thinc.extra.eg cimport Example
+from thinc.typedefs cimport weight_t
+from thinc.structs cimport FeatureC

 from .stateclass cimport StateClass
 from .arc_eager cimport TransitionSystem
 from ..tokens.doc cimport Doc
 from ..structs cimport TokenC
-from thinc.structs cimport ExampleC
+from thinc.structs cimport NeuralNetC, ExampleC
 from ._state cimport StateC


 cdef class ParserNeuralNet(NeuralNet):
-    cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil
+    cdef ConjunctionExtracter extracter
+    cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil
+

 cdef class ParserPerceptron(AveragedPerceptron):
-    cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil
+    cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil
+

+cdef class ParserNeuralNetEnsemble(ParserNeuralNet):
+    cdef object _models
+    cdef NeuralNetC** _models_c
+    cdef int** _masks
+    cdef int _nr_model
+
+
 cdef class Parser:
-    cdef readonly ParserNeuralNet model
+    cdef readonly Model model
     cdef readonly TransitionSystem moves
     cdef int _projectivize
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index b83f7bc07..3b1d7a284 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -1,4 +1,5 @@
 # cython: infer_types=True
+# cython: profile=True
 """
 MALT-style dependency parser
 """
@@ -18,13 +19,14 @@ import shutil
 import json
 import sys
 from .nonproj import PseudoProjectivity
+import random

 from cymem.cymem cimport Pool, Address
 from murmurhash.mrmr cimport hash64
-from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
+from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t, idx_t
 from thinc.linear.avgtron cimport AveragedPerceptron
 from thinc.linalg cimport VecVec
-from thinc.structs cimport SparseArrayC, ExampleC
+from thinc.structs cimport NeuralNetC, SparseArrayC, ExampleC
 from preshed.maps cimport MapStruct
 from preshed.maps cimport map_get
 from thinc.structs cimport FeatureC
@@ -61,8 +63,10 @@ def get_templates(name):
         return pf.ner
     elif name == 'debug':
         return pf.unigrams
-    elif name.startswith('embed'):
-        return (pf.words, pf.tags, pf.labels)
+    elif name.startswith('neural'):
+        features = pf.words + pf.tags + pf.labels
+        slots = [0] * len(pf.words) + [1] * len(pf.tags) + [2] * len(pf.labels)
+        return ([(f,) for f in features], slots)
     else:
         return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \
                 pf.tree_shape + pf.trigrams)
@@ -73,72 +77,238 @@ def ParserFactory(transition_system):


 cdef class ParserPerceptron(AveragedPerceptron):
-    cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil:
+    @property
+    def widths(self):
+        return (self.extracter.nr_templ,)
+
+    def update(self, Example eg):
+        '''Does regression on negative cost. Sort of cute?'''
+        self.time += 1
+        cdef weight_t loss = 0.0
+        best = eg.best
+        for clas in range(eg.c.nr_class):
+            if not eg.c.is_valid[clas]:
+                continue
+            if eg.c.scores[clas] < eg.c.scores[best]:
+                continue
+            loss += (-eg.c.costs[clas] - eg.c.scores[clas]) ** 2
+            d_loss = 2 * (-eg.c.costs[clas] - eg.c.scores[clas])
+            step = d_loss * 0.001
+            for feat in eg.c.features[:eg.c.nr_feat]:
+                self.update_weight(feat.key, clas, feat.value * step)
+        return int(loss)
+
+    cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil:
+        state = <const StateC*>_state
         fill_context(eg.atoms, state)
         eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)


 cdef class ParserNeuralNet(NeuralNet):
-    def __init__(self, nr_class, hidden_width=50, depth=2, word_width=50,
-                 tag_width=20, dep_width=20, update_step='sgd', eta=0.01, rho=0.0):
-        #input_length = 3 * word_width + 5 * tag_width + 3 * dep_width
-        input_length = 12 * word_width + 7 * dep_width
-        widths = [input_length] + [hidden_width] * depth + [nr_class]
-        #vector_widths = [word_width, tag_width, dep_width]
-        #slots = [0] * 3 + [1] * 5 + [2] * 3
-        vector_widths = [word_width, dep_width]
-        slots = [0] * 12 + [1] * 7
-        NeuralNet.__init__(
-            self,
-            widths,
-            embed=(vector_widths, slots),
-            eta=eta,
-            rho=rho,
-            update_step=update_step)
+    def __init__(self, shape, **kwargs):
+        vector_widths = [4] * 57
+        slots = [0, 1, 2, 3] # S0
+        slots += [4, 5, 6, 7] # S1
+        slots += [8, 9, 10, 11] # S2
+        slots += [12, 13, 14, 15] # S3+
+        slots += [16, 17, 18, 19] # B0
+        slots += [20, 21, 22, 23] # B1
+        slots += [24, 25, 26, 27] # B2
+        slots += [28, 29, 30, 31] # B3+
+        slots += [32, 33, 34, 35] * 2 # S0l, S0r
+        slots += [36, 37, 38, 39] * 2 # B0l, B0r
+        slots += [40, 41, 42, 43] * 2 # S1l, S1r
+        slots += [44, 45, 46, 47] * 2 # S2l, S2r
+        slots += [48, 49, 50, 51, 52]
+        slots += [53, 54, 55, 56]
+        input_length = sum(vector_widths[slot] for slot in slots)
+        widths = [input_length] + shape[3:]
+
+        NeuralNet.__init__(self, widths, embed=(vector_widths, slots), **kwargs)

     @property
     def nr_feat(self):
-        #return 3+5+3
-        return 12+7
+        return 2000

-    cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil:
+    cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil:
+        memset(eg.features, 0, 2000 * sizeof(FeatureC))
+        state = <const StateC*>_state
         fill_context(eg.atoms, state)
-        eg.nr_feat = 12 + 7
-        for j in range(eg.nr_feat):
-            eg.features[j].value = 1.0
-            eg.features[j].i = j
-
-        #eg.features[0].key = eg.atoms[S0w]
-        #eg.features[1].key = eg.atoms[S1w]
-        #eg.features[2].key = eg.atoms[N0w]
+        feats = eg.features

-        eg.features[0].key = eg.atoms[S2W]
-        eg.features[1].key = eg.atoms[S1W]
-        eg.features[2].key = eg.atoms[S0lW]
-        eg.features[3].key = eg.atoms[S0l2W]
-        eg.features[4].key = eg.atoms[S0W]
-        eg.features[5].key = eg.atoms[S0r2W]
-        eg.features[6].key = eg.atoms[S0rW]
-        eg.features[7].key = eg.atoms[N0lW]
-        eg.features[8].key = eg.atoms[N0l2W]
-        eg.features[9].key = eg.atoms[N0W]
-        eg.features[10].key = eg.atoms[N1W]
-        eg.features[11].key = eg.atoms[N2W]
+        feats = _add_token(feats, 0, state.S_(0), 1.0)
+        feats = _add_token(feats, 4, state.S_(1), 1.0)
+        feats = _add_token(feats, 8, state.S_(2), 1.0)
+        # Rest of the stack, with exponential decay
+        for i in range(3, state.stack_depth()):
+            feats = _add_token(feats, 12, state.S_(i), 1.0 * 0.5**(i-2))
+        feats = _add_token(feats, 16, state.B_(0), 1.0)
+        feats = _add_token(feats, 20, state.B_(1), 1.0)
+        feats = _add_token(feats, 24, state.B_(2), 1.0)
+        # Rest of the buffer, with exponential decay
+        for i in range(3, min(8, state.buffer_length())):
+            feats = _add_token(feats, 28, state.B_(i), 1.0 * 0.5**(i-2))
+        feats = _add_subtree(feats, 32, state, state.S(0))
+        feats = _add_subtree(feats, 40, state, state.B(0))
+        feats = _add_subtree(feats, 48, state, state.S(1))
+        feats = _add_subtree(feats, 56, state, state.S(2))
+        feats = _add_pos_bigram(feats, 64, state.S_(0), state.B_(0))
+        feats = _add_pos_bigram(feats, 65, state.S_(1), state.S_(0))
+        feats = _add_pos_bigram(feats, 66, state.S_(1), state.B_(0))
+        feats = _add_pos_bigram(feats, 67, state.S_(0), state.B_(1))
+        feats = _add_pos_bigram(feats, 68, state.B_(0), state.B_(1))
+        feats = _add_pos_trigram(feats, 69, state.S_(1), state.S_(0), state.B_(0))
+        feats = _add_pos_trigram(feats, 70, state.S_(0), state.B_(0), state.B_(1))
+        feats = _add_pos_trigram(feats, 71, state.S_(0), state.R_(state.S(0), 1),
+                                 state.R_(state.S(0), 2))
+        feats = _add_pos_trigram(feats, 72, state.S_(0), state.L_(state.S(0), 1),
+                                 state.L_(state.S(0), 2))
+        eg.nr_feat = feats - eg.features

-        eg.features[12].key = eg.atoms[S2L]
-        eg.features[13].key = eg.atoms[S1L]
-        eg.features[14].key = eg.atoms[S0l2L]
-        eg.features[15].key = eg.atoms[S0lL]
-        eg.features[16].key = eg.atoms[S0L]
-        eg.features[17].key = eg.atoms[S0r2L]
-        eg.features[18].key = eg.atoms[S0rL]
+
+cdef inline FeatureC* _add_token(FeatureC* feats,
+        int slot, const TokenC* token, weight_t value) nogil:
+    # Word
+    feats.i = slot
+    feats.key = token.lex.norm
+    feats.value = value
+    feats += 1
+    # POS tag
+    feats.i = slot+1
+    feats.key = token.tag
+    feats.value = value
+    feats += 1
+    # Dependency label
+    feats.i = slot+2
+    feats.key = token.dep
+    feats.value = value
+    feats += 1
+    # Word, label, tag
+    feats.i = slot+3
+    cdef uint64_t key[3]
+    key[0] = token.lex.cluster
+    key[1] = token.tag
+    key[2] = token.dep
+    feats.key = hash64(key, sizeof(key), 0)
+    feats.value = value
+    feats += 1
+    return feats
+
+
+cdef inline FeatureC* _add_subtree(FeatureC* feats, int slot, const StateC* state, int t) nogil:
+    value = 1.0
+    for i in range(state.n_R(t)):
+        feats = _add_token(feats, slot, state.R_(t, i+1), value)
+        value *= 0.5
+    slot += 4
+    value = 1.0
+    for i in range(state.n_L(t)):
+        feats = _add_token(feats, slot, state.L_(t, i+1), value)
+        value *= 0.5
+    return feats
+
+
+cdef inline FeatureC* _add_pos_bigram(FeatureC* feat, int slot,
+        const TokenC* t1, const TokenC* t2) nogil:
+    cdef uint64_t[2] key
+    key[0] = t1.tag
+    key[1] = t2.tag
+    feat.i = slot
+    feat.key = hash64(key, sizeof(key), slot)
+    feat.value = 1.0
+    return feat+1
+
+
+cdef inline FeatureC* _add_pos_trigram(FeatureC* feat, int slot,
+        const TokenC* t1, const TokenC* t2, const TokenC* t3) nogil:
+    cdef uint64_t[3] key
+    key[0] = t1.tag
+    key[1] = t2.tag
+    key[2] = t3.tag
+    feat.i = slot
+    feat.key = hash64(key, sizeof(key), slot)
+    feat.value = 1.0
+    return feat+1
+
+
+cdef class ParserNeuralNetEnsemble(ParserNeuralNet):
+    def __init__(self, shape, update_step='sgd', eta=0.01, rho=0.0, n=5):
+        ParserNeuralNet.__init__(self, shape, update_step=update_step, eta=eta, rho=rho)
+        self._models_c = <NeuralNetC**>self.mem.alloc(sizeof(NeuralNetC*), n)
+        self._masks = <int**>self.mem.alloc(sizeof(int*), n)
+        self._models = []
+        cdef ParserNeuralNet model
+        threshold = 1.5 / n
+        self._nr_model = n
+        for i in range(n):
+            self._masks[i] = <int*>self.mem.alloc(sizeof(int), self.nr_feat)
+            for j in range(self.nr_feat):
+                self._masks[i][j] = random.random() < threshold
+            # We have to pass our pool here, because the embedding table passes
+            # it around.
+            model = ParserNeuralNet(shape, update_step=update_step, eta=eta, rho=rho)
+            self._models_c[i] = &model.c
+            self._models.append(model)
+
+    property eta:
+        def __get__(self):
+            return self._models[0].eta
+
+        def __set__(self, weight_t value):
+            for model in self._models:
+                model.eta = value
+
+    def sparsify_embeddings(self, penalty):
+        p = 0.0
+        for model in self._models:
+            p += model.sparsify_embeddings(penalty)
+        return p / len(self._models)
+
+    cdef void set_scoresC(self, weight_t* scores, const void* _feats,
+            int nr_feat, int is_sparse) nogil:
+        nr_class = self.c.widths[self.c.nr_layer-1]
+        sub_scores = <weight_t*>calloc(sizeof(weight_t), nr_class)
+        sub_feats = <FeatureC*>calloc(sizeof(FeatureC), nr_feat)
+        feats = <const FeatureC*>_feats
+        for i in range(self._nr_model):
+            for j in range(nr_feat):
+                sub_feats[j] = feats[j]
+                sub_feats[j].value *= self._masks[i][j]
+            self.c = self._models_c[i][0]
+            self.c.weights = self._models_c[i].weights
+            self.c.gradient = self._models_c[i].gradient
+            ParserNeuralNet.set_scoresC(self, sub_scores, sub_feats, nr_feat, 1)
+            for j in range(nr_class):
+                scores[j] += sub_scores[j]
+                sub_scores[j] = 0.0
+        for j in range(nr_class):
+            scores[j] /= self._nr_model
+        free(sub_feats)
+        free(sub_scores)
+
+    def update(self, Example eg):
+        if eg.cost == 0:
+            return 0.0
+        loss = 0.0
+        full_feats = <FeatureC*>calloc(sizeof(FeatureC), eg.nr_feat)
+        memcpy(full_feats, eg.c.features, sizeof(FeatureC) * eg.nr_feat)
+        cdef ParserNeuralNet model
+        for i, model in enumerate(self._models):
+            for j in range(eg.nr_feat):
+                eg.c.features[j].value *= self._masks[i][j]
+            loss += model.update(eg)
+            memcpy(eg.c.features, full_feats, sizeof(FeatureC) * eg.nr_feat)
+        free(full_feats)
+        return loss
+
+    def end_training(self):
+        for model in self._models:
+            model.end_training()


 cdef class Parser:
-    def __init__(self, StringStore strings, transition_system, ParserNeuralNet model,
-                 int projectivize = 0):
+    def __init__(self, StringStore strings, transition_system, model):
         self.moves = transition_system
         self.model = model
-        self._projectivize = projectivize

     @classmethod
     def from_dir(cls, model_dir, strings, transition_system):
@@ -148,16 +318,24 @@ cdef class Parser:
             print >> sys.stderr, "Warning: model path:", model_dir, "is not a directory"
         cfg = Config.read(model_dir, 'config')
         moves = transition_system(strings, cfg.labels)
-        model = ParserNeuralNet(moves.n_moves, hidden_width=cfg.hidden_width,
-                                depth=cfg.depth, word_width=cfg.word_width,
-                                tag_width=cfg.tag_width, dep_width=cfg.dep_width,
-                                update_step=cfg.update_step,
-                                eta=cfg.eta, rho=cfg.rho)
-        project = cfg.projectivize if hasattr(cfg,'projectivize') else False
+        if cfg.get('model') == 'neural':
+            shape = [cfg.vector_widths, cfg.slots, cfg.feat_set]
+            shape.extend(cfg.hidden_layers)
+            shape.append(moves.n_moves)
+            if cfg.get('ensemble_size') >= 2:
+                model = ParserNeuralNetEnsemble(shape, update_step=cfg.update_step,
+                                                eta=cfg.eta, rho=cfg.rho,
+                                                n=cfg.ensemble_size)
+            else:
+                model = ParserNeuralNet(shape, update_step=cfg.update_step,
+                                        eta=cfg.eta, rho=cfg.rho)
+        else:
+            model = ParserPerceptron(get_templates(cfg.feat_set))
+
         if path.exists(path.join(model_dir, 'model')):
             model.load(path.join(model_dir, 'model'))
-        return cls(strings, moves, model, project)
+        return cls(strings, moves, model)

     @classmethod
     def load(cls, pkg_or_str_or_file, vocab):
@@ -253,18 +431,18 @@ cdef class Parser:
                          widths=self.model.widths,
                          nr_atom=CONTEXT_SIZE,
                          nr_feat=self.model.nr_feat)
-        cdef weight_t loss = 0
+        loss = 0
         cdef Transition action
         while not stcls.is_final():
             self.model.set_featuresC(eg.c, stcls.c)
+            self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat, 1)
             self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
-
-            # Sets eg.c.scores, which Example uses to calculate eg.guess
-            self.model.updateC(eg.c)
-
-            action = self.moves.c[eg.guess]
+            guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
+            assert guess >= 0
+            action = self.moves.c[guess]
             action.do(stcls.c, action.label)
-            loss += eg.loss
+
+            loss += self.model.update(eg)
             eg.reset()
         return loss
diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd
index 51e465188..9b2d10f89 100644
--- a/spacy/tagger.pxd
+++ b/spacy/tagger.pxd
@@ -7,7 +7,7 @@ from .vocab cimport Vocab


 cdef class TaggerModel(AveragedPerceptron):
-    cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *
+    cdef void set_featuresC(self, ExampleC* eg, const void* _token) nogil


 cdef class Tagger:
diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx
index 991e008ad..e1c3d9b07 100644
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@@ -71,13 +71,13 @@ cpdef enum:


 cdef class TaggerModel(AveragedPerceptron):
-    cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *:
-
-        _fill_from_token(&eg.atoms[P2_orth], &tokens[i-2])
-        _fill_from_token(&eg.atoms[P1_orth], &tokens[i-1])
-        _fill_from_token(&eg.atoms[W_orth], &tokens[i])
-        _fill_from_token(&eg.atoms[N1_orth], &tokens[i+1])
-        _fill_from_token(&eg.atoms[N2_orth], &tokens[i+2])
+    cdef void set_featuresC(self, ExampleC* eg, const void* _token) nogil:
+        token = <const TokenC*>_token
+        _fill_from_token(&eg.atoms[P2_orth], token - 2)
+        _fill_from_token(&eg.atoms[P1_orth], token - 1)
+        _fill_from_token(&eg.atoms[W_orth], token)
+        _fill_from_token(&eg.atoms[N1_orth], token + 1)
+        _fill_from_token(&eg.atoms[N2_orth], token + 2)

         eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)

@@ -153,7 +153,7 @@ cdef class Tagger:
     @classmethod
     def from_package(cls, pkg, vocab):
         # TODO: templates.json deprecated? not present in latest package
-        # templates = cls.default_templates()
+        #templates = cls.default_templates()
         templates = pkg.load_json(('pos', 'templates.json'), default=cls.default_templates())

         model = TaggerModel(templates)
@@ -202,12 +202,13 @@ cdef class Tagger:
                      nr_feat=self.model.nr_feat)
         for i in range(tokens.length):
             if tokens.c[i].pos == 0:
-                self.model.set_featuresC(eg.c, tokens.c, i)
+                self.model.set_featuresC(eg.c, &tokens.c[i])
                 self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat, 1)
                 guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
                 self.vocab.morphology.assign_tag(&tokens.c[i], guess)
                 eg.fill_scores(0, eg.c.nr_class)
+                eg.reset()
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length

@@ -231,18 +232,15 @@ cdef class Tagger:
                     nr_class=self.vocab.morphology.n_tags,
                     nr_feat=self.model.nr_feat)
         for i in range(tokens.length):
-            self.model.set_featuresC(eg.c, tokens.c, i)
+            self.model.set_featuresC(eg.c, &tokens.c[i])
             eg.costs = [ 1 if golds[i] not in (c, -1) else 0 for c in xrange(eg.nr_class) ]
             self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat, 1)
-            self.model.updateC(eg.c)
-            self.vocab.morphology.assign_tag(&tokens.c[i], eg.guess)
-
+            self.model.update(eg)
             correct += eg.cost == 0
             self.freqs[TAG][tokens.c[i].tag] += 1
-            eg.fill_scores(0, eg.c.nr_class)
-            eg.fill_costs(0, eg.c.nr_class)
+            eg.reset()
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
         return correct