diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index c7170c747..984cc1b5b 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -28,6 +28,8 @@ from murmurhash.mrmr cimport hash64
 from preshed.maps cimport MapStruct
 from preshed.maps cimport map_get

+from thinc.api import layerize
+
 from numpy import exp

 from . import _parse_features
@@ -55,40 +57,45 @@ def set_debug(val):


 def get_greedy_model_for_batch(tokvecs, TransitionSystem moves, feat_maps, upper_model):
-    is_valid = model.ops.allocate((len(docs), system.n_moves), dtype='i')
-    costs = model.ops.allocate((len(docs), system.n_moves), dtype='f')
-    token_ids = model.ops.allocate((len(docs), StateClass.nr_context_tokens()),
-                                   dtype='uint64')
-    cached, backprops = zip(*[lyr.begin_update(tokvecs) for lyr in feat_maps)
+    cdef int[:, :] is_valid_
+    cdef float[:, :] costs_
+    cdef int[:, :] token_ids
+    is_valid = upper_model.ops.allocate((len(tokvecs), moves.n_moves), dtype='i')
+    costs = upper_model.ops.allocate((len(tokvecs), moves.n_moves), dtype='f')
+    token_ids = upper_model.ops.allocate((len(tokvecs), StateClass.nr_context_tokens()),
+                                         dtype='uint64')
+    cached, backprops = zip(*[lyr.begin_update(tokvecs) for lyr in feat_maps])
+    is_valid_ = is_valid
+    costs_ = costs

     def forward(states, drop=0.):
-        nonlocal is_valid, costs, token_ids, features
+        nonlocal is_valid, costs, token_ids, moves
         is_valid = is_valid[:len(states)]
         costs = costs[:len(states)]
         token_ids = token_ids[:len(states)]
         is_valid = is_valid[:len(states)]
-        for state in states:
-            state.set_context_tokens(&token_ids[i])
-            moves.set_valid(&is_valid[i], state.c)
+        cdef StateClass state
+        for i, state in enumerate(states):
+            state.set_context_tokens(token_ids[i])
+            moves.set_valid(&is_valid_[i, 0], state.c)

         features = cached[token_ids].sum(axis=1)

         scores, bp_scores = upper_model.begin_update(features, drop=drop)
-        softmaxed = model.ops.softmax(scores)
+        softmaxed = upper_model.ops.softmax(scores)
         # Renormalize for invalid actions
         softmaxed *= is_valid
         softmaxed /= softmaxed.sum(axis=1).reshape((softmaxed.shape[0], 1))

         def backward(golds, sgd=None):
-            nonlocal costs_, is_valid_, moves_
-            cdef TransitionSystem moves = moves_
-            cdef int[:, :] is_valid
-            cdef float[:, :] costs
+            nonlocal costs_, is_valid_, moves
             for i, (state, gold) in enumerate(zip(states, golds)):
-                moves.set_costs(&costs[i], &is_valid[i],
+                moves.set_costs(&is_valid_[i, 0], &costs_[i, 0],
                                 state, gold)
-            set_log_loss(model.ops, d_scores,
-                scores, is_valid, costs)
+            d_scores = scores.copy()
+            d_scores.fill(0)
+            set_log_loss(upper_model.ops, d_scores,
+                scores, is_valid_, costs_)
             d_tokens = bp_scores(d_scores, sgd)
             return d_tokens

@@ -119,6 +126,17 @@ def transition_batch(TransitionSystem moves, states, scores):
         action.do(state.c, action.label)


+def init_states(TransitionSystem moves, docs):
+    states = []
+    cdef Doc doc
+    cdef StateClass state
+    for i, doc in enumerate(docs):
+        state = StateClass.init(doc.c, doc.length)
+        moves.initialize_state(state.c)
+        states.append(state)
+    return states
+
+
 cdef class Parser:
     """
     Base class of the DependencyParser and EntityRecognizer.
@@ -176,7 +194,8 @@ cdef class Parser:
     def build_model(self, width=32, nr_vector=1000, nF=1, nB=1, nS=1, nL=1, nR=1, **_):
         nr_context_tokens = StateClass.nr_context_tokens(nF, nB, nS, nL, nR)
         self.model = build_model(width*2, 2, self.moves.n_moves)
-        self.feature_maps = build_feature_maps(nr_context_tokens, width, nr_vector))
+        # TODO
+        self.feature_maps = [] #build_feature_maps(nr_context_tokens, width, nr_vector)

     def __call__(self, Doc tokens):
         """
@@ -248,6 +267,7 @@ cdef class Parser:

         model = get_greedy_model_for_batch([d.tensor for d in docs],
                     self.moves, self.model, self.feat_maps)
+        states = init_states(self.moves, docs)
         d_tokens = [self.model.ops.allocate(d.tensor.shape) for d in docs]
         output = list(d_tokens)

@@ -261,7 +281,7 @@ cdef class Parser:
             transition_batch(self.moves, states)
             # Get unfinished states (and their matching gold and token gradients)
             todo = filter(lambda sp: not sp[0].py_is_final(), todo)
-        return output, sum(losses)
+        return output

     def begin_training(self, docs, golds):
         for gold in golds:
@@ -336,31 +356,6 @@ def _begin_update(self, model, states, tokvecs, drop=0.):
         return finish_update(d_scores, sgd=sgd)
     return softmaxed, backward

-def _init_states(self, docs):
-    states = []
-    cdef Doc doc
-    cdef StateClass state
-    for i, doc in enumerate(docs):
-        state = StateClass.init(doc.c, doc.length)
-        self.moves.initialize_state(state.c)
-        states.append(state)
-    return states
-
-def _validate_batch(self, int[:, ::1] is_valid, states):
-    cdef StateClass state
-    cdef int i
-    for i, state in enumerate(states):
-        self.moves.set_valid(&is_valid[i, 0], state.c)
-
-def _cost_batch(self, weight_t[:, ::1] costs, int[:, ::1] is_valid,
-        states, golds):
-    cdef int i
-    cdef StateClass state
-    cdef GoldParse gold
-    for i, (state, gold) in enumerate(zip(states, golds)):
-        self.moves.set_costs(&is_valid[i, 0], &costs[i, 0], state, gold)
-
-
 def _get_features(self, states, all_tokvecs, attr_names,
                   nF=1, nB=0, nS=2, nL=2, nR=2):
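For orientation, a minimal usage sketch (not part of the patch) of how the new batched helpers — get_greedy_model_for_batch, init_states and transition_batch — are expected to compose into one training step. The wrapper function update_batch, the import path, and the assumption that the forward closure returns softmaxed scores plus a backward callback (mirroring the removed _begin_update helper) are illustrative guesses, not confirmed by this diff.

# Hypothetical usage sketch, illustration only.
from spacy.syntax.parser import (get_greedy_model_for_batch, init_states,
                                 transition_batch)

def update_batch(moves, feat_maps, upper_model, docs, golds, sgd=None):
    # Build the per-batch forward closure over the documents' token vectors.
    forward = get_greedy_model_for_batch([d.tensor for d in docs],
                                         moves, feat_maps, upper_model)
    states = init_states(moves, docs)
    todo = list(zip(states, golds))
    while todo:
        states, golds = zip(*todo)
        # Assumed contract: the closure returns softmaxed scores plus a
        # backward callback, as the removed _begin_update helper did.
        scores, backward = forward(states)
        backward(golds, sgd=sgd)  # returns d_tokens; scattering it back into
                                  # per-document gradient arrays is omitted here
        transition_batch(moves, states, scores)
        # Keep only states whose parses are not yet complete.
        todo = [(s, g) for s, g in todo if not s.py_is_final()]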