Tidy up syntax

ines 2017-10-27 19:45:57 +02:00
parent 5167a0cce2
commit b4d226a3f1
8 changed files with 195 additions and 230 deletions

View File

@@ -2,7 +2,7 @@
 # cython: profile=True
 cimport numpy as np
 import numpy
-from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
+from cpython.ref cimport PyObject, Py_XDECREF
 from thinc.extra.search cimport Beam
 from thinc.extra.search import MaxViolation
 from thinc.typedefs cimport hash_t, class_t
@@ -11,7 +11,6 @@ from thinc.extra.search cimport MaxViolation
 from .transition_system cimport TransitionSystem, Transition
 from .stateclass cimport StateClass
 from ..gold cimport GoldParse
-from ..tokens.doc cimport Doc


 # These are passed as callbacks to thinc.search.Beam
@@ -50,7 +49,7 @@ cdef class ParserBeam(object):
     cdef public object dones

     def __init__(self, TransitionSystem moves, states, golds,
-                  int width, float density):
+                 int width, float density):
        self.moves = moves
        self.states = states
        self.golds = golds
@@ -59,7 +58,8 @@ cdef class ParserBeam(object):
        cdef StateClass state, st
        for state in states:
            beam = Beam(self.moves.n_moves, width, density)
-            beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent)
+            beam.initialize(self.moves.init_beam_state, state.c.length,
+                            state.c._sent)
            for i in range(beam.width):
                st = <StateClass>beam.at(i)
                st.c.offset = state.c.offset
@@ -74,7 +74,8 @@ cdef class ParserBeam(object):
    @property
    def is_done(self):
-        return all(b.is_done or self.dones[i] for i, b in enumerate(self.beams))
+        return all(b.is_done or self.dones[i]
+                   for i, b in enumerate(self.beams))

    def __getitem__(self, i):
        return self.beams[i]
@@ -126,7 +127,8 @@ cdef class ParserBeam(object):
            for i in range(beam.size):
                state = <StateClass>beam.at(i)
                if not state.c.is_final():
-                    self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold)
+                    self.moves.set_costs(beam.is_valid[i], beam.costs[i],
+                                         state, gold)
                    if follow_gold:
                        for j in range(beam.nr_class):
                            if beam.costs[i][j] >= 1:
@@ -146,7 +148,10 @@ def get_token_ids(states, int n_tokens):
        c_ids += ids.shape[1]
    return ids

+
 nr_update = 0
+
+
 def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
                states, golds,
                state2vec, vec2scores,
@@ -167,23 +172,27 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
        if pbeam.is_done and gbeam.is_done:
            break
        # The beam maps let us find the right row in the flattened scores
-        # arrays for each state. States are identified by (example id, history).
-        # We keep a different beam map for each step (since we'll have a flat
-        # scores array for each step). The beam map will let us take the per-state
-        # losses, and compute the gradient for each (step, state, class).
+        # arrays for each state. States are identified by (example id,
+        # history). We keep a different beam map for each step (since we'll
+        # have a flat scores array for each step). The beam map will let us
+        # take the per-state losses, and compute the gradient for each (step,
+        # state, class).
        beam_maps.append({})
        # Gather all states from the two beams in a list. Some stats may occur
        # in both beams. To figure out which beam each state belonged to,
        # we keep two lists of indices, p_indices and g_indices
-        states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update)
+        states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1],
+                                                  nr_update)
        if not states:
            break
        # Now that we have our flat list of states, feed them through the model
        token_ids = get_token_ids(states, nr_feature)
        vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
        if hist_feats:
-            hists = numpy.asarray([st.history[:hist_feats] for st in states], dtype='i')
-            scores, bp_scores = vec2scores.begin_update((vectors, hists), drop=drop)
+            hists = numpy.asarray([st.history[:hist_feats] for st in states],
+                                  dtype='i')
+            scores, bp_scores = vec2scores.begin_update((vectors, hists),
+                                                        drop=drop)
        else:
            scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)
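The comment in this hunk is the heart of the beam update: each step keeps a beam map from (example id, action history) to a row of that step's flattened scores array, so per-state losses can later be routed to a (step, state, class) gradient. A minimal sketch of that bookkeeping, with hypothetical names (the real logic lives in get_states and get_gradient, not in this exact form):

    # Sketch of the beam-map bookkeeping described above. Hypothetical
    # names, not spaCy's actual code.
    beam_map = {}   # (eg_id, history) -> row in this step's flat scores
    states = []     # deduplicated states gathered from both beams

    def row_for(eg_id, history, state):
        key = (eg_id, tuple(history))
        if key not in beam_map:
            beam_map[key] = len(states)
            states.append(state)
        return beam_map[key]

    # A loss for a state at this step is later applied to its row:
    # grads[step][beam_map[(eg_id, tuple(history))], action] += loss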
@@ -192,8 +201,10 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
        # Unpack the flat scores into lists for the two beams. The indices arrays
        # tell us which example and state the scores-row refers to.
-        p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices]
-        g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices]
+        p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
+                    for indices in p_indices]
+        g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
+                    for indices in g_indices]
        # Now advance the states in the beams. The gold beam is contrained to
        # to follow only gold analyses.
        pbeam.advance(p_scores)
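The p_scores/g_scores rewrap above relies on numpy fancy indexing: each beam's rows are pulled out of one flat (n_states, n_classes) array. A self-contained toy example of the same indexing:

    import numpy

    # Toy version of the unpacking above: p_indices holds, per example,
    # the rows of the flat scores array that belong to the parser beam.
    scores = numpy.arange(12, dtype='f').reshape(4, 3)  # 4 states, 3 classes
    p_indices = [numpy.array([0, 2])]                   # beam 0's rows
    p_scores = [numpy.ascontiguousarray(scores[idx], dtype='f')
                for idx in p_indices]
    print(p_scores[0].shape)  # (2, 3)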
@@ -249,8 +260,7 @@ def get_states(pbeams, gbeams, beam_map, nr_update):

 def get_gradient(nr_class, beam_maps, histories, losses):
-    """
-    The global model assigns a loss to each parse. The beam scores
+    """The global model assigns a loss to each parse. The beam scores
    are additive, so the same gradient is applied to each action
    in the history. This gives the gradient of a single *action*
    for a beam state -- so we have "the gradient of loss for taking
@@ -270,7 +280,8 @@ def get_gradient(nr_class, beam_maps, histories, losses):
            if loss != 0.0 and not numpy.isnan(loss):
                nr_step = max(nr_step, len(hist))
    for i in range(nr_step):
-        grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), dtype='f'))
+        grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class),
+                                 dtype='f'))
    assert len(histories) == len(losses)
    for eg_id, hists in enumerate(histories):
        for loss, hist in zip(losses[eg_id], hists):
@@ -287,5 +298,3 @@ def get_gradient(nr_class, beam_maps, histories, losses):
                grads[j][i, clas] += loss
                key = key + tuple([clas])
    return grads
-
-
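The get_gradient docstring explains that beam scores are additive, so every action in a candidate parse's history receives the parse's whole loss. A toy rendering of that accumulation, with invented shapes and numbers:

    import numpy

    # Each (step, row, class) cell touched by the history gets the full
    # loss of the parse. Values here are invented for illustration.
    nr_class = 4
    history = [1, 3, 0]   # actions taken by one candidate parse
    loss = 0.5            # loss assigned to that parse
    rows = [0, 2, 1]      # row in each step's flat scores array
    grads = [numpy.zeros((3, nr_class), dtype='f') for _ in history]
    for step, (row, clas) in enumerate(zip(rows, history)):
        grads[step][row, clas] += loss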

View File

@@ -1 +0,0 @@
-# test

View File

@@ -4,24 +4,16 @@
 # coding: utf-8
 from __future__ import unicode_literals

-from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
-import ctypes
-from libc.stdint cimport uint32_t
-from libc.string cimport memcpy
+from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool
 from collections import OrderedDict
 from thinc.extra.search cimport Beam
-import numpy

 from .stateclass cimport StateClass
-from ._state cimport StateC, is_space_token
+from ._state cimport StateC
 from .nonproj import is_nonproj_tree
-from .transition_system cimport do_func_t, get_cost_func_t
 from .transition_system cimport move_cost_func_t, label_cost_func_t
-from ..gold cimport GoldParse
-from ..gold cimport GoldParseC
-from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE, IS_PUNCT
-from ..lexeme cimport Lexeme
+from ..gold cimport GoldParse, GoldParseC
 from ..structs cimport TokenC
@@ -316,14 +308,13 @@ cdef class ArcEager(TransitionSystem):
    @classmethod
    def get_actions(cls, **kwargs):
-        actions = kwargs.get('actions',
-            OrderedDict((
-                (SHIFT, ['']),
-                (REDUCE, ['']),
-                (RIGHT, []),
-                (LEFT, []),
-                (BREAK, ['ROOT'])
-            )))
+        actions = kwargs.get('actions', OrderedDict((
+            (SHIFT, ['']),
+            (REDUCE, ['']),
+            (RIGHT, []),
+            (LEFT, []),
+            (BREAK, ['ROOT']))
+        ))
        seen_actions = set()
        for label in kwargs.get('left_labels', []):
            if label.upper() != 'ROOT':
@@ -363,7 +354,8 @@ cdef class ArcEager(TransitionSystem):
            if gold.cand_to_gold[i] is None:
                continue
            if state.safe_get(i).dep:
-                predicted.add((i, state.H(i), self.strings[state.safe_get(i).dep]))
+                predicted.add((i, state.H(i),
+                               self.strings[state.safe_get(i).dep]))
            else:
                predicted.add((i, state.H(i), 'ROOT'))
            id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
@@ -381,7 +373,8 @@ cdef class ArcEager(TransitionSystem):
        if not self.has_gold(gold):
            return None
        for i in range(gold.length):
-            if gold.heads[i] is None or gold.labels[i] is None: # Missing values
+            # Missing values
+            if gold.heads[i] is None or gold.labels[i] is None:
                gold.c.heads[i] = i
                gold.c.has_dep[i] = False
            else:
@@ -517,14 +510,15 @@ cdef class ArcEager(TransitionSystem):
            # Check projectivity --- leading cause
            if is_nonproj_tree(gold.heads):
                raise ValueError(
-                    "Could not find a gold-standard action to supervise the dependency "
-                    "parser.\n"
-                    "Likely cause: the tree is non-projective (i.e. it has crossing "
-                    "arcs -- see spacy/syntax/nonproj.pyx for definitions)\n"
-                    "The ArcEager transition system only supports projective trees.\n"
-                    "To learn non-projective representations, transform the data "
-                    "before training and after parsing. Either pass make_projective=True "
-                    "to the GoldParse class, or use PseudoProjectivity.preprocess_training_data")
+                    "Could not find a gold-standard action to supervise the "
+                    "dependency parser. Likely cause: the tree is "
+                    "non-projective (i.e. it has crossing arcs -- see "
+                    "spacy/syntax/nonproj.pyx for definitions). The ArcEager "
+                    "transition system only supports projective trees. To "
+                    "learn non-projective representations, transform the data "
+                    "before training and after parsing. Either pass "
+                    "make_projective=True to the GoldParse class, or use "
+                    "spacy.syntax.nonproj.preprocess_training_data.")
            else:
                print(gold.orig_annot)
                print(gold.words)
@@ -532,12 +526,10 @@ cdef class ArcEager(TransitionSystem):
                print(gold.labels)
                print(gold.sent_starts)
                raise ValueError(
-                    "Could not find a gold-standard action to supervise the dependency "
-                    "parser.\n"
-                    "The GoldParse was projective.\n"
-                    "The transition system has %d actions.\n"
-                    "State at failure:\n"
-                    "%s" % (self.n_moves, stcls.print_state(gold.words)))
+                    "Could not find a gold-standard action to supervise the"
+                    "dependency parser. The GoldParse was projective. The "
+                    "transition system has %d actions. State at failure: %s"
+                    % (self.n_moves, stcls.print_state(gold.words)))
        assert n_gold >= 1

    def get_beam_annot(self, Beam beam):
@@ -558,4 +550,3 @@ cdef class ArcEager(TransitionSystem):
                deps[j].setdefault(dep, 0.0)
                deps[j][dep] += prob
        return heads, deps
-
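The get_beam_annot hunk above marginalizes over beam candidates: each candidate parse contributes its probability mass to the head/label counts of every token. A small sketch of that accumulation, with hypothetical numbers:

    # Each beam candidate (prob, {token: (dep, head)}) adds its probability
    # to the per-token dependency-label marginals, as in the hunk above.
    candidates = [
        (0.7, {1: ('nsubj', 0)}),
        (0.3, {1: ('dobj', 2)}),
    ]
    deps = {}
    for prob, parse in candidates:
        for j, (dep, head) in parse.items():
            deps.setdefault(j, {}).setdefault(dep, 0.0)
            deps[j][dep] += prob
    print(deps)  # {1: {'nsubj': 0.7, 'dobj': 0.3}}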

View File

@@ -4,17 +4,12 @@ from __future__ import unicode_literals
 from thinc.typedefs cimport weight_t
 from thinc.extra.search cimport Beam
 from collections import OrderedDict
-import numpy
-from thinc.neural.ops import NumpyOps

 from .stateclass cimport StateClass
 from ._state cimport StateC
 from .transition_system cimport Transition
 from .transition_system cimport do_func_t
-from ..structs cimport TokenC, Entity
-from ..gold cimport GoldParseC
-from ..gold cimport GoldParse
-from ..attrs cimport ENT_TYPE, ENT_IOB
+from ..gold cimport GoldParseC, GoldParse


 cdef enum:
@@ -69,15 +64,14 @@ cdef class BiluoPushDown(TransitionSystem):
    @classmethod
    def get_actions(cls, **kwargs):
-        actions = kwargs.get('actions',
-            OrderedDict((
-                (MISSING, ['']),
-                (BEGIN, []),
-                (IN, []),
-                (LAST, []),
-                (UNIT, []),
-                (OUT, [''])
-            )))
+        actions = kwargs.get('actions', OrderedDict((
+            (MISSING, ['']),
+            (BEGIN, []),
+            (IN, []),
+            (LAST, []),
+            (UNIT, []),
+            (OUT, [''])
+        )))
        seen_entities = set()
        for entity_type in kwargs.get('entity_types', []):
            if entity_type in seen_entities:
@@ -160,7 +154,7 @@ cdef class BiluoPushDown(TransitionSystem):
    cdef Transition lookup_transition(self, object name) except *:
        cdef attr_t label
-        if name == '-' or name == None:
+        if name == '-' or name is None:
            return Transition(clas=0, move=MISSING, label=0, score=0)
        elif name == '!O':
            return Transition(clas=0, move=ISNT, label=0, score=0)
@@ -328,8 +322,8 @@ cdef class In:
            return False
        elif preset_ent_iob == 3:
            return False
-        # TODO: Is this quite right?
-        # I think it's supposed to be ensuring the gazetteer matches are maintained
+        # TODO: Is this quite right? I think it's supposed to be ensuring the
+        # gazetteer matches are maintained
        elif st.B_(1).ent_iob != preset_ent_iob:
            return False
        # Don't allow entities to extend across sentence boundaries
@@ -354,10 +348,12 @@ cdef class In:
        if g_act == MISSING:
            return 0
        elif g_act == BEGIN:
-            # I, Gold B --> True (P of bad open entity sunk, R of this entity sunk)
+            # I, Gold B --> True
+            # (P of bad open entity sunk, R of this entity sunk)
            return 0
        elif g_act == IN:
-            # I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk)
+            # I, Gold I --> True
+            # (label forced by prev, if mismatch, P and R both sunk)
            return 0
        elif g_act == LAST:
            # I, Gold L --> True iff this entity sunk and next tag == O
@@ -505,11 +501,3 @@ cdef class Out:
            return 1
        else:
            return 1
-
-
-class OracleError(Exception):
-    pass
-
-
-class UnknownMove(Exception):
-    pass

View File

@@ -5,71 +5,48 @@
 # coding: utf-8
 from __future__ import unicode_literals, print_function

-from collections import Counter, OrderedDict
+from collections import OrderedDict
 import ujson
 import json
-import contextlib
 import numpy
-from libc.math cimport exp
-cimport cython
 cimport cython.parallel
 import cytoolz
-import dill
 import numpy.random
 cimport numpy as np

-from libcpp.vector cimport vector
-from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
+from cpython.ref cimport PyObject, Py_XDECREF
 from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
-from libc.stdint cimport uint32_t, uint64_t
-from libc.string cimport memset, memcpy
-from libc.stdlib cimport malloc, calloc, free
-from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
-from thinc.linear.avgtron cimport AveragedPerceptron
-from thinc.linalg cimport Vec, VecVec
-from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
-from thinc.extra.eg cimport Example
+from libc.math cimport exp
+from libcpp.vector cimport vector
+from libc.string cimport memset
+from libc.stdlib cimport calloc, free
+from cymem.cymem cimport Pool
+from thinc.typedefs cimport weight_t, class_t, hash_t
 from thinc.extra.search cimport Beam
-from cymem.cymem cimport Pool, Address
-from murmurhash.mrmr cimport hash64
-from preshed.maps cimport MapStruct
-from preshed.maps cimport map_get
-from thinc.api import layerize, chain, clone, with_flatten
-from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
+from thinc.api import chain, clone
+from thinc.v2v import Model, Maxout, Affine
 from thinc.misc import LayerNorm
-from thinc.neural.ops import NumpyOps, CupyOps
+from thinc.neural.ops import CupyOps
 from thinc.neural.util import get_array_module
-from .. import util
-from ..util import get_async, get_cuda_stream
-from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
-from .._ml import Tok2Vec, doc2feats, rebatch
-from .._ml import Residual, flatten
+from .._ml import zero_init, PrecomputableMaxouts, Tok2Vec, flatten
 from .._ml import link_vectors_to_models
 from ..compat import json_dumps, copy_array
+from ..tokens.doc cimport Doc
+from ..gold cimport GoldParse
+from .. import util
 from .stateclass cimport StateClass
 from ._state cimport StateC
-from . import nonproj
-from .transition_system import OracleError
-from .transition_system cimport TransitionSystem, Transition
-from ..structs cimport TokenC
-from ..tokens.doc cimport Doc
-from ..strings cimport StringStore
-from ..gold cimport GoldParse
-from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG
-from . import _beam_utils
+from .transition_system cimport Transition
+from . import _beam_utils, nonproj


 def get_templates(*args, **kwargs):
     return []

 DEBUG = False


 def set_debug(val):
     global DEBUG
     DEBUG = val
@@ -100,7 +77,8 @@ cdef class precompute_hiddens:
    cdef object _cuda_stream
    cdef object _bp_hiddens

-    def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None, drop=0.):
+    def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
+                 drop=0.):
        gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop)
        cdef np.ndarray cached
        if not isinstance(gpu_cached, numpy.ndarray):
@@ -120,8 +98,7 @@ cdef class precompute_hiddens:
        self._bp_hiddens = bp_features

    cdef const float* get_feat_weights(self) except NULL:
-        if not self._is_synchronized \
-        and self._cuda_stream is not None:
+        if not self._is_synchronized and self._cuda_stream is not None:
            self._cuda_stream.synchronize()
            self._is_synchronized = True
        return <float*>self._cached.data
@@ -130,7 +107,8 @@ cdef class precompute_hiddens:
        return self.begin_update(X)[0]

    def begin_update(self, token_ids, drop=0.):
-        cdef np.ndarray state_vector = numpy.zeros((token_ids.shape[0], self.nO*self.nP), dtype='f')
+        cdef np.ndarray state_vector = numpy.zeros(
+            (token_ids.shape[0], self.nO*self.nP), dtype='f')
        # This is tricky, but (assuming GPU available);
        # - Input to forward on CPU
        # - Output from forward on CPU
@@ -141,8 +119,8 @@ cdef class precompute_hiddens:
        feat_weights = self.get_feat_weights()
        cdef int[:, ::1] ids = token_ids
        sum_state_features(<float*>state_vector.data,
-            feat_weights, &ids[0,0],
-            token_ids.shape[0], self.nF, self.nO*self.nP)
+                           feat_weights, &ids[0, 0],
+                           token_ids.shape[0], self.nF, self.nO*self.nP)
        state_vector, bp_nonlinearity = self._nonlinearity(state_vector)

        def backward(d_state_vector, sgd=None):
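precompute_hiddens and sum_state_features implement the precomputation trick: the lower layer runs once per token, caching a per-(token, feature-slot) contribution, and each state's hidden vector is then just a sum of nF cached rows selected by its feature token ids. A numpy sketch of the idea, with invented shapes:

    import numpy

    # Invented shapes: 10 tokens, nF=3 feature slots, hidden width 8.
    nr_token, nF, nH = 10, 3, 8
    cached = numpy.random.rand(nr_token, nF, nH).astype('f')
    token_ids = numpy.array([4, 0, 7])  # one state's feature tokens
    # Pick row (token_ids[f], f) for each slot f and sum them.
    state_vector = cached[token_ids, numpy.arange(nF)].sum(axis=0)
    print(state_vector.shape)  # (8,)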
@@ -161,10 +139,11 @@ cdef class precompute_hiddens:
        state_vector = state_vector.reshape(
            (state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP))
        best, which = self.ops.maxout(state_vector)

        def backprop(d_best, sgd=None):
            return self.ops.backprop_maxout(d_best, which, self.nP)
+
        return best, backprop


 cdef void sum_state_features(float* output,
@@ -239,11 +218,15 @@ cdef class Parser:
        depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
        if depth != 1:
            raise ValueError("Currently parser depth is hard-coded to 1.")
-        parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2))
+        parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
+                                            cfg.get('maxout_pieces', 2))
        if parser_maxout_pieces != 2:
-            raise ValueError("Currently parser_maxout_pieces is hard-coded to 2")
-        token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128))
-        hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 200))
+            raise ValueError("Currently parser_maxout_pieces is hard-coded "
+                             "to 2")
+        token_vector_width = util.env_opt('token_vector_width',
+                                          cfg.get('token_vector_width', 128))
+        hidden_width = util.env_opt('hidden_width',
+                                    cfg.get('hidden_width', 200))
        embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
        hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
        hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
@@ -365,8 +348,8 @@ cdef class Parser:
                parse_states = self.parse_batch(subbatch)
                beams = []
            else:
-                beams = self.beam_parse(subbatch,
-                                        beam_width=beam_width, beam_density=beam_density)
+                beams = self.beam_parse(subbatch, beam_width=beam_width,
+                                        beam_density=beam_density)
                parse_states = []
                for beam in beams:
                    parse_states.append(<StateClass>beam.at(0))
@@ -386,9 +369,9 @@ cdef class Parser:
        if isinstance(docs, Doc):
            docs = [docs]

-        cuda_stream = get_cuda_stream()
-        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
-                                                                            0.0)
+        cuda_stream = util.get_cuda_stream()
+        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
+            docs, cuda_stream, 0.0)
        nr_state = len(docs)
        nr_class = self.moves.n_moves
        nr_dim = tokvecs.shape[1]
@@ -402,7 +385,8 @@ cdef class Parser:
        feat_weights = state2vec.get_feat_weights()
        cdef int i
-        cdef np.ndarray hidden_weights = numpy.ascontiguousarray(vec2scores._layers[-1].W.T)
+        cdef np.ndarray hidden_weights = numpy.ascontiguousarray(
+            vec2scores._layers[-1].W.T)
        cdef np.ndarray hidden_bias = vec2scores._layers[-1].b

        hW = <float*>hidden_weights.data
@@ -462,9 +446,9 @@ cdef class Parser:
        cdef Doc doc
        cdef int nr_class = self.moves.n_moves
        cdef StateClass stcls, output
-        cuda_stream = get_cuda_stream()
-        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
-                                                                            0.0)
+        cuda_stream = util.get_cuda_stream()
+        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
+            docs, cuda_stream, 0.0)
        beams = []
        cdef int offset = 0
        cdef int j = 0
@@ -519,9 +503,7 @@ cdef class Parser:
        if isinstance(docs, Doc) and isinstance(golds, GoldParse):
            docs = [docs]
            golds = [golds]
-
-        cuda_stream = get_cuda_stream()
-
+        cuda_stream = util.get_cuda_stream()
        states, golds, max_steps = self._init_gold_batch(docs, golds)
        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
                                                                            drop)
@@ -536,7 +518,6 @@ cdef class Parser:
        n_steps = 0
        while todo:
            states, golds = zip(*todo)
-
            token_ids = self.get_token_ids(states)
            vector, bp_vector = state2vec.begin_update(token_ids, drop=0.0)
            if drop != 0:
@@ -558,8 +539,8 @@ cdef class Parser:
                    and not isinstance(token_ids, state2vec.ops.xp.ndarray):
                # Move token_ids and d_vector to GPU, asynchronously
                backprops.append((
-                    get_async(cuda_stream, token_ids),
-                    get_async(cuda_stream, d_vector),
+                    util.get_async(cuda_stream, token_ids),
+                    util.get_async(cuda_stream, d_vector),
                    bp_vector
                ))
            else:
@@ -592,15 +573,13 @@ cdef class Parser:
        states = self.moves.init_batch(docs)
        for gold in golds:
            self.moves.preprocess_gold(gold)
-
-        cuda_stream = get_cuda_stream()
-        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, drop)
-
-        states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
-                                                             states, golds,
-                                                             state2vec, vec2scores,
-                                                             width, density, self.cfg.get('hist_size', 0),
-                                                             drop=drop, losses=losses)
+        cuda_stream = util.get_cuda_stream()
+        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
+            docs, cuda_stream, drop)
+        states_d_scores, backprops = _beam_utils.update_beam(
+            self.moves, self.nr_feature, 500, states, golds, state2vec,
+            vec2scores, width, density, self.cfg.get('hist_size', 0),
+            drop=drop, losses=losses)
        backprop_lower = []
        cdef float batch_size = len(docs)
        for i, d_scores in enumerate(states_d_scores):
@@ -612,13 +591,14 @@ cdef class Parser:
            if isinstance(self.model[0].ops, CupyOps) \
                    and not isinstance(ids, state2vec.ops.xp.ndarray):
                backprop_lower.append((
-                    get_async(cuda_stream, ids),
-                    get_async(cuda_stream, d_vector),
+                    util.get_async(cuda_stream, ids),
+                    util.get_async(cuda_stream, d_vector),
                    bp_vectors))
            else:
                backprop_lower.append((ids, d_vector, bp_vectors))
        d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
-        self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd, cuda_stream)
+        self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd,
+                           cuda_stream)

    def _init_gold_batch(self, whole_docs, whole_golds):
        """Make a square batch, of length equal to the shortest doc. A long
@@ -768,7 +748,8 @@ cdef class Parser:
    def begin_training(self, gold_tuples, pipeline=None, **cfg):
        if 'model' in cfg:
            self.model = cfg['model']
-        gold_tuples = nonproj.preprocess_training_data(gold_tuples, label_freq_cutoff=100)
+        gold_tuples = nonproj.preprocess_training_data(gold_tuples,
+                                                       label_freq_cutoff=100)
        actions = self.moves.get_actions(gold_parses=gold_tuples)
        for action, labels in actions.items():
            for label in labels:

View File

@@ -1,39 +1,37 @@
 # coding: utf-8
-"""
-Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
+"""Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
 for doing pseudo-projective parsing implementation uses the HEAD decoration
 scheme.
 """
 from __future__ import unicode_literals
 from copy import copy
-from ..tokens.doc cimport Doc
-from ..attrs import DEP, HEAD

 DELIMITER = '||'


 def ancestors(tokenid, heads):
-    # returns all words going from the word up the path to the root
-    # the path to root cannot be longer than the number of words in the sentence
-    # this function ends after at most len(heads) steps
-    # because it would otherwise loop indefinitely on cycles
+    # Returns all words going from the word up the path to the root. The path
+    # to root cannot be longer than the number of words in the sentence. This
+    # function ends after at most len(heads) steps, because it would otherwise
+    # loop indefinitely on cycles.
    head = tokenid
    cnt = 0
    while heads[head] != head and cnt < len(heads):
        head = heads[head]
        cnt += 1
        yield head
-        if head == None:
+        if head is None:
            break


 def contains_cycle(heads):
-    # in an acyclic tree, the path from each word following
-    # the head relation upwards always ends at the root node
+    # in an acyclic tree, the path from each word following the head relation
+    # upwards always ends at the root node
    for tokenid in range(len(heads)):
        seen = set([tokenid])
-        for ancestor in ancestors(tokenid,heads):
+        for ancestor in ancestors(tokenid, heads):
            if ancestor in seen:
                return seen
            seen.add(ancestor)
@@ -45,26 +43,26 @@ def is_nonproj_arc(tokenid, heads):
    # if there is a token k, h < k < d such that h is not
    # an ancestor of k. Same for h -> d, h > d
    head = heads[tokenid]
-    if head == tokenid: # root arcs cannot be non-projective
+    if head == tokenid:  # root arcs cannot be non-projective
        return False
-    elif head == None: # unattached tokens cannot be non-projective
+    elif head is None:  # unattached tokens cannot be non-projective
        return False
    start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head)
-    for k in range(start,end):
-        for ancestor in ancestors(k,heads):
-            if ancestor == None: # for unattached tokens/subtrees
+    for k in range(start, end):
+        for ancestor in ancestors(k, heads):
+            if ancestor is None:  # for unattached tokens/subtrees
                break
-            elif ancestor == head: # normal case: k dominated by h
+            elif ancestor == head:  # normal case: k dominated by h
                break
-        else: # head not in ancestors: d -> h is non-projective
+        else:  # head not in ancestors: d -> h is non-projective
            return True
    return False


 def is_nonproj_tree(heads):
    # a tree is non-projective if at least one arc is non-projective
-    return any( is_nonproj_arc(word,heads) for word in range(len(heads)) )
+    return any(is_nonproj_arc(word, heads) for word in range(len(heads)))


 def decompose(label):
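A concrete case helps here. In the heads array below, token 3 attaches to token 1 while token 2 attaches to the root at 0, so the arc 3 -> 1 crosses 2 -> 0 (think "saw a dog yesterday which ...", with "which" modifying "dog"):

    # Worked example for is_nonproj_arc/is_nonproj_tree above.
    heads = [0, 0, 0, 1]
    print(is_nonproj_arc(3, heads))  # True: 1 is not an ancestor of token 2
    print(is_nonproj_tree(heads))    # True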
@@ -81,32 +79,32 @@ def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
    for raw_text, sents in gold_tuples:
        prepro_sents = []
        for (ids, words, tags, heads, labels, iob), ctnts in sents:
-            proj_heads,deco_labels = projectivize(heads,labels)
+            proj_heads, deco_labels = projectivize(heads, labels)
            # set the label to ROOT for each root dependent
-            deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ]
+            deco_labels = ['ROOT' if head == i else deco_labels[i]
+                           for i, head in enumerate(proj_heads)]
            # count label frequencies
            if label_freq_cutoff > 0:
                for label in deco_labels:
                    if is_decorated(label):
-                        freqs[label] = freqs.get(label,0) + 1
-            prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts))
+                        freqs[label] = freqs.get(label, 0) + 1
+            prepro_sents.append(
+                ((ids, words, tags, proj_heads, deco_labels, iob), ctnts))
        preprocessed.append((raw_text, prepro_sents))
-
    if label_freq_cutoff > 0:
-        return _filter_labels(preprocessed,label_freq_cutoff,freqs)
+        return _filter_labels(preprocessed, label_freq_cutoff, freqs)
    return preprocessed


 def projectivize(heads, labels):
-    # use the algorithm by Nivre & Nilsson 2005
-    # assumes heads to be a proper tree, i.e. connected and cycle-free
-    # returns a new pair (heads,labels) which encode
-    # a projective and decorated tree
+    # Use the algorithm by Nivre & Nilsson 2005. Assumes heads to be a proper
+    # tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
+    # which encode a projective and decorated tree.
    proj_heads = copy(heads)
    smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
-    if smallest_np_arc == None: # this sentence is already projective
+    if smallest_np_arc is None:  # this sentence is already projective
        return proj_heads, copy(labels)
-    while smallest_np_arc != None:
+    while smallest_np_arc is not None:
        _lift(smallest_np_arc, proj_heads)
        smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
    deco_labels = _decorate(heads, proj_heads, labels)
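Continuing that example: projectivize repeatedly lifts the smallest non-projective arc until the tree is projective, and the decoration (next hunk) records the original head's label. Expected behaviour, assuming _lift (not shown in this diff) reattaches the arc one ancestor up:

    # The crossing arc 3 -> 1 is lifted to the grandparent, 3 -> 0, and
    # the label is decorated so deprojectivize can restore it later.
    heads = [0, 0, 0, 1]
    labels = ['root', 'dobj', 'advmod', 'relcl']
    proj_heads, deco_labels = projectivize(heads, labels)
    print(proj_heads)   # [0, 0, 0, 0]
    print(deco_labels)  # ['root', 'dobj', 'advmod', 'relcl||dobj']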
@@ -114,24 +112,26 @@ def projectivize(heads, labels):

 def deprojectivize(tokens):
-    # reattach arcs with decorated labels (following HEAD scheme)
-    # for each decorated arc X||Y, search top-down, left-to-right,
-    # breadth-first until hitting a Y then make this the new head
+    # Reattach arcs with decorated labels (following HEAD scheme). For each
+    # decorated arc X||Y, search top-down, left-to-right, breadth-first until
+    # hitting a Y then make this the new head.
    for token in tokens:
        if is_decorated(token.dep_):
-            newlabel,headlabel = decompose(token.dep_)
-            newhead = _find_new_head(token,headlabel)
+            newlabel, headlabel = decompose(token.dep_)
+            newhead = _find_new_head(token, headlabel)
            token.head = newhead
            token.dep_ = newlabel
    return tokens


 def _decorate(heads, proj_heads, labels):
    # uses decoration scheme HEAD from Nivre & Nilsson 2005
    assert(len(heads) == len(proj_heads) == len(labels))
    deco_labels = []
-    for tokenid,head in enumerate(heads):
+    for tokenid, head in enumerate(heads):
        if head != proj_heads[tokenid]:
-            deco_labels.append('%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
+            deco_labels.append(
+                '%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
        else:
            deco_labels.append(labels[tokenid])
    return deco_labels
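The decorated label round-trips through decompose, which splits on DELIMITER, so deprojectivize knows both the real label and the head label that _find_new_head searches for (behaviour inferred from the HEAD scheme described above):

    # Round-trip of the decoration from the projectivize example.
    print(decompose('relcl||dobj'))  # ('relcl', 'dobj')
    print(is_decorated('relcl||dobj'), is_decorated('relcl'))  # True False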
@@ -143,9 +143,9 @@ def _get_smallest_nonproj_arc(heads):
    # and ties are broken left to right
    smallest_size = float('inf')
    smallest_np_arc = None
-    for tokenid,head in enumerate(heads):
+    for tokenid, head in enumerate(heads):
        size = abs(tokenid-head)
-        if size < smallest_size and is_nonproj_arc(tokenid,heads):
+        if size < smallest_size and is_nonproj_arc(tokenid, heads):
            smallest_size = size
            smallest_np_arc = tokenid
    return smallest_np_arc
@@ -168,8 +168,10 @@ def _find_new_head(token, headlabel):
        next_queue = []
        for qtoken in queue:
            for child in qtoken.children:
-                if child.is_space: continue
-                if child == token: continue
+                if child.is_space:
+                    continue
+                if child == token:
+                    continue
                if child.dep_ == headlabel:
                    return child
                next_queue.append(child)
@@ -184,7 +186,10 @@ def _filter_labels(gold_tuples, cutoff, freqs):
    for raw_text, sents in gold_tuples:
        filtered_sents = []
        for (ids, words, tags, heads, labels, iob), ctnts in sents:
-            filtered_labels = [ decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ]
-            filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
+            filtered_labels = [decompose(label)[0]
+                               if freqs.get(label, cutoff) < cutoff
+                               else label for label in labels]
+            filtered_sents.append(
+                ((ids, words, tags, heads, filtered_labels, iob), ctnts))
        filtered.append((raw_text, filtered_sents))
    return filtered

View File

@@ -2,17 +2,8 @@
 # cython: infer_types=True
 from __future__ import unicode_literals

-from libc.string cimport memcpy, memset
-from libc.stdint cimport uint32_t, uint64_t
 import numpy
-from ..vocab cimport EMPTY_LEXEME
-from ..structs cimport Entity
-from ..lexeme cimport Lexeme
-from ..symbols cimport punct
-from ..attrs cimport IS_SPACE
-from ..attrs cimport attr_id_t
-from ..tokens.token cimport Token
 from ..tokens.doc cimport Doc

View File

@@ -2,17 +2,17 @@
 # coding: utf-8
 from __future__ import unicode_literals

-from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
+from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport weight_t
-from collections import defaultdict, OrderedDict
+from collections import OrderedDict
 import ujson

-from .. import util
 from ..structs cimport TokenC
 from .stateclass cimport StateClass
-from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
 from ..typedefs cimport attr_t
+from ..compat import json_dumps
+from .. import util


 cdef weight_t MIN_SCORE = -90000
@@ -136,11 +136,12 @@ cdef class TransitionSystem:
            print([gold.c.ner[i].clas for i in range(gold.length)])
            print([gold.c.ner[i].move for i in range(gold.length)])
            print([gold.c.ner[i].label for i in range(gold.length)])
-            print("Self labels", [self.c[i].label for i in range(self.n_moves)])
+            print("Self labels",
+                  [self.c[i].label for i in range(self.n_moves)])
            raise ValueError(
                "Could not find a gold-standard action to supervise "
-                "the entity recognizer\n"
-                "The transition system has %d actions." % (self.n_moves))
+                "the entity recognizer. The transition system has "
+                "%d actions." % (self.n_moves))

    def get_class_name(self, int clas):
        act = self.c[clas]
@@ -149,7 +150,7 @@ cdef class TransitionSystem:
    def add_action(self, int action, label_name):
        cdef attr_t label_id
        if not isinstance(label_name, int) and \
-            not isinstance(label_name, long):
+           not isinstance(label_name, long):
            label_id = self.strings.add(label_name)
        else:
            label_id = label_name
@@ -186,7 +187,7 @@ cdef class TransitionSystem:
                'name': self.move_name(trans.move, trans.label)
            })
        serializers = {
-            'transitions': lambda: ujson.dumps(transitions),
+            'transitions': lambda: json_dumps(transitions),
            'strings': lambda: self.strings.to_bytes()
        }
        return util.to_bytes(serializers, exclude)
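to_bytes builds a dict of named serializer callbacks and hands it to util.to_bytes. A minimal sketch of that pattern (not spaCy's exact implementation, which packs the results into a byte string):

    from collections import OrderedDict

    def to_bytes_sketch(serializers, exclude=tuple()):
        # Invoke each callback lazily, skipping excluded sections.
        return OrderedDict((name, getter())
                           for name, getter in serializers.items()
                           if name not in exclude)

    serializers = {
        'transitions': lambda: '[]',
        'strings': lambda: b'...',
    }
    print(list(to_bytes_sketch(serializers, exclude=['strings'])))
    # ['transitions']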