Tidy up syntax

ines 2017-10-27 19:45:57 +02:00
parent 5167a0cce2
commit b4d226a3f1
8 changed files with 195 additions and 230 deletions

View File

@@ -2,7 +2,7 @@
# cython: profile=True
cimport numpy as np
import numpy
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from cpython.ref cimport PyObject, Py_XDECREF
from thinc.extra.search cimport Beam
from thinc.extra.search import MaxViolation
from thinc.typedefs cimport hash_t, class_t
@@ -11,7 +11,6 @@ from thinc.extra.search cimport MaxViolation
from .transition_system cimport TransitionSystem, Transition
from .stateclass cimport StateClass
from ..gold cimport GoldParse
from ..tokens.doc cimport Doc
# These are passed as callbacks to thinc.extra.search.Beam
@@ -50,7 +49,7 @@ cdef class ParserBeam(object):
cdef public object dones
def __init__(self, TransitionSystem moves, states, golds,
int width, float density):
int width, float density):
self.moves = moves
self.states = states
self.golds = golds
@@ -59,7 +58,8 @@ cdef class ParserBeam(object):
cdef StateClass state, st
for state in states:
beam = Beam(self.moves.n_moves, width, density)
beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent)
beam.initialize(self.moves.init_beam_state, state.c.length,
state.c._sent)
for i in range(beam.width):
st = <StateClass>beam.at(i)
st.c.offset = state.c.offset
@@ -74,7 +74,8 @@ cdef class ParserBeam(object):
@property
def is_done(self):
return all(b.is_done or self.dones[i] for i, b in enumerate(self.beams))
return all(b.is_done or self.dones[i]
for i, b in enumerate(self.beams))
def __getitem__(self, i):
return self.beams[i]
@@ -126,7 +127,8 @@ cdef class ParserBeam(object):
for i in range(beam.size):
state = <StateClass>beam.at(i)
if not state.c.is_final():
self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold)
self.moves.set_costs(beam.is_valid[i], beam.costs[i],
state, gold)
if follow_gold:
for j in range(beam.nr_class):
if beam.costs[i][j] >= 1:
@@ -146,7 +148,10 @@ def get_token_ids(states, int n_tokens):
c_ids += ids.shape[1]
return ids
nr_update = 0
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
states, golds,
state2vec, vec2scores,
@@ -167,23 +172,27 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
if pbeam.is_done and gbeam.is_done:
break
# The beam maps let us find the right row in the flattened scores
# arrays for each state. States are identified by (example id, history).
# We keep a different beam map for each step (since we'll have a flat
# scores array for each step). The beam map will let us take the per-state
# losses, and compute the gradient for each (step, state, class).
# arrays for each state. States are identified by (example id,
# history). We keep a different beam map for each step (since we'll
# have a flat scores array for each step). The beam map will let us
# take the per-state losses, and compute the gradient for each (step,
# state, class).
beam_maps.append({})
# Gather all states from the two beams in a list. Some states may occur
# in both beams. To figure out which beam each state belonged to,
# we keep two lists of indices, p_indices and g_indices
states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update)
states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1],
nr_update)
if not states:
break
# Now that we have our flat list of states, feed them through the model
token_ids = get_token_ids(states, nr_feature)
vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
if hist_feats:
hists = numpy.asarray([st.history[:hist_feats] for st in states], dtype='i')
scores, bp_scores = vec2scores.begin_update((vectors, hists), drop=drop)
hists = numpy.asarray([st.history[:hist_feats] for st in states],
dtype='i')
scores, bp_scores = vec2scores.begin_update((vectors, hists),
drop=drop)
else:
scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)
@@ -192,8 +201,10 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
# Unpack the flat scores into lists for the two beams. The indices arrays
# tell us which example and state the scores-row refers to.
p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices]
g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices]
p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
for indices in p_indices]
g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
for indices in g_indices]
# Now advance the states in the beams. The gold beam is constrained
# to follow only gold analyses.
pbeam.advance(p_scores)
@@ -249,8 +260,7 @@ def get_states(pbeams, gbeams, beam_map, nr_update):
def get_gradient(nr_class, beam_maps, histories, losses):
"""
The global model assigns a loss to each parse. The beam scores
"""The global model assigns a loss to each parse. The beam scores
are additive, so the same gradient is applied to each action
in the history. This gives the gradient of a single *action*
for a beam state -- so we have "the gradient of loss for taking
@@ -270,7 +280,8 @@ def get_gradient(nr_class, beam_maps, histories, losses):
if loss != 0.0 and not numpy.isnan(loss):
nr_step = max(nr_step, len(hist))
for i in range(nr_step):
grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), dtype='f'))
grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class),
dtype='f'))
assert len(histories) == len(losses)
for eg_id, hists in enumerate(histories):
for loss, hist in zip(losses[eg_id], hists):
@@ -287,5 +298,3 @@ def get_gradient(nr_class, beam_maps, histories, losses):
grads[j][i, clas] += loss
key = key + tuple([clas])
return grads
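
To make the gradient bookkeeping above concrete: a minimal standalone Python sketch (toy data and simplified names, not part of this commit) of how per-parse losses turn into per-action gradients through the beam maps, which are keyed by (example id, history prefix):

import numpy

def toy_get_gradient(nr_class, beam_maps, histories, losses):
    # One gradient matrix per step: each row is a state that was live at
    # that step, each column an action class.
    grads = [numpy.zeros((max(m.values()) + 1, nr_class), dtype='f')
             for m in beam_maps]
    for eg_id, hists in enumerate(histories):
        for loss, hist in zip(losses[eg_id], hists):
            key = tuple([eg_id])
            for j, clas in enumerate(hist):
                i = beam_maps[j][key]      # row for this (example, prefix)
                grads[j][i, clas] += loss  # same loss for every action taken
                key = key + tuple([clas])
    return grads

# Example 0 has two candidate parses, histories [0, 1] and [0, 2], with
# losses +0.5 and -0.5. The shared first action cancels out; the gradient
# concentrates on the actions where the parses diverge.
beam_maps = [{(0,): 0}, {(0, 0): 0}]
print(toy_get_gradient(3, beam_maps, [[[0, 1], [0, 2]]], [[0.5, -0.5]]))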

View File

@@ -1 +0,0 @@
# test

View File

@@ -4,24 +4,16 @@
# coding: utf-8
from __future__ import unicode_literals
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
import ctypes
from libc.stdint cimport uint32_t
from libc.string cimport memcpy
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
from collections import OrderedDict
from thinc.extra.search cimport Beam
import numpy
from .stateclass cimport StateClass
from ._state cimport StateC, is_space_token
from ._state cimport StateC
from .nonproj import is_nonproj_tree
from .transition_system cimport do_func_t, get_cost_func_t
from .transition_system cimport move_cost_func_t, label_cost_func_t
from ..gold cimport GoldParse
from ..gold cimport GoldParseC
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE, IS_PUNCT
from ..lexeme cimport Lexeme
from ..gold cimport GoldParse, GoldParseC
from ..structs cimport TokenC
@@ -316,14 +308,13 @@ cdef class ArcEager(TransitionSystem):
@classmethod
def get_actions(cls, **kwargs):
actions = kwargs.get('actions',
OrderedDict((
(SHIFT, ['']),
(REDUCE, ['']),
(RIGHT, []),
(LEFT, []),
(BREAK, ['ROOT'])
)))
actions = kwargs.get('actions', OrderedDict((
(SHIFT, ['']),
(REDUCE, ['']),
(RIGHT, []),
(LEFT, []),
(BREAK, ['ROOT']))
))
seen_actions = set()
for label in kwargs.get('left_labels', []):
if label.upper() != 'ROOT':
@@ -363,7 +354,8 @@ cdef class ArcEager(TransitionSystem):
if gold.cand_to_gold[i] is None:
continue
if state.safe_get(i).dep:
predicted.add((i, state.H(i), self.strings[state.safe_get(i).dep]))
predicted.add((i, state.H(i),
self.strings[state.safe_get(i).dep]))
else:
predicted.add((i, state.H(i), 'ROOT'))
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
@@ -381,7 +373,8 @@ cdef class ArcEager(TransitionSystem):
if not self.has_gold(gold):
return None
for i in range(gold.length):
if gold.heads[i] is None or gold.labels[i] is None: # Missing values
# Missing values
if gold.heads[i] is None or gold.labels[i] is None:
gold.c.heads[i] = i
gold.c.has_dep[i] = False
else:
@@ -517,14 +510,15 @@ cdef class ArcEager(TransitionSystem):
# Check projectivity --- leading cause
if is_nonproj_tree(gold.heads):
raise ValueError(
"Could not find a gold-standard action to supervise the dependency "
"parser.\n"
"Likely cause: the tree is non-projective (i.e. it has crossing "
"arcs -- see spacy/syntax/nonproj.pyx for definitions)\n"
"The ArcEager transition system only supports projective trees.\n"
"To learn non-projective representations, transform the data "
"before training and after parsing. Either pass make_projective=True "
"to the GoldParse class, or use PseudoProjectivity.preprocess_training_data")
"Could not find a gold-standard action to supervise the "
"dependency parser. Likely cause: the tree is "
"non-projective (i.e. it has crossing arcs -- see "
"spacy/syntax/nonproj.pyx for definitions). The ArcEager "
"transition system only supports projective trees. To "
"learn non-projective representations, transform the data "
"before training and after parsing. Either pass "
"make_projective=True to the GoldParse class, or use "
"spacy.syntax.nonproj.preprocess_training_data.")
else:
print(gold.orig_annot)
print(gold.words)
@@ -532,12 +526,10 @@ cdef class ArcEager(TransitionSystem):
print(gold.labels)
print(gold.sent_starts)
raise ValueError(
"Could not find a gold-standard action to supervise the dependency "
"parser.\n"
"The GoldParse was projective.\n"
"The transition system has %d actions.\n"
"State at failure:\n"
"%s" % (self.n_moves, stcls.print_state(gold.words)))
"Could not find a gold-standard action to supervise the"
"dependency parser. The GoldParse was projective. The "
"transition system has %d actions. State at failure: %s"
% (self.n_moves, stcls.print_state(gold.words)))
assert n_gold >= 1
def get_beam_annot(self, Beam beam):
@@ -558,4 +550,3 @@ cdef class ArcEager(TransitionSystem):
deps[j].setdefault(dep, 0.0)
deps[j][dep] += prob
return heads, deps
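
As a usage note for the get_actions refactor above: the OrderedDict seeds the default transition inventory, and the label sets for RIGHT/LEFT grow from the labels seen in training. A toy sketch (standalone Python, constants and names simplified, not the spaCy API):

from collections import OrderedDict

SHIFT, REDUCE, RIGHT, LEFT, BREAK = range(5)

def toy_get_actions(**kwargs):
    # SHIFT/REDUCE/BREAK carry fixed labels; RIGHT/LEFT accumulate the
    # dependency labels observed in the gold parses.
    actions = kwargs.get('actions', OrderedDict((
        (SHIFT, ['']),
        (REDUCE, ['']),
        (RIGHT, []),
        (LEFT, []),
        (BREAK, ['ROOT']))
    ))
    for label in kwargs.get('left_labels', []):
        if label.upper() != 'ROOT' and label not in actions[LEFT]:
            actions[LEFT].append(label)
    for label in kwargs.get('right_labels', []):
        if label.upper() != 'ROOT' and label not in actions[RIGHT]:
            actions[RIGHT].append(label)
    return actions

print(toy_get_actions(left_labels=['nsubj', 'ROOT', 'nsubj'],
                      right_labels=['dobj']))
# LEFT ends up with ['nsubj'] (deduplicated, ROOT filtered out);
# RIGHT with ['dobj'].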

View File

@@ -4,17 +4,12 @@ from __future__ import unicode_literals
from thinc.typedefs cimport weight_t
from thinc.extra.search cimport Beam
from collections import OrderedDict
import numpy
from thinc.neural.ops import NumpyOps
from .stateclass cimport StateClass
from ._state cimport StateC
from .transition_system cimport Transition
from .transition_system cimport do_func_t
from ..structs cimport TokenC, Entity
from ..gold cimport GoldParseC
from ..gold cimport GoldParse
from ..attrs cimport ENT_TYPE, ENT_IOB
from ..gold cimport GoldParseC, GoldParse
cdef enum:
@@ -69,15 +64,14 @@ cdef class BiluoPushDown(TransitionSystem):
@classmethod
def get_actions(cls, **kwargs):
actions = kwargs.get('actions',
OrderedDict((
(MISSING, ['']),
(BEGIN, []),
(IN, []),
(LAST, []),
(UNIT, []),
(OUT, [''])
)))
actions = kwargs.get('actions', OrderedDict((
(MISSING, ['']),
(BEGIN, []),
(IN, []),
(LAST, []),
(UNIT, []),
(OUT, [''])
)))
seen_entities = set()
for entity_type in kwargs.get('entity_types', []):
if entity_type in seen_entities:
@@ -160,7 +154,7 @@ cdef class BiluoPushDown(TransitionSystem):
cdef Transition lookup_transition(self, object name) except *:
cdef attr_t label
if name == '-' or name == None:
if name == '-' or name is None:
return Transition(clas=0, move=MISSING, label=0, score=0)
elif name == '!O':
return Transition(clas=0, move=ISNT, label=0, score=0)
@@ -328,8 +322,8 @@ cdef class In:
return False
elif preset_ent_iob == 3:
return False
# TODO: Is this quite right?
# I think it's supposed to be ensuring the gazetteer matches are maintained
# TODO: Is this quite right? I think it's supposed to be ensuring the
# gazetteer matches are maintained
elif st.B_(1).ent_iob != preset_ent_iob:
return False
# Don't allow entities to extend across sentence boundaries
@@ -354,10 +348,12 @@ cdef class In:
if g_act == MISSING:
return 0
elif g_act == BEGIN:
# I, Gold B --> True (P of bad open entity sunk, R of this entity sunk)
# I, Gold B --> True
# (P of bad open entity sunk, R of this entity sunk)
return 0
elif g_act == IN:
# I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk)
# I, Gold I --> True
# (label forced by prev, if mismatch, P and R both sunk)
return 0
elif g_act == LAST:
# I, Gold L --> True iff this entity sunk and next tag == O
@@ -505,11 +501,3 @@ cdef class Out:
return 1
else:
return 1
class OracleError(Exception):
pass
class UnknownMove(Exception):
pass
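
For readers unfamiliar with the move inventory in this file: BEGIN/IN/LAST/UNIT/OUT implement the BILUO tagging scheme for named entities. A standalone sketch (plain Python, not the spaCy API) of how token-level entity spans map onto BILUO tags:

def biluo_tags(n_tokens, spans):
    # spans are (start, end, label) with an exclusive end, as in Python
    # slicing. Single-token entities become U-; longer ones B- ... L-.
    tags = ['O'] * n_tokens
    for start, end, label in spans:
        if end - start == 1:
            tags[start] = 'U-' + label
        else:
            tags[start] = 'B-' + label
            for i in range(start + 1, end - 1):
                tags[i] = 'I-' + label
            tags[end - 1] = 'L-' + label
    return tags

# "Apple Inc. opened in San Francisco": ORG over tokens 0-1, GPE over 4-5
print(biluo_tags(6, [(0, 2, 'ORG'), (4, 6, 'GPE')]))
# ['B-ORG', 'L-ORG', 'O', 'O', 'B-GPE', 'L-GPE']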

View File

@@ -5,71 +5,48 @@
# coding: utf-8
from __future__ import unicode_literals, print_function
from collections import Counter, OrderedDict
from collections import OrderedDict
import ujson
import json
import contextlib
import numpy
from libc.math cimport exp
cimport cython
cimport cython.parallel
import cytoolz
import dill
import numpy.random
cimport numpy as np
from libcpp.vector cimport vector
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from cpython.ref cimport PyObject, Py_XDECREF
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
from libc.stdint cimport uint32_t, uint64_t
from libc.string cimport memset, memcpy
from libc.stdlib cimport malloc, calloc, free
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.linalg cimport Vec, VecVec
from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
from thinc.extra.eg cimport Example
from libc.math cimport exp
from libcpp.vector cimport vector
from libc.string cimport memset
from libc.stdlib cimport calloc, free
from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t, class_t, hash_t
from thinc.extra.search cimport Beam
from cymem.cymem cimport Pool, Address
from murmurhash.mrmr cimport hash64
from preshed.maps cimport MapStruct
from preshed.maps cimport map_get
from thinc.api import layerize, chain, clone, with_flatten
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
from thinc.api import chain, clone
from thinc.v2v import Model, Maxout, Affine
from thinc.misc import LayerNorm
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.ops import CupyOps
from thinc.neural.util import get_array_module
from .. import util
from ..util import get_async, get_cuda_stream
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
from .._ml import Tok2Vec, doc2feats, rebatch
from .._ml import Residual, flatten
from .._ml import zero_init, PrecomputableMaxouts, Tok2Vec, flatten
from .._ml import link_vectors_to_models
from ..compat import json_dumps, copy_array
from ..tokens.doc cimport Doc
from ..gold cimport GoldParse
from .. import util
from .stateclass cimport StateClass
from ._state cimport StateC
from . import nonproj
from .transition_system import OracleError
from .transition_system cimport TransitionSystem, Transition
from ..structs cimport TokenC
from ..tokens.doc cimport Doc
from ..strings cimport StringStore
from ..gold cimport GoldParse
from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG
from . import _beam_utils
from .transition_system cimport Transition
from . import _beam_utils, nonproj
def get_templates(*args, **kwargs):
return []
DEBUG = False
def set_debug(val):
global DEBUG
DEBUG = val
@@ -100,7 +77,8 @@ cdef class precompute_hiddens:
cdef object _cuda_stream
cdef object _bp_hiddens
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None, drop=0.):
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
drop=0.):
gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop)
cdef np.ndarray cached
if not isinstance(gpu_cached, numpy.ndarray):
@@ -120,8 +98,7 @@ cdef class precompute_hiddens:
self._bp_hiddens = bp_features
cdef const float* get_feat_weights(self) except NULL:
if not self._is_synchronized \
and self._cuda_stream is not None:
if not self._is_synchronized and self._cuda_stream is not None:
self._cuda_stream.synchronize()
self._is_synchronized = True
return <float*>self._cached.data
@@ -130,7 +107,8 @@ cdef class precompute_hiddens:
return self.begin_update(X)[0]
def begin_update(self, token_ids, drop=0.):
cdef np.ndarray state_vector = numpy.zeros((token_ids.shape[0], self.nO*self.nP), dtype='f')
cdef np.ndarray state_vector = numpy.zeros(
(token_ids.shape[0], self.nO*self.nP), dtype='f')
# This is tricky, but (assuming GPU available):
# - Input to forward on CPU
# - Output from forward on CPU
@@ -141,8 +119,8 @@ cdef class precompute_hiddens:
feat_weights = self.get_feat_weights()
cdef int[:, ::1] ids = token_ids
sum_state_features(<float*>state_vector.data,
feat_weights, &ids[0,0],
token_ids.shape[0], self.nF, self.nO*self.nP)
feat_weights, &ids[0, 0],
token_ids.shape[0], self.nF, self.nO*self.nP)
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
def backward(d_state_vector, sgd=None):
@@ -161,10 +139,11 @@ cdef class precompute_hiddens:
state_vector = state_vector.reshape(
(state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP))
best, which = self.ops.maxout(state_vector)
def backprop(d_best, sgd=None):
return self.ops.backprop_maxout(d_best, which, self.nP)
return best, backprop
return best, backprop
cdef void sum_state_features(float* output,
@@ -239,11 +218,15 @@ cdef class Parser:
depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
if depth != 1:
raise ValueError("Currently parser depth is hard-coded to 1.")
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2))
parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
cfg.get('maxout_pieces', 2))
if parser_maxout_pieces != 2:
raise ValueError("Currently parser_maxout_pieces is hard-coded to 2")
token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128))
hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 200))
raise ValueError("Currently parser_maxout_pieces is hard-coded "
"to 2")
token_vector_width = util.env_opt('token_vector_width',
cfg.get('token_vector_width', 128))
hidden_width = util.env_opt('hidden_width',
cfg.get('hidden_width', 200))
embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
@@ -365,8 +348,8 @@ cdef class Parser:
parse_states = self.parse_batch(subbatch)
beams = []
else:
beams = self.beam_parse(subbatch,
beam_width=beam_width, beam_density=beam_density)
beams = self.beam_parse(subbatch, beam_width=beam_width,
beam_density=beam_density)
parse_states = []
for beam in beams:
parse_states.append(<StateClass>beam.at(0))
@@ -386,9 +369,9 @@ cdef class Parser:
if isinstance(docs, Doc):
docs = [docs]
cuda_stream = get_cuda_stream()
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
0.0)
cuda_stream = util.get_cuda_stream()
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
docs, cuda_stream, 0.0)
nr_state = len(docs)
nr_class = self.moves.n_moves
nr_dim = tokvecs.shape[1]
@@ -402,7 +385,8 @@ cdef class Parser:
feat_weights = state2vec.get_feat_weights()
cdef int i
cdef np.ndarray hidden_weights = numpy.ascontiguousarray(vec2scores._layers[-1].W.T)
cdef np.ndarray hidden_weights = numpy.ascontiguousarray(
vec2scores._layers[-1].W.T)
cdef np.ndarray hidden_bias = vec2scores._layers[-1].b
hW = <float*>hidden_weights.data
@@ -462,9 +446,9 @@ cdef class Parser:
cdef Doc doc
cdef int nr_class = self.moves.n_moves
cdef StateClass stcls, output
cuda_stream = get_cuda_stream()
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
0.0)
cuda_stream = util.get_cuda_stream()
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
docs, cuda_stream, 0.0)
beams = []
cdef int offset = 0
cdef int j = 0
@@ -519,9 +503,7 @@ cdef class Parser:
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
docs = [docs]
golds = [golds]
cuda_stream = get_cuda_stream()
cuda_stream = util.get_cuda_stream()
states, golds, max_steps = self._init_gold_batch(docs, golds)
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
drop)
@@ -536,7 +518,6 @@ cdef class Parser:
n_steps = 0
while todo:
states, golds = zip(*todo)
token_ids = self.get_token_ids(states)
vector, bp_vector = state2vec.begin_update(token_ids, drop=0.0)
if drop != 0:
@@ -558,8 +539,8 @@ cdef class Parser:
and not isinstance(token_ids, state2vec.ops.xp.ndarray):
# Move token_ids and d_vector to GPU, asynchronously
backprops.append((
get_async(cuda_stream, token_ids),
get_async(cuda_stream, d_vector),
util.get_async(cuda_stream, token_ids),
util.get_async(cuda_stream, d_vector),
bp_vector
))
else:
@@ -592,15 +573,13 @@ cdef class Parser:
states = self.moves.init_batch(docs)
for gold in golds:
self.moves.preprocess_gold(gold)
cuda_stream = get_cuda_stream()
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, drop)
states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
states, golds,
state2vec, vec2scores,
width, density, self.cfg.get('hist_size', 0),
drop=drop, losses=losses)
cuda_stream = util.get_cuda_stream()
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
docs, cuda_stream, drop)
states_d_scores, backprops = _beam_utils.update_beam(
self.moves, self.nr_feature, 500, states, golds, state2vec,
vec2scores, width, density, self.cfg.get('hist_size', 0),
drop=drop, losses=losses)
backprop_lower = []
cdef float batch_size = len(docs)
for i, d_scores in enumerate(states_d_scores):
@@ -612,13 +591,14 @@ cdef class Parser:
if isinstance(self.model[0].ops, CupyOps) \
and not isinstance(ids, state2vec.ops.xp.ndarray):
backprop_lower.append((
get_async(cuda_stream, ids),
get_async(cuda_stream, d_vector),
util.get_async(cuda_stream, ids),
util.get_async(cuda_stream, d_vector),
bp_vectors))
else:
backprop_lower.append((ids, d_vector, bp_vectors))
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd, cuda_stream)
self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd,
cuda_stream)
def _init_gold_batch(self, whole_docs, whole_golds):
"""Make a square batch, of length equal to the shortest doc. A long
@@ -768,7 +748,8 @@ cdef class Parser:
def begin_training(self, gold_tuples, pipeline=None, **cfg):
if 'model' in cfg:
self.model = cfg['model']
gold_tuples = nonproj.preprocess_training_data(gold_tuples, label_freq_cutoff=100)
gold_tuples = nonproj.preprocess_training_data(gold_tuples,
label_freq_cutoff=100)
actions = self.moves.get_actions(gold_parses=gold_tuples)
for action, labels in actions.items():
for label in labels:
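
The _nonlinearity change earlier in this file leans on the maxout trick: the hidden layer computes nP candidate "pieces" per output unit and keeps the best, with `which` recording the winner so backprop_maxout can route gradients only to it. A minimal numpy sketch (assuming ops.maxout behaves as described; simplified, CPU-only):

import numpy

def maxout(state_vector, nP):
    # Reshape (n_states, nO*nP) to (n_states, nO, nP) and keep, for each
    # output unit, the best of its nP pieces. `which` records the winning
    # piece so the backward pass can send gradients to it alone.
    n, width = state_vector.shape
    pieces = state_vector.reshape((n, width // nP, nP))
    which = pieces.argmax(axis=-1)
    best = pieces.max(axis=-1)
    return best, which

X = numpy.array([[1., 4., 3., 2.]], dtype='f')
best, which = maxout(X, nP=2)
print(best, which)  # [[4. 3.]] [[1 0]]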

View File

@@ -1,39 +1,37 @@
# coding: utf-8
"""
Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
"""Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
for doing pseudo-projective parsing; the implementation uses the HEAD decoration
scheme.
"""
from __future__ import unicode_literals
from copy import copy
from ..tokens.doc cimport Doc
from ..attrs import DEP, HEAD
DELIMITER = '||'
def ancestors(tokenid, heads):
# returns all words going from the word up the path to the root
# the path to root cannot be longer than the number of words in the sentence
# this function ends after at most len(heads) steps
# because it would otherwise loop indefinitely on cycles
# Returns all words going from the word up the path to the root. The path
# to root cannot be longer than the number of words in the sentence. This
# function ends after at most len(heads) steps, because it would otherwise
# loop indefinitely on cycles.
head = tokenid
cnt = 0
while heads[head] != head and cnt < len(heads):
head = heads[head]
cnt += 1
yield head
if head == None:
if head is None:
break
def contains_cycle(heads):
# in an acyclic tree, the path from each word following
# the head relation upwards always ends at the root node
# in an acyclic tree, the path from each word following the head relation
# upwards always ends at the root node
for tokenid in range(len(heads)):
seen = set([tokenid])
for ancestor in ancestors(tokenid,heads):
for ancestor in ancestors(tokenid, heads):
if ancestor in seen:
return seen
seen.add(ancestor)
@@ -45,26 +43,26 @@ def is_nonproj_arc(tokenid, heads):
# if there is a token k, h < k < d such that h is not
# an ancestor of k. Same for h -> d, h > d
head = heads[tokenid]
if head == tokenid: # root arcs cannot be non-projective
if head == tokenid: # root arcs cannot be non-projective
return False
elif head == None: # unattached tokens cannot be non-projective
elif head is None: # unattached tokens cannot be non-projective
return False
start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head)
for k in range(start,end):
for ancestor in ancestors(k,heads):
if ancestor == None: # for unattached tokens/subtrees
for k in range(start, end):
for ancestor in ancestors(k, heads):
if ancestor is None: # for unattached tokens/subtrees
break
elif ancestor == head: # normal case: k dominated by h
elif ancestor == head: # normal case: k dominated by h
break
else: # head not in ancestors: d -> h is non-projective
else: # head not in ancestors: d -> h is non-projective
return True
return False
def is_nonproj_tree(heads):
# a tree is non-projective if at least one arc is non-projective
return any( is_nonproj_arc(word,heads) for word in range(len(heads)) )
return any(is_nonproj_arc(word, heads) for word in range(len(heads)))
def decompose(label):
@@ -81,32 +79,32 @@ def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
for raw_text, sents in gold_tuples:
prepro_sents = []
for (ids, words, tags, heads, labels, iob), ctnts in sents:
proj_heads,deco_labels = projectivize(heads,labels)
proj_heads, deco_labels = projectivize(heads, labels)
# set the label to ROOT for each root dependent
deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ]
deco_labels = ['ROOT' if head == i else deco_labels[i]
for i, head in enumerate(proj_heads)]
# count label frequencies
if label_freq_cutoff > 0:
for label in deco_labels:
if is_decorated(label):
freqs[label] = freqs.get(label,0) + 1
prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts))
freqs[label] = freqs.get(label, 0) + 1
prepro_sents.append(
((ids, words, tags, proj_heads, deco_labels, iob), ctnts))
preprocessed.append((raw_text, prepro_sents))
if label_freq_cutoff > 0:
return _filter_labels(preprocessed,label_freq_cutoff,freqs)
return _filter_labels(preprocessed, label_freq_cutoff, freqs)
return preprocessed
def projectivize(heads, labels):
# use the algorithm by Nivre & Nilsson 2005
# assumes heads to be a proper tree, i.e. connected and cycle-free
# returns a new pair (heads,labels) which encode
# a projective and decorated tree
# Use the algorithm by Nivre & Nilsson 2005. Assumes heads to be a proper
# tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
# which encode a projective and decorated tree.
proj_heads = copy(heads)
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
if smallest_np_arc == None: # this sentence is already projective
if smallest_np_arc is None: # this sentence is already projective
return proj_heads, copy(labels)
while smallest_np_arc != None:
while smallest_np_arc is not None:
_lift(smallest_np_arc, proj_heads)
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
deco_labels = _decorate(heads, proj_heads, labels)
@@ -114,24 +112,26 @@ def projectivize(heads, labels):
def deprojectivize(tokens):
# reattach arcs with decorated labels (following HEAD scheme)
# for each decorated arc X||Y, search top-down, left-to-right,
# breadth-first until hitting a Y then make this the new head
# Reattach arcs with decorated labels (following HEAD scheme). For each
# decorated arc X||Y, search top-down, left-to-right, breadth-first until
# hitting a Y then make this the new head.
for token in tokens:
if is_decorated(token.dep_):
newlabel,headlabel = decompose(token.dep_)
newhead = _find_new_head(token,headlabel)
newlabel, headlabel = decompose(token.dep_)
newhead = _find_new_head(token, headlabel)
token.head = newhead
token.dep_ = newlabel
return tokens
def _decorate(heads, proj_heads, labels):
# uses decoration scheme HEAD from Nivre & Nilsson 2005
assert(len(heads) == len(proj_heads) == len(labels))
deco_labels = []
for tokenid,head in enumerate(heads):
for tokenid, head in enumerate(heads):
if head != proj_heads[tokenid]:
deco_labels.append('%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
deco_labels.append(
'%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
else:
deco_labels.append(labels[tokenid])
return deco_labels
@@ -143,9 +143,9 @@ def _get_smallest_nonproj_arc(heads):
# and ties are broken left to right
smallest_size = float('inf')
smallest_np_arc = None
for tokenid,head in enumerate(heads):
for tokenid, head in enumerate(heads):
size = abs(tokenid-head)
if size < smallest_size and is_nonproj_arc(tokenid,heads):
if size < smallest_size and is_nonproj_arc(tokenid, heads):
smallest_size = size
smallest_np_arc = tokenid
return smallest_np_arc
@@ -168,8 +168,10 @@ def _find_new_head(token, headlabel):
next_queue = []
for qtoken in queue:
for child in qtoken.children:
if child.is_space: continue
if child == token: continue
if child.is_space:
continue
if child == token:
continue
if child.dep_ == headlabel:
return child
next_queue.append(child)
@@ -184,7 +186,10 @@ def _filter_labels(gold_tuples, cutoff, freqs):
for raw_text, sents in gold_tuples:
filtered_sents = []
for (ids, words, tags, heads, labels, iob), ctnts in sents:
filtered_labels = [ decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ]
filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
filtered_labels = [decompose(label)[0]
if freqs.get(label, cutoff) < cutoff
else label for label in labels]
filtered_sents.append(
((ids, words, tags, heads, filtered_labels, iob), ctnts))
filtered.append((raw_text, filtered_sents))
return filtered
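
A worked example of this file's pipeline (toy data; the values below were stepped through by hand, since the Cython module is not directly importable as plain Python):

# heads[i] is the index of token i's head; the root points at itself.
# Arc 0 -> 2 crosses arc 1 -> 3, so the tree is non-projective.
heads = [1, 1, 0, 1]
labels = ['det', 'ROOT', 'dobj', 'punct']

# projectivize(heads, labels) lifts the shortest non-projective arc one
# step: token 2 is reattached from 0 to 0's own head, 1, and _decorate()
# records the lost head's label behind the delimiter:
proj_heads = [1, 1, 1, 1]                          # now projective
deco_labels = ['det', 'ROOT', 'dobj||det', 'punct']

# At parse time, deprojectivize() sees the decorated label, splits it with
# decompose('dobj||det') -> ('dobj', 'det'), then searches breadth-first
# from token 2's head for a descendant labelled 'det' and reattaches
# token 2 there, recovering the original heads.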

View File

@@ -2,17 +2,8 @@
# cython: infer_types=True
from __future__ import unicode_literals
from libc.string cimport memcpy, memset
from libc.stdint cimport uint32_t, uint64_t
import numpy
from ..vocab cimport EMPTY_LEXEME
from ..structs cimport Entity
from ..lexeme cimport Lexeme
from ..symbols cimport punct
from ..attrs cimport IS_SPACE
from ..attrs cimport attr_id_t
from ..tokens.token cimport Token
from ..tokens.doc cimport Doc

View File

@@ -2,17 +2,17 @@
# coding: utf-8
from __future__ import unicode_literals
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t
from collections import defaultdict, OrderedDict
from collections import OrderedDict
import ujson
from .. import util
from ..structs cimport TokenC
from .stateclass cimport StateClass
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
from ..typedefs cimport attr_t
from ..compat import json_dumps
from .. import util
cdef weight_t MIN_SCORE = -90000
@@ -136,11 +136,12 @@ cdef class TransitionSystem:
print([gold.c.ner[i].clas for i in range(gold.length)])
print([gold.c.ner[i].move for i in range(gold.length)])
print([gold.c.ner[i].label for i in range(gold.length)])
print("Self labels", [self.c[i].label for i in range(self.n_moves)])
print("Self labels",
[self.c[i].label for i in range(self.n_moves)])
raise ValueError(
"Could not find a gold-standard action to supervise "
"the entity recognizer\n"
"The transition system has %d actions." % (self.n_moves))
"the entity recognizer. The transition system has "
"%d actions." % (self.n_moves))
def get_class_name(self, int clas):
act = self.c[clas]
@@ -149,7 +150,7 @@ cdef class TransitionSystem:
def add_action(self, int action, label_name):
cdef attr_t label_id
if not isinstance(label_name, int) and \
not isinstance(label_name, long):
not isinstance(label_name, long):
label_id = self.strings.add(label_name)
else:
label_id = label_name
@@ -186,7 +187,7 @@ cdef class TransitionSystem:
'name': self.move_name(trans.move, trans.label)
})
serializers = {
'transitions': lambda: ujson.dumps(transitions),
'transitions': lambda: json_dumps(transitions),
'strings': lambda: self.strings.to_bytes()
}
return util.to_bytes(serializers, exclude)
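
The serializers dict above maps section names to zero-argument callables, so each part is produced lazily and can be skipped via exclude. A minimal sketch of the pattern (standalone; spaCy's actual util.to_bytes packs the message differently):

import json

def to_bytes(getters, exclude):
    # Call each getter only if its section is not excluded.
    msg = {key: getter() for key, getter in getters.items()
           if key not in exclude}
    return json.dumps(msg).encode('utf8')

serializers = {
    'transitions': lambda: [{'clas': 0, 'name': 'SHIFT'}],
    'strings': lambda: 'strings-data',
}
print(to_bytes(serializers, exclude=['strings']))
# b'{"transitions": [{"clas": 0, "name": "SHIFT"}]}'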