diff --git a/spacy/ml/parser_model.pxd b/spacy/ml/parser_model.pxd
index ca31c1699..883862551 100644
--- a/spacy/ml/parser_model.pxd
+++ b/spacy/ml/parser_model.pxd
@@ -41,10 +41,9 @@ cdef ActivationsC alloc_activations(SizesC n) nogil
 cdef void free_activations(const ActivationsC* A) nogil
 
 cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
-        const WeightsC* W, SizesC n) nogil
-
+                         const WeightsC* W, SizesC n) nogil
+
 cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
 
-cdef void cpu_log_loss(float* d_scores,
-        const float* costs, const int* is_valid, const float* scores, int O) nogil
-
+cdef void cpu_log_loss(float* d_scores, const float* costs,
+                       const int* is_valid, const float* scores, int O) nogil
diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
index 90e836f8a..843275f4c 100644
--- a/spacy/ml/parser_model.pyx
+++ b/spacy/ml/parser_model.pyx
@@ -13,7 +13,7 @@ from .. import util
 from ..errors import Errors
 from ..pipeline._parser_internals.stateclass cimport StateClass
 
-from ..typedefs cimport class_t, hash_t, weight_t
+from ..typedefs cimport weight_t
 
 
 cdef WeightsC get_c_weights(model) except *:
@@ -78,31 +78,31 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
         A._max_size = n.states
     else:
         A.token_ids = <int*>realloc(A.token_ids,
-            n.states * n.feats * sizeof(A.token_ids[0]))
+                                    n.states * n.feats * sizeof(A.token_ids[0]))
         A.scores = <float*>realloc(A.scores,
-            n.states * n.classes * sizeof(A.scores[0]))
+                                   n.states * n.classes * sizeof(A.scores[0]))
         A.unmaxed = <float*>realloc(A.unmaxed,
-            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
+                                    n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
         A.hiddens = <float*>realloc(A.hiddens,
-            n.states * n.hiddens * sizeof(A.hiddens[0]))
+                                    n.states * n.hiddens * sizeof(A.hiddens[0]))
         A.is_valid = <int*>realloc(A.is_valid,
-            n.states * n.classes * sizeof(A.is_valid[0]))
+                                   n.states * n.classes * sizeof(A.is_valid[0]))
         A._max_size = n.states
     A._curr_size = n.states
 
 
 cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
-        const WeightsC* W, SizesC n) nogil:
-    cdef double one = 1.0
+                         const WeightsC* W, SizesC n) nogil:
     resize_activations(A, n)
     for i in range(n.states):
         states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
     memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
     memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
-    sum_state_features(cblas, A.unmaxed,
-        W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
+    sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n.states,
+                       n.feats, n.hiddens * n.pieces)
     for i in range(n.states):
-        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
+        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1,
+                     &A.unmaxed[i*n.hiddens*n.pieces], 1)
         for j in range(n.hiddens):
             index = i * n.hiddens * n.pieces + j * n.pieces
             which = _arg_max(&A.unmaxed[index], n.pieces)
@@ -112,10 +112,10 @@ cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
         memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
     else:
         # Compute hidden-to-output
-        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
-            1.0, A.hiddens, n.hiddens,
-            W.hidden_weights, n.hiddens,
-            0.0, A.scores, n.classes)
+        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens, 1.0,
+                     A.hiddens, n.hiddens,
+                     W.hidden_weights, n.hiddens, 0.0,
+                     A.scores, n.classes)
         # Add bias
         for i in range(n.states):
             saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &A.scores[i*n.classes], 1)
@@ -131,9 +131,9 @@ cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
                 A.scores[i*n.classes+j] = min_
 
 
-cdef void sum_state_features(CBlas cblas, float* output,
-        const float* cached, const int* token_ids, int B, int F, int O) nogil:
-    cdef int idx, b, f, i
+cdef void sum_state_features(CBlas cblas, float* output, const float* cached,
+                             const int* token_ids, int B, int F, int O) nogil:
+    cdef int idx, b, f
     cdef const float* feature
     padding = cached
     cached += F * O
@@ -150,9 +150,8 @@ cdef void sum_state_features(CBlas cblas, float* output,
         token_ids += F
 
 
-cdef void cpu_log_loss(float* d_scores,
-        const float* costs, const int* is_valid, const float* scores,
-        int O) nogil:
+cdef void cpu_log_loss(float* d_scores, const float* costs, const int* is_valid,
+                       const float* scores, int O) nogil:
     """Do multi-label log loss"""
     cdef double max_, gmax, Z, gZ
     best = arg_max_if_gold(scores, costs, is_valid, O)
@@ -178,7 +177,7 @@ cdef void cpu_log_loss(float* d_scores,
 
 
 cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
-        const int* is_valid, int n) nogil:
+                         const int* is_valid, int n) nogil:
     # Find minimum cost
     cdef float cost = 1
     for i in range(n):
@@ -202,10 +201,9 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
     return best
 
 
-
 class ParserStepModel(Model):
     def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
-            dropout=0.1):
+                 dropout=0.1):
         Model.__init__(self, name="parser_step_model", forward=step_forward)
         self.attrs["has_upper"] = has_upper
         self.attrs["dropout_rate"] = dropout
@@ -267,7 +265,7 @@ class ParserStepModel(Model):
 
     def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
         if isinstance(self.state2vec.ops, CupyOps) \
-        and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
+                and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
             # Move token_ids and d_vector to GPU, asynchronously
             self.backprops.append((
                 util.get_async(self.cuda_stream, token_ids),
@@ -277,7 +275,6 @@ class ParserStepModel(Model):
         else:
             self.backprops.append((token_ids, d_vector, get_d_tokvecs))
 
-
     def finish_steps(self, golds):
         # Add a padding vector to the d_tokvecs gradient, so that missing
        # values don't affect the real gradient.
@@ -290,14 +287,15 @@ class ParserStepModel(Model):
             ids = ids.flatten()
             d_state_features = d_state_features.reshape(
                 (ids.size, d_state_features.shape[2]))
-            self.ops.scatter_add(d_tokvecs, ids,
-                d_state_features)
+            self.ops.scatter_add(d_tokvecs, ids, d_state_features)
         # Padded -- see update()
         self.bp_tokvecs(d_tokvecs[:-1])
         return d_tokvecs
 
+
 NUMPY_OPS = NumpyOps()
+
 
 def step_forward(model: ParserStepModel, states, is_train):
     token_ids = model.get_token_ids(states)
     vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
@@ -310,7 +308,7 @@ def step_forward(model: ParserStepModel, states, is_train):
         scores, get_d_vector = model.vec2scores(vector, is_train)
     else:
         scores = NumpyOps().asarray(vector)
-        get_d_vector = lambda d_scores: d_scores
+        def get_d_vector(d_scores): return d_scores
     # If the class is unseen, make sure its score is minimum
     scores[:, model._class_mask == 0] = numpy.nanmin(scores)
 
@@ -445,8 +443,8 @@ cdef class precompute_hiddens:
         feat_weights = self.get_feat_weights()
         cdef int[:, ::1] ids = token_ids
         sum_state_features(cblas, <float*>state_vector.data,
-            feat_weights, &ids[0,0],
-            token_ids.shape[0], self.nF, self.nO*self.nP)
+                           feat_weights, &ids[0, 0], token_ids.shape[0],
+                           self.nF, self.nO*self.nP)
         state_vector += self.bias
         state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
 
@@ -471,7 +469,7 @@ cdef class precompute_hiddens:
 
         def backprop_maxout(d_best):
             return self.ops.backprop_maxout(d_best, mask, self.nP)
-
+
         return state_vector, backprop_maxout
 
     def _relu_nonlinearity(self, state_vector):
@@ -485,7 +483,7 @@ cdef class precompute_hiddens:
         def backprop_relu(d_best):
             d_best *= mask
             return d_best.reshape((d_best.shape + (1,)))
-
+
         return state_vector, backprop_relu
 
 cdef inline int _arg_max(const float* scores, const int n_classes) nogil:
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index d6ee29397..411e53668 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -156,7 +156,7 @@ cdef class BiluoPushDown(TransitionSystem):
             if token.ent_type:
                 labels.add(token.ent_type_)
         return labels
-
+
     def move_name(self, int move, attr_t label):
         if move == OUT:
             return 'O'
@@ -641,7 +641,7 @@ cdef class Unit:
                 cost += 1
             break
         return cost
-
+
 
 cdef class Out:
     @staticmethod
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index 6daf6e7a6..e4767ed2f 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -127,6 +127,7 @@ def make_parser(
         scorer=scorer,
     )
 
+
 @Language.factory(
     "beam_parser",
     assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index f11b16f65..4ce7ec37b 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -15,7 +15,7 @@ from ._parser_internals.ner cimport BiluoPushDown
 from .transition_parser cimport Parser
 
 from ..language import Language
-from ..scorer import PRFScore, get_ner_prf
+from ..scorer import get_ner_prf
 from ..training import remove_bilu_prefix
 from ..util import registry
 
@@ -105,6 +105,7 @@ def make_ner(
         scorer=scorer,
     )
 
+
 @Language.factory(
     "beam_ner",
     assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd
index a48d76b68..7adb82213 100644
--- a/spacy/pipeline/transition_parser.pxd
+++ b/spacy/pipeline/transition_parser.pxd
@@ -15,7 +15,7 @@ cdef class Parser(TrainablePipe):
     cdef object _cpu_ops
 
     cdef void _parseC(self, CBlas cblas, StateC** states,
-            WeightsC weights, SizesC sizes) nogil
+                      WeightsC weights, SizesC sizes) nogil
 
     cdef void c_transition_batch(self, StateC** states, const float* scores,
-            int nr_class, int batch_size) nogil
+                                 int nr_class, int batch_size) nogil
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index fb4db2da9..66eb03ee4 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -9,7 +9,7 @@ from cymem.cymem cimport Pool
 from itertools import islice
 
 from libc.stdlib cimport calloc, free
-from libc.string cimport memcpy, memset
+from libc.string cimport memset
 from libcpp.vector cimport vector
 
 import random
@@ -22,14 +22,13 @@ from thinc.api import (
     NumpyOps,
     Optimizer,
     chain,
-    get_array_module,
     get_ops,
     set_dropout_rate,
     softmax_activation,
     use_ops,
 )
 from thinc.legacy import LegacySequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints1d
+from thinc.types import Floats2d
 
 from ..ml.parser_model cimport (
     ActivationsC,
@@ -44,7 +43,6 @@ from ..ml.parser_model cimport (
     predict_states,
 )
 
 from ..tokens.doc cimport Doc
-from ._parser_internals.search cimport Beam
 from ._parser_internals.stateclass cimport StateClass
 from .trainable_pipe import TrainablePipe
@@ -54,11 +52,10 @@ from ._parser_internals cimport _beam_utils
 from ._parser_internals import _beam_utils
 
 from ..tokens.doc cimport Doc
-from ..typedefs cimport weight_t
 from ..vocab cimport Vocab
 from ._parser_internals cimport _beam_utils
 from ._parser_internals.stateclass cimport StateC, StateClass
-from ._parser_internals.transition_system cimport Transition, TransitionSystem
+from ._parser_internals.transition_system cimport Transition
 from .trainable_pipe cimport TrainablePipe
 
 from .. import util
@@ -289,7 +286,7 @@ cdef class Parser(TrainablePipe):
         with use_ops("numpy"):
             teacher_model = chain(teacher_step_model, softmax_activation())
             student_model = chain(student_step_model, softmax_activation())
-
+
         max_moves = self.cfg["update_with_oracle_cut_size"]
         if max_moves >= 1:
             # Chop sequences into lengths of this many words, to make the
@@ -434,8 +431,6 @@ cdef class Parser(TrainablePipe):
         return batch
 
     def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
-        cdef Beam beam
-        cdef Doc doc
         self._ensure_labels_are_added(docs)
         batch = _beam_utils.BeamBatch(
             self.moves,
@@ -456,15 +451,15 @@ cdef class Parser(TrainablePipe):
         return list(batch)
 
     cdef void _parseC(self, CBlas cblas, StateC** states,
-            WeightsC weights, SizesC sizes) nogil:
-        cdef int i, j
+                      WeightsC weights, SizesC sizes) nogil:
+        cdef int i
         cdef vector[StateC*] unfinished
         cdef ActivationsC activations = alloc_activations(sizes)
         while sizes.states >= 1:
             predict_states(cblas, &activations, states, &weights, sizes)
             # Validate actions, argmax, take action.
-            self.c_transition_batch(states,
-                activations.scores, sizes.classes, sizes.states)
+            self.c_transition_batch(states, activations.scores,
+                                    sizes.classes, sizes.states)
             for i in range(sizes.states):
                 if not states[i].is_final():
                     unfinished.push_back(states[i])
@@ -493,7 +488,7 @@ cdef class Parser(TrainablePipe):
         return [state for state in states if not state.c.is_final()]
 
     cdef void c_transition_batch(self, StateC** states, const float* scores,
-            int nr_class, int batch_size) nogil:
+                                 int nr_class, int batch_size) nogil:
         # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
         with gil:
             assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
@@ -551,8 +546,7 @@ cdef class Parser(TrainablePipe):
         if not states:
             return losses
         model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
-
-        all_states = list(states)
+
         states_golds = list(zip(states, golds))
         n_moves = 0
         while states_golds:
@@ -632,8 +626,8 @@ cdef class Parser(TrainablePipe):
         del tutor
         return losses
 
-    def update_beam(self, examples, *, beam_width,
-            drop=0., sgd=None, losses=None, beam_density=0.0):
+    def update_beam(self, examples, *, beam_width, drop=0., sgd=None,
+                    losses=None, beam_density=0.0):
         states, golds, _ = self.moves.init_gold_batch(examples)
         if not states:
             return losses
@@ -664,7 +658,7 @@ cdef class Parser(TrainablePipe):
         is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
         costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
         cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
-            dtype='f', order='C')
+                                               dtype='f', order='C')
         c_d_scores = <float*>d_scores.data
         unseen_classes = self.model.attrs["unseen_classes"]
         for i, (state, gold) in enumerate(zip(states, golds)):
@@ -674,8 +668,8 @@ cdef class Parser(TrainablePipe):
             for j in range(self.moves.n_moves):
                 if costs[j] <= 0.0 and j in unseen_classes:
                     unseen_classes.remove(j)
-            cpu_log_loss(c_d_scores,
-                costs, is_valid, &scores[i, 0], d_scores.shape[1])
+            cpu_log_loss(c_d_scores, costs, is_valid, &scores[i, 0],
+                         d_scores.shape[1])
             c_d_scores += d_scores.shape[1]
         # Note that we don't normalize this. See comment in update() for why.
         if losses is not None:
@@ -785,10 +779,7 @@ cdef class Parser(TrainablePipe):
         long_doc[:N], and another representing long_doc[N:]. In contrast to
         _init_gold_batch, this version uses a teacher model to generate the
         cut sequences."""
-        cdef:
-            StateClass start_state
-            StateClass state
-            Transition action
+        cdef StateClass state
         all_states = self.moves.init_batch(docs)
         states = []
         to_cut = []
@@ -810,7 +801,6 @@ cdef class Parser(TrainablePipe):
                 length += 1
         return states
 
-
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
         sequence or a cap. A long
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index e41f9e02e..efca4bcb0 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,4 +1,3 @@
-import warnings
 from collections.abc import Iterable as IterableInstance
 
 import numpy
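
Reviewer note: the `cpu_log_loss` kernel re-wrapped above only has the terse docstring "Do multi-label log loss", so a reference may help. It writes the gradient of a log loss over valid transitions: a softmax over all valid actions minus a softmax over the gold (minimum-cost) valid actions, with invalid actions zeroed. The NumPy sketch below is purely illustrative; the function name and the array-in/array-out signature are ours, while the real routine fills a raw `float*` buffer under `nogil` and resolves ties via `arg_max_if_gold`.

    import numpy as np

    def cpu_log_loss_ref(scores, costs, is_valid):
        # d_scores = P - G: P is a softmax over all valid transitions,
        # G a softmax over the gold (minimum-cost) valid transitions.
        scores = np.asarray(scores, dtype=np.float64)
        costs = np.asarray(costs, dtype=np.float64)
        valid = np.asarray(is_valid, dtype=bool)
        # Gold transitions: valid ones whose cost equals the minimum valid cost.
        gold = valid & (costs <= costs[valid].min())
        # Numerically stable softmaxes: shift by the max, exp(-inf) == 0.
        p = np.exp(np.where(valid, scores - scores[valid].max(), -np.inf))
        g = np.exp(np.where(gold, scores - scores[gold].max(), -np.inf))
        return (p / p.sum() - g / g.sum()).astype(np.float32)

When every valid action is gold, P == G and the gradient vanishes, which matches the intuition that there is nothing to correct.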
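Similarly for `sum_state_features`, whose signature is re-wrapped in two places: it sums precomputed per-feature rows into each state's hidden buffer, substituting a padding row for missing tokens (id -1). A sketch under assumed array shapes (the name `sum_state_features_ref` and the 3-D cache layout are ours for illustration; the real kernel works on flat `float*` arrays and adds rows with `saxpy`):

    import numpy as np

    def sum_state_features_ref(padded_cache, token_ids):
        # padded_cache: (1 + n_tokens, F, O); row 0 is the padding vector
        # used for missing tokens. token_ids: (B, F) ints, -1 == missing.
        B, F = token_ids.shape
        O = padded_cache.shape[2]
        output = np.zeros((B, O), dtype=padded_cache.dtype)
        for b in range(B):
            for f in range(F):
                row = token_ids[b, f] + 1  # shift so -1 selects the padding row
                output[b] += padded_cache[row, f]
        return output

This is the core of the `precompute_hiddens` trick: the feature-to-hidden multiplication is done once per token up front, so parsing only needs these O(B*F) row additions per step.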