diff --git a/pyproject.toml b/pyproject.toml index 7abd7a96f..4b0da39b9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.1.0,<8.2.0", + "thinc>=9.0.0.dev0,<9.1.0", "numpy>=1.15.0", ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 778c05e21..5bd04aa9c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ spacy-legacy>=3.0.10,<3.1.0 spacy-loggers>=1.0.0,<2.0.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.1.0,<8.2.0 +thinc>=9.0.0.dev0,<9.1.0 ml_datasets>=0.2.0,<0.3.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.9.1,<1.1.0 diff --git a/setup.cfg b/setup.cfg index 3c1bf5b0b..0870e032e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -38,7 +38,7 @@ install_requires = murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.1.0,<8.2.0 + thinc>=9.0.0.dev0,<9.1.0 wasabi>=0.9.1,<1.1.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 diff --git a/setup.py b/setup.py index 55494344b..77a4cf283 100755 --- a/setup.py +++ b/setup.py @@ -48,6 +48,7 @@ MOD_NAMES = [ "spacy.pipeline._parser_internals.arc_eager", "spacy.pipeline._parser_internals.ner", "spacy.pipeline._parser_internals.nonproj", + "spacy.pipeline._parser_internals.search", "spacy.pipeline._parser_internals._state", "spacy.pipeline._parser_internals.stateclass", "spacy.pipeline._parser_internals.transition_system", @@ -67,6 +68,7 @@ MOD_NAMES = [ "spacy.matcher.dependencymatcher", "spacy.symbols", "spacy.vectors", + "spacy.tests.parser._search", ] COMPILE_OPTIONS = { "msvc": ["/Ox", "/EHsc"], diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx index 055fa0bad..91558683b 100644 --- a/spacy/ml/parser_model.pyx +++ b/spacy/ml/parser_model.pyx @@ -3,7 +3,6 @@ cimport numpy as np from libc.math cimport exp from libc.string cimport memset, memcpy from libc.stdlib cimport calloc, free, realloc -from thinc.backends.linalg cimport Vec, VecVec from thinc.backends.cblas cimport saxpy, sgemm import numpy @@ -102,11 +101,10 @@ cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states, sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces) for i in range(n.states): - VecVec.add_i(&A.unmaxed[i*n.hiddens*n.pieces], - W.feat_bias, 1., n.hiddens * n.pieces) + saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1) for j in range(n.hiddens): index = i * n.hiddens * n.pieces + j * n.pieces - which = Vec.arg_max(&A.unmaxed[index], n.pieces) + which = _arg_max(&A.unmaxed[index], n.pieces) A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which] memset(A.scores, 0, n.states * n.classes * sizeof(float)) if W.hidden_weights == NULL: @@ -119,8 +117,7 @@ cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states, 0.0, A.scores, n.classes) # Add bias for i in range(n.states): - VecVec.add_i(&A.scores[i*n.classes], - W.hidden_bias, 1., n.classes) + saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &A.scores[i*n.classes], 1) # Set unseen classes to minimum value i = 0 min_ = A.scores[0] @@ -158,7 +155,8 @@ cdef void cpu_log_loss(float* d_scores, """Do multi-label log loss""" cdef double max_, gmax, Z, gZ best = arg_max_if_gold(scores, costs, is_valid, O) - guess = Vec.arg_max(scores, O) + guess = _arg_max(scores, O) + if best == -1 or guess == -1: # These shouldn't happen, but if they do, we want to make sure we don't # cause an OOB access. 
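Reviewer note, not part of the patch: thinc v9 removes the `Vec`/`VecVec` helpers from `thinc.backends.linalg`, so the hunks above rewrite the bias additions as direct CBLAS `saxpy` calls and replace `Vec.arg_max` with a local `_arg_max`. For readers without the BLAS signature in their head, here is a minimal NumPy sketch of the semantics the new calls rely on; the shapes and values are illustrative only.

```python
import numpy as np

def saxpy(n, alpha, x, inc_x, y, inc_y):
    """Model of BLAS saxpy: y[i*inc_y] += alpha * x[i*inc_x] for i < n."""
    y[: n * inc_y : inc_y] += alpha * x[: n * inc_x : inc_x]

# The bias-add pattern from predict_states: add W.hidden_bias to each
# state's score row -- the same operation VecVec.add_i(row, bias, 1., n)
# performed before.
n_states, n_classes = 3, 5
scores = np.zeros((n_states, n_classes), dtype="f")
bias = np.arange(n_classes, dtype="f")
for i in range(n_states):
    saxpy(n_classes, 1.0, bias, 1, scores[i], 1)
assert np.allclose(scores, np.tile(bias, (n_states, 1)))
```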
@@ -488,3 +486,15 @@ cdef class precompute_hiddens:
             return d_best.reshape((d_best.shape + (1,)))
 
         return state_vector, backprop_relu
+
+cdef inline int _arg_max(const float* scores, const int n_classes) nogil:
+    if n_classes == 2:
+        return 0 if scores[0] > scores[1] else 1
+    cdef int i
+    cdef int best = 0
+    cdef float mode = scores[0]
+    for i in range(1, n_classes):
+        if scores[i] > mode:
+            mode = scores[i]
+            best = i
+    return best
diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pxd b/spacy/pipeline/_parser_internals/_beam_utils.pxd
index de3573fbc..571f246b1 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pxd
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pxd
@@ -1,6 +1,6 @@
 from ...typedefs cimport class_t, hash_t
 
-# These are passed as callbacks to thinc.search.Beam
+# These are passed as callbacks to .search.Beam
 cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1
 
 cdef int check_final_state(void* _state, void* extra_args) except -1
diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pyx b/spacy/pipeline/_parser_internals/_beam_utils.pyx
index fa7df2056..610c8ddee 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pyx
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pyx
@@ -3,17 +3,16 @@ cimport numpy as np
 import numpy
 from cpython.ref cimport PyObject, Py_XDECREF
-from thinc.extra.search cimport Beam
-from thinc.extra.search import MaxViolation
-from thinc.extra.search cimport MaxViolation
 from ...typedefs cimport hash_t, class_t
 from .transition_system cimport TransitionSystem, Transition
 from ...errors import Errors
+from .search cimport Beam, MaxViolation
+from .search import MaxViolation
 from .stateclass cimport StateC, StateClass
 
-# These are passed as callbacks to thinc.search.Beam
+# These are passed as callbacks to .search.Beam
 cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
     dest = <StateC*>_dest
     src = <StateC*>_src
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 257b5ef8a..a79aef64a 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -15,7 +15,7 @@ from ...training.example cimport Example
 from .stateclass cimport StateClass
 from ._state cimport StateC, ArcC
 from ...errors import Errors
-from thinc.extra.search cimport Beam
+from .search cimport Beam
 
 cdef weight_t MIN_SCORE = -90000
 cdef attr_t SUBTOK_LABEL = hash_string('subtok')
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index cc196d85a..53ed03523 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -6,7 +6,6 @@ from libcpp.vector cimport vector
 from cymem.cymem cimport Pool
 from collections import Counter
-from thinc.extra.search cimport Beam
 
 from ...tokens.doc cimport Doc
 from ...tokens.span import Span
@@ -17,6 +16,7 @@ from ...attrs cimport IS_SPACE
 from ...structs cimport TokenC, SpanC
 from ...training import split_bilu_label
 from ...training.example cimport Example
+from .search cimport Beam
 from .stateclass cimport StateClass
 from ._state cimport StateC
 from .transition_system cimport Transition, do_func_t
diff --git a/spacy/pipeline/_parser_internals/search.pxd b/spacy/pipeline/_parser_internals/search.pxd
new file mode 100644
index 000000000..dfe30e1c1
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/search.pxd
@@ -0,0 +1,89 @@
+from cymem.cymem cimport Pool
+
+from libc.stdint cimport uint32_t
+from libc.stdint cimport uint64_t
+from libcpp.pair cimport pair
+from libcpp.queue cimport priority_queue
+from libcpp.vector cimport vector
+
+from ...typedefs cimport class_t, weight_t, hash_t
+
+ctypedef pair[weight_t, size_t] Entry
+ctypedef priority_queue[Entry] Queue
+
+
+ctypedef int (*trans_func_t)(void* dest, void* src, class_t clas, void* x) except -1
+
+ctypedef void* (*init_func_t)(Pool mem, int n, void* extra_args) except NULL
+
+ctypedef int (*del_func_t)(Pool mem, void* state, void* extra_args) except -1
+
+ctypedef int (*finish_func_t)(void* state, void* extra_args) except -1
+
+ctypedef hash_t (*hash_func_t)(void* state, void* x) except 0
+
+
+cdef struct _State:
+    void* content
+    class_t* hist
+    weight_t score
+    weight_t loss
+    int i
+    int t
+    bint is_done
+
+
+cdef class Beam:
+    cdef Pool mem
+    cdef class_t nr_class
+    cdef class_t width
+    cdef class_t size
+    cdef public weight_t min_density
+    cdef int t
+    cdef readonly bint is_done
+    cdef list histories
+    cdef list _parent_histories
+    cdef weight_t** scores
+    cdef int** is_valid
+    cdef weight_t** costs
+    cdef _State* _parents
+    cdef _State* _states
+    cdef del_func_t del_func
+
+    cdef int _fill(self, Queue* q, weight_t** scores, int** is_valid) except -1
+
+    cdef inline void* at(self, int i) nogil:
+        return self._states[i].content
+
+    cdef int initialize(self, init_func_t init_func, del_func_t del_func, int n, void* extra_args) except -1
+    cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func,
+                     void* extra_args) except -1
+    cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1
+
+
+    cdef inline void set_cell(self, int i, int j, weight_t score, int is_valid, weight_t cost) nogil:
+        self.scores[i][j] = score
+        self.is_valid[i][j] = is_valid
+        self.costs[i][j] = cost
+
+    cdef int set_row(self, int i, const weight_t* scores, const int* is_valid,
+                     const weight_t* costs) except -1
+    cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1
+
+
+cdef class MaxViolation:
+    cdef Pool mem
+    cdef weight_t cost
+    cdef weight_t delta
+    cdef readonly weight_t p_score
+    cdef readonly weight_t g_score
+    cdef readonly double Z
+    cdef readonly double gZ
+    cdef class_t n
+    cdef readonly list p_hist
+    cdef readonly list g_hist
+    cdef readonly list p_probs
+    cdef readonly list g_probs
+
+    cpdef int check(self, Beam pred, Beam gold) except -1
+    cpdef int check_crf(self, Beam pred, Beam gold) except -1
diff --git a/spacy/pipeline/_parser_internals/search.pyx b/spacy/pipeline/_parser_internals/search.pyx
new file mode 100644
index 000000000..1d9b6dd7a
--- /dev/null
+++ b/spacy/pipeline/_parser_internals/search.pyx
@@ -0,0 +1,306 @@
+# cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
+cimport cython
+from libc.string cimport memset, memcpy
+from libc.math cimport log, exp
+import math
+
+from cymem.cymem cimport Pool
+from preshed.maps cimport PreshMap
+
+
+cdef class Beam:
+    def __init__(self, class_t nr_class, class_t width, weight_t min_density=0.0):
+        assert nr_class != 0
+        assert width != 0
+        self.nr_class = nr_class
+        self.width = width
+        self.min_density = min_density
+        self.size = 1
+        self.t = 0
+        self.mem = Pool()
+        self.del_func = NULL
+        self._parents = <_State*>self.mem.alloc(self.width, sizeof(_State))
+        self._states = <_State*>self.mem.alloc(self.width, sizeof(_State))
+        cdef int i
+        self.histories = [[] for i in range(self.width)]
+        self._parent_histories = [[] for i in range(self.width)]
+
+        self.scores = <weight_t**>self.mem.alloc(self.width, sizeof(weight_t*))
+        self.is_valid = <int**>self.mem.alloc(self.width, sizeof(int*))
+        self.costs = <weight_t**>self.mem.alloc(self.width, sizeof(weight_t*))
+        for i in range(self.width):
+            self.scores[i] = <weight_t*>self.mem.alloc(self.nr_class, sizeof(weight_t))
+            self.is_valid[i] = <int*>self.mem.alloc(self.nr_class, sizeof(int))
+            self.costs[i] = <weight_t*>self.mem.alloc(self.nr_class, sizeof(weight_t))
+
+    def __len__(self):
+        return self.size
+
+    property score:
+        def __get__(self):
+            return self._states[0].score
+
+    property min_score:
+        def __get__(self):
+            return self._states[self.size-1].score
+
+    property loss:
+        def __get__(self):
+            return self._states[0].loss
+
+    property probs:
+        def __get__(self):
+            return _softmax([self._states[i].score for i in range(self.size)])
+
+    property scores:
+        def __get__(self):
+            return [self._states[i].score for i in range(self.size)]
+
+    property histories:
+        def __get__(self):
+            return self.histories
+
+    cdef int set_row(self, int i, const weight_t* scores, const int* is_valid,
+                     const weight_t* costs) except -1:
+        cdef int j
+        for j in range(self.nr_class):
+            self.scores[i][j] = scores[j]
+            self.is_valid[i][j] = is_valid[j]
+            self.costs[i][j] = costs[j]
+
+    cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1:
+        cdef int i, j
+        for i in range(self.width):
+            memcpy(self.scores[i], scores[i], sizeof(weight_t) * self.nr_class)
+            memcpy(self.is_valid[i], is_valid[i], sizeof(int) * self.nr_class)
+            memcpy(self.costs[i], costs[i], sizeof(weight_t) * self.nr_class)
+
+    cdef int initialize(self, init_func_t init_func, del_func_t del_func, int n, void* extra_args) except -1:
+        for i in range(self.width):
+            self._states[i].content = init_func(self.mem, n, extra_args)
+            self._parents[i].content = init_func(self.mem, n, extra_args)
+        self.del_func = del_func
+
+    def __dealloc__(self):
+        if self.del_func == NULL:
+            return
+
+        for i in range(self.width):
+            self.del_func(self.mem, self._states[i].content, NULL)
+            self.del_func(self.mem, self._parents[i].content, NULL)
+
+    @cython.cdivision(True)
+    cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func,
+                     void* extra_args) except -1:
+        cdef weight_t** scores = self.scores
+        cdef int** is_valid = self.is_valid
+        cdef weight_t** costs = self.costs
+
+        cdef Queue* q = new Queue()
+        self._fill(q, scores, is_valid)
+        # For a beam of width k, we only ever need 2k state objects. How?
+        # Each transition takes a parent and a class and produces a new state.
+        # So, we don't need the whole history --- just the parent. So at
+        # each step, we take a parent, and apply one or more extensions to
+        # it.
+        self._parents, self._states = self._states, self._parents
+        self._parent_histories, self.histories = self.histories, self._parent_histories
+        cdef weight_t score
+        cdef int p_i
+        cdef int i = 0
+        cdef class_t clas
+        cdef _State* parent
+        cdef _State* state
+        cdef hash_t key
+        cdef PreshMap seen_states = PreshMap(self.width)
+        cdef uint64_t is_seen
+        cdef uint64_t one = 1
+        while i < self.width and not q.empty():
+            data = q.top()
+            p_i = data.second / self.nr_class
+            clas = data.second % self.nr_class
+            score = data.first
+            q.pop()
+            parent = &self._parents[p_i]
+            # Indicates terminal state reached; i.e. state is done
+            if parent.is_done:
+                # Now parent will not be changed, so we don't have to copy.
+                # Once finished, should also be unbranching.
+                self._states[i], parent[0] = parent[0], self._states[i]
+                parent.i = self._states[i].i
+                parent.t = self._states[i].t
+                parent.is_done = self._states[i].is_done
+                self._states[i].score = score
+                self.histories[i] = list(self._parent_histories[p_i])
+                i += 1
+            else:
+                state = &self._states[i]
+                # The supplied transition function should adjust the destination
+                # state to be the result of applying the class to the source state
+                transition_func(state.content, parent.content, clas, extra_args)
+                key = hash_func(state.content, extra_args) if hash_func is not NULL else 0
+                is_seen = <uint64_t>seen_states.get(key)
+                if key == 0 or key == 1 or not is_seen:
+                    if key != 0 and key != 1:
+                        seen_states.set(key, <void*>one)
+                    state.score = score
+                    state.loss = parent.loss + costs[p_i][clas]
+                    self.histories[i] = list(self._parent_histories[p_i])
+                    self.histories[i].append(clas)
+                    i += 1
+        del q
+        self.size = i
+        assert self.size >= 1
+        for i in range(self.width):
+            memset(self.scores[i], 0, sizeof(weight_t) * self.nr_class)
+            memset(self.costs[i], 0, sizeof(weight_t) * self.nr_class)
+            memset(self.is_valid[i], 0, sizeof(int) * self.nr_class)
+        self.t += 1
+
+    cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1:
+        cdef int i
+        for i in range(self.size):
+            if not self._states[i].is_done:
+                self._states[i].is_done = finish_func(self._states[i].content, extra_args)
+        for i in range(self.size):
+            if not self._states[i].is_done:
+                self.is_done = False
+                break
+        else:
+            self.is_done = True
+
+    @cython.cdivision(True)
+    cdef int _fill(self, Queue* q, weight_t** scores, int** is_valid) except -1:
+        """Populate the queue from a k * n matrix of scores, where k is the
+        beam-width, and n is the number of classes.
+        """
+        cdef Entry entry
+        cdef weight_t score
+        cdef _State* s
+        cdef int i, j, move_id
+        assert self.size >= 1
+        cdef vector[Entry] entries
+        for i in range(self.size):
+            s = &self._states[i]
+            move_id = i * self.nr_class
+            if s.is_done:
+                # Update score by path average, following TACL '13 paper.
+                if self.histories[i]:
+                    entry.first = s.score + (s.score / self.t)
+                else:
+                    entry.first = s.score
+                entry.second = move_id
+                entries.push_back(entry)
+            else:
+                for j in range(self.nr_class):
+                    if is_valid[i][j]:
+                        entry.first = s.score + scores[i][j]
+                        entry.second = move_id + j
+                        entries.push_back(entry)
+        cdef double max_, Z, cutoff
+        if self.min_density == 0.0:
+            for i in range(entries.size()):
+                q.push(entries[i])
+        elif not entries.empty():
+            max_ = entries[0].first
+            Z = 0.
+            cutoff = 0.
+            # Softmax into probabilities, so we can prune
+            for i in range(entries.size()):
+                if entries[i].first > max_:
+                    max_ = entries[i].first
+            for i in range(entries.size()):
+                Z += exp(entries[i].first-max_)
+            cutoff = (1. / Z) * self.min_density
+            for i in range(entries.size()):
+                prob = exp(entries[i].first-max_) / Z
+                if prob >= cutoff:
+                    q.push(entries[i])
+
+
+cdef class MaxViolation:
+    def __init__(self):
+        self.p_score = 0.0
+        self.g_score = 0.0
+        self.Z = 0.0
+        self.gZ = 0.0
+        self.delta = -1
+        self.cost = 0
+        self.p_hist = []
+        self.g_hist = []
+        self.p_probs = []
+        self.g_probs = []
+
+    cpdef int check(self, Beam pred, Beam gold) except -1:
+        cdef _State* p = &pred._states[0]
+        cdef _State* g = &gold._states[0]
+        cdef weight_t d = p.score - g.score
+        if p.loss >= 1 and (self.cost == 0 or d > self.delta):
+            self.cost = p.loss
+            self.delta = d
+            self.p_hist = list(pred.histories[0])
+            self.g_hist = list(gold.histories[0])
+            self.p_score = p.score
+            self.g_score = g.score
+            self.Z = 1e-10
+            self.gZ = 1e-10
+            for i in range(pred.size):
+                if pred._states[i].loss > 0:
+                    self.Z += exp(pred._states[i].score)
+            for i in range(gold.size):
+                if gold._states[i].loss == 0:
+                    prob = exp(gold._states[i].score)
+                    self.Z += prob
+                    self.gZ += prob
+
+    cpdef int check_crf(self, Beam pred, Beam gold) except -1:
+        d = pred.score - gold.score
+        seen_golds = set([tuple(gold.histories[i]) for i in range(gold.size)])
+        if pred.loss > 0 and (self.cost == 0 or d > self.delta):
+            p_hist = []
+            p_scores = []
+            g_hist = []
+            g_scores = []
+            for i in range(pred.size):
+                if pred._states[i].loss > 0:
+                    p_scores.append(pred._states[i].score)
+                    p_hist.append(list(pred.histories[i]))
+                # This can happen from non-monotonic actions
+                # If we find a better gold analysis this way, be sure to keep it.
+                elif pred._states[i].loss <= 0 \
+                and tuple(pred.histories[i]) not in seen_golds:
+                    g_scores.append(pred._states[i].score)
+                    g_hist.append(list(pred.histories[i]))
+            for i in range(gold.size):
+                if gold._states[i].loss == 0:
+                    g_scores.append(gold._states[i].score)
+                    g_hist.append(list(gold.histories[i]))
+
+            all_probs = _softmax(p_scores + g_scores)
+            p_probs = all_probs[:len(p_scores)]
+            g_probs_all = all_probs[len(p_scores):]
+            g_probs = _softmax(g_scores)
+
+            self.cost = pred.loss
+            self.delta = d
+            self.p_hist = p_hist
+            self.g_hist = g_hist
+            # TODO: These variables are misnamed! These are the gradients of the loss.
+            self.p_probs = p_probs
+            # Intuition here:
+            # The gradient of the loss is:
+            # P(model) - P(truth)
+            # Normally, P(truth) is 1 for the gold
+            # But, if we want to do the "partial credit" scheme, we want
+            # to create a distribution over the gold, proportional to the scores
+            # awarded.
+ self.g_probs = [x-y for x, y in zip(g_probs_all, g_probs)] + + +def _softmax(nums): + if not nums: + return [] + max_ = max(nums) + nums = [(exp(n-max_) if n is not None else None) for n in nums] + Z = sum(n for n in nums if n is not None) + return [(n/Z if n is not None else None) for n in nums] diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py index 9676e2194..0531d4ba5 100644 --- a/spacy/pipeline/edit_tree_lemmatizer.py +++ b/spacy/pipeline/edit_tree_lemmatizer.py @@ -5,8 +5,9 @@ from itertools import islice import numpy as np import srsly -from thinc.api import Config, Model, SequenceCategoricalCrossentropy +from thinc.api import Config, Model from thinc.types import ArrayXd, Floats2d, Ints1d +from thinc.legacy import LegacySequenceCategoricalCrossentropy from ._edit_tree_internals.edit_trees import EditTrees from ._edit_tree_internals.schemas import validate_edit_tree @@ -129,7 +130,9 @@ class EditTreeLemmatizer(TrainablePipe): self, examples: Iterable[Example], scores: List[Floats2d] ) -> Tuple[float, List[Floats2d]]: validate_examples(examples, "EditTreeLemmatizer.get_loss") - loss_func = SequenceCategoricalCrossentropy(normalize=False, missing_value=-1) + loss_func = LegacySequenceCategoricalCrossentropy( + normalize=False, missing_value=-1 + ) truths = [] for eg in examples: diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 782a1dabe..293add9e1 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -1,7 +1,8 @@ # cython: infer_types=True, profile=True, binding=True from typing import Callable, Dict, Iterable, List, Optional, Union import srsly -from thinc.api import SequenceCategoricalCrossentropy, Model, Config +from thinc.api import Model, Config +from thinc.legacy import LegacySequenceCategoricalCrossentropy from thinc.types import Floats2d, Ints1d from itertools import islice @@ -290,7 +291,7 @@ class Morphologizer(Tagger): DOCS: https://spacy.io/api/morphologizer#get_loss """ validate_examples(examples, "Morphologizer.get_loss") - loss_func = SequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False) + loss_func = LegacySequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False) truths = [] for eg in examples: eg_truths = [] diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 93a7ee796..42feeb277 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -3,7 +3,9 @@ from typing import Dict, Iterable, Optional, Callable, List, Union from itertools import islice import srsly -from thinc.api import Model, SequenceCategoricalCrossentropy, Config +from thinc.api import Model, Config +from thinc.legacy import LegacySequenceCategoricalCrossentropy + from thinc.types import Floats2d, Ints1d from ..tokens.doc cimport Doc @@ -161,7 +163,7 @@ class SentenceRecognizer(Tagger): """ validate_examples(examples, "SentenceRecognizer.get_loss") labels = self.labels - loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False) + loss_func = LegacySequenceCategoricalCrossentropy(names=labels, normalize=False) truths = [] for eg in examples: eg_truth = [] diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 3b4715ce5..e12f116af 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -2,7 +2,8 @@ from typing import Callable, Dict, Iterable, List, Optional, Union import numpy import srsly -from thinc.api import Model, set_dropout_rate, 
SequenceCategoricalCrossentropy, Config +from thinc.api import Model, set_dropout_rate, Config +from thinc.legacy import LegacySequenceCategoricalCrossentropy from thinc.types import Floats2d, Ints1d import warnings from itertools import islice @@ -244,7 +245,7 @@ class Tagger(TrainablePipe): DOCS: https://spacy.io/api/tagger#rehearse """ - loss_func = SequenceCategoricalCrossentropy() + loss_func = LegacySequenceCategoricalCrossentropy() if losses is None: losses = {} losses.setdefault(self.name, 0.0) @@ -275,7 +276,7 @@ class Tagger(TrainablePipe): DOCS: https://spacy.io/api/tagger#get_loss """ validate_examples(examples, "Tagger.get_loss") - loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"]) + loss_func = LegacySequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"]) # Convert empty tag "" to missing value None so that both misaligned # tokens and tokens with missing annotation have the default missing # value None. diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 340334b1a..9d7b258c6 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -10,12 +10,12 @@ import random import srsly from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps -from thinc.extra.search cimport Beam import numpy.random import numpy import warnings from ._parser_internals.stateclass cimport StateClass +from ._parser_internals.search cimport Beam from ..ml.parser_model cimport alloc_activations, free_activations from ..ml.parser_model cimport predict_states, arg_max_if_valid from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 2be286a57..b9c4ef715 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -1,6 +1,10 @@ import pytest from spacy.util import get_lang_class +import functools from hypothesis import settings +import inspect +import importlib +import sys # Functionally disable deadline settings for tests # to prevent spurious test failures in CI builds. @@ -47,6 +51,33 @@ def pytest_runtest_setup(item): pytest.skip("not referencing any issues") +# Decorator for Cython-built tests +# https://shwina.github.io/cython-testing/ +def cytest(func): + """ + Wraps `func` in a plain Python function. + """ + + @functools.wraps(func) + def wrapped(*args, **kwargs): + bound = inspect.signature(func).bind(*args, **kwargs) + return func(*bound.args, **bound.kwargs) + + return wrapped + + +def register_cython_tests(cython_mod_name: str, test_mod_name: str): + """ + Registers all callables with name `test_*` in Cython module `cython_mod_name` + as attributes in module `test_mod_name`, making them discoverable by pytest. 
+ """ + cython_mod = importlib.import_module(cython_mod_name) + for name in dir(cython_mod): + item = getattr(cython_mod, name) + if callable(item) and name.startswith("test_"): + setattr(sys.modules[test_mod_name], name, item) + + # Fixtures for language tokenizers (languages sorted alphabetically) diff --git a/spacy/tests/parser/_search.pyx b/spacy/tests/parser/_search.pyx new file mode 100644 index 000000000..23fc81644 --- /dev/null +++ b/spacy/tests/parser/_search.pyx @@ -0,0 +1,119 @@ +# cython: infer_types=True, binding=True +from spacy.pipeline._parser_internals.search cimport Beam, MaxViolation +from spacy.typedefs cimport class_t, weight_t +from cymem.cymem cimport Pool + +from ..conftest import cytest +import pytest + +cdef struct TestState: + int length + int x + Py_UNICODE* string + + +cdef int transition(void* dest, void* src, class_t clas, void* extra_args) except -1: + dest_state = dest + src_state = src + dest_state.length = src_state.length + dest_state.x = src_state.x + dest_state.x += clas + if extra_args != NULL: + dest_state.string = extra_args + else: + dest_state.string = src_state.string + + +cdef void* initialize(Pool mem, int n, void* extra_args) except NULL: + state = mem.alloc(1, sizeof(TestState)) + state.length = n + state.x = 1 + if extra_args == NULL: + state.string = u'default' + else: + state.string = extra_args + return state + + +cdef int destroy(Pool mem, void* state, void* extra_args) except -1: + state = state + mem.free(state) + +@cytest +@pytest.mark.parametrize("nr_class,beam_width", + [ + (2, 3), + (3, 6), + (4, 20), + ] +) +def test_init(nr_class, beam_width): + b = Beam(nr_class, beam_width) + assert b.size == 1 + assert b.width == beam_width + assert b.nr_class == nr_class + +@cytest +def test_init_violn(): + MaxViolation() + +@cytest +@pytest.mark.parametrize("nr_class,beam_width,length", + [ + (2, 3, 3), + (3, 6, 15), + (4, 20, 32), + ] +) +def test_initialize(nr_class, beam_width, length): + b = Beam(nr_class, beam_width) + b.initialize(initialize, destroy, length, NULL) + for i in range(b.width): + s = b.at(i) + assert s.length == length, s.length + assert s.string == 'default' + + +@cytest +@pytest.mark.parametrize("nr_class,beam_width,length,extra", + [ + (2, 3, 4, None), + (3, 6, 15, u"test beam 1"), + ] +) +def test_initialize_extra(nr_class, beam_width, length, extra): + b = Beam(nr_class, beam_width) + if extra is None: + b.initialize(initialize, destroy, length, NULL) + else: + b.initialize(initialize, destroy, length, extra) + for i in range(b.width): + s = b.at(i) + assert s.length == length + + +@cytest +@pytest.mark.parametrize("nr_class,beam_width,length", + [ + (3, 6, 15), + (4, 20, 32), + ] +) +def test_transition(nr_class, beam_width, length): + b = Beam(nr_class, beam_width) + b.initialize(initialize, destroy, length, NULL) + b.set_cell(0, 2, 30, True, 0) + b.set_cell(0, 1, 42, False, 0) + b.advance(transition, NULL, NULL) + assert b.size == 1, b.size + assert b.score == 30, b.score + s = b.at(0) + assert s.x == 3 + assert b._states[0].score == 30, b._states[0].score + b.set_cell(0, 1, 10, True, 0) + b.set_cell(0, 2, 20, True, 0) + b.advance(transition, NULL, NULL) + assert b._states[0].score == 50, b._states[0].score + assert b._states[1].score == 40 + s = b.at(0) + assert s.x == 5 diff --git a/spacy/tests/parser/test_search.py b/spacy/tests/parser/test_search.py new file mode 100644 index 000000000..136c3a11b --- /dev/null +++ b/spacy/tests/parser/test_search.py @@ -0,0 +1,3 @@ +from ..conftest import register_cython_tests 
+ +register_cython_tests("spacy.tests.parser._search", __name__)
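Reviewer note on the test wiring, which spans three files: `_search.pyx` is compiled like any other extension (it was added to `MOD_NAMES` in `setup.py` above), `@cytest` wraps each compiled test in a plain Python function whose signature `pytest.mark.parametrize` can bind, and `register_cython_tests` copies the compiled `test_*` callables into an ordinary module where pytest's collector can find them. A sketch of how a further Cython test module would hook in, using the hypothetical name `_mymod`:

```python
# spacy/tests/parser/test_mymod.py -- hypothetical companion stub following
# the test_search.py pattern above; a _mymod.pyx module would also need an
# entry in MOD_NAMES in setup.py so it gets compiled.
from ..conftest import register_cython_tests

# Imports spacy.tests.parser._mymod and re-exports every callable whose name
# starts with "test_" as an attribute of this module, so pytest collects
# them as if they were defined here.
register_cython_tests("spacy.tests.parser._mymod", __name__)
```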
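One more note, on `Beam._fill`: the `min_density` pruning reads like a softmax cutoff, but since `cutoff = min_density / Z` and each candidate's probability is `exp(score - max) / Z`, the normalizer cancels and the test reduces to keeping candidates within `-log(min_density)` of the best expansion. A plain-Python model of that logic, with queue entries simplified to bare scores (the real entries also carry a parent-and-class index):

```python
import math

def prune(scores, min_density):
    """Mirror Beam._fill's density pruning over candidate expansion scores."""
    if min_density == 0.0 or not scores:
        return list(scores)
    max_ = max(scores)
    Z = sum(math.exp(s - max_) for s in scores)
    cutoff = (1.0 / Z) * min_density
    # exp(s - max_) / Z >= min_density / Z  <=>  s >= max_ + log(min_density)
    return [s for s in scores if math.exp(s - max_) / Z >= cutoff]

# With min_density=0.1, anything more than log(10) ~ 2.3 below the best goes.
assert prune([0.0, -0.5, -3.0], 0.1) == [0.0, -0.5]
```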