Remove obsolete parser.pyx

2025-10-28 22:47:52 +03:00 · 2017-10-26 12:42:05 +02:00 · 2017-10-26 12:42:05 +02:00 · 33f8c58782
commit 33f8c58782
parent a8abc47811
6 changed files with 0 additions and 1477 deletions
--- a/spacy/syntax/_parse_features.pxd
+++ b/spacy/syntax/_parse_features.pxd
@ -1,259 +0,0 @@
 from thinc.typedefs cimport atom_t
 from .stateclass cimport StateClass
 from ._state cimport StateC
 # Declaration only; the implementation is in _parse_features.pyx. Fills
 # `context` with the atomic feature values read from `state`, indexed by the
 # enum members declared below. Declared nogil so it can be called without
 # holding the GIL.
 cdef int fill_context(atom_t* context, const StateC* state) nogil
 # Context elements
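 # Ensure each token's attributes are listed in the same order for every
 # slot: w, W, p, c, c4, c6, L, prefix, suffix, shape, ne_iob, ne_type. A slot
 # is filled by offsetting from its first enum member (the `w` entry), so the
 # per-token order must match the order in which the values are written.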
 # Tokens are listed in left-to-right order.
 #cdef size_t* SLOTS = [
 #    S2w, S1w,
 #    S0l0w, S0l2w, S0lw,
 #    S0w,
 #    S0r0w, S0r2w, S0rw,
 #    N0l0w, N0l2w, N0lw,
 #    P2w, P1w,
 #    N0w, N1w, N2w, N3w, 0
 #]
 # NB: The order of the enum is _NOT_ arbitrary!!
 # Indices into the atom_t context array filled by fill_context(). Each token
 # slot below owns 12 consecutive members, matching the offsets 0-11 that
 # fill_token() writes starting at the slot's `w` member:
 #   w=lex.orth, W=lemma, p=tag, c=lex.cluster, c4=cluster & 15,
 #   c6=cluster & 63, L=dep (0 when token.head == 0), prefix=lex.prefix,
 #   suffix=lex.suffix, shape=lex.shape, ne_iob=ent_iob, ne_type=ent_type.
 # CONTEXT_SIZE is the final member and is used as the context array length.
 cpdef enum:
    # S2: filled from st.S_(2)
    S2w
    S2W
    S2p
    S2c
    S2c4
    S2c6
    S2L
    S2_prefix
    S2_suffix
    S2_shape
    S2_ne_iob
    S2_ne_type
    # S1: filled from st.S_(1)
    S1w
    S1W
    S1p
    S1c
    S1c4
    S1c6
    S1L
    S1_prefix
    S1_suffix
    S1_shape
    S1_ne_iob
    S1_ne_type
    # S1r: filled from st.R_(st.S(1), 1)
    S1rw
    S1rW
    S1rp
    S1rc
    S1rc4
    S1rc6
    S1rL
    S1r_prefix
    S1r_suffix
    S1r_shape
    S1r_ne_iob
    S1r_ne_type
    # S0l: filled from st.L_(st.S(0), 1)
    S0lw
    S0lW
    S0lp
    S0lc
    S0lc4
    S0lc6
    S0lL
    S0l_prefix
    S0l_suffix
    S0l_shape
    S0l_ne_iob
    S0l_ne_type
    # S0l2: filled from st.L_(st.S(0), 2)
    S0l2w
    S0l2W
    S0l2p
    S0l2c
    S0l2c4
    S0l2c6
    S0l2L
    S0l2_prefix
    S0l2_suffix
    S0l2_shape
    S0l2_ne_iob
    S0l2_ne_type
    # S0: filled from st.S_(0)
    S0w
    S0W
    S0p
    S0c
    S0c4
    S0c6
    S0L
    S0_prefix
    S0_suffix
    S0_shape
    S0_ne_iob
    S0_ne_type
    # S0r2: filled from st.R_(st.S(0), 2)
    S0r2w
    S0r2W
    S0r2p
    S0r2c
    S0r2c4
    S0r2c6
    S0r2L
    S0r2_prefix
    S0r2_suffix
    S0r2_shape
    S0r2_ne_iob
    S0r2_ne_type
    # S0r: filled from st.R_(st.S(0), 1)
    S0rw
    S0rW
    S0rp
    S0rc
    S0rc4
    S0rc6
    S0rL
    S0r_prefix
    S0r_suffix
    S0r_shape
    S0r_ne_iob
    S0r_ne_type
    # N0l2: filled from st.L_(st.B(0), 2)
    N0l2w
    N0l2W
    N0l2p
    N0l2c
    N0l2c4
    N0l2c6
    N0l2L
    N0l2_prefix
    N0l2_suffix
    N0l2_shape
    N0l2_ne_iob
    N0l2_ne_type
    # N0l: filled from st.L_(st.B(0), 1)
    N0lw
    N0lW
    N0lp
    N0lc
    N0lc4
    N0lc6
    N0lL
    N0l_prefix
    N0l_suffix
    N0l_shape
    N0l_ne_iob
    N0l_ne_type
    # N0: filled from st.B_(0)
    N0w
    N0W
    N0p
    N0c
    N0c4
    N0c6
    N0L
    N0_prefix
    N0_suffix
    N0_shape
    N0_ne_iob
    N0_ne_type
    # N1: filled from st.B_(1)
    N1w
    N1W
    N1p
    N1c
    N1c4
    N1c6
    N1L
    N1_prefix
    N1_suffix
    N1_shape
    N1_ne_iob
    N1_ne_type
    # N2: filled from st.B_(2)
    N2w
    N2W
    N2p
    N2c
    N2c4
    N2c6
    N2L
    N2_prefix
    N2_suffix
    N2_shape
    N2_ne_iob
    N2_ne_type
    # P1: filled from st.safe_get(st.B(0)-1)
    P1w
    P1W
    P1p
    P1c
    P1c4
    P1c6
    P1L
    P1_prefix
    P1_suffix
    P1_shape
    P1_ne_iob
    P1_ne_type
    # P2: filled from st.safe_get(st.B(0)-2)
    P2w
    P2W
    P2p
    P2c
    P2c4
    P2c6
    P2L
    P2_prefix
    P2_suffix
    P2_shape
    P2_ne_iob
    P2_ne_type
    # E0: filled from st.E_(0)
    E0w
    E0W
    E0p
    E0c
    E0c4
    E0c6
    E0L
    E0_prefix
    E0_suffix
    E0_shape
    E0_ne_iob
    E0_ne_type
    # E1: filled from st.E_(1)
    E1w
    E1W
    E1p
    E1c
    E1c4
    E1c6
    E1L
    E1_prefix
    E1_suffix
    E1_shape
    E1_ne_iob
    E1_ne_type
    # Misc features at the end
    # dist: min_(st.B(0) - st.E(0), 5) when the stack is non-empty and the
    # state is not at end-of-line, else 0.
    dist
    # *lv / *rv: min_(st.n_L(...), 5) / min_(st.n_R(...), 5) for B(0), S(0), S(1).
    N0lv
    S0lv
    S0rv
    S1lv
    S1rv
    # *_has_head: 0 when the stack is too shallow, else st.has_head(...) + 1.
    S0_has_head
    S1_has_head
    S2_has_head
    # Sentinel: number of atoms; used as the length of context arrays.
    CONTEXT_SIZE
--- a/spacy/syntax/_parse_features.pyx
+++ b/spacy/syntax/_parse_features.pyx
@ -1,419 +0,0 @@
 """
 Fill an array, context, with every _atomic_ value our features reference.
 We then write the _actual features_ as tuples of the atoms. The machinery
 that translates from the tuples to feature-extractors (which pick the values
 out of "context") is in features/extractor.pyx
 The atomic feature names are listed in a big enum, so that the feature tuples
 can refer to them.
 """
 # coding: utf-8
 from __future__ import unicode_literals
 from libc.string cimport memset
 from itertools import combinations
 from cymem.cymem cimport Pool
 from ..structs cimport TokenC
 from .stateclass cimport StateClass
 from ._state cimport StateC
 cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
    # Write the 12 atoms describing `token` into context[0] .. context[11].
    # A NULL token yields twelve zeros, so every entry is always assigned.
    cdef int slot
    if token is NULL:
        for slot in range(12):
            context[slot] = 0
        return
    context[0] = token.lex.orth
    context[1] = token.lemma
    context[2] = token.tag
    context[3] = token.lex.cluster
    # The cluster string was read in little-endian, so masking with (2**n)-1
    # keeps the first n bits of the cluster:
    #   15 == 0b1111   -> first 4 bits
    #   63 == 0b111111 -> first 6 bits
    # The bitwise AND keeps exactly those bits of the source that are set
    # within the mask.
    context[4] = token.lex.cluster & 15
    context[5] = token.lex.cluster & 63
    # The dependency label is only reported when the token has a head offset.
    context[6] = token.dep if token.head != 0 else 0
    context[7] = token.lex.prefix
    context[8] = token.lex.suffix
    context[9] = token.lex.shape
    context[10] = token.ent_iob
    context[11] = token.ent_type
 cdef int fill_context(atom_t* ctxt, const StateC* st) nogil:
    # Populate every atom of `ctxt` from the parser state `st`.
    #
    # Every element is assigned explicitly rather than relying on a memset:
    # a feature that is accidentally left unset then shows up as an obvious
    # regression instead of a barely noticeable accuracy change.

    # Token slots: 12 atoms each, written by fill_token at the slot's `w` index.
    fill_token(&ctxt[S2w], st.S_(2))
    fill_token(&ctxt[S1w], st.S_(1))
    fill_token(&ctxt[S1rw], st.R_(st.S(1), 1))
    fill_token(&ctxt[S0lw], st.L_(st.S(0), 1))
    fill_token(&ctxt[S0l2w], st.L_(st.S(0), 2))
    fill_token(&ctxt[S0w], st.S_(0))
    fill_token(&ctxt[S0r2w], st.R_(st.S(0), 2))
    fill_token(&ctxt[S0rw], st.R_(st.S(0), 1))
    fill_token(&ctxt[N0lw], st.L_(st.B(0), 1))
    fill_token(&ctxt[N0l2w], st.L_(st.B(0), 2))
    fill_token(&ctxt[N0w], st.B_(0))
    fill_token(&ctxt[N1w], st.B_(1))
    fill_token(&ctxt[N2w], st.B_(2))
    fill_token(&ctxt[P1w], st.safe_get(st.B(0)-1))
    fill_token(&ctxt[P2w], st.safe_get(st.B(0)-2))
    fill_token(&ctxt[E0w], st.E_(0))
    fill_token(&ctxt[E1w], st.E_(1))

    # Distance atom: only computed with a non-empty stack before end-of-line.
    ctxt[dist] = (min_(st.B(0) - st.E(0), 5)
                  if (st.stack_depth() >= 1 and not st.eol()) else 0)

    # Left/right child counts, each combined with 5 through min_.
    ctxt[N0lv] = min_(st.n_L(st.B(0)), 5)
    ctxt[S0lv] = min_(st.n_L(st.S(0)), 5)
    ctxt[S0rv] = min_(st.n_R(st.S(0)), 5)
    ctxt[S1lv] = min_(st.n_L(st.S(1)), 5)
    ctxt[S1rv] = min_(st.n_R(st.S(1)), 5)

    # has_head atoms: 0 when the stack is too shallow for that position,
    # otherwise has_head(...) + 1.
    ctxt[S0_has_head] = (st.has_head(st.S(0)) + 1) if st.stack_depth() >= 1 else 0
    ctxt[S1_has_head] = (st.has_head(st.S(1)) + 1) if st.stack_depth() >= 2 else 0
    ctxt[S2_has_head] = (st.has_head(st.S(2)) + 1) if st.stack_depth() >= 3 else 0
 cdef inline int min_(int a, int b) nogil:
    # Return the smaller of `a` and `b`.
    #
    # The previous comparison (`a > b`) returned the larger value, so calls
    # such as min_(x, 5) acted as a floor of 5 instead of a cap at 5.
    # NOTE(review): weights trained with the old behaviour saw the uncapped
    # values; retrain or re-evaluate existing models after this fix.
    return a if a < b else b
 # Feature templates returned by get_templates('ner'). Each entry is a tuple of
 # context-array indices (enum members) forming one conjunction feature.
 # NOTE(review): several templates occur more than once, e.g. (N0_shape,),
 # (P1_ne_iob,), (P1_ne_iob, P1_ne_type), (E0p, E1W) and
 # (N0w, P1_ne_iob, P1_ne_type). Unclear whether this is intentional; confirm
 # against trained models before de-duplicating.
 # NOTE(review): (E0w, P1c) uses the lower-case `w` atom while neighbouring
 # templates use `W`; possibly a typo, left unchanged.
 ner = (
    (N0W,),
    (P1W,),
    (N1W,),
    (P2W,),
    (N2W,),
    (P1W, N0W,),
    (N0W, N1W),
    (N0_prefix,),
    (N0_suffix,),
    (P1_shape,),
    (N0_shape,),
    (N1_shape,),
    (P1_shape, N0_shape,),
    (N0_shape, P1_shape,),
    (P1_shape, N0_shape, N1_shape),
    (N2_shape,),
    (P2_shape,),
    #(P2_norm, P1_norm, W_norm),
    #(P1_norm, W_norm, N1_norm),
    #(W_norm, N1_norm, N2_norm)
    (P2p,),
    (P1p,),
    (N0p,),
    (N1p,),
    (N2p,),
    (P1p, N0p),
    (N0p, N1p),
    (P2p, P1p, N0p),
    (P1p, N0p, N1p),
    (N0p, N1p, N2p),
    (P2c,),
    (P1c,),
    (N0c,),
    (N1c,),
    (N2c,),
    (P1c, N0c),
    (N0c, N1c),
    (E0W,),
    (E0c,),
    (E0p,),
    (E0W, N0W),
    (E0c, N0W),
    (E0p, N0W),
    (E0p, P1p, N0p),
    (E0c, P1c, N0c),
    (E0w, P1c),
    (E0p, P1p),
    (E0c, P1c),
    (E0p, E1p),
    (E0c, P1p),
    (E1W,),
    (E1c,),
    (E1p,),
    (E0W, E1W),
    (E0W, E1p,),
    (E0p, E1W,),
    (E0p, E1W),
    (P1_ne_iob,),
    (P1_ne_iob, P1_ne_type),
    (N0w, P1_ne_iob, P1_ne_type),
    (N0_shape,),
    (N1_shape,),
    (N2_shape,),
    (P1_shape,),
    (P2_shape,),
    (N0_prefix,),
    (N0_suffix,),
    (P1_ne_iob,),
    (P2_ne_iob,),
    (P1_ne_iob, P2_ne_iob),
    (P1_ne_iob, P1_ne_type),
    (P2_ne_iob, P2_ne_type),
    (N0w, P1_ne_iob, P1_ne_type),
    (N0w, N1w),
 )
 # Templates built from the atoms of a single token slot. Returned on their
 # own by get_templates('debug') and as the first group of the default set.
 unigrams = (
    (S2W, S2p),
    (S2c6, S2p),
    (S1W, S1p),
    (S1c6, S1p),
    (S0W, S0p),
    (S0c6, S0p),
    (N0W, N0p),
    (N0p,),
    (N0c,),
    (N0c6, N0p),
    (N0L,),
    (N1W, N1p),
    (N1c6, N1p),
    (N2W, N2p),
    (N2c6, N2p),
    (S0r2W, S0r2p),
    (S0r2c6, S0r2p),
    (S0r2L,),
    (S0rW, S0rp),
    (S0rc6, S0rp),
    (S0rL,),
    (S0l2W, S0l2p),
    (S0l2c6, S0l2p),
    (S0l2L,),
    (S0lW, S0lp),
    (S0lc6, S0lp),
    (S0lL,),
    (N0l2W, N0l2p),
    (N0l2c6, N0l2p),
    (N0l2L,),
    (N0lW, N0lp),
    (N0lc6, N0lp),
    (N0lL,),
 )
 # Templates combining atoms of the S0 slot (and its children) with atoms of
 # the N0 slot (and its children). Part of the default template set.
 s0_n0 = (
    (S0W, S0p, N0W, N0p),
    (S0c, S0p, N0c, N0p),
    (S0c6, S0p, N0c6, N0p),
    (S0c4, S0p, N0c4, N0p),
    (S0p, N0p),
    (S0W, N0p),
    (S0p, N0W),
    (S0W, N0c),
    (S0c, N0W),
    (S0p, N0c),
    (S0c, N0p),
    (S0W, S0rp, N0p),
    (S0p, S0rp, N0p),
    (S0p, N0lp, N0W),
    (S0p, N0lp, N0p),
    (S0L, N0p),
    (S0p, S0rL, N0p),
    (S0p, N0lL, N0p),
    (S0p, S0rv, N0p),
    (S0p, N0lv, N0p),
    (S0c6, S0rL, S0r2L, N0p),
    (S0p, N0lL, N0l2L, N0p),
 )
 # Templates combining atoms of the S1 and S0 slots. Part of the default
 # template set.
 # NOTE(review): (S1p, S0p) is listed both first and last; confirm whether the
 # duplicate is intentional before removing it.
 s1_s0 = (
    (S1p, S0p),
    (S1p, S0p, S0_has_head),
    (S1W, S0p),
    (S1W, S0p, S0_has_head),
    (S1c, S0p),
    (S1c, S0p, S0_has_head),
    (S1p, S1rL, S0p),
    (S1p, S1rL, S0p, S0_has_head),
    (S1p, S0lL, S0p),
    (S1p, S0lL, S0p, S0_has_head),
    (S1p, S0lL, S0l2L, S0p),
    (S1p, S0lL, S0l2L, S0p, S0_has_head),
    (S1L, S0L, S0W),
    (S1L, S0L, S0p),
    (S1p, S1L, S0L, S0p),
    (S1p, S0p),
 )
 # Templates combining atoms of the S1 slot with atoms of the N0 slot. Part
 # of the default template set.
 s1_n0 = (
    (S1p, N0p),
    (S1c, N0c),
    (S1c, N0p),
    (S1p, N0c),
    (S1W, S1p, N0p),
    (S1p, N0W, N0p),
    (S1c6, S1p, N0c6, N0p),
    (S1L, N0p),
    (S1p, S1rL, N0p),
    (S1p, S1rp, N0p),
 )
 # Templates combining atoms of the S0 slot with atoms of the N1 slot. Part
 # of the default template set.
 s0_n1 = (
    (S0p, N1p),
    (S0c, N1c),
    (S0c, N1p),
    (S0p, N1c),
    (S0W, S0p, N1p),
    (S0p, N1W, N1p),
    (S0c6, S0p, N1c6, N1p),
    (S0L, N1p),
    (S0p, S0rL, N1p),
 )
 # Templates combining atoms of the N0 and N1 slots. Part of the default
 # template set.
 n0_n1 = (
    (N0W, N0p, N1W, N1p),
    (N0W, N0p, N1p),
    (N0p, N1W, N1p),
    (N0c, N0p, N1c, N1p),
    (N0c6, N0p, N1c6, N1p),
    (N0c, N1c),
    (N0p, N1c),
 )
 # Templates over the misc atoms at the end of the enum (dist, child counts,
 # has_head flags), some conjoined with a tag atom. Part of the default set.
 tree_shape = (
    (dist,),
    (S0p, S0_has_head, S1_has_head, S2_has_head),
    (S0p, S0lv, S0rv),
    (N0p, N0lv),
 )
 # Templates conjoining three (or, with the extra word atom, four) atoms,
 # mostly tag and label atoms. Last group of the default template set.
 trigrams = (
    (N0p, N1p, N2p),
    (S0p, S0lp, S0l2p),
    (S0p, S0rp, S0r2p),
    (S0p, S1p, S2p),
    (S1p, S0p, N0p),
    (S0p, S0lp, N0p),
    (S0p, N0p, N0lp),
    (N0p, N0lp, N0l2p),
    (S0W, S0p, S0rL, S0r2L),
    (S0p, S0rL, S0r2L),
    (S0W, S0p, S0lL, S0l2L),
    (S0p, S0lL, S0l2L),
    (N0W, N0p, N0lL, N0l2L),
    (N0p, N0lL, N0l2L),
 )
 # Flat tuple of the `w` atom index of each token slot, in the same slot
 # order as `tags` and `labels`. The E0/E1 slots are not included.
 words = (
    S2w,
    S1w,
    S1rw,
    S0lw,
    S0l2w,
    S0w,
    S0r2w,
    S0rw,
    N0lw,
    N0l2w,
    N0w,
    N1w,
    N2w,
    P1w,
    P2w
 )
 # Flat tuple of the `p` (tag) atom index of each token slot, in the same
 # slot order as `words` and `labels`. The E0/E1 slots are not included.
 tags = (
    S2p,
    S1p,
    S1rp,
    S0lp,
    S0l2p,
    S0p,
    S0r2p,
    S0rp,
    N0lp,
    N0l2p,
    N0p,
    N1p,
    N2p,
    P1p,
    P2p
 )
 # Flat tuple of the `L` (dependency label) atom index of each token slot, in
 # the same slot order as `words` and `tags`. The E0/E1 slots are not included.
 labels = (
    S2L,
    S1L,
    S1rL,
    S0lL,
    S0l2L,
    S0L,
    S0r2L,
    S0rL,
    N0lL,
    N0l2L,
    N0L,
    N1L,
    N2L,
    P1L,
    P2L
 )
--- a/spacy/syntax/beam_parser.pxd
+++ b/spacy/syntax/beam_parser.pxd
@ -1,10 +0,0 @@
 from .parser cimport Parser
 from ..structs cimport TokenC
 from thinc.typedefs cimport weight_t
 # C-level declaration of BeamParser (implemented in beam_parser.pyx).
 cdef class BeamParser(Parser):
    # Beam width handed to the Beam objects the parser creates.
    cdef public int beam_width
    # Passed as `min_density` when the parsing beam is created.
    cdef public weight_t beam_density
    # GIL-holding implementation behind parseC; `except -1` lets Python
    # exceptions propagate out of it.
    cdef int _parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) except -1
--- a/spacy/syntax/beam_parser.pyx
+++ b/spacy/syntax/beam_parser.pyx
@ -1,239 +0,0 @@
 """
 MALT-style dependency parser
 """
 # cython: profile=True
 # cython: experimental_cpp_class_def=True
 # cython: cdivision=True
 # cython: infer_types=True
 # coding: utf-8
 from __future__ import unicode_literals, print_function
 cimport cython
 from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
 from libc.stdint cimport uint32_t, uint64_t
 from libc.string cimport memset, memcpy
 from libc.stdlib cimport rand
 from libc.math cimport log, exp, isnan, isinf
 from cymem.cymem cimport Pool, Address
 from murmurhash.mrmr cimport real_hash64 as hash64
 from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
 from thinc.linear.features cimport ConjunctionExtracter
 from thinc.structs cimport FeatureC, ExampleC
 from thinc.extra.search cimport Beam, MaxViolation
 from thinc.extra.eg cimport Example
 from thinc.extra.mb cimport Minibatch
 from ..structs cimport TokenC
 from ..tokens.doc cimport Doc
 from ..strings cimport StringStore
 from .transition_system cimport TransitionSystem, Transition
 from ..gold cimport GoldParse
 from . import _parse_features
 from ._parse_features cimport CONTEXT_SIZE
 from ._parse_features cimport fill_context
 from .stateclass cimport StateClass
 from .parser cimport Parser
 DEBUG = False
 def set_debug(val):
    """Rebind the module-level DEBUG flag to `val`."""
    globals()['DEBUG'] = val
 def get_templates(name):
    """Return the feature templates registered under `name`.

    'ner' selects the NER templates and 'debug' the unigram templates only.
    Any other name selects the full parser set: unigrams, s0_n0, s1_n0,
    s1_s0, s0_n1, n0_n1, tree_shape and trigrams, concatenated in that order.
    """
    templates = _parse_features
    if name == 'ner':
        return templates.ner
    if name == 'debug':
        return templates.unigrams
    groups = (templates.unigrams, templates.s0_n0, templates.s1_n0,
              templates.s1_s0, templates.s0_n1, templates.n0_n1,
              templates.tree_shape, templates.trigrams)
    combined = groups[0]
    for group in groups[1:]:
        combined = combined + group
    return combined
 cdef int BEAM_WIDTH = 16
 cdef weight_t BEAM_DENSITY = 0.001
 cdef class BeamParser(Parser):
    """Parser subclass that keeps a beam of candidate states
    (thinc.extra.search.Beam) while parsing and training.

    Keyword arguments `beam_width` and `beam_density` override the module
    defaults BEAM_WIDTH and BEAM_DENSITY; all arguments are also forwarded
    to Parser.__init__.
    """
    def __init__(self, *args, **kwargs):
        self.beam_width = kwargs.get('beam_width', BEAM_WIDTH)
        self.beam_density = kwargs.get('beam_density', BEAM_DENSITY)
        Parser.__init__(self, *args, **kwargs)

    cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil:
        # Beam search needs Python objects, so re-acquire the GIL and delegate.
        # NOTE(review): the result of _parseC is discarded and nothing is
        # returned explicitly here; confirm how errors raised in _parseC are
        # expected to reach the caller.
        with gil:
            self._parseC(tokens, length, nr_feat, self.moves.n_moves)

    cdef int _parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) except -1:
        # Run beam search over `tokens` and copy the analysis of the state at
        # beam index 0 back into the token array.
        cdef Beam beam = Beam(self.moves.n_moves, self.beam_width, min_density=self.beam_density)
        # TODO: How do we handle new labels here? This increases nr_class
        beam.initialize(self.moves.init_beam_state, length, tokens)
        beam.check_done(_check_final_state, NULL)
        if beam.is_done:
            _cleanup(beam)
            return 0
        while not beam.is_done:
            self._advance_beam(beam, None, False)
        state = <StateClass>beam.at(0)
        self.moves.finalize_state(state.c)
        for i in range(length):
            tokens[i] = state.c._sent[i]
        _cleanup(beam)

    def update(self, Doc tokens, GoldParse gold_parse, itn=0):
        """Update the model from one document and return the loss of the
        prediction beam.

        A prediction beam and a gold-constrained beam are advanced in
        lockstep; MaxViolation.check_crf records the histories that are then
        passed to the model's update_from_histories.

        Raises Exception if either beam ends up empty.
        """
        self.moves.preprocess_gold(gold_parse)
        cdef Beam pred = Beam(self.moves.n_moves, self.beam_width)
        pred.initialize(self.moves.init_beam_state, tokens.length, tokens.c)
        pred.check_done(_check_final_state, NULL)
        # Hack for NER
        for i in range(pred.size):
            stcls = <StateClass>pred.at(i)
            self.moves.initialize_state(stcls.c)

        cdef Beam gold = Beam(self.moves.n_moves, self.beam_width, min_density=0.0)
        gold.initialize(self.moves.init_beam_state, tokens.length, tokens.c)
        gold.check_done(_check_final_state, NULL)
        violn = MaxViolation()
        while not pred.is_done and not gold.is_done:
            # We search separately here, to allow for ambiguity in the gold parse.
            self._advance_beam(pred, gold_parse, False)
            self._advance_beam(gold, gold_parse, True)
            violn.check_crf(pred, gold)
            if pred.loss > 0 and pred.min_score > (gold.score + self.model.time):
                break
        else:
            # The non-monotonic oracle makes it difficult to ensure final costs are
            # correct. Therefore do final correction
            for i in range(pred.size):
                if self.moves.is_gold_parse(<StateClass>pred.at(i), gold_parse):
                    pred._states[i].loss = 0.0
                elif pred._states[i].loss == 0.0:
                    pred._states[i].loss = 1.0
            violn.check_crf(pred, gold)
        if pred.size < 1:
            raise Exception("No candidates", tokens.length)
        if gold.size < 1:
            raise Exception("No gold", tokens.length)
        if pred.loss == 0:
            self.model.update_from_histories(self.moves, tokens, [(0.0, [])])
        else:
            # The former `elif True:` made the trailing `else:` branch (a
            # +1/-1 update on the first histories) unreachable; it was removed.
            #_check_train_integrity(pred, gold, gold_parse, self.moves)
            histories = list(zip(violn.p_probs, violn.p_hist)) + \
                        list(zip(violn.g_probs, violn.g_hist))
            self.model.update_from_histories(self.moves, tokens, histories, min_grad=0.001**(itn+1))
        _cleanup(pred)
        _cleanup(gold)
        return pred.loss

    def _advance_beam(self, Beam beam, GoldParse gold, bint follow_gold):
        """Score every non-final state in `beam`, optionally apply gold costs,
        and advance the beam by one transition.

        When `gold` is given, costs are set for each non-final state. When
        `follow_gold` is also true, actions with cost >= 1 are marked invalid
        and an Exception("No gold") is raised if no valid action remains.
        """
        cdef atom_t[CONTEXT_SIZE] context
        cdef Pool mem = Pool()
        features = <FeatureC*>mem.alloc(self.model.nr_feat, sizeof(FeatureC))
        # The former `if False:` minibatch scoring path was unreachable and has
        # been removed; states are scored one at a time.
        for i in range(beam.size):
            stcls = <StateClass>beam.at(i)
            if not stcls.is_final():
                nr_feat = self.model.set_featuresC(context, features, stcls.c)
                self.moves.set_valid(beam.is_valid[i], stcls.c)
                self.model.set_scoresC(beam.scores[i], features, nr_feat)
        if gold is not None:
            n_gold = 0
            for i in range(beam.size):
                stcls = <StateClass>beam.at(i)
                if not stcls.c.is_final():
                    self.moves.set_costs(beam.is_valid[i], beam.costs[i], stcls, gold)
                    if follow_gold:
                        for j in range(self.moves.n_moves):
                            if beam.costs[i][j] >= 1:
                                beam.is_valid[i][j] = 0
                            n_gold += 1 if beam.is_valid[i][j] else 0
            if follow_gold and n_gold == 0:
                raise Exception("No gold")
        if follow_gold:
            # No hash function: states are not de-duplicated on the gold beam.
            beam.advance(_transition_state, NULL, <void*>self.moves.c)
        else:
            beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
        beam.check_done(_check_final_state, NULL)
 # These are passed as callbacks to thinc.search.Beam
 cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
    # Beam callback: make `_dest` a copy of `_src`, then apply transition
    # number `clas` from the transition table `_moves` to the copy.
    target = <StateClass>_dest
    source = <StateClass>_src
    transitions = <const Transition*>_moves
    target.clone(source)
    transitions[clas].do(target.c, transitions[clas].label)
 cdef int _check_final_state(void* _state, void* extra_args) except -1:
    # Beam callback: report whether the given state is final. `extra_args`
    # is unused.
    stcls = <StateClass>_state
    return stcls.is_final()
 def _cleanup(Beam beam):
    """Drop one reference to the content object of every state and parent
    slot of `beam` (NULL contents are skipped by Py_XDECREF)."""
    for slot in range(beam.width):
        state_content = <PyObject*>beam._states[slot].content
        Py_XDECREF(state_content)
        parent_content = <PyObject*>beam._parents[slot].content
        Py_XDECREF(parent_content)
 cdef hash_t _hash_state(void* _state, void* _) except 0:
    # Beam callback: hash of a state. Every final state hashes to 1; other
    # states use the hash of the underlying C state. The second argument is
    # unused.
    stcls = <StateClass>_state
    if stcls.c.is_final():
        return 1
    return stcls.c.hash()
 def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, TransitionSystem moves):
    """Debugging aid: check that beam losses agree with the gold parse.

    Raises Exception when a finished prediction-beam state with non-zero loss
    is actually a gold parse, or when a finished gold-beam state is not one.
    Diagnostic output is printed before raising.
    """
    for i in range(pred.size):
        # Only finished states that were charged a loss are suspicious.
        if not pred._states[i].is_done or pred._states[i].loss == 0:
            continue
        state = <StateClass>pred.at(i)
        if moves.is_gold_parse(state, gold_parse) == True:
            for dep in gold_parse.orig_annot:
                print(dep[1], dep[3], dep[4])
            print("Cost", pred._states[i].loss)
            for j in range(gold_parse.length):
                print(gold_parse.orig_annot[j][1], state.H(j), moves.strings[state.safe_get(j).dep])
            acts = [moves.c[clas].move for clas in pred.histories[i]]
            labels = [moves.c[clas].label for clas in pred.histories[i]]
            print([moves.move_name(move, label) for move, label in zip(acts, labels)])
            raise Exception("Predicted state is gold-standard")
    for i in range(gold.size):
        if not gold._states[i].is_done:
            continue
        state = <StateClass>gold.at(i)
        # Use the same predicate as the prediction-beam check above (this
        # call previously used `moves.is_gold`, unlike every other gold check
        # in this module).
        # NOTE(review): assumes TransitionSystem has no separate `is_gold`
        # method with different semantics; confirm.
        if moves.is_gold_parse(state, gold_parse) == False:
            print("Truth")
            for dep in gold_parse.orig_annot:
                print(dep[1], dep[3], dep[4])
            print("Predicted good")
            for j in range(gold_parse.length):
                print(gold_parse.orig_annot[j][1], state.H(j), moves.strings[state.safe_get(j).dep])
            raise Exception("Gold parse is not gold-standard")
--- a/spacy/syntax/parser.pxd
+++ b/spacy/syntax/parser.pxd
@ -1,24 +0,0 @@
 from thinc.linear.avgtron cimport AveragedPerceptron
 from thinc.typedefs cimport atom_t
 from thinc.structs cimport FeatureC
 from .stateclass cimport StateClass
 from .arc_eager cimport TransitionSystem
 from ..vocab cimport Vocab
 from ..tokens.doc cimport Doc
 from ..structs cimport TokenC
 from ._state cimport StateC
 # C-level declaration of ParserModel (implemented in parser.pyx): an
 # AveragedPerceptron subclass that extracts its features from a parser state.
 cdef class ParserModel(AveragedPerceptron):
    # Fills `context` from `state`, writes the extracted features into
    # `features`, and returns the number of features written.
    cdef int set_featuresC(self, atom_t* context, FeatureC* features,
                            const StateC* state) nogil
 # C-level declaration of Parser (implemented in parser.pyx).
 cdef class Parser:
    # Vocabulary passed to the constructor.
    cdef readonly Vocab vocab
    # Statistical model used to score transitions.
    cdef readonly ParserModel model
    # Transition system instance built in __init__.
    cdef readonly TransitionSystem moves
    # Configuration dict the parser was built with.
    cdef readonly object cfg
    # Parses one token array in place without the GIL; callers treat a
    # non-zero return value as a failure.
    cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@ -1,526 +0,0 @@
 """
 MALT-style dependency parser
 """
 # coding: utf-8
 # cython: infer_types=True
 from __future__ import unicode_literals
 from collections import Counter
 import ujson
 cimport cython
 cimport cython.parallel
 import numpy.random
 from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
 from cpython.exc cimport PyErr_CheckSignals
 from libc.stdint cimport uint32_t, uint64_t
 from libc.string cimport memset, memcpy
 from libc.stdlib cimport malloc, calloc, free
 from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
 from thinc.linear.avgtron cimport AveragedPerceptron
 from thinc.linalg cimport VecVec
 from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
 from thinc.extra.eg cimport Example
 from cymem.cymem cimport Pool, Address
 from murmurhash.mrmr cimport hash64
 from preshed.maps cimport MapStruct
 from preshed.maps cimport map_get
 from . import _parse_features
 from ._parse_features cimport CONTEXT_SIZE
 from ._parse_features cimport fill_context
 from .stateclass cimport StateClass
 from ._state cimport StateC
 from .transition_system import OracleError
 from .transition_system cimport TransitionSystem, Transition
 from ..structs cimport TokenC
 from ..tokens.doc cimport Doc
 from ..strings cimport StringStore
 from ..gold cimport GoldParse
 USE_FTRL = True
 DEBUG = False
 def set_debug(val):
    """Rebind the module-level DEBUG flag to `val`."""
    globals()['DEBUG'] = val
 def get_templates(name):
    """Return the feature templates registered under `name`.

    'ner' selects the NER templates, 'debug' the unigram templates only, and
    any name starting with 'embed' the triple (words, tags, labels). Any
    other name selects the full parser set: unigrams, s0_n0, s1_n0, s1_s0,
    s0_n1, n0_n1, tree_shape and trigrams, concatenated in that order.
    """
    templates = _parse_features
    if name == 'ner':
        return templates.ner
    if name == 'debug':
        return templates.unigrams
    if name.startswith('embed'):
        return (templates.words, templates.tags, templates.labels)
    groups = (templates.unigrams, templates.s0_n0, templates.s1_n0,
              templates.s1_s0, templates.s0_n1, templates.n0_n1,
              templates.tree_shape, templates.trigrams)
    combined = groups[0]
    for group in groups[1:]:
        combined = combined + group
    return combined
 # Linear model used by Parser: an AveragedPerceptron whose features are
 # extracted from the atomic context filled by fill_context().
 cdef class ParserModel(AveragedPerceptron):
    cdef int set_featuresC(self, atom_t* context, FeatureC* features,
            const StateC* state) nogil:
        # Fill the atom array from the state, then let the extracter turn the
        # atoms into features. Returns the number of features written.
        fill_context(context, state)
        nr_feat = self.extracter.set_features(features, context)
        return nr_feat

    def update(self, Example eg, itn=0):
        """
        Does regression on negative cost. Sort of cute?

        Updates the weights from one scored example and returns the cost of
        the guessed class, or 0.0 when the guess equals the best gold class
        or no gold class was found (best == -1). `itn` is not used here.
        """
        self.time += 1
        cdef int best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class)
        cdef int guess = eg.guess
        if guess == best or best == -1:
            return 0.0
        cdef FeatureC feat
        cdef int clas
        cdef weight_t gradient
        if USE_FTRL:
            # Update every valid class scoring at least as high as the best
            # gold class, with gradient = score + cost.
            for feat in eg.c.features[:eg.c.nr_feat]:
                for clas in range(eg.c.nr_class):
                    if eg.c.is_valid[clas] and eg.c.scores[clas] >= eg.c.scores[best]:
                        gradient = eg.c.scores[clas] + eg.c.costs[clas]
                        self.update_weight_ftrl(feat.key, clas, feat.value * gradient)
        else:
            # Update only the guessed and the best class, with opposite signs,
            # scaled by the cost of the guess.
            for feat in eg.c.features[:eg.c.nr_feat]:
                self.update_weight(feat.key, guess, feat.value * eg.c.costs[guess])
                self.update_weight(feat.key, best, -feat.value * eg.c.costs[guess])
        return eg.c.costs[guess]

    def update_from_histories(self, TransitionSystem moves, Doc doc, histories, weight_t min_grad=0.0):
        """
        Accumulate gradients by replaying action histories over `doc`, then
        apply them with update_weight_ftrl.

        `histories` is an iterable of (gradient, action-sequence) pairs.
        Pairs with abs(gradient) < min_grad or an empty sequence are dropped;
        returns None without updating when none remain.
        """
        cdef Pool mem = Pool()
        features = <FeatureC*>mem.alloc(self.nr_feat, sizeof(FeatureC))

        cdef StateClass stcls

        cdef class_t clas
        self.time += 1
        cdef atom_t[CONTEXT_SIZE] atoms
        histories = [(grad, hist) for grad, hist in histories if abs(grad) >= min_grad and hist]
        if not histories:
            return None
        # One Counter (feature key -> summed gradient) per class id, sized by
        # the largest class id that occurs in any history.
        gradient = [Counter() for _ in range(max([max(h)+1 for _, h in histories]))]
        for d_loss, history in histories:
            # Replay the history from a fresh state, collecting the features
            # seen before each action.
            stcls = StateClass.init(doc.c, doc.length)
            moves.initialize_state(stcls.c)
            for clas in history:
                nr_feat = self.set_featuresC(atoms, features, stcls.c)
                clas_grad = gradient[clas]
                for feat in features[:nr_feat]:
                    clas_grad[feat.key] += d_loss * feat.value
                moves.c[clas].do(stcls.c, moves.c[clas].label)
        cdef feat_t key
        cdef weight_t d_feat
        for clas, clas_grad in enumerate(gradient):
            for key, d_feat in clas_grad.items():
                # Skip entries whose contributions cancelled out.
                if d_feat != 0:
                    self.update_weight_ftrl(key, clas, d_feat)
 cdef class Parser:
    """
    Base class of the DependencyParser and EntityRecognizer.
    """
    @classmethod
    def load(cls, path, Vocab vocab, TransitionSystem=None, require=False, **cfg):
        """
        Load the statistical model from the supplied path.

        The configuration is read from `path / 'config.json'`; it replaces
        any keyword configuration passed in `**cfg`.

        Arguments:
            path (Path):
                The path to load from.
            vocab (Vocab):
                The vocabulary. Must be shared by the documents to be processed.
            TransitionSystem:
                Forwarded to the constructor.
            require (bool):
                Whether to raise an error if the model file is not found.
        Returns (Parser):
            The newly constructed object.
        Raises:
            IOError: if `require` is true and `path / 'model'` does not exist.
        """
        config_path = path / 'config.json'
        with config_path.open() as config_file:
            cfg = ujson.load(config_file)
        # TODO: remove this shim when we don't have to support older data
        if 'labels' in cfg and 'actions' not in cfg:
            cfg['actions'] = cfg.pop('labels')
        # TODO: remove this shim when we don't have to support older data
        for action_name, labels in dict(cfg.get('actions', {})).items():
            # Labels stored as a dict are replaced by the sorted list of keys.
            if isinstance(labels, dict):
                labels = sorted(labels)
            cfg['actions'][action_name] = labels
        self = cls(vocab, TransitionSystem=TransitionSystem, model=None, **cfg)
        model_path = path / 'model'
        if model_path.exists():
            self.model.load(str(model_path))
        elif require:
            raise IOError(
                "Required file %s/model not found when loading" % str(path))
        return self
    def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg):
        """
        Create a Parser.

        Arguments:
            vocab (Vocab):
                The vocabulary object. Must be shared with documents to be processed.
            TransitionSystem:
                Transition system class; defaults to `self.TransitionSystem`.
            model (thinc.linear.AveragedPerceptron):
                The statistical model.
                NOTE(review): this argument is accepted but not used; a new
                ParserModel is always built from cfg['features'].
            **cfg:
                Configuration. Keys read here: 'features', 'L1',
                'learn_rate', 'extra_labels'.
        Returns (Parser):
            The newly constructed object.
        """
        if TransitionSystem is None:
            TransitionSystem = self.TransitionSystem
        self.vocab = vocab
        cfg['actions'] = TransitionSystem.get_actions(**cfg)
        self.moves = TransitionSystem(vocab.strings, cfg['actions'])
        # TODO: Remove this when we no longer need to support old-style models
        if 'features' not in cfg:
            cfg['features'] = self.feature_templates
        elif isinstance(cfg['features'], basestring):
            # A string names a template set; resolve it to the actual templates.
            cfg['features'] = get_templates(cfg['features'])
        self.model = ParserModel(cfg['features'])
        self.model.l1_penalty = cfg.get('L1', 0.0)
        self.model.learn_rate = cfg.get('learn_rate', 0.001)

        self.cfg = cfg
        # TODO: This is a pretty hacky fix to the problem of adding more
        # labels. The issue is they come in out of order, if labels are
        # added during training
        extra_labels = cfg.get('extra_labels', [])
        for label in extra_labels:
            self.add_label(label)
    def __reduce__(self):
        """Pickle support: rebuild via Parser(vocab, moves, model).

        NOTE(review): this passes the `moves` instance where __init__ takes a
        TransitionSystem, always names the base `Parser` class, and drops
        `cfg`; confirm that unpickling works as intended.
        """
        constructor_args = (self.vocab, self.moves, self.model)
        return (Parser, constructor_args, None, None)
    def __call__(self, Doc tokens):
        """
        Apply the parser, setting the annotations onto the Doc object.

        Arguments:
            tokens (Doc): The document to be processed.
        Returns:
            None
        Raises:
            ParserStateError: if parseC returns a non-zero status.
        """
        # Read the feature count while the GIL is still held.
        cdef int nr_feat = self.model.nr_feat
        with nogil:
            status = self.parseC(tokens.c, tokens.length, nr_feat)
        # Check for KeyboardInterrupt etc. Untested
        PyErr_CheckSignals()
        if status != 0:
            raise ParserStateError(tokens)
        self.moves.finalize_doc(tokens)
    def pipe(self, stream, int batch_size=1000, int n_threads=2):
        """
        Process a stream of documents.

        Documents are buffered until `batch_size` have been collected, parsed
        in a prange loop without the GIL, finalized, and yielded. A final
        partial batch is handled the same way after the stream ends.

        Arguments:
            stream: The sequence of documents to process.
            batch_size (int):
                The number of documents to accumulate into a working set.
            n_threads (int):
                The number of threads with which to work on the buffer in parallel.
        Yields (Doc): Documents, in order.
        Raises:
            ParserStateError: if parseC returns a non-zero status for a document.
        """
        cdef Pool mem = Pool()
        # Parallel C arrays holding the token pointer and length of each
        # queued document, so the nogil loop needs no Python objects.
        cdef TokenC** doc_ptr = <TokenC**>mem.alloc(batch_size, sizeof(TokenC*))
        cdef int* lengths = <int*>mem.alloc(batch_size, sizeof(int))
        cdef Doc doc
        cdef int i
        cdef int nr_feat = self.model.nr_feat
        cdef int status
        queue = []
        for doc in stream:
            doc_ptr[len(queue)] = doc.c
            lengths[len(queue)] = doc.length
            queue.append(doc)
            if len(queue) == batch_size:
                with nogil:
                    for i in cython.parallel.prange(batch_size, num_threads=n_threads):
                        status = self.parseC(doc_ptr[i], lengths[i], nr_feat)
                        if status != 0:
                            with gil:
                                raise ParserStateError(queue[i])
                PyErr_CheckSignals()
                for doc in queue:
                    self.moves.finalize_doc(doc)
                    yield doc
                queue = []
        # Flush the remaining partial batch; `batch_size` is rebound to the
        # number of documents still queued.
        batch_size = len(queue)
        with nogil:
            for i in cython.parallel.prange(batch_size, num_threads=n_threads):
                status = self.parseC(doc_ptr[i], lengths[i], nr_feat)
                if status != 0:
                    with gil:
                        raise ParserStateError(queue[i])
        PyErr_CheckSignals()
        for doc in queue:
            self.moves.finalize_doc(doc)
            yield doc
    cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil:
        # Greedily parse one document at the C level, without the GIL.
        #
        # Arguments:
        #     tokens: token array of the document; overwritten with the parsed
        #         tokens on success, left untouched on failure.
        #     length: number of entries in `tokens`.
        #     nr_feat: capacity of the feature buffer to allocate.
        # Returns:
        #     0 on success, 1 if at some step no valid action was available.
        #
        # All heap allocations made here are released on every exit path.
        state = new StateC(tokens, length)
        # NB: This can change self.moves.n_moves!
        # I think this causes memory errors if called by .pipe()
        self.moves.initialize_state(state)
        nr_class = self.moves.n_moves
        cdef ExampleC eg
        eg.nr_feat = nr_feat
        eg.nr_atom = CONTEXT_SIZE
        eg.nr_class = nr_class
        # calloc takes (element count, element size).
        eg.features = <FeatureC*>calloc(nr_feat, sizeof(FeatureC))
        eg.atoms = <atom_t*>calloc(CONTEXT_SIZE, sizeof(atom_t))
        eg.scores = <weight_t*>calloc(nr_class, sizeof(weight_t))
        eg.is_valid = <int*>calloc(nr_class, sizeof(int))
        cdef int i
        cdef int status = 0
        while not state.is_final():
            eg.nr_feat = self.model.set_featuresC(eg.atoms, eg.features, state)
            self.moves.set_valid(eg.is_valid, state)
            self.model.set_scoresC(eg.scores, eg.features, eg.nr_feat)
            guess = VecVec.arg_max_if_true(eg.scores, eg.is_valid, eg.nr_class)
            if guess < 0:
                # No valid action: stop parsing, but fall through to the
                # cleanup below instead of returning with live allocations.
                status = 1
                break
            action = self.moves.c[guess]
            action.do(state, action.label)
            # Reset the per-step buffers before the next iteration.
            memset(eg.scores, 0, sizeof(eg.scores[0]) * eg.nr_class)
            for i in range(eg.nr_class):
                eg.is_valid[i] = 1
        if status == 0:
            self.moves.finalize_state(state)
            # Copy the parsed tokens back into the caller's array.
            for i in range(length):
                tokens[i] = state._sent[i]
        del state
        free(eg.features)
        free(eg.atoms)
        free(eg.scores)
        free(eg.is_valid)
        return status
    def update(self, Doc tokens, GoldParse gold, itn=0, double drop=0.0):
        """
        Update the statistical model.
        Arguments:
            tokens (Doc):
                The example document for the update.
            gold (GoldParse):
                The gold-standard annotations, to calculate the loss.
            itn:
                Not referenced in this method body.
            drop (float):
                Dropout rate used only when self.cfg has no 'dropout' entry.
        Returns (float):
            The loss on this example.
        Raises:
            ParserStateError: If no valid action is available at some step.
        """
        self.moves.preprocess_gold(gold)
        cdef StateClass stcls = StateClass.init(tokens.c, tokens.length)
        self.moves.initialize_state(stcls.c)
        cdef Example eg = Example(
                nr_class=self.moves.n_moves,
                nr_atom=CONTEXT_SIZE,
                nr_feat=self.model.nr_feat)
        cdef weight_t loss = 0
        cdef Transition action
        # A 'dropout' entry in cfg takes precedence over the `drop` argument.
        cdef double dropout_rate = self.cfg.get('dropout', drop)
        while not stcls.is_final():
            eg.c.nr_feat = self.model.set_featuresC(eg.c.atoms, eg.c.features,
                                                    stcls.c)
            dropout(eg.c.features, eg.c.nr_feat, dropout_rate)
            self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
            self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat)
            guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
            if guess < 0:
                # Same failure signal that parseC checks for: without this
                # guard, self.moves.c[guess] would index outside the array.
                raise ParserStateError(tokens)
            self.model.update(eg)
            action = self.moves.c[guess]
            action.do(stcls.c, action.label)
            loss += eg.costs[guess]
            # Reset the example's buffers for the next step.
            eg.fill_scores(0, eg.c.nr_class)
            eg.fill_costs(0, eg.c.nr_class)
            eg.fill_is_valid(1, eg.c.nr_class)
        self.moves.finalize_state(stcls.c)
        return loss
    def step_through(self, Doc doc, GoldParse gold=None):
        """
        Create a stepwise state for introspecting and controlling the
        transition sequence on a document.
        Arguments:
            doc (Doc): The document to step through.
            gold (GoldParse): Optional gold parse
        Returns (StepwiseState):
            A state object, to step through the annotation process.
        """
        stepper = StepwiseState(self, doc, gold=gold)
        return stepper
    def from_transition_sequence(self, Doc doc, sequence):
        """Annotate a document by replaying an explicit transition sequence.
        Arguments:
            doc (Doc): The document to annotate.
            sequence: A sequence of action names, as unicode strings.
        Returns: None
        """
        # Leaving the `with` block triggers the stepwise state's __exit__.
        with self.step_through(doc) as state:
            for action_name in sequence:
                state.transition(action_name)
    def add_label(self, label):
        """Register `label` with every action type of the transition system.

        Each time an action is actually added, the label is appended to
        cfg['extra_labels'].
        """
        # Doesn't set label into serializer -- subclasses override it to do that.
        for move_type in self.moves.action_types:
            if not self.moves.add_action(move_type, label):
                continue
            # Important that the labels be stored as a list! We need the
            # order, or the model goes out of synch
            self.cfg.setdefault('extra_labels', []).append(label)
 cdef int dropout(FeatureC* feats, int nr_feat, float prob) except -1:
    # Apply dropout in place to the first `nr_feat` entries of `feats`.
    #
    # Each feature is zeroed with probability `prob`; surviving features are
    # divided by the keep probability (1 - prob) so that the expected value of
    # every feature is unchanged.
    #
    # Arguments:
    #     feats: feature array to modify in place.
    #     nr_feat: number of features to process.
    #     prob: drop probability; values outside (0, 1) disable dropout.
    # Returns:
    #     0 on success (-1 is reserved for a propagating exception).
    if prob <= 0 or prob >= 1.:
        return 0
    if nr_feat <= 0:
        # Nothing to do, and &py_probs[0] below needs a non-empty buffer.
        return 0
    cdef double[::1] py_probs = numpy.random.uniform(0., 1., nr_feat)
    cdef double* probs = &py_probs[0]
    cdef double keep_prob = 1. - prob
    cdef int i
    for i in range(nr_feat):
        if probs[i] >= prob:
            # Kept with probability (1 - prob): rescale by that, not by prob.
            feats[i].value /= keep_prob
        else:
            feats[i].value = 0.
    return 0
 cdef class StepwiseState:
    """Step through a parser's transition sequence on one document.

    Usable as a context manager: leaving the `with` block calls finish(),
    which writes the current state back onto the document.
    """
    # Parse state being advanced, created from the document's tokens.
    cdef readonly StateClass stcls
    # Scratch example holding features, scores, validity flags and costs.
    cdef readonly Example eg
    # The document being annotated.
    cdef readonly Doc doc
    # Gold parse: the one supplied, or a GoldParse built from the document.
    cdef readonly GoldParse gold
    # The parser whose model and transition system are used.
    cdef readonly Parser parser
    def __init__(self, Parser parser, Doc doc, GoldParse gold=None):
        """Set up the state, gold parse and scratch example for `doc`.

        Arguments:
            parser (Parser): Parser providing the model and moves.
            doc (Doc): The document to step through.
            gold (GoldParse): Optional gold parse; when given it is passed
                through the transition system's preprocess_gold.
        """
        self.parser = parser
        self.doc = doc
        if gold is not None:
            self.gold = gold
            self.parser.moves.preprocess_gold(self.gold)
        else:
            # No gold supplied: build a GoldParse from the document alone.
            self.gold = GoldParse(doc)
        self.stcls = StateClass.init(doc.c, doc.length)
        self.parser.moves.initialize_state(self.stcls.c)
        self.eg = Example(
            nr_class=self.parser.moves.n_moves,
            nr_atom=CONTEXT_SIZE,
            nr_feat=self.parser.model.nr_feat)
    def __enter__(self):
        """Return this object as the context-manager target."""
        return self
    def __exit__(self, type, value, traceback):
        """Call finish() on exit; the exception arguments are not inspected."""
        self.finish()
    @property
    def is_final(self):
        """Whether the underlying state reports that it is final."""
        return self.stcls.is_final()
    @property
    def stack(self):
        """The `stack` attribute of the underlying state."""
        return self.stcls.stack
    @property
    def queue(self):
        """The `queue` attribute of the underlying state."""
        return self.stcls.queue
    @property
    def heads(self):
        """List of self.stcls.H(i) for every token index in the state."""
        return [self.stcls.H(i) for i in range(self.stcls.c.length)]
    @property
    def deps(self):
        """List of dependency label strings, one per token in the state.

        Each token's `dep` id is looked up in the document vocab's strings.
        """
        return [self.doc.vocab.strings[self.stcls.c._sent[i].dep]
                for i in range(self.stcls.c.length)]
    @property
    def costs(self):
        """
        Find the action-costs for the current state.

        Returns (dict): Maps the name of each valid action to its cost.
        """
        # NOTE(review): __init__ always assigns a GoldParse, so this guard
        # only fires if that object evaluates as falsy -- confirm intent.
        if not self.gold:
            raise ValueError("Can't set costs: No GoldParse provided")
        self.parser.moves.set_costs(self.eg.c.is_valid, self.eg.c.costs,
                self.stcls, self.gold)
        costs = {}
        for i in range(self.parser.moves.n_moves):
            # Actions flagged invalid are left out of the result.
            if not self.eg.c.is_valid[i]:
                continue
            transition = self.parser.moves.c[i]
            name = self.parser.moves.move_name(transition.move, transition.label)
            costs[name] = self.eg.c.costs[i]
        return costs
    def predict(self):
        """Score the current state and return the name of the guessed action.

        Resets the scratch example, then fills its features, validity flags
        and scores from the current state before reading `self.eg.guess`.
        """
        self.eg.reset()
        self.eg.c.nr_feat = self.parser.model.set_featuresC(self.eg.c.atoms, self.eg.c.features,
                                                            self.stcls.c)
        self.parser.moves.set_valid(self.eg.c.is_valid, self.stcls.c)
        self.parser.model.set_scoresC(self.eg.c.scores,
            self.eg.c.features, self.eg.c.nr_feat)
        cdef Transition action = self.parser.moves.c[self.eg.guess]
        return self.parser.moves.move_name(action.move, action.label)
    def transition(self, action_name=None):
        """Apply one transition to the state.

        Arguments:
            action_name: Name of the action to apply. None or '_' means use
                the model's prediction. A bare 'L' or 'R' means: score the
                state, then take the best-scoring action of that move type.
                Any other value is looked up via lookup_transition.
        """
        if action_name is None:
            action_name = self.predict()
        # Move codes passed to _arg_max_clas; presumably these match the
        # transition system's move ids -- TODO confirm.
        moves = {'S': 0, 'D': 1, 'L': 2, 'R': 3}
        if action_name == '_':
            action_name = self.predict()
            action = self.parser.moves.lookup_transition(action_name)
        elif action_name == 'L' or action_name == 'R':
            # predict() is called for its side effect of filling the scores.
            self.predict()
            move = moves[action_name]
            clas = _arg_max_clas(self.eg.c.scores, move, self.parser.moves.c,
                                 self.eg.c.nr_class)
            action = self.parser.moves.c[clas]
        else:
            action = self.parser.moves.lookup_transition(action_name)
        action.do(self.stcls.c, action.label)
    def finish(self):
        """Write the current state onto the document.

        finalize_state is only called when the state is final; set_parse and
        finalize_doc are called in every case.
        """
        if self.stcls.is_final():
            self.parser.moves.finalize_state(self.stcls.c)
        self.doc.set_parse(self.stcls.c._sent)
        self.parser.moves.finalize_doc(self.doc)
 class ParserStateError(ValueError):
    """Error raised when the parser has no valid action for a document."""
    def __init__(self, doc):
        # The adjacent literals form one template whose single %s slot
        # receives the repr of the failing document's text.
        template = (
            "Error analysing doc -- no valid actions available. This should "
            "never happen, so please report the error on the issue tracker. "
            "Here's the thread to do so --- reopen it if it's closed:\n"
            "https://github.com/spacy-io/spaCy/issues/429\n"
            "Please include the text that the parser failed on, which is:\n"
            "%s")
        ValueError.__init__(self, template % repr(doc.text))
 cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs, int n) nogil:
    # Return the index of the highest-scoring entry whose cost is <= 0, or -1
    # if no entry has a cost <= 0. On equal scores the earliest index wins.
    cdef int winner = -1
    cdef int idx
    for idx in range(n):
        if costs[idx] <= 0 and (winner == -1 or scores[idx] > scores[winner]):
            winner = idx
    return winner
 cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions,
                       int nr_class) except -1:
    # Return the index of the best-scoring action whose move type is `move`.
    #
    # Arguments:
    #     scores: one score per class.
    #     move: move type to restrict the search to.
    #     actions: transition table, indexed in parallel with `scores`.
    #     nr_class: number of entries in `scores` / `actions`.
    # Returns:
    #     Index of the selected class; on equal scores the latest index wins.
    # Raises:
    #     ValueError: if no action has the requested move type.
    cdef weight_t score = 0
    cdef int mode = -1
    cdef int i
    for i in range(nr_class):
        if actions[i].move == move and (mode == -1 or scores[i] >= score):
            mode = i
            score = scores[i]
    if mode == -1:
        # The function is declared `except -1`, so returning -1 without an
        # exception set would surface as an opaque error in the caller.
        raise ValueError("No transition found for move type %d" % move)
    return mode