Mirror of https://github.com/explosion/spaCy.git

Commit a7aa49c419: Merge branch 'develop' of https://github.com/explosion/spaCy into develop
@@ -5,7 +5,7 @@ environment:
     # For Python versions available on Appveyor, see
     # http://www.appveyor.com/docs/installed-software#python

-    - PYTHON: "C:\\Python27"
+    - PYTHON: "C:\\Python27-x64"
     #- PYTHON: "C:\\Python34"
     #- PYTHON: "C:\\Python35"
     #- PYTHON: "C:\\Python27-x64"

@@ -22,6 +22,7 @@ install:
   - pip install flake8

 script:
+  - "cat /proc/cpuinfo | grep flags | head -n 1"
   - "pip install pytest pytest-timeout"
   - if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi
   - if [[ "${VIA}" == "flake8" ]]; then flake8 . --count --exclude=spacy/compat.py,spacy/lang --select=E901,E999,F821,F822,F823 --show-source --statistics; fi

@@ -3,7 +3,7 @@ pathlib
 numpy>=1.7
 cymem>=1.30,<1.32
 preshed>=1.0.0,<2.0.0
-thinc>=6.11.1.dev12,<6.12.0
+thinc>=6.11.1.dev17,<6.12.0
 murmurhash>=0.28,<0.29
 cytoolz>=0.9.0,<0.10.0
 plac<1.0.0,>=0.9.6
setup.py (5 changes)

@@ -28,9 +28,10 @@ MOD_NAMES = [
     'spacy.pipeline',
     'spacy.syntax.stateclass',
     'spacy.syntax._state',
-    'spacy.syntax._beam_utils',
     'spacy.tokenizer',
     'spacy.syntax.nn_parser',
+    'spacy.syntax._parser_model',
+    'spacy.syntax._beam_utils',
     'spacy.syntax.nonproj',
     'spacy.syntax.transition_system',
     'spacy.syntax.arc_eager',

@@ -191,7 +192,7 @@ def setup_package():
         'murmurhash>=0.28,<0.29',
         'cymem>=1.30,<1.32',
         'preshed>=1.0.0,<2.0.0',
-        'thinc>=6.11.1.dev11,<6.12.0',
+        'thinc>=6.11.1.dev17,<6.12.0',
         'plac<1.0.0,>=0.9.6',
         'pathlib',
         'ujson>=1.35',
@@ -16,10 +16,12 @@ from ..gold import GoldParse
 from ..util import compounding, minibatch_by_words
 from ..syntax.nonproj import projectivize
 from ..matcher import Matcher
-from ..morphology import Fused_begin, Fused_inside
+#from ..morphology import Fused_begin, Fused_inside
 from .. import displacy
 from collections import defaultdict, Counter
 from timeit import default_timer as timer
+Fused_begin = None
+Fused_inside = None

 import itertools
 import random

@@ -254,12 +256,6 @@ def get_token_split_end(token):
         return token.nbor(i-1)


-Token.set_extension('split_start', getter=get_token_split_start)
-Token.set_extension('split_end', getter=get_token_split_end)
-Token.set_extension('begins_fused', default=False)
-Token.set_extension('inside_fused', default=False)
-
-
 ##################
 # Initialization #
 ##################

@@ -280,6 +276,10 @@ def initialize_pipeline(nlp, docs, golds, config, device):
     corpus=("UD corpus to evaluate, e.g. UD_English, UD_Spanish, etc", "positional", None, str),
 )
 def main(test_data_dir, experiment_dir, corpus):
+    Token.set_extension('split_start', getter=get_token_split_start)
+    Token.set_extension('split_end', getter=get_token_split_end)
+    Token.set_extension('begins_fused', default=False)
+    Token.set_extension('inside_fused', default=False)
     lang.zh.Chinese.Defaults.use_jieba = False
     lang.ja.Japanese.Defaults.use_janome = False
     lang.ru.Russian.Defaults.use_pymorphy2 = False

@@ -170,6 +170,16 @@ def golds_to_gold_tuples(docs, golds):
 ##############

 def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
+    if text_loc.parts[-1].endswith('.conllu'):
+        docs = []
+        with text_loc.open() as file_:
+            for conllu_doc in read_conllu(file_):
+                for conllu_sent in conllu_doc:
+                    words = [line[1] for line in conllu_sent]
+                    docs.append(Doc(nlp.vocab, words=words))
+        for name, component in nlp.pipeline:
+            docs = list(component.pipe(docs))
+    else:
         with text_loc.open('r', encoding='utf8') as text_file:
             texts = split_text(text_file.read())
             docs = list(nlp.pipe(texts))

@@ -270,12 +280,12 @@ def load_nlp(corpus, config, vectors=None):


 def initialize_pipeline(nlp, docs, golds, config, device):
+    nlp.add_pipe(nlp.create_pipe('tagger'))
     nlp.add_pipe(nlp.create_pipe('parser'))
     if config.multitask_tag:
         nlp.parser.add_multitask_objective('tag')
     if config.multitask_sent:
         nlp.parser.add_multitask_objective('sent_start')
-    nlp.add_pipe(nlp.create_pipe('tagger'))
     for gold in golds:
         for tag in gold.tags:
             if tag is not None:

@@ -337,10 +347,12 @@ class TreebankPaths(object):
     config=("Path to json formatted config file", "positional"),
     limit=("Size limit", "option", "n", int),
     use_gpu=("Use GPU", "option", "g", int),
+    use_oracle_segments=("Use oracle segments", "flag", "G", int),
     vectors_dir=("Path to directory with pre-trained vectors, named e.g. en/",
                  "option", "v", Path),
 )
-def main(ud_dir, parses_dir, config, corpus, limit=0, use_gpu=-1, vectors_dir=None):
+def main(ud_dir, parses_dir, config, corpus, limit=0, use_gpu=-1, vectors_dir=None,
+         use_oracle_segments=False):
     spacy.util.fix_random_seed()
     lang.zh.Chinese.Defaults.use_jieba = False
     lang.ja.Japanese.Defaults.use_janome = False

@@ -353,13 +365,17 @@ def main(ud_dir, parses_dir, config, corpus, limit=0, use_gpu=-1, vectors_dir=No
     nlp = load_nlp(paths.lang, config, vectors=vectors_dir)

     docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
-                            max_doc_length=config.max_doc_length, limit=limit)
+                            max_doc_length=None, limit=limit)

     optimizer = initialize_pipeline(nlp, docs, golds, config, use_gpu)

     batch_sizes = compounding(config.batch_size//10, config.batch_size, 1.001)
+    nlp.parser.cfg['beam_update_prob'] = 1.0
     for i in range(config.nr_epoch):
-        docs = [nlp.make_doc(doc.text) for doc in docs]
+        docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
+                                max_doc_length=config.max_doc_length, limit=limit,
+                                oracle_segments=use_oracle_segments,
+                                raw_text=not use_oracle_segments)
         Xs = list(zip(docs, golds))
         random.shuffle(Xs)
         batches = minibatch_by_words(Xs, size=batch_sizes)

@@ -374,7 +390,12 @@ def main(ud_dir, parses_dir, config, corpus, limit=0, use_gpu=-1, vectors_dir=No

         out_path = parses_dir / corpus / 'epoch-{i}.conllu'.format(i=i)
         with nlp.use_params(optimizer.averages):
-            parsed_docs, scores = evaluate(nlp, paths.dev.text, paths.dev.conllu, out_path)
+            if use_oracle_segments:
+                parsed_docs, scores = evaluate(nlp, paths.dev.conllu,
+                                               paths.dev.conllu, out_path)
+            else:
+                parsed_docs, scores = evaluate(nlp, paths.dev.text,
+                                               paths.dev.conllu, out_path)
         print_progress(i, losses, scores)
         _render_parses(i, parsed_docs[:50])
@@ -160,7 +160,7 @@ class GoldCorpus(object):
                 yield item
                 i += len(item[1])
                 if limit and i >= limit:
-                    break
+                    return

     @property
     def dev_tuples(self):

@@ -180,7 +180,7 @@ class GoldCorpus(object):
                 n += len(sent_tuples[1])
             if self.limit and i >= self.limit:
                 break
-            i += len(paragraph_tuples)
+            i += 1
         return n

     def train_docs(self, nlp, gold_preproc=False, max_length=None,

@@ -394,7 +394,7 @@ cdef class GoldParse:

     def __init__(self, doc, annot_tuples=None, words=None, tags=None,
                  heads=None, deps=None, entities=None, make_projective=False,
-                 cats=None):
+                 cats=None, **_):
         """Create a GoldParse.

         doc (Doc): The document the annotations refer to.
spacy/syntax/_beam_utils.pxd (new file, 6 lines)

@@ -0,0 +1,6 @@
+from thinc.typedefs cimport class_t
+
+# These are passed as callbacks to thinc.search.Beam
+cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1
+
+cdef int check_final_state(void* _state, void* extra_args) except -1
@@ -15,7 +15,7 @@ from .stateclass cimport StateC, StateClass


 # These are passed as callbacks to thinc.search.Beam
-cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
+cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
     dest = <StateC*>_dest
     src = <StateC*>_src
     moves = <const Transition*>_moves

@@ -24,12 +24,12 @@ cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves)
     dest.push_hist(clas)


-cdef int _check_final_state(void* _state, void* extra_args) except -1:
+cdef int check_final_state(void* _state, void* extra_args) except -1:
     state = <StateC*>_state
     return state.is_final()


-cdef hash_t _hash_state(void* _state, void* _) except 0:
+cdef hash_t hash_state(void* _state, void* _) except 0:
     state = <StateC*>_state
     if state.is_final():
         return 1

@@ -37,6 +37,20 @@ cdef hash_t _hash_state(void* _state, void* _) except 0:
     return state.hash()


+def collect_states(beams):
+    cdef StateClass state
+    cdef Beam beam
+    states = []
+    for state_or_beam in beams:
+        if isinstance(state_or_beam, StateClass):
+            states.append(state_or_beam)
+        else:
+            beam = state_or_beam
+            state = StateClass.borrow(<StateC*>beam.at(0))
+            states.append(state)
+    return states
+
+
 cdef class ParserBeam(object):
     cdef public TransitionSystem moves
     cdef public object states

@@ -45,7 +59,7 @@ cdef class ParserBeam(object):
     cdef public object dones

     def __init__(self, TransitionSystem moves, states, golds,
-                 int width, float density):
+                 int width, float density=0.):
         self.moves = moves
         self.states = states
         self.golds = golds

@@ -54,7 +68,7 @@ cdef class ParserBeam(object):
         cdef StateClass state
         cdef StateC* st
         for state in states:
-            beam = Beam(self.moves.n_moves, width, density)
+            beam = Beam(self.moves.n_moves, width, min_density=density)
             beam.initialize(self.moves.init_beam_state, state.c.length,
                             state.c._sent)
             for i in range(beam.width):

@@ -82,8 +96,8 @@ cdef class ParserBeam(object):
             self._set_scores(beam, scores[i])
             if self.golds is not None:
                 self._set_costs(beam, self.golds[i], follow_gold=follow_gold)
-            beam.advance(_transition_state, NULL, <void*>self.moves.c)
-            beam.check_done(_check_final_state, NULL)
+            beam.advance(transition_state, NULL, <void*>self.moves.c)
+            beam.check_done(check_final_state, NULL)
             # This handles the non-monotonic stuff for the parser.
             if beam.is_done and self.golds is not None:
                 for j in range(beam.size):

@@ -92,8 +106,6 @@ cdef class ParserBeam(object):
                     try:
                         if self.moves.is_gold_parse(state, self.golds[i]):
                             beam._states[j].loss = 0.0
-                        elif beam._states[j].loss == 0.0:
-                            beam._states[j].loss = 1.0
                     except NotImplementedError:
                         break

@@ -119,8 +131,12 @@ cdef class ParserBeam(object):
                 self.moves.set_costs(beam.is_valid[i], beam.costs[i],
                                      state, gold)
                 if follow_gold:
+                    min_cost = 0
                     for j in range(beam.nr_class):
-                        if beam.costs[i][j] >= 1:
+                        if beam.is_valid[i][j] and beam.costs[i][j] < min_cost:
+                            min_cost = beam.costs[i][j]
+                    for j in range(beam.nr_class):
+                        if beam.costs[i][j] > min_cost:
                             beam.is_valid[i][j] = 0

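Note on the follow_gold change above: instead of invalidating every transition with cost >= 1, the gold-constrained beam now keeps only the cheapest transitions (cost no greater than min_cost, which starts at 0 and only drops if a valid move has a lower cost). A minimal NumPy sketch of that masking, with made-up cost and validity arrays (illustration only, not the Cython loop itself):

    import numpy as np

    costs = np.array([0., 2., 0., 1.])        # hypothetical per-transition costs
    is_valid = np.array([1, 1, 0, 1], bool)   # hypothetical validity mask

    # Mirror the loop above: min_cost starts at 0 and only drops further
    # if a valid transition has a lower (e.g. negative) cost.
    min_cost = min(0., costs[is_valid].min())
    is_valid[costs > min_cost] = False        # keep only minimum-cost moves
    # is_valid is now [True, False, False, False]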
@@ -144,15 +160,13 @@ nr_update = 0
 def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
                 states, golds,
                 state2vec, vec2scores,
-                int width, float density, int hist_feats,
-                losses=None, drop=0.):
+                int width, losses=None, drop=0.,
+                early_update=True, beam_density=0.0):
     global nr_update
     cdef MaxViolation violn
     nr_update += 1
-    pbeam = ParserBeam(moves, states, golds,
-                       width=width, density=density)
-    gbeam = ParserBeam(moves, states, golds,
-                       width=width, density=density)
+    pbeam = ParserBeam(moves, states, golds, width=width, density=beam_density)
+    gbeam = ParserBeam(moves, states, golds, width=width, density=beam_density)
     cdef StateClass state
     beam_maps = []
     backprops = []

@@ -177,12 +191,6 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
         # Now that we have our flat list of states, feed them through the model
         token_ids = get_token_ids(states, nr_feature)
         vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
-        if hist_feats:
-            hists = numpy.asarray([st.history[:hist_feats] for st in states],
-                                  dtype='i')
-            scores, bp_scores = vec2scores.begin_update((vectors, hists),
-                                                        drop=drop)
-        else:
         scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)

         # Store the callbacks for the backward pass

@@ -194,13 +202,17 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
                     for indices in p_indices]
         g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
                     for indices in g_indices]
-        # Now advance the states in the beams. The gold beam is contrained to
+        # Now advance the states in the beams. The gold beam is constrained to
         # to follow only gold analyses.
         pbeam.advance(p_scores)
         gbeam.advance(g_scores, follow_gold=True)
         # Track the "maximum violation", to use in the update.
         for i, violn in enumerate(violns):
             violn.check_crf(pbeam[i], gbeam[i])
+            # Use 'early update' if best gold is way out of contention.
+            if pbeam[i].loss > 0 and pbeam[i].min_score > (gbeam[i].score * 5.00):
+                pbeam.dones[i] = True
+                gbeam.dones[i] = True
     histories = []
     losses = []
     for violn in violns:
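Note on the hunk above: update_beam advances a predicted beam and a gold-constrained beam in step, tracks a "maximum violation" between them (roughly, the prefix where the predicted analyses most outscore the gold ones), and now also stops expanding an example once the gold beam is hopelessly behind. A rough standalone Python sketch of the idea, using hypothetical per-step score lists rather than spaCy's beam objects:

    # Hypothetical cumulative scores of the best predicted and best gold paths.
    pred_scores = [1.0, 2.5, 4.0, 6.0]
    gold_scores = [1.0, 2.0, 2.2, 2.3]

    # Maximum violation: the prefix where prediction beats gold by the most;
    # the gradient is computed against that prefix.
    violations = [p - g for p, g in zip(pred_scores, gold_scores)]
    update_step = violations.index(max(violations))   # -> 3, the last step here

    # Early update: mark the example done once even the surviving predicted
    # states dwarf the best gold score (the diff uses a 5x margin on min_score).
    done = pred_scores[-1] > 5.0 * gold_scores[-1]    # False for these numbers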
@@ -264,14 +276,15 @@ def get_gradient(nr_class, beam_maps, histories, losses):
     Each batch has multiple beams
     So history is list of lists of lists of ints
     """
-    nr_step = len(beam_maps)
     grads = []
-    nr_step = 0
+    nr_steps = []
     for eg_id, hists in enumerate(histories):
+        nr_step = 0
         for loss, hist in zip(losses[eg_id], hists):
             if loss != 0.0 and not numpy.isnan(loss):
                 nr_step = max(nr_step, len(hist))
-    for i in range(nr_step):
+        nr_steps.append(nr_step)
+    for i in range(max(nr_steps)):
         grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class),
                                  dtype='f'))
     if len(histories) != len(losses):

@@ -282,8 +295,11 @@ def get_gradient(nr_class, beam_maps, histories, losses):
             continue
         key = tuple([eg_id])
         # Adjust loss for length
+        # We need to do this because each state in a short path is scored
+        # multiple times, as we add in the average cost when we run out
+        # of actions.
         avg_loss = loss / len(hist)
-        loss += avg_loss * (nr_step - len(hist))
+        loss += avg_loss * (nr_steps[eg_id] - len(hist))
         for j, clas in enumerate(hist):
             i = beam_maps[j][key]
             # In step j, at state i action clas
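Worked example of the length adjustment explained in the comments above (illustrative numbers only): if the longest history for this example spans nr_steps[eg_id] = 5 steps, but one path diverged after len(hist) = 3 actions with loss 0.9, the loss is scaled as if it had been paid evenly over all 5 steps:

    loss = 0.9
    hist_len = 3
    nr_step = 5                               # nr_steps[eg_id]
    avg_loss = loss / hist_len                # 0.3 per action taken
    loss += avg_loss * (nr_step - hist_len)   # 0.9 + 0.3 * 2 = 1.5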
@@ -291,3 +307,27 @@ def get_gradient(nr_class, beam_maps, histories, losses):
             grads[j][i, clas] += loss
             key = key + tuple([clas])
     return grads
+
+
+def cleanup_beam(Beam beam):
+    cdef StateC* state
+    # Once parsing has finished, states in beam may not be unique. Is this
+    # correct?
+    seen = set()
+    for i in range(beam.width):
+        addr = <size_t>beam._parents[i].content
+        if addr not in seen:
+            state = <StateC*>addr
+            del state
+            seen.add(addr)
+        else:
+            raise ValueError(Errors.E023.format(addr=addr, i=i))
+        addr = <size_t>beam._states[i].content
+        if addr not in seen:
+            state = <StateC*>addr
+            del state
+            seen.add(addr)
+        else:
+            raise ValueError(Errors.E023.format(addr=addr, i=i))
spacy/syntax/_parser_model.pxd (new file, 49 lines)

@@ -0,0 +1,49 @@
+from libc.string cimport memset, memcpy
+from libc.stdlib cimport calloc, free, realloc
+from thinc.typedefs cimport weight_t, class_t, hash_t
+
+from ._state cimport StateC
+
+
+cdef struct SizesC:
+    int states
+    int classes
+    int hiddens
+    int pieces
+    int feats
+    int embed_width
+
+
+cdef struct WeightsC:
+    const float* feat_weights
+    const float* feat_bias
+    const float* hidden_bias
+    const float* hidden_weights
+    const float* vectors
+
+
+cdef struct ActivationsC:
+    int* token_ids
+    float* vectors
+    float* unmaxed
+    float* scores
+    float* hiddens
+    int* is_valid
+    int _curr_size
+    int _max_size
+
+
+cdef WeightsC get_c_weights(model) except *
+
+cdef SizesC get_c_sizes(model, int batch_size) except *
+
+cdef void resize_activations(ActivationsC* A, SizesC n) nogil
+
+cdef void predict_states(ActivationsC* A, StateC** states,
+        const WeightsC* W, SizesC n) nogil
+
+cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
+
+cdef void cpu_log_loss(float* d_scores,
+        const float* costs, const int* is_valid, const float* scores, int O) nogil
spacy/syntax/_parser_model.pyx (new file, 402 lines)

@@ -0,0 +1,402 @@
+# cython: infer_types=True
+# cython: cdivision=True
+# cython: boundscheck=False
+# coding: utf-8
+from __future__ import unicode_literals, print_function
+
+from collections import OrderedDict
+import ujson
+import json
+import numpy
+cimport cython.parallel
+import cytoolz
+import numpy.random
+cimport numpy as np
+from libc.math cimport exp
+from libcpp.vector cimport vector
+from libc.string cimport memset, memcpy
+from libc.stdlib cimport calloc, free, realloc
+from cymem.cymem cimport Pool
+from thinc.typedefs cimport weight_t, class_t, hash_t
+from thinc.extra.search cimport Beam
+from thinc.api import chain, clone
+from thinc.v2v import Model, Maxout, Affine
+from thinc.misc import LayerNorm
+from thinc.neural.ops import CupyOps
+from thinc.neural.util import get_array_module
+from thinc.linalg cimport Vec, VecVec
+from thinc cimport openblas
+
+
+from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
+from .._ml import link_vectors_to_models, create_default_optimizer
+from ..compat import json_dumps, copy_array
+from ..tokens.doc cimport Doc
+from ..gold cimport GoldParse
+from ..errors import Errors, TempErrors
+from .. import util
+from .stateclass cimport StateClass
+from .transition_system cimport Transition
+from . import _beam_utils
+from . import nonproj
+
+
+cdef WeightsC get_c_weights(model) except *:
+    cdef WeightsC output
+    cdef precompute_hiddens state2vec = model.state2vec
+    output.feat_weights = state2vec.get_feat_weights()
+    output.feat_bias = <const float*>state2vec.bias.data
+    cdef np.ndarray vec2scores_W = model.vec2scores.W
+    cdef np.ndarray vec2scores_b = model.vec2scores.b
+    output.hidden_weights = <const float*>vec2scores_W.data
+    output.hidden_bias = <const float*>vec2scores_b.data
+    cdef np.ndarray tokvecs = model.tokvecs
+    output.vectors = <float*>tokvecs.data
+    return output
+
+
+cdef SizesC get_c_sizes(model, int batch_size) except *:
+    cdef SizesC output
+    output.states = batch_size
+    output.classes = model.vec2scores.nO
+    output.hiddens = model.state2vec.nO
+    output.pieces = model.state2vec.nP
+    output.feats = model.state2vec.nF
+    output.embed_width = model.tokvecs.shape[1]
+    return output
+
+
+cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
+    if n.states <= A._max_size:
+        A._curr_size = n.states
+        return
+    if A._max_size == 0:
+        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
+        A.vectors = <float*>calloc(n.states * n.embed_width, sizeof(A.vectors[0]))
+        A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
+        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
+        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
+        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
+        A._max_size = n.states
+    else:
+        A.token_ids = <int*>realloc(A.token_ids,
+            n.states * n.feats * sizeof(A.token_ids[0]))
+        A.vectors = <float*>realloc(A.vectors,
+            n.states * n.embed_width * sizeof(A.vectors[0]))
+        A.scores = <float*>realloc(A.scores,
+            n.states * n.classes * sizeof(A.scores[0]))
+        A.unmaxed = <float*>realloc(A.unmaxed,
+            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
+        A.hiddens = <float*>realloc(A.hiddens,
+            n.states * n.hiddens * sizeof(A.hiddens[0]))
+        A.is_valid = <int*>realloc(A.is_valid,
+            n.states * n.classes * sizeof(A.is_valid[0]))
+        A._max_size = n.states
+    A._curr_size = n.states
+
+
+cdef void predict_states(ActivationsC* A, StateC** states,
+        const WeightsC* W, SizesC n) nogil:
+    resize_activations(A, n)
+    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
+    memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
+    for i in range(n.states):
+        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
+    sum_state_features(A.unmaxed,
+        W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
+    for i in range(n.states):
+        VecVec.add_i(&A.unmaxed[i*n.hiddens*n.pieces],
+            W.feat_bias, 1., n.hiddens * n.pieces)
+        for j in range(n.hiddens):
+            index = i * n.hiddens * n.pieces + j * n.pieces
+            which = Vec.arg_max(&A.unmaxed[index], n.pieces)
+            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
+    memset(A.scores, 0, n.states * n.classes * sizeof(float))
+    # Compute hidden-to-output
+    openblas.simple_gemm(A.scores, n.states, n.classes,
+        A.hiddens, n.states, n.hiddens,
+        W.hidden_weights, n.classes, n.hiddens, 0, 1)
+    # Add bias
+    for i in range(n.states):
+        VecVec.add_i(&A.scores[i*n.classes],
+            W.hidden_bias, 1., n.classes)
+
+
+cdef void sum_state_features(float* output,
+        const float* cached, const int* token_ids, int B, int F, int O) nogil:
+    cdef int idx, b, f, i
+    cdef const float* feature
+    padding = cached
+    cached += F * O
+    cdef int id_stride = F*O
+    cdef float one = 1.
+    for b in range(B):
+        for f in range(F):
+            if token_ids[f] < 0:
+                feature = &padding[f*O]
+            else:
+                idx = token_ids[f] * id_stride + f*O
+                feature = &cached[idx]
+            openblas.simple_axpy(&output[b*O], O,
+                feature, one)
+        token_ids += F
+
+
+cdef void cpu_log_loss(float* d_scores,
+        const float* costs, const int* is_valid, const float* scores,
+        int O) nogil:
+    """Do multi-label log loss"""
+    cdef double max_, gmax, Z, gZ
+    best = arg_max_if_gold(scores, costs, is_valid, O)
+    guess = arg_max_if_valid(scores, is_valid, O)
+    Z = 1e-10
+    gZ = 1e-10
+    max_ = scores[guess]
+    gmax = scores[best]
+    for i in range(O):
+        if is_valid[i]:
+            Z += exp(scores[i] - max_)
+            if costs[i] <= costs[best]:
+                gZ += exp(scores[i] - gmax)
+    for i in range(O):
+        if not is_valid[i]:
+            d_scores[i] = 0.
+        elif costs[i] <= costs[best]:
+            d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ)
+        else:
+            d_scores[i] = exp(scores[i]-max_) / Z
+
+
+cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
+        const int* is_valid, int n) nogil:
+    # Find minimum cost
+    cdef float cost = 1
+    for i in range(n):
+        if is_valid[i] and costs[i] < cost:
+            cost = costs[i]
+    # Now find best-scoring with that cost
+    cdef int best = -1
+    for i in range(n):
+        if costs[i] <= cost and is_valid[i]:
+            if best == -1 or scores[i] > scores[best]:
+                best = i
+    return best
+
+
+cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
+    cdef int best = -1
+    for i in range(n):
+        if is_valid[i] >= 1:
+            if best == -1 or scores[i] > scores[best]:
+                best = i
+    return best
+
+
+class ParserModel(Model):
+    def __init__(self, tok2vec, lower_model, upper_model):
+        Model.__init__(self)
+        self._layers = [tok2vec, lower_model, upper_model]
+
+    def begin_update(self, docs, drop=0.):
+        step_model = ParserStepModel(docs, self._layers, drop=drop)
+        def finish_parser_update(golds, sgd=None):
+            step_model.make_updates(sgd)
+            return None
+        return step_model, finish_parser_update
+
+    def resize_output(self, new_output):
+        # Weights are stored in (nr_out, nr_in) format, so we're basically
+        # just adding rows here.
+        smaller = self._layers[-1]._layers[-1]
+        larger = Affine(self.moves.n_moves, smaller.nI)
+        copy_array(larger.W[:smaller.nO], smaller.W)
+        copy_array(larger.b[:smaller.nO], smaller.b)
+        self._layers[-1]._layers[-1] = larger
+
+    @property
+    def tok2vec(self):
+        return self._layers[0]
+
+    @property
+    def lower(self):
+        return self._layers[1]
+
+    @property
+    def upper(self):
+        return self._layers[2]
+
+
+class ParserStepModel(Model):
+    def __init__(self, docs, layers, drop=0.):
+        self.tokvecs, self.bp_tokvecs = layers[0].begin_update(docs, drop=drop)
+        self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
+                                            drop=drop)
+        self.vec2scores = layers[-1]
+        self.cuda_stream = util.get_cuda_stream()
+        self.backprops = []
+
+    @property
+    def nO(self):
+        return self.state2vec.nO
+
+    def begin_update(self, states, drop=0.):
+        token_ids = self.get_token_ids(states)
+        vector, get_d_tokvecs = self.state2vec.begin_update(token_ids, drop=0.0)
+        mask = self.ops.get_dropout_mask(vector.shape, drop)
+        if mask is not None:
+            vector *= mask
+        scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop)
+
+        def backprop_parser_step(d_scores, sgd=None):
+            d_vector = get_d_vector(d_scores, sgd=sgd)
+            if mask is not None:
+                d_vector *= mask
+            if isinstance(self.ops, CupyOps) \
+            and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
+                # Move token_ids and d_vector to GPU, asynchronously
+                self.backprops.append((
+                    util.get_async(self.cuda_stream, token_ids),
+                    util.get_async(self.cuda_stream, d_vector),
+                    get_d_tokvecs
+                ))
+            else:
+                self.backprops.append((token_ids, d_vector, get_d_tokvecs))
+            return None
+        return scores, backprop_parser_step
+
+    def get_token_ids(self, batch):
+        states = _beam_utils.collect_states(batch)
+        cdef StateClass state
+        states = [state for state in states if not state.is_final()]
+        cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF),
+                                          dtype='i', order='C')
+        ids.fill(-1)
+        c_ids = <int*>ids.data
+        for state in states:
+            state.c.set_context_tokens(c_ids, ids.shape[1])
+            c_ids += ids.shape[1]
+        return ids
+
+    def make_updates(self, sgd):
+        # Tells CUDA to block, so our async copies complete.
+        if self.cuda_stream is not None:
+            self.cuda_stream.synchronize()
+        # Add a padding vector to the d_tokvecs gradient, so that missing
+        # values don't affect the real gradient.
+        d_tokvecs = self.ops.allocate((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
+        for ids, d_vector, bp_vector in self.backprops:
+            d_state_features = bp_vector((d_vector, ids), sgd=sgd)
+            ids = ids.flatten()
+            d_state_features = d_state_features.reshape(
+                (ids.size, d_state_features.shape[2]))
+            self.ops.scatter_add(d_tokvecs, ids,
+                d_state_features)
+        # Padded -- see update()
+        self.bp_tokvecs(d_tokvecs[:-1], sgd=sgd)
+        return d_tokvecs
+
+
+cdef class precompute_hiddens:
+    """Allow a model to be "primed" by pre-computing input features in bulk.
+
+    This is used for the parser, where we want to take a batch of documents,
+    and compute vectors for each (token, position) pair. These vectors can then
+    be reused, especially for beam-search.
+
+    Let's say we're using 12 features for each state, e.g. word at start of
+    buffer, three words on stack, their children, etc. In the normal arc-eager
+    system, a document of length N is processed in 2*N states. This means we'll
+    create 2*N*12 feature vectors --- but if we pre-compute, we only need
+    N*12 vector computations. The saving for beam-search is much better:
+    if we have a beam of k, we'll normally make 2*N*12*K computations --
+    so we can save the factor k. This also gives a nice CPU/GPU division:
+    we can do all our hard maths up front, packed into large multiplications,
+    and do the hard-to-program parsing on the CPU.
+    """
+    cdef readonly int nF, nO, nP
+    cdef bint _is_synchronized
+    cdef public object ops
+    cdef np.ndarray _features
+    cdef np.ndarray _cached
+    cdef np.ndarray bias
+    cdef object _cuda_stream
+    cdef object _bp_hiddens
+
+    def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
+                 drop=0.):
+        gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop)
+        cdef np.ndarray cached
+        if not isinstance(gpu_cached, numpy.ndarray):
+            # Note the passing of cuda_stream here: it lets
+            # cupy make the copy asynchronously.
+            # We then have to block before first use.
+            cached = gpu_cached.get(stream=cuda_stream)
+        else:
+            cached = gpu_cached
+        if not isinstance(lower_model.b, numpy.ndarray):
+            self.bias = lower_model.b.get()
+        else:
+            self.bias = lower_model.b
+        self.nF = cached.shape[1]
+        self.nP = getattr(lower_model, 'nP', 1)
+        self.nO = cached.shape[2]
+        self.ops = lower_model.ops
+        self._is_synchronized = False
+        self._cuda_stream = cuda_stream
+        self._cached = cached
+        self._bp_hiddens = bp_features
+
+    cdef const float* get_feat_weights(self) except NULL:
+        if not self._is_synchronized and self._cuda_stream is not None:
+            self._cuda_stream.synchronize()
+            self._is_synchronized = True
+        return <float*>self._cached.data
+
+    def __call__(self, X):
+        return self.begin_update(X)[0]
+
+    def begin_update(self, token_ids, drop=0.):
+        cdef np.ndarray state_vector = numpy.zeros(
+            (token_ids.shape[0], self.nO, self.nP), dtype='f')
+        # This is tricky, but (assuming GPU available);
+        # - Input to forward on CPU
+        # - Output from forward on CPU
+        # - Input to backward on GPU!
+        # - Output from backward on GPU
+        bp_hiddens = self._bp_hiddens
+
+        feat_weights = self.get_feat_weights()
+        cdef int[:, ::1] ids = token_ids
+        sum_state_features(<float*>state_vector.data,
+            feat_weights, &ids[0,0],
+            token_ids.shape[0], self.nF, self.nO*self.nP)
+        state_vector += self.bias
+        state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
+
+        def backward(d_state_vector_ids, sgd=None):
+            d_state_vector, token_ids = d_state_vector_ids
+            d_state_vector = bp_nonlinearity(d_state_vector, sgd)
+            # This will usually be on GPU
+            if not isinstance(d_state_vector, self.ops.xp.ndarray):
+                d_state_vector = self.ops.xp.array(d_state_vector)
+            d_tokens = bp_hiddens((d_state_vector, token_ids), sgd)
+            return d_tokens
+        return state_vector, backward
+
+    def _nonlinearity(self, state_vector):
+        if self.nP == 1:
+            state_vector = state_vector.reshape(state_vector.shape[:-1])
+            mask = state_vector >= 0.
+            state_vector *= mask
+        else:
+            state_vector, mask = self.ops.maxout(state_vector)
+
+        def backprop_nonlinearity(d_best, sgd=None):
+            if self.nP == 1:
+                d_best *= mask
+                d_best = d_best.reshape((d_best.shape + (1,)))
+                return d_best
+            else:
+                return self.ops.backprop_maxout(d_best, mask, self.nP)
+        return state_vector, backprop_nonlinearity
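A back-of-the-envelope check of the saving described in the precompute_hiddens docstring above, using made-up values for document length, feature count and beam width (illustration only):

    N = 1000   # tokens in the document
    F = 12     # features per parser state
    k = 8      # beam width

    states = 2 * N                # arc-eager parses a length-N doc in about 2*N states
    greedy_naive = states * F     # 24,000 per-state feature-vector computations
    precomputed = N * F           # 12,000: one vector per (token, feature) slot, reused
    beam_naive = states * F * k   # 192,000 without the cache
    beam_precomputed = N * F      # still 12,000: all beam states share the same cache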
@@ -6,6 +6,7 @@ from ..vocab cimport Vocab
 from ..tokens.doc cimport Doc
 from ..structs cimport TokenC
 from ._state cimport StateC
+from ._parser_model cimport WeightsC, ActivationsC, SizesC


 cdef class Parser:

@@ -15,7 +16,9 @@ cdef class Parser:
     cdef readonly object cfg
     cdef public object _multitasks

-    cdef void _parseC(self, StateC** states, int nr_task,
-        const float* feat_weights, const float* bias,
-        const float* hW, const float* hb,
-        int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil
+    cdef void _parseC(self, StateC** states,
+        WeightsC weights, SizesC sizes) nogil
+
+    cdef void c_transition_batch(self, StateC** states, const float* scores,
+        int nr_class, int batch_size) nogil
+
(File diff suppressed because it is too large)
@@ -5,9 +5,12 @@ from __future__ import unicode_literals
 from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport weight_t
+from thinc.extra.search cimport Beam
 from collections import OrderedDict, Counter
 import ujson

+from . cimport _beam_utils
+from ..tokens.doc cimport Doc
 from ..structs cimport TokenC
 from .stateclass cimport StateClass
 from ..typedefs cimport attr_t

@@ -57,6 +60,21 @@ cdef class TransitionSystem:
             offset += len(doc)
         return states

+    def init_beams(self, docs, beam_width, beam_density=0.):
+        cdef Doc doc
+        beams = []
+        cdef int offset = 0
+        for doc in docs:
+            beam = Beam(self.n_moves, beam_width, min_density=beam_density)
+            beam.initialize(self.init_beam_state, doc.length, doc.c)
+            for i in range(beam.width):
+                state = <StateC*>beam.at(i)
+                state.offset = offset
+            offset += len(doc)
+            beam.check_done(_beam_utils.check_final_state, NULL)
+            beams.append(beam)
+        return beams
+
     def get_oracle_sequence(self, doc, GoldParse gold):
         cdef Pool mem = Pool()
         costs = <float*>mem.alloc(self.n_moves, sizeof(float))
@@ -35,8 +35,7 @@ def parser(vocab, arc_eager):

 @pytest.fixture
 def model(arc_eager, tok2vec):
-    return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO,
-                        hist_size=0)[0]
+    return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO)[0]


 @pytest.fixture
 def doc(vocab):

@@ -69,11 +68,13 @@ def test_update_doc(parser, model, doc, gold):
     parser.update([doc], [gold], sgd=optimize)


+@pytest.mark.xfail
 def test_predict_doc_beam(parser, model, doc):
     parser.model = model
     parser(doc, beam_width=32, beam_density=0.001)


+@pytest.mark.xfail
 def test_update_doc_beam(parser, model, doc, gold):
     parser.model = model
     def optimize(weights, gradient, key=None):
@@ -34,6 +34,7 @@ def test_util_get_package_path(package):
     assert isinstance(path, Path)


+@pytest.mark.xfail
 def test_displacy_parse_ents(en_vocab):
     """Test that named entities on a Doc are converted into displaCy's format."""
     doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])

@@ -44,6 +45,7 @@ def test_displacy_parse_ents(en_vocab):
     assert ents['ents'] == [{'start': 4, 'end': 10, 'label': 'ORG'}]


+@pytest.mark.xfail
 def test_displacy_parse_deps(en_vocab):
     """Test that deps and tags on a Doc are converted into displaCy's format."""
     words = ["This", "is", "a", "sentence"]

@@ -64,6 +66,7 @@ def test_displacy_parse_deps(en_vocab):
     {'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}]


+@pytest.mark.xfail
 def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
     model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP)
     assert model.W.shape == (nF, nO, nP, nI)