mirror of https://github.com/explosion/spaCy.git
synced 2025-02-13 18:10:35 +03:00

Revert noise-level back to default 0.0

commit 167f6a8938

.gitignore (vendored) | 1

@@ -40,7 +40,6 @@ venv/

 # Distribution / packaging
 env/
-bin/
 build/
 develop-eggs/
 dist/

@@ -14,8 +14,7 @@ os:
 env:
 - VIA=compile LC_ALL=en_US.ascii
 - VIA=compile
+#- VIA=pypi_nightly
-# - VIA=sdist

 install:
 - "./travis.sh"

@@ -23,7 +22,7 @@ install:
 script:
 - "pip install pytest pytest-timeout"
 - if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi
-- if [[ "${VIA}" == "pypi" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(ospath.dirname(spacy.__file__)))"`; fi
+- if [[ "${VIA}" == "pypi_nightly" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
 - if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi

 notifications:

@@ -1,3 +1,4 @@
 recursive-include include *.h
 include LICENSE
 include README.rst
+include bin/spacy

@@ -229,7 +229,7 @@ Compile from source
 The other way to install spaCy is to clone its
 `GitHub repository <https://github.com/explosion/spaCy>`_ and build it from
 source. That is the common way if you want to make changes to the code base.
-You'll need to make sure that you have a development enviroment consisting of a
+You'll need to make sure that you have a development environment consisting of a
 Python distribution including header files, a compiler,
 `pip <https://pip.pypa.io/en/latest/installing/>`__, `virtualenv <https://virtualenv.pypa.io/>`_
 and `git <https://git-scm.com>`_ installed. The compiler part is the trickiest.

examples/training/train_textcat.py | 109 (new file)

from __future__ import unicode_literals
import plac
import random
import tqdm

from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps
import thinc.extra.datasets

import spacy.lang.en
from spacy.gold import GoldParse, minibatch
from spacy.util import compounding
from spacy.pipeline import TextCategorizer


def train_textcat(tokenizer, textcat,
                  train_texts, train_cats, dev_texts, dev_cats,
                  n_iter=20):
    '''
    Train the TextCategorizer without associated pipeline.
    '''
    textcat.begin_training()
    optimizer = Adam(NumpyOps(), 0.001)
    train_docs = [tokenizer(text) for text in train_texts]
    train_gold = [GoldParse(doc, cats=cats) for doc, cats in
                  zip(train_docs, train_cats)]
    train_data = zip(train_docs, train_gold)
    batch_sizes = compounding(4., 128., 1.001)
    for i in range(n_iter):
        losses = {}
        train_data = tqdm.tqdm(train_data, leave=False)  # Progress bar
        for batch in minibatch(train_data, size=batch_sizes):
            docs, golds = zip(*batch)
            textcat.update((docs, None), golds, sgd=optimizer, drop=0.2,
                           losses=losses)
        with textcat.model.use_params(optimizer.averages):
            scores = evaluate(tokenizer, textcat, dev_texts, dev_cats)
        yield losses['textcat'], scores


def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 1e-8  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 1e-8  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if score >= 0.5 and label in gold:
                tp += 1.
            elif score >= 0.5 and label not in gold:
                fp += 1.
            elif score < 0.5 and label not in gold:
                tn += 1
            if score < 0.5 and label in gold:
                fn += 1
    precis = tp / (tp + fp)
    recall = tp / (tp + fn)
    fscore = 2 * (precis * recall) / (precis + recall)
    return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore}


def load_data():
    # Partition off part of the train data --- avoid running experiments
    # against test.
    train_data, _ = thinc.extra.datasets.imdb()

    random.shuffle(train_data)

    texts, labels = zip(*train_data)
    cats = [(['POSITIVE'] if y else []) for y in labels]

    split = int(len(train_data) * 0.8)

    train_texts = texts[:split]
    train_cats = cats[:split]
    dev_texts = texts[split:]
    dev_cats = cats[split:]
    return (train_texts, train_cats), (dev_texts, dev_cats)


def main(model_loc=None):
    nlp = spacy.lang.en.English()
    tokenizer = nlp.tokenizer
    textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE'])

    print("Load IMDB data")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data()

    print("Itn.\tLoss\tP\tR\tF")
    progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}'

    for i, (loss, scores) in enumerate(train_textcat(tokenizer, textcat,
                                                     train_texts, train_cats,
                                                     dev_texts, dev_cats,
                                                     n_iter=20)):
        print(progress.format(i=i, loss=loss, **scores))
    # How to save, load and use
    nlp.pipeline.append(textcat)
    if model_loc is not None:
        nlp.to_disk(model_loc)

        nlp = spacy.load(model_loc)
        doc = nlp(u'This movie sucked!')
        print(doc.cats)


if __name__ == '__main__':
    plac.call(main)
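
Note on the example above: minibatch() is driven by compounding(4., 128., 1.001), so batch sizes start at 4 and grow geometrically toward a cap of 128 as training proceeds. A rough standalone sketch of that kind of schedule, for illustration only and not spaCy's exact implementation:

import itertools

def compounding_schedule(start, stop, compound):
    # Yield start, start*compound, start*compound**2, ..., capped at stop.
    value = start
    while True:
        yield min(value, stop)
        value *= compound

# First few batch sizes produced by a 4 -> 128 schedule with factor 1.001.
print(list(itertools.islice(compounding_schedule(4., 128., 1.001), 5)))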

@@ -3,7 +3,7 @@ pathlib
 numpy>=1.7
 cymem>=1.30,<1.32
 preshed>=1.0.0,<2.0.0
-thinc>=6.7.3,<6.8.0
+thinc>=6.8.0,<6.9.0
 murmurhash>=0.28,<0.29
 plac<1.0.0,>=0.9.6
 six

setup.py | 5

@@ -28,7 +28,9 @@ MOD_NAMES = [
     'spacy.pipeline',
     'spacy.syntax.stateclass',
     'spacy.syntax._state',
+    'spacy.syntax._beam_utils',
     'spacy.tokenizer',
+    'spacy._cfile',
     'spacy.syntax.parser',
     'spacy.syntax.nn_parser',
     'spacy.syntax.beam_parser',

@@ -187,12 +189,13 @@ def setup_package():
         url=about['__uri__'],
         license=about['__license__'],
         ext_modules=ext_modules,
+        scripts=['bin/spacy'],
         install_requires=[
             'numpy>=1.7',
             'murmurhash>=0.28,<0.29',
             'cymem>=1.30,<1.32',
             'preshed>=1.0.0,<2.0.0',
-            'thinc>=6.7.3,<6.8.0',
+            'thinc>=6.8.0,<6.9.0',
             'plac<1.0.0,>=0.9.6',
             'pip>=9.0.0,<10.0.0',
             'six',

@@ -13,5 +13,10 @@ def load(name, **overrides):
     return util.load_model(name, **overrides)


+def blank(name, **kwargs):
+    LangClass = util.get_lang_class(name)
+    return LangClass(**kwargs)
+
+
 def info(model=None, markdown=False):
     return cli_info(None, model, markdown)
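
The blank() helper added above builds a bare Language object from the language class data only, without loading a trained model. Assuming an English model package or shortcut link is installed for the load() call, the difference looks roughly like this:

import spacy

# Just the language defaults (tokenizer, vocab); no trained weights.
nlp_blank = spacy.blank('en')

# A trained pipeline still goes through spacy.load(); this requires that
# an 'en' model package or shortcut link is installed.
nlp_trained = spacy.load('en')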

@@ -3,15 +3,23 @@ from __future__ import print_function
 # NB! This breaks in plac on Python 2!!
 #from __future__ import unicode_literals


 if __name__ == '__main__':
     import plac
     import sys
-    from spacy.cli import download, link, info, package, train, convert
+    from spacy.cli import download, link, info, package, train, convert, model
+    from spacy.cli import profile
     from spacy.util import prints

-    commands = {'download': download, 'link': link, 'info': info, 'train': train,
-                'convert': convert, 'package': package}
+    commands = {
+        'download': download,
+        'link': link,
+        'info': info,
+        'train': train,
+        'convert': convert,
+        'package': package,
+        'model': model,
+        'profile': profile,
+    }
     if len(sys.argv) == 1:
         prints(', '.join(commands), title="Available commands", exits=1)
     command = sys.argv.pop(1)

@@ -19,5 +27,7 @@ if __name__ == '__main__':
     if command in commands:
         plac.call(commands[command])
     else:
-        prints("Available: %s" % ', '.join(commands),
-               title="Unknown command: %s" % command, exits=1)
+        prints(
+            "Available: %s" % ', '.join(commands),
+            title="Unknown command: %s" % command,
+            exits=1)

spacy/_cfile.pxd | 26 (new file)

from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from cymem.cymem cimport Pool

cdef class CFile:
    cdef FILE* fp
    cdef bint is_open
    cdef Pool mem
    cdef int size  # For compatibility with subclass
    cdef int _capacity  # For compatibility with subclass

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1

    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1

    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *


cdef class StringCFile(CFile):
    cdef unsigned char* data

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1

    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1

    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *

spacy/_cfile.pyx | 88 (new file)

from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from libc.string cimport memcpy


cdef class CFile:
    def __init__(self, loc, mode, on_open_error=None):
        if isinstance(mode, unicode):
            mode_str = mode.encode('ascii')
        else:
            mode_str = mode
        if hasattr(loc, 'as_posix'):
            loc = loc.as_posix()
        self.mem = Pool()
        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
        self.fp = fopen(<char*>bytes_loc, mode_str)
        if self.fp == NULL:
            if on_open_error is not None:
                on_open_error()
            else:
                raise IOError("Could not open binary file %s" % bytes_loc)
        self.is_open = True

    def __dealloc__(self):
        if self.is_open:
            fclose(self.fp)

    def close(self):
        fclose(self.fp)
        self.is_open = False

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
        st = fread(dest, elem_size, number, self.fp)
        if st != number:
            raise IOError

    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1:
        st = fwrite(src, elem_size, number, self.fp)
        if st != number:
            raise IOError

    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
        cdef void* dest = mem.alloc(number, elem_size)
        self.read_into(dest, number, elem_size)
        return dest

    def write_unicode(self, unicode value):
        cdef bytes py_bytes = value.encode('utf8')
        cdef char* chars = <char*>py_bytes
        self.write(sizeof(char), len(py_bytes), chars)


cdef class StringCFile:
    def __init__(self, mode, bytes data=b'', on_open_error=None):
        self.mem = Pool()
        self.is_open = 'w' in mode
        self._capacity = max(len(data), 8)
        self.size = len(data)
        self.data = <unsigned char*>self.mem.alloc(1, self._capacity)
        for i in range(len(data)):
            self.data[i] = data[i]

    def close(self):
        self.is_open = False

    def string_data(self):
        return (self.data-self.size)[:self.size]

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
        memcpy(dest, self.data, elem_size * number)
        self.data += elem_size * number

    cdef int write_from(self, void* src, size_t elem_size, size_t number) except -1:
        write_size = number * elem_size
        if (self.size + write_size) >= self._capacity:
            self._capacity = (self.size + write_size) * 2
            self.data = <unsigned char*>self.mem.realloc(self.data, self._capacity)
        memcpy(&self.data[self.size], src, elem_size * number)
        self.size += write_size

    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
        cdef void* dest = mem.alloc(number, elem_size)
        self.read_into(dest, number, elem_size)
        return dest

    def write_unicode(self, unicode value):
        cdef bytes py_bytes = value.encode('utf8')
        cdef char* chars = <char*>py_bytes
        self.write(sizeof(char), len(py_bytes), chars)

spacy/_ml.py | 358

@@ -3,23 +3,101 @@ from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
 from thinc.neural import Model, Maxout, Softmax, Affine
 from thinc.neural._classes.hash_embed import HashEmbed
 from thinc.neural.ops import NumpyOps, CupyOps
+from thinc.neural.util import get_array_module
+import random
+import cytoolz
+
 from thinc.neural._classes.convolution import ExtractWindow
 from thinc.neural._classes.static_vectors import StaticVectors
-from thinc.neural._classes.batchnorm import BatchNorm
+from thinc.neural._classes.batchnorm import BatchNorm as BN
+from thinc.neural._classes.layernorm import LayerNorm as LN
 from thinc.neural._classes.resnet import Residual
 from thinc.neural import ReLu
+from thinc.neural._classes.selu import SELU
 from thinc import describe
 from thinc.describe import Dimension, Synapses, Biases, Gradient
 from thinc.neural._classes.affine import _set_dimensions_if_needed
+from thinc.api import FeatureExtracter, with_getitem
+from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool
+from thinc.neural._classes.attention import ParametricAttention
+from thinc.linear.linear import LinearModel
+from thinc.api import uniqued, wrap, flatten_add_lengths
+
-from .attrs import ID, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP
+from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER
 from .tokens.doc import Doc
+from . import util
+
 import numpy
 import io


+@layerize
+def _flatten_add_lengths(seqs, pad=0, drop=0.):
+    ops = Model.ops
+    lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
+    def finish_update(d_X, sgd=None):
+        return ops.unflatten(d_X, lengths, pad=pad)
+    X = ops.flatten(seqs, pad=pad)
+    return (X, lengths), finish_update
+
+
+@layerize
+def _logistic(X, drop=0.):
+    xp = get_array_module(X)
+    if not isinstance(X, xp.ndarray):
+        X = xp.asarray(X)
+    # Clip to range (-10, 10)
+    X = xp.minimum(X, 10., X)
+    X = xp.maximum(X, -10., X)
+    Y = 1. / (1. + xp.exp(-X))
+    def logistic_bwd(dY, sgd=None):
+        dX = dY * (Y * (1-Y))
+        return dX
+    return Y, logistic_bwd
+
+
+@layerize
+def add_tuples(X, drop=0.):
+    """Give inputs of sequence pairs, where each sequence is (vals, length),
+    sum the values, returning a single sequence.
+
+    If input is:
+    ((vals1, length), (vals2, length)
+    Output is:
+    (vals1+vals2, length)
+
+    vals are a single tensor for the whole batch.
+    """
+    (vals1, length1), (vals2, length2) = X
+    assert length1 == length2
+
+    def add_tuples_bwd(dY, sgd=None):
+        return (dY, dY)
+
+    return (vals1+vals2, length), add_tuples_bwd
+
+
+def _zero_init(model):
+    def _zero_init_impl(self, X, y):
+        self.W.fill(0)
+    model.on_data_hooks.append(_zero_init_impl)
+    if model.W is not None:
+        model.W.fill(0.)
+    return model
+
+
+@layerize
+def _preprocess_doc(docs, drop=0.):
+    keys = [doc.to_array([LOWER]) for doc in docs]
+    keys = [a[:, 0] for a in keys]
+    ops = Model.ops
+    lengths = ops.asarray([arr.shape[0] for arr in keys])
+    keys = ops.xp.concatenate(keys)
+    vals = ops.allocate(keys.shape[0]) + 1
+    return (keys, vals, lengths), None
+
+
 def _init_for_precomputed(W, ops):
     if (W**2).sum() != 0.:
         return

@@ -27,6 +105,7 @@ def _init_for_precomputed(W, ops):
     ops.xavier_uniform_init(reshaped)
     W[:] = reshaped.reshape(W.shape)

+
 @describe.on_data(_set_dimensions_if_needed)
 @describe.attributes(
     nI=Dimension("Input size"),

@@ -130,25 +209,42 @@ class PrecomputableMaxouts(Model):
             return dXf
         return Yfp, backward


+def drop_layer(layer, factor=2.):
+    def drop_layer_fwd(X, drop=0.):
+        if drop <= 0.:
+            return layer.begin_update(X, drop=drop)
+        else:
+            coinflip = layer.ops.xp.random.random()
+            if (coinflip / factor) >= drop:
+                return layer.begin_update(X, drop=drop)
+            else:
+                return X, lambda dX, sgd=None: dX
+
+    model = wrap(drop_layer_fwd, layer)
+    model.predict = layer
+    return model
+
+
 def Tok2Vec(width, embed_size, preprocess=None):
-    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
+    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
         norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower')
         prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix')
         suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix')
         shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape')

-        embed = (norm | prefix | suffix | shape )
+        embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3))
         tok2vec = (
             with_flatten(
                 asarray(Model.ops, dtype='uint64')
-                >> embed
-                >> Maxout(width, width*4, pieces=3)
-                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
-                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
-                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
-                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)),
-                pad=4)
+                >> uniqued(embed, column=5)
+                >> drop_layer(
+                    Residual(
+                        (ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
+                    )
+                ) ** 4, pad=4
+            )
         )
         if preprocess not in (False, None):
             tok2vec = preprocess >> tok2vec

@@ -243,7 +339,8 @@ def zero_init(model):


 def doc2feats(cols=None):
-    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
+    if cols is None:
+        cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
     def forward(docs, drop=0.):
         feats = []
         for doc in docs:

@@ -269,6 +366,45 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):
     return vectors, backward


+def fine_tune(embedding, combine=None):
+    if combine is not None:
+        raise NotImplementedError(
+            "fine_tune currently only supports addition. Set combine=None")
+    def fine_tune_fwd(docs_tokvecs, drop=0.):
+        docs, tokvecs = docs_tokvecs
+        lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i')
+
+        vecs, bp_vecs = embedding.begin_update(docs, drop=drop)
+        flat_tokvecs = embedding.ops.flatten(tokvecs)
+        flat_vecs = embedding.ops.flatten(vecs)
+        output = embedding.ops.unflatten(
+            (model.mix[0] * flat_tokvecs + model.mix[1] * flat_vecs), lengths)
+
+        def fine_tune_bwd(d_output, sgd=None):
+            flat_grad = model.ops.flatten(d_output)
+            model.d_mix[0] += flat_tokvecs.dot(flat_grad.T).sum()
+            model.d_mix[1] += flat_vecs.dot(flat_grad.T).sum()
+
+            bp_vecs([d_o * model.mix[1] for d_o in d_output], sgd=sgd)
+            if sgd is not None:
+                sgd(model._mem.weights, model._mem.gradient, key=model.id)
+            return [d_o * model.mix[0] for d_o in d_output]
+        return output, fine_tune_bwd
+
+    def fine_tune_predict(docs_tokvecs):
+        docs, tokvecs = docs_tokvecs
+        vecs = embedding(docs)
+        return [model.mix[0]*tv+model.mix[1]*v
+                for tv, v in zip(tokvecs, vecs)]
+
+    model = wrap(fine_tune_fwd, embedding)
+    model.mix = model._mem.add((model.id, 'mix'), (2,))
+    model.mix.fill(0.5)
+    model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix'))
+    model.predict = fine_tune_predict
+    return model
+
+
 @layerize
 def flatten(seqs, drop=0.):
     if isinstance(seqs[0], numpy.ndarray):

@@ -282,3 +418,201 @@ def flatten(seqs, drop=0.):
         return ops.unflatten(d_X, lengths)
     X = ops.xp.vstack(seqs)
     return X, finish_update
+
+
+@layerize
+def logistic(X, drop=0.):
+    xp = get_array_module(X)
+    if not isinstance(X, xp.ndarray):
+        X = xp.asarray(X)
+    # Clip to range (-10, 10)
+    X = xp.minimum(X, 10., X)
+    X = xp.maximum(X, -10., X)
+    Y = 1. / (1. + xp.exp(-X))
+    def logistic_bwd(dY, sgd=None):
+        dX = dY * (Y * (1-Y))
+        return dX
+    return Y, logistic_bwd
+
+
+def zero_init(model):
+    def _zero_init_impl(self, X, y):
+        self.W.fill(0)
+    model.on_data_hooks.append(_zero_init_impl)
+    return model
+
+@layerize
+def preprocess_doc(docs, drop=0.):
+    keys = [doc.to_array([LOWER]) for doc in docs]
+    keys = [a[:, 0] for a in keys]
+    ops = Model.ops
+    lengths = ops.asarray([arr.shape[0] for arr in keys])
+    keys = ops.xp.concatenate(keys)
+    vals = ops.allocate(keys.shape[0]) + 1
+    return (keys, vals, lengths), None
+
+def getitem(i):
+    def getitem_fwd(X, drop=0.):
+        return X[i], None
+    return layerize(getitem_fwd)
+
+def build_tagger_model(nr_class, token_vector_width, **cfg):
+    embed_size = util.env_opt('embed_size', 7500)
+    with Model.define_operators({'>>': chain, '+': add}):
+        # Input: (doc, tensor) tuples
+        private_tok2vec = Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats())
+
+        model = (
+            fine_tune(private_tok2vec)
+            >> with_flatten(
+                Maxout(token_vector_width, token_vector_width)
+                >> Softmax(nr_class, token_vector_width)
+            )
+        )
+    model.nI = None
+    return model
+
+
+@layerize
+def SpacyVectors(docs, drop=0.):
+    xp = get_array_module(docs[0].vocab.vectors.data)
+    width = docs[0].vocab.vectors.data.shape[1]
+    batch = []
+    for doc in docs:
+        indices = numpy.zeros((len(doc),), dtype='i')
+        for i, word in enumerate(doc):
+            if word.orth in doc.vocab.vectors.key2row:
+                indices[i] = doc.vocab.vectors.key2row[word.orth]
+            else:
+                indices[i] = 0
+        vectors = doc.vocab.vectors.data[indices]
+        batch.append(vectors)
+    return batch, None
+
+
+def foreach(layer, drop_factor=1.0):
+    '''Map a layer across elements in a list'''
+    def foreach_fwd(Xs, drop=0.):
+        drop *= drop_factor
+        ys = []
+        backprops = []
+        for X in Xs:
+            y, bp_y = layer.begin_update(X, drop=drop)
+            ys.append(y)
+            backprops.append(bp_y)
+        def foreach_bwd(d_ys, sgd=None):
+            d_Xs = []
+            for d_y, bp_y in zip(d_ys, backprops):
+                if bp_y is not None and bp_y is not None:
+                    d_Xs.append(d_y, sgd=sgd)
+                else:
+                    d_Xs.append(None)
+            return d_Xs
+        return ys, foreach_bwd
+    model = wrap(foreach_fwd, layer)
+    return model
+
+
+def build_text_classifier(nr_class, width=64, **cfg):
+    nr_vector = cfg.get('nr_vector', 5000)
+    with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
+                                 '**': clone}):
+        if cfg.get('low_data'):
+            model = (
+                SpacyVectors
+                >> flatten_add_lengths
+                >> with_getitem(0,
+                    Affine(width, 300)
+                )
+                >> ParametricAttention(width)
+                >> Pooling(sum_pool)
+                >> Residual(ReLu(width, width)) ** 2
+                >> zero_init(Affine(nr_class, width, drop_factor=0.0))
+                >> logistic
+            )
+            return model
+
+
+        lower = HashEmbed(width, nr_vector, column=1)
+        prefix = HashEmbed(width//2, nr_vector, column=2)
+        suffix = HashEmbed(width//2, nr_vector, column=3)
+        shape = HashEmbed(width//2, nr_vector, column=4)
+
+        trained_vectors = (
+            FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
+            >> with_flatten(
+                uniqued(
+                    (lower | prefix | suffix | shape)
+                    >> LN(Maxout(width, width+(width//2)*3)),
+                    column=0
+                )
+            )
+        )
+
+        static_vectors = (
+            SpacyVectors
+            >> with_flatten(Affine(width, 300))
+        )
+
+        cnn_model = (
+            # TODO Make concatenate support lists
+            concatenate_lists(trained_vectors, static_vectors)
+            >> with_flatten(
+                LN(Maxout(width, width*2))
+                >> Residual(
+                    (ExtractWindow(nW=1) >> zero_init(Maxout(width, width*3)))
+                ) ** 2, pad=2
+            )
+            >> flatten_add_lengths
+            >> ParametricAttention(width)
+            >> Pooling(sum_pool)
+            >> Residual(zero_init(Maxout(width, width)))
+            >> zero_init(Affine(nr_class, width, drop_factor=0.0))
+        )
+
+        linear_model = (
+            _preprocess_doc
+            >> LinearModel(nr_class, drop_factor=0.)
+        )
+
+        model = (
+            (linear_model | cnn_model)
+            >> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
+            >> logistic
+        )
+
+    model.lsuv = False
+    return model
+
+@layerize
+def flatten(seqs, drop=0.):
+    ops = Model.ops
+    lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
+    def finish_update(d_X, sgd=None):
+        return ops.unflatten(d_X, lengths, pad=0)
+    X = ops.flatten(seqs, pad=0)
+    return X, finish_update
+
+
+def concatenate_lists(*layers, **kwargs):  # pragma: no cover
+    '''Compose two or more models `f`, `g`, etc, such that their outputs are
+    concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
+    '''
+    if not layers:
+        return noop()
+    drop_factor = kwargs.get('drop_factor', 1.0)
+    ops = layers[0].ops
+    layers = [chain(layer, flatten) for layer in layers]
+    concat = concatenate(*layers)
+    def concatenate_lists_fwd(Xs, drop=0.):
+        drop *= drop_factor
+        lengths = ops.asarray([len(X) for X in Xs], dtype='i')
+        flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
+        ys = ops.unflatten(flat_y, lengths)
+        def concatenate_lists_bwd(d_ys, sgd=None):
+            return bp_flat_y(ops.flatten(d_ys), sgd=sgd)
+        return ys, concatenate_lists_bwd
+    model = wrap(concatenate_lists_fwd, concat)
+    return model
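
The logistic layer added above clips its input to the range (-10, 10) before applying the sigmoid, and backpropagates through Y * (1 - Y). The same forward/backward arithmetic in plain NumPy, outside of thinc's layer machinery (a sketch, not the library code):

import numpy as np

def logistic_forward(X):
    X = np.clip(X, -10., 10.)           # avoid overflow in exp()
    Y = 1. / (1. + np.exp(-X))
    def logistic_backward(dY):
        return dY * (Y * (1. - Y))      # sigmoid gradient
    return Y, logistic_backward

Y, backward = logistic_forward(np.array([-20., 0., 3.]))
print(Y)                     # approx. [4.5e-05, 0.5, 0.953]
print(backward(np.ones(3)))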

@@ -3,7 +3,7 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

 __title__ = 'spacy-nightly'
-__version__ = '2.0.0a1'
+__version__ = '2.0.0a13'
 __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
 __author__ = 'Explosion AI'

@@ -2,5 +2,7 @@ from .download import download
 from .info import info
 from .link import link
 from .package import package
+from .profile import profile
 from .train import train
 from .convert import convert
+from .model import model

@@ -21,10 +21,10 @@ CONVERTERS = {
 @plac.annotations(
     input_file=("input file", "positional", None, str),
     output_dir=("output directory for converted file", "positional", None, str),
-    n_sents=("Number of sentences per doc", "option", "n", float),
+    n_sents=("Number of sentences per doc", "option", "n", int),
     morphology=("Enable appending morphology to tags", "flag", "m", bool)
 )
-def convert(cmd, input_file, output_dir, n_sents, morphology):
+def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
     """
     Convert files into JSON format for use with train command and other
     experiment management functions.

@@ -73,10 +73,10 @@ def generate_sentence(sent):
     tokens = []
     for i, id in enumerate(id_):
         token = {}
-        token["orth"] = word[id]
-        token["tag"] = tag[id]
-        token["head"] = head[id] - i
-        token["dep"] = dep[id]
+        token["orth"] = word[i]
+        token["tag"] = tag[i]
+        token["head"] = head[i] - id
+        token["dep"] = dep[i]
         tokens.append(token)
     sentence["tokens"] = tokens
     return sentence
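
The fix above indexes the word, tag and dep columns by the loop position i and stores each head as an offset relative to the token's own CoNLL id. A small standalone sketch of that offset arithmetic on toy data (not the converter itself):

# Toy CoNLL-style columns: token ids, surface forms, absolute head ids.
id_ = [1, 2, 3]
word = ['She', 'likes', 'cats']
head = [2, 2, 2]              # every token's head is 'likes' (id 2)

tokens = []
for i, id in enumerate(id_):
    tokens.append({
        'orth': word[i],
        'head': head[i] - id,  # relative offset; 0 means the token heads itself
    })
print([t['head'] for t in tokens])   # [1, 0, -1]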

@@ -8,7 +8,7 @@ import subprocess
 import sys

 from .link import link
-from ..util import prints
+from ..util import prints, get_package_path
 from .. import about


@@ -24,15 +24,20 @@ def download(cmd, model, direct=False):
     with version.
     """
     if direct:
-        download_model('{m}/{m}.tar.gz'.format(m=model))
+        dl = download_model('{m}/{m}.tar.gz'.format(m=model))
     else:
         shortcuts = get_json(about.__shortcuts__, "available shortcuts")
         model_name = shortcuts.get(model, model)
         compatibility = get_compatibility()
         version = get_version(model_name, compatibility)
-        download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
+        dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
+        if dl == 0:
             try:
-                link(None, model_name, model, force=True)
+                # Get package path here because link uses
+                # pip.get_installed_distributions() to check if model is a package,
+                # which fails if model was just installed via subprocess
+                package_path = get_package_path(model_name)
+                link(None, model_name, model, force=True, model_path=package_path)
             except:
                 # Dirty, but since spacy.download and the auto-linking is mostly
                 # a convenience wrapper, it's best to show a success message and

@@ -73,6 +78,6 @@ def get_version(model, comp):

 def download_model(filename):
     download_url = about.__download_url__ + '/' + filename
-    subprocess.call([sys.executable, '-m',
+    return subprocess.call([sys.executable, '-m',
         'pip', 'install', '--no-cache-dir', download_url],
         env=os.environ.copy())

@@ -14,7 +14,7 @@ from .. import util
     link_name=("name of shortuct link to create", "positional", None, str),
     force=("force overwriting of existing link", "flag", "f", bool)
 )
-def link(cmd, origin, link_name, force=False):
+def link(cmd, origin, link_name, force=False, model_path=None):
     """
     Create a symlink for models within the spacy/data directory. Accepts
     either the name of a pip package, or the local path to the model data

@@ -23,7 +23,7 @@ def link(cmd, origin, link_name, force=False):
     if util.is_package(origin):
         model_path = util.get_package_path(origin)
     else:
-        model_path = Path(origin)
+        model_path = Path(origin) if model_path is None else Path(model_path)
     if not model_path.exists():
         prints("The data should be located in %s" % path2str(model_path),
                title="Can't locate model data", exits=1)

spacy/cli/model.py | 137 (new file)

# coding: utf8
from __future__ import unicode_literals

import bz2
import gzip
import math
from ast import literal_eval
from pathlib import Path

import numpy as np
import spacy
from preshed.counter import PreshCounter

from .. import util
from ..compat import fix_text


def model(cmd, lang, model_dir, freqs_data, clusters_data, vectors_data,
          min_doc_freq=5, min_word_freq=200):
    model_path = Path(model_dir)
    freqs_path = Path(freqs_data)
    clusters_path = Path(clusters_data) if clusters_data else None
    vectors_path = Path(vectors_data) if vectors_data else None

    check_dirs(freqs_path, clusters_path, vectors_path)
    vocab = util.get_lang_class(lang).Defaults.create_vocab()
    nlp = spacy.blank(lang)
    vocab = nlp.vocab
    probs, oov_prob = read_probs(
        freqs_path, min_doc_freq=int(min_doc_freq), min_freq=int(min_doc_freq))
    clusters = read_clusters(clusters_path) if clusters_path else {}
    populate_vocab(vocab, clusters, probs, oov_prob)
    add_vectors(vocab, vectors_path)
    create_model(model_path, nlp)


def add_vectors(vocab, vectors_path):
    with bz2.BZ2File(vectors_path.as_posix()) as f:
        num_words, dim = next(f).split()
        vocab.clear_vectors(int(dim))
        for line in f:
            word_w_vector = line.decode("utf8").strip().split(" ")
            word = word_w_vector[0]
            vector = np.array([float(val) for val in word_w_vector[1:]])
            if word in vocab:
                vocab.set_vector(word, vector)


def create_model(model_path, model):
    if not model_path.exists():
        model_path.mkdir()
    model.to_disk(model_path.as_posix())


def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
    counts = PreshCounter()
    total = 0
    freqs_file = check_unzip(freqs_path)
    for i, line in enumerate(freqs_file):
        freq, doc_freq, key = line.rstrip().split('\t', 2)
        freq = int(freq)
        counts.inc(i + 1, freq)
        total += freq
    counts.smooth()
    log_total = math.log(total)
    freqs_file = check_unzip(freqs_path)
    probs = {}
    for line in freqs_file:
        freq, doc_freq, key = line.rstrip().split('\t', 2)
        doc_freq = int(doc_freq)
        freq = int(freq)
        if doc_freq >= min_doc_freq and freq >= min_freq and len(
                key) < max_length:
            word = literal_eval(key)
            smooth_count = counts.smoother(int(freq))
            probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob


def read_clusters(clusters_path):
    clusters = {}
    with clusters_path.open() as f:
        for line in f:
            try:
                cluster, word, freq = line.split()
                word = fix_text(word)
            except ValueError:
                continue
            # If the clusterer has only seen the word a few times, its
            # cluster is unreliable.
            if int(freq) >= 3:
                clusters[word] = cluster
            else:
                clusters[word] = '0'
    # Expand clusters with re-casing
    for word, cluster in list(clusters.items()):
        if word.lower() not in clusters:
            clusters[word.lower()] = cluster
        if word.title() not in clusters:
            clusters[word.title()] = cluster
        if word.upper() not in clusters:
            clusters[word.upper()] = cluster
    return clusters


def populate_vocab(vocab, clusters, probs, oov_prob):
    for word, prob in reversed(
            sorted(list(probs.items()), key=lambda item: item[1])):
        lexeme = vocab[word]
        lexeme.prob = prob
        lexeme.is_oov = False
        # Decode as a little-endian string, so that we can do & 15 to get
        # the first 4 bits. See _parse_features.pyx
        if word in clusters:
            lexeme.cluster = int(clusters[word][::-1], 2)
        else:
            lexeme.cluster = 0


def check_unzip(file_path):
    file_path_str = file_path.as_posix()
    if file_path_str.endswith('gz'):
        return gzip.open(file_path_str)
    else:
        return file_path.open()


def check_dirs(freqs_data, clusters_data, vectors_data):
    if not freqs_data.is_file():
        util.sys_exit(freqs_data.as_posix(), title="No frequencies file found")
    if clusters_data and not clusters_data.is_file():
        util.sys_exit(
            clusters_data.as_posix(), title="No Brown clusters file found")
    if vectors_data and not vectors_data.is_file():
        util.sys_exit(
            vectors_data.as_posix(), title="No word vectors file found")
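
read_probs() above turns raw corpus counts into smoothed log probabilities: every kept word gets log(smoothed_count) - log(total), and the out-of-vocabulary probability comes from the smoother's estimate for a count of zero. A simplified sketch with plain dicts, where add-one smoothing stands in for PreshCounter's real smoother:

import math

# Toy frequency table: word -> raw corpus count.
freqs = {'the': 60000, 'cat': 300, 'zyzzyva': 1}
total = sum(freqs.values())
log_total = math.log(total)

def smooth(count):
    # Placeholder; PreshCounter.smoother() uses a frequency-of-frequencies
    # estimate rather than simple add-one smoothing.
    return count + 1.0

probs = {word: math.log(smooth(count)) - log_total
         for word, count in freqs.items()}
oov_prob = math.log(smooth(0)) - log_total   # log probability for unseen words
print(probs['the'], oov_prob)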

@@ -15,10 +15,11 @@ from .. import about
 @plac.annotations(
     input_dir=("directory with model data", "positional", None, str),
     output_dir=("output parent directory", "positional", None, str),
-    meta=("path to meta.json", "option", "m", str),
+    meta_path=("path to meta.json", "option", "m", str),
+    create_meta=("create meta.json, even if one exists in directory", "flag", "c", bool),
     force=("force overwriting of existing folder in output directory", "flag", "f", bool)
 )
-def package(cmd, input_dir, output_dir, meta=None, force=False):
+def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False):
     """
     Generate Python package for model data, including meta and required
     installation files. A new directory will be created in the specified

@@ -26,7 +27,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False):
     """
     input_path = util.ensure_path(input_dir)
     output_path = util.ensure_path(output_dir)
-    meta_path = util.ensure_path(meta)
+    meta_path = util.ensure_path(meta_path)
     if not input_path or not input_path.exists():
         prints(input_path, title="Model directory not found", exits=1)
     if not output_path or not output_path.exists():

@@ -38,7 +39,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False):
     template_manifest = get_template('MANIFEST.in')
     template_init = get_template('xx_model_name/__init__.py')
     meta_path = meta_path or input_path / 'meta.json'
-    if meta_path.is_file():
+    if not create_meta and meta_path.is_file():
         prints(meta_path, title="Reading meta.json from file")
         meta = util.read_json(meta_path)
     else:

@@ -100,7 +101,7 @@ def generate_meta():
 def generate_pipeline():
     prints("If set to 'True', the default pipeline is used. If set to 'False', "
            "the pipeline will be disabled. Components should be specified as a "
-           "comma-separated list of component names, e.g. vectorizer, tagger, "
+           "comma-separated list of component names, e.g. tensorizer, tagger, "
           "parser, ner. For more information, see the docs on processing pipelines.",
           title="Enter your model's pipeline components")
     pipeline = util.get_raw_input("Pipeline components", True)

spacy/cli/profile.py | 45 (new file)

# coding: utf8
from __future__ import unicode_literals, division, print_function

import plac
from pathlib import Path
import ujson
import cProfile
import pstats

import spacy
import sys
import tqdm
import cytoolz


def read_inputs(loc):
    if loc is None:
        file_ = sys.stdin
        file_ = (line.encode('utf8') for line in file_)
    else:
        file_ = Path(loc).open()
    for line in file_:
        data = ujson.loads(line)
        text = data['text']
        yield text


@plac.annotations(
    lang=("model/language", "positional", None, str),
    inputs=("Location of input file", "positional", None, read_inputs)
)
def profile(cmd, lang, inputs=None):
    """
    Profile a spaCy pipeline, to find out which functions take the most time.
    """
    nlp = spacy.load(lang)
    texts = list(cytoolz.take(10000, inputs))
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
    s = pstats.Stats("Profile.prof")
    s.strip_dirs().sort_stats("time").print_stats()


def parse_texts(nlp, texts):
    for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=128):
        pass

@@ -32,10 +32,12 @@ from ..compat import json_dumps
     resume=("Whether to resume training", "flag", "R", bool),
     no_tagger=("Don't train tagger", "flag", "T", bool),
     no_parser=("Don't train parser", "flag", "P", bool),
-    no_entities=("Don't train NER", "flag", "N", bool)
+    no_entities=("Don't train NER", "flag", "N", bool),
+    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
 )
 def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
-          use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False):
+          use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False,
+          gold_preproc=False):
     """
     Train a model. Expects data in spaCy's JSON format.
     """

@@ -69,7 +71,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                                    util.env_opt('batch_to', 64),
                                    util.env_opt('batch_compound', 1.001))
     gold_preproc = util.env_opt('gold_preproc', False)
-    noise_level = util.env_opt('noise_level', 0.25)
+    noise_level = util.env_opt('noise_level', 0.0)

     if resume:
         prints(output_path / 'model19.pickle', title="Resuming training")

@@ -95,15 +97,14 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
             for batch in minibatch(train_docs, size=batch_sizes):
                 docs, golds = zip(*batch)
                 nlp.update(docs, golds, sgd=optimizer,
-                           drop=next(dropout_rates), losses=losses)
+                           drop=next(dropout_rates), losses=losses,
+                           update_shared=True)
                 pbar.update(sum(len(doc) for doc in docs))

         with nlp.use_params(optimizer.averages):
             util.set_env_log(False)
             epoch_model_path = output_path / ('model%d' % i)
             nlp.to_disk(epoch_model_path)
-            with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
-                dill.dump(nlp, file_, -1)
             nlp_loaded = lang_class(pipeline=pipeline)
             nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
             scorer = nlp_loaded.evaluate(
@@ -5,6 +5,7 @@ import six
 import ftfy
 import sys
 import ujson
+import itertools

 from thinc.neural.util import copy_array

@@ -35,6 +36,7 @@ CudaStream = CudaStream
 cupy = cupy
 fix_text = ftfy.fix_text
 copy_array = copy_array
+izip = getattr(itertools, 'izip', zip)

 is_python2 = six.PY2
 is_python3 = six.PY3

@@ -44,21 +46,31 @@ is_osx = sys.platform == 'darwin'


 if is_python2:
+    import imp
     bytes_ = str
     unicode_ = unicode
     basestring_ = basestring
     input_ = raw_input
-    json_dumps = lambda data: ujson.dumps(data, indent=2).decode('utf8')
+    json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False).decode('utf8')
     path2str = lambda path: str(path).decode('utf8')

 elif is_python3:
+    import importlib.util
     bytes_ = bytes
     unicode_ = str
     basestring_ = str
     input_ = input
-    json_dumps = lambda data: ujson.dumps(data, indent=2)
+    json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False)
     path2str = lambda path: str(path)


+def b_to_str(b_str):
+    if is_python2:
+        return b_str
+    # important: if no encoding is set, string becomes "b'...'"
+    return str(b_str, encoding='utf8')
+
+
 def getattr_(obj, name, *default):
     if is_python3 and isinstance(name, bytes):
         name = name.decode('utf8')

@@ -92,3 +104,12 @@ def normalize_string_keys(old):
     return new


+def import_file(name, loc):
+    loc = str(loc)
+    if is_python2:
+        return imp.load_source(name, loc)
+    else:
+        spec = importlib.util.spec_from_file_location(name, str(loc))
+        module = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(module)
+        return module
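For context (not part of the commit), a minimal sketch of how the helpers added to the compat module behave on either Python version; the module path passed to import_file is hypothetical.

    from spacy.compat import b_to_str, izip, json_dumps, import_file

    assert b_to_str(b'200 OK') == '200 OK'          # native str on Python 2 and 3
    pairs = list(izip([1, 2], ['a', 'b']))          # itertools.izip on 2, zip on 3
    print(json_dumps({'url': 'https://spacy.io'}))  # forward slashes are no longer escaped
    # module = import_file('my_module', '/tmp/my_module.py')  # load a module from a file path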
@@ -15,7 +15,7 @@ def depr_model_download(lang):
     lang (unicode): Language shortcut, 'en' or 'de'.
     """
     prints("The spacy.%s.download command is now deprecated. Please use "
-           "python -m spacy download [model name or shortcut] instead. For "
+           "spacy download [model name or shortcut] instead. For "
           "more info, see the documentation:" % lang,
           about.__docs_models__,
          "Downloading default '%s' model now..." % lang,
@@ -3,6 +3,7 @@ from __future__ import unicode_literals

 from .render import DependencyRenderer, EntityRenderer
 from ..tokens import Doc
+from ..compat import b_to_str
 from ..util import prints, is_in_jupyter


@@ -65,7 +66,9 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,


     def app(environ, start_response):
-        start_response('200 OK', [('Content-type', 'text/html; charset=utf-8')])
+        # headers and status need to be bytes in Python 2, see #1227
+        headers = [(b_to_str(b'Content-type'), b_to_str(b'text/html; charset=utf-8'))]
+        start_response(b_to_str(b'200 OK'), headers)
         res = _html['parsed'].encode(encoding='utf-8')
         return [res]

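Usage sketch (not part of the commit): displacy.serve() is the entry point that wraps the app() WSGI callable patched above, so the bytes-to-str conversion only matters when serving under Python 2. Any model with a parser will do; 'en' is just an example.

    import spacy
    from spacy import displacy

    nlp = spacy.load('en')
    doc = nlp(u'This is a sentence.')
    displacy.serve(doc, style='dep')   # serves the visualisation, by default on port 5000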
@@ -60,7 +60,7 @@ GLOSSARY = {
     'JJR': 'adjective, comparative',
     'JJS': 'adjective, superlative',
     'LS': 'list item marker',
-    'MD': 'verb, modal auxillary',
+    'MD': 'verb, modal auxiliary',
     'NIL': 'missing tag',
     'NN': 'noun, singular or mass',
     'NNP': 'noun, proper singular',

@@ -91,7 +91,7 @@ GLOSSARY = {
     'NFP': 'superfluous punctuation',
     'GW': 'additional word in multi-word expression',
     'XX': 'unknown',
-    'BES': 'auxillary "be"',
+    'BES': 'auxiliary "be"',
     'HVS': 'forms of "have"',

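The two corrected entries are easiest to check by looking them up in the GLOSSARY mapping directly (a sketch, not part of the commit; the spacy.glossary module path is assumed):

    from spacy.glossary import GLOSSARY

    print(GLOSSARY['MD'])    # 'verb, modal auxiliary'
    print(GLOSSARY['BES'])   # 'auxiliary "be"'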
@@ -9,6 +9,7 @@ cdef struct GoldParseC:
     int* tags
     int* heads
     int* has_dep
+    int* sent_start
     attr_t* labels
     int** brackets
     Transition* ner

@@ -29,6 +30,7 @@ cdef class GoldParse:
     cdef public list ner
     cdef public list ents
     cdef public dict brackets
+    cdef public object cats

     cdef readonly list cand_to_gold
     cdef readonly list gold_to_cand
@@ -381,7 +381,8 @@ cdef class GoldParse:
                          make_projective=make_projective)

     def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
-                 deps=None, entities=None, make_projective=False):
+                 deps=None, entities=None, make_projective=False,
+                 cats=tuple()):
         """Create a GoldParse.

         doc (Doc): The document the annotations refer to.

@@ -392,6 +393,12 @@ cdef class GoldParse:
         entities (iterable): A sequence of named entity annotations, either as
             BILUO tag strings, or as `(start_char, end_char, label)` tuples,
             representing the entity positions.
+        cats (iterable): A sequence of labels for text classification. Each
+            label may be a string or an int, or a `(start_char, end_char, label)`
+            tuple, indicating that the label is applied to only part of the
+            document (usually a sentence). Unlike entity annotations, label
+            annotations can overlap, i.e. a single word can be covered by
+            multiple labelled spans.
         RETURNS (GoldParse): The newly constructed object.
         """
         if words is None:

@@ -399,11 +406,11 @@ cdef class GoldParse:
         if tags is None:
             tags = [None for _ in doc]
         if heads is None:
-            heads = [token.i for token in doc]
+            heads = [None for token in doc]
         if deps is None:
             deps = [None for _ in doc]
         if entities is None:
-            entities = ['-' for _ in doc]
+            entities = [None for _ in doc]
         elif len(entities) == 0:
             entities = ['O' for _ in doc]
         elif not isinstance(entities[0], basestring):

@@ -419,8 +426,10 @@ cdef class GoldParse:
         self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
         self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
         self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
+        self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
         self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))

+        self.cats = list(cats)
        self.words = [None] * len(doc)
        self.tags = [None] * len(doc)
        self.heads = [None] * len(doc)

@@ -474,8 +483,12 @@ cdef class GoldParse:
         """
         return not nonproj.is_nonproj_tree(self.heads)

+    @property
+    def sent_starts(self):
+        return [self.c.sent_start[i] for i in range(self.length)]


-def biluo_tags_from_offsets(doc, entities):
+def biluo_tags_from_offsets(doc, entities, missing='O'):
     """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
     scheme (BILUO).

@@ -527,7 +540,7 @@ def biluo_tags_from_offsets(doc, entities):
         if i in entity_chars:
             break
     else:
-        biluo[token.i] = 'O'
+        biluo[token.i] = missing
     return biluo

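A short sketch (not part of the commit) of the two API changes above: biluo_tags_from_offsets() can now mark unannotated tokens as missing instead of forcing 'O', and GoldParse grows a cats argument for text-classification labels. nlp is assumed to be any loaded pipeline; the label 'TRAVEL' is arbitrary.

    from spacy.gold import GoldParse, biluo_tags_from_offsets

    doc = nlp(u'I like London')
    entities = [(7, 13, 'LOC')]                      # character offsets into the text

    tags = biluo_tags_from_offsets(doc, entities, missing=None)  # None for unannotated tokens
    gold = GoldParse(doc, entities=entities, cats=['TRAVEL'])
    assert gold.cats == ['TRAVEL']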
@@ -27,7 +27,7 @@ ALPHA_UPPER = merge_char_classes(_upper + _uncased)

 _units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft '
           'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
-          'TB T G M K')
+          'TB T G M K %')
 _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
 _punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
 _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «'
spacy/lang/da/examples.py (new file, 18 lines)
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.da.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple overvejer at købe et britisk statup for 1 milliard dollar",
+    "Selvkørende biler flytter forsikringsansvaret over på producenterne",
+    "San Francisco overvejer at forbyde leverandørrobotter på fortov",
+    "London er en stor by i Storbritannien"
+]
spacy/lang/de/examples.py (new file, 22 lines)
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.de.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen",
+    "Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz",
+    "Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz",
+    "Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion",
+    "San Francisco erwägt Verbot von Lieferrobotern",
+    "Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller",
+    "Wo bist du?",
+    "Was ist die Hauptstadt von Deutschland?"
+]
spacy/lang/en/examples.py (new file, 22 lines)
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.en.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple is looking at buying U.K. startup for $1 billion",
+    "Autonomous cars shift insurance liability toward manufacturers",
+    "San Francisco considers banning sidewalk delivery robots",
+    "London is a big city in the United Kingdom.",
+    "Where are you?",
+    "Who is the president of France?",
+    "What is the capital of the United States?",
+    "When was Barack Obama born?"
+]
@@ -59,7 +59,8 @@ MORPH_RULES = {

     "VBP": {
         "are": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
-        "'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}
+        "'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
+        "am": {LEMMA: "be", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind"},
     },

     "VBD": {
@@ -232,7 +232,10 @@ for verb_data in [
     {ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2},
     {ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"},
     {ORTH: "was", LEMMA: "be", NORM: "was"},
-    {ORTH: "were", LEMMA: "be", NORM: "were"}]:
+    {ORTH: "were", LEMMA: "be", NORM: "were"},
+    {ORTH: "have", NORM: "have"},
+    {ORTH: "has", LEMMA: "have", NORM: "has"},
+    {ORTH: "dare", NORM: "dare"}]:
     verb_data_tc = dict(verb_data)
     verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
     for data in [verb_data, verb_data_tc]:
spacy/lang/es/examples.py (new file, 22 lines)
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.es.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares",
+    "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes",
+    "San Francisco analiza prohibir los robots delivery",
+    "Londres es una gran ciudad del Reino Unido",
+    "El gato come pescado",
+    "Veo al hombre con el telescopio",
+    "La araña come moscas",
+    "El pingüino incuba en su nido"
+]
@@ -5,6 +5,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from .lemmatizer import LOOKUP
+from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS

@@ -24,6 +25,7 @@ class FrenchDefaults(Language.Defaults):
     infixes = tuple(TOKENIZER_INFIXES)
     suffixes = tuple(TOKENIZER_SUFFIXES)
     token_match = TOKEN_MATCH
+    syntax_iterators = dict(SYNTAX_ITERATORS)

     @classmethod
     def create_lemmatizer(cls, nlp=None):
(One file's diff is suppressed because it is too large.)

spacy/lang/fr/examples.py (new file, 26 lines)
@@ -0,0 +1,26 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.fr.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple cherche a acheter une startup anglaise pour 1 milliard de dollard",
+    "Les voitures autonomes voient leur assurances décalées vers les constructeurs",
+    "San Francisco envisage d'interdire les robots coursiers",
+    "Londres est une grande ville du Royaume-Uni",
+    "L’Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d’Europe",
+    "Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon",
+    "La France ne devrait pas manquer d'électricité cet été, même en cas de canicule",
+    "Nouvelles attaques de Trump contre le maire de Londres",
+    "Où es-tu ?",
+    "Qui est le président de la France ?",
+    "Où est la capitale des Etats-Unis ?",
+    "Quand est né Barack Obama ?"
+]
spacy/lang/fr/syntax_iterators.py (new file, 42 lines)
@@ -0,0 +1,42 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import NOUN, PROPN, PRON
+
+
+def noun_chunks(obj):
+    """
+    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
+    """
+    labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
+    doc = obj.doc  # Ensure works on both Doc and Span.
+    np_deps = [doc.vocab.strings[label] for label in labels]
+    conj = doc.vocab.strings.add('conj')
+    np_label = doc.vocab.strings.add('NP')
+    seen = set()
+    for i, word in enumerate(obj):
+        if word.pos not in (NOUN, PROPN, PRON):
+            continue
+        # Prevent nested chunks from being produced
+        if word.i in seen:
+            continue
+        if word.dep in np_deps:
+            if any(w.i in seen for w in word.subtree):
+                continue
+            seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
+            yield word.left_edge.i, word.right_edge.i+1, np_label
+        elif word.dep == conj:
+            head = word.head
+            while head.dep == conj and head.head.i < head.i:
+                head = head.head
+            # If the head is an NP, and we're coordinated to it, we're an NP
+            if head.dep in np_deps:
+                if any(w.i in seen for w in word.subtree):
+                    continue
+                seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
+                yield word.left_edge.i, word.right_edge.i+1, np_label
+
+
+SYNTAX_ITERATORS = {
+    'noun_chunks': noun_chunks
+}
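Usage sketch (not part of the commit): once SYNTAX_ITERATORS is registered on FrenchDefaults (see the fr/__init__.py hunk above), Doc.noun_chunks picks up this iterator. A French model with a parser is assumed; the output depends on the parse.

    import spacy

    nlp = spacy.load('fr')
    doc = nlp(u"La France ne devrait pas manquer d'électricité cet été")
    for chunk in doc.noun_chunks:
        print(chunk.text)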
spacy/lang/he/examples.py (new file, 28 lines)
@@ -0,0 +1,28 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.he.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    'סין מקימה קרן של 440 מיליון דולר להשקעה בהייטק בישראל',
+    'רה"מ הודיע כי יחרים טקס בחסותו',
+    'הכנסת צפויה לאשר איכון אוטומטי של שיחות למוקד 100',
+    'תוכנית לאומית תהפוך את ישראל למעצמה דיגיטלית',
+    'סע לשלום, המפתחות בפנים.',
+    'מלצר, פעמיים טורקי!',
+    'ואהבת לרעך כמוך.',
+    'היום נעשה משהו בלתי נשכח.',
+    'איפה הילד?',
+    'מיהו נשיא צרפת?',
+    'מהי בירת ארצות הברית?',
+    "איך קוראים בעברית לצ'ופצ'יק של הקומקום?",
+    'מה הייתה הדקה?',
+    'מי אומר שלום ראשון, זה שעולה או זה שיורד?'
+]
spacy/lang/id/__init__.py (new file, 42 lines)
@@ -0,0 +1,42 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .stop_words import STOP_WORDS
+from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .norm_exceptions import NORM_EXCEPTIONS
+from .lemmatizer import LOOKUP
+from .lex_attrs import LEX_ATTRS
+from .syntax_iterators import SYNTAX_ITERATORS
+
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...language import Language
+from ...lemmatizerlookup import Lemmatizer
+from ...attrs import LANG
+from ...util import update_exc
+
+
+class IndonesianDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'id'
+
+    lex_attr_getters.update(LEX_ATTRS)
+
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+    prefixes = tuple(TOKENIZER_PREFIXES)
+    suffixes = tuple(TOKENIZER_SUFFIXES)
+    infixes = tuple(TOKENIZER_INFIXES)
+    syntax_iterators = dict(SYNTAX_ITERATORS)
+
+    @classmethod
+    def create_lemmatizer(cls, nlp=None):
+        return Lemmatizer(LOOKUP)
+
+
+class Indonesian(Language):
+    lang = 'id'
+    Defaults = IndonesianDefaults
+
+
+__all__ = ['Indonesian']
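A minimal sketch (not part of the commit) of using the new Indonesian language class; with no model loaded it only tokenizes, which is enough to exercise the stop words and lexical attributes shipped here.

    from spacy.lang.id import Indonesian

    nlp = Indonesian()
    doc = nlp(u'Jakarta adalah kota besar yang nyaris tidak pernah tidur.')
    print([token.text for token in doc])
    print(nlp.Defaults.stop_words & {'adalah', 'yang'})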
spacy/lang/id/_tokenizer_exceptions_list.py (new file, 3833 lines)
(Diff suppressed because it is too large.)
spacy/lang/id/examples.py (new file, 22 lines)
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.en.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Al Qaidah mengklaim bom mobil yang menewaskan 60 Orang di Mali",
+    "Abu Sayyaf mengeksekusi sandera warga Filipina",
+    "Penyaluran pupuk berasal dari lima lokasi yakni Bontang, Kalimantan Timur, Surabaya, Banyuwangi, Semarang, dan Makassar.",
+    "PT Pupuk Kaltim telah menyalurkan 274.707 ton pupuk bersubsidi ke wilayah penyaluran di 14 provinsi.",
+    "Jakarta adalah kota besar yang nyaris tidak pernah tidur."
+    "Kamu ada di mana semalam?",
+    "Siapa yang membeli makanan ringan tersebut?",
+    "Siapa presiden pertama Republik Indonesia?"
+]
spacy/lang/id/lemmatizer.py (new file, 36883 lines)
(Diff suppressed because it is too large.)
spacy/lang/id/lex_attrs.py (new file, 42 lines)
@@ -0,0 +1,42 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...attrs import LIKE_NUM
+
+
+_num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
+              'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen',
+              'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty',
+              'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety',
+              'hundred', 'thousand', 'million', 'billion', 'trillion', 'quadrillion',
+              'gajillion', 'bazillion',
+              'nol', 'satu', 'dua', 'tiga', 'empat', 'lima', 'enam', 'tujuh',
+              'delapan', 'sembilan', 'sepuluh', 'sebelas', 'duabelas', 'tigabelas',
+              'empatbelas', 'limabelas', 'enambelas', 'tujuhbelas', 'delapanbelas',
+              'sembilanbelas', 'duapuluh', 'seratus', 'seribu', 'sejuta',
+              'ribu', 'rb', 'juta', 'jt', 'miliar', 'biliun', 'triliun',
+              'kuadriliun', 'kuintiliun', 'sekstiliun', 'septiliun', 'oktiliun',
+              'noniliun', 'desiliun',
+              ]
+
+
+def like_num(text):
+    text = text.replace(',', '').replace('.', '')
+    if text.isdigit():
+        return True
+    if text.count('/') == 1:
+        num, denom = text.split('/')
+        if num.isdigit() and denom.isdigit():
+            return True
+    if text in _num_words:
+        return True
+    if text.count('-') == 1:
+        _, num = text.split('-')
+        if num.isdigit() or num in _num_words:
+            return True
+    return False
+
+
+LEX_ATTRS = {
+    LIKE_NUM: like_num
+}
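A few spot checks of the like_num() logic added above (a sketch, not part of the commit):

    from spacy.lang.id.lex_attrs import like_num

    assert like_num('3.000')       # separators are stripped before isdigit()
    assert like_num('sembilan')    # Indonesian number word
    assert like_num('ke-4')        # hyphenated form whose right-hand side is a digit
    assert not like_num('Jakarta')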
spacy/lang/id/norm_exceptions.py (new file, 17 lines)
@@ -0,0 +1,17 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+_exc = {
+    "Rp": "$",
+    "IDR": "$",
+    "RMB": "$",
+    "USD": "$",
+    "AUD": "$",
+    "GBP": "$",
+}
+
+NORM_EXCEPTIONS = {}
+
+for string, norm in _exc.items():
+    NORM_EXCEPTIONS[string] = norm
+    NORM_EXCEPTIONS[string.title()] = norm
spacy/lang/id/punctuation.py (new file, 53 lines)
@@ -0,0 +1,53 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from ..char_classes import merge_chars, split_chars, _currency, _units
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
+from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER, HYPHENS
+
+_units = (_units + 's bit Gbps Mbps mbps Kbps kbps ƒ ppi px '
+          'Hz kHz MHz GHz mAh '
+          'ratus rb ribu ribuan '
+          'juta jt jutaan mill?iar million bil[l]?iun bilyun billion '
+          )
+_currency = (_currency + r' USD Rp IDR RMB SGD S\$')
+_months = ('Januari Februari Maret April Mei Juni Juli Agustus September '
+           'Oktober November Desember January February March May June '
+           'July August October December Jan Feb Mar Jun Jul Aug Sept '
+           'Oct Okt Nov Des ')
+
+
+UNITS = merge_chars(_units)
+CURRENCY = merge_chars(_currency)
+HTML_PREFIX = r'<(b|strong|i|em|p|span|div|br)\s?/>|<a([^>]+)>'
+HTML_SUFFIX = r'</(b|strong|i|em|p|span|div|a)>'
+MONTHS = merge_chars(_months)
+LIST_CURRENCY = split_chars(_currency)
+
+TOKENIZER_PREFIXES.remove('#')  # hashtag
+_prefixes = TOKENIZER_PREFIXES + LIST_CURRENCY + [HTML_PREFIX] + ['/', '—']
+
+_suffixes = TOKENIZER_SUFFIXES + [r'\-[Nn]ya', '-[KkMm]u', '[—-]'] + [
+    r'(?<={c})(?:[0-9]+)'.format(c=CURRENCY),
+    r'(?<=[0-9])(?:{u})'.format(u=UNITS),
+    r'(?<=[0-9])%',
+    r'(?<=[0-9{a}]{h})(?:[\.,:-])'.format(a=ALPHA, h=HTML_SUFFIX),
+    r'(?<=[0-9{a}])(?:{h})'.format(a=ALPHA, h=HTML_SUFFIX),
+]
+
+_infixes = TOKENIZER_INFIXES + [
+    r'(?<=[0-9])[\\/](?=[0-9%-])',
+    r'(?<=[0-9])%(?=[{a}0-9/])'.format(a=ALPHA),
+    r'(?<={u})[\/-](?=[0-9])'.format(u=UNITS),
+    r'(?<={m})[\/-](?=[0-9])'.format(m=MONTHS),
+    r'(?<=[0-9\)][\.,])"(?=[0-9])',
+    r'(?<=[{a}\)][\.,\'])["—](?=[{a}])'.format(a=ALPHA),
+    r'(?<=[{a}])-(?=[0-9])'.format(a=ALPHA),
+    r'(?<=[0-9])-(?=[{a}])'.format(a=ALPHA),
+    r'(?<=[{a}])[\/-](?={c}{a})'.format(a=ALPHA, c=CURRENCY),
+]
+
+TOKENIZER_PREFIXES = _prefixes
+TOKENIZER_SUFFIXES = _suffixes
+TOKENIZER_INFIXES = _infixes
763
spacy/lang/id/stop_words.py
Normal file
763
spacy/lang/id/stop_words.py
Normal file
|
@ -0,0 +1,763 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
STOP_WORDS = set("""
|
||||||
|
ada
|
||||||
|
adalah
|
||||||
|
adanya
|
||||||
|
adapun
|
||||||
|
agak
|
||||||
|
agaknya
|
||||||
|
agar
|
||||||
|
akan
|
||||||
|
akankah
|
||||||
|
akhir
|
||||||
|
akhiri
|
||||||
|
akhirnya
|
||||||
|
aku
|
||||||
|
akulah
|
||||||
|
amat
|
||||||
|
amatlah
|
||||||
|
anda
|
||||||
|
andalah
|
||||||
|
antar
|
||||||
|
antara
|
||||||
|
antaranya
|
||||||
|
apa
|
||||||
|
apaan
|
||||||
|
apabila
|
||||||
|
apakah
|
||||||
|
apalagi
|
||||||
|
apatah
|
||||||
|
artinya
|
||||||
|
asal
|
||||||
|
asalkan
|
||||||
|
atas
|
||||||
|
atau
|
||||||
|
ataukah
|
||||||
|
ataupun
|
||||||
|
awal
|
||||||
|
awalnya
|
||||||
|
bagai
|
||||||
|
bagaikan
|
||||||
|
bagaimana
|
||||||
|
bagaimanakah
|
||||||
|
bagaimanapun
|
||||||
|
bagi
|
||||||
|
bagian
|
||||||
|
bahkan
|
||||||
|
bahwa
|
||||||
|
bahwasanya
|
||||||
|
baik
|
||||||
|
bakal
|
||||||
|
bakalan
|
||||||
|
balik
|
||||||
|
banyak
|
||||||
|
bapak
|
||||||
|
baru
|
||||||
|
bawah
|
||||||
|
beberapa
|
||||||
|
begini
|
||||||
|
beginian
|
||||||
|
beginikah
|
||||||
|
beginilah
|
||||||
|
begitu
|
||||||
|
begitukah
|
||||||
|
begitulah
|
||||||
|
begitupun
|
||||||
|
bekerja
|
||||||
|
belakang
|
||||||
|
belakangan
|
||||||
|
belum
|
||||||
|
belumlah
|
||||||
|
benar
|
||||||
|
benarkah
|
||||||
|
benarlah
|
||||||
|
berada
|
||||||
|
berakhir
|
||||||
|
berakhirlah
|
||||||
|
berakhirnya
|
||||||
|
berapa
|
||||||
|
berapakah
|
||||||
|
berapalah
|
||||||
|
berapapun
|
||||||
|
berarti
|
||||||
|
berawal
|
||||||
|
berbagai
|
||||||
|
berdatangan
|
||||||
|
beri
|
||||||
|
berikan
|
||||||
|
berikut
|
||||||
|
berikutnya
|
||||||
|
berjumlah
|
||||||
|
berkali-kali
|
||||||
|
berkata
|
||||||
|
berkehendak
|
||||||
|
berkeinginan
|
||||||
|
berkenaan
|
||||||
|
berlainan
|
||||||
|
berlalu
|
||||||
|
berlangsung
|
||||||
|
berlebihan
|
||||||
|
bermacam
|
||||||
|
bermacam-macam
|
||||||
|
bermaksud
|
||||||
|
bermula
|
||||||
|
bersama
|
||||||
|
bersama-sama
|
||||||
|
bersiap
|
||||||
|
bersiap-siap
|
||||||
|
bertanya
|
||||||
|
bertanya-tanya
|
||||||
|
berturut
|
||||||
|
berturut-turut
|
||||||
|
bertutur
|
||||||
|
berujar
|
||||||
|
berupa
|
||||||
|
besar
|
||||||
|
betul
|
||||||
|
betulkah
|
||||||
|
biasa
|
||||||
|
biasanya
|
||||||
|
bila
|
||||||
|
bilakah
|
||||||
|
bisa
|
||||||
|
bisakah
|
||||||
|
boleh
|
||||||
|
bolehkah
|
||||||
|
bolehlah
|
||||||
|
buat
|
||||||
|
bukan
|
||||||
|
bukankah
|
||||||
|
bukanlah
|
||||||
|
bukannya
|
||||||
|
bulan
|
||||||
|
bung
|
||||||
|
cara
|
||||||
|
caranya
|
||||||
|
cukup
|
||||||
|
cukupkah
|
||||||
|
cukuplah
|
||||||
|
cuma
|
||||||
|
dahulu
|
||||||
|
dalam
|
||||||
|
dan
|
||||||
|
dapat
|
||||||
|
dari
|
||||||
|
daripada
|
||||||
|
datang
|
||||||
|
dekat
|
||||||
|
demi
|
||||||
|
demikian
|
||||||
|
demikianlah
|
||||||
|
dengan
|
||||||
|
depan
|
||||||
|
di
|
||||||
|
dia
|
||||||
|
diakhiri
|
||||||
|
diakhirinya
|
||||||
|
dialah
|
||||||
|
diantara
|
||||||
|
diantaranya
|
||||||
|
diberi
|
||||||
|
diberikan
|
||||||
|
diberikannya
|
||||||
|
dibuat
|
||||||
|
dibuatnya
|
||||||
|
didapat
|
||||||
|
didatangkan
|
||||||
|
digunakan
|
||||||
|
diibaratkan
|
||||||
|
diibaratkannya
|
||||||
|
diingat
|
||||||
|
diingatkan
|
||||||
|
diinginkan
|
||||||
|
dijawab
|
||||||
|
dijelaskan
|
||||||
|
dijelaskannya
|
||||||
|
dikarenakan
|
||||||
|
dikatakan
|
||||||
|
dikatakannya
|
||||||
|
dikerjakan
|
||||||
|
diketahui
|
||||||
|
diketahuinya
|
||||||
|
dikira
|
||||||
|
dilakukan
|
||||||
|
dilalui
|
||||||
|
dilihat
|
||||||
|
dimaksud
|
||||||
|
dimaksudkan
|
||||||
|
dimaksudkannya
|
||||||
|
dimaksudnya
|
||||||
|
diminta
|
||||||
|
dimintai
|
||||||
|
dimisalkan
|
||||||
|
dimulai
|
||||||
|
dimulailah
|
||||||
|
dimulainya
|
||||||
|
dimungkinkan
|
||||||
|
dini
|
||||||
|
dipastikan
|
||||||
|
diperbuat
|
||||||
|
diperbuatnya
|
||||||
|
dipergunakan
|
||||||
|
diperkirakan
|
||||||
|
diperlihatkan
|
||||||
|
diperlukan
|
||||||
|
diperlukannya
|
||||||
|
dipersoalkan
|
||||||
|
dipertanyakan
|
||||||
|
dipunyai
|
||||||
|
diri
|
||||||
|
dirinya
|
||||||
|
disampaikan
|
||||||
|
disebut
|
||||||
|
disebutkan
|
||||||
|
disebutkannya
|
||||||
|
disini
|
||||||
|
disinilah
|
||||||
|
ditambahkan
|
||||||
|
ditandaskan
|
||||||
|
ditanya
|
||||||
|
ditanyai
|
||||||
|
ditanyakan
|
||||||
|
ditegaskan
|
||||||
|
ditujukan
|
||||||
|
ditunjuk
|
||||||
|
ditunjuki
|
||||||
|
ditunjukkan
|
||||||
|
ditunjukkannya
|
||||||
|
ditunjuknya
|
||||||
|
dituturkan
|
||||||
|
dituturkannya
|
||||||
|
diucapkan
|
||||||
|
diucapkannya
|
||||||
|
diungkapkan
|
||||||
|
dong
|
||||||
|
dua
|
||||||
|
dulu
|
||||||
|
empat
|
||||||
|
enggak
|
||||||
|
enggaknya
|
||||||
|
entah
|
||||||
|
entahlah
|
||||||
|
guna
|
||||||
|
gunakan
|
||||||
|
hal
|
||||||
|
hampir
|
||||||
|
hanya
|
||||||
|
hanyalah
|
||||||
|
hari
|
||||||
|
harus
|
||||||
|
haruslah
|
||||||
|
harusnya
|
||||||
|
hendak
|
||||||
|
hendaklah
|
||||||
|
hendaknya
|
||||||
|
hingga
|
||||||
|
ia
|
||||||
|
ialah
|
||||||
|
ibarat
|
||||||
|
ibaratkan
|
||||||
|
ibaratnya
|
||||||
|
ibu
|
||||||
|
ikut
|
||||||
|
ingat
|
||||||
|
ingat-ingat
|
||||||
|
ingin
|
||||||
|
inginkah
|
||||||
|
inginkan
|
||||||
|
ini
|
||||||
|
inikah
|
||||||
|
inilah
|
||||||
|
itu
|
||||||
|
itukah
|
||||||
|
itulah
|
||||||
|
jadi
|
||||||
|
jadilah
|
||||||
|
jadinya
|
||||||
|
jangan
|
||||||
|
jangankan
|
||||||
|
janganlah
|
||||||
|
jauh
|
||||||
|
jawab
|
||||||
|
jawaban
|
||||||
|
jawabnya
|
||||||
|
jelas
|
||||||
|
jelaskan
|
||||||
|
jelaslah
|
||||||
|
jelasnya
|
||||||
|
jika
|
||||||
|
jikalau
|
||||||
|
juga
|
||||||
|
jumlah
|
||||||
|
jumlahnya
|
||||||
|
justru
|
||||||
|
kala
|
||||||
|
kalau
|
||||||
|
kalaulah
|
||||||
|
kalaupun
|
||||||
|
kalian
|
||||||
|
kami
|
||||||
|
kamilah
|
||||||
|
kamu
|
||||||
|
kamulah
|
||||||
|
kan
|
||||||
|
kapan
|
||||||
|
kapankah
|
||||||
|
kapanpun
|
||||||
|
karena
|
||||||
|
karenanya
|
||||||
|
kasus
|
||||||
|
kata
|
||||||
|
katakan
|
||||||
|
katakanlah
|
||||||
|
katanya
|
||||||
|
ke
|
||||||
|
keadaan
|
||||||
|
kebetulan
|
||||||
|
kecil
|
||||||
|
kedua
|
||||||
|
keduanya
|
||||||
|
keinginan
|
||||||
|
kelamaan
|
||||||
|
kelihatan
|
||||||
|
kelihatannya
|
||||||
|
kelima
|
||||||
|
keluar
|
||||||
|
kembali
|
||||||
|
kemudian
|
||||||
|
kemungkinan
|
||||||
|
kemungkinannya
|
||||||
|
kenapa
|
||||||
|
kepada
|
||||||
|
kepadanya
|
||||||
|
kesampaian
|
||||||
|
keseluruhan
|
||||||
|
keseluruhannya
|
||||||
|
keterlaluan
|
||||||
|
ketika
|
||||||
|
khususnya
|
||||||
|
kini
|
||||||
|
kinilah
|
||||||
|
kira
|
||||||
|
kira-kira
|
||||||
|
kiranya
|
||||||
|
kita
|
||||||
|
kitalah
|
||||||
|
kok
|
||||||
|
kurang
|
||||||
|
lagi
|
||||||
|
lagian
|
||||||
|
lah
|
||||||
|
lain
|
||||||
|
lainnya
|
||||||
|
lalu
|
||||||
|
lama
|
||||||
|
lamanya
|
||||||
|
lanjut
|
||||||
|
lanjutnya
|
||||||
|
lebih
|
||||||
|
lewat
|
||||||
|
lima
|
||||||
|
luar
|
||||||
|
macam
|
||||||
|
maka
|
||||||
|
makanya
|
||||||
|
makin
|
||||||
|
malah
|
||||||
|
malahan
|
||||||
|
mampu
|
||||||
|
mampukah
|
||||||
|
mana
|
||||||
|
manakala
|
||||||
|
manalagi
|
||||||
|
masa
|
||||||
|
masalah
|
||||||
|
masalahnya
|
||||||
|
masih
|
||||||
|
masihkah
|
||||||
|
masing
|
||||||
|
masing-masing
|
||||||
|
mau
|
||||||
|
maupun
|
||||||
|
melainkan
|
||||||
|
melakukan
|
||||||
|
melalui
|
||||||
|
melihat
|
||||||
|
melihatnya
|
||||||
|
memang
|
||||||
|
memastikan
|
||||||
|
memberi
|
||||||
|
memberikan
|
||||||
|
membuat
|
||||||
|
memerlukan
|
||||||
|
memihak
|
||||||
|
meminta
|
||||||
|
memintakan
|
||||||
|
memisalkan
|
||||||
|
memperbuat
|
||||||
|
mempergunakan
|
||||||
|
memperkirakan
|
||||||
|
memperlihatkan
|
||||||
|
mempersiapkan
|
||||||
|
mempersoalkan
|
||||||
|
mempertanyakan
|
||||||
|
mempunyai
|
||||||
|
memulai
|
||||||
|
memungkinkan
|
||||||
|
menaiki
|
||||||
|
menambahkan
|
||||||
|
menandaskan
|
||||||
|
menanti
|
||||||
|
menanti-nanti
|
||||||
|
menantikan
|
||||||
|
menanya
|
||||||
|
menanyai
|
||||||
|
menanyakan
|
||||||
|
mendapat
|
||||||
|
mendapatkan
|
||||||
|
mendatang
|
||||||
|
mendatangi
|
||||||
|
mendatangkan
|
||||||
|
menegaskan
|
||||||
|
mengakhiri
|
||||||
|
mengapa
|
||||||
|
mengatakan
|
||||||
|
mengatakannya
|
||||||
|
mengenai
|
||||||
|
mengerjakan
|
||||||
|
mengetahui
|
||||||
|
menggunakan
|
||||||
|
menghendaki
|
||||||
|
mengibaratkan
|
||||||
|
mengibaratkannya
|
||||||
|
mengingat
|
||||||
|
mengingatkan
|
||||||
|
menginginkan
|
||||||
|
mengira
|
||||||
|
mengucapkan
|
||||||
|
mengucapkannya
|
||||||
|
mengungkapkan
|
||||||
|
menjadi
|
||||||
|
menjawab
|
||||||
|
menjelaskan
|
||||||
|
menuju
|
||||||
|
menunjuk
|
||||||
|
menunjuki
|
||||||
|
menunjukkan
|
||||||
|
menunjuknya
|
||||||
|
menurut
|
||||||
|
menuturkan
|
||||||
|
menyampaikan
|
||||||
|
menyangkut
|
||||||
|
menyatakan
|
||||||
|
menyebutkan
|
||||||
|
menyeluruh
|
||||||
|
menyiapkan
|
||||||
|
merasa
|
||||||
|
mereka
|
||||||
|
merekalah
|
||||||
|
merupakan
|
||||||
|
meski
|
||||||
|
meskipun
|
||||||
|
meyakini
|
||||||
|
meyakinkan
|
||||||
|
minta
|
||||||
|
mirip
|
||||||
|
misal
|
||||||
|
misalkan
|
||||||
|
misalnya
|
||||||
|
mula
|
||||||
|
mulai
|
||||||
|
mulailah
|
||||||
|
mulanya
|
||||||
|
mungkin
|
||||||
|
mungkinkah
|
||||||
|
nah
|
||||||
|
naik
|
||||||
|
namun
|
||||||
|
nanti
|
||||||
|
nantinya
|
||||||
|
nyaris
|
||||||
|
nyatanya
|
||||||
|
oleh
|
||||||
|
olehnya
|
||||||
|
pada
|
||||||
|
padahal
|
||||||
|
padanya
|
||||||
|
pak
|
||||||
|
paling
|
||||||
|
panjang
|
||||||
|
pantas
|
||||||
|
para
|
||||||
|
pasti
|
||||||
|
pastilah
|
||||||
|
penting
|
||||||
|
pentingnya
|
||||||
|
per
|
||||||
|
percuma
|
||||||
|
perlu
|
||||||
|
perlukah
|
||||||
|
perlunya
|
||||||
|
pernah
|
||||||
|
persoalan
|
||||||
|
pertama
|
||||||
|
pertama-tama
|
||||||
|
pertanyaan
|
||||||
|
pertanyakan
|
||||||
|
pihak
|
||||||
|
pihaknya
|
||||||
|
pukul
|
||||||
|
pula
|
||||||
|
pun
|
||||||
|
punya
|
||||||
|
rasa
|
||||||
|
rasanya
|
||||||
|
rata
|
||||||
|
rupanya
|
||||||
|
saat
|
||||||
|
saatnya
|
||||||
|
saja
|
||||||
|
sajalah
|
||||||
|
saling
|
||||||
|
sama
|
||||||
|
sama-sama
|
||||||
|
sambil
|
||||||
|
sampai
|
||||||
|
sampai-sampai
|
||||||
|
sampaikan
|
||||||
|
sana
|
||||||
|
sangat
|
||||||
|
sangatlah
|
||||||
|
satu
|
||||||
|
saya
|
||||||
|
sayalah
|
||||||
|
se
|
||||||
|
sebab
|
||||||
|
sebabnya
|
||||||
|
sebagai
|
||||||
|
sebagaimana
|
||||||
|
sebagainya
|
||||||
|
sebagian
|
||||||
|
sebaik
|
||||||
|
sebaik-baiknya
|
||||||
|
sebaiknya
|
||||||
|
sebaliknya
|
||||||
|
sebanyak
|
||||||
|
sebegini
|
||||||
|
sebegitu
|
||||||
|
sebelum
|
||||||
|
sebelumnya
|
||||||
|
sebenarnya
|
||||||
|
seberapa
|
||||||
|
sebesar
|
||||||
|
sebetulnya
|
||||||
|
sebisanya
|
||||||
|
sebuah
|
||||||
|
sebut
|
||||||
|
sebutlah
|
||||||
|
sebutnya
|
||||||
|
secara
|
||||||
|
secukupnya
|
||||||
|
sedang
|
||||||
|
sedangkan
|
||||||
|
sedemikian
|
||||||
|
sedikit
|
||||||
|
sedikitnya
|
||||||
|
seenaknya
|
||||||
|
segala
|
||||||
|
segalanya
|
||||||
|
segera
|
||||||
|
seharusnya
|
||||||
|
sehingga
|
||||||
|
seingat
|
||||||
|
sejak
|
||||||
|
sejauh
|
||||||
|
sejenak
|
||||||
|
sejumlah
|
||||||
|
sekadar
|
||||||
|
sekadarnya
|
||||||
|
sekali
|
||||||
|
sekali-kali
|
||||||
|
sekalian
|
||||||
|
sekaligus
|
||||||
|
sekalipun
|
||||||
|
sekarang
|
||||||
|
sekarang
|
||||||
|
sekecil
|
||||||
|
seketika
|
||||||
|
sekiranya
|
||||||
|
sekitar
|
||||||
|
sekitarnya
|
||||||
|
sekurang-kurangnya
|
||||||
|
sekurangnya
|
||||||
|
sela
|
||||||
|
selain
|
||||||
|
selaku
|
||||||
|
selalu
|
||||||
|
selama
|
||||||
|
selama-lamanya
|
||||||
|
selamanya
|
||||||
|
selanjutnya
|
||||||
|
seluruh
|
||||||
|
seluruhnya
|
||||||
|
semacam
|
||||||
|
semakin
|
||||||
|
semampu
|
||||||
|
semampunya
|
||||||
|
semasa
|
||||||
|
semasih
|
||||||
|
semata
|
||||||
|
semata-mata
|
||||||
|
semaunya
|
||||||
|
sementara
|
||||||
|
semisal
|
||||||
|
semisalnya
|
||||||
|
sempat
|
||||||
|
semua
|
||||||
|
semuanya
|
||||||
|
semula
|
||||||
|
sendiri
|
||||||
|
sendirian
|
||||||
|
sendirinya
|
||||||
|
seolah
|
||||||
|
seolah-olah
|
||||||
|
seorang
|
||||||
|
sepanjang
|
||||||
|
sepantasnya
|
||||||
|
sepantasnyalah
|
||||||
|
seperlunya
|
||||||
|
seperti
|
||||||
|
sepertinya
|
||||||
|
sepihak
|
||||||
|
sering
|
||||||
|
seringnya
|
||||||
|
serta
|
||||||
|
serupa
|
||||||
|
sesaat
|
||||||
|
sesama
|
||||||
|
sesampai
|
||||||
|
sesegera
|
||||||
|
sesekali
|
||||||
|
seseorang
|
||||||
|
sesuatu
|
||||||
|
sesuatunya
|
||||||
|
sesudah
|
||||||
|
sesudahnya
|
||||||
|
setelah
|
||||||
|
setempat
|
||||||
|
setengah
|
||||||
|
seterusnya
|
||||||
|
setiap
|
||||||
|
setiba
|
||||||
|
setibanya
|
||||||
|
setidak-tidaknya
|
||||||
|
setidaknya
|
||||||
|
setinggi
|
||||||
|
seusai
|
||||||
|
sewaktu
|
||||||
|
siap
|
||||||
|
siapa
|
||||||
|
siapakah
|
||||||
|
siapapun
|
||||||
|
sini
|
||||||
|
sinilah
|
||||||
|
soal
|
||||||
|
soalnya
|
||||||
|
suatu
|
||||||
|
sudah
|
||||||
|
sudahkah
|
||||||
|
sudahlah
|
||||||
|
supaya
|
||||||
|
tadi
|
||||||
|
tadinya
|
||||||
|
tahu
|
||||||
|
tahun
|
||||||
|
tak
|
||||||
|
tambah
|
||||||
|
tambahnya
|
||||||
|
tampak
|
||||||
|
tampaknya
|
||||||
|
tandas
|
||||||
|
tandasnya
|
||||||
|
tanpa
|
||||||
|
tanya
|
||||||
|
tanyakan
|
||||||
|
tanyanya
|
||||||
|
tapi
|
||||||
|
tegas
|
||||||
|
tegasnya
|
||||||
|
telah
|
||||||
|
tempat
|
||||||
|
tengah
|
||||||
|
tentang
|
||||||
|
tentu
|
||||||
|
tentulah
|
||||||
|
tentunya
|
||||||
|
tepat
|
||||||
|
terakhir
|
||||||
|
terasa
|
||||||
|
terbanyak
|
||||||
|
terdahulu
|
||||||
|
terdapat
|
||||||
|
terdiri
|
||||||
|
terhadap
|
||||||
|
terhadapnya
|
||||||
|
teringat
|
||||||
|
teringat-ingat
|
||||||
|
terjadi
|
||||||
|
terjadilah
|
||||||
|
terjadinya
|
||||||
|
terkira
|
||||||
|
terlalu
|
||||||
|
terlebih
|
||||||
|
terlihat
|
||||||
|
termasuk
|
||||||
|
ternyata
|
||||||
|
tersampaikan
|
||||||
|
tersebut
|
||||||
|
tersebutlah
|
||||||
|
tertentu
|
||||||
|
tertuju
|
||||||
|
terus
|
||||||
|
terutama
|
||||||
|
tetap
|
||||||
|
tetapi
|
||||||
|
tiap
|
||||||
|
tiba
|
||||||
|
tiba-tiba
|
||||||
|
tidak
|
||||||
|
tidakkah
|
||||||
|
tidaklah
|
||||||
|
tiga
|
||||||
|
tinggi
|
||||||
|
toh
|
||||||
|
tunjuk
|
||||||
|
turut
|
||||||
|
tutur
|
||||||
|
tuturnya
|
||||||
|
ucap
|
||||||
|
ucapnya
|
||||||
|
ujar
|
||||||
|
ujarnya
|
||||||
|
umum
|
||||||
|
umumnya
|
||||||
|
ungkap
|
||||||
|
ungkapnya
|
||||||
|
untuk
|
||||||
|
usah
|
||||||
|
usai
|
||||||
|
waduh
|
||||||
|
wah
|
||||||
|
wahai
|
||||||
|
waktu
|
||||||
|
waktunya
|
||||||
|
walau
|
||||||
|
walaupun
|
||||||
|
wong
|
||||||
|
yaitu
|
||||||
|
yakin
|
||||||
|
yakni
|
||||||
|
yang
|
||||||
|
""".split())
|
42
spacy/lang/id/syntax_iterators.py
Normal file
42
spacy/lang/id/syntax_iterators.py
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ...symbols import NOUN, PROPN, PRON
|
||||||
|
|
||||||
|
|
||||||
|
def noun_chunks(obj):
|
||||||
|
"""
|
||||||
|
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||||
|
"""
|
||||||
|
labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
|
||||||
|
doc = obj.doc # Ensure works on both Doc and Span.
|
||||||
|
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||||
|
conj = doc.vocab.strings.add('conj')
|
||||||
|
np_label = doc.vocab.strings.add('NP')
|
||||||
|
seen = set()
|
||||||
|
for i, word in enumerate(obj):
|
||||||
|
if word.pos not in (NOUN, PROPN, PRON):
|
||||||
|
continue
|
||||||
|
# Prevent nested chunks from being produced
|
||||||
|
if word.i in seen:
|
||||||
|
continue
|
||||||
|
if word.dep in np_deps:
|
||||||
|
if any(w.i in seen for w in word.subtree):
|
||||||
|
continue
|
||||||
|
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
|
||||||
|
yield word.left_edge.i, word.right_edge.i+1, np_label
|
||||||
|
elif word.dep == conj:
|
||||||
|
head = word.head
|
||||||
|
while head.dep == conj and head.head.i < head.i:
|
||||||
|
head = head.head
|
||||||
|
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||||
|
if head.dep in np_deps:
|
||||||
|
if any(w.i in seen for w in word.subtree):
|
||||||
|
continue
|
||||||
|
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
|
||||||
|
yield word.left_edge.i, word.right_edge.i+1, np_label
|
||||||
|
|
||||||
|
|
||||||
|
SYNTAX_ITERATORS = {
|
||||||
|
'noun_chunks': noun_chunks
|
||||||
|
}
|
50
spacy/lang/id/tokenizer_exceptions.py
Normal file
50
spacy/lang/id/tokenizer_exceptions.py
Normal file
|
@ -0,0 +1,50 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import regex as re
|
||||||
|
|
||||||
|
from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS
|
||||||
|
from ..tokenizer_exceptions import URL_PATTERN
|
||||||
|
from ...symbols import ORTH
|
||||||
|
|
||||||
|
|
||||||
|
_exc = {}
|
||||||
|
|
||||||
|
for orth in ID_BASE_EXCEPTIONS:
|
||||||
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
orth_title = orth.title()
|
||||||
|
_exc[orth_title] = [{ORTH: orth_title}]
|
||||||
|
|
||||||
|
orth_caps = orth.upper()
|
||||||
|
_exc[orth_caps] = [{ORTH: orth_caps}]
|
||||||
|
|
||||||
|
orth_lower = orth.lower()
|
||||||
|
_exc[orth_lower] = [{ORTH: orth_lower}]
|
||||||
|
|
||||||
|
if '-' in orth:
|
||||||
|
orth_title = '-'.join([part.title() for part in orth.split('-')])
|
||||||
|
_exc[orth_title] = [{ORTH: orth_title}]
|
||||||
|
|
||||||
|
orth_caps = '-'.join([part.upper() for part in orth.split('-')])
|
||||||
|
_exc[orth_caps] = [{ORTH: orth_caps}]
|
||||||
|
|
||||||
|
|
||||||
|
for orth in [
|
||||||
|
"'d", "a.m.", "Adm.", "Bros.", "co.", "Co.", "Corp.", "D.C.", "Dr.", "e.g.",
|
||||||
|
"E.g.", "E.G.", "Gen.", "Gov.", "i.e.", "I.e.", "I.E.", "Inc.", "Jr.",
|
||||||
|
"Ltd.", "Md.", "Messrs.", "Mo.", "Mont.", "Mr.", "Mrs.", "Ms.", "p.m.",
|
||||||
|
"Ph.D.", "Rep.", "Rev.", "Sen.", "St.", "vs.",
|
||||||
|
"B.A.", "B.Ch.E.", "B.Sc.", "Dr.", "Dra.", "Drs.", "Hj.", "Ka.", "Kp.",
|
||||||
|
"M.Ag.", "M.Hum.", "M.Kes,", "M.Kom.", "M.M.", "M.P.", "M.Pd.", "M.Sc.",
|
||||||
|
"M.Si.", "M.Sn.", "M.T.", "M.Th.", "No.", "Pjs.", "Plt.", "R.A.", "S.Ag.",
|
||||||
|
"S.E.", "S.H.", "S.Hut.", "S.K.M.", "S.Kedg.", "S.Kedh.", "S.Kom.",
|
||||||
|
"S.Pd.", "S.Pol.", "S.Psi.", "S.S.", "S.Sos.", "S.T.", "S.Tekp.", "S.Th.",
|
||||||
|
"a.l.", "a.n.", "a.s.", "b.d.", "d.a.", "d.l.", "d/h", "dkk.", "dll.",
|
||||||
|
"dr.", "drh.", "ds.", "dsb.", "dst.", "faks.", "fax.", "hlm.", "i/o",
|
||||||
|
"n.b.", "p.p." "pjs.", "s.d.", "tel.", "u.p.",
|
||||||
|
]:
|
||||||
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
TOKENIZER_EXCEPTIONS = dict(_exc)
|
||||||
|
|
18
spacy/lang/it/examples.py
Normal file
18
spacy/lang/it/examples.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.it.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Apple vuole comprare una startup del Regno Unito per un miliardo di dollari",
|
||||||
|
"Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori",
|
||||||
|
"San Francisco prevede di bandire i robot di consegna porta a porta",
|
||||||
|
"Londra è una grande città del Regno Unito."
|
||||||
|
]
|
|
@@ -137,6 +137,7 @@ LEX_ATTRS = {
     attrs.IS_UPPER: lambda string: string.isupper(),
     attrs.IS_STOP: lambda string: False,
     attrs.IS_OOV: lambda string: True,
+    attrs.PROB: lambda string: -20.,
     attrs.LIKE_EMAIL: like_email,
    attrs.LIKE_NUM: like_num,
    attrs.IS_PUNCT: is_punct,
|
18
spacy/lang/nb/examples.py
Normal file
18
spacy/lang/nb/examples.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.nb.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar",
|
||||||
|
"Selvkjørende biler flytter forsikringsansvaret over på produsentene ",
|
||||||
|
"San Francisco vurderer å forby robotbud på fortauene",
|
||||||
|
"London er en stor by i Storbritannia."
|
||||||
|
]
|
|
@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS

@@ -15,7 +16,7 @@ class PolishDefaults(Language.Defaults):
     lex_attr_getters[LANG] = lambda text: 'pl'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = set(STOP_WORDS)

|
20
spacy/lang/pl/examples.py
Normal file
20
spacy/lang/pl/examples.py
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.pl.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Poczuł przyjemną woń mocnej kawy.",
|
||||||
|
"Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.",
|
||||||
|
"Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.",
|
||||||
|
"Nowy abonament pod lupą Komisji Europejskiej",
|
||||||
|
"Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
|
||||||
|
"Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”."
|
||||||
|
]
|
23
spacy/lang/pl/tokenizer_exceptions.py
Normal file
23
spacy/lang/pl/tokenizer_exceptions.py
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
# encoding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ..symbols import ORTH, LEMMA, POS
|
||||||
|
|
||||||
|
|
||||||
|
_exc = {}
|
||||||
|
|
||||||
|
for exc_data in [
|
||||||
|
{ORTH: "m.in.", LEMMA: "między innymi", POS: ADV},
|
||||||
|
{ORTH: "inż.", LEMMA: "inżynier", POS: NOUN},
|
||||||
|
{ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
|
||||||
|
{ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
|
||||||
|
{ORTH: "tj.", LEMMA: "to jest", POS: ADV},
|
||||||
|
{ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
|
||||||
|
_exc[exc_data[ORTH]] = [dict(exc_data)],
|
||||||
|
|
||||||
|
for orth in [
|
||||||
|
"w.", "r."]:
|
||||||
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
|
||||||
|
TOKENIZER_EXCEPTIONS = dict(_exc)
|
18
spacy/lang/pt/examples.py
Normal file
18
spacy/lang/pt/examples.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.pt.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares",
|
||||||
|
"Carros autônomos empurram a responsabilidade do seguro para os fabricantes."
|
||||||
|
"São Francisco considera banir os robôs de entrega que andam pelas calçadas",
|
||||||
|
"Londres é a maior cidade do Reino Unido"
|
||||||
|
]
|
18
spacy/lang/sv/examples.py
Normal file
18
spacy/lang/sv/examples.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.sv.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Apple överväger att köpa brittisk startup för 1 miljard dollar.",
|
||||||
|
"Självkörande bilar förskjuter försäkringsansvar mot tillverkare.",
|
||||||
|
"San Fransisco överväger förbud mot leveransrobotar på trottoarer.".
|
||||||
|
"London är en storstad i Storbritannien."
|
||||||
|
]
|
|
@@ -15,6 +15,7 @@ class Chinese(Language):
         raise ImportError("The Chinese tokenizer requires the Jieba library: "
                           "https://github.com/fxsjy/jieba")
         words = list(jieba.cut(text, cut_all=True))
+        words=[x for x in words if x]
         return Doc(self.vocab, words=words, spaces=[False]*len(words))

@@ -10,6 +10,7 @@ from thinc.neural.optimizers import Adam, SGD
 import random
 import ujson
 from collections import OrderedDict
+import itertools

 from .tokenizer import Tokenizer
 from .vocab import Vocab
@ -22,8 +23,10 @@ from .pipeline import NeuralDependencyParser, EntityRecognizer
|
||||||
from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer
|
from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer
|
||||||
from .pipeline import NeuralLabeller
|
from .pipeline import NeuralLabeller
|
||||||
from .pipeline import SimilarityHook
|
from .pipeline import SimilarityHook
|
||||||
|
from .pipeline import TextCategorizer
|
||||||
|
from . import about
|
||||||
|
|
||||||
from .compat import json_dumps
|
from .compat import json_dumps, izip
|
||||||
from .attrs import IS_STOP
|
from .attrs import IS_STOP
|
||||||
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||||
from .lang.tokenizer_exceptions import TOKEN_MATCH
|
from .lang.tokenizer_exceptions import TOKEN_MATCH
|
||||||
|
@ -92,7 +95,7 @@ class BaseDefaults(object):
|
||||||
meta = nlp.meta if nlp is not None else {}
|
meta = nlp.meta if nlp is not None else {}
|
||||||
# Resolve strings, like "cnn", "lstm", etc
|
# Resolve strings, like "cnn", "lstm", etc
|
||||||
pipeline = []
|
pipeline = []
|
||||||
for entry in cls.pipeline:
|
for entry in meta.get('pipeline', []):
|
||||||
if entry in disable or getattr(entry, 'name', entry) in disable:
|
if entry in disable or getattr(entry, 'name', entry) in disable:
|
||||||
continue
|
continue
|
||||||
factory = cls.Defaults.factories[entry]
|
factory = cls.Defaults.factories[entry]
|
||||||
|
@ -107,6 +110,8 @@ class BaseDefaults(object):
                                       NeuralDependencyParser(nlp.vocab, **cfg),
                                       nonproj.deprojectivize],
        'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
        'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)],
        'textcat': lambda nlp, **cfg: [TextCategorizer(nlp.vocab, **cfg)],
        # Temporary compatibility -- delete after pivot
        'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
        'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
@ -115,7 +120,6 @@ class BaseDefaults(object):
                                       nonproj.deprojectivize,
                                       ],
        'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
        'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)]
    }

    token_match = TOKEN_MATCH
@ -147,8 +151,8 @@ class Language(object):
    Defaults = BaseDefaults
    lang = None

    def __init__(self, vocab=True, make_doc=True, pipeline=None, meta={},
                 disable=tuple(), **kwargs):
    def __init__(self, vocab=True, make_doc=True, pipeline=None,
                 meta={}, disable=tuple(), **kwargs):
        """Initialise a Language object.

        vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
@ -165,7 +169,7 @@ class Language(object):
            models to add model meta data.
        RETURNS (Language): The newly constructed object.
        """
        self.meta = dict(meta)
        self._meta = dict(meta)
        if vocab is True:
            factory = self.Defaults.create_vocab
            vocab = factory(self, **meta.get('vocab', {}))
@ -196,6 +200,29 @@ class Language(object):
            else:
                flat_list.append(pipe)
        self.pipeline = flat_list
        self._optimizer = None

    @property
    def meta(self):
        self._meta.setdefault('lang', self.vocab.lang)
        self._meta.setdefault('name', '')
        self._meta.setdefault('version', '0.0.0')
        self._meta.setdefault('spacy_version', about.__version__)
        self._meta.setdefault('description', '')
        self._meta.setdefault('author', '')
        self._meta.setdefault('email', '')
        self._meta.setdefault('url', '')
        self._meta.setdefault('license', '')
        pipeline = []
        for component in self.pipeline:
            if hasattr(component, 'name'):
                pipeline.append(component.name)
        self._meta['pipeline'] = pipeline
        return self._meta

    @meta.setter
    def meta(self, value):
        self._meta = value

    # Conveniences to access pipeline components
    @property
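The `meta` property above fills in defaults lazily and mirrors the live pipeline. A minimal sketch of what a caller sees, assuming a bare `Language` instance (the model name is invented):

    from spacy.language import Language

    nlp = Language(meta={'name': 'my_model'})   # 'my_model' is a placeholder name
    meta = nlp.meta
    # Defaults such as 'version' and 'spacy_version' are filled in on access,
    # and 'pipeline' lists the names of the components currently attached.
    print(meta['lang'], meta['version'], meta['pipeline'])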
@ -251,7 +278,8 @@ class Language(object):
    def make_doc(self, text):
        return self.tokenizer(text)

    def update(self, docs, golds, drop=0., sgd=None, losses=None):
    def update(self, docs, golds, drop=0., sgd=None, losses=None,
               update_shared=False):
        """Update the models in the pipeline.

        docs (iterable): A batch of `Doc` objects.
@ -266,6 +294,15 @@ class Language(object):
            >>> for docs, golds in epoch:
            >>> state = nlp.update(docs, golds, sgd=optimizer)
        """
        if len(docs) != len(golds):
            raise IndexError("Update expects same number of docs and golds "
                             "Got: %d, %d" % (len(docs), len(golds)))
        if len(docs) == 0:
            return
        if sgd is None:
            if self._optimizer is None:
                self._optimizer = Adam(Model.ops, 0.001)
            sgd = self._optimizer
        tok2vec = self.pipeline[0]
        feats = tok2vec.doc2feats(docs)
        grads = {}
@ -273,14 +310,18 @@ class Language(object):
            grads[key] = (W, dW)
        pipes = list(self.pipeline[1:])
        random.shuffle(pipes)
        tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
        all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses]
        for proc in pipes:
            if not hasattr(proc, 'update'):
                continue
            tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
            d_tokvecses = proc.update((docs, tokvecses), golds,
                                      drop=drop, sgd=get_grads, losses=losses)
            if d_tokvecses is not None:
            if update_shared and d_tokvecses is not None:
                bp_tokvecses(d_tokvecses, sgd=sgd)
                for i, d_tv in enumerate(d_tokvecses):
                    all_d_tokvecses[i] += d_tv
        if update_shared and bp_tokvecses is not None:
            bp_tokvecses(all_d_tokvecses, sgd=sgd)
        for key, (W, dW) in grads.items():
            sgd(W, dW, key=key)
        # Clear the tensor variable, to free GPU memory.
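A rough sketch of a single training step against the new `update()` signature, assuming `docs` and `golds` are parallel lists of `Doc` and `GoldParse` objects and `optimizer` comes from the pipeline's own training setup; `update_shared=True` opts in to applying the accumulated `all_d_tokvecses` gradients to the shared tok2vec component:

    def train_step(nlp, optimizer, docs, golds):
        # Per-component losses are accumulated into this dict, keyed by name.
        losses = {}
        nlp.update(docs, golds, drop=0.2, sgd=optimizer,
                   losses=losses, update_shared=True)
        return losses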
@ -343,16 +384,25 @@ class Language(object):
        eps = util.env_opt('optimizer_eps', 1e-08)
        L2 = util.env_opt('L2_penalty', 1e-6)
        max_grad_norm = util.env_opt('grad_norm_clip', 1.)
        optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
                         beta2=beta2, eps=eps)
        self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
                               beta2=beta2, eps=eps)
        optimizer.max_grad_norm = max_grad_norm
        self._optimizer.max_grad_norm = max_grad_norm
        optimizer.device = device
        self._optimizer.device = device
        return optimizer
        return self._optimizer

    def evaluate(self, docs_golds):
        docs, golds = zip(*docs_golds)
        scorer = Scorer()
        for doc, gold in zip(self.pipe(docs, batch_size=32), golds):
        docs, golds = zip(*docs_golds)
        docs = list(docs)
        golds = list(golds)
        for pipe in self.pipeline:
            if not hasattr(pipe, 'pipe'):
                for doc in docs:
                    pipe(doc)
            else:
                docs = list(pipe.pipe(docs))
        assert len(docs) == len(golds)
        for doc, gold in zip(docs, golds):
            scorer.score(doc, gold)
            doc.tensor = None
        return scorer
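A brief usage sketch for the reworked `evaluate()`, assuming `dev_data` is an iterable of `(Doc, GoldParse)` pairs that the pipeline components can consume directly:

    scorer = nlp.evaluate(dev_data)
    # The Scorer exposes the aggregate accuracy metrics.
    print(scorer.uas, scorer.las, scorer.ents_f)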
@ -386,11 +436,16 @@ class Language(object):
        except StopIteration:
            pass

    def pipe(self, texts, n_threads=2, batch_size=1000, disable=[]):
    def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000,
             disable=[]):
        """Process texts as a stream, and yield `Doc` objects in order. Supports
        GIL-free multi-threading.

        texts (iterator): A sequence of texts to process.
        as_tuples (bool):
            If set to True, inputs should be a sequence of
            (text, context) tuples. Output will then be a sequence of
            (doc, context) tuples. Defaults to False.
        n_threads (int): The number of worker threads to use. If -1, OpenMP will
            decide how many to use at run time. Default is 2.
        batch_size (int): The number of texts to buffer.
@ -402,8 +457,16 @@ class Language(object):
            >>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
            >>> assert doc.is_parsed
        """
        if as_tuples:
            text_context1, text_context2 = itertools.tee(texts)
            texts = (tc[0] for tc in text_context1)
            contexts = (tc[1] for tc in text_context2)
            docs = self.pipe(texts, n_threads=n_threads, batch_size=batch_size,
                             disable=disable)
            for doc, context in izip(docs, contexts):
                yield (doc, context)
            return
        docs = (self.make_doc(text) for text in texts)
        docs = texts
        for proc in self.pipeline:
            name = getattr(proc, 'name', None)
            if name in disable:
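A small usage sketch for the new `as_tuples` flag documented above; the texts and context dicts are invented:

    data = [
        ('This is the first record.', {'id': 1}),
        ('And this is the second record.', {'id': 2}),
    ]
    for doc, context in nlp.pipe(data, as_tuples=True, batch_size=50):
        print(context['id'], len(doc))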
@ -44,6 +44,11 @@ class Lemmatizer(object):
            return True
        elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
            return True
        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
        # morphology
        elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \
                                     morphology.get('Tense') == 'pres'):
            return True
        elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
            return True
        elif VerbForm_inf in morphology:
@ -171,6 +171,8 @@ cdef class Lexeme:
    property rank:
        def __get__(self):
            return self.c.id

        def __set__(self, value):
            self.c.id = value

    property sentiment:
        def __get__(self):
@ -42,15 +42,148 @@ from .compat import json_dumps

from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
from ._ml import build_text_classifier, build_tagger_model
from .parts_of_speech import X


class TokenVectorEncoder(object):
class SentenceSegmenter(object):
    '''A simple spaCy hook, to allow custom sentence boundary detection logic
    (that doesn't require the dependency parse).

    To change the sentence boundary detection strategy, pass a generator
    function `strategy` on initialization, or assign a new strategy to
    the .strategy attribute.

    Sentence detection strategies should be generators that take `Doc` objects
    and yield `Span` objects for each sentence.
    '''
    name = 'sbd'

    def __init__(self, vocab, strategy=None):
        self.vocab = vocab
        if strategy is None or strategy == 'on_punct':
            strategy = self.split_on_punct
        self.strategy = strategy

    def __call__(self, doc):
        doc.user_hooks['sents'] = self.strategy

    @staticmethod
    def split_on_punct(doc):
        start = 0
        seen_period = False
        for i, word in enumerate(doc):
            if seen_period and not word.is_punct:
                yield doc[start : word.i]
                start = word.i
                seen_period = False
            elif word.text in ['.', '!', '?']:
                seen_period = True
        if start < len(doc):
            yield doc[start : len(doc)]
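A hedged sketch of plugging the new `SentenceSegmenter` hook into a pipeline with a custom strategy; splitting on newlines here is just an invented example, and `nlp` is assumed to be an already-loaded pipeline:

    from spacy.pipeline import SentenceSegmenter

    def split_on_newlines(doc):
        # Example strategy: yield one Span per newline-delimited chunk.
        start = 0
        for word in doc:
            if '\n' in word.text:
                yield doc[start : word.i + 1]
                start = word.i + 1
        if start < len(doc):
            yield doc[start : len(doc)]

    sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)
    nlp.pipeline.append(sbd)
    doc = nlp('First line.\nSecond line.')
    print([sent.text for sent in doc.sents])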
class BaseThincComponent(object):
|
||||||
|
name = None
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def Model(cls, *shape, **kwargs):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def __init__(self, vocab, model=True, **cfg):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def __call__(self, doc):
|
||||||
|
scores = self.predict([doc])
|
||||||
|
self.set_annotations([doc], scores)
|
||||||
|
return doc
|
||||||
|
|
||||||
|
def pipe(self, stream, batch_size=128, n_threads=-1):
|
||||||
|
for docs in cytoolz.partition_all(batch_size, stream):
|
||||||
|
docs = list(docs)
|
||||||
|
scores = self.predict(docs)
|
||||||
|
self.set_annotations(docs, scores)
|
||||||
|
yield from docs
|
||||||
|
|
||||||
|
def predict(self, docs):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def set_annotations(self, docs, scores):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def get_loss(self, docs, golds, scores):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def begin_training(self, gold_tuples=tuple(), pipeline=None):
|
||||||
|
token_vector_width = pipeline[0].model.nO
|
||||||
|
if self.model is True:
|
||||||
|
self.model = self.Model(1, token_vector_width)
|
||||||
|
|
||||||
|
def use_params(self, params):
|
||||||
|
with self.model.use_params(params):
|
||||||
|
yield
|
||||||
|
|
||||||
|
def to_bytes(self, **exclude):
|
||||||
|
serialize = OrderedDict((
|
||||||
|
('cfg', lambda: json_dumps(self.cfg)),
|
||||||
|
('model', lambda: self.model.to_bytes()),
|
||||||
|
('vocab', lambda: self.vocab.to_bytes())
|
||||||
|
))
|
||||||
|
return util.to_bytes(serialize, exclude)
|
||||||
|
|
||||||
|
def from_bytes(self, bytes_data, **exclude):
|
||||||
|
def load_model(b):
|
||||||
|
if self.model is True:
|
||||||
|
self.model = self.Model(**self.cfg)
|
||||||
|
self.model.from_bytes(b)
|
||||||
|
|
||||||
|
deserialize = OrderedDict((
|
||||||
|
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
|
||||||
|
('model', load_model),
|
||||||
|
('vocab', lambda b: self.vocab.from_bytes(b))
|
||||||
|
))
|
||||||
|
util.from_bytes(bytes_data, deserialize, exclude)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def to_disk(self, path, **exclude):
|
||||||
|
serialize = OrderedDict((
|
||||||
|
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
|
||||||
|
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
|
||||||
|
('vocab', lambda p: self.vocab.to_disk(p))
|
||||||
|
))
|
||||||
|
util.to_disk(path, serialize, exclude)
|
||||||
|
|
||||||
|
def from_disk(self, path, **exclude):
|
||||||
|
def load_model(p):
|
||||||
|
if self.model is True:
|
||||||
|
self.model = self.Model(**self.cfg)
|
||||||
|
self.model.from_bytes(p.open('rb').read())
|
||||||
|
|
||||||
|
deserialize = OrderedDict((
|
||||||
|
('cfg', lambda p: self.cfg.update(_load_cfg(p))),
|
||||||
|
('model', load_model),
|
||||||
|
('vocab', lambda p: self.vocab.from_disk(p)),
|
||||||
|
))
|
||||||
|
util.from_disk(path, deserialize, exclude)
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
|
def _load_cfg(path):
|
||||||
|
if path.exists():
|
||||||
|
return ujson.load(path.open())
|
||||||
|
else:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
|
||||||
|
class TokenVectorEncoder(BaseThincComponent):
|
||||||
"""Assign position-sensitive vectors to tokens, using a CNN or RNN."""
|
"""Assign position-sensitive vectors to tokens, using a CNN or RNN."""
|
||||||
name = 'tensorizer'
|
name = 'tensorizer'
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, width=128, embed_size=7500, **cfg):
|
def Model(cls, width=128, embed_size=4000, **cfg):
|
||||||
"""Create a new statistical model for the class.
|
"""Create a new statistical model for the class.
|
||||||
|
|
||||||
width (int): Output size of the model.
|
width (int): Output size of the model.
|
||||||
|
@ -79,6 +212,7 @@ class TokenVectorEncoder(object):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.doc2feats = doc2feats()
|
self.doc2feats = doc2feats()
|
||||||
self.model = model
|
self.model = model
|
||||||
|
self.cfg = dict(cfg)
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
|
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
|
||||||
|
@ -144,7 +278,7 @@ class TokenVectorEncoder(object):
|
||||||
# TODO: implement
|
# TODO: implement
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def begin_training(self, gold_tuples, pipeline=None):
|
def begin_training(self, gold_tuples=tuple(), pipeline=None):
|
||||||
"""Allocate models, pre-process training data and acquire a trainer and
|
"""Allocate models, pre-process training data and acquire a trainer and
|
||||||
optimizer.
|
optimizer.
|
||||||
|
|
||||||
|
@ -155,74 +289,34 @@ class TokenVectorEncoder(object):
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model = self.Model()
|
self.model = self.Model()
|
||||||
|
|
||||||
def use_params(self, params):
|
|
||||||
"""Replace weights of models in the pipeline with those provided in the
|
|
||||||
params dictionary.
|
|
||||||
|
|
||||||
params (dict): A dictionary of parameters keyed by model ID.
|
class NeuralTagger(BaseThincComponent):
|
||||||
"""
|
|
||||||
with self.model.use_params(params):
|
|
||||||
yield
|
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
|
||||||
serialize = OrderedDict((
|
|
||||||
('model', lambda: self.model.to_bytes()),
|
|
||||||
('vocab', lambda: self.vocab.to_bytes())
|
|
||||||
))
|
|
||||||
return util.to_bytes(serialize, exclude)
|
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, **exclude):
|
|
||||||
if self.model is True:
|
|
||||||
self.model = self.Model()
|
|
||||||
deserialize = OrderedDict((
|
|
||||||
('model', lambda b: self.model.from_bytes(b)),
|
|
||||||
('vocab', lambda b: self.vocab.from_bytes(b))
|
|
||||||
))
|
|
||||||
util.from_bytes(bytes_data, deserialize, exclude)
|
|
||||||
return self
|
|
||||||
|
|
||||||
def to_disk(self, path, **exclude):
|
|
||||||
serialize = OrderedDict((
|
|
||||||
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
|
|
||||||
('vocab', lambda p: self.vocab.to_disk(p))
|
|
||||||
))
|
|
||||||
util.to_disk(path, serialize, exclude)
|
|
||||||
|
|
||||||
def from_disk(self, path, **exclude):
|
|
||||||
if self.model is True:
|
|
||||||
self.model = self.Model()
|
|
||||||
deserialize = OrderedDict((
|
|
||||||
('model', lambda p: self.model.from_bytes(p.open('rb').read())),
|
|
||||||
('vocab', lambda p: self.vocab.from_disk(p))
|
|
||||||
))
|
|
||||||
util.from_disk(path, deserialize, exclude)
|
|
||||||
return self
|
|
||||||
|
|
||||||
|
|
||||||
class NeuralTagger(object):
|
|
||||||
name = 'tagger'
|
name = 'tagger'
|
||||||
def __init__(self, vocab, model=True):
|
def __init__(self, vocab, model=True, **cfg):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
|
self.cfg = dict(cfg)
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
tags = self.predict([doc.tensor])
|
tags = self.predict(([doc], [doc.tensor]))
|
||||||
self.set_annotations([doc], tags)
|
self.set_annotations([doc], tags)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
def pipe(self, stream, batch_size=128, n_threads=-1):
|
||||||
for docs in cytoolz.partition_all(batch_size, stream):
|
for docs in cytoolz.partition_all(batch_size, stream):
|
||||||
|
docs = list(docs)
|
||||||
tokvecs = [d.tensor for d in docs]
|
tokvecs = [d.tensor for d in docs]
|
||||||
tag_ids = self.predict(tokvecs)
|
tag_ids = self.predict((docs, tokvecs))
|
||||||
self.set_annotations(docs, tag_ids)
|
self.set_annotations(docs, tag_ids)
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
def predict(self, tokvecs):
|
def predict(self, docs_tokvecs):
|
||||||
scores = self.model(tokvecs)
|
scores = self.model(docs_tokvecs)
|
||||||
scores = self.model.ops.flatten(scores)
|
scores = self.model.ops.flatten(scores)
|
||||||
guesses = scores.argmax(axis=1)
|
guesses = scores.argmax(axis=1)
|
||||||
if not isinstance(guesses, numpy.ndarray):
|
if not isinstance(guesses, numpy.ndarray):
|
||||||
guesses = guesses.get()
|
guesses = guesses.get()
|
||||||
|
tokvecs = docs_tokvecs[1]
|
||||||
guesses = self.model.ops.unflatten(guesses,
|
guesses = self.model.ops.unflatten(guesses,
|
||||||
[tv.shape[0] for tv in tokvecs])
|
[tv.shape[0] for tv in tokvecs])
|
||||||
return guesses
|
return guesses
|
||||||
|
@ -235,6 +329,8 @@ class NeuralTagger(object):
|
||||||
cdef Vocab vocab = self.vocab
|
cdef Vocab vocab = self.vocab
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
doc_tag_ids = batch_tag_ids[i]
|
doc_tag_ids = batch_tag_ids[i]
|
||||||
|
if hasattr(doc_tag_ids, 'get'):
|
||||||
|
doc_tag_ids = doc_tag_ids.get()
|
||||||
for j, tag_id in enumerate(doc_tag_ids):
|
for j, tag_id in enumerate(doc_tag_ids):
|
||||||
# Don't clobber preset POS tags
|
# Don't clobber preset POS tags
|
||||||
if doc.c[j].tag == 0 and doc.c[j].pos == 0:
|
if doc.c[j].tag == 0 and doc.c[j].pos == 0:
|
||||||
|
@ -243,16 +339,18 @@ class NeuralTagger(object):
|
||||||
doc.is_tagged = True
|
doc.is_tagged = True
|
||||||
|
|
||||||
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
||||||
|
if losses is not None and self.name not in losses:
|
||||||
|
losses[self.name] = 0.
|
||||||
docs, tokvecs = docs_tokvecs
|
docs, tokvecs = docs_tokvecs
|
||||||
|
|
||||||
if self.model.nI is None:
|
if self.model.nI is None:
|
||||||
self.model.nI = tokvecs[0].shape[1]
|
self.model.nI = tokvecs[0].shape[1]
|
||||||
|
tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop)
|
||||||
tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop)
|
|
||||||
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
|
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
|
||||||
|
|
||||||
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
|
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
|
||||||
|
if losses is not None:
|
||||||
|
losses[self.name] += loss
|
||||||
return d_tokvecs
|
return d_tokvecs
|
||||||
|
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, docs, golds, scores):
|
||||||
|
@ -276,7 +374,7 @@ class NeuralTagger(object):
|
||||||
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
def begin_training(self, gold_tuples, pipeline=None):
|
def begin_training(self, gold_tuples=tuple(), pipeline=None):
|
||||||
orig_tag_map = dict(self.vocab.morphology.tag_map)
|
orig_tag_map = dict(self.vocab.morphology.tag_map)
|
||||||
new_tag_map = {}
|
new_tag_map = {}
|
||||||
for raw_text, annots_brackets in gold_tuples:
|
for raw_text, annots_brackets in gold_tuples:
|
||||||
|
@ -300,9 +398,7 @@ class NeuralTagger(object):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, n_tags, token_vector_width):
|
def Model(cls, n_tags, token_vector_width):
|
||||||
return with_flatten(
|
return build_tagger_model(n_tags, token_vector_width)
|
||||||
chain(Maxout(token_vector_width, token_vector_width),
|
|
||||||
Softmax(n_tags, token_vector_width)))
|
|
||||||
|
|
||||||
def use_params(self, params):
|
def use_params(self, params):
|
||||||
with self.model.use_params(params):
|
with self.model.use_params(params):
|
||||||
|
@ -321,7 +417,8 @@ class NeuralTagger(object):
|
||||||
def from_bytes(self, bytes_data, **exclude):
|
def from_bytes(self, bytes_data, **exclude):
|
||||||
def load_model(b):
|
def load_model(b):
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
token_vector_width = util.env_opt('token_vector_width', 128)
|
token_vector_width = util.env_opt('token_vector_width',
|
||||||
|
self.cfg.get('token_vector_width', 128))
|
||||||
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
|
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
|
||||||
self.model.from_bytes(b)
|
self.model.from_bytes(b)
|
||||||
|
|
||||||
|
@ -348,13 +445,15 @@ class NeuralTagger(object):
|
||||||
use_bin_type=True,
|
use_bin_type=True,
|
||||||
encoding='utf8'))),
|
encoding='utf8'))),
|
||||||
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
|
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
|
||||||
|
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg)))
|
||||||
))
|
))
|
||||||
util.to_disk(path, serialize, exclude)
|
util.to_disk(path, serialize, exclude)
|
||||||
|
|
||||||
def from_disk(self, path, **exclude):
|
def from_disk(self, path, **exclude):
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
token_vector_width = util.env_opt('token_vector_width', 128)
|
token_vector_width = util.env_opt('token_vector_width',
|
||||||
|
self.cfg.get('token_vector_width', 128))
|
||||||
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
|
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
|
||||||
self.model.from_bytes(p.open('rb').read())
|
self.model.from_bytes(p.open('rb').read())
|
||||||
|
|
||||||
|
@ -370,6 +469,7 @@ class NeuralTagger(object):
|
||||||
('vocab', lambda p: self.vocab.from_disk(p)),
|
('vocab', lambda p: self.vocab.from_disk(p)),
|
||||||
('tag_map', load_tag_map),
|
('tag_map', load_tag_map),
|
||||||
('model', load_model),
|
('model', load_model),
|
||||||
|
('cfg', lambda p: self.cfg.update(_load_cfg(p)))
|
||||||
))
|
))
|
||||||
util.from_disk(path, deserialize, exclude)
|
util.from_disk(path, deserialize, exclude)
|
||||||
return self
|
return self
|
||||||
|
@ -377,15 +477,23 @@ class NeuralTagger(object):
|
||||||
|
|
||||||
class NeuralLabeller(NeuralTagger):
|
class NeuralLabeller(NeuralTagger):
|
||||||
name = 'nn_labeller'
|
name = 'nn_labeller'
|
||||||
def __init__(self, vocab, model=True):
|
def __init__(self, vocab, model=True, **cfg):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
self.labels = {}
|
self.cfg = dict(cfg)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def labels(self):
|
||||||
|
return self.cfg.setdefault('labels', {})
|
||||||
|
|
||||||
|
@labels.setter
|
||||||
|
def labels(self, value):
|
||||||
|
self.cfg['labels'] = value
|
||||||
|
|
||||||
def set_annotations(self, docs, dep_ids):
|
def set_annotations(self, docs, dep_ids):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def begin_training(self, gold_tuples, pipeline=None):
|
def begin_training(self, gold_tuples=tuple(), pipeline=None):
|
||||||
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
|
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
|
||||||
for raw_text, annots_brackets in gold_tuples:
|
for raw_text, annots_brackets in gold_tuples:
|
||||||
for annots, brackets in annots_brackets:
|
for annots, brackets in annots_brackets:
|
||||||
|
@ -399,9 +507,7 @@ class NeuralLabeller(NeuralTagger):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, n_tags, token_vector_width):
|
def Model(cls, n_tags, token_vector_width):
|
||||||
return with_flatten(
|
return build_tagger_model(n_tags, token_vector_width)
|
||||||
chain(Maxout(token_vector_width, token_vector_width),
|
|
||||||
Softmax(n_tags, token_vector_width)))
|
|
||||||
|
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, docs, golds, scores):
|
||||||
scores = self.model.ops.flatten(scores)
|
scores = self.model.ops.flatten(scores)
|
||||||
|
@ -423,7 +529,7 @@ class NeuralLabeller(NeuralTagger):
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
|
|
||||||
class SimilarityHook(object):
|
class SimilarityHook(BaseThincComponent):
|
||||||
"""
|
"""
|
||||||
Experimental
|
Experimental
|
||||||
|
|
||||||
|
@ -439,9 +545,10 @@ class SimilarityHook(object):
|
||||||
Where W is a vector of dimension weights, initialized to 1.
|
Where W is a vector of dimension weights, initialized to 1.
|
||||||
"""
|
"""
|
||||||
name = 'similarity'
|
name = 'similarity'
|
||||||
def __init__(self, vocab, model=True):
|
def __init__(self, vocab, model=True, **cfg):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
|
self.cfg = dict(cfg)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, length):
|
def Model(cls, length):
|
||||||
|
@ -467,7 +574,7 @@ class SimilarityHook(object):
|
||||||
|
|
||||||
return d_tensor1s, d_tensor2s
|
return d_tensor1s, d_tensor2s
|
||||||
|
|
||||||
def begin_training(self, _, pipeline=None):
|
def begin_training(self, _=tuple(), pipeline=None):
|
||||||
"""
|
"""
|
||||||
Allocate model, using width from tensorizer in pipeline.
|
Allocate model, using width from tensorizer in pipeline.
|
||||||
|
|
||||||
|
@ -477,48 +584,77 @@ class SimilarityHook(object):
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model = self.Model(pipeline[0].model.nO)
|
self.model = self.Model(pipeline[0].model.nO)
|
||||||
|
|
||||||
def use_params(self, params):
|
|
||||||
"""Replace weights of models in the pipeline with those provided in the
|
|
||||||
params dictionary.
|
|
||||||
|
|
||||||
params (dict): A dictionary of parameters keyed by model ID.
|
class TextCategorizer(BaseThincComponent):
|
||||||
"""
|
name = 'textcat'
|
||||||
with self.model.use_params(params):
|
|
||||||
yield
|
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
@classmethod
|
||||||
serialize = OrderedDict((
|
def Model(cls, nr_class=1, width=64, **cfg):
|
||||||
('model', lambda: self.model.to_bytes()),
|
return build_text_classifier(nr_class, width, **cfg)
|
||||||
('vocab', lambda: self.vocab.to_bytes())
|
|
||||||
))
|
|
||||||
return util.to_bytes(serialize, exclude)
|
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, **exclude):
|
def __init__(self, vocab, model=True, **cfg):
|
||||||
|
self.vocab = vocab
|
||||||
|
self.model = model
|
||||||
|
self.cfg = dict(cfg)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def labels(self):
|
||||||
|
return self.cfg.get('labels', ['LABEL'])
|
||||||
|
|
||||||
|
@labels.setter
|
||||||
|
def labels(self, value):
|
||||||
|
self.cfg['labels'] = value
|
||||||
|
|
||||||
|
def __call__(self, doc):
|
||||||
|
scores = self.predict([doc])
|
||||||
|
self.set_annotations([doc], scores)
|
||||||
|
return doc
|
||||||
|
|
||||||
|
def pipe(self, stream, batch_size=128, n_threads=-1):
|
||||||
|
for docs in cytoolz.partition_all(batch_size, stream):
|
||||||
|
docs = list(docs)
|
||||||
|
scores = self.predict(docs)
|
||||||
|
self.set_annotations(docs, scores)
|
||||||
|
yield from docs
|
||||||
|
|
||||||
|
def predict(self, docs):
|
||||||
|
scores = self.model(docs)
|
||||||
|
scores = self.model.ops.asarray(scores)
|
||||||
|
return scores
|
||||||
|
|
||||||
|
def set_annotations(self, docs, scores):
|
||||||
|
for i, doc in enumerate(docs):
|
||||||
|
for j, label in enumerate(self.labels):
|
||||||
|
doc.cats[label] = float(scores[i, j])
|
||||||
|
|
||||||
|
def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
|
||||||
|
docs, tensors = docs_tensors
|
||||||
|
scores, bp_scores = self.model.begin_update(docs, drop=drop)
|
||||||
|
loss, d_scores = self.get_loss(docs, golds, scores)
|
||||||
|
d_tensors = bp_scores(d_scores, sgd=sgd)
|
||||||
|
if losses is not None:
|
||||||
|
losses.setdefault(self.name, 0.0)
|
||||||
|
losses[self.name] += loss
|
||||||
|
return d_tensors
|
||||||
|
|
||||||
|
def get_loss(self, docs, golds, scores):
|
||||||
|
truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
|
||||||
|
for i, gold in enumerate(golds):
|
||||||
|
for j, label in enumerate(self.labels):
|
||||||
|
truths[i, j] = label in gold.cats
|
||||||
|
truths = self.model.ops.asarray(truths)
|
||||||
|
d_scores = (scores-truths) / scores.shape[0]
|
||||||
|
mean_square_error = ((scores-truths)**2).sum(axis=1).mean()
|
||||||
|
return mean_square_error, d_scores
|
||||||
|
|
||||||
|
def begin_training(self, gold_tuples=tuple(), pipeline=None):
|
||||||
|
if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer':
|
||||||
|
token_vector_width = pipeline[0].model.nO
|
||||||
|
else:
|
||||||
|
token_vector_width = 64
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model = self.Model()
|
self.model = self.Model(len(self.labels), token_vector_width,
|
||||||
deserialize = OrderedDict((
|
**self.cfg)
|
||||||
('model', lambda b: self.model.from_bytes(b)),
|
|
||||||
('vocab', lambda b: self.vocab.from_bytes(b))
|
|
||||||
))
|
|
||||||
util.from_bytes(bytes_data, deserialize, exclude)
|
|
||||||
return self
|
|
||||||
|
|
||||||
def to_disk(self, path, **exclude):
|
|
||||||
serialize = OrderedDict((
|
|
||||||
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
|
|
||||||
('vocab', lambda p: self.vocab.to_disk(p))
|
|
||||||
))
|
|
||||||
util.to_disk(path, serialize, exclude)
|
|
||||||
|
|
||||||
def from_disk(self, path, **exclude):
|
|
||||||
if self.model is True:
|
|
||||||
self.model = self.Model()
|
|
||||||
deserialize = OrderedDict((
|
|
||||||
('model', lambda p: self.model.from_bytes(p.open('rb').read())),
|
|
||||||
('vocab', lambda p: self.vocab.from_disk(p))
|
|
||||||
))
|
|
||||||
util.from_disk(path, deserialize, exclude)
|
|
||||||
return self
|
|
||||||
|
|
||||||
|
|
||||||
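A hedged sketch of the new `TextCategorizer` in use; the labels are invented, the component is assumed to have been trained (its model is only allocated in `begin_training()`), and the scores land in `doc.cats` as written by `set_annotations()` above:

    from spacy.pipeline import TextCategorizer

    textcat = TextCategorizer(nlp.vocab, labels=['SPORT', 'POLITICS'])  # invented labels
    nlp.pipeline.append(textcat)
    # Once the component has been trained, processing text fills doc.cats
    # with one float score per label.
    doc = nlp('An example headline.')
    for label, score in doc.cats.items():
        print(label, score)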
cdef class EntityRecognizer(LinearParser):
|
cdef class EntityRecognizer(LinearParser):
|
||||||
|
@ -569,6 +705,14 @@ cdef class NeuralEntityRecognizer(NeuralParser):
|
||||||
|
|
||||||
nr_feature = 6
|
nr_feature = 6
|
||||||
|
|
||||||
|
def predict_confidences(self, docs):
|
||||||
|
tensors = [d.tensor for d in docs]
|
||||||
|
samples = []
|
||||||
|
for i in range(10):
|
||||||
|
states = self.parse_batch(docs, tensors, drop=0.3)
|
||||||
|
for state in states:
|
||||||
|
samples.append(self._get_entities(state))
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)
|
return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)
|
||||||
|
|
||||||
|
|
|
@ -215,7 +215,10 @@ cdef class StringStore:
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
with path.open('r') as file_:
|
with path.open('r') as file_:
|
||||||
strings = ujson.load(file_)
|
strings = ujson.load(file_)
|
||||||
|
prev = list(self)
|
||||||
self._reset_and_load(strings)
|
self._reset_and_load(strings)
|
||||||
|
for word in prev:
|
||||||
|
self.add(word)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, **exclude):
|
||||||
|
@ -234,7 +237,10 @@ cdef class StringStore:
|
||||||
RETURNS (StringStore): The `StringStore` object.
|
RETURNS (StringStore): The `StringStore` object.
|
||||||
"""
|
"""
|
||||||
strings = ujson.loads(bytes_data)
|
strings = ujson.loads(bytes_data)
|
||||||
|
prev = list(self)
|
||||||
self._reset_and_load(strings)
|
self._reset_and_load(strings)
|
||||||
|
for word in prev:
|
||||||
|
self.add(word)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def set_frozen(self, bint is_frozen):
|
def set_frozen(self, bint is_frozen):
|
||||||
|
|
286
spacy/syntax/_beam_utils.pyx
Normal file
286
spacy/syntax/_beam_utils.pyx
Normal file
|
@ -0,0 +1,286 @@
|
||||||
|
# cython: infer_types=True
|
||||||
|
# cython: profile=True
|
||||||
|
cimport numpy as np
|
||||||
|
import numpy
|
||||||
|
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
||||||
|
from thinc.extra.search cimport Beam
|
||||||
|
from thinc.extra.search import MaxViolation
|
||||||
|
from thinc.typedefs cimport hash_t, class_t
|
||||||
|
from thinc.extra.search cimport MaxViolation
|
||||||
|
|
||||||
|
from .transition_system cimport TransitionSystem, Transition
|
||||||
|
from .stateclass cimport StateClass
|
||||||
|
from ..gold cimport GoldParse
|
||||||
|
from ..tokens.doc cimport Doc
|
||||||
|
|
||||||
|
|
||||||
|
# These are passed as callbacks to thinc.search.Beam
|
||||||
|
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
|
||||||
|
dest = <StateClass>_dest
|
||||||
|
src = <StateClass>_src
|
||||||
|
moves = <const Transition*>_moves
|
||||||
|
dest.clone(src)
|
||||||
|
moves[clas].do(dest.c, moves[clas].label)
|
||||||
|
|
||||||
|
|
||||||
|
cdef int _check_final_state(void* _state, void* extra_args) except -1:
|
||||||
|
return (<StateClass>_state).is_final()
|
||||||
|
|
||||||
|
|
||||||
|
def _cleanup(Beam beam):
|
||||||
|
for i in range(beam.width):
|
||||||
|
Py_XDECREF(<PyObject*>beam._states[i].content)
|
||||||
|
Py_XDECREF(<PyObject*>beam._parents[i].content)
|
||||||
|
|
||||||
|
|
||||||
|
cdef hash_t _hash_state(void* _state, void* _) except 0:
|
||||||
|
state = <StateClass>_state
|
||||||
|
if state.c.is_final():
|
||||||
|
return 1
|
||||||
|
else:
|
||||||
|
return state.c.hash()
|
||||||
|
|
||||||
|
|
||||||
|
cdef class ParserBeam(object):
|
||||||
|
cdef public TransitionSystem moves
|
||||||
|
cdef public object states
|
||||||
|
cdef public object golds
|
||||||
|
cdef public object beams
|
||||||
|
cdef public object dones
|
||||||
|
|
||||||
|
def __init__(self, TransitionSystem moves, states, golds,
|
||||||
|
int width, float density):
|
||||||
|
self.moves = moves
|
||||||
|
self.states = states
|
||||||
|
self.golds = golds
|
||||||
|
self.beams = []
|
||||||
|
cdef Beam beam
|
||||||
|
cdef StateClass state, st
|
||||||
|
for state in states:
|
||||||
|
beam = Beam(self.moves.n_moves, width, density)
|
||||||
|
beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent)
|
||||||
|
for i in range(beam.width):
|
||||||
|
st = <StateClass>beam.at(i)
|
||||||
|
st.c.offset = state.c.offset
|
||||||
|
self.beams.append(beam)
|
||||||
|
self.dones = [False] * len(self.beams)
|
||||||
|
|
||||||
|
def __dealloc__(self):
|
||||||
|
if self.beams is not None:
|
||||||
|
for beam in self.beams:
|
||||||
|
if beam is not None:
|
||||||
|
_cleanup(beam)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def is_done(self):
|
||||||
|
return all(b.is_done or self.dones[i] for i, b in enumerate(self.beams))
|
||||||
|
|
||||||
|
def __getitem__(self, i):
|
||||||
|
return self.beams[i]
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return len(self.beams)
|
||||||
|
|
||||||
|
def advance(self, scores, follow_gold=False):
|
||||||
|
cdef Beam beam
|
||||||
|
for i, beam in enumerate(self.beams):
|
||||||
|
if beam.is_done or not scores[i].size or self.dones[i]:
|
||||||
|
continue
|
||||||
|
self._set_scores(beam, scores[i])
|
||||||
|
if self.golds is not None:
|
||||||
|
self._set_costs(beam, self.golds[i], follow_gold=follow_gold)
|
||||||
|
if follow_gold:
|
||||||
|
beam.advance(_transition_state, NULL, <void*>self.moves.c)
|
||||||
|
else:
|
||||||
|
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
|
||||||
|
beam.check_done(_check_final_state, NULL)
|
||||||
|
if beam.is_done and self.golds is not None:
|
||||||
|
for j in range(beam.size):
|
||||||
|
state = <StateClass>beam.at(j)
|
||||||
|
if state.is_final():
|
||||||
|
try:
|
||||||
|
if self.moves.is_gold_parse(state, self.golds[i]):
|
||||||
|
beam._states[j].loss = 0.0
|
||||||
|
elif beam._states[j].loss == 0.0:
|
||||||
|
beam._states[j].loss = 1.0
|
||||||
|
except NotImplementedError:
|
||||||
|
break
|
||||||
|
|
||||||
|
def _set_scores(self, Beam beam, float[:, ::1] scores):
|
||||||
|
cdef float* c_scores = &scores[0, 0]
|
||||||
|
cdef int nr_state = min(scores.shape[0], beam.size)
|
||||||
|
cdef int nr_class = scores.shape[1]
|
||||||
|
for i in range(nr_state):
|
||||||
|
state = <StateClass>beam.at(i)
|
||||||
|
if not state.is_final():
|
||||||
|
for j in range(nr_class):
|
||||||
|
beam.scores[i][j] = c_scores[i * nr_class + j]
|
||||||
|
self.moves.set_valid(beam.is_valid[i], state.c)
|
||||||
|
else:
|
||||||
|
for j in range(beam.nr_class):
|
||||||
|
beam.scores[i][j] = 0
|
||||||
|
beam.costs[i][j] = 0
|
||||||
|
|
||||||
|
def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False):
|
||||||
|
for i in range(beam.size):
|
||||||
|
state = <StateClass>beam.at(i)
|
||||||
|
if not state.c.is_final():
|
||||||
|
self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold)
|
||||||
|
if follow_gold:
|
||||||
|
for j in range(beam.nr_class):
|
||||||
|
if beam.costs[i][j] >= 1:
|
||||||
|
beam.is_valid[i][j] = 0
|
||||||
|
|
||||||
|
|
||||||
|
def get_token_ids(states, int n_tokens):
|
||||||
|
cdef StateClass state
|
||||||
|
cdef np.ndarray ids = numpy.zeros((len(states), n_tokens),
|
||||||
|
dtype='int32', order='C')
|
||||||
|
c_ids = <int*>ids.data
|
||||||
|
for i, state in enumerate(states):
|
||||||
|
if not state.is_final():
|
||||||
|
state.c.set_context_tokens(c_ids, n_tokens)
|
||||||
|
else:
|
||||||
|
ids[i] = -1
|
||||||
|
c_ids += ids.shape[1]
|
||||||
|
return ids
|
||||||
|
|
||||||
|
nr_update = 0
|
||||||
|
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
|
||||||
|
states, tokvecs, golds,
|
||||||
|
state2vec, vec2scores,
|
||||||
|
int width, float density,
|
||||||
|
sgd=None, losses=None, drop=0.):
|
||||||
|
global nr_update
|
||||||
|
cdef MaxViolation violn
|
||||||
|
nr_update += 1
|
||||||
|
pbeam = ParserBeam(moves, states, golds,
|
||||||
|
width=width, density=density)
|
||||||
|
gbeam = ParserBeam(moves, states, golds,
|
||||||
|
width=width, density=0.0)
|
||||||
|
cdef StateClass state
|
||||||
|
beam_maps = []
|
||||||
|
backprops = []
|
||||||
|
violns = [MaxViolation() for _ in range(len(states))]
|
||||||
|
for t in range(max_steps):
|
||||||
|
if pbeam.is_done and gbeam.is_done:
|
||||||
|
break
|
||||||
|
# The beam maps let us find the right row in the flattened scores
|
||||||
|
# arrays for each state. States are identified by (example id, history).
|
||||||
|
# We keep a different beam map for each step (since we'll have a flat
|
||||||
|
# scores array for each step). The beam map will let us take the per-state
|
||||||
|
# losses, and compute the gradient for each (step, state, class).
|
||||||
|
beam_maps.append({})
|
||||||
|
# Gather all states from the two beams in a list. Some states may occur
|
||||||
|
# in both beams. To figure out which beam each state belonged to,
|
||||||
|
# we keep two lists of indices, p_indices and g_indices
|
||||||
|
states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update)
|
||||||
|
if not states:
|
||||||
|
break
|
||||||
|
# Now that we have our flat list of states, feed them through the model
|
||||||
|
token_ids = get_token_ids(states, nr_feature)
|
||||||
|
vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
|
||||||
|
scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)
|
||||||
|
|
||||||
|
# Store the callbacks for the backward pass
|
||||||
|
backprops.append((token_ids, bp_vectors, bp_scores))
|
||||||
|
|
||||||
|
# Unpack the flat scores into lists for the two beams. The indices arrays
|
||||||
|
# tell us which example and state the scores-row refers to.
|
||||||
|
p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices]
|
||||||
|
g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices]
|
||||||
|
# Now advance the states in the beams. The gold beam is constrained
|
||||||
|
# to follow only gold analyses.
|
||||||
|
pbeam.advance(p_scores)
|
||||||
|
gbeam.advance(g_scores, follow_gold=True)
|
||||||
|
# Track the "maximum violation", to use in the update.
|
||||||
|
for i, violn in enumerate(violns):
|
||||||
|
violn.check_crf(pbeam[i], gbeam[i])
|
||||||
|
histories = []
|
||||||
|
losses = []
|
||||||
|
for violn in violns:
|
||||||
|
if violn.p_hist:
|
||||||
|
histories.append(violn.p_hist + violn.g_hist)
|
||||||
|
losses.append(violn.p_probs + violn.g_probs)
|
||||||
|
else:
|
||||||
|
histories.append([])
|
||||||
|
losses.append([])
|
||||||
|
states_d_scores = get_gradient(moves.n_moves, beam_maps, histories, losses)
|
||||||
|
return states_d_scores, backprops[:len(states_d_scores)]
|
||||||
|
|
||||||
|
|
||||||
|
def get_states(pbeams, gbeams, beam_map, nr_update):
|
||||||
|
seen = {}
|
||||||
|
states = []
|
||||||
|
p_indices = []
|
||||||
|
g_indices = []
|
||||||
|
cdef Beam pbeam, gbeam
|
||||||
|
assert len(pbeams) == len(gbeams)
|
||||||
|
for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)):
|
||||||
|
p_indices.append([])
|
||||||
|
g_indices.append([])
|
||||||
|
for i in range(pbeam.size):
|
||||||
|
state = <StateClass>pbeam.at(i)
|
||||||
|
if not state.is_final():
|
||||||
|
key = tuple([eg_id] + pbeam.histories[i])
|
||||||
|
assert key not in seen, (key, seen)
|
||||||
|
seen[key] = len(states)
|
||||||
|
p_indices[-1].append(len(states))
|
||||||
|
states.append(state)
|
||||||
|
beam_map.update(seen)
|
||||||
|
for i in range(gbeam.size):
|
||||||
|
state = <StateClass>gbeam.at(i)
|
||||||
|
if not state.is_final():
|
||||||
|
key = tuple([eg_id] + gbeam.histories[i])
|
||||||
|
if key in seen:
|
||||||
|
g_indices[-1].append(seen[key])
|
||||||
|
else:
|
||||||
|
g_indices[-1].append(len(states))
|
||||||
|
beam_map[key] = len(states)
|
||||||
|
states.append(state)
|
||||||
|
p_idx = [numpy.asarray(idx, dtype='i') for idx in p_indices]
|
||||||
|
g_idx = [numpy.asarray(idx, dtype='i') for idx in g_indices]
|
||||||
|
return states, p_idx, g_idx
|
||||||
|
|
||||||
|
|
||||||
|
def get_gradient(nr_class, beam_maps, histories, losses):
|
||||||
|
"""
|
||||||
|
The global model assigns a loss to each parse. The beam scores
|
||||||
|
are additive, so the same gradient is applied to each action
|
||||||
|
in the history. This gives the gradient of a single *action*
|
||||||
|
for a beam state -- so we have "the gradient of loss for taking
|
||||||
|
action i given history H."
|
||||||
|
|
||||||
|
Histories: Each history is a list of actions
|
||||||
|
Each candidate has a history
|
||||||
|
Each beam has multiple candidates
|
||||||
|
Each batch has multiple beams
|
||||||
|
So history is list of lists of lists of ints
|
||||||
|
"""
|
||||||
|
nr_step = len(beam_maps)
|
||||||
|
grads = []
|
||||||
|
nr_step = 0
|
||||||
|
for eg_id, hists in enumerate(histories):
|
||||||
|
for loss, hist in zip(losses[eg_id], hists):
|
||||||
|
if loss != 0.0 and not numpy.isnan(loss):
|
||||||
|
nr_step = max(nr_step, len(hist))
|
||||||
|
for i in range(nr_step):
|
||||||
|
grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), dtype='f'))
|
||||||
|
assert len(histories) == len(losses)
|
||||||
|
for eg_id, hists in enumerate(histories):
|
||||||
|
for loss, hist in zip(losses[eg_id], hists):
|
||||||
|
if loss == 0.0 or numpy.isnan(loss):
|
||||||
|
continue
|
||||||
|
key = tuple([eg_id])
|
||||||
|
# Adjust loss for length
|
||||||
|
avg_loss = loss / len(hist)
|
||||||
|
loss += avg_loss * (nr_step - len(hist))
|
||||||
|
for j, clas in enumerate(hist):
|
||||||
|
i = beam_maps[j][key]
|
||||||
|
# In step j, at state i action clas
|
||||||
|
# resulted in loss
|
||||||
|
grads[j][i, clas] += loss
|
||||||
|
key = key + tuple([clas])
|
||||||
|
return grads
|
||||||
|
|
||||||
|
|
|
@ -37,6 +37,7 @@ cdef cppclass StateC:
|
||||||
this.shifted = <bint*>calloc(length + (PADDING * 2), sizeof(bint))
|
this.shifted = <bint*>calloc(length + (PADDING * 2), sizeof(bint))
|
||||||
this._sent = <TokenC*>calloc(length + (PADDING * 2), sizeof(TokenC))
|
this._sent = <TokenC*>calloc(length + (PADDING * 2), sizeof(TokenC))
|
||||||
this._ents = <Entity*>calloc(length + (PADDING * 2), sizeof(Entity))
|
this._ents = <Entity*>calloc(length + (PADDING * 2), sizeof(Entity))
|
||||||
|
this.offset = 0
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(length + (PADDING * 2)):
|
for i in range(length + (PADDING * 2)):
|
||||||
this._ents[i].end = -1
|
this._ents[i].end = -1
|
||||||
|
@ -73,7 +74,16 @@ cdef cppclass StateC:
|
||||||
free(this.shifted - PADDING)
|
free(this.shifted - PADDING)
|
||||||
|
|
||||||
void set_context_tokens(int* ids, int n) nogil:
|
void set_context_tokens(int* ids, int n) nogil:
|
||||||
if n == 13:
|
if n == 8:
|
||||||
|
ids[0] = this.B(0)
|
||||||
|
ids[1] = this.B(1)
|
||||||
|
ids[2] = this.S(0)
|
||||||
|
ids[3] = this.S(1)
|
||||||
|
ids[4] = this.H(this.S(0))
|
||||||
|
ids[5] = this.L(this.B(0), 1)
|
||||||
|
ids[6] = this.L(this.S(0), 2)
|
||||||
|
ids[7] = this.R(this.S(0), 1)
|
||||||
|
elif n == 13:
|
||||||
ids[0] = this.B(0)
|
ids[0] = this.B(0)
|
||||||
ids[1] = this.B(1)
|
ids[1] = this.B(1)
|
||||||
ids[2] = this.S(0)
|
ids[2] = this.S(0)
|
||||||
|
|
|
@ -10,6 +10,8 @@ from libc.stdint cimport uint32_t
|
||||||
from libc.string cimport memcpy
|
from libc.string cimport memcpy
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
from thinc.extra.search cimport Beam
|
||||||
|
import numpy
|
||||||
|
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ._state cimport StateC, is_space_token
|
from ._state cimport StateC, is_space_token
|
||||||
|
@ -18,7 +20,7 @@ from .transition_system cimport do_func_t, get_cost_func_t
|
||||||
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
||||||
from ..gold cimport GoldParse
|
from ..gold cimport GoldParse
|
||||||
from ..gold cimport GoldParseC
|
from ..gold cimport GoldParseC
|
||||||
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE
|
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE, IS_PUNCT
|
||||||
from ..lexeme cimport Lexeme
|
from ..lexeme cimport Lexeme
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC
|
||||||
|
|
||||||
|
@ -284,7 +286,7 @@ cdef class Break:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
cdef int _get_root(int word, const GoldParseC* gold) nogil:
|
cdef int _get_root(int word, const GoldParseC* gold) nogil:
|
||||||
while gold.heads[word] != word and not gold.has_dep[word] and word >= 0:
|
while gold.heads[word] != word and gold.has_dep[word] and word >= 0:
|
||||||
word = gold.heads[word]
|
word = gold.heads[word]
|
||||||
if not gold.has_dep[word]:
|
if not gold.has_dep[word]:
|
||||||
return -1
|
return -1
|
||||||
|
@ -349,6 +351,20 @@ cdef class ArcEager(TransitionSystem):
        def __get__(self):
            return (SHIFT, REDUCE, LEFT, RIGHT, BREAK)

    def is_gold_parse(self, StateClass state, GoldParse gold):
        predicted = set()
        truth = set()
        for i in range(gold.length):
            if gold.cand_to_gold[i] is None:
                continue
            if state.safe_get(i).dep:
                predicted.add((i, state.H(i), self.strings[state.safe_get(i).dep]))
            else:
                predicted.add((i, state.H(i), 'ROOT'))
            id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
            truth.add((id_, head, dep))
        return truth == predicted

def has_gold(self, GoldParse gold, start=0, end=None):
|
def has_gold(self, GoldParse gold, start=0, end=None):
|
||||||
end = end or len(gold.heads)
|
end = end or len(gold.heads)
|
||||||
if all([tag is None for tag in gold.heads[start:end]]):
|
if all([tag is None for tag in gold.heads[start:end]]):
|
||||||
|
@ -360,7 +376,7 @@ cdef class ArcEager(TransitionSystem):
|
||||||
if not self.has_gold(gold):
|
if not self.has_gold(gold):
|
||||||
return None
|
return None
|
||||||
for i in range(gold.length):
|
for i in range(gold.length):
|
||||||
if gold.heads[i] is None: # Missing values
|
if gold.heads[i] is None or gold.labels[i] is None: # Missing values
|
||||||
gold.c.heads[i] = i
|
gold.c.heads[i] = i
|
||||||
gold.c.has_dep[i] = False
|
gold.c.has_dep[i] = False
|
||||||
else:
|
else:
|
||||||
|
@ -383,6 +399,7 @@ cdef class ArcEager(TransitionSystem):
|
||||||
for i in range(self.n_moves):
|
for i in range(self.n_moves):
|
||||||
if self.c[i].move == move and self.c[i].label == label:
|
if self.c[i].move == move and self.c[i].label == label:
|
||||||
return self.c[i]
|
return self.c[i]
|
||||||
|
return Transition(clas=0, move=MISSING, label=0)
|
||||||
|
|
||||||
def move_name(self, int move, attr_t label):
|
def move_name(self, int move, attr_t label):
|
||||||
label_str = self.strings[label]
|
label_str = self.strings[label]
|
||||||
|
@ -499,9 +516,11 @@ cdef class ArcEager(TransitionSystem):
|
||||||
"before training and after parsing. Either pass make_projective=True "
|
"before training and after parsing. Either pass make_projective=True "
|
||||||
"to the GoldParse class, or use PseudoProjectivity.preprocess_training_data")
|
"to the GoldParse class, or use PseudoProjectivity.preprocess_training_data")
|
||||||
else:
|
else:
|
||||||
|
print(gold.orig_annot)
|
||||||
print(gold.words)
|
print(gold.words)
|
||||||
print(gold.heads)
|
print(gold.heads)
|
||||||
print(gold.labels)
|
print(gold.labels)
|
||||||
|
print(gold.sent_starts)
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Could not find a gold-standard action to supervise the dependency "
|
"Could not find a gold-standard action to supervise the dependency "
|
||||||
"parser.\n"
|
"parser.\n"
|
||||||
|
@ -510,3 +529,23 @@ cdef class ArcEager(TransitionSystem):
|
||||||
"State at failure:\n"
|
"State at failure:\n"
|
||||||
"%s" % (self.n_moves, stcls.print_state(gold.words)))
|
"%s" % (self.n_moves, stcls.print_state(gold.words)))
|
||||||
assert n_gold >= 1
|
assert n_gold >= 1
|
||||||
|
|
||||||
|
def get_beam_annot(self, Beam beam):
|
||||||
|
length = (<StateClass>beam.at(0)).c.length
|
||||||
|
heads = [{} for _ in range(length)]
|
||||||
|
deps = [{} for _ in range(length)]
|
||||||
|
probs = beam.probs
|
||||||
|
for i in range(beam.size):
|
||||||
|
stcls = <StateClass>beam.at(i)
|
||||||
|
self.finalize_state(stcls.c)
|
||||||
|
if stcls.is_final():
|
||||||
|
prob = probs[i]
|
||||||
|
for j in range(stcls.c.length):
|
||||||
|
head = j + stcls.c._sent[j].head
|
||||||
|
dep = stcls.c._sent[j].dep
|
||||||
|
heads[j].setdefault(head, 0.0)
|
||||||
|
heads[j][head] += prob
|
||||||
|
deps[j].setdefault(dep, 0.0)
|
||||||
|
deps[j][dep] += prob
|
||||||
|
return heads, deps
|
||||||
|
|
||||||
|
|
|
@ -107,7 +107,7 @@ cdef class BeamParser(Parser):
|
||||||
# The non-monotonic oracle makes it difficult to ensure final costs are
|
# The non-monotonic oracle makes it difficult to ensure final costs are
|
||||||
# correct. Therefore do final correction
|
# correct. Therefore do final correction
|
||||||
for i in range(pred.size):
|
for i in range(pred.size):
|
||||||
if is_gold(<StateClass>pred.at(i), gold_parse, self.moves.strings):
|
if self.moves.is_gold_parse(<StateClass>pred.at(i), gold_parse):
|
||||||
pred._states[i].loss = 0.0
|
pred._states[i].loss = 0.0
|
||||||
elif pred._states[i].loss == 0.0:
|
elif pred._states[i].loss == 0.0:
|
||||||
pred._states[i].loss = 1.0
|
pred._states[i].loss = 1.0
|
||||||
|
@ -213,7 +213,7 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio
|
||||||
if not pred._states[i].is_done or pred._states[i].loss == 0:
|
if not pred._states[i].is_done or pred._states[i].loss == 0:
|
||||||
continue
|
continue
|
||||||
state = <StateClass>pred.at(i)
|
state = <StateClass>pred.at(i)
|
||||||
if is_gold(state, gold_parse, moves.strings) == True:
|
if moves.is_gold_parse(state, gold_parse) == True:
|
||||||
for dep in gold_parse.orig_annot:
|
for dep in gold_parse.orig_annot:
|
||||||
print(dep[1], dep[3], dep[4])
|
print(dep[1], dep[3], dep[4])
|
||||||
print("Cost", pred._states[i].loss)
|
print("Cost", pred._states[i].loss)
|
||||||
|
@ -227,7 +227,7 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio
|
||||||
if not gold._states[i].is_done:
|
if not gold._states[i].is_done:
|
||||||
continue
|
continue
|
||||||
state = <StateClass>gold.at(i)
|
state = <StateClass>gold.at(i)
|
||||||
if is_gold(state, gold_parse, moves.strings) == False:
|
if moves.is_gold(state, gold_parse) == False:
|
||||||
print("Truth")
|
print("Truth")
|
||||||
for dep in gold_parse.orig_annot:
|
for dep in gold_parse.orig_annot:
|
||||||
print(dep[1], dep[3], dep[4])
|
print(dep[1], dep[3], dep[4])
|
||||||
|
@ -237,16 +237,3 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio
|
||||||
raise Exception("Gold parse is not gold-standard")
|
raise Exception("Gold parse is not gold-standard")
|
||||||
|
|
||||||
|
|
||||||
def is_gold(StateClass state, GoldParse gold, StringStore strings):
|
|
||||||
predicted = set()
|
|
||||||
truth = set()
|
|
||||||
for i in range(gold.length):
|
|
||||||
if gold.cand_to_gold[i] is None:
|
|
||||||
continue
|
|
||||||
if state.safe_get(i).dep:
|
|
||||||
predicted.add((i, state.H(i), strings[state.safe_get(i).dep]))
|
|
||||||
else:
|
|
||||||
predicted.add((i, state.H(i), 'ROOT'))
|
|
||||||
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
|
|
||||||
truth.add((id_, head, dep))
|
|
||||||
return truth == predicted
|
|
||||||
|
|
|
@ -110,5 +110,35 @@ def es_noun_chunks(obj):
|
||||||
token = next_token(token)
|
token = next_token(token)
|
||||||
|
|
||||||
|
|
||||||
|
def french_noun_chunks(obj):
|
||||||
|
labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
|
||||||
|
doc = obj.doc # Ensure works on both Doc and Span.
|
||||||
|
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||||
|
conj = doc.vocab.strings.add('conj')
|
||||||
|
np_label = doc.vocab.strings.add('NP')
|
||||||
|
seen = set()
|
||||||
|
for i, word in enumerate(obj):
|
||||||
|
if word.pos not in (NOUN, PROPN, PRON):
|
||||||
|
continue
|
||||||
|
# Prevent nested chunks from being produced
|
||||||
|
if word.i in seen:
|
||||||
|
continue
|
||||||
|
if word.dep in np_deps:
|
||||||
|
if any(w.i in seen for w in word.subtree):
|
||||||
|
continue
|
||||||
|
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
|
||||||
|
yield word.left_edge.i, word.right_edge.i+1, np_label
|
||||||
|
elif word.dep == conj:
|
||||||
|
head = word.head
|
||||||
|
while head.dep == conj and head.head.i < head.i:
|
||||||
|
head = head.head
|
||||||
|
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||||
|
if head.dep in np_deps:
|
||||||
|
if any(w.i in seen for w in word.subtree):
|
||||||
|
continue
|
||||||
|
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
|
||||||
|
yield word.left_edge.i, word.right_edge.i+1, np_label
|
||||||
|
|
||||||
|
|
||||||
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks,
|
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks,
|
||||||
'es': es_noun_chunks}
|
'es': es_noun_chunks, 'fr': french_noun_chunks}
|
||||||
|
|
|
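
A rough usage sketch for the new French chunker: once 'fr' is registered in CHUNKERS, iterating doc.noun_chunks on a French Doc is served by french_noun_chunks() above. The model name below is the French model referenced elsewhere in this changeset; treat the snippet as illustrative rather than a supported recipe.

import spacy

nlp = spacy.load('fr_depvec_web_lg')
doc = nlp(u"Arsene Wenger organise une conference de presse a Londres.")
for chunk in doc.noun_chunks:
    # Each chunk is a Span labelled 'NP', yielded by french_noun_chunks().
    print(chunk.text, chunk.root.dep_)
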
@ -2,7 +2,10 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from thinc.typedefs cimport weight_t
|
from thinc.typedefs cimport weight_t
|
||||||
|
from thinc.extra.search cimport Beam
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
import numpy
|
||||||
|
from thinc.neural.ops import NumpyOps
|
||||||
|
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ._state cimport StateC
|
from ._state cimport StateC
|
||||||
|
@ -110,7 +113,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
|
|
||||||
def has_gold(self, GoldParse gold, start=0, end=None):
|
def has_gold(self, GoldParse gold, start=0, end=None):
|
||||||
end = end or len(gold.ner)
|
end = end or len(gold.ner)
|
||||||
if all([tag == '-' for tag in gold.ner[start:end]]):
|
if all([tag in ('-', None) for tag in gold.ner[start:end]]):
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
return True
|
return True
|
||||||
|
@ -122,11 +125,46 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
gold.c.ner[i] = self.lookup_transition(gold.ner[i])
|
gold.c.ner[i] = self.lookup_transition(gold.ner[i])
|
||||||
return gold
|
return gold
|
||||||
|
|
||||||
|
def get_beam_annot(self, Beam beam):
|
||||||
|
entities = {}
|
||||||
|
probs = beam.probs
|
||||||
|
for i in range(beam.size):
|
||||||
|
stcls = <StateClass>beam.at(i)
|
||||||
|
if stcls.is_final():
|
||||||
|
self.finalize_state(stcls.c)
|
||||||
|
prob = probs[i]
|
||||||
|
for j in range(stcls.c._e_i):
|
||||||
|
start = stcls.c._ents[j].start
|
||||||
|
end = stcls.c._ents[j].end
|
||||||
|
label = stcls.c._ents[j].label
|
||||||
|
entities.setdefault((start, end, label), 0.0)
|
||||||
|
entities[(start, end, label)] += prob
|
||||||
|
return entities
|
||||||
|
|
||||||
|
def get_beam_parses(self, Beam beam):
|
||||||
|
parses = []
|
||||||
|
probs = beam.probs
|
||||||
|
for i in range(beam.size):
|
||||||
|
stcls = <StateClass>beam.at(i)
|
||||||
|
if stcls.is_final():
|
||||||
|
self.finalize_state(stcls.c)
|
||||||
|
prob = probs[i]
|
||||||
|
parse = []
|
||||||
|
for j in range(stcls.c._e_i):
|
||||||
|
start = stcls.c._ents[j].start
|
||||||
|
end = stcls.c._ents[j].end
|
||||||
|
label = stcls.c._ents[j].label
|
||||||
|
parse.append((start, end, self.strings[label]))
|
||||||
|
parses.append((prob, parse))
|
||||||
|
return parses
|
||||||
|
|
||||||
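
get_beam_parses() above returns a list of (probability, [(start, end, label), ...]) pairs, one per finished beam state. A small, hypothetical helper for turning that into per-entity confidences (plain Python, not part of this changeset):

from collections import defaultdict

def entity_confidences(parses):
    scores = defaultdict(float)
    total = 0.0
    for prob, entities in parses:
        total += prob
        for ent in entities:
            scores[ent] += prob
    # Normalise by the probability mass of the finished states seen.
    return {ent: p / total for ent, p in scores.items()} if total else {}
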
cdef Transition lookup_transition(self, object name) except *:
|
cdef Transition lookup_transition(self, object name) except *:
|
||||||
cdef attr_t label
|
cdef attr_t label
|
||||||
if name == '-' or name == None:
|
if name == '-' or name == None:
|
||||||
move_str = 'M'
|
move_str = 'M'
|
||||||
label = 0
|
label = 0
|
||||||
|
elif name == '!O':
|
||||||
|
return Transition(clas=0, move=ISNT, label=0, score=0)
|
||||||
elif '-' in name:
|
elif '-' in name:
|
||||||
move_str, label_str = name.split('-', 1)
|
move_str, label_str = name.split('-', 1)
|
||||||
# Hacky way to denote 'not this entity'
|
# Hacky way to denote 'not this entity'
|
||||||
|
@ -308,6 +346,9 @@ cdef class In:
|
||||||
elif g_act == UNIT:
|
elif g_act == UNIT:
|
||||||
# I, Gold U --> True iff next tag == O
|
# I, Gold U --> True iff next tag == O
|
||||||
return next_act != OUT
|
return next_act != OUT
|
||||||
|
# Support partial supervision in the form of "not this label"
|
||||||
|
elif g_act == ISNT:
|
||||||
|
return 0
|
||||||
else:
|
else:
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
@ -350,6 +391,9 @@ cdef class Last:
|
||||||
elif g_act == UNIT:
|
elif g_act == UNIT:
|
||||||
# L, Gold U --> True
|
# L, Gold U --> True
|
||||||
return 0
|
return 0
|
||||||
|
# Support partial supervision in the form of "not this label"
|
||||||
|
elif g_act == ISNT:
|
||||||
|
return 0
|
||||||
else:
|
else:
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
@ -418,7 +462,9 @@ cdef class Out:
|
||||||
cdef int g_act = gold.ner[s.B(0)].move
|
cdef int g_act = gold.ner[s.B(0)].move
|
||||||
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||||
|
|
||||||
if g_act == MISSING or g_act == ISNT:
|
if g_act == ISNT and g_tag == 0:
|
||||||
|
return 1
|
||||||
|
elif g_act == MISSING or g_act == ISNT:
|
||||||
return 0
|
return 0
|
||||||
elif g_act == BEGIN:
|
elif g_act == BEGIN:
|
||||||
# O, Gold B --> False
|
# O, Gold B --> False
|
||||||
|
|
|
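
A rough reading of the cost changes above and of the lookup_transition hunk: BILUO tags may now carry a '!' to express partial supervision ("known NOT to be this"), which maps to the new ISNT move; '!O' means the token is known not to be outside an entity, so the Out action is penalised. The snippet below only restates that convention and is illustrative:

gold_ner = ['U-PERSON', 'O', '!O', 'B-!GPE', 'L-!GPE']
for tag in gold_ner:
    if tag == '!O':
        print(tag, '-> the O (outside) action gets cost 1 here')
    elif '-!' in tag:
        label = tag.split('-', 1)[1].lstrip('!')
        print(tag, '-> anything is acceptable except a %s entity' % label)
    else:
        print(tag, '-> ordinary fully supervised BILUO tag')
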
@ -29,21 +29,26 @@ from thinc.linear.avgtron cimport AveragedPerceptron
|
||||||
from thinc.linalg cimport VecVec
|
from thinc.linalg cimport VecVec
|
||||||
from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
|
from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
|
||||||
from thinc.extra.eg cimport Example
|
from thinc.extra.eg cimport Example
|
||||||
|
from thinc.extra.search cimport Beam
|
||||||
|
|
||||||
from cymem.cymem cimport Pool, Address
|
from cymem.cymem cimport Pool, Address
|
||||||
from murmurhash.mrmr cimport hash64
|
from murmurhash.mrmr cimport hash64
|
||||||
from preshed.maps cimport MapStruct
|
from preshed.maps cimport MapStruct
|
||||||
from preshed.maps cimport map_get
|
from preshed.maps cimport map_get
|
||||||
|
|
||||||
from thinc.api import layerize, chain, noop, clone
|
from thinc.api import layerize, chain, noop, clone, with_flatten
|
||||||
from thinc.neural import Model, Affine, ELU, ReLu, Maxout
|
from thinc.neural import Model, Affine, ReLu, Maxout
|
||||||
|
from thinc.neural._classes.batchnorm import BatchNorm as BN
|
||||||
|
from thinc.neural._classes.selu import SELU
|
||||||
|
from thinc.neural._classes.layernorm import LayerNorm
|
||||||
from thinc.neural.ops import NumpyOps, CupyOps
|
from thinc.neural.ops import NumpyOps, CupyOps
|
||||||
from thinc.neural.util import get_array_module
|
from thinc.neural.util import get_array_module
|
||||||
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..util import get_async, get_cuda_stream
|
from ..util import get_async, get_cuda_stream
|
||||||
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
|
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
|
||||||
from .._ml import Tok2Vec, doc2feats, rebatch
|
from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
|
||||||
|
from .._ml import Residual, drop_layer
|
||||||
from ..compat import json_dumps
|
from ..compat import json_dumps
|
||||||
|
|
||||||
from . import _parse_features
|
from . import _parse_features
|
||||||
|
@ -58,8 +63,10 @@ from ..structs cimport TokenC
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from ..strings cimport StringStore
|
from ..strings cimport StringStore
|
||||||
from ..gold cimport GoldParse
|
from ..gold cimport GoldParse
|
||||||
from ..attrs cimport TAG, DEP
|
from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG
|
||||||
|
from . import _beam_utils
|
||||||
|
|
||||||
|
USE_FINE_TUNE = True
|
||||||
|
|
||||||
def get_templates(*args, **kwargs):
|
def get_templates(*args, **kwargs):
|
||||||
return []
|
return []
|
||||||
|
@ -110,7 +117,6 @@ cdef class precompute_hiddens:
|
||||||
self.nO = cached.shape[2]
|
self.nO = cached.shape[2]
|
||||||
self.nP = getattr(lower_model, 'nP', 1)
|
self.nP = getattr(lower_model, 'nP', 1)
|
||||||
self.ops = lower_model.ops
|
self.ops = lower_model.ops
|
||||||
self._features = numpy.zeros((batch_size, self.nO*self.nP), dtype='f')
|
|
||||||
self._is_synchronized = False
|
self._is_synchronized = False
|
||||||
self._cuda_stream = cuda_stream
|
self._cuda_stream = cuda_stream
|
||||||
self._cached = cached
|
self._cached = cached
|
||||||
|
@ -127,13 +133,12 @@ cdef class precompute_hiddens:
|
||||||
return self.begin_update(X)[0]
|
return self.begin_update(X)[0]
|
||||||
|
|
||||||
def begin_update(self, token_ids, drop=0.):
|
def begin_update(self, token_ids, drop=0.):
|
||||||
self._features.fill(0)
|
cdef np.ndarray state_vector = numpy.zeros((token_ids.shape[0], self.nO*self.nP), dtype='f')
|
||||||
# This is tricky, but (assuming GPU available);
|
# This is tricky, but (assuming GPU available);
|
||||||
# - Input to forward on CPU
|
# - Input to forward on CPU
|
||||||
# - Output from forward on CPU
|
# - Output from forward on CPU
|
||||||
# - Input to backward on GPU!
|
# - Input to backward on GPU!
|
||||||
# - Output from backward on GPU
|
# - Output from backward on GPU
|
||||||
cdef np.ndarray state_vector = self._features[:len(token_ids)]
|
|
||||||
bp_hiddens = self._bp_hiddens
|
bp_hiddens = self._bp_hiddens
|
||||||
|
|
||||||
feat_weights = self.get_feat_weights()
|
feat_weights = self.get_feat_weights()
|
||||||
|
@ -233,11 +238,14 @@ cdef class Parser:
|
||||||
Base class of the DependencyParser and EntityRecognizer.
|
Base class of the DependencyParser and EntityRecognizer.
|
||||||
"""
|
"""
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, nr_class, token_vector_width=128, hidden_width=128, depth=1, **cfg):
|
def Model(cls, nr_class, token_vector_width=128, hidden_width=300, depth=1, **cfg):
|
||||||
depth = util.env_opt('parser_hidden_depth', depth)
|
depth = util.env_opt('parser_hidden_depth', depth)
|
||||||
token_vector_width = util.env_opt('token_vector_width', token_vector_width)
|
token_vector_width = util.env_opt('token_vector_width', token_vector_width)
|
||||||
hidden_width = util.env_opt('hidden_width', hidden_width)
|
hidden_width = util.env_opt('hidden_width', hidden_width)
|
||||||
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
|
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
|
||||||
|
embed_size = util.env_opt('embed_size', 4000)
|
||||||
|
tensors = fine_tune(Tok2Vec(token_vector_width, embed_size,
|
||||||
|
preprocess=doc2feats()))
|
||||||
if parser_maxout_pieces == 1:
|
if parser_maxout_pieces == 1:
|
||||||
lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
|
lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
|
||||||
nF=cls.nr_feature,
|
nF=cls.nr_feature,
|
||||||
|
@ -269,7 +277,7 @@ cdef class Parser:
|
||||||
'hidden_width': hidden_width,
|
'hidden_width': hidden_width,
|
||||||
'maxout_pieces': parser_maxout_pieces
|
'maxout_pieces': parser_maxout_pieces
|
||||||
}
|
}
|
||||||
return (lower, upper), cfg
|
return (tensors, lower, upper), cfg
|
||||||
|
|
||||||
def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
|
def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
|
||||||
"""
|
"""
|
||||||
|
@ -295,6 +303,10 @@ cdef class Parser:
|
||||||
self.moves = self.TransitionSystem(self.vocab.strings, {})
|
self.moves = self.TransitionSystem(self.vocab.strings, {})
|
||||||
else:
|
else:
|
||||||
self.moves = moves
|
self.moves = moves
|
||||||
|
if 'beam_width' not in cfg:
|
||||||
|
cfg['beam_width'] = util.env_opt('beam_width', 1)
|
||||||
|
if 'beam_density' not in cfg:
|
||||||
|
cfg['beam_density'] = util.env_opt('beam_density', 0.0)
|
||||||
self.cfg = cfg
|
self.cfg = cfg
|
||||||
if 'actions' in self.cfg:
|
if 'actions' in self.cfg:
|
||||||
for action, labels in self.cfg.get('actions', {}).items():
|
for action, labels in self.cfg.get('actions', {}).items():
|
||||||
|
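
The new beam_width / beam_density keys default from env_opt but can also be passed straight through **cfg when constructing the parser. A minimal sketch under assumed module paths (spacy.syntax.nn_parser, spacy.syntax.arc_eager):

from spacy.vocab import Vocab
from spacy.syntax.arc_eager import ArcEager      # import path assumed
from spacy.syntax.nn_parser import Parser        # import path assumed

vocab = Vocab()
parser = Parser(vocab, moves=ArcEager(vocab.strings, {}), model=True,
                beam_width=8, beam_density=0.0001)
assert parser.cfg['beam_width'] == 8
assert parser.cfg['beam_density'] == 0.0001
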
@ -305,7 +317,7 @@ cdef class Parser:
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (Parser, (self.vocab, self.moves, self.model), None, None)
|
return (Parser, (self.vocab, self.moves, self.model), None, None)
|
||||||
|
|
||||||
def __call__(self, Doc doc):
|
def __call__(self, Doc doc, beam_width=None, beam_density=None):
|
||||||
"""
|
"""
|
||||||
Apply the parser or entity recognizer, setting the annotations onto the Doc object.
|
Apply the parser or entity recognizer, setting the annotations onto the Doc object.
|
||||||
|
|
||||||
|
@ -314,11 +326,26 @@ cdef class Parser:
|
||||||
Returns:
|
Returns:
|
||||||
None
|
None
|
||||||
"""
|
"""
|
||||||
|
if beam_width is None:
|
||||||
|
beam_width = self.cfg.get('beam_width', 1)
|
||||||
|
if beam_density is None:
|
||||||
|
beam_density = self.cfg.get('beam_density', 0.0)
|
||||||
|
cdef Beam beam
|
||||||
|
if beam_width == 1:
|
||||||
states = self.parse_batch([doc], [doc.tensor])
|
states = self.parse_batch([doc], [doc.tensor])
|
||||||
self.set_annotations([doc], states)
|
self.set_annotations([doc], states)
|
||||||
return doc
|
return doc
|
||||||
|
else:
|
||||||
|
beam = self.beam_parse([doc], [doc.tensor],
|
||||||
|
beam_width=beam_width, beam_density=beam_density)[0]
|
||||||
|
output = self.moves.get_beam_annot(beam)
|
||||||
|
state = <StateClass>beam.at(0)
|
||||||
|
self.set_annotations([doc], [state])
|
||||||
|
_cleanup(beam)
|
||||||
|
return output
|
||||||
|
|
||||||
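
When beam_width > 1, __call__ above returns the output of moves.get_beam_annot(beam) instead of only annotating the Doc in place; for the entity recognizer that is a dict mapping (start, end, label) to accumulated probability. This mirrors the new test_beam_parse.py added later in this commit (requires the en_core_web_sm model):

import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Australia is a country', disable=['ner'])
ents = nlp.entity(doc, beam_width=2)
print(ents)   # e.g. {(start, end, label_id): summed probability, ...}
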
def pipe(self, docs, int batch_size=1000, int n_threads=2):
|
def pipe(self, docs, int batch_size=1000, int n_threads=2,
|
||||||
|
beam_width=None, beam_density=None):
|
||||||
"""
|
"""
|
||||||
Process a stream of documents.
|
Process a stream of documents.
|
||||||
|
|
||||||
|
@ -330,13 +357,23 @@ cdef class Parser:
|
||||||
The number of threads with which to work on the buffer in parallel.
|
The number of threads with which to work on the buffer in parallel.
|
||||||
Yields (Doc): Documents, in order.
|
Yields (Doc): Documents, in order.
|
||||||
"""
|
"""
|
||||||
cdef StateClass parse_state
|
if beam_width is None:
|
||||||
|
beam_width = self.cfg.get('beam_width', 1)
|
||||||
|
if beam_density is None:
|
||||||
|
beam_density = self.cfg.get('beam_density', 0.0)
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
queue = []
|
cdef Beam beam
|
||||||
for docs in cytoolz.partition_all(batch_size, docs):
|
for docs in cytoolz.partition_all(batch_size, docs):
|
||||||
docs = list(docs)
|
docs = list(docs)
|
||||||
tokvecs = [d.tensor for d in docs]
|
tokvecs = [doc.tensor for doc in docs]
|
||||||
|
if beam_width == 1:
|
||||||
parse_states = self.parse_batch(docs, tokvecs)
|
parse_states = self.parse_batch(docs, tokvecs)
|
||||||
|
else:
|
||||||
|
beams = self.beam_parse(docs, tokvecs,
|
||||||
|
beam_width=beam_width, beam_density=beam_density)
|
||||||
|
parse_states = []
|
||||||
|
for beam in beams:
|
||||||
|
parse_states.append(<StateClass>beam.at(0))
|
||||||
self.set_annotations(docs, parse_states)
|
self.set_annotations(docs, parse_states)
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
|
@ -351,8 +388,13 @@ cdef class Parser:
|
||||||
int nr_class, nr_feat, nr_piece, nr_dim, nr_state
|
int nr_class, nr_feat, nr_piece, nr_dim, nr_state
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
|
if isinstance(tokvecses, np.ndarray):
|
||||||
|
tokvecses = [tokvecses]
|
||||||
|
|
||||||
tokvecs = self.model[0].ops.flatten(tokvecses)
|
tokvecs = self.model[0].ops.flatten(tokvecses)
|
||||||
|
if USE_FINE_TUNE:
|
||||||
|
# TODO: This is incorrect! Unhack when training next model
|
||||||
|
tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
|
||||||
|
|
||||||
nr_state = len(docs)
|
nr_state = len(docs)
|
||||||
nr_class = self.moves.n_moves
|
nr_class = self.moves.n_moves
|
||||||
|
@ -404,6 +446,55 @@ cdef class Parser:
|
||||||
next_step.push_back(st)
|
next_step.push_back(st)
|
||||||
return states
|
return states
|
||||||
|
|
||||||
|
def beam_parse(self, docs, tokvecses, int beam_width=3, float beam_density=0.001):
|
||||||
|
cdef Beam beam
|
||||||
|
cdef np.ndarray scores
|
||||||
|
cdef Doc doc
|
||||||
|
cdef int nr_class = self.moves.n_moves
|
||||||
|
cdef StateClass stcls, output
|
||||||
|
tokvecs = self.model[0].ops.flatten(tokvecses)
|
||||||
|
if USE_FINE_TUNE:
|
||||||
|
tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
|
||||||
|
cuda_stream = get_cuda_stream()
|
||||||
|
state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
|
||||||
|
cuda_stream, 0.0)
|
||||||
|
beams = []
|
||||||
|
cdef int offset = 0
|
||||||
|
cdef int j = 0
|
||||||
|
cdef int k
|
||||||
|
for doc in docs:
|
||||||
|
beam = Beam(nr_class, beam_width, min_density=beam_density)
|
||||||
|
beam.initialize(self.moves.init_beam_state, doc.length, doc.c)
|
||||||
|
for i in range(beam.width):
|
||||||
|
stcls = <StateClass>beam.at(i)
|
||||||
|
stcls.c.offset = offset
|
||||||
|
offset += len(doc)
|
||||||
|
beam.check_done(_check_final_state, NULL)
|
||||||
|
while not beam.is_done:
|
||||||
|
states = []
|
||||||
|
for i in range(beam.size):
|
||||||
|
stcls = <StateClass>beam.at(i)
|
||||||
|
# This way we avoid having to score finalized states
|
||||||
|
# We do have to take care to keep indexes aligned, though
|
||||||
|
if not stcls.is_final():
|
||||||
|
states.append(stcls)
|
||||||
|
token_ids = self.get_token_ids(states)
|
||||||
|
vectors = state2vec(token_ids)
|
||||||
|
scores = vec2scores(vectors)
|
||||||
|
j = 0
|
||||||
|
c_scores = <float*>scores.data
|
||||||
|
for i in range(beam.size):
|
||||||
|
stcls = <StateClass>beam.at(i)
|
||||||
|
if not stcls.is_final():
|
||||||
|
self.moves.set_valid(beam.is_valid[i], stcls.c)
|
||||||
|
for k in range(nr_class):
|
||||||
|
beam.scores[i][k] = c_scores[j * scores.shape[1] + k]
|
||||||
|
j += 1
|
||||||
|
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
|
||||||
|
beam.check_done(_check_final_state, NULL)
|
||||||
|
beams.append(beam)
|
||||||
|
return beams
|
||||||
|
|
||||||
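
beam_parse() above scores only the non-final states, writes the scores back into the thinc Beam, and lets the beam advance until every state is final. A very reduced pure-Python analogue, with hypothetical callables, just to show the shape of that loop:

def beam_search_sketch(init_states, score_actions, apply_action, is_final, width=3):
    beam = [(0.0, s) for s in init_states][:width]
    while not all(is_final(s) for _, s in beam):
        candidates = []
        for score, state in beam:
            if is_final(state):
                candidates.append((score, state))
                continue
            for action, action_score in score_actions(state):
                candidates.append((score + action_score, apply_action(state, action)))
        # Keep the best `width` partial analyses, like Beam.advance() does.
        beam = sorted(candidates, key=lambda c: c[0], reverse=True)[:width]
    return beam
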
cdef void _parse_step(self, StateC* state,
|
cdef void _parse_step(self, StateC* state,
|
||||||
const float* feat_weights,
|
const float* feat_weights,
|
||||||
int nr_class, int nr_feat, int nr_piece) nogil:
|
int nr_class, int nr_feat, int nr_piece) nogil:
|
||||||
|
@ -427,6 +518,12 @@ cdef class Parser:
|
||||||
free(token_ids)
|
free(token_ids)
|
||||||
|
|
||||||
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
||||||
|
if not any(self.moves.has_gold(gold) for gold in golds):
|
||||||
|
return None
|
||||||
|
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5:
|
||||||
|
return self.update_beam(docs_tokvecs, golds,
|
||||||
|
self.cfg['beam_width'], self.cfg['beam_density'],
|
||||||
|
drop=drop, sgd=sgd, losses=losses)
|
||||||
if losses is not None and self.name not in losses:
|
if losses is not None and self.name not in losses:
|
||||||
losses[self.name] = 0.
|
losses[self.name] = 0.
|
||||||
docs, tokvec_lists = docs_tokvecs
|
docs, tokvec_lists = docs_tokvecs
|
||||||
|
@ -434,6 +531,9 @@ cdef class Parser:
|
||||||
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
golds = [golds]
|
golds = [golds]
|
||||||
|
if USE_FINE_TUNE:
|
||||||
|
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
|
||||||
|
tokvecs += self.model[0].ops.flatten(my_tokvecs)
|
||||||
|
|
||||||
cuda_stream = get_cuda_stream()
|
cuda_stream = get_cuda_stream()
|
||||||
|
|
||||||
|
@ -460,13 +560,14 @@ cdef class Parser:
|
||||||
scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
|
scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
|
||||||
|
|
||||||
d_scores = self.get_batch_loss(states, golds, scores)
|
d_scores = self.get_batch_loss(states, golds, scores)
|
||||||
d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd)
|
d_scores /= len(docs)
|
||||||
|
d_vector = bp_scores(d_scores, sgd=sgd)
|
||||||
if drop != 0:
|
if drop != 0:
|
||||||
d_vector *= mask
|
d_vector *= mask
|
||||||
|
|
||||||
if isinstance(self.model[0].ops, CupyOps) \
|
if isinstance(self.model[0].ops, CupyOps) \
|
||||||
and not isinstance(token_ids, state2vec.ops.xp.ndarray):
|
and not isinstance(token_ids, state2vec.ops.xp.ndarray):
|
||||||
# Move token_ids and d_vector to CPU, asynchronously
|
# Move token_ids and d_vector to GPU, asynchronously
|
||||||
backprops.append((
|
backprops.append((
|
||||||
get_async(cuda_stream, token_ids),
|
get_async(cuda_stream, token_ids),
|
||||||
get_async(cuda_stream, d_vector),
|
get_async(cuda_stream, d_vector),
|
||||||
|
@ -483,7 +584,65 @@ cdef class Parser:
|
||||||
break
|
break
|
||||||
self._make_updates(d_tokvecs,
|
self._make_updates(d_tokvecs,
|
||||||
backprops, sgd, cuda_stream)
|
backprops, sgd, cuda_stream)
|
||||||
return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
|
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
|
||||||
|
if USE_FINE_TUNE:
|
||||||
|
d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
|
||||||
|
return d_tokvecs
|
||||||
|
|
||||||
|
def update_beam(self, docs_tokvecs, golds, width=None, density=None,
|
||||||
|
drop=0., sgd=None, losses=None):
|
||||||
|
if not any(self.moves.has_gold(gold) for gold in golds):
|
||||||
|
return None
|
||||||
|
if not golds:
|
||||||
|
return None
|
||||||
|
if width is None:
|
||||||
|
width = self.cfg.get('beam_width', 2)
|
||||||
|
if density is None:
|
||||||
|
density = self.cfg.get('beam_density', 0.0)
|
||||||
|
if losses is not None and self.name not in losses:
|
||||||
|
losses[self.name] = 0.
|
||||||
|
docs, tokvecs = docs_tokvecs
|
||||||
|
lengths = [len(d) for d in docs]
|
||||||
|
assert min(lengths) >= 1
|
||||||
|
tokvecs = self.model[0].ops.flatten(tokvecs)
|
||||||
|
if USE_FINE_TUNE:
|
||||||
|
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
|
||||||
|
tokvecs += self.model[0].ops.flatten(my_tokvecs)
|
||||||
|
|
||||||
|
states = self.moves.init_batch(docs)
|
||||||
|
for gold in golds:
|
||||||
|
self.moves.preprocess_gold(gold)
|
||||||
|
|
||||||
|
cuda_stream = get_cuda_stream()
|
||||||
|
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0)
|
||||||
|
|
||||||
|
states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
|
||||||
|
states, tokvecs, golds,
|
||||||
|
state2vec, vec2scores,
|
||||||
|
width, density,
|
||||||
|
sgd=sgd, drop=drop, losses=losses)
|
||||||
|
backprop_lower = []
|
||||||
|
cdef float batch_size = len(docs)
|
||||||
|
for i, d_scores in enumerate(states_d_scores):
|
||||||
|
d_scores /= batch_size
|
||||||
|
if losses is not None:
|
||||||
|
losses[self.name] += (d_scores**2).sum()
|
||||||
|
ids, bp_vectors, bp_scores = backprops[i]
|
||||||
|
d_vector = bp_scores(d_scores, sgd=sgd)
|
||||||
|
if isinstance(self.model[0].ops, CupyOps) \
|
||||||
|
and not isinstance(ids, state2vec.ops.xp.ndarray):
|
||||||
|
backprop_lower.append((
|
||||||
|
get_async(cuda_stream, ids),
|
||||||
|
get_async(cuda_stream, d_vector),
|
||||||
|
bp_vectors))
|
||||||
|
else:
|
||||||
|
backprop_lower.append((ids, d_vector, bp_vectors))
|
||||||
|
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
|
||||||
|
self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream)
|
||||||
|
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths)
|
||||||
|
if USE_FINE_TUNE:
|
||||||
|
d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
|
||||||
|
return d_tokvecs
|
||||||
|
|
||||||
def _init_gold_batch(self, whole_docs, whole_golds):
|
def _init_gold_batch(self, whole_docs, whole_golds):
|
||||||
"""Make a square batch, of length equal to the shortest doc. A long
|
"""Make a square batch, of length equal to the shortest doc. A long
|
||||||
|
@ -528,14 +687,10 @@ cdef class Parser:
|
||||||
xp = get_array_module(d_tokvecs)
|
xp = get_array_module(d_tokvecs)
|
||||||
for ids, d_vector, bp_vector in backprops:
|
for ids, d_vector, bp_vector in backprops:
|
||||||
d_state_features = bp_vector(d_vector, sgd=sgd)
|
d_state_features = bp_vector(d_vector, sgd=sgd)
|
||||||
active_feats = ids * (ids >= 0)
|
mask = ids >= 0
|
||||||
active_feats = active_feats.reshape((ids.shape[0], ids.shape[1], 1))
|
d_state_features *= mask.reshape(ids.shape + (1,))
|
||||||
if hasattr(xp, 'scatter_add'):
|
self.model[0].ops.scatter_add(d_tokvecs, ids * mask,
|
||||||
xp.scatter_add(d_tokvecs,
|
d_state_features)
|
||||||
ids, d_state_features * active_feats)
|
|
||||||
else:
|
|
||||||
xp.add.at(d_tokvecs,
|
|
||||||
ids, d_state_features * active_feats)
|
|
||||||
|
|
||||||
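
The rewritten _make_updates above zeroes the gradient for padded feature slots (ids == -1) and scatters the rest back into d_tokvecs in one call. A minimal NumPy sketch of the same operation, with made-up shapes and np.add.at standing in for ops.scatter_add:

import numpy as np

d_tokvecs = np.zeros((5, 4), dtype='f')            # one row per token vector
ids = np.array([[0, 2, -1], [3, -1, -1]])          # feature token ids, -1 = padding
d_state_features = np.ones((2, 3, 4), dtype='f')   # gradient per (state, feature)

mask = ids >= 0
d_state_features *= mask[..., None]                # padded slots contribute nothing
np.add.at(d_tokvecs, ids * mask, d_state_features) # accumulate into the token rows
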
@property
|
@property
|
||||||
def move_names(self):
|
def move_names(self):
|
||||||
|
@ -546,7 +701,7 @@ cdef class Parser:
|
||||||
return names
|
return names
|
||||||
|
|
||||||
def get_batch_model(self, batch_size, tokvecs, stream, dropout):
|
def get_batch_model(self, batch_size, tokvecs, stream, dropout):
|
||||||
lower, upper = self.model
|
_, lower, upper = self.model
|
||||||
state2vec = precompute_hiddens(batch_size, tokvecs,
|
state2vec = precompute_hiddens(batch_size, tokvecs,
|
||||||
lower, stream, drop=dropout)
|
lower, stream, drop=dropout)
|
||||||
return state2vec, upper
|
return state2vec, upper
|
||||||
|
@ -560,6 +715,7 @@ cdef class Parser:
|
||||||
dtype='i', order='C')
|
dtype='i', order='C')
|
||||||
c_ids = <int*>ids.data
|
c_ids = <int*>ids.data
|
||||||
for i, state in enumerate(states):
|
for i, state in enumerate(states):
|
||||||
|
if not state.is_final():
|
||||||
state.c.set_context_tokens(c_ids, n_tokens)
|
state.c.set_context_tokens(c_ids, n_tokens)
|
||||||
c_ids += ids.shape[1]
|
c_ids += ids.shape[1]
|
||||||
return ids
|
return ids
|
||||||
|
@ -635,10 +791,12 @@ cdef class Parser:
|
||||||
|
|
||||||
def to_disk(self, path, **exclude):
|
def to_disk(self, path, **exclude):
|
||||||
serializers = {
|
serializers = {
|
||||||
'lower_model': lambda p: p.open('wb').write(
|
'tok2vec_model': lambda p: p.open('wb').write(
|
||||||
self.model[0].to_bytes()),
|
self.model[0].to_bytes()),
|
||||||
'upper_model': lambda p: p.open('wb').write(
|
'lower_model': lambda p: p.open('wb').write(
|
||||||
self.model[1].to_bytes()),
|
self.model[1].to_bytes()),
|
||||||
|
'upper_model': lambda p: p.open('wb').write(
|
||||||
|
self.model[2].to_bytes()),
|
||||||
'vocab': lambda p: self.vocab.to_disk(p),
|
'vocab': lambda p: self.vocab.to_disk(p),
|
||||||
'moves': lambda p: self.moves.to_disk(p, strings=False),
|
'moves': lambda p: self.moves.to_disk(p, strings=False),
|
||||||
'cfg': lambda p: p.open('w').write(json_dumps(self.cfg))
|
'cfg': lambda p: p.open('w').write(json_dumps(self.cfg))
|
||||||
|
@ -659,24 +817,29 @@ cdef class Parser:
|
||||||
self.model, cfg = self.Model(**self.cfg)
|
self.model, cfg = self.Model(**self.cfg)
|
||||||
else:
|
else:
|
||||||
cfg = {}
|
cfg = {}
|
||||||
with (path / 'lower_model').open('rb') as file_:
|
with (path / 'tok2vec_model').open('rb') as file_:
|
||||||
bytes_data = file_.read()
|
bytes_data = file_.read()
|
||||||
self.model[0].from_bytes(bytes_data)
|
self.model[0].from_bytes(bytes_data)
|
||||||
with (path / 'upper_model').open('rb') as file_:
|
with (path / 'lower_model').open('rb') as file_:
|
||||||
bytes_data = file_.read()
|
bytes_data = file_.read()
|
||||||
self.model[1].from_bytes(bytes_data)
|
self.model[1].from_bytes(bytes_data)
|
||||||
|
with (path / 'upper_model').open('rb') as file_:
|
||||||
|
bytes_data = file_.read()
|
||||||
|
self.model[2].from_bytes(bytes_data)
|
||||||
self.cfg.update(cfg)
|
self.cfg.update(cfg)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, **exclude):
|
||||||
serializers = OrderedDict((
|
serializers = OrderedDict((
|
||||||
('lower_model', lambda: self.model[0].to_bytes()),
|
('tok2vec_model', lambda: self.model[0].to_bytes()),
|
||||||
('upper_model', lambda: self.model[1].to_bytes()),
|
('lower_model', lambda: self.model[1].to_bytes()),
|
||||||
|
('upper_model', lambda: self.model[2].to_bytes()),
|
||||||
('vocab', lambda: self.vocab.to_bytes()),
|
('vocab', lambda: self.vocab.to_bytes()),
|
||||||
('moves', lambda: self.moves.to_bytes(strings=False)),
|
('moves', lambda: self.moves.to_bytes(strings=False)),
|
||||||
('cfg', lambda: ujson.dumps(self.cfg))
|
('cfg', lambda: ujson.dumps(self.cfg))
|
||||||
))
|
))
|
||||||
if 'model' in exclude:
|
if 'model' in exclude:
|
||||||
|
exclude['tok2vec_model'] = True
|
||||||
exclude['lower_model'] = True
|
exclude['lower_model'] = True
|
||||||
exclude['upper_model'] = True
|
exclude['upper_model'] = True
|
||||||
exclude.pop('model')
|
exclude.pop('model')
|
||||||
|
@ -687,6 +850,7 @@ cdef class Parser:
|
||||||
('vocab', lambda b: self.vocab.from_bytes(b)),
|
('vocab', lambda b: self.vocab.from_bytes(b)),
|
||||||
('moves', lambda b: self.moves.from_bytes(b, strings=False)),
|
('moves', lambda b: self.moves.from_bytes(b, strings=False)),
|
||||||
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
|
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
|
||||||
|
('tok2vec_model', lambda b: None),
|
||||||
('lower_model', lambda b: None),
|
('lower_model', lambda b: None),
|
||||||
('upper_model', lambda b: None)
|
('upper_model', lambda b: None)
|
||||||
))
|
))
|
||||||
|
@ -696,10 +860,12 @@ cdef class Parser:
|
||||||
self.model, cfg = self.Model(self.moves.n_moves)
|
self.model, cfg = self.Model(self.moves.n_moves)
|
||||||
else:
|
else:
|
||||||
cfg = {}
|
cfg = {}
|
||||||
|
if 'tok2vec_model' in msg:
|
||||||
|
self.model[0].from_bytes(msg['tok2vec_model'])
|
||||||
if 'lower_model' in msg:
|
if 'lower_model' in msg:
|
||||||
self.model[0].from_bytes(msg['lower_model'])
|
self.model[1].from_bytes(msg['lower_model'])
|
||||||
if 'upper_model' in msg:
|
if 'upper_model' in msg:
|
||||||
self.model[1].from_bytes(msg['upper_model'])
|
self.model[2].from_bytes(msg['upper_model'])
|
||||||
self.cfg.update(cfg)
|
self.cfg.update(cfg)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
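
With the serialization hunks above, the parser's model is persisted as three parts under the keys tok2vec_model, lower_model and upper_model. An illustrative round trip, assuming `parser` is an initialised Parser from this module:

data = parser.to_bytes()          # includes tok2vec_model, lower_model, upper_model,
                                  # plus vocab, moves and cfg
parser = parser.from_bytes(data)  # restores all three sub-models
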
@ -762,3 +928,30 @@ cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actio
|
||||||
mode = i
|
mode = i
|
||||||
score = scores[i]
|
score = scores[i]
|
||||||
return mode
|
return mode
|
||||||
|
|
||||||
|
|
||||||
|
# These are passed as callbacks to thinc.extra.search.Beam
|
||||||
|
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
|
||||||
|
dest = <StateClass>_dest
|
||||||
|
src = <StateClass>_src
|
||||||
|
moves = <const Transition*>_moves
|
||||||
|
dest.clone(src)
|
||||||
|
moves[clas].do(dest.c, moves[clas].label)
|
||||||
|
|
||||||
|
|
||||||
|
cdef int _check_final_state(void* _state, void* extra_args) except -1:
|
||||||
|
return (<StateClass>_state).is_final()
|
||||||
|
|
||||||
|
|
||||||
|
def _cleanup(Beam beam):
|
||||||
|
for i in range(beam.width):
|
||||||
|
Py_XDECREF(<PyObject*>beam._states[i].content)
|
||||||
|
Py_XDECREF(<PyObject*>beam._parents[i].content)
|
||||||
|
|
||||||
|
|
||||||
|
cdef hash_t _hash_state(void* _state, void* _) except 0:
|
||||||
|
state = <StateClass>_state
|
||||||
|
if state.c.is_final():
|
||||||
|
return 1
|
||||||
|
else:
|
||||||
|
return state.c.hash()
|
||||||
|
|
|
@ -99,6 +99,9 @@ cdef class TransitionSystem:
|
||||||
def preprocess_gold(self, GoldParse gold):
|
def preprocess_gold(self, GoldParse gold):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def is_gold_parse(self, StateClass state, GoldParse gold):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
cdef Transition lookup_transition(self, object name) except *:
|
cdef Transition lookup_transition(self, object name) except *:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@ -107,6 +110,8 @@ cdef class TransitionSystem:
|
||||||
|
|
||||||
def is_valid(self, StateClass stcls, move_name):
|
def is_valid(self, StateClass stcls, move_name):
|
||||||
action = self.lookup_transition(move_name)
|
action = self.lookup_transition(move_name)
|
||||||
|
if action.move == 0:
|
||||||
|
return False
|
||||||
return action.is_valid(stcls.c, action.label)
|
return action.is_valid(stcls.c, action.label)
|
||||||
|
|
||||||
cdef int set_valid(self, int* is_valid, const StateC* st) nogil:
|
cdef int set_valid(self, int* is_valid, const StateC* st) nogil:
|
||||||
|
@ -137,6 +142,10 @@ cdef class TransitionSystem:
|
||||||
"the entity recognizer\n"
|
"the entity recognizer\n"
|
||||||
"The transition system has %d actions." % (self.n_moves))
|
"The transition system has %d actions." % (self.n_moves))
|
||||||
|
|
||||||
|
def get_class_name(self, int clas):
|
||||||
|
act = self.c[clas]
|
||||||
|
return self.move_name(act.move, act.label)
|
||||||
|
|
||||||
def add_action(self, int action, label_name):
|
def add_action(self, int action, label_name):
|
||||||
cdef attr_t label_id
|
cdef attr_t label_id
|
||||||
if not isinstance(label_name, int):
|
if not isinstance(label_name, int):
|
||||||
|
|
|
@ -11,9 +11,9 @@ from ..strings import StringStore
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb',
|
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id',
|
||||||
'nl', 'pl', 'pt', 'sv', 'xx']
|
'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx']
|
||||||
_models = {'en': ['en_depent_web_sm', 'en_core_web_md'],
|
_models = {'en': ['en_core_web_sm'],
|
||||||
'de': ['de_core_news_md'],
|
'de': ['de_core_news_md'],
|
||||||
'fr': ['fr_depvec_web_lg'],
|
'fr': ['fr_depvec_web_lg'],
|
||||||
'xx': ['xx_ent_web_md']}
|
'xx': ['xx_ent_web_md']}
|
||||||
|
@ -86,6 +86,9 @@ def hu_tokenizer():
|
||||||
def fi_tokenizer():
|
def fi_tokenizer():
|
||||||
return util.get_lang_class('fi').Defaults.create_tokenizer()
|
return util.get_lang_class('fi').Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def id_tokenizer():
|
||||||
|
return util.get_lang_class('id').Defaults.create_tokenizer()
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def sv_tokenizer():
|
def sv_tokenizer():
|
||||||
|
|
|
@ -2,12 +2,18 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
from ....tokens.doc import Doc
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def en_lemmatizer(EN):
|
def en_lemmatizer(EN):
|
||||||
return EN.Defaults.create_lemmatizer()
|
return EN.Defaults.create_lemmatizer()
|
||||||
|
|
||||||
|
@pytest.mark.models('en')
|
||||||
|
def test_doc_lemmatization(EN):
|
||||||
|
doc = Doc(EN.vocab, words=['bleed'])
|
||||||
|
doc[0].tag_ = 'VBP'
|
||||||
|
assert doc[0].lemma_ == 'bleed'
|
||||||
|
|
||||||
@pytest.mark.models('en')
|
@pytest.mark.models('en')
|
||||||
@pytest.mark.parametrize('text,lemmas', [("aardwolves", ["aardwolf"]),
|
@pytest.mark.parametrize('text,lemmas', [("aardwolves", ["aardwolf"]),
|
||||||
|
@ -19,6 +25,16 @@ def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
|
||||||
assert en_lemmatizer.noun(text) == set(lemmas)
|
assert en_lemmatizer.noun(text) == set(lemmas)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models('en')
|
||||||
|
@pytest.mark.parametrize('text,lemmas', [("bleed", ["bleed"]),
|
||||||
|
("feed", ["feed"]),
|
||||||
|
("need", ["need"]),
|
||||||
|
("ring", ["ring"]),
|
||||||
|
("axes", ["axis", "axe", "ax"])])
|
||||||
|
def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
|
||||||
|
assert en_lemmatizer.noun(text) == set(lemmas)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
@pytest.mark.xfail
|
||||||
@pytest.mark.models('en')
|
@pytest.mark.models('en')
|
||||||
def test_en_lemmatizer_base_forms(en_lemmatizer):
|
def test_en_lemmatizer_base_forms(en_lemmatizer):
|
||||||
|
|
|
@ -25,7 +25,6 @@ def test_tag_names(EN):
|
||||||
doc = EN(text, disable=['parser'])
|
doc = EN(text, disable=['parser'])
|
||||||
assert type(doc[2].pos) == int
|
assert type(doc[2].pos) == int
|
||||||
assert isinstance(doc[2].pos_, six.text_type)
|
assert isinstance(doc[2].pos_, six.text_type)
|
||||||
assert type(doc[2].dep) == int
|
|
||||||
assert isinstance(doc[2].dep_, six.text_type)
|
assert isinstance(doc[2].dep_, six.text_type)
|
||||||
assert doc[2].tag_ == u'NNS'
|
assert doc[2].tag_ == u'NNS'
|
||||||
|
|
||||||
|
|
0
spacy/tests/lang/id/__init__.py
Normal file
115
spacy/tests/lang/id/test_prefix_suffix_infix.py
Normal file
|
@ -0,0 +1,115 @@
|
||||||
|
# coding: utf-8
|
||||||
|
"""Test that tokenizer prefixes, suffixes and infixes are handled correctly."""
|
||||||
|
|
||||||
|
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["(Ma'arif)"])
|
||||||
|
def test_tokenizer_splits_no_special(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["Ma'arif"])
|
||||||
|
def test_tokenizer_splits_no_punct(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 1
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["(Ma'arif"])
|
||||||
|
def test_tokenizer_splits_prefix_punct(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 2
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["Ma'arif)"])
|
||||||
|
def test_tokenizer_splits_suffix_punct(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 2
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["(Ma'arif)"])
|
||||||
|
def test_tokenizer_splits_even_wrap(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["(Ma'arif?)"])
|
||||||
|
def test_tokenizer_splits_uneven_wrap(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 4
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text,length', [("S.Kom.", 1), ("SKom.", 2), ("(S.Kom.", 2)])
|
||||||
|
def test_tokenizer_splits_prefix_interact(id_tokenizer, text, length):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == length
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["S.Kom.)"])
|
||||||
|
def test_tokenizer_splits_suffix_interact(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 2
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["(S.Kom.)"])
|
||||||
|
def test_tokenizer_splits_even_wrap_interact(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["(S.Kom.?)"])
|
||||||
|
def test_tokenizer_splits_uneven_wrap_interact(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 4
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text,length', [("gara-gara", 1), ("Jokowi-Ahok", 3), ("Sukarno-Hatta", 3)])
|
||||||
|
def test_tokenizer_splits_hyphens(id_tokenizer, text, length):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == length
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
|
||||||
|
def test_tokenizer_splits_numeric_range(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["ini.Budi", "Halo.Bandung"])
|
||||||
|
def test_tokenizer_splits_period_infix(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["Halo,Bandung", "satu,dua"])
|
||||||
|
def test_tokenizer_splits_comma_infix(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
assert tokens[0].text == text.split(",")[0]
|
||||||
|
assert tokens[1].text == ","
|
||||||
|
assert tokens[2].text == text.split(",")[1]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["halo...Bandung", "dia...pergi"])
|
||||||
|
def test_tokenizer_splits_ellipsis_infix(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_tokenizer_splits_double_hyphen_infix(id_tokenizer):
|
||||||
|
tokens = id_tokenizer("Arsene Wenger--manajer Arsenal--melakukan konferensi pers.")
|
||||||
|
assert len(tokens) == 10
|
||||||
|
assert tokens[0].text == "Arsene"
|
||||||
|
assert tokens[1].text == "Wenger"
|
||||||
|
assert tokens[2].text == "--"
|
||||||
|
assert tokens[3].text == "manajer"
|
||||||
|
assert tokens[4].text == "Arsenal"
|
||||||
|
assert tokens[5].text == "--"
|
||||||
|
assert tokens[6].text == "melakukan"
|
||||||
|
assert tokens[7].text == "konferensi"
|
||||||
|
assert tokens[8].text == "pers"
|
||||||
|
assert tokens[9].text == "."
|
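
The tests above rely on the id_tokenizer fixture added to conftest.py earlier in this changeset. Reproducing it interactively is a short, illustrative snippet:

from spacy.util import get_lang_class

id_tokenizer = get_lang_class('id').Defaults.create_tokenizer()
tokens = id_tokenizer("Arsene Wenger--manajer Arsenal--melakukan konferensi pers.")
print([t.text for t in tokens])   # '--' should come out as its own token, twice
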
10
spacy/tests/parser/test_beam_parse.py
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
import spacy
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
@pytest.mark.models
|
||||||
|
def test_beam_parse():
|
||||||
|
nlp = spacy.load('en_core_web_sm')
|
||||||
|
doc = nlp(u'Australia is a country', disable=['ner'])
|
||||||
|
ents = nlp.entity(doc, beam_width=2)
|
||||||
|
print(ents)
|
||||||
|
|
73
spacy/tests/parser/test_ner.py
Normal file
|
@ -0,0 +1,73 @@
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from ...vocab import Vocab
|
||||||
|
from ...syntax.ner import BiluoPushDown
|
||||||
|
from ...gold import GoldParse
|
||||||
|
from ...tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def vocab():
|
||||||
|
return Vocab()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def doc(vocab):
|
||||||
|
return Doc(vocab, words=['Casey', 'went', 'to', 'New', 'York', '.'])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def entity_annots(doc):
|
||||||
|
casey = doc[0:1]
|
||||||
|
ny = doc[3:5]
|
||||||
|
return [(casey.start_char, casey.end_char, 'PERSON'),
|
||||||
|
(ny.start_char, ny.end_char, 'GPE')]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def entity_types(entity_annots):
|
||||||
|
return sorted(set([label for (s, e, label) in entity_annots]))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def tsys(vocab, entity_types):
|
||||||
|
actions = BiluoPushDown.get_actions(entity_types=entity_types)
|
||||||
|
return BiluoPushDown(vocab.strings, actions)
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_oracle_moves(tsys, doc, entity_annots):
|
||||||
|
gold = GoldParse(doc, entities=entity_annots)
|
||||||
|
tsys.preprocess_gold(gold)
|
||||||
|
act_classes = tsys.get_oracle_sequence(doc, gold)
|
||||||
|
names = [tsys.get_class_name(act) for act in act_classes]
|
||||||
|
assert names == ['U-PERSON', 'O', 'O', 'B-GPE', 'L-GPE', 'O']
|
||||||
|
|
||||||
|
def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots):
|
||||||
|
entity_annots = [(s, e, '!' + label) for s, e, label in entity_annots]
|
||||||
|
gold = GoldParse(doc, entities=entity_annots)
|
||||||
|
for i, tag in enumerate(gold.ner):
|
||||||
|
if tag == 'L-!GPE':
|
||||||
|
gold.ner[i] = '-'
|
||||||
|
tsys.preprocess_gold(gold)
|
||||||
|
act_classes = tsys.get_oracle_sequence(doc, gold)
|
||||||
|
names = [tsys.get_class_name(act) for act in act_classes]
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_oracle_moves_negative_entities2(tsys, vocab):
|
||||||
|
doc = Doc(vocab, words=['A', 'B', 'C', 'D'])
|
||||||
|
gold = GoldParse(doc, entities=[])
|
||||||
|
gold.ner = ['B-!PERSON', 'L-!PERSON', 'B-!PERSON', 'L-!PERSON']
|
||||||
|
tsys.preprocess_gold(gold)
|
||||||
|
act_classes = tsys.get_oracle_sequence(doc, gold)
|
||||||
|
names = [tsys.get_class_name(act) for act in act_classes]
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_oracle_moves_negative_O(tsys, vocab):
|
||||||
|
doc = Doc(vocab, words=['A', 'B', 'C', 'D'])
|
||||||
|
gold = GoldParse(doc, entities=[])
|
||||||
|
gold.ner = ['O', '!O', 'O', '!O']
|
||||||
|
tsys.preprocess_gold(gold)
|
||||||
|
act_classes = tsys.get_oracle_sequence(doc, gold)
|
||||||
|
names = [tsys.get_class_name(act) for act in act_classes]
|
|
@ -1,7 +1,6 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
from thinc.neural import Model
|
from thinc.neural import Model
|
||||||
from mock import Mock
|
|
||||||
import pytest
|
import pytest
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
|
@ -36,7 +35,7 @@ def parser(vocab, arc_eager):
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def model(arc_eager, tok2vec):
|
def model(arc_eager, tok2vec):
|
||||||
return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO)
|
return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO)[0]
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def doc(vocab):
|
def doc(vocab):
|
||||||
|
@ -45,29 +44,50 @@ def doc(vocab):
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def gold(doc):
|
def gold(doc):
|
||||||
return GoldParse(doc, heads=[1, 1, 1], deps=['L', 'ROOT', 'R'])
|
return GoldParse(doc, heads=[1, 1, 1], deps=['L', 'ROOT', 'R'])
|
||||||
|
|
||||||
|
|
||||||
def test_can_init_nn_parser(parser):
|
def test_can_init_nn_parser(parser):
|
||||||
assert parser.model is None
|
assert parser.model is None
|
||||||
|
|
||||||
|
|
||||||
def test_build_model(parser):
|
def test_build_model(parser):
|
||||||
parser.model = Parser.Model(parser.moves.n_moves)
|
parser.model = Parser.Model(parser.moves.n_moves)[0]
|
||||||
assert parser.model is not None
|
assert parser.model is not None
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_predict_doc(parser, tok2vec, model, doc):
|
def test_predict_doc(parser, tok2vec, model, doc):
|
||||||
doc.tensor = tok2vec([doc])
|
doc.tensor = tok2vec([doc])[0]
|
||||||
parser.model = model
|
parser.model = model
|
||||||
parser(doc)
|
parser(doc)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_update_doc(parser, tok2vec, model, doc, gold):
|
def test_update_doc(parser, tok2vec, model, doc, gold):
|
||||||
parser.model = model
|
parser.model = model
|
||||||
tokvecs, bp_tokvecs = tok2vec.begin_update([doc])
|
tokvecs, bp_tokvecs = tok2vec.begin_update([doc])
|
||||||
d_tokvecs = parser.update((doc, tokvecs), gold)
|
d_tokvecs = parser.update(([doc], tokvecs), [gold])
|
||||||
assert d_tokvecs.shape == tokvecs.shape
|
assert d_tokvecs[0].shape == tokvecs[0].shape
|
||||||
def optimize(weights, gradient, key=None):
|
def optimize(weights, gradient, key=None):
|
||||||
weights -= 0.001 * gradient
|
weights -= 0.001 * gradient
|
||||||
bp_tokvecs(d_tokvecs, sgd=optimize)
|
bp_tokvecs(d_tokvecs, sgd=optimize)
|
||||||
assert d_tokvecs.sum() == 0.
|
assert d_tokvecs[0].sum() == 0.
|
||||||
|
|
||||||
|
|
||||||
|
def test_predict_doc_beam(parser, tok2vec, model, doc):
|
||||||
|
doc.tensor = tok2vec([doc])[0]
|
||||||
|
parser.model = model
|
||||||
|
parser(doc, beam_width=32, beam_density=0.001)
|
||||||
|
for word in doc:
|
||||||
|
print(word.text, word.head, word.dep_)
|
||||||
|
|
||||||
|
|
||||||
|
def test_update_doc_beam(parser, tok2vec, model, doc, gold):
|
||||||
|
parser.model = model
|
||||||
|
tokvecs, bp_tokvecs = tok2vec.begin_update([doc])
|
||||||
|
d_tokvecs = parser.update_beam(([doc], tokvecs), [gold])
|
||||||
|
assert d_tokvecs[0].shape == tokvecs[0].shape
|
||||||
|
def optimize(weights, gradient, key=None):
|
||||||
|
weights -= 0.001 * gradient
|
||||||
|
bp_tokvecs(d_tokvecs, sgd=optimize)
|
||||||
|
assert d_tokvecs[0].sum() == 0.
|
||||||
|
|
||||||
|
|
||||||
|
|
87
spacy/tests/parser/test_nn_beam.py
Normal file
|
@ -0,0 +1,87 @@
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
import pytest
|
||||||
|
import numpy
|
||||||
|
from thinc.api import layerize
|
||||||
|
|
||||||
|
from ...vocab import Vocab
|
||||||
|
from ...syntax.arc_eager import ArcEager
|
||||||
|
from ...tokens import Doc
|
||||||
|
from ...gold import GoldParse
|
||||||
|
from ...syntax._beam_utils import ParserBeam, update_beam
|
||||||
|
from ...syntax.stateclass import StateClass
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def vocab():
|
||||||
|
return Vocab()
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def moves(vocab):
|
||||||
|
aeager = ArcEager(vocab.strings, {})
|
||||||
|
aeager.add_action(2, 'nsubj')
|
||||||
|
aeager.add_action(3, 'dobj')
|
||||||
|
aeager.add_action(2, 'aux')
|
||||||
|
return aeager
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def docs(vocab):
|
||||||
|
return [Doc(vocab, words=['Rats', 'bite', 'things'])]
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def states(docs):
|
||||||
|
return [StateClass(doc) for doc in docs]
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def tokvecs(docs, vector_size):
|
||||||
|
output = []
|
||||||
|
for doc in docs:
|
||||||
|
vec = numpy.random.uniform(-0.1, 0.1, (len(doc), vector_size))
|
||||||
|
output.append(numpy.asarray(vec))
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def golds(docs):
|
||||||
|
return [GoldParse(doc) for doc in docs]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def batch_size(docs):
|
||||||
|
return len(docs)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def beam_width():
|
||||||
|
return 4
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def vector_size():
|
||||||
|
return 6
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def beam(moves, states, golds, beam_width):
|
||||||
|
return ParserBeam(moves, states, golds, width=beam_width, density=0.0)
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def scores(moves, batch_size, beam_width):
|
||||||
|
return [
|
||||||
|
numpy.asarray(
|
||||||
|
numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)),
|
||||||
|
dtype='f')
|
||||||
|
for _ in range(batch_size)]
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_beam(beam):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def test_beam_advance(beam, scores):
|
||||||
|
beam.advance(scores)
|
||||||
|
|
||||||
|
|
||||||
|
def test_beam_advance_too_few_scores(beam, scores):
|
||||||
|
with pytest.raises(IndexError):
|
||||||
|
beam.advance(scores[:-1])
|
12
spacy/tests/regression/test_issue1257.py
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
'''Test tokens compare correctly'''
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ..util import get_doc
|
||||||
|
from ...vocab import Vocab
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue1257():
|
||||||
|
doc1 = get_doc(Vocab(), ['a', 'b', 'c'])
|
||||||
|
doc2 = get_doc(Vocab(), ['a', 'c', 'e'])
|
||||||
|
assert doc1[0] != doc2[0]
|
||||||
|
assert not doc1[0] == doc2[0]
|
|
@ -11,8 +11,8 @@ import pytest
|
||||||
def taggers(en_vocab):
|
def taggers(en_vocab):
|
||||||
tagger1 = Tagger(en_vocab)
|
tagger1 = Tagger(en_vocab)
|
||||||
tagger2 = Tagger(en_vocab)
|
tagger2 = Tagger(en_vocab)
|
||||||
tagger1.model = tagger1.Model(None, None)
|
tagger1.model = tagger1.Model(8, 8)
|
||||||
tagger2.model = tagger2.Model(None, None)
|
tagger2.model = tagger1.model
|
||||||
return (tagger1, tagger2)
|
return (tagger1, tagger2)
|
||||||
|
|
||||||
|
|
||||||
|
@ -20,7 +20,6 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
|
||||||
tagger1, tagger2 = taggers
|
tagger1, tagger2 = taggers
|
||||||
tagger1_b = tagger1.to_bytes()
|
tagger1_b = tagger1.to_bytes()
|
||||||
tagger2_b = tagger2.to_bytes()
|
tagger2_b = tagger2.to_bytes()
|
||||||
assert tagger1_b == tagger2_b
|
|
||||||
tagger1 = tagger1.from_bytes(tagger1_b)
|
tagger1 = tagger1.from_bytes(tagger1_b)
|
||||||
assert tagger1.to_bytes() == tagger1_b
|
assert tagger1.to_bytes() == tagger1_b
|
||||||
new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b)
|
new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b)
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..util import get_doc
|
from ..util import get_doc
|
||||||
|
from ...attrs import ORTH, LENGTH
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
@ -89,3 +90,19 @@ def test_spans_are_hashable(en_tokenizer):
|
||||||
span3 = tokens[0:2]
|
span3 = tokens[0:2]
|
||||||
assert hash(span3) == hash(span1)
|
assert hash(span3) == hash(span1)
|
||||||
|
|
||||||
|
|
||||||
|
def test_spans_by_character(doc):
|
||||||
|
span1 = doc[1:-2]
|
||||||
|
span2 = doc.char_span(span1.start_char, span1.end_char, label='GPE')
|
||||||
|
assert span1.start_char == span2.start_char
|
||||||
|
assert span1.end_char == span2.end_char
|
||||||
|
assert span2.label_ == 'GPE'
|
||||||
|
|
||||||
|
|
||||||
|
def test_span_to_array(doc):
|
||||||
|
span = doc[1:-2]
|
||||||
|
arr = span.to_array([ORTH, LENGTH])
|
||||||
|
assert arr.shape == (len(span), 2)
|
||||||
|
assert arr[0, 0] == span[0].orth
|
||||||
|
assert arr[0, 1] == len(span[0])
|
||||||
|
|
||||||
|
|
|
@@ -79,9 +79,9 @@ def add_vecs_to_vocab(vocab, vectors):
     """Add list of vector tuples to given vocab. All vectors need to have the
     same length. Format: [("text", [1, 2, 3])]"""
     length = len(vectors[0][1])
-    vocab.resize_vectors(length)
+    vocab.clear_vectors(length)
     for word, vec in vectors:
-        vocab[word].vector = vec
+        vocab.set_vector(word, vec)
     return vocab
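The helper now goes through the public `Vocab` vector API instead of assigning to lexemes directly, and its docstring pins down the expected input: a list of `(text, vector)` tuples, all of the same length. A minimal sketch of how a test might use it, assuming the usual `en_vocab` fixture and the `add_vecs_to_vocab` import from `..util` (the test name is hypothetical):

def test_vectors_roundtrip(en_vocab):
    # Vector tuples in the documented format: every vector has the same length.
    vectors = [("apple", [1, 2, 3]), ("orange", [-1, -2, -3])]
    vocab = add_vecs_to_vocab(en_vocab, vectors)
    assert vocab.has_vector("apple")
    assert list(vocab.get_vector("apple")) == [1, 2, 3]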
@@ -14,10 +14,9 @@ def vectors():

 @pytest.fixture()
 def vocab(en_vocab, vectors):
-    #return add_vecs_to_vocab(en_vocab, vectors)
-    return None
+    add_vecs_to_vocab(en_vocab, vectors)
+    return en_vocab

-@pytest.mark.xfail
 def test_vectors_similarity_LL(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     lex1 = vocab[word1]


@@ -31,7 +30,6 @@ def test_vectors_similarity_LL(vocab, vectors):
     assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1))


-@pytest.mark.xfail
 def test_vectors_similarity_TT(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = get_doc(vocab, words=[word1, word2])


@@ -44,21 +42,18 @@ def test_vectors_similarity_TT(vocab, vectors):
     assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1]))


-@pytest.mark.xfail
 def test_vectors_similarity_TD(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = get_doc(vocab, words=[word1, word2])
     assert doc.similarity(doc[0]) == doc[0].similarity(doc)


-@pytest.mark.xfail
 def test_vectors_similarity_DS(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = get_doc(vocab, words=[word1, word2])
     assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)


-@pytest.mark.xfail
 def test_vectors_similarity_TS(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = get_doc(vocab, words=[word1, word2])
@@ -2,6 +2,8 @@
 from __future__ import unicode_literals

 from ...vectors import Vectors
+from ...tokenizer import Tokenizer
+from ..util import add_vecs_to_vocab, get_doc

 import numpy
 import pytest


@@ -11,22 +13,42 @@ import pytest
 def strings():
     return ["apple", "orange"]

+@pytest.fixture
+def vectors():
+    return [
+        ("apple", [1, 2, 3]),
+        ("orange", [-1, -2, -3]),
+        ('and', [-1, -1, -1]),
+        ('juice', [5, 5, 10]),
+        ('pie', [7, 6.3, 8.9])]
+

 @pytest.fixture
 def data():
     return numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype='f')


+@pytest.fixture()
+def vocab(en_vocab, vectors):
+    add_vecs_to_vocab(en_vocab, vectors)
+    return en_vocab
+

 def test_init_vectors_with_data(strings, data):
     v = Vectors(strings, data)
     assert v.shape == data.shape

 def test_init_vectors_with_width(strings):
     v = Vectors(strings, 3)
+    for string in strings:
+        v.add(string)
     assert v.shape == (len(strings), 3)


 def test_get_vector(strings, data):
     v = Vectors(strings, data)
+    for string in strings:
+        v.add(string)
     assert list(v[strings[0]]) == list(data[0])
     assert list(v[strings[0]]) != list(data[1])
     assert list(v[strings[1]]) != list(data[0])


@@ -35,6 +57,8 @@ def test_get_vector(strings, data):
 def test_set_vector(strings, data):
     orig = data.copy()
     v = Vectors(strings, data)
+    for string in strings:
+        v.add(string)
     assert list(v[strings[0]]) == list(orig[0])
     assert list(v[strings[0]]) != list(orig[1])
     v[strings[0]] = data[1]


@@ -42,125 +66,111 @@ def test_set_vector(strings, data):
     assert list(v[strings[0]]) != list(orig[0])


-#
-#@pytest.fixture()
-#def tokenizer_v(vocab):
-#    return Tokenizer(vocab, {}, None, None, None)
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text', ["apple and orange"])
-#def test_vectors_token_vector(tokenizer_v, vectors, text):
-#    doc = tokenizer_v(text)
-#    assert vectors[0] == (doc[0].text, list(doc[0].vector))
-#    assert vectors[1] == (doc[2].text, list(doc[2].vector))
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text', ["apple", "orange"])
-#def test_vectors_lexeme_vector(vocab, text):
-#    lex = vocab[text]
-#    assert list(lex.vector)
-#    assert lex.vector_norm
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
-#def test_vectors_doc_vector(vocab, text):
-#    doc = get_doc(vocab, text)
-#    assert list(doc.vector)
-#    assert doc.vector_norm
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
-#def test_vectors_span_vector(vocab, text):
-#    span = get_doc(vocab, text)[0:2]
-#    assert list(span.vector)
-#    assert span.vector_norm
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text', ["apple orange"])
-#def test_vectors_token_token_similarity(tokenizer_v, text):
-#    doc = tokenizer_v(text)
-#    assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
-#    assert 0.0 < doc[0].similarity(doc[1]) < 1.0
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
-#def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
-#    token = tokenizer_v(text1)
-#    lex = vocab[text2]
-#    assert token.similarity(lex) == lex.similarity(token)
-#    assert 0.0 < token.similarity(lex) < 1.0
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
-#def test_vectors_token_span_similarity(vocab, text):
-#    doc = get_doc(vocab, text)
-#    assert doc[0].similarity(doc[1:3]) == doc[1:3].similarity(doc[0])
-#    assert 0.0 < doc[0].similarity(doc[1:3]) < 1.0
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
-#def test_vectors_token_doc_similarity(vocab, text):
-#    doc = get_doc(vocab, text)
-#    assert doc[0].similarity(doc) == doc.similarity(doc[0])
-#    assert 0.0 < doc[0].similarity(doc) < 1.0
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
-#def test_vectors_lexeme_span_similarity(vocab, text):
-#    doc = get_doc(vocab, text)
-#    lex = vocab[text[0]]
-#    assert lex.similarity(doc[1:3]) == doc[1:3].similarity(lex)
-#    assert 0.0 < doc.similarity(doc[1:3]) < 1.0
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
-#def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
-#    lex1 = vocab[text1]
-#    lex2 = vocab[text2]
-#    assert lex1.similarity(lex2) == lex2.similarity(lex1)
-#    assert 0.0 < lex1.similarity(lex2) < 1.0
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
-#def test_vectors_lexeme_doc_similarity(vocab, text):
-#    doc = get_doc(vocab, text)
-#    lex = vocab[text[0]]
-#    assert lex.similarity(doc) == doc.similarity(lex)
-#    assert 0.0 < lex.similarity(doc) < 1.0
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
-#def test_vectors_span_span_similarity(vocab, text):
-#    doc = get_doc(vocab, text)
-#    assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
-#    assert 0.0 < doc[0:2].similarity(doc[1:3]) < 1.0
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
-#def test_vectors_span_doc_similarity(vocab, text):
-#    doc = get_doc(vocab, text)
-#    assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
-#    assert 0.0 < doc[0:2].similarity(doc) < 1.0
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text1,text2', [
-#    (["apple", "and", "apple", "pie"], ["orange", "juice"])])
-#def test_vectors_doc_doc_similarity(vocab, text1, text2):
-#    doc1 = get_doc(vocab, text1)
-#    doc2 = get_doc(vocab, text2)
-#    assert doc1.similarity(doc2) == doc2.similarity(doc1)
-#    assert 0.0 < doc1.similarity(doc2) < 1.0
+@pytest.fixture()
+def tokenizer_v(vocab):
+    return Tokenizer(vocab, {}, None, None, None)
+
+
+@pytest.mark.parametrize('text', ["apple and orange"])
+def test_vectors_token_vector(tokenizer_v, vectors, text):
+    doc = tokenizer_v(text)
+    assert vectors[0] == (doc[0].text, list(doc[0].vector))
+    assert vectors[1] == (doc[2].text, list(doc[2].vector))
+
+
+@pytest.mark.parametrize('text', ["apple", "orange"])
+def test_vectors_lexeme_vector(vocab, text):
+    lex = vocab[text]
+    assert list(lex.vector)
+    assert lex.vector_norm
+
+
+@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
+def test_vectors_doc_vector(vocab, text):
+    doc = get_doc(vocab, text)
+    assert list(doc.vector)
+    assert doc.vector_norm
+
+
+@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
+def test_vectors_span_vector(vocab, text):
+    span = get_doc(vocab, text)[0:2]
+    assert list(span.vector)
+    assert span.vector_norm
+
+
+@pytest.mark.parametrize('text', ["apple orange"])
+def test_vectors_token_token_similarity(tokenizer_v, text):
+    doc = tokenizer_v(text)
+    assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
+    assert -1. < doc[0].similarity(doc[1]) < 1.0
+
+
+@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
+def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
+    token = tokenizer_v(text1)
+    lex = vocab[text2]
+    assert token.similarity(lex) == lex.similarity(token)
+    assert -1. < token.similarity(lex) < 1.0
+
+
+@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
+def test_vectors_token_span_similarity(vocab, text):
+    doc = get_doc(vocab, text)
+    assert doc[0].similarity(doc[1:3]) == doc[1:3].similarity(doc[0])
+    assert -1. < doc[0].similarity(doc[1:3]) < 1.0
+
+
+@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
+def test_vectors_token_doc_similarity(vocab, text):
+    doc = get_doc(vocab, text)
+    assert doc[0].similarity(doc) == doc.similarity(doc[0])
+    assert -1. < doc[0].similarity(doc) < 1.0
+
+
+@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
+def test_vectors_lexeme_span_similarity(vocab, text):
+    doc = get_doc(vocab, text)
+    lex = vocab[text[0]]
+    assert lex.similarity(doc[1:3]) == doc[1:3].similarity(lex)
+    assert -1. < doc.similarity(doc[1:3]) < 1.0
+
+
+@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
+def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
+    lex1 = vocab[text1]
+    lex2 = vocab[text2]
+    assert lex1.similarity(lex2) == lex2.similarity(lex1)
+    assert -1. < lex1.similarity(lex2) < 1.0
+
+
+@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
+def test_vectors_lexeme_doc_similarity(vocab, text):
+    doc = get_doc(vocab, text)
+    lex = vocab[text[0]]
+    assert lex.similarity(doc) == doc.similarity(lex)
+    assert -1. < lex.similarity(doc) < 1.0
+
+
+@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
+def test_vectors_span_span_similarity(vocab, text):
+    doc = get_doc(vocab, text)
+    assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
+    assert -1. < doc[0:2].similarity(doc[1:3]) < 1.0
+
+
+@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
+def test_vectors_span_doc_similarity(vocab, text):
+    doc = get_doc(vocab, text)
+    assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
+    assert -1. < doc[0:2].similarity(doc) < 1.0
+
+
+@pytest.mark.parametrize('text1,text2', [
+    (["apple", "and", "apple", "pie"], ["orange", "juice"])])
+def test_vectors_doc_doc_similarity(vocab, text1, text2):
+    doc1 = get_doc(vocab, text1)
+    doc2 = get_doc(vocab, text2)
+    assert doc1.similarity(doc2) == doc2.similarity(doc1)
+    assert -1. < doc1.similarity(doc2) < 1.0
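The rewritten fixtures and tests above pin down the new `Vectors` lifecycle: the table can be constructed either from an existing array or from a width, and a key only maps to a row once it has been `add()`ed. A condensed sketch of that lifecycle, reusing the same strings and data as the fixtures:

import numpy
from spacy.vectors import Vectors

strings = ["apple", "orange"]
data = numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype='f')

# Construct from an existing table; keys are bound to rows via add().
v = Vectors(strings, data)
for string in strings:
    v.add(string)
assert v.shape == data.shape
assert list(v[strings[0]]) == list(data[0])

# Or construct from a width and fill the rows in later.
v = Vectors(strings, 3)
for string in strings:
    v.add(string)
assert v.shape == (len(strings), 3)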
@@ -33,6 +33,7 @@ cdef class Doc:
     cdef public object _vector_norm

     cdef public object tensor
+    cdef public object cats
     cdef public object user_data

     cdef TokenC* c


@@ -117,6 +117,7 @@ cdef class Doc:
         self.is_tagged = False
         self.is_parsed = False
         self.sentiment = 0.0
+        self.cats = {}
         self.user_hooks = {}
         self.user_token_hooks = {}
         self.user_span_hooks = {}


@@ -237,6 +238,29 @@ cdef class Doc:
     def doc(self):
         return self

+    def char_span(self, int start_idx, int end_idx, label=0, vector=None):
+        """Create a `Span` object from the slice `doc.text[start : end]`.
+
+        doc (Doc): The parent document.
+        start (int): The index of the first character of the span.
+        end (int): The index of the first character after the span.
+        label (uint64 or string): A label to attach to the Span, e.g. for named entities.
+        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
+        RETURNS (Span): The newly constructed object.
+        """
+        if not isinstance(label, int):
+            label = self.vocab.strings.add(label)
+        cdef int start = token_by_start(self.c, self.length, start_idx)
+        if start == -1:
+            return None
+        cdef int end = token_by_end(self.c, self.length, end_idx)
+        if end == -1:
+            return None
+        # Currently we have the token index, we want the range-end index
+        end += 1
+        cdef Span span = Span(self, start, end, label=label, vector=vector)
+        return span
+
     def similarity(self, other):
         """Make a semantic similarity estimate. The default estimate is cosine
         similarity using an average of word vectors.


@@ -279,8 +303,14 @@ cdef class Doc:
             return self.user_hooks['vector'](self)
         if self._vector is not None:
             return self._vector
-        elif self.has_vector and len(self):
-            self._vector = sum(t.vector for t in self) / len(self)
+        elif not len(self):
+            self._vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
+            return self._vector
+        elif self.has_vector:
+            vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
+            for token in self.c[:self.length]:
+                vector += self.vocab.get_vector(token.lex.orth)
+            self._vector = vector / len(self)
             return self._vector
         elif self.tensor is not None:
            self._vector = self.tensor.mean(axis=0)
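The new `Doc.char_span` method maps raw character offsets back onto token boundaries and returns `None` when the offsets do not line up with tokens. A small usage sketch, assuming the v2-style blank English pipeline import path (`spacy.lang.en`) is available:

from spacy.lang.en import English   # assumed import path for a blank v2 pipeline

nlp = English()
doc = nlp(u'I like New York in Autumn.')

start_char = doc.text.index(u'New York')
end_char = start_char + len(u'New York')

span = doc.char_span(start_char, end_char, label=u'GPE')
assert span is not None             # None would mean the offsets cut through a token
assert span.text == u'New York'
assert span.label_ == u'GPE'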
@@ -15,5 +15,5 @@ cdef class Span:
     cdef public _vector
     cdef public _vector_norm


     cpdef int _recalculate_indices(self) except -1
+    cpdef np.ndarray to_array(self, object features)


@@ -7,7 +7,7 @@ import numpy
 import numpy.linalg
 from libc.math cimport sqrt

-from .doc cimport token_by_start, token_by_end
+from .doc cimport token_by_start, token_by_end, get_token_attr
 from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport flags_t, attr_t, hash_t
 from ..attrs cimport attr_id_t


@@ -135,6 +135,29 @@ cdef class Span:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)

+    cpdef np.ndarray to_array(self, object py_attr_ids):
+        """Given a list of M attribute IDs, export the tokens to a numpy
+        `ndarray` of shape `(N, M)`, where `N` is the length of the document.
+        The values will be 32-bit integers.
+
+        attr_ids (list[int]): A list of attribute ID ints.
+        RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
+            per word, and one column per attribute indicated in the input
+            `attr_ids`.
+        """
+        cdef int i, j
+        cdef attr_id_t feature
+        cdef np.ndarray[attr_t, ndim=2] output
+        # Make an array from the attributes --- otherwise our inner loop is Python
+        # dict iteration.
+        cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
+        cdef int length = self.end - self.start
+        output = numpy.ndarray(shape=(length, len(attr_ids)), dtype=numpy.uint64)
+        for i in range(self.start, self.end):
+            for j, feature in enumerate(attr_ids):
+                output[i-self.start, j] = get_token_attr(&self.doc.c[i], feature)
+        return output
+
     cpdef int _recalculate_indices(self) except -1:
         if self.end > self.doc.length \
         or self.doc.c[self.start].idx != self.start_char \
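`Span.to_array` mirrors the existing `Doc.to_array`: given M attribute IDs it returns an `(N, M)` matrix with one row per token in the span. A usage sketch along the lines of the `test_span_to_array` test added earlier, again assuming the blank-English import path:

from spacy.attrs import ORTH, LENGTH
from spacy.lang.en import English   # assumed import path for a blank v2 pipeline

nlp = English()
doc = nlp(u'apple and orange juice')
span = doc[1:3]

arr = span.to_array([ORTH, LENGTH])   # one row per token, one column per attribute
assert arr.shape == (len(span), 2)
assert arr[0, 0] == span[0].orth      # hash of the token's orthographic form
assert arr[0, 1] == len(span[0])      # character length of the token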
@@ -62,18 +62,26 @@ cdef class Token:

     def __richcmp__(self, Token other, int op):
         # http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html
+        cdef Doc my_doc = self.doc
+        cdef Doc other_doc = other.doc
         my = self.idx
         their = other.idx if other is not None else None
         if op == 0:
             return my < their
         elif op == 2:
-            return my == their
+            if my_doc is other_doc:
+                return my == their
+            else:
+                return False
         elif op == 4:
             return my > their
         elif op == 1:
             return my <= their
         elif op == 3:
-            return my != their
+            if my_doc is other_doc:
+                return my != their
+            else:
+                return True
         elif op == 5:
             return my >= their
         else:
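With this guard, `==` and `!=` only compare offsets for tokens that belong to the same `Doc`; tokens from different docs now compare unequal even when their offsets match, which is exactly what the `test_issue1257` regression test asserts. A quick illustration under the same assumed blank-English setup as the earlier sketches:

from spacy.lang.en import English   # assumed import path for a blank v2 pipeline

nlp = English()
doc1 = nlp(u'a b c')
doc2 = nlp(u'a c e')

assert doc1[0] == doc1[0]           # same Doc, same offset
assert doc1[0] != doc2[0]           # same offset, different Doc
assert not (doc1[0] == doc2[0])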
@@ -22,7 +22,7 @@ import ujson

 from .symbols import ORTH
 from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
-from .compat import copy_array, normalize_string_keys, getattr_
+from .compat import copy_array, normalize_string_keys, getattr_, import_file


 LANGUAGES = {}


@@ -112,15 +112,13 @@ def load_model(name, **overrides):

 def load_model_from_link(name, **overrides):
     """Load a model from a shortcut link, or directory in spaCy data path."""
-    init_file = get_data_path() / name / '__init__.py'
-    spec = importlib.util.spec_from_file_location(name, init_file)
+    path = get_data_path() / name / '__init__.py'
     try:
-        cls = importlib.util.module_from_spec(spec)
+        cls = import_file(name, path)
     except AttributeError:
         raise IOError(
             "Cant' load '%s'. If you're using a shortcut link, make sure it "
             "points to a valid model package (not just a data directory)." % name)
-    spec.loader.exec_module(cls)
     return cls.load(**overrides)


@@ -171,8 +169,8 @@ def get_model_meta(path):
         raise IOError("Could not read meta.json from %s" % meta_path)
     meta = read_json(meta_path)
     for setting in ['lang', 'name', 'version']:
-        if setting not in meta:
-            raise ValueError('No %s setting found in model meta.json' % setting)
+        if setting not in meta or not meta[setting]:
+            raise ValueError("No valid '%s' setting found in model meta.json" % setting)
     return meta
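The tightened check means a model package's meta.json must not only contain the `lang`, `name` and `version` keys but also give them non-empty values. A stand-alone sketch of that validation, with illustrative values:

# Minimal meta.json contents that pass the stricter check (values are examples).
meta = {
    'lang': 'en',
    'name': 'core_web_sm',
    'version': '2.0.0',
}

for setting in ['lang', 'name', 'version']:
    if setting not in meta or not meta[setting]:
        raise ValueError("No valid '%s' setting found in model meta.json" % setting)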
@@ -1,18 +1,25 @@
+from __future__ import unicode_literals
+from libc.stdint cimport int32_t, uint64_t
 import numpy
 from collections import OrderedDict
 import msgpack
 import msgpack_numpy
 msgpack_numpy.patch()
+cimport numpy as np

+from .typedefs cimport attr_t
 from .strings cimport StringStore
 from . import util
+from .compat import basestring_


 cdef class Vectors:
     '''Store, save and load word vectors.'''
     cdef public object data
     cdef readonly StringStore strings
-    cdef public object key2i
+    cdef public object key2row
+    cdef public object keys
+    cdef public int i

     def __init__(self, strings, data_or_width):
         self.strings = StringStore()


@@ -21,10 +28,10 @@ cdef class Vectors:
                                 dtype='f')
         else:
             data = data_or_width
+        self.i = 0
         self.data = data
-        self.key2i = {}
-        for i, string in enumerate(strings):
-            self.key2i[self.strings.add(string)] = i
+        self.key2row = {}
+        self.keys = np.ndarray((self.data.shape[0],), dtype='uint64')

     def __reduce__(self):
         return (Vectors, (self.strings, self.data))


@@ -32,7 +39,7 @@
     def __getitem__(self, key):
         if isinstance(key, basestring):
             key = self.strings[key]
-        i = self.key2i[key]
+        i = self.key2row[key]
         if i is None:
             raise KeyError(key)
         else:


@@ -41,14 +48,36 @@
     def __setitem__(self, key, vector):
         if isinstance(key, basestring):
             key = self.strings.add(key)
-        i = self.key2i[key]
+        i = self.key2row[key]
         self.data[i] = vector

     def __iter__(self):
         yield from self.data

     def __len__(self):
-        return len(self.strings)
+        return self.i
+
+    def __contains__(self, key):
+        if isinstance(key, basestring_):
+            key = self.strings[key]
+        return key in self.key2row
+
+    def add(self, key, vector=None):
+        if isinstance(key, basestring_):
+            key = self.strings.add(key)
+        if key not in self.key2row:
+            i = self.i
+            if i >= self.keys.shape[0]:
+                self.keys.resize((self.keys.shape[0]*2,))
+                self.data.resize((self.data.shape[0]*2, self.data.shape[1]))
+            self.key2row[key] = self.i
+            self.keys[self.i] = key
+            self.i += 1
+        else:
+            i = self.key2row[key]
+        if vector is not None:
+            self.data[i] = vector
+        return i

     def items(self):
         for i, string in enumerate(self.strings):


@@ -61,34 +90,87 @@ cdef class Vectors:
     def most_similar(self, key):
         raise NotImplementedError

-    def to_disk(self, path):
-        raise NotImplementedError
+    def from_glove(self, path):
+        '''Load GloVe vectors from a directory. Assumes binary format,
+        that the vocab is in a vocab.txt, and that vectors are named
+        vectors.{size}.[fd].bin, e.g. vectors.128.f.bin for 128d float32
+        vectors, vectors.300.d.bin for 300d float64 (double) vectors, etc.
+        By default GloVe outputs 64-bit vectors.'''
+        path = util.ensure_path(path)
+        for name in path.iterdir():
+            if name.parts[-1].startswith('vectors'):
+                _, dims, dtype, _2 = name.parts[-1].split('.')
+                self.width = int(dims)
+                break
+        else:
+            raise IOError("Expected file named e.g. vectors.128.f.bin")
+        bin_loc = path / 'vectors.{dims}.{dtype}.bin'.format(dims=dims,
+                                                             dtype=dtype)
+        with bin_loc.open('rb') as file_:
+            self.data = numpy.fromfile(file_, dtype='float64')
+            self.data = numpy.ascontiguousarray(self.data, dtype='float32')
+        n = 0
+        with (path / 'vocab.txt').open('r') as file_:
+            for line in file_:
+                self.add(line.strip())
+                n += 1
+        if (self.data.size % self.width) == 0:
+            self.data

-    def from_disk(self, path):
-        raise NotImplementedError
+    def to_disk(self, path, **exclude):
+        serializers = OrderedDict((
+            ('vectors', lambda p: numpy.save(p.open('wb'), self.data, allow_pickle=False)),
+            ('keys', lambda p: numpy.save(p.open('wb'), self.keys, allow_pickle=False)),
+        ))
+        return util.to_disk(path, serializers, exclude)
+
+    def from_disk(self, path, **exclude):
+        def load_keys(path):
+            if path.exists():
+                self.keys = numpy.load(path)
+                for i, key in enumerate(self.keys):
+                    self.keys[i] = key
+                    self.key2row[key] = i
+
+        def load_vectors(path):
+            if path.exists():
+                self.data = numpy.load(path)
+
+        serializers = OrderedDict((
+            ('keys', load_keys),
+            ('vectors', load_vectors),
+        ))
+        util.from_disk(path, serializers, exclude)
+        return self

     def to_bytes(self, **exclude):
         def serialize_weights():
-            if hasattr(self.weights, 'to_bytes'):
-                return self.weights.to_bytes()
+            if hasattr(self.data, 'to_bytes'):
+                return self.data.to_bytes()
             else:
-                return msgpack.dumps(self.weights)
+                return msgpack.dumps(self.data)

         serializers = OrderedDict((
-            ('strings', lambda: self.strings.to_bytes()),
-            ('weights', serialize_weights)
+            ('keys', lambda: msgpack.dumps(self.keys)),
+            ('vectors', serialize_weights)
         ))
         return util.to_bytes(serializers, exclude)

     def from_bytes(self, data, **exclude):
         def deserialize_weights(b):
-            if hasattr(self.weights, 'from_bytes'):
-                self.weights.from_bytes()
+            if hasattr(self.data, 'from_bytes'):
+                self.data.from_bytes()
             else:
-                self.weights = msgpack.loads(b)
+                self.data = msgpack.loads(b)
+
+        def load_keys(keys):
+            self.keys.resize((len(keys),))
+            for i, key in enumerate(keys):
+                self.keys[i] = key
+                self.key2row[key] = i

         deserializers = OrderedDict((
-            ('strings', lambda b: self.strings.from_bytes(b)),
-            ('weights', deserialize_weights)
+            ('keys', lambda b: load_keys(msgpack.loads(b))),
+            ('vectors', deserialize_weights)
         ))
-        return util.from_bytes(deserializers, exclude)
+        util.from_bytes(data, deserializers, exclude)
+        return self
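The `from_glove` docstring above fixes a directory convention: a `vocab.txt` with one word per line sits next to a binary matrix named `vectors.{size}.{f|d}.bin`, and the loader recovers the width and dtype from the file name itself. A stand-alone sketch of that parsing step (the file name is illustrative):

name = 'vectors.128.f.bin'
_, dims, dtype, _ext = name.split('.')
width = int(dims)            # 128 dimensions
assert dtype in ('f', 'd')   # 'f' -> float32, 'd' -> float64 (GloVe's default output)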
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import bz2
 import ujson
 import re
+import numpy

 from libc.string cimport memset, memcpy
 from libc.stdint cimport int32_t


@@ -19,9 +20,10 @@ from .tokens.token cimport Token
 from .attrs cimport PROB, LANG
 from .structs cimport SerializedLexemeC

-from .compat import copy_reg, pickle
+from .compat import copy_reg, pickle, basestring_
 from .lemmatizer import Lemmatizer
 from .attrs import intify_attrs
+from .vectors import Vectors
 from . import util
 from . import attrs
 from . import symbols


@@ -63,6 +65,7 @@ cdef class Vocab:
             self.strings.add(name)
         self.lex_attr_getters = lex_attr_getters
         self.morphology = Morphology(self.strings, tag_map, lemmatizer)
+        self.vectors = Vectors(self.strings, 300)

     property lang:
         def __get__(self):


@@ -242,13 +245,15 @@ cdef class Vocab:

     @property
     def vectors_length(self):
-        raise NotImplementedError
+        return self.vectors.data.shape[1]

-    def clear_vectors(self):
+    def clear_vectors(self, new_dim=None):
         """Drop the current vector table. Because all vectors must be the same
         width, you have to call this to change the size of the vectors.
         """
-        raise NotImplementedError
+        if new_dim is None:
+            new_dim = self.vectors.data.shape[1]
+        self.vectors = Vectors(self.strings, new_dim)

     def get_vector(self, orth):
         """Retrieve a vector for a word in the vocabulary.


@@ -262,7 +267,12 @@ cdef class Vocab:

         RAISES: If no vectors data is loaded, ValueError is raised.
         """
-        raise NotImplementedError
+        if isinstance(orth, basestring_):
+            orth = self.strings.add(orth)
+        if orth in self.vectors.key2row:
+            return self.vectors[orth]
+        else:
+            return numpy.zeros((self.vectors_length,), dtype='f')

     def set_vector(self, orth, vector):
         """Set a vector for a word in the vocabulary.


@@ -272,15 +282,19 @@ cdef class Vocab:
         RETURNS:
             None
         """
-        raise NotImplementedError
+        if not isinstance(orth, basestring_):
+            orth = self.strings[orth]
+        self.vectors.add(orth, vector=vector)

     def has_vector(self, orth):
         """Check whether a word has a vector. Returns False if no
         vectors have been loaded. Words can be looked up by string
         or int ID."""
-        return False
+        if isinstance(orth, basestring_):
+            orth = self.strings.add(orth)
+        return orth in self.vectors

-    def to_disk(self, path):
+    def to_disk(self, path, **exclude):
         """Save the current state to a directory.

         path (unicode or Path): A path to a directory, which will be created if


@@ -292,8 +306,10 @@ cdef class Vocab:
         self.strings.to_disk(path / 'strings.json')
         with (path / 'lexemes.bin').open('wb') as file_:
             file_.write(self.lexemes_to_bytes())
+        if self.vectors is not None:
+            self.vectors.to_disk(path)

-    def from_disk(self, path):
+    def from_disk(self, path, **exclude):
         """Loads state from a directory. Modifies the object in place and
         returns it.


@@ -305,6 +321,8 @@ cdef class Vocab:
         self.strings.from_disk(path / 'strings.json')
         with (path / 'lexemes.bin').open('rb') as file_:
             self.lexemes_from_bytes(file_.read())
+        if self.vectors is not None:
+            self.vectors.from_disk(path, exclude='strings.json')
         return self

     def to_bytes(self, **exclude):


@@ -313,9 +331,16 @@ cdef class Vocab:
         **exclude: Named attributes to prevent from being serialized.
         RETURNS (bytes): The serialized form of the `Vocab` object.
         """
+        def deserialize_vectors():
+            if self.vectors is None:
+                return None
+            else:
+                return self.vectors.to_bytes(exclude='strings.json')
+
         getters = OrderedDict((
             ('strings', lambda: self.strings.to_bytes()),
             ('lexemes', lambda: self.lexemes_to_bytes()),
+            ('vectors', deserialize_vectors)
         ))
         return util.to_bytes(getters, exclude)


@@ -326,9 +351,15 @@ cdef class Vocab:
         **exclude: Named attributes to prevent from being loaded.
         RETURNS (Vocab): The `Vocab` object.
         """
+        def serialize_vectors(b):
+            if self.vectors is None:
+                return None
+            else:
+                return self.vectors.from_bytes(b, exclude='strings')
         setters = OrderedDict((
             ('strings', lambda b: self.strings.from_bytes(b)),
             ('lexemes', lambda b: self.lexemes_from_bytes(b)),
+            ('vectors', lambda b: serialize_vectors(b))
         ))
         util.from_bytes(bytes_data, setters, exclude)
         return self
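Taken together, these hunks give `Vocab` a working vector table: `set_vector` writes through to the `Vectors` store, `has_vector` reports whether a row exists, and `get_vector` falls back to a zero vector for unknown words. A minimal sketch, assuming a freshly constructed `Vocab` and the API exactly as shown in this diff (values are illustrative):

import numpy
from spacy.vocab import Vocab

vocab = Vocab()
vocab.clear_vectors(3)   # drop the default table and switch to width 3
vocab.set_vector(u'apple', numpy.asarray([1., 2., 3.], dtype='f'))

assert vocab.has_vector(u'apple')
assert not vocab.has_vector(u'pear')
assert list(vocab.get_vector(u'apple')) == [1.0, 2.0, 3.0]
assert list(vocab.get_vector(u'pear')) == [0.0, 0.0, 0.0]   # unknown words -> zeros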
@@ -2,9 +2,8 @@

 if [ "${VIA}" == "pypi" ]; then
     rm -rf *
-    pip install spacy
-    python -m spacy.en.download
-    python -m spacy.de.download
+    pip install spacy-nightly
+    python -m spacy download en
 fi

 if [[ "${VIA}" == "sdist" && "${TRAVIS_PULL_REQUEST}" == "false" ]]; then
@@ -103,20 +103,20 @@ mixin button(url, trusted, ...style)
     label      - [string] aside title (optional or false for no label)
     language   - [string] language for syntax highlighting (default: "python")
                  supports basic relevant languages available for PrismJS
-    icon       - [string] icon to display next to code block, mostly used for old/new
+    prompt     - [string] prompt or icon to display next to code block, (mostly used for old/new)
     height     - [integer] optional height to clip code block to

-mixin code(label, language, icon, height)
+mixin code(label, language, prompt, height)
     pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes)
         if label
             h4.u-text-label.u-text-label--dark=label
+        - var icon = (prompt == 'accept' || prompt == 'reject')
         if icon
             - var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
             .c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null)
                 +icon(icon, 18)

-        code.c-code-block__content
+        code.c-code-block__content(data-prompt=icon ? null : prompt)
             block
@@ -112,6 +112,10 @@
 .u-nowrap
     white-space: nowrap

+.u-break.u-break
+    word-wrap: break-word
+    white-space: initial
+
 .u-no-border
     border: none


@@ -35,6 +35,13 @@
     font: normal normal 1.1rem/#{2} $font-code
     padding: 1em 2em

+    &[data-prompt]:before,
+        content: attr(data-prompt)
+        margin-right: 0.65em
+        display: inline-block
+        vertical-align: middle
+        opacity: 0.5
+

 //- Inline code
Some files were not shown because too many files have changed in this diff.