Revert noise-level back to default 0.0

Matthew Honnibal 2017-09-06 04:58:33 -05:00
commit 167f6a8938
130 changed files with 67009 additions and 17315 deletions

1
.gitignore vendored

@ -40,7 +40,6 @@ venv/
# Distribution / packaging
env/
bin/
build/
develop-eggs/
dist/


@ -14,8 +14,7 @@ os:
env:
- VIA=compile LC_ALL=en_US.ascii
- VIA=compile
# - VIA=sdist
#- VIA=pypi_nightly
install:
- "./travis.sh"
@ -23,7 +22,7 @@ install:
script:
- "pip install pytest pytest-timeout"
- if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi
- if [[ "${VIA}" == "pypi" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(ospath.dirname(spacy.__file__)))"`; fi
- if [[ "${VIA}" == "pypi_nightly" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
- if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
notifications:


@ -1,3 +1,4 @@
recursive-include include *.h
include LICENSE
include README.rst
include bin/spacy


@ -229,7 +229,7 @@ Compile from source
The other way to install spaCy is to clone its
`GitHub repository <https://github.com/explosion/spaCy>`_ and build it from
source. That is the common way if you want to make changes to the code base.
You'll need to make sure that you have a development enviroment consisting of a
You'll need to make sure that you have a development environment consisting of a
Python distribution including header files, a compiler,
`pip <https://pip.pypa.io/en/latest/installing/>`__, `virtualenv <https://virtualenv.pypa.io/>`_
and `git <https://git-scm.com>`_ installed. The compiler part is the trickiest.

1
bin/spacy Normal file

@ -0,0 +1 @@
python -m spacy "$@"


@ -0,0 +1,109 @@
from __future__ import unicode_literals
import plac
import random
import tqdm
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps
import thinc.extra.datasets
import spacy.lang.en
from spacy.gold import GoldParse, minibatch
from spacy.util import compounding
from spacy.pipeline import TextCategorizer
def train_textcat(tokenizer, textcat,
train_texts, train_cats, dev_texts, dev_cats,
n_iter=20):
'''
Train the TextCategorizer without associated pipeline.
'''
textcat.begin_training()
optimizer = Adam(NumpyOps(), 0.001)
train_docs = [tokenizer(text) for text in train_texts]
train_gold = [GoldParse(doc, cats=cats) for doc, cats in
zip(train_docs, train_cats)]
train_data = list(zip(train_docs, train_gold))  # materialise, so it can be re-iterated each epoch
batch_sizes = compounding(4., 128., 1.001)
for i in range(n_iter):
losses = {}
batches = tqdm.tqdm(train_data, leave=False) # Progress bar
for batch in minibatch(batches, size=batch_sizes):
docs, golds = zip(*batch)
textcat.update((docs, None), golds, sgd=optimizer, drop=0.2,
losses=losses)
with textcat.model.use_params(optimizer.averages):
scores = evaluate(tokenizer, textcat, dev_texts, dev_cats)
yield losses['textcat'], scores
def evaluate(tokenizer, textcat, texts, cats):
docs = (tokenizer(text) for text in texts)
tp = 1e-8 # True positives
fp = 1e-8 # False positives
fn = 1e-8 # False negatives
tn = 1e-8 # True negatives
for i, doc in enumerate(textcat.pipe(docs)):
gold = cats[i]
for label, score in doc.cats.items():
if score >= 0.5 and label in gold:
tp += 1.
elif score >= 0.5 and label not in gold:
fp += 1.
elif score < 0.5 and label not in gold:
tn += 1
if score < 0.5 and label in gold:
fn += 1
precis = tp / (tp + fp)
recall = tp / (tp + fn)
fscore = 2 * (precis * recall) / (precis + recall)
return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore}
def load_data():
# Partition off part of the train data --- avoid running experiments
# against test.
train_data, _ = thinc.extra.datasets.imdb()
random.shuffle(train_data)
texts, labels = zip(*train_data)
cats = [(['POSITIVE'] if y else []) for y in labels]
split = int(len(train_data) * 0.8)
train_texts = texts[:split]
train_cats = cats[:split]
dev_texts = texts[split:]
dev_cats = cats[split:]
return (train_texts, train_cats), (dev_texts, dev_cats)
def main(model_loc=None):
nlp = spacy.lang.en.English()
tokenizer = nlp.tokenizer
textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE'])
print("Load IMDB data")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data()
print("Itn.\tLoss\tP\tR\tF")
progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}'
for i, (loss, scores) in enumerate(train_textcat(tokenizer, textcat,
train_texts, train_cats,
dev_texts, dev_cats, n_iter=20)):
print(progress.format(i=i, loss=loss, **scores))
# How to save, load and use
nlp.pipeline.append(textcat)
if model_loc is not None:
nlp.to_disk(model_loc)
nlp = spacy.load(model_loc)
doc = nlp(u'This movie sucked!')
print(doc.cats)
if __name__ == '__main__':
plac.call(main)


@ -3,7 +3,7 @@ pathlib
numpy>=1.7
cymem>=1.30,<1.32
preshed>=1.0.0,<2.0.0
thinc>=6.7.3,<6.8.0
thinc>=6.8.0,<6.9.0
murmurhash>=0.28,<0.29
plac<1.0.0,>=0.9.6
six


@ -28,7 +28,9 @@ MOD_NAMES = [
'spacy.pipeline',
'spacy.syntax.stateclass',
'spacy.syntax._state',
'spacy.syntax._beam_utils',
'spacy.tokenizer',
'spacy._cfile',
'spacy.syntax.parser',
'spacy.syntax.nn_parser',
'spacy.syntax.beam_parser',
@ -187,12 +189,13 @@ def setup_package():
url=about['__uri__'],
license=about['__license__'],
ext_modules=ext_modules,
scripts=['bin/spacy'],
install_requires=[
'numpy>=1.7',
'murmurhash>=0.28,<0.29',
'cymem>=1.30,<1.32',
'preshed>=1.0.0,<2.0.0',
'thinc>=6.7.3,<6.8.0',
'thinc>=6.8.0,<6.9.0',
'plac<1.0.0,>=0.9.6',
'pip>=9.0.0,<10.0.0',
'six',


@ -13,5 +13,10 @@ def load(name, **overrides):
return util.load_model(name, **overrides)
def blank(name, **kwargs):
LangClass = util.get_lang_class(name)
return LangClass(**kwargs)
def info(model=None, markdown=False):
return cli_info(None, model, markdown)


@ -3,15 +3,23 @@ from __future__ import print_function
# NB! This breaks in plac on Python 2!!
#from __future__ import unicode_literals
if __name__ == '__main__':
import plac
import sys
from spacy.cli import download, link, info, package, train, convert
from spacy.cli import download, link, info, package, train, convert, model
from spacy.cli import profile
from spacy.util import prints
commands = {'download': download, 'link': link, 'info': info, 'train': train,
'convert': convert, 'package': package}
commands = {
'download': download,
'link': link,
'info': info,
'train': train,
'convert': convert,
'package': package,
'model': model,
'profile': profile,
}
if len(sys.argv) == 1:
prints(', '.join(commands), title="Available commands", exits=1)
command = sys.argv.pop(1)
@ -19,5 +27,7 @@ if __name__ == '__main__':
if command in commands:
plac.call(commands[command])
else:
prints("Available: %s" % ', '.join(commands),
title="Unknown command: %s" % command, exits=1)
prints(
"Available: %s" % ', '.join(commands),
title="Unknown command: %s" % command,
exits=1)

26
spacy/_cfile.pxd Normal file

@ -0,0 +1,26 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from cymem.cymem cimport Pool
cdef class CFile:
cdef FILE* fp
cdef bint is_open
cdef Pool mem
cdef int size # For compatibility with subclass
cdef int _capacity # For compatibility with subclass
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *
cdef class StringCFile(CFile):
cdef unsigned char* data
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *

88
spacy/_cfile.pyx Normal file

@ -0,0 +1,88 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from libc.string cimport memcpy
cdef class CFile:
def __init__(self, loc, mode, on_open_error=None):
if isinstance(mode, unicode):
mode_str = mode.encode('ascii')
else:
mode_str = mode
if hasattr(loc, 'as_posix'):
loc = loc.as_posix()
self.mem = Pool()
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
self.fp = fopen(<char*>bytes_loc, mode_str)
if self.fp == NULL:
if on_open_error is not None:
on_open_error()
else:
raise IOError("Could not open binary file %s" % bytes_loc)
self.is_open = True
def __dealloc__(self):
if self.is_open:
fclose(self.fp)
def close(self):
fclose(self.fp)
self.is_open = False
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
st = fread(dest, elem_size, number, self.fp)
if st != number:
raise IOError
cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1:
st = fwrite(src, elem_size, number, self.fp)
if st != number:
raise IOError
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
cdef void* dest = mem.alloc(number, elem_size)
self.read_into(dest, number, elem_size)
return dest
def write_unicode(self, unicode value):
cdef bytes py_bytes = value.encode('utf8')
cdef char* chars = <char*>py_bytes
self.write(sizeof(char), len(py_bytes), chars)
cdef class StringCFile:
def __init__(self, mode, bytes data=b'', on_open_error=None):
self.mem = Pool()
self.is_open = 'w' in mode
self._capacity = max(len(data), 8)
self.size = len(data)
self.data = <unsigned char*>self.mem.alloc(1, self._capacity)
for i in range(len(data)):
self.data[i] = data[i]
def close(self):
self.is_open = False
def string_data(self):
return (self.data-self.size)[:self.size]
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
memcpy(dest, self.data, elem_size * number)
self.data += elem_size * number
cdef int write_from(self, void* src, size_t elem_size, size_t number) except -1:
write_size = number * elem_size
if (self.size + write_size) >= self._capacity:
self._capacity = (self.size + write_size) * 2
self.data = <unsigned char*>self.mem.realloc(self.data, self._capacity)
memcpy(&self.data[self.size], src, elem_size * number)
self.size += write_size
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
cdef void* dest = mem.alloc(number, elem_size)
self.read_into(dest, number, elem_size)
return dest
def write_unicode(self, unicode value):
cdef bytes py_bytes = value.encode('utf8')
cdef char* chars = <char*>py_bytes
self.write(sizeof(char), len(py_bytes), chars)


@ -3,23 +3,101 @@ from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.neural import Model, Maxout, Softmax, Affine
from thinc.neural._classes.hash_embed import HashEmbed
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module
import random
import cytoolz
from thinc.neural._classes.convolution import ExtractWindow
from thinc.neural._classes.static_vectors import StaticVectors
from thinc.neural._classes.batchnorm import BatchNorm
from thinc.neural._classes.batchnorm import BatchNorm as BN
from thinc.neural._classes.layernorm import LayerNorm as LN
from thinc.neural._classes.resnet import Residual
from thinc.neural import ReLu
from thinc.neural._classes.selu import SELU
from thinc import describe
from thinc.describe import Dimension, Synapses, Biases, Gradient
from thinc.neural._classes.affine import _set_dimensions_if_needed
from thinc.api import FeatureExtracter, with_getitem
from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool
from thinc.neural._classes.attention import ParametricAttention
from thinc.linear.linear import LinearModel
from thinc.api import uniqued, wrap, flatten_add_lengths
from .attrs import ID, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER
from .tokens.doc import Doc
from . import util
import numpy
import io
@layerize
def _flatten_add_lengths(seqs, pad=0, drop=0.):
ops = Model.ops
lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
def finish_update(d_X, sgd=None):
return ops.unflatten(d_X, lengths, pad=pad)
X = ops.flatten(seqs, pad=pad)
return (X, lengths), finish_update
@layerize
def _logistic(X, drop=0.):
xp = get_array_module(X)
if not isinstance(X, xp.ndarray):
X = xp.asarray(X)
# Clip to range (-10, 10)
X = xp.minimum(X, 10., X)
X = xp.maximum(X, -10., X)
Y = 1. / (1. + xp.exp(-X))
def logistic_bwd(dY, sgd=None):
dX = dY * (Y * (1-Y))
return dX
return Y, logistic_bwd
@layerize
def add_tuples(X, drop=0.):
"""Give inputs of sequence pairs, where each sequence is (vals, length),
sum the values, returning a single sequence.
If input is:
((vals1, length), (vals2, length)
Output is:
(vals1+vals2, length)
vals are a single tensor for the whole batch.
"""
(vals1, length1), (vals2, length2) = X
assert length1 == length2
def add_tuples_bwd(dY, sgd=None):
return (dY, dY)
return (vals1+vals2, length1), add_tuples_bwd
def _zero_init(model):
def _zero_init_impl(self, X, y):
self.W.fill(0)
model.on_data_hooks.append(_zero_init_impl)
if model.W is not None:
model.W.fill(0.)
return model
@layerize
def _preprocess_doc(docs, drop=0.):
keys = [doc.to_array([LOWER]) for doc in docs]
keys = [a[:, 0] for a in keys]
ops = Model.ops
lengths = ops.asarray([arr.shape[0] for arr in keys])
keys = ops.xp.concatenate(keys)
vals = ops.allocate(keys.shape[0]) + 1
return (keys, vals, lengths), None
def _init_for_precomputed(W, ops):
if (W**2).sum() != 0.:
return
@ -27,6 +105,7 @@ def _init_for_precomputed(W, ops):
ops.xavier_uniform_init(reshaped)
W[:] = reshaped.reshape(W.shape)
@describe.on_data(_set_dimensions_if_needed)
@describe.attributes(
nI=Dimension("Input size"),
@ -130,25 +209,42 @@ class PrecomputableMaxouts(Model):
return dXf
return Yfp, backward
def drop_layer(layer, factor=2.):
def drop_layer_fwd(X, drop=0.):
if drop <= 0.:
return layer.begin_update(X, drop=drop)
else:
coinflip = layer.ops.xp.random.random()
if (coinflip / factor) >= drop:
return layer.begin_update(X, drop=drop)
else:
return X, lambda dX, sgd=None: dX
model = wrap(drop_layer_fwd, layer)
model.predict = layer
return model
def Tok2Vec(width, embed_size, preprocess=None):
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower')
norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower')
prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix')
suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix')
shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape')
embed = (norm | prefix | suffix | shape )
embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3))
tok2vec = (
with_flatten(
asarray(Model.ops, dtype='uint64')
>> embed
>> Maxout(width, width*4, pieces=3)
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)),
pad=4)
>> uniqued(embed, column=5)
>> drop_layer(
Residual(
(ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
)
) ** 4, pad=4
)
)
if preprocess not in (False, None):
tok2vec = preprocess >> tok2vec
@ -243,7 +339,8 @@ def zero_init(model):
def doc2feats(cols=None):
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
if cols is None:
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
def forward(docs, drop=0.):
feats = []
for doc in docs:
@ -269,6 +366,45 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):
return vectors, backward
def fine_tune(embedding, combine=None):
if combine is not None:
raise NotImplementedError(
"fine_tune currently only supports addition. Set combine=None")
def fine_tune_fwd(docs_tokvecs, drop=0.):
docs, tokvecs = docs_tokvecs
lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i')
vecs, bp_vecs = embedding.begin_update(docs, drop=drop)
flat_tokvecs = embedding.ops.flatten(tokvecs)
flat_vecs = embedding.ops.flatten(vecs)
output = embedding.ops.unflatten(
(model.mix[0] * flat_tokvecs + model.mix[1] * flat_vecs), lengths)
def fine_tune_bwd(d_output, sgd=None):
flat_grad = model.ops.flatten(d_output)
model.d_mix[0] += flat_tokvecs.dot(flat_grad.T).sum()
model.d_mix[1] += flat_vecs.dot(flat_grad.T).sum()
bp_vecs([d_o * model.mix[1] for d_o in d_output], sgd=sgd)
if sgd is not None:
sgd(model._mem.weights, model._mem.gradient, key=model.id)
return [d_o * model.mix[0] for d_o in d_output]
return output, fine_tune_bwd
def fine_tune_predict(docs_tokvecs):
docs, tokvecs = docs_tokvecs
vecs = embedding(docs)
return [model.mix[0]*tv+model.mix[1]*v
for tv, v in zip(tokvecs, vecs)]
model = wrap(fine_tune_fwd, embedding)
model.mix = model._mem.add((model.id, 'mix'), (2,))
model.mix.fill(0.5)
model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix'))
model.predict = fine_tune_predict
return model
@layerize
def flatten(seqs, drop=0.):
if isinstance(seqs[0], numpy.ndarray):
@ -282,3 +418,201 @@ def flatten(seqs, drop=0.):
return ops.unflatten(d_X, lengths)
X = ops.xp.vstack(seqs)
return X, finish_update
@layerize
def logistic(X, drop=0.):
xp = get_array_module(X)
if not isinstance(X, xp.ndarray):
X = xp.asarray(X)
# Clip to range (-10, 10)
X = xp.minimum(X, 10., X)
X = xp.maximum(X, -10., X)
Y = 1. / (1. + xp.exp(-X))
def logistic_bwd(dY, sgd=None):
dX = dY * (Y * (1-Y))
return dX
return Y, logistic_bwd
def zero_init(model):
def _zero_init_impl(self, X, y):
self.W.fill(0)
model.on_data_hooks.append(_zero_init_impl)
return model
@layerize
def preprocess_doc(docs, drop=0.):
keys = [doc.to_array([LOWER]) for doc in docs]
keys = [a[:, 0] for a in keys]
ops = Model.ops
lengths = ops.asarray([arr.shape[0] for arr in keys])
keys = ops.xp.concatenate(keys)
vals = ops.allocate(keys.shape[0]) + 1
return (keys, vals, lengths), None
def getitem(i):
def getitem_fwd(X, drop=0.):
return X[i], None
return layerize(getitem_fwd)
def build_tagger_model(nr_class, token_vector_width, **cfg):
embed_size = util.env_opt('embed_size', 7500)
with Model.define_operators({'>>': chain, '+': add}):
# Input: (doc, tensor) tuples
private_tok2vec = Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats())
model = (
fine_tune(private_tok2vec)
>> with_flatten(
Maxout(token_vector_width, token_vector_width)
>> Softmax(nr_class, token_vector_width)
)
)
model.nI = None
return model
@layerize
def SpacyVectors(docs, drop=0.):
xp = get_array_module(docs[0].vocab.vectors.data)
width = docs[0].vocab.vectors.data.shape[1]
batch = []
for doc in docs:
indices = numpy.zeros((len(doc),), dtype='i')
for i, word in enumerate(doc):
if word.orth in doc.vocab.vectors.key2row:
indices[i] = doc.vocab.vectors.key2row[word.orth]
else:
indices[i] = 0
vectors = doc.vocab.vectors.data[indices]
batch.append(vectors)
return batch, None
def foreach(layer, drop_factor=1.0):
'''Map a layer across elements in a list'''
def foreach_fwd(Xs, drop=0.):
drop *= drop_factor
ys = []
backprops = []
for X in Xs:
y, bp_y = layer.begin_update(X, drop=drop)
ys.append(y)
backprops.append(bp_y)
def foreach_bwd(d_ys, sgd=None):
d_Xs = []
for d_y, bp_y in zip(d_ys, backprops):
if bp_y is not None and d_y is not None:
d_Xs.append(bp_y(d_y, sgd=sgd))
else:
d_Xs.append(None)
return d_Xs
return ys, foreach_bwd
model = wrap(foreach_fwd, layer)
return model
def build_text_classifier(nr_class, width=64, **cfg):
nr_vector = cfg.get('nr_vector', 5000)
with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
'**': clone}):
if cfg.get('low_data'):
model = (
SpacyVectors
>> flatten_add_lengths
>> with_getitem(0,
Affine(width, 300)
)
>> ParametricAttention(width)
>> Pooling(sum_pool)
>> Residual(ReLu(width, width)) ** 2
>> zero_init(Affine(nr_class, width, drop_factor=0.0))
>> logistic
)
return model
lower = HashEmbed(width, nr_vector, column=1)
prefix = HashEmbed(width//2, nr_vector, column=2)
suffix = HashEmbed(width//2, nr_vector, column=3)
shape = HashEmbed(width//2, nr_vector, column=4)
trained_vectors = (
FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
>> with_flatten(
uniqued(
(lower | prefix | suffix | shape)
>> LN(Maxout(width, width+(width//2)*3)),
column=0
)
)
)
static_vectors = (
SpacyVectors
>> with_flatten(Affine(width, 300))
)
cnn_model = (
# TODO Make concatenate support lists
concatenate_lists(trained_vectors, static_vectors)
>> with_flatten(
LN(Maxout(width, width*2))
>> Residual(
(ExtractWindow(nW=1) >> zero_init(Maxout(width, width*3)))
) ** 2, pad=2
)
>> flatten_add_lengths
>> ParametricAttention(width)
>> Pooling(sum_pool)
>> Residual(zero_init(Maxout(width, width)))
>> zero_init(Affine(nr_class, width, drop_factor=0.0))
)
linear_model = (
_preprocess_doc
>> LinearModel(nr_class, drop_factor=0.)
)
model = (
(linear_model | cnn_model)
>> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
>> logistic
)
model.lsuv = False
return model
@layerize
def flatten(seqs, drop=0.):
ops = Model.ops
lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
def finish_update(d_X, sgd=None):
return ops.unflatten(d_X, lengths, pad=0)
X = ops.flatten(seqs, pad=0)
return X, finish_update
def concatenate_lists(*layers, **kwargs): # pragma: no cover
'''Compose two or more models `f`, `g`, etc, such that their outputs are
concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
'''
if not layers:
return noop()
drop_factor = kwargs.get('drop_factor', 1.0)
ops = layers[0].ops
layers = [chain(layer, flatten) for layer in layers]
concat = concatenate(*layers)
def concatenate_lists_fwd(Xs, drop=0.):
drop *= drop_factor
lengths = ops.asarray([len(X) for X in Xs], dtype='i')
flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
ys = ops.unflatten(flat_y, lengths)
def concatenate_lists_bwd(d_ys, sgd=None):
return bp_flat_y(ops.flatten(d_ys), sgd=sgd)
return ys, concatenate_lists_bwd
model = wrap(concatenate_lists_fwd, concat)
return model


@ -3,7 +3,7 @@
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
__title__ = 'spacy-nightly'
__version__ = '2.0.0a1'
__version__ = '2.0.0a13'
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
__uri__ = 'https://spacy.io'
__author__ = 'Explosion AI'


@ -2,5 +2,7 @@ from .download import download
from .info import info
from .link import link
from .package import package
from .profile import profile
from .train import train
from .convert import convert
from .model import model


@ -21,10 +21,10 @@ CONVERTERS = {
@plac.annotations(
input_file=("input file", "positional", None, str),
output_dir=("output directory for converted file", "positional", None, str),
n_sents=("Number of sentences per doc", "option", "n", float),
n_sents=("Number of sentences per doc", "option", "n", int),
morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
def convert(cmd, input_file, output_dir, n_sents, morphology):
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
"""
Convert files into JSON format for use with train command and other
experiment management functions.


@ -73,10 +73,10 @@ def generate_sentence(sent):
tokens = []
for i, id in enumerate(id_):
token = {}
token["orth"] = word[id]
token["tag"] = tag[id]
token["head"] = head[id] - i
token["dep"] = dep[id]
token["orth"] = word[i]
token["tag"] = tag[i]
token["head"] = head[i] - id
token["dep"] = dep[i]
tokens.append(token)
sentence["tokens"] = tokens
return sentence


@ -8,7 +8,7 @@ import subprocess
import sys
from .link import link
from ..util import prints
from ..util import prints, get_package_path
from .. import about
@ -24,24 +24,29 @@ def download(cmd, model, direct=False):
with version.
"""
if direct:
download_model('{m}/{m}.tar.gz'.format(m=model))
dl = download_model('{m}/{m}.tar.gz'.format(m=model))
else:
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
model_name = shortcuts.get(model, model)
compatibility = get_compatibility()
version = get_version(model_name, compatibility)
download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
try:
link(None, model_name, model, force=True)
except:
# Dirty, but since spacy.download and the auto-linking is mostly
# a convenience wrapper, it's best to show a success message and
# loading instructions, even if linking fails.
prints("Creating a shortcut link for 'en' didn't work (maybe you "
"don't have admin permissions?), but you can still load "
"the model via its full package name:",
"nlp = spacy.load('%s')" % model_name,
title="Download successful")
dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
if dl == 0:
try:
# Get package path here because link uses
# pip.get_installed_distributions() to check if model is a package,
# which fails if model was just installed via subprocess
package_path = get_package_path(model_name)
link(None, model_name, model, force=True, model_path=package_path)
except:
# Dirty, but since spacy.download and the auto-linking is mostly
# a convenience wrapper, it's best to show a success message and
# loading instructions, even if linking fails.
prints("Creating a shortcut link for 'en' didn't work (maybe you "
"don't have admin permissions?), but you can still load "
"the model via its full package name:",
"nlp = spacy.load('%s')" % model_name,
title="Download successful")
def get_json(url, desc):
@ -73,6 +78,6 @@ def get_version(model, comp):
def download_model(filename):
download_url = about.__download_url__ + '/' + filename
subprocess.call([sys.executable, '-m',
return subprocess.call([sys.executable, '-m',
'pip', 'install', '--no-cache-dir', download_url],
env=os.environ.copy())


@ -14,7 +14,7 @@ from .. import util
link_name=("name of shortuct link to create", "positional", None, str),
force=("force overwriting of existing link", "flag", "f", bool)
)
def link(cmd, origin, link_name, force=False):
def link(cmd, origin, link_name, force=False, model_path=None):
"""
Create a symlink for models within the spacy/data directory. Accepts
either the name of a pip package, or the local path to the model data
@ -23,7 +23,7 @@ def link(cmd, origin, link_name, force=False):
if util.is_package(origin):
model_path = util.get_package_path(origin)
else:
model_path = Path(origin)
model_path = Path(origin) if model_path is None else Path(model_path)
if not model_path.exists():
prints("The data should be located in %s" % path2str(model_path),
title="Can't locate model data", exits=1)

137
spacy/cli/model.py Normal file

@ -0,0 +1,137 @@
# coding: utf8
from __future__ import unicode_literals
import bz2
import gzip
import math
from ast import literal_eval
from pathlib import Path
import numpy as np
import spacy
from preshed.counter import PreshCounter
from .. import util
from ..compat import fix_text
def model(cmd, lang, model_dir, freqs_data, clusters_data, vectors_data,
min_doc_freq=5, min_word_freq=200):
model_path = Path(model_dir)
freqs_path = Path(freqs_data)
clusters_path = Path(clusters_data) if clusters_data else None
vectors_path = Path(vectors_data) if vectors_data else None
check_dirs(freqs_path, clusters_path, vectors_path)
vocab = util.get_lang_class(lang).Defaults.create_vocab()
nlp = spacy.blank(lang)
vocab = nlp.vocab
probs, oov_prob = read_probs(
freqs_path, min_doc_freq=int(min_doc_freq), min_freq=int(min_word_freq))
clusters = read_clusters(clusters_path) if clusters_path else {}
populate_vocab(vocab, clusters, probs, oov_prob)
add_vectors(vocab, vectors_path)
create_model(model_path, nlp)
def add_vectors(vocab, vectors_path):
with bz2.BZ2File(vectors_path.as_posix()) as f:
num_words, dim = next(f).split()
vocab.clear_vectors(int(dim))
for line in f:
word_w_vector = line.decode("utf8").strip().split(" ")
word = word_w_vector[0]
vector = np.array([float(val) for val in word_w_vector[1:]])
if word in vocab:
vocab.set_vector(word, vector)
def create_model(model_path, model):
if not model_path.exists():
model_path.mkdir()
model.to_disk(model_path.as_posix())
def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
counts = PreshCounter()
total = 0
freqs_file = check_unzip(freqs_path)
for i, line in enumerate(freqs_file):
freq, doc_freq, key = line.rstrip().split('\t', 2)
freq = int(freq)
counts.inc(i + 1, freq)
total += freq
counts.smooth()
log_total = math.log(total)
freqs_file = check_unzip(freqs_path)
probs = {}
for line in freqs_file:
freq, doc_freq, key = line.rstrip().split('\t', 2)
doc_freq = int(doc_freq)
freq = int(freq)
if doc_freq >= min_doc_freq and freq >= min_freq and len(
key) < max_length:
word = literal_eval(key)
smooth_count = counts.smoother(int(freq))
probs[word] = math.log(smooth_count) - log_total
oov_prob = math.log(counts.smoother(0)) - log_total
return probs, oov_prob
def read_clusters(clusters_path):
clusters = {}
with clusters_path.open() as f:
for line in f:
try:
cluster, word, freq = line.split()
word = fix_text(word)
except ValueError:
continue
# If the clusterer has only seen the word a few times, its
# cluster is unreliable.
if int(freq) >= 3:
clusters[word] = cluster
else:
clusters[word] = '0'
# Expand clusters with re-casing
for word, cluster in list(clusters.items()):
if word.lower() not in clusters:
clusters[word.lower()] = cluster
if word.title() not in clusters:
clusters[word.title()] = cluster
if word.upper() not in clusters:
clusters[word.upper()] = cluster
return clusters
def populate_vocab(vocab, clusters, probs, oov_prob):
for word, prob in reversed(
sorted(list(probs.items()), key=lambda item: item[1])):
lexeme = vocab[word]
lexeme.prob = prob
lexeme.is_oov = False
# Decode as a little-endian string, so that we can do & 15 to get
# the first 4 bits. See _parse_features.pyx
if word in clusters:
lexeme.cluster = int(clusters[word][::-1], 2)
else:
lexeme.cluster = 0
def check_unzip(file_path):
file_path_str = file_path.as_posix()
if file_path_str.endswith('gz'):
return gzip.open(file_path_str)
else:
return file_path.open()
def check_dirs(freqs_data, clusters_data, vectors_data):
if not freqs_data.is_file():
util.sys_exit(freqs_data.as_posix(), title="No frequencies file found")
if clusters_data and not clusters_data.is_file():
util.sys_exit(
clusters_data.as_posix(), title="No Brown clusters file found")
if vectors_data and not vectors_data.is_file():
util.sys_exit(
vectors_data.as_posix(), title="No word vectors file found")
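For reference, a minimal sketch of inputs in the formats that `read_probs`, `read_clusters` and `add_vectors` above appear to parse; the file names and values here are made up.

import bz2

# Frequencies: "<freq>\t<doc freq>\t<word as a Python literal>" per line.
with open('freqs.txt', 'w') as f_:
    f_.write("1200\t300\t'the'\n")
    f_.write("450\t220\t'cat'\n")

# Brown clusters: "<bit-string cluster> <word> <freq>" per line; words seen
# fewer than 3 times are mapped to cluster '0' by read_clusters().
with open('clusters.txt', 'w') as f_:
    f_.write("1110 the 1200\n")
    f_.write("0101 cat 450\n")

# Vectors: bz2-compressed, a "<num words> <dim>" header, then "<word> <floats>".
with bz2.BZ2File('vectors.bz2', 'w') as f_:
    f_.write(b"2 3\n")
    f_.write(b"the 0.1 0.2 0.3\n")
    f_.write(b"cat 0.4 0.5 0.6\n")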


@ -15,10 +15,11 @@ from .. import about
@plac.annotations(
input_dir=("directory with model data", "positional", None, str),
output_dir=("output parent directory", "positional", None, str),
meta=("path to meta.json", "option", "m", str),
meta_path=("path to meta.json", "option", "m", str),
create_meta=("create meta.json, even if one exists in directory", "flag", "c", bool),
force=("force overwriting of existing folder in output directory", "flag", "f", bool)
)
def package(cmd, input_dir, output_dir, meta=None, force=False):
def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False):
"""
Generate Python package for model data, including meta and required
installation files. A new directory will be created in the specified
@ -26,7 +27,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False):
"""
input_path = util.ensure_path(input_dir)
output_path = util.ensure_path(output_dir)
meta_path = util.ensure_path(meta)
meta_path = util.ensure_path(meta_path)
if not input_path or not input_path.exists():
prints(input_path, title="Model directory not found", exits=1)
if not output_path or not output_path.exists():
@ -38,7 +39,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False):
template_manifest = get_template('MANIFEST.in')
template_init = get_template('xx_model_name/__init__.py')
meta_path = meta_path or input_path / 'meta.json'
if meta_path.is_file():
if not create_meta and meta_path.is_file():
prints(meta_path, title="Reading meta.json from file")
meta = util.read_json(meta_path)
else:
@ -100,7 +101,7 @@ def generate_meta():
def generate_pipeline():
prints("If set to 'True', the default pipeline is used. If set to 'False', "
"the pipeline will be disabled. Components should be specified as a "
"comma-separated list of component names, e.g. vectorizer, tagger, "
"comma-separated list of component names, e.g. tensorizer, tagger, "
"parser, ner. For more information, see the docs on processing pipelines.",
title="Enter your model's pipeline components")
pipeline = util.get_raw_input("Pipeline components", True)

45
spacy/cli/profile.py Normal file

@ -0,0 +1,45 @@
# coding: utf8
from __future__ import unicode_literals, division, print_function
import plac
from pathlib import Path
import ujson
import cProfile
import pstats
import spacy
import sys
import tqdm
import cytoolz
def read_inputs(loc):
if loc is None:
file_ = sys.stdin
file_ = (line.encode('utf8') for line in file_)
else:
file_ = Path(loc).open()
for line in file_:
data = ujson.loads(line)
text = data['text']
yield text
@plac.annotations(
lang=("model/language", "positional", None, str),
inputs=("Location of input file", "positional", None, read_inputs)
)
def profile(cmd, lang, inputs=None):
"""
Profile a spaCy pipeline, to find out which functions take the most time.
"""
nlp = spacy.load(lang)
texts = list(cytoolz.take(10000, inputs))
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
s = pstats.Stats("Profile.prof")
s.strip_dirs().sort_stats("time").print_stats()
def parse_texts(nlp, texts):
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=128):
pass
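A hedged sketch of the newline-delimited JSON input that `read_inputs` above expects, one object per line with a "text" field (the file name is made up):

import ujson

with open('inputs.jsonl', 'w') as f_:
    for text in ("This is a sentence.", "This is another one."):
        f_.write(ujson.dumps({'text': text}) + '\n')

# The profiler can then be pointed at the file, e.g.:
#     python -m spacy profile en inputs.jsonl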


@ -32,10 +32,12 @@ from ..compat import json_dumps
resume=("Whether to resume training", "flag", "R", bool),
no_tagger=("Don't train tagger", "flag", "T", bool),
no_parser=("Don't train parser", "flag", "P", bool),
no_entities=("Don't train NER", "flag", "N", bool)
no_entities=("Don't train NER", "flag", "N", bool),
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
)
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False):
use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False,
gold_preproc=False):
"""
Train a model. Expects data in spaCy's JSON format.
"""
@ -69,7 +71,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
util.env_opt('batch_to', 64),
util.env_opt('batch_compound', 1.001))
gold_preproc = util.env_opt('gold_preproc', False)
noise_level = util.env_opt('noise_level', 0.25)
noise_level = util.env_opt('noise_level', 0.0)
if resume:
prints(output_path / 'model19.pickle', title="Resuming training")
@ -95,15 +97,14 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
for batch in minibatch(train_docs, size=batch_sizes):
docs, golds = zip(*batch)
nlp.update(docs, golds, sgd=optimizer,
drop=next(dropout_rates), losses=losses)
drop=next(dropout_rates), losses=losses,
update_shared=True)
pbar.update(sum(len(doc) for doc in docs))
with nlp.use_params(optimizer.averages):
util.set_env_log(False)
epoch_model_path = output_path / ('model%d' % i)
nlp.to_disk(epoch_model_path)
with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
dill.dump(nlp, file_, -1)
nlp_loaded = lang_class(pipeline=pipeline)
nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
scorer = nlp_loaded.evaluate(
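Since `noise_level` is read through `util.env_opt`, the old value can presumably still be restored per run with an environment variable; a minimal sketch, assuming the `SPACY_`-prefixed lookup that `env_opt` uses:

import os
os.environ['SPACY_NOISE_LEVEL'] = '0.25'  # override the new 0.0 default for this process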


@ -5,6 +5,7 @@ import six
import ftfy
import sys
import ujson
import itertools
from thinc.neural.util import copy_array
@ -35,6 +36,7 @@ CudaStream = CudaStream
cupy = cupy
fix_text = ftfy.fix_text
copy_array = copy_array
izip = getattr(itertools, 'izip', zip)
is_python2 = six.PY2
is_python3 = six.PY3
@ -44,21 +46,31 @@ is_osx = sys.platform == 'darwin'
if is_python2:
import imp
bytes_ = str
unicode_ = unicode
basestring_ = basestring
input_ = raw_input
json_dumps = lambda data: ujson.dumps(data, indent=2).decode('utf8')
json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False).decode('utf8')
path2str = lambda path: str(path).decode('utf8')
elif is_python3:
import importlib.util
bytes_ = bytes
unicode_ = str
basestring_ = str
input_ = input
json_dumps = lambda data: ujson.dumps(data, indent=2)
json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False)
path2str = lambda path: str(path)
def b_to_str(b_str):
if is_python2:
return b_str
# important: if no encoding is set, string becomes "b'...'"
return str(b_str, encoding='utf8')
def getattr_(obj, name, *default):
if is_python3 and isinstance(name, bytes):
name = name.decode('utf8')
@ -92,3 +104,12 @@ def normalize_string_keys(old):
return new
def import_file(name, loc):
loc = str(loc)
if is_python2:
return imp.load_source(name, loc)
else:
spec = importlib.util.spec_from_file_location(name, str(loc))
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
return module


@ -15,7 +15,7 @@ def depr_model_download(lang):
lang (unicode): Language shortcut, 'en' or 'de'.
"""
prints("The spacy.%s.download command is now deprecated. Please use "
"python -m spacy download [model name or shortcut] instead. For "
"spacy download [model name or shortcut] instead. For "
"more info, see the documentation:" % lang,
about.__docs_models__,
"Downloading default '%s' model now..." % lang,


@ -3,6 +3,7 @@ from __future__ import unicode_literals
from .render import DependencyRenderer, EntityRenderer
from ..tokens import Doc
from ..compat import b_to_str
from ..util import prints, is_in_jupyter
@ -65,7 +66,9 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
def app(environ, start_response):
start_response('200 OK', [('Content-type', 'text/html; charset=utf-8')])
# headers and status need to be bytes in Python 2, see #1227
headers = [(b_to_str(b'Content-type'), b_to_str(b'text/html; charset=utf-8'))]
start_response(b_to_str(b'200 OK'), headers)
res = _html['parsed'].encode(encoding='utf-8')
return [res]


@ -60,7 +60,7 @@ GLOSSARY = {
'JJR': 'adjective, comparative',
'JJS': 'adjective, superlative',
'LS': 'list item marker',
'MD': 'verb, modal auxillary',
'MD': 'verb, modal auxiliary',
'NIL': 'missing tag',
'NN': 'noun, singular or mass',
'NNP': 'noun, proper singular',
@ -91,7 +91,7 @@ GLOSSARY = {
'NFP': 'superfluous punctuation',
'GW': 'additional word in multi-word expression',
'XX': 'unknown',
'BES': 'auxillary "be"',
'BES': 'auxiliary "be"',
'HVS': 'forms of "have"',


@ -9,6 +9,7 @@ cdef struct GoldParseC:
int* tags
int* heads
int* has_dep
int* sent_start
attr_t* labels
int** brackets
Transition* ner
@ -29,6 +30,7 @@ cdef class GoldParse:
cdef public list ner
cdef public list ents
cdef public dict brackets
cdef public object cats
cdef readonly list cand_to_gold
cdef readonly list gold_to_cand


@ -381,7 +381,8 @@ cdef class GoldParse:
make_projective=make_projective)
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
deps=None, entities=None, make_projective=False):
deps=None, entities=None, make_projective=False,
cats=tuple()):
"""Create a GoldParse.
doc (Doc): The document the annotations refer to.
@ -392,6 +393,12 @@ cdef class GoldParse:
entities (iterable): A sequence of named entity annotations, either as
BILUO tag strings, or as `(start_char, end_char, label)` tuples,
representing the entity positions.
cats (iterable): A sequence of labels for text classification. Each
label may be a string or an int, or a `(start_char, end_char, label)`
tuple, indicating that the label is applied to only part of the
document (usually a sentence). Unlike entity annotations, label
annotations can overlap, i.e. a single word can be covered by
multiple labelled spans.
RETURNS (GoldParse): The newly constructed object.
"""
if words is None:
@ -399,11 +406,11 @@ cdef class GoldParse:
if tags is None:
tags = [None for _ in doc]
if heads is None:
heads = [token.i for token in doc]
heads = [None for token in doc]
if deps is None:
deps = [None for _ in doc]
if entities is None:
entities = ['-' for _ in doc]
entities = [None for _ in doc]
elif len(entities) == 0:
entities = ['O' for _ in doc]
elif not isinstance(entities[0], basestring):
@ -419,8 +426,10 @@ cdef class GoldParse:
self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
self.cats = list(cats)
self.words = [None] * len(doc)
self.tags = [None] * len(doc)
self.heads = [None] * len(doc)
@ -474,8 +483,12 @@ cdef class GoldParse:
"""
return not nonproj.is_nonproj_tree(self.heads)
@property
def sent_starts(self):
return [self.c.sent_start[i] for i in range(self.length)]
def biluo_tags_from_offsets(doc, entities):
def biluo_tags_from_offsets(doc, entities, missing='O'):
"""Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
scheme (BILUO).
@ -527,7 +540,7 @@ def biluo_tags_from_offsets(doc, entities):
if i in entity_chars:
break
else:
biluo[token.i] = 'O'
biluo[token.i] = missing
return biluo
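A brief usage sketch of the new `cats` argument and the `missing` parameter shown above, assuming a loaded English pipeline `nlp` (the text and offsets are illustrative):

from spacy.gold import GoldParse, biluo_tags_from_offsets

doc = nlp(u'London is big')
gold = GoldParse(doc, cats=['POSITIVE'])    # text classification labels
tags = biluo_tags_from_offsets(doc, [(0, 6, 'GPE')], missing=None)
# tokens outside the given spans get the configurable missing tag:
# tags == ['U-GPE', None, None]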


@ -27,7 +27,7 @@ ALPHA_UPPER = merge_char_classes(_upper + _uncased)
_units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft '
'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
'TB T G M K')
'TB T G M K %')
_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
_punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
_quotes = r'\' \'\' " ” “ `` ` ´ , „ » «'

18
spacy/lang/da/examples.py Normal file

@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.da.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple overvejer at købe et britisk startup for 1 milliard dollar",
"Selvkørende biler flytter forsikringsansvaret over på producenterne",
"San Francisco overvejer at forbyde leverandørrobotter på fortov",
"London er en stor by i Storbritannien"
]

22
spacy/lang/de/examples.py Normal file

@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.de.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen",
"Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz",
"Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz",
"Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion",
"San Francisco erwägt Verbot von Lieferrobotern",
"Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller",
"Wo bist du?",
"Was ist die Hauptstadt von Deutschland?"
]

22
spacy/lang/en/examples.py Normal file

@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.en.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple is looking at buying U.K. startup for $1 billion",
"Autonomous cars shift insurance liability toward manufacturers",
"San Francisco considers banning sidewalk delivery robots",
"London is a big city in the United Kingdom.",
"Where are you?",
"Who is the president of France?",
"What is the capital of the United States?",
"When was Barack Obama born?"
]


@ -59,7 +59,8 @@ MORPH_RULES = {
"VBP": {
"are": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
"'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}
"'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
"am": {LEMMA: "be", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind"},
},
"VBD": {


@ -232,7 +232,10 @@ for verb_data in [
{ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2},
{ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"},
{ORTH: "was", LEMMA: "be", NORM: "was"},
{ORTH: "were", LEMMA: "be", NORM: "were"}]:
{ORTH: "were", LEMMA: "be", NORM: "were"},
{ORTH: "have", NORM: "have"},
{ORTH: "has", LEMMA: "have", NORM: "has"},
{ORTH: "dare", NORM: "dare"}]:
verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]:

22
spacy/lang/es/examples.py Normal file

@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.es.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple está buscando comprar una startup del Reino Unido por mil millones de dólares",
"Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes",
"San Francisco analiza prohibir los robots delivery",
"Londres es una gran ciudad del Reino Unido",
"El gato come pescado",
"Veo al hombre con el telescopio",
"La araña come moscas",
"El pingüino incuba en su nido"
]


@ -5,6 +5,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
@ -24,6 +25,7 @@ class FrenchDefaults(Language.Defaults):
infixes = tuple(TOKENIZER_INFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
token_match = TOKEN_MATCH
syntax_iterators = dict(SYNTAX_ITERATORS)
@classmethod
def create_lemmatizer(cls, nlp=None):

File diff suppressed because it is too large

26
spacy/lang/fr/examples.py Normal file

@ -0,0 +1,26 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.fr.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple cherche à acheter une startup anglaise pour 1 milliard de dollars",
"Les voitures autonomes voient leur assurances décalées vers les constructeurs",
"San Francisco envisage d'interdire les robots coursiers",
"Londres est une grande ville du Royaume-Uni",
"L'Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d'Europe",
"Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon",
"La France ne devrait pas manquer d'électricité cet été, même en cas de canicule",
"Nouvelles attaques de Trump contre le maire de Londres",
"Où es-tu ?",
"Qui est le président de la France ?",
"Où est la capitale des Etats-Unis ?",
"Quand est né Barack Obama ?"
]


@ -0,0 +1,42 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON
def noun_chunks(obj):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
doc = obj.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add('conj')
np_label = doc.vocab.strings.add('NP')
seen = set()
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.i in seen:
continue
if word.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
elif word.dep == conj:
head = word.head
while head.dep == conj and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
SYNTAX_ITERATORS = {
'noun_chunks': noun_chunks
}
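For context, a hedged sketch of how a registered `noun_chunks` iterator is consumed through `Doc.noun_chunks`, assuming a French pipeline with a parser is loaded as `nlp`:

doc = nlp(u"Londres est une grande ville du Royaume-Uni")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.dep_)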

28
spacy/lang/he/examples.py Normal file

@ -0,0 +1,28 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.he.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
'סין מקימה קרן של 440 מיליון דולר להשקעה בהייטק בישראל',
'רה"מ הודיע כי יחרים טקס בחסותו',
'הכנסת צפויה לאשר איכון אוטומטי של שיחות למוקד 100',
'תוכנית לאומית תהפוך את ישראל למעצמה דיגיטלית',
'סע לשלום, המפתחות בפנים.',
'מלצר, פעמיים טורקי!',
'ואהבת לרעך כמוך.',
'היום נעשה משהו בלתי נשכח.',
'איפה הילד?',
'מיהו נשיא צרפת?',
'מהי בירת ארצות הברית?',
"איך קוראים בעברית לצ'ופצ'יק של הקומקום?",
'מה הייתה הדקה?',
'מי אומר שלום ראשון, זה שעולה או זה שיורד?'
]

42
spacy/lang/id/__init__.py Normal file

@ -0,0 +1,42 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from .lemmatizer import LOOKUP
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG
from ...util import update_exc
class IndonesianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'id'
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES)
syntax_iterators = dict(SYNTAX_ITERATORS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class Indonesian(Language):
lang = 'id'
Defaults = IndonesianDefaults
__all__ = ['Indonesian']

File diff suppressed because it is too large

22
spacy/lang/id/examples.py Normal file

@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.id.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Al Qaidah mengklaim bom mobil yang menewaskan 60 Orang di Mali",
"Abu Sayyaf mengeksekusi sandera warga Filipina",
"Penyaluran pupuk berasal dari lima lokasi yakni Bontang, Kalimantan Timur, Surabaya, Banyuwangi, Semarang, dan Makassar.",
"PT Pupuk Kaltim telah menyalurkan 274.707 ton pupuk bersubsidi ke wilayah penyaluran di 14 provinsi.",
"Jakarta adalah kota besar yang nyaris tidak pernah tidur.",
"Kamu ada di mana semalam?",
"Siapa yang membeli makanan ringan tersebut?",
"Siapa presiden pertama Republik Indonesia?"
]

36883
spacy/lang/id/lemmatizer.py Normal file

File diff suppressed because it is too large


@ -0,0 +1,42 @@
# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM
_num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen',
'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty',
'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety',
'hundred', 'thousand', 'million', 'billion', 'trillion', 'quadrillion',
'gajillion', 'bazillion',
'nol', 'satu', 'dua', 'tiga', 'empat', 'lima', 'enam', 'tujuh',
'delapan', 'sembilan', 'sepuluh', 'sebelas', 'duabelas', 'tigabelas',
'empatbelas', 'limabelas', 'enambelas', 'tujuhbelas', 'delapanbelas',
'sembilanbelas', 'duapuluh', 'seratus', 'seribu', 'sejuta',
'ribu', 'rb', 'juta', 'jt', 'miliar', 'biliun', 'triliun',
'kuadriliun', 'kuintiliun', 'sekstiliun', 'septiliun', 'oktiliun',
'noniliun', 'desiliun',
]
def like_num(text):
text = text.replace(',', '').replace('.', '')
if text.isdigit():
return True
if text.count('/') == 1:
num, denom = text.split('/')
if num.isdigit() and denom.isdigit():
return True
if text in _num_words:
return True
if text.count('-') == 1:
_, num = text.split('-')
if num.isdigit() or num in _num_words:
return True
return False
LEX_ATTRS = {
LIKE_NUM: like_num
}
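A few illustrative checks of `like_num` as defined above, assumed to run in the same module:

assert like_num('10.000')      # digits once separators are stripped
assert like_num('2/3')         # simple fractions
assert like_num('seribu')      # Indonesian number word
assert like_num('ke-7')        # hyphenated ordinal-style forms
assert not like_num('kucing')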


@ -0,0 +1,17 @@
# coding: utf8
from __future__ import unicode_literals
_exc = {
"Rp": "$",
"IDR": "$",
"RMB": "$",
"USD": "$",
"AUD": "$",
"GBP": "$",
}
NORM_EXCEPTIONS = {}
for string, norm in _exc.items():
NORM_EXCEPTIONS[string] = norm
NORM_EXCEPTIONS[string.title()] = norm


@ -0,0 +1,53 @@
# coding: utf8
from __future__ import unicode_literals
from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from ..char_classes import merge_chars, split_chars, _currency, _units
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER, HYPHENS
_units = (_units + 's bit Gbps Mbps mbps Kbps kbps ƒ ppi px '
'Hz kHz MHz GHz mAh '
'ratus rb ribu ribuan '
'juta jt jutaan mill?iar million bil[l]?iun bilyun billion '
)
_currency = (_currency + r' USD Rp IDR RMB SGD S\$')
_months = ('Januari Februari Maret April Mei Juni Juli Agustus September '
'Oktober November Desember January February March May June '
'July August October December Jan Feb Mar Jun Jul Aug Sept '
'Oct Okt Nov Des ')
UNITS = merge_chars(_units)
CURRENCY = merge_chars(_currency)
HTML_PREFIX = r'<(b|strong|i|em|p|span|div|br)\s?/>|<a([^>]+)>'
HTML_SUFFIX = r'</(b|strong|i|em|p|span|div|a)>'
MONTHS = merge_chars(_months)
LIST_CURRENCY = split_chars(_currency)
TOKENIZER_PREFIXES.remove('#') # hashtag
_prefixes = TOKENIZER_PREFIXES + LIST_CURRENCY + [HTML_PREFIX] + ['/', '']
_suffixes = TOKENIZER_SUFFIXES + [r'\-[Nn]ya', '-[KkMm]u', '[—-]'] + [
r'(?<={c})(?:[0-9]+)'.format(c=CURRENCY),
r'(?<=[0-9])(?:{u})'.format(u=UNITS),
r'(?<=[0-9])%',
r'(?<=[0-9{a}]{h})(?:[\.,:-])'.format(a=ALPHA, h=HTML_SUFFIX),
r'(?<=[0-9{a}])(?:{h})'.format(a=ALPHA, h=HTML_SUFFIX),
]
_infixes = TOKENIZER_INFIXES + [
r'(?<=[0-9])[\\/](?=[0-9%-])',
r'(?<=[0-9])%(?=[{a}0-9/])'.format(a=ALPHA),
r'(?<={u})[\/-](?=[0-9])'.format(u=UNITS),
r'(?<={m})[\/-](?=[0-9])'.format(m=MONTHS),
r'(?<=[0-9\)][\.,])"(?=[0-9])',
r'(?<=[{a}\)][\.,\'])["—](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}])-(?=[0-9])'.format(a=ALPHA),
r'(?<=[0-9])-(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}])[\/-](?={c}{a})'.format(a=ALPHA, c=CURRENCY),
]
TOKENIZER_PREFIXES = _prefixes
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes

763
spacy/lang/id/stop_words.py Normal file

@ -0,0 +1,763 @@
# coding: utf8
from __future__ import unicode_literals
STOP_WORDS = set("""
ada
adalah
adanya
adapun
agak
agaknya
agar
akan
akankah
akhir
akhiri
akhirnya
aku
akulah
amat
amatlah
anda
andalah
antar
antara
antaranya
apa
apaan
apabila
apakah
apalagi
apatah
artinya
asal
asalkan
atas
atau
ataukah
ataupun
awal
awalnya
bagai
bagaikan
bagaimana
bagaimanakah
bagaimanapun
bagi
bagian
bahkan
bahwa
bahwasanya
baik
bakal
bakalan
balik
banyak
bapak
baru
bawah
beberapa
begini
beginian
beginikah
beginilah
begitu
begitukah
begitulah
begitupun
bekerja
belakang
belakangan
belum
belumlah
benar
benarkah
benarlah
berada
berakhir
berakhirlah
berakhirnya
berapa
berapakah
berapalah
berapapun
berarti
berawal
berbagai
berdatangan
beri
berikan
berikut
berikutnya
berjumlah
berkali-kali
berkata
berkehendak
berkeinginan
berkenaan
berlainan
berlalu
berlangsung
berlebihan
bermacam
bermacam-macam
bermaksud
bermula
bersama
bersama-sama
bersiap
bersiap-siap
bertanya
bertanya-tanya
berturut
berturut-turut
bertutur
berujar
berupa
besar
betul
betulkah
biasa
biasanya
bila
bilakah
bisa
bisakah
boleh
bolehkah
bolehlah
buat
bukan
bukankah
bukanlah
bukannya
bulan
bung
cara
caranya
cukup
cukupkah
cukuplah
cuma
dahulu
dalam
dan
dapat
dari
daripada
datang
dekat
demi
demikian
demikianlah
dengan
depan
di
dia
diakhiri
diakhirinya
dialah
diantara
diantaranya
diberi
diberikan
diberikannya
dibuat
dibuatnya
didapat
didatangkan
digunakan
diibaratkan
diibaratkannya
diingat
diingatkan
diinginkan
dijawab
dijelaskan
dijelaskannya
dikarenakan
dikatakan
dikatakannya
dikerjakan
diketahui
diketahuinya
dikira
dilakukan
dilalui
dilihat
dimaksud
dimaksudkan
dimaksudkannya
dimaksudnya
diminta
dimintai
dimisalkan
dimulai
dimulailah
dimulainya
dimungkinkan
dini
dipastikan
diperbuat
diperbuatnya
dipergunakan
diperkirakan
diperlihatkan
diperlukan
diperlukannya
dipersoalkan
dipertanyakan
dipunyai
diri
dirinya
disampaikan
disebut
disebutkan
disebutkannya
disini
disinilah
ditambahkan
ditandaskan
ditanya
ditanyai
ditanyakan
ditegaskan
ditujukan
ditunjuk
ditunjuki
ditunjukkan
ditunjukkannya
ditunjuknya
dituturkan
dituturkannya
diucapkan
diucapkannya
diungkapkan
dong
dua
dulu
empat
enggak
enggaknya
entah
entahlah
guna
gunakan
hal
hampir
hanya
hanyalah
hari
harus
haruslah
harusnya
hendak
hendaklah
hendaknya
hingga
ia
ialah
ibarat
ibaratkan
ibaratnya
ibu
ikut
ingat
ingat-ingat
ingin
inginkah
inginkan
ini
inikah
inilah
itu
itukah
itulah
jadi
jadilah
jadinya
jangan
jangankan
janganlah
jauh
jawab
jawaban
jawabnya
jelas
jelaskan
jelaslah
jelasnya
jika
jikalau
juga
jumlah
jumlahnya
justru
kala
kalau
kalaulah
kalaupun
kalian
kami
kamilah
kamu
kamulah
kan
kapan
kapankah
kapanpun
karena
karenanya
kasus
kata
katakan
katakanlah
katanya
ke
keadaan
kebetulan
kecil
kedua
keduanya
keinginan
kelamaan
kelihatan
kelihatannya
kelima
keluar
kembali
kemudian
kemungkinan
kemungkinannya
kenapa
kepada
kepadanya
kesampaian
keseluruhan
keseluruhannya
keterlaluan
ketika
khususnya
kini
kinilah
kira
kira-kira
kiranya
kita
kitalah
kok
kurang
lagi
lagian
lah
lain
lainnya
lalu
lama
lamanya
lanjut
lanjutnya
lebih
lewat
lima
luar
macam
maka
makanya
makin
malah
malahan
mampu
mampukah
mana
manakala
manalagi
masa
masalah
masalahnya
masih
masihkah
masing
masing-masing
mau
maupun
melainkan
melakukan
melalui
melihat
melihatnya
memang
memastikan
memberi
memberikan
membuat
memerlukan
memihak
meminta
memintakan
memisalkan
memperbuat
mempergunakan
memperkirakan
memperlihatkan
mempersiapkan
mempersoalkan
mempertanyakan
mempunyai
memulai
memungkinkan
menaiki
menambahkan
menandaskan
menanti
menanti-nanti
menantikan
menanya
menanyai
menanyakan
mendapat
mendapatkan
mendatang
mendatangi
mendatangkan
menegaskan
mengakhiri
mengapa
mengatakan
mengatakannya
mengenai
mengerjakan
mengetahui
menggunakan
menghendaki
mengibaratkan
mengibaratkannya
mengingat
mengingatkan
menginginkan
mengira
mengucapkan
mengucapkannya
mengungkapkan
menjadi
menjawab
menjelaskan
menuju
menunjuk
menunjuki
menunjukkan
menunjuknya
menurut
menuturkan
menyampaikan
menyangkut
menyatakan
menyebutkan
menyeluruh
menyiapkan
merasa
mereka
merekalah
merupakan
meski
meskipun
meyakini
meyakinkan
minta
mirip
misal
misalkan
misalnya
mula
mulai
mulailah
mulanya
mungkin
mungkinkah
nah
naik
namun
nanti
nantinya
nyaris
nyatanya
oleh
olehnya
pada
padahal
padanya
pak
paling
panjang
pantas
para
pasti
pastilah
penting
pentingnya
per
percuma
perlu
perlukah
perlunya
pernah
persoalan
pertama
pertama-tama
pertanyaan
pertanyakan
pihak
pihaknya
pukul
pula
pun
punya
rasa
rasanya
rata
rupanya
saat
saatnya
saja
sajalah
saling
sama
sama-sama
sambil
sampai
sampai-sampai
sampaikan
sana
sangat
sangatlah
satu
saya
sayalah
se
sebab
sebabnya
sebagai
sebagaimana
sebagainya
sebagian
sebaik
sebaik-baiknya
sebaiknya
sebaliknya
sebanyak
sebegini
sebegitu
sebelum
sebelumnya
sebenarnya
seberapa
sebesar
sebetulnya
sebisanya
sebuah
sebut
sebutlah
sebutnya
secara
secukupnya
sedang
sedangkan
sedemikian
sedikit
sedikitnya
seenaknya
segala
segalanya
segera
seharusnya
sehingga
seingat
sejak
sejauh
sejenak
sejumlah
sekadar
sekadarnya
sekali
sekali-kali
sekalian
sekaligus
sekalipun
sekarang
sekecil
seketika
sekiranya
sekitar
sekitarnya
sekurang-kurangnya
sekurangnya
sela
selain
selaku
selalu
selama
selama-lamanya
selamanya
selanjutnya
seluruh
seluruhnya
semacam
semakin
semampu
semampunya
semasa
semasih
semata
semata-mata
semaunya
sementara
semisal
semisalnya
sempat
semua
semuanya
semula
sendiri
sendirian
sendirinya
seolah
seolah-olah
seorang
sepanjang
sepantasnya
sepantasnyalah
seperlunya
seperti
sepertinya
sepihak
sering
seringnya
serta
serupa
sesaat
sesama
sesampai
sesegera
sesekali
seseorang
sesuatu
sesuatunya
sesudah
sesudahnya
setelah
setempat
setengah
seterusnya
setiap
setiba
setibanya
setidak-tidaknya
setidaknya
setinggi
seusai
sewaktu
siap
siapa
siapakah
siapapun
sini
sinilah
soal
soalnya
suatu
sudah
sudahkah
sudahlah
supaya
tadi
tadinya
tahu
tahun
tak
tambah
tambahnya
tampak
tampaknya
tandas
tandasnya
tanpa
tanya
tanyakan
tanyanya
tapi
tegas
tegasnya
telah
tempat
tengah
tentang
tentu
tentulah
tentunya
tepat
terakhir
terasa
terbanyak
terdahulu
terdapat
terdiri
terhadap
terhadapnya
teringat
teringat-ingat
terjadi
terjadilah
terjadinya
terkira
terlalu
terlebih
terlihat
termasuk
ternyata
tersampaikan
tersebut
tersebutlah
tertentu
tertuju
terus
terutama
tetap
tetapi
tiap
tiba
tiba-tiba
tidak
tidakkah
tidaklah
tiga
tinggi
toh
tunjuk
turut
tutur
tuturnya
ucap
ucapnya
ujar
ujarnya
umum
umumnya
ungkap
ungkapnya
untuk
usah
usai
waduh
wah
wahai
waktu
waktunya
walau
walaupun
wong
yaitu
yakin
yakni
yang
""".split())

View File

@ -0,0 +1,42 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON
def noun_chunks(obj):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
doc = obj.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add('conj')
np_label = doc.vocab.strings.add('NP')
seen = set()
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.i in seen:
continue
if word.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
elif word.dep == conj:
head = word.head
while head.dep == conj and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
SYNTAX_ITERATORS = {
'noun_chunks': noun_chunks
}
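Once registered via SYNTAX_ITERATORS, the generator above backs `Doc.noun_chunks`. A hedged usage sketch, assuming a dependency-parsed Indonesian Doc is available (the model name below is hypothetical; no Indonesian parser ships with spaCy at this point):

# Hypothetical model name; any parsed Doc for a language that defines
# SYNTAX_ITERATORS['noun_chunks'] behaves the same way.
import spacy

nlp = spacy.load('id_model')
doc = nlp('Budi membeli sebuah mobil baru di Jakarta.')
for chunk in doc.noun_chunks:       # backed by noun_chunks() above
    print(chunk.text, chunk.root.dep_)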

View File

@ -0,0 +1,50 @@
# coding: utf8
from __future__ import unicode_literals
import regex as re
from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS
from ..tokenizer_exceptions import URL_PATTERN
from ...symbols import ORTH
_exc = {}
for orth in ID_BASE_EXCEPTIONS:
_exc[orth] = [{ORTH: orth}]
orth_title = orth.title()
_exc[orth_title] = [{ORTH: orth_title}]
orth_caps = orth.upper()
_exc[orth_caps] = [{ORTH: orth_caps}]
orth_lower = orth.lower()
_exc[orth_lower] = [{ORTH: orth_lower}]
if '-' in orth:
orth_title = '-'.join([part.title() for part in orth.split('-')])
_exc[orth_title] = [{ORTH: orth_title}]
orth_caps = '-'.join([part.upper() for part in orth.split('-')])
_exc[orth_caps] = [{ORTH: orth_caps}]
for orth in [
"'d", "a.m.", "Adm.", "Bros.", "co.", "Co.", "Corp.", "D.C.", "Dr.", "e.g.",
"E.g.", "E.G.", "Gen.", "Gov.", "i.e.", "I.e.", "I.E.", "Inc.", "Jr.",
"Ltd.", "Md.", "Messrs.", "Mo.", "Mont.", "Mr.", "Mrs.", "Ms.", "p.m.",
"Ph.D.", "Rep.", "Rev.", "Sen.", "St.", "vs.",
"B.A.", "B.Ch.E.", "B.Sc.", "Dr.", "Dra.", "Drs.", "Hj.", "Ka.", "Kp.",
"M.Ag.", "M.Hum.", "M.Kes,", "M.Kom.", "M.M.", "M.P.", "M.Pd.", "M.Sc.",
"M.Si.", "M.Sn.", "M.T.", "M.Th.", "No.", "Pjs.", "Plt.", "R.A.", "S.Ag.",
"S.E.", "S.H.", "S.Hut.", "S.K.M.", "S.Kedg.", "S.Kedh.", "S.Kom.",
"S.Pd.", "S.Pol.", "S.Psi.", "S.S.", "S.Sos.", "S.T.", "S.Tekp.", "S.Th.",
"a.l.", "a.n.", "a.s.", "b.d.", "d.a.", "d.l.", "d/h", "dkk.", "dll.",
"dr.", "drh.", "ds.", "dsb.", "dst.", "faks.", "fax.", "hlm.", "i/o",
"n.b.", "p.p." "pjs.", "s.d.", "tel.", "u.p.",
]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc)
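The casing loop above registers each base exception in its original, Title and UPPER forms, so abbreviations such as "dll." survive tokenization in any casing. A small self-contained sketch of what ends up in the dict (string keys stand in for the ORTH symbol used in the real code):

# Illustration only: 'ORTH' is written as a plain string here, whereas the
# code above uses the ORTH attribute ID imported from the symbols module.
variants = {}
for orth in ['dll.']:
    for form in (orth, orth.title(), orth.upper(), orth.lower()):
        variants[form] = [{'ORTH': form}]
print(sorted(variants))   # ['DLL.', 'Dll.', 'dll.']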

18
spacy/lang/it/examples.py Normal file
View File

@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.it.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple vuole comprare una startup del Regno Unito per un miliardo di dollari",
"Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori",
"San Francisco prevede di bandire i robot di consegna porta a porta",
"Londra è una grande città del Regno Unito."
]

View File

@ -137,6 +137,7 @@ LEX_ATTRS = {
attrs.IS_UPPER: lambda string: string.isupper(),
attrs.IS_STOP: lambda string: False,
attrs.IS_OOV: lambda string: True,
attrs.PROB: lambda string: -20.,
attrs.LIKE_EMAIL: like_email,
attrs.LIKE_NUM: like_num,
attrs.IS_PUNCT: is_punct,

18
spacy/lang/nb/examples.py Normal file
View File

@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.nb.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar",
"Selvkjørende biler flytter forsikringsansvaret over på produsentene ",
"San Francisco vurderer å forby robotbud på fortauene",
"London er en stor by i Storbritannia."
]

View File

@ -1,6 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
@ -15,7 +16,7 @@ class PolishDefaults(Language.Defaults):
lex_attr_getters[LANG] = lambda text: 'pl'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)

20
spacy/lang/pl/examples.py Normal file
View File

@ -0,0 +1,20 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.pl.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Poczuł przyjemną woń mocnej kawy.",
"Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.",
"Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.",
"Nowy abonament pod lupą Komisji Europejskiej",
"Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
"Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”."
]

View File

@ -0,0 +1,23 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import ORTH, LEMMA, POS, ADJ, ADV, NOUN
_exc = {}
for exc_data in [
{ORTH: "m.in.", LEMMA: "między innymi", POS: ADV},
{ORTH: "inż.", LEMMA: "inżynier", POS: NOUN},
{ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
{ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
{ORTH: "tj.", LEMMA: "to jest", POS: ADV},
{ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]
for orth in [
"w.", "r."]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc)

18
spacy/lang/pt/examples.py Normal file
View File

@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.pt.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares",
"Carros autônomos empurram a responsabilidade do seguro para os fabricantes."
"São Francisco considera banir os robôs de entrega que andam pelas calçadas",
"Londres é a maior cidade do Reino Unido"
]

18
spacy/lang/sv/examples.py Normal file
View File

@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.sv.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple överväger att köpa brittisk startup för 1 miljard dollar.",
"Självkörande bilar förskjuter försäkringsansvar mot tillverkare.",
"San Fransisco överväger förbud mot leveransrobotar på trottoarer.".
"London är en storstad i Storbritannien."
]

View File

@ -15,6 +15,7 @@ class Chinese(Language):
raise ImportError("The Chinese tokenizer requires the Jieba library: "
"https://github.com/fxsjy/jieba")
words = list(jieba.cut(text, cut_all=True))
words = [x for x in words if x]
return Doc(self.vocab, words=words, spaces=[False]*len(words))
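A hedged note on the filter added above: with `cut_all=True`, jieba's full mode can yield empty strings (typically around punctuation), and those must not be passed to `Doc` as words. A minimal sketch, assuming the jieba package is installed:

# Sketch only: the exact segments depend on the jieba version and dictionary.
import jieba

words = list(jieba.cut('我爱北京。', cut_all=True))
print(words)                      # may contain '' entries
words = [x for x in words if x]   # same filter as in the tokenizer above
print(words)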

View File

@ -10,6 +10,7 @@ from thinc.neural.optimizers import Adam, SGD
import random
import ujson
from collections import OrderedDict
import itertools
from .tokenizer import Tokenizer
from .vocab import Vocab
@ -22,8 +23,10 @@ from .pipeline import NeuralDependencyParser, EntityRecognizer
from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer
from .pipeline import NeuralLabeller
from .pipeline import SimilarityHook
from .pipeline import TextCategorizer
from . import about
from .compat import json_dumps
from .compat import json_dumps, izip
from .attrs import IS_STOP
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH
@ -92,7 +95,7 @@ class BaseDefaults(object):
meta = nlp.meta if nlp is not None else {}
# Resolve strings, like "cnn", "lstm", etc
pipeline = []
for entry in cls.pipeline:
for entry in meta.get('pipeline', []):
if entry in disable or getattr(entry, 'name', entry) in disable:
continue
factory = cls.Defaults.factories[entry]
@ -107,6 +110,8 @@ class BaseDefaults(object):
NeuralDependencyParser(nlp.vocab, **cfg),
nonproj.deprojectivize],
'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)],
'textcat': lambda nlp, **cfg: [TextCategorizer(nlp.vocab, **cfg)],
# Temporary compatibility -- delete after pivot
'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
@ -115,7 +120,6 @@ class BaseDefaults(object):
nonproj.deprojectivize,
],
'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)]
}
token_match = TOKEN_MATCH
@ -147,8 +151,8 @@ class Language(object):
Defaults = BaseDefaults
lang = None
def __init__(self, vocab=True, make_doc=True, pipeline=None, meta={},
disable=tuple(), **kwargs):
def __init__(self, vocab=True, make_doc=True, pipeline=None,
meta={}, disable=tuple(), **kwargs):
"""Initialise a Language object.
vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
@ -165,7 +169,7 @@ class Language(object):
models to add model meta data.
RETURNS (Language): The newly constructed object.
"""
self.meta = dict(meta)
self._meta = dict(meta)
if vocab is True:
factory = self.Defaults.create_vocab
vocab = factory(self, **meta.get('vocab', {}))
@ -196,6 +200,29 @@ class Language(object):
else:
flat_list.append(pipe)
self.pipeline = flat_list
self._optimizer = None
@property
def meta(self):
self._meta.setdefault('lang', self.vocab.lang)
self._meta.setdefault('name', '')
self._meta.setdefault('version', '0.0.0')
self._meta.setdefault('spacy_version', about.__version__)
self._meta.setdefault('description', '')
self._meta.setdefault('author', '')
self._meta.setdefault('email', '')
self._meta.setdefault('url', '')
self._meta.setdefault('license', '')
pipeline = []
for component in self.pipeline:
if hasattr(component, 'name'):
pipeline.append(component.name)
self._meta['pipeline'] = pipeline
return self._meta
@meta.setter
def meta(self, value):
self._meta = value
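A hedged sketch of what the lazily built `meta` property above returns for a freshly constructed pipeline (the defaults come from the `setdefault` calls; 'pipeline' lists the `.name` of every component):

# Sketch under the assumption that a bare English() starts with no
# statistical components; values shown are the defaults set above.
from spacy.lang.en import English

nlp = English()
print(nlp.meta['lang'])        # 'en', taken from the vocab
print(nlp.meta['version'])     # '0.0.0'
print(nlp.meta['pipeline'])    # names of the components currently loaded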
# Conveniences to access pipeline components
@property
@ -218,7 +245,7 @@ class Language(object):
def matcher(self):
return self.get_component('matcher')
def get_component(self, name):
def get_component(self, name):
if self.pipeline in (True, None):
return None
for proc in self.pipeline:
@ -251,7 +278,8 @@ class Language(object):
def make_doc(self, text):
return self.tokenizer(text)
def update(self, docs, golds, drop=0., sgd=None, losses=None):
def update(self, docs, golds, drop=0., sgd=None, losses=None,
update_shared=False):
"""Update the models in the pipeline.
docs (iterable): A batch of `Doc` objects.
@ -266,6 +294,15 @@ class Language(object):
>>> for docs, golds in epoch:
>>> state = nlp.update(docs, golds, sgd=optimizer)
"""
if len(docs) != len(golds):
raise IndexError("Update expects same number of docs and golds. "
"Got: %d, %d" % (len(docs), len(golds)))
if len(docs) == 0:
return
if sgd is None:
if self._optimizer is None:
self._optimizer = Adam(Model.ops, 0.001)
sgd = self._optimizer
tok2vec = self.pipeline[0]
feats = tok2vec.doc2feats(docs)
grads = {}
@ -273,14 +310,18 @@ class Language(object):
grads[key] = (W, dW)
pipes = list(self.pipeline[1:])
random.shuffle(pipes)
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses]
for proc in pipes:
if not hasattr(proc, 'update'):
continue
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
d_tokvecses = proc.update((docs, tokvecses), golds,
drop=drop, sgd=get_grads, losses=losses)
if d_tokvecses is not None:
bp_tokvecses(d_tokvecses, sgd=sgd)
if update_shared and d_tokvecses is not None:
for i, d_tv in enumerate(d_tokvecses):
all_d_tokvecses[i] += d_tv
if update_shared and bp_tokvecses is not None:
bp_tokvecses(all_d_tokvecses, sgd=sgd)
for key, (W, dW) in grads.items():
sgd(W, dW, key=key)
# Clear the tensor variable, to free GPU memory.
@ -343,16 +384,25 @@ class Language(object):
eps = util.env_opt('optimizer_eps', 1e-08)
L2 = util.env_opt('L2_penalty', 1e-6)
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
beta2=beta2, eps=eps)
optimizer.max_grad_norm = max_grad_norm
optimizer.device = device
return optimizer
self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
beta2=beta2, eps=eps)
self._optimizer.max_grad_norm = max_grad_norm
self._optimizer.device = device
return self._optimizer
def evaluate(self, docs_golds):
docs, golds = zip(*docs_golds)
scorer = Scorer()
for doc, gold in zip(self.pipe(docs, batch_size=32), golds):
docs, golds = zip(*docs_golds)
docs = list(docs)
golds = list(golds)
for pipe in self.pipeline:
if not hasattr(pipe, 'pipe'):
for doc in docs:
pipe(doc)
else:
docs = list(pipe.pipe(docs))
assert len(docs) == len(golds)
for doc, gold in zip(docs, golds):
scorer.score(doc, gold)
doc.tensor = None
return scorer
@ -386,11 +436,16 @@ class Language(object):
except StopIteration:
pass
def pipe(self, texts, n_threads=2, batch_size=1000, disable=[]):
def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000,
disable=[]):
"""Process texts as a stream, and yield `Doc` objects in order. Supports
GIL-free multi-threading.
texts (iterator): A sequence of texts to process.
as_tuples (bool):
If set to True, inputs should be a sequence of
(text, context) tuples. Output will then be a sequence of
(doc, context) tuples. Defaults to False.
n_threads (int): The number of worker threads to use. If -1, OpenMP will
decide how many to use at run time. Default is 2.
batch_size (int): The number of texts to buffer.
@ -402,8 +457,16 @@ class Language(object):
>>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
>>> assert doc.is_parsed
"""
if as_tuples:
text_context1, text_context2 = itertools.tee(texts)
texts = (tc[0] for tc in text_context1)
contexts = (tc[1] for tc in text_context2)
docs = self.pipe(texts, n_threads=n_threads, batch_size=batch_size,
disable=disable)
for doc, context in izip(docs, contexts):
yield (doc, context)
return
docs = (self.make_doc(text) for text in texts)
docs = texts
for proc in self.pipeline:
name = getattr(proc, 'name', None)
if name in disable:

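A hedged usage sketch for the `as_tuples` branch added to `pipe()` above: stream (text, context) pairs through the pipeline and receive (doc, context) pairs back in the same order. The model name and ids below are illustrative only.

# Illustrative only: any installed model works; 'en_core_web_sm' is just an
# example name, and the contexts can be arbitrary Python objects.
import spacy

nlp = spacy.load('en_core_web_sm')
data = [('A first document.', {'id': 1}),
        ('A second document.', {'id': 2})]
for doc, context in nlp.pipe(data, as_tuples=True):
    print(context['id'], doc)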
View File

@ -44,6 +44,11 @@ class Lemmatizer(object):
return True
elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
return True
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
# morphology
elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \
morphology.get('Tense') == 'pres'):
return True
elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
return True
elif VerbForm_inf in morphology:

View File

@ -171,6 +171,8 @@ cdef class Lexeme:
property rank:
def __get__(self):
return self.c.id
def __set__(self, value):
self.c.id = value
property sentiment:
def __get__(self):

View File

@ -42,15 +42,148 @@ from .compat import json_dumps
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
from ._ml import build_text_classifier, build_tagger_model
from .parts_of_speech import X
class TokenVectorEncoder(object):
class SentenceSegmenter(object):
'''A simple spaCy hook, to allow custom sentence boundary detection logic
(that doesn't require the dependency parse).
To change the sentence boundary detection strategy, pass a generator
function `strategy` on initialization, or assign a new strategy to
the .strategy attribute.
Sentence detection strategies should be generators that take `Doc` objects
and yield `Span` objects for each sentence.
'''
name = 'sbd'
def __init__(self, vocab, strategy=None):
self.vocab = vocab
if strategy is None or strategy == 'on_punct':
strategy = self.split_on_punct
self.strategy = strategy
def __call__(self, doc):
doc.user_hooks['sents'] = self.strategy
@staticmethod
def split_on_punct(doc):
start = 0
seen_period = False
for i, word in enumerate(doc):
if seen_period and not word.is_punct:
yield doc[start : word.i]
start = word.i
seen_period = False
elif word.text in ['.', '!', '?']:
seen_period = True
if start < len(doc):
yield doc[start : len(doc)]
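A hedged usage sketch for the SentenceSegmenter hook above, assuming `Doc.sents` consults `user_hooks['sents']` before requiring a dependency parse:

# Sketch only: uses the default split_on_punct strategy on a tokenized Doc.
from spacy.lang.en import English
from spacy.pipeline import SentenceSegmenter

nlp = English()
segmenter = SentenceSegmenter(nlp.vocab)
doc = nlp.make_doc('This is one sentence. This is another! And a third?')
segmenter(doc)   # installs the strategy as doc.user_hooks['sents']
print([sent.text for sent in doc.sents])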
class BaseThincComponent(object):
name = None
@classmethod
def Model(cls, *shape, **kwargs):
raise NotImplementedError
def __init__(self, vocab, model=True, **cfg):
raise NotImplementedError
def __call__(self, doc):
scores = self.predict([doc])
self.set_annotations([doc], scores)
return doc
def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
scores = self.predict(docs)
self.set_annotations(docs, scores)
yield from docs
def predict(self, docs):
raise NotImplementedError
def set_annotations(self, docs, scores):
raise NotImplementedError
def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
raise NotImplementedError
def get_loss(self, docs, golds, scores):
raise NotImplementedError
def begin_training(self, gold_tuples=tuple(), pipeline=None):
token_vector_width = pipeline[0].model.nO
if self.model is True:
self.model = self.Model(1, token_vector_width)
def use_params(self, params):
with self.model.use_params(params):
yield
def to_bytes(self, **exclude):
serialize = OrderedDict((
('cfg', lambda: json_dumps(self.cfg)),
('model', lambda: self.model.to_bytes()),
('vocab', lambda: self.vocab.to_bytes())
))
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude):
def load_model(b):
if self.model is True:
self.model = self.Model(**self.cfg)
self.model.from_bytes(b)
deserialize = OrderedDict((
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
('model', load_model),
('vocab', lambda b: self.vocab.from_bytes(b))
))
util.from_bytes(bytes_data, deserialize, exclude)
return self
def to_disk(self, path, **exclude):
serialize = OrderedDict((
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
('vocab', lambda p: self.vocab.to_disk(p))
))
util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude):
def load_model(p):
if self.model is True:
self.model = self.Model(**self.cfg)
self.model.from_bytes(p.open('rb').read())
deserialize = OrderedDict((
('cfg', lambda p: self.cfg.update(_load_cfg(p))),
('model', load_model),
('vocab', lambda p: self.vocab.from_disk(p)),
))
util.from_disk(path, deserialize, exclude)
return self
def _load_cfg(path):
if path.exists():
return ujson.load(path.open())
else:
return {}
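BaseThincComponent above standardizes `__call__`, `pipe()` and the to/from bytes/disk helpers, so a new component mainly has to supply `predict()` and `set_annotations()` (plus a `Model` and `get_loss()` if it is trainable). A hedged skeleton of a made-up, inference-only component:

# Illustrative component only: the name, the toy scores and the user_data key
# are all made up; a real component would wrap a thinc model.
import numpy
from spacy.pipeline import BaseThincComponent


class DocLengthScorer(BaseThincComponent):
    name = 'doc_length_scorer'

    def __init__(self, vocab, model=True, **cfg):
        self.vocab = vocab
        self.model = model
        self.cfg = dict(cfg)

    def predict(self, docs):
        # Toy scores: a single column proportional to document length.
        return numpy.asarray([[len(doc)] for doc in docs], dtype='f')

    def set_annotations(self, docs, scores):
        for doc, score in zip(docs, scores):
            doc.user_data['doc_length'] = float(score[0])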
class TokenVectorEncoder(BaseThincComponent):
"""Assign position-sensitive vectors to tokens, using a CNN or RNN."""
name = 'tensorizer'
@classmethod
def Model(cls, width=128, embed_size=7500, **cfg):
def Model(cls, width=128, embed_size=4000, **cfg):
"""Create a new statistical model for the class.
width (int): Output size of the model.
@ -79,6 +212,7 @@ class TokenVectorEncoder(object):
self.vocab = vocab
self.doc2feats = doc2feats()
self.model = model
self.cfg = dict(cfg)
def __call__(self, doc):
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
@ -144,7 +278,7 @@ class TokenVectorEncoder(object):
# TODO: implement
raise NotImplementedError
def begin_training(self, gold_tuples, pipeline=None):
def begin_training(self, gold_tuples=tuple(), pipeline=None):
"""Allocate models, pre-process training data and acquire a trainer and
optimizer.
@ -155,74 +289,34 @@ class TokenVectorEncoder(object):
if self.model is True:
self.model = self.Model()
def use_params(self, params):
"""Replace weights of models in the pipeline with those provided in the
params dictionary.
params (dict): A dictionary of parameters keyed by model ID.
"""
with self.model.use_params(params):
yield
def to_bytes(self, **exclude):
serialize = OrderedDict((
('model', lambda: self.model.to_bytes()),
('vocab', lambda: self.vocab.to_bytes())
))
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude):
if self.model is True:
self.model = self.Model()
deserialize = OrderedDict((
('model', lambda b: self.model.from_bytes(b)),
('vocab', lambda b: self.vocab.from_bytes(b))
))
util.from_bytes(bytes_data, deserialize, exclude)
return self
def to_disk(self, path, **exclude):
serialize = OrderedDict((
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
('vocab', lambda p: self.vocab.to_disk(p))
))
util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude):
if self.model is True:
self.model = self.Model()
deserialize = OrderedDict((
('model', lambda p: self.model.from_bytes(p.open('rb').read())),
('vocab', lambda p: self.vocab.from_disk(p))
))
util.from_disk(path, deserialize, exclude)
return self
class NeuralTagger(object):
class NeuralTagger(BaseThincComponent):
name = 'tagger'
def __init__(self, vocab, model=True):
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.model = model
self.cfg = dict(cfg)
def __call__(self, doc):
tags = self.predict([doc.tensor])
tags = self.predict(([doc], [doc.tensor]))
self.set_annotations([doc], tags)
return doc
def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
tokvecs = [d.tensor for d in docs]
tag_ids = self.predict(tokvecs)
tag_ids = self.predict((docs, tokvecs))
self.set_annotations(docs, tag_ids)
yield from docs
def predict(self, tokvecs):
scores = self.model(tokvecs)
def predict(self, docs_tokvecs):
scores = self.model(docs_tokvecs)
scores = self.model.ops.flatten(scores)
guesses = scores.argmax(axis=1)
if not isinstance(guesses, numpy.ndarray):
guesses = guesses.get()
tokvecs = docs_tokvecs[1]
guesses = self.model.ops.unflatten(guesses,
[tv.shape[0] for tv in tokvecs])
return guesses
@ -235,6 +329,8 @@ class NeuralTagger(object):
cdef Vocab vocab = self.vocab
for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, 'get'):
doc_tag_ids = doc_tag_ids.get()
for j, tag_id in enumerate(doc_tag_ids):
# Don't clobber preset POS tags
if doc.c[j].tag == 0 and doc.c[j].pos == 0:
@ -243,16 +339,18 @@ class NeuralTagger(object):
doc.is_tagged = True
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
if losses is not None and self.name not in losses:
losses[self.name] = 0.
docs, tokvecs = docs_tokvecs
if self.model.nI is None:
self.model.nI = tokvecs[0].shape[1]
tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop)
tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop)
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
if losses is not None:
losses[self.name] += loss
return d_tokvecs
def get_loss(self, docs, golds, scores):
@ -276,7 +374,7 @@ class NeuralTagger(object):
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
return float(loss), d_scores
def begin_training(self, gold_tuples, pipeline=None):
def begin_training(self, gold_tuples=tuple(), pipeline=None):
orig_tag_map = dict(self.vocab.morphology.tag_map)
new_tag_map = {}
for raw_text, annots_brackets in gold_tuples:
@ -300,10 +398,8 @@ class NeuralTagger(object):
@classmethod
def Model(cls, n_tags, token_vector_width):
return with_flatten(
chain(Maxout(token_vector_width, token_vector_width),
Softmax(n_tags, token_vector_width)))
return build_tagger_model(n_tags, token_vector_width)
def use_params(self, params):
with self.model.use_params(params):
yield
@ -321,7 +417,8 @@ class NeuralTagger(object):
def from_bytes(self, bytes_data, **exclude):
def load_model(b):
if self.model is True:
token_vector_width = util.env_opt('token_vector_width', 128)
token_vector_width = util.env_opt('token_vector_width',
self.cfg.get('token_vector_width', 128))
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
self.model.from_bytes(b)
@ -348,13 +445,15 @@ class NeuralTagger(object):
use_bin_type=True,
encoding='utf8'))),
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg)))
))
util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude):
def load_model(p):
if self.model is True:
token_vector_width = util.env_opt('token_vector_width', 128)
token_vector_width = util.env_opt('token_vector_width',
self.cfg.get('token_vector_width', 128))
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
self.model.from_bytes(p.open('rb').read())
@ -370,6 +469,7 @@ class NeuralTagger(object):
('vocab', lambda p: self.vocab.from_disk(p)),
('tag_map', load_tag_map),
('model', load_model),
('cfg', lambda p: self.cfg.update(_load_cfg(p)))
))
util.from_disk(path, deserialize, exclude)
return self
@ -377,15 +477,23 @@ class NeuralTagger(object):
class NeuralLabeller(NeuralTagger):
name = 'nn_labeller'
def __init__(self, vocab, model=True):
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.model = model
self.labels = {}
self.cfg = dict(cfg)
@property
def labels(self):
return self.cfg.setdefault('labels', {})
@labels.setter
def labels(self, value):
self.cfg['labels'] = value
def set_annotations(self, docs, dep_ids):
pass
def begin_training(self, gold_tuples, pipeline=None):
def begin_training(self, gold_tuples=tuple(), pipeline=None):
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
for raw_text, annots_brackets in gold_tuples:
for annots, brackets in annots_brackets:
@ -399,10 +507,8 @@ class NeuralLabeller(NeuralTagger):
@classmethod
def Model(cls, n_tags, token_vector_width):
return with_flatten(
chain(Maxout(token_vector_width, token_vector_width),
Softmax(n_tags, token_vector_width)))
return build_tagger_model(n_tags, token_vector_width)
def get_loss(self, docs, golds, scores):
scores = self.model.ops.flatten(scores)
cdef int idx = 0
@ -423,7 +529,7 @@ class NeuralLabeller(NeuralTagger):
return float(loss), d_scores
class SimilarityHook(object):
class SimilarityHook(BaseThincComponent):
"""
Experimental
@ -439,9 +545,10 @@ class SimilarityHook(object):
Where W is a vector of dimension weights, initialized to 1.
"""
name = 'similarity'
def __init__(self, vocab, model=True):
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.model = model
self.cfg = dict(cfg)
@classmethod
def Model(cls, length):
@ -467,7 +574,7 @@ class SimilarityHook(object):
return d_tensor1s, d_tensor2s
def begin_training(self, _, pipeline=None):
def begin_training(self, _=tuple(), pipeline=None):
"""
Allocate model, using width from tensorizer in pipeline.
@ -477,48 +584,77 @@ class SimilarityHook(object):
if self.model is True:
self.model = self.Model(pipeline[0].model.nO)
def use_params(self, params):
"""Replace weights of models in the pipeline with those provided in the
params dictionary.
params (dict): A dictionary of parameters keyed by model ID.
"""
with self.model.use_params(params):
yield
class TextCategorizer(BaseThincComponent):
name = 'textcat'
def to_bytes(self, **exclude):
serialize = OrderedDict((
('model', lambda: self.model.to_bytes()),
('vocab', lambda: self.vocab.to_bytes())
))
return util.to_bytes(serialize, exclude)
@classmethod
def Model(cls, nr_class=1, width=64, **cfg):
return build_text_classifier(nr_class, width, **cfg)
def from_bytes(self, bytes_data, **exclude):
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.model = model
self.cfg = dict(cfg)
@property
def labels(self):
return self.cfg.get('labels', ['LABEL'])
@labels.setter
def labels(self, value):
self.cfg['labels'] = value
def __call__(self, doc):
scores = self.predict([doc])
self.set_annotations([doc], scores)
return doc
def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
scores = self.predict(docs)
self.set_annotations(docs, scores)
yield from docs
def predict(self, docs):
scores = self.model(docs)
scores = self.model.ops.asarray(scores)
return scores
def set_annotations(self, docs, scores):
for i, doc in enumerate(docs):
for j, label in enumerate(self.labels):
doc.cats[label] = float(scores[i, j])
def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
docs, tensors = docs_tensors
scores, bp_scores = self.model.begin_update(docs, drop=drop)
loss, d_scores = self.get_loss(docs, golds, scores)
d_tensors = bp_scores(d_scores, sgd=sgd)
if losses is not None:
losses.setdefault(self.name, 0.0)
losses[self.name] += loss
return d_tensors
def get_loss(self, docs, golds, scores):
truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
for i, gold in enumerate(golds):
for j, label in enumerate(self.labels):
truths[i, j] = label in gold.cats
truths = self.model.ops.asarray(truths)
d_scores = (scores-truths) / scores.shape[0]
mean_square_error = ((scores-truths)**2).sum(axis=1).mean()
return mean_square_error, d_scores
def begin_training(self, gold_tuples=tuple(), pipeline=None):
if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer':
token_vector_width = pipeline[0].model.nO
else:
token_vector_width = 64
if self.model is True:
self.model = self.Model()
deserialize = OrderedDict((
('model', lambda b: self.model.from_bytes(b)),
('vocab', lambda b: self.vocab.from_bytes(b))
))
util.from_bytes(bytes_data, deserialize, exclude)
return self
def to_disk(self, path, **exclude):
serialize = OrderedDict((
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
('vocab', lambda p: self.vocab.to_disk(p))
))
util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude):
if self.model is True:
self.model = self.Model()
deserialize = OrderedDict((
('model', lambda p: self.model.from_bytes(p.open('rb').read())),
('vocab', lambda p: self.vocab.from_disk(p))
))
util.from_disk(path, deserialize, exclude)
return self
self.model = self.Model(len(self.labels), token_vector_width,
**self.cfg)
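A hedged numeric sketch of `TextCategorizer.get_loss()` above, with two documents and two labels; the gradient is the per-example difference and the loss is the mean summed squared error:

# Worked example with made-up scores; mirrors the arithmetic in get_loss().
import numpy

scores = numpy.asarray([[0.9, 0.2],
                        [0.4, 0.8]], dtype='f')
truths = numpy.asarray([[1.0, 0.0],
                        [0.0, 1.0]], dtype='f')
d_scores = (scores - truths) / scores.shape[0]
mean_square_error = ((scores - truths) ** 2).sum(axis=1).mean()
print(d_scores)            # [[-0.05  0.1 ] [ 0.2  -0.1 ]]
print(mean_square_error)   # (0.05 + 0.20) / 2 = 0.125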
cdef class EntityRecognizer(LinearParser):
@ -569,6 +705,14 @@ cdef class NeuralEntityRecognizer(NeuralParser):
nr_feature = 6
def predict_confidences(self, docs):
tensors = [d.tensor for d in docs]
samples = []
for i in range(10):
states = self.parse_batch(docs, tensors, drop=0.3)
for state in states:
samples.append(self._get_entities(state))
def __reduce__(self):
return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)

View File

@ -215,7 +215,10 @@ cdef class StringStore:
path = util.ensure_path(path)
with path.open('r') as file_:
strings = ujson.load(file_)
prev = list(self)
self._reset_and_load(strings)
for word in prev:
self.add(word)
return self
def to_bytes(self, **exclude):
@ -234,7 +237,10 @@ cdef class StringStore:
RETURNS (StringStore): The `StringStore` object.
"""
strings = ujson.loads(bytes_data)
prev = list(self)
self._reset_and_load(strings)
for word in prev:
self.add(word)
return self
def set_frozen(self, bint is_frozen):

View File

@ -0,0 +1,286 @@
# cython: infer_types=True
# cython: profile=True
cimport numpy as np
import numpy
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from thinc.extra.search cimport Beam
from thinc.extra.search import MaxViolation
from thinc.typedefs cimport hash_t, class_t
from thinc.extra.search cimport MaxViolation
from .transition_system cimport TransitionSystem, Transition
from .stateclass cimport StateClass
from ..gold cimport GoldParse
from ..tokens.doc cimport Doc
# These are passed as callbacks to thinc.search.Beam
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
dest = <StateClass>_dest
src = <StateClass>_src
moves = <const Transition*>_moves
dest.clone(src)
moves[clas].do(dest.c, moves[clas].label)
cdef int _check_final_state(void* _state, void* extra_args) except -1:
return (<StateClass>_state).is_final()
def _cleanup(Beam beam):
for i in range(beam.width):
Py_XDECREF(<PyObject*>beam._states[i].content)
Py_XDECREF(<PyObject*>beam._parents[i].content)
cdef hash_t _hash_state(void* _state, void* _) except 0:
state = <StateClass>_state
if state.c.is_final():
return 1
else:
return state.c.hash()
cdef class ParserBeam(object):
cdef public TransitionSystem moves
cdef public object states
cdef public object golds
cdef public object beams
cdef public object dones
def __init__(self, TransitionSystem moves, states, golds,
int width, float density):
self.moves = moves
self.states = states
self.golds = golds
self.beams = []
cdef Beam beam
cdef StateClass state, st
for state in states:
beam = Beam(self.moves.n_moves, width, density)
beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent)
for i in range(beam.width):
st = <StateClass>beam.at(i)
st.c.offset = state.c.offset
self.beams.append(beam)
self.dones = [False] * len(self.beams)
def __dealloc__(self):
if self.beams is not None:
for beam in self.beams:
if beam is not None:
_cleanup(beam)
@property
def is_done(self):
return all(b.is_done or self.dones[i] for i, b in enumerate(self.beams))
def __getitem__(self, i):
return self.beams[i]
def __len__(self):
return len(self.beams)
def advance(self, scores, follow_gold=False):
cdef Beam beam
for i, beam in enumerate(self.beams):
if beam.is_done or not scores[i].size or self.dones[i]:
continue
self._set_scores(beam, scores[i])
if self.golds is not None:
self._set_costs(beam, self.golds[i], follow_gold=follow_gold)
if follow_gold:
beam.advance(_transition_state, NULL, <void*>self.moves.c)
else:
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
beam.check_done(_check_final_state, NULL)
if beam.is_done and self.golds is not None:
for j in range(beam.size):
state = <StateClass>beam.at(j)
if state.is_final():
try:
if self.moves.is_gold_parse(state, self.golds[i]):
beam._states[j].loss = 0.0
elif beam._states[j].loss == 0.0:
beam._states[j].loss = 1.0
except NotImplementedError:
break
def _set_scores(self, Beam beam, float[:, ::1] scores):
cdef float* c_scores = &scores[0, 0]
cdef int nr_state = min(scores.shape[0], beam.size)
cdef int nr_class = scores.shape[1]
for i in range(nr_state):
state = <StateClass>beam.at(i)
if not state.is_final():
for j in range(nr_class):
beam.scores[i][j] = c_scores[i * nr_class + j]
self.moves.set_valid(beam.is_valid[i], state.c)
else:
for j in range(beam.nr_class):
beam.scores[i][j] = 0
beam.costs[i][j] = 0
def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False):
for i in range(beam.size):
state = <StateClass>beam.at(i)
if not state.c.is_final():
self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold)
if follow_gold:
for j in range(beam.nr_class):
if beam.costs[i][j] >= 1:
beam.is_valid[i][j] = 0
def get_token_ids(states, int n_tokens):
cdef StateClass state
cdef np.ndarray ids = numpy.zeros((len(states), n_tokens),
dtype='int32', order='C')
c_ids = <int*>ids.data
for i, state in enumerate(states):
if not state.is_final():
state.c.set_context_tokens(c_ids, n_tokens)
else:
ids[i] = -1
c_ids += ids.shape[1]
return ids
nr_update = 0
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
states, tokvecs, golds,
state2vec, vec2scores,
int width, float density,
sgd=None, losses=None, drop=0.):
global nr_update
cdef MaxViolation violn
nr_update += 1
pbeam = ParserBeam(moves, states, golds,
width=width, density=density)
gbeam = ParserBeam(moves, states, golds,
width=width, density=0.0)
cdef StateClass state
beam_maps = []
backprops = []
violns = [MaxViolation() for _ in range(len(states))]
for t in range(max_steps):
if pbeam.is_done and gbeam.is_done:
break
# The beam maps let us find the right row in the flattened scores
# arrays for each state. States are identified by (example id, history).
# We keep a different beam map for each step (since we'll have a flat
# scores array for each step). The beam map will let us take the per-state
# losses, and compute the gradient for each (step, state, class).
beam_maps.append({})
# Gather all states from the two beams in a list. Some states may occur
# in both beams. To figure out which beam each state belonged to,
# we keep two lists of indices, p_indices and g_indices
states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update)
if not states:
break
# Now that we have our flat list of states, feed them through the model
token_ids = get_token_ids(states, nr_feature)
vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)
# Store the callbacks for the backward pass
backprops.append((token_ids, bp_vectors, bp_scores))
# Unpack the flat scores into lists for the two beams. The indices arrays
# tell us which example and state the scores-row refers to.
p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices]
g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices]
# Now advance the states in the beams. The gold beam is constrained
# to follow only gold analyses.
pbeam.advance(p_scores)
gbeam.advance(g_scores, follow_gold=True)
# Track the "maximum violation", to use in the update.
for i, violn in enumerate(violns):
violn.check_crf(pbeam[i], gbeam[i])
histories = []
losses = []
for violn in violns:
if violn.p_hist:
histories.append(violn.p_hist + violn.g_hist)
losses.append(violn.p_probs + violn.g_probs)
else:
histories.append([])
losses.append([])
states_d_scores = get_gradient(moves.n_moves, beam_maps, histories, losses)
return states_d_scores, backprops[:len(states_d_scores)]
def get_states(pbeams, gbeams, beam_map, nr_update):
seen = {}
states = []
p_indices = []
g_indices = []
cdef Beam pbeam, gbeam
assert len(pbeams) == len(gbeams)
for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)):
p_indices.append([])
g_indices.append([])
for i in range(pbeam.size):
state = <StateClass>pbeam.at(i)
if not state.is_final():
key = tuple([eg_id] + pbeam.histories[i])
assert key not in seen, (key, seen)
seen[key] = len(states)
p_indices[-1].append(len(states))
states.append(state)
beam_map.update(seen)
for i in range(gbeam.size):
state = <StateClass>gbeam.at(i)
if not state.is_final():
key = tuple([eg_id] + gbeam.histories[i])
if key in seen:
g_indices[-1].append(seen[key])
else:
g_indices[-1].append(len(states))
beam_map[key] = len(states)
states.append(state)
p_idx = [numpy.asarray(idx, dtype='i') for idx in p_indices]
g_idx = [numpy.asarray(idx, dtype='i') for idx in g_indices]
return states, p_idx, g_idx
def get_gradient(nr_class, beam_maps, histories, losses):
"""
The global model assigns a loss to each parse. The beam scores
are additive, so the same gradient is applied to each action
in the history. This gives the gradient of a single *action*
for a beam state -- so we have "the gradient of loss for taking
action i given history H."
Histories: Each history is a list of actions
Each candidate has a history
Each beam has multiple candidates
Each batch has multiple beams
So history is list of lists of lists of ints
"""
nr_step = len(beam_maps)
grads = []
nr_step = 0
for eg_id, hists in enumerate(histories):
for loss, hist in zip(losses[eg_id], hists):
if loss != 0.0 and not numpy.isnan(loss):
nr_step = max(nr_step, len(hist))
for i in range(nr_step):
grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), dtype='f'))
assert len(histories) == len(losses)
for eg_id, hists in enumerate(histories):
for loss, hist in zip(losses[eg_id], hists):
if loss == 0.0 or numpy.isnan(loss):
continue
key = tuple([eg_id])
# Adjust loss for length
avg_loss = loss / len(hist)
loss += avg_loss * (nr_step - len(hist))
for j, clas in enumerate(hist):
i = beam_maps[j][key]
# In step j, at state i action clas
# resulted in loss
grads[j][i, clas] += loss
key = key + tuple([clas])
return grads
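A hedged sketch of the data layout `get_gradient()` above expects: one list of candidate action histories per example, with a parallel list of losses (everything below is made up for illustration):

# Illustration only: two training examples, the first with two beam
# candidates and the second with one; losses line up with the histories.
histories = [
    [[0, 3, 3], [0, 2, 3]],   # example 0: two candidate action sequences
    [[1, 1]],                 # example 1: one candidate
]
losses = [
    [0.75, 0.0],              # only the first candidate carries a loss
    [0.5],
]
# get_gradient(nr_class, beam_maps, histories, losses) then spreads each
# loss over every (step, state, action) triple the candidate took.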

View File

@ -37,6 +37,7 @@ cdef cppclass StateC:
this.shifted = <bint*>calloc(length + (PADDING * 2), sizeof(bint))
this._sent = <TokenC*>calloc(length + (PADDING * 2), sizeof(TokenC))
this._ents = <Entity*>calloc(length + (PADDING * 2), sizeof(Entity))
this.offset = 0
cdef int i
for i in range(length + (PADDING * 2)):
this._ents[i].end = -1
@ -73,7 +74,16 @@ cdef cppclass StateC:
free(this.shifted - PADDING)
void set_context_tokens(int* ids, int n) nogil:
if n == 13:
if n == 8:
ids[0] = this.B(0)
ids[1] = this.B(1)
ids[2] = this.S(0)
ids[3] = this.S(1)
ids[4] = this.H(this.S(0))
ids[5] = this.L(this.B(0), 1)
ids[6] = this.L(this.S(0), 2)
ids[7] = this.R(this.S(0), 1)
elif n == 13:
ids[0] = this.B(0)
ids[1] = this.B(1)
ids[2] = this.S(0)

View File

@ -10,6 +10,8 @@ from libc.stdint cimport uint32_t
from libc.string cimport memcpy
from cymem.cymem cimport Pool
from collections import OrderedDict
from thinc.extra.search cimport Beam
import numpy
from .stateclass cimport StateClass
from ._state cimport StateC, is_space_token
@ -18,7 +20,7 @@ from .transition_system cimport do_func_t, get_cost_func_t
from .transition_system cimport move_cost_func_t, label_cost_func_t
from ..gold cimport GoldParse
from ..gold cimport GoldParseC
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE, IS_PUNCT
from ..lexeme cimport Lexeme
from ..structs cimport TokenC
@ -284,7 +286,7 @@ cdef class Break:
return 0
cdef int _get_root(int word, const GoldParseC* gold) nogil:
while gold.heads[word] != word and not gold.has_dep[word] and word >= 0:
while gold.heads[word] != word and gold.has_dep[word] and word >= 0:
word = gold.heads[word]
if not gold.has_dep[word]:
return -1
@ -349,6 +351,20 @@ cdef class ArcEager(TransitionSystem):
def __get__(self):
return (SHIFT, REDUCE, LEFT, RIGHT, BREAK)
def is_gold_parse(self, StateClass state, GoldParse gold):
predicted = set()
truth = set()
for i in range(gold.length):
if gold.cand_to_gold[i] is None:
continue
if state.safe_get(i).dep:
predicted.add((i, state.H(i), self.strings[state.safe_get(i).dep]))
else:
predicted.add((i, state.H(i), 'ROOT'))
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
truth.add((id_, head, dep))
return truth == predicted
def has_gold(self, GoldParse gold, start=0, end=None):
end = end or len(gold.heads)
if all([tag is None for tag in gold.heads[start:end]]):
@ -360,7 +376,7 @@ cdef class ArcEager(TransitionSystem):
if not self.has_gold(gold):
return None
for i in range(gold.length):
if gold.heads[i] is None: # Missing values
if gold.heads[i] is None or gold.labels[i] is None: # Missing values
gold.c.heads[i] = i
gold.c.has_dep[i] = False
else:
@ -383,6 +399,7 @@ cdef class ArcEager(TransitionSystem):
for i in range(self.n_moves):
if self.c[i].move == move and self.c[i].label == label:
return self.c[i]
return Transition(clas=0, move=MISSING, label=0)
def move_name(self, int move, attr_t label):
label_str = self.strings[label]
@ -499,9 +516,11 @@ cdef class ArcEager(TransitionSystem):
"before training and after parsing. Either pass make_projective=True "
"to the GoldParse class, or use PseudoProjectivity.preprocess_training_data")
else:
print(gold.orig_annot)
print(gold.words)
print(gold.heads)
print(gold.labels)
print(gold.sent_starts)
raise ValueError(
"Could not find a gold-standard action to supervise the dependency "
"parser.\n"
@ -510,3 +529,23 @@ cdef class ArcEager(TransitionSystem):
"State at failure:\n"
"%s" % (self.n_moves, stcls.print_state(gold.words)))
assert n_gold >= 1
def get_beam_annot(self, Beam beam):
length = (<StateClass>beam.at(0)).c.length
heads = [{} for _ in range(length)]
deps = [{} for _ in range(length)]
probs = beam.probs
for i in range(beam.size):
stcls = <StateClass>beam.at(i)
self.finalize_state(stcls.c)
if stcls.is_final():
prob = probs[i]
for j in range(stcls.c.length):
head = j + stcls.c._sent[j].head
dep = stcls.c._sent[j].dep
heads[j].setdefault(head, 0.0)
heads[j][head] += prob
deps[j].setdefault(dep, 0.0)
deps[j][dep] += prob
return heads, deps

View File

@ -107,7 +107,7 @@ cdef class BeamParser(Parser):
# The non-monotonic oracle makes it difficult to ensure final costs are
# correct. Therefore do final correction
for i in range(pred.size):
if is_gold(<StateClass>pred.at(i), gold_parse, self.moves.strings):
if self.moves.is_gold_parse(<StateClass>pred.at(i), gold_parse):
pred._states[i].loss = 0.0
elif pred._states[i].loss == 0.0:
pred._states[i].loss = 1.0
@ -213,7 +213,7 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio
if not pred._states[i].is_done or pred._states[i].loss == 0:
continue
state = <StateClass>pred.at(i)
if is_gold(state, gold_parse, moves.strings) == True:
if moves.is_gold_parse(state, gold_parse) == True:
for dep in gold_parse.orig_annot:
print(dep[1], dep[3], dep[4])
print("Cost", pred._states[i].loss)
@ -227,7 +227,7 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio
if not gold._states[i].is_done:
continue
state = <StateClass>gold.at(i)
if is_gold(state, gold_parse, moves.strings) == False:
if moves.is_gold_parse(state, gold_parse) == False:
print("Truth")
for dep in gold_parse.orig_annot:
print(dep[1], dep[3], dep[4])
@ -237,16 +237,3 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio
raise Exception("Gold parse is not gold-standard")
def is_gold(StateClass state, GoldParse gold, StringStore strings):
predicted = set()
truth = set()
for i in range(gold.length):
if gold.cand_to_gold[i] is None:
continue
if state.safe_get(i).dep:
predicted.add((i, state.H(i), strings[state.safe_get(i).dep]))
else:
predicted.add((i, state.H(i), 'ROOT'))
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
truth.add((id_, head, dep))
return truth == predicted

View File

@ -110,5 +110,35 @@ def es_noun_chunks(obj):
token = next_token(token)
def french_noun_chunks(obj):
labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
doc = obj.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add('conj')
np_label = doc.vocab.strings.add('NP')
seen = set()
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.i in seen:
continue
if word.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
elif word.dep == conj:
head = word.head
while head.dep == conj and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks,
'es': es_noun_chunks}
'es': es_noun_chunks, 'fr': french_noun_chunks}

View File

@ -2,7 +2,10 @@
from __future__ import unicode_literals
from thinc.typedefs cimport weight_t
from thinc.extra.search cimport Beam
from collections import OrderedDict
import numpy
from thinc.neural.ops import NumpyOps
from .stateclass cimport StateClass
from ._state cimport StateC
@ -110,7 +113,7 @@ cdef class BiluoPushDown(TransitionSystem):
def has_gold(self, GoldParse gold, start=0, end=None):
end = end or len(gold.ner)
if all([tag == '-' for tag in gold.ner[start:end]]):
if all([tag in ('-', None) for tag in gold.ner[start:end]]):
return False
else:
return True
@ -122,11 +125,46 @@ cdef class BiluoPushDown(TransitionSystem):
gold.c.ner[i] = self.lookup_transition(gold.ner[i])
return gold
def get_beam_annot(self, Beam beam):
entities = {}
probs = beam.probs
for i in range(beam.size):
stcls = <StateClass>beam.at(i)
if stcls.is_final():
self.finalize_state(stcls.c)
prob = probs[i]
for j in range(stcls.c._e_i):
start = stcls.c._ents[j].start
end = stcls.c._ents[j].end
label = stcls.c._ents[j].label
entities.setdefault((start, end, label), 0.0)
entities[(start, end, label)] += prob
return entities
def get_beam_parses(self, Beam beam):
parses = []
probs = beam.probs
for i in range(beam.size):
stcls = <StateClass>beam.at(i)
if stcls.is_final():
self.finalize_state(stcls.c)
prob = probs[i]
parse = []
for j in range(stcls.c._e_i):
start = stcls.c._ents[j].start
end = stcls.c._ents[j].end
label = stcls.c._ents[j].label
parse.append((start, end, self.strings[label]))
parses.append((prob, parse))
return parses
cdef Transition lookup_transition(self, object name) except *:
cdef attr_t label
if name == '-' or name == None:
move_str = 'M'
label = 0
elif name == '!O':
return Transition(clas=0, move=ISNT, label=0, score=0)
elif '-' in name:
move_str, label_str = name.split('-', 1)
# Hacky way to denote 'not this entity'
@ -308,6 +346,9 @@ cdef class In:
elif g_act == UNIT:
# I, Gold U --> True iff next tag == O
return next_act != OUT
# Support partial supervision in the form of "not this label"
elif g_act == ISNT:
return 0
else:
return 1
@ -350,6 +391,9 @@ cdef class Last:
elif g_act == UNIT:
# L, Gold U --> True
return 0
# Support partial supervision in the form of "not this label"
elif g_act == ISNT:
return 0
else:
return 1
@ -418,7 +462,9 @@ cdef class Out:
cdef int g_act = gold.ner[s.B(0)].move
cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == MISSING or g_act == ISNT:
if g_act == ISNT and g_tag == 0:
return 1
elif g_act == MISSING or g_act == ISNT:
return 0
elif g_act == BEGIN:
# O, Gold B --> False

View File

@ -29,21 +29,26 @@ from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.linalg cimport VecVec
from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
from thinc.extra.eg cimport Example
from thinc.extra.search cimport Beam
from cymem.cymem cimport Pool, Address
from murmurhash.mrmr cimport hash64
from preshed.maps cimport MapStruct
from preshed.maps cimport map_get
from thinc.api import layerize, chain, noop, clone
from thinc.neural import Model, Affine, ELU, ReLu, Maxout
from thinc.api import layerize, chain, noop, clone, with_flatten
from thinc.neural import Model, Affine, ReLu, Maxout
from thinc.neural._classes.batchnorm import BatchNorm as BN
from thinc.neural._classes.selu import SELU
from thinc.neural._classes.layernorm import LayerNorm
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module
from .. import util
from ..util import get_async, get_cuda_stream
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
from .._ml import Tok2Vec, doc2feats, rebatch
from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
from .._ml import Residual, drop_layer
from ..compat import json_dumps
from . import _parse_features
@ -58,8 +63,10 @@ from ..structs cimport TokenC
from ..tokens.doc cimport Doc
from ..strings cimport StringStore
from ..gold cimport GoldParse
from ..attrs cimport TAG, DEP
from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG
from . import _beam_utils
USE_FINE_TUNE = True
def get_templates(*args, **kwargs):
return []
@ -110,7 +117,6 @@ cdef class precompute_hiddens:
self.nO = cached.shape[2]
self.nP = getattr(lower_model, 'nP', 1)
self.ops = lower_model.ops
self._features = numpy.zeros((batch_size, self.nO*self.nP), dtype='f')
self._is_synchronized = False
self._cuda_stream = cuda_stream
self._cached = cached
@ -127,13 +133,12 @@ cdef class precompute_hiddens:
return self.begin_update(X)[0]
def begin_update(self, token_ids, drop=0.):
self._features.fill(0)
cdef np.ndarray state_vector = numpy.zeros((token_ids.shape[0], self.nO*self.nP), dtype='f')
# This is tricky, but (assuming GPU available);
# - Input to forward on CPU
# - Output from forward on CPU
# - Input to backward on GPU!
# - Output from backward on GPU
cdef np.ndarray state_vector = self._features[:len(token_ids)]
bp_hiddens = self._bp_hiddens
feat_weights = self.get_feat_weights()
@ -233,11 +238,14 @@ cdef class Parser:
Base class of the DependencyParser and EntityRecognizer.
"""
@classmethod
def Model(cls, nr_class, token_vector_width=128, hidden_width=128, depth=1, **cfg):
def Model(cls, nr_class, token_vector_width=128, hidden_width=300, depth=1, **cfg):
depth = util.env_opt('parser_hidden_depth', depth)
token_vector_width = util.env_opt('token_vector_width', token_vector_width)
hidden_width = util.env_opt('hidden_width', hidden_width)
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
embed_size = util.env_opt('embed_size', 4000)
tensors = fine_tune(Tok2Vec(token_vector_width, embed_size,
preprocess=doc2feats()))
if parser_maxout_pieces == 1:
lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
nF=cls.nr_feature,
@ -269,7 +277,7 @@ cdef class Parser:
'hidden_width': hidden_width,
'maxout_pieces': parser_maxout_pieces
}
return (lower, upper), cfg
return (tensors, lower, upper), cfg
def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
"""
@ -295,6 +303,10 @@ cdef class Parser:
self.moves = self.TransitionSystem(self.vocab.strings, {})
else:
self.moves = moves
if 'beam_width' not in cfg:
cfg['beam_width'] = util.env_opt('beam_width', 1)
if 'beam_density' not in cfg:
cfg['beam_density'] = util.env_opt('beam_density', 0.0)
self.cfg = cfg
if 'actions' in self.cfg:
for action, labels in self.cfg.get('actions', {}).items():
@ -305,7 +317,7 @@ cdef class Parser:
def __reduce__(self):
return (Parser, (self.vocab, self.moves, self.model), None, None)
def __call__(self, Doc doc):
def __call__(self, Doc doc, beam_width=None, beam_density=None):
"""
Apply the parser or entity recognizer, setting the annotations onto the Doc object.
@ -314,11 +326,26 @@ cdef class Parser:
Returns:
None
"""
states = self.parse_batch([doc], [doc.tensor])
self.set_annotations([doc], states)
return doc
if beam_width is None:
beam_width = self.cfg.get('beam_width', 1)
if beam_density is None:
beam_density = self.cfg.get('beam_density', 0.0)
cdef Beam beam
if beam_width == 1:
states = self.parse_batch([doc], [doc.tensor])
self.set_annotations([doc], states)
return doc
else:
beam = self.beam_parse([doc], [doc.tensor],
beam_width=beam_width, beam_density=beam_density)[0]
output = self.moves.get_beam_annot(beam)
state = <StateClass>beam.at(0)
self.set_annotations([doc], [state])
_cleanup(beam)
return output
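# Usage sketch for the beam_width branch above, mirroring the test_beam_parse
# test added later in this diff (assumes an installed 'en_core_web_sm' model;
# the keyword value is illustrative):
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Australia is a country', disable=['ner'])
ents = nlp.entity(doc, beam_width=2)   # beam_width > 1 routes through beam_parse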
def pipe(self, docs, int batch_size=1000, int n_threads=2):
def pipe(self, docs, int batch_size=1000, int n_threads=2,
beam_width=None, beam_density=None):
"""
Process a stream of documents.
@ -330,13 +357,23 @@ cdef class Parser:
The number of threads with which to work on the buffer in parallel.
Yields (Doc): Documents, in order.
"""
cdef StateClass parse_state
if beam_width is None:
beam_width = self.cfg.get('beam_width', 1)
if beam_density is None:
beam_density = self.cfg.get('beam_density', 0.0)
cdef Doc doc
queue = []
cdef Beam beam
for docs in cytoolz.partition_all(batch_size, docs):
docs = list(docs)
tokvecs = [d.tensor for d in docs]
parse_states = self.parse_batch(docs, tokvecs)
tokvecs = [doc.tensor for doc in docs]
if beam_width == 1:
parse_states = self.parse_batch(docs, tokvecs)
else:
beams = self.beam_parse(docs, tokvecs,
beam_width=beam_width, beam_density=beam_density)
parse_states = []
for beam in beams:
parse_states.append(<StateClass>beam.at(0))
self.set_annotations(docs, parse_states)
yield from docs
@ -351,8 +388,13 @@ cdef class Parser:
int nr_class, nr_feat, nr_piece, nr_dim, nr_state
if isinstance(docs, Doc):
docs = [docs]
if isinstance(tokvecses, np.ndarray):
tokvecses = [tokvecses]
tokvecs = self.model[0].ops.flatten(tokvecses)
if USE_FINE_TUNE:
# TODO: This is incorrect! Unhack when training next model
tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
nr_state = len(docs)
nr_class = self.moves.n_moves
@ -404,6 +446,55 @@ cdef class Parser:
next_step.push_back(st)
return states
def beam_parse(self, docs, tokvecses, int beam_width=3, float beam_density=0.001):
cdef Beam beam
cdef np.ndarray scores
cdef Doc doc
cdef int nr_class = self.moves.n_moves
cdef StateClass stcls, output
tokvecs = self.model[0].ops.flatten(tokvecses)
if USE_FINE_TUNE:
tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
cuda_stream = get_cuda_stream()
state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
cuda_stream, 0.0)
beams = []
cdef int offset = 0
cdef int j = 0
cdef int k
for doc in docs:
beam = Beam(nr_class, beam_width, min_density=beam_density)
beam.initialize(self.moves.init_beam_state, doc.length, doc.c)
for i in range(beam.width):
stcls = <StateClass>beam.at(i)
stcls.c.offset = offset
offset += len(doc)
beam.check_done(_check_final_state, NULL)
while not beam.is_done:
states = []
for i in range(beam.size):
stcls = <StateClass>beam.at(i)
# This way we avoid having to score finalized states
# We do have to take care to keep indexes aligned, though
if not stcls.is_final():
states.append(stcls)
token_ids = self.get_token_ids(states)
vectors = state2vec(token_ids)
scores = vec2scores(vectors)
j = 0
c_scores = <float*>scores.data
for i in range(beam.size):
stcls = <StateClass>beam.at(i)
if not stcls.is_final():
self.moves.set_valid(beam.is_valid[i], stcls.c)
for k in range(nr_class):
beam.scores[i][k] = c_scores[j * scores.shape[1] + k]
j += 1
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
beam.check_done(_check_final_state, NULL)
beams.append(beam)
return beams
cdef void _parse_step(self, StateC* state,
const float* feat_weights,
int nr_class, int nr_feat, int nr_piece) nogil:
@ -427,6 +518,12 @@ cdef class Parser:
free(token_ids)
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
if not any(self.moves.has_gold(gold) for gold in golds):
return None
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5:
return self.update_beam(docs_tokvecs, golds,
self.cfg['beam_width'], self.cfg['beam_density'],
drop=drop, sgd=sgd, losses=losses)
if losses is not None and self.name not in losses:
losses[self.name] = 0.
docs, tokvec_lists = docs_tokvecs
@ -434,6 +531,9 @@ cdef class Parser:
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
docs = [docs]
golds = [golds]
if USE_FINE_TUNE:
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
tokvecs += self.model[0].ops.flatten(my_tokvecs)
cuda_stream = get_cuda_stream()
@ -460,13 +560,14 @@ cdef class Parser:
scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
d_scores = self.get_batch_loss(states, golds, scores)
d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd)
d_scores /= len(docs)
d_vector = bp_scores(d_scores, sgd=sgd)
if drop != 0:
d_vector *= mask
if isinstance(self.model[0].ops, CupyOps) \
and not isinstance(token_ids, state2vec.ops.xp.ndarray):
# Move token_ids and d_vector to CPU, asynchronously
# Move token_ids and d_vector to GPU, asynchronously
backprops.append((
get_async(cuda_stream, token_ids),
get_async(cuda_stream, d_vector),
@ -483,7 +584,65 @@ cdef class Parser:
break
self._make_updates(d_tokvecs,
backprops, sgd, cuda_stream)
return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
if USE_FINE_TUNE:
d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
return d_tokvecs
def update_beam(self, docs_tokvecs, golds, width=None, density=None,
drop=0., sgd=None, losses=None):
if not any(self.moves.has_gold(gold) for gold in golds):
return None
if not golds:
return None
if width is None:
width = self.cfg.get('beam_width', 2)
if density is None:
density = self.cfg.get('beam_density', 0.0)
if losses is not None and self.name not in losses:
losses[self.name] = 0.
docs, tokvecs = docs_tokvecs
lengths = [len(d) for d in docs]
assert min(lengths) >= 1
tokvecs = self.model[0].ops.flatten(tokvecs)
if USE_FINE_TUNE:
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
tokvecs += self.model[0].ops.flatten(my_tokvecs)
states = self.moves.init_batch(docs)
for gold in golds:
self.moves.preprocess_gold(gold)
cuda_stream = get_cuda_stream()
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0)
states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
states, tokvecs, golds,
state2vec, vec2scores,
width, density,
sgd=sgd, drop=drop, losses=losses)
backprop_lower = []
cdef float batch_size = len(docs)
for i, d_scores in enumerate(states_d_scores):
d_scores /= batch_size
if losses is not None:
losses[self.name] += (d_scores**2).sum()
ids, bp_vectors, bp_scores = backprops[i]
d_vector = bp_scores(d_scores, sgd=sgd)
if isinstance(self.model[0].ops, CupyOps) \
and not isinstance(ids, state2vec.ops.xp.ndarray):
backprop_lower.append((
get_async(cuda_stream, ids),
get_async(cuda_stream, d_vector),
bp_vectors))
else:
backprop_lower.append((ids, d_vector, bp_vectors))
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream)
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths)
if USE_FINE_TUNE:
d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
return d_tokvecs
def _init_gold_batch(self, whole_docs, whole_golds):
"""Make a square batch, of length equal to the shortest doc. A long
@ -528,14 +687,10 @@ cdef class Parser:
xp = get_array_module(d_tokvecs)
for ids, d_vector, bp_vector in backprops:
d_state_features = bp_vector(d_vector, sgd=sgd)
active_feats = ids * (ids >= 0)
active_feats = active_feats.reshape((ids.shape[0], ids.shape[1], 1))
if hasattr(xp, 'scatter_add'):
xp.scatter_add(d_tokvecs,
ids, d_state_features * active_feats)
else:
xp.add.at(d_tokvecs,
ids, d_state_features * active_feats)
mask = ids >= 0
d_state_features *= mask.reshape(ids.shape + (1,))
self.model[0].ops.scatter_add(d_tokvecs, ids * mask,
d_state_features)
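# A NumPy-only sketch of the masked scatter-add just above, with toy shapes; the
# diff routes the same pattern through self.model[0].ops so it can also cover GPU arrays:
import numpy
d_tokvecs = numpy.zeros((5, 3), dtype='f')            # one row per token in the batch
ids = numpy.array([[0, 2, -1], [3, -1, -1]])          # feature token ids, -1 = padding
d_state_features = numpy.ones((2, 3, 3), dtype='f')   # gradient per state, feature, dim
mask = ids >= 0
d_state_features *= mask.reshape(ids.shape + (1,))    # zero the gradients of padded slots
numpy.add.at(d_tokvecs, ids * mask, d_state_features) # padded slots add zeros to row 0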
@property
def move_names(self):
@ -546,7 +701,7 @@ cdef class Parser:
return names
def get_batch_model(self, batch_size, tokvecs, stream, dropout):
lower, upper = self.model
_, lower, upper = self.model
state2vec = precompute_hiddens(batch_size, tokvecs,
lower, stream, drop=dropout)
return state2vec, upper
@ -560,7 +715,8 @@ cdef class Parser:
dtype='i', order='C')
c_ids = <int*>ids.data
for i, state in enumerate(states):
state.c.set_context_tokens(c_ids, n_tokens)
if not state.is_final():
state.c.set_context_tokens(c_ids, n_tokens)
c_ids += ids.shape[1]
return ids
@ -635,10 +791,12 @@ cdef class Parser:
def to_disk(self, path, **exclude):
serializers = {
'lower_model': lambda p: p.open('wb').write(
'tok2vec_model': lambda p: p.open('wb').write(
self.model[0].to_bytes()),
'upper_model': lambda p: p.open('wb').write(
'lower_model': lambda p: p.open('wb').write(
self.model[1].to_bytes()),
'upper_model': lambda p: p.open('wb').write(
self.model[2].to_bytes()),
'vocab': lambda p: self.vocab.to_disk(p),
'moves': lambda p: self.moves.to_disk(p, strings=False),
'cfg': lambda p: p.open('w').write(json_dumps(self.cfg))
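# With the extra tok2vec component, a parser directory written by to_disk now
# holds three weight files alongside the existing data (sketch of the layout;
# the directory name is illustrative):
#
#     parser/tok2vec_model
#     parser/lower_model
#     parser/upper_model
#     parser/vocab
#     parser/moves
#     parser/cfg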
@ -659,24 +817,29 @@ cdef class Parser:
self.model, cfg = self.Model(**self.cfg)
else:
cfg = {}
with (path / 'lower_model').open('rb') as file_:
with (path / 'tok2vec_model').open('rb') as file_:
bytes_data = file_.read()
self.model[0].from_bytes(bytes_data)
with (path / 'upper_model').open('rb') as file_:
with (path / 'lower_model').open('rb') as file_:
bytes_data = file_.read()
self.model[1].from_bytes(bytes_data)
with (path / 'upper_model').open('rb') as file_:
bytes_data = file_.read()
self.model[2].from_bytes(bytes_data)
self.cfg.update(cfg)
return self
def to_bytes(self, **exclude):
serializers = OrderedDict((
('lower_model', lambda: self.model[0].to_bytes()),
('upper_model', lambda: self.model[1].to_bytes()),
('tok2vec_model', lambda: self.model[0].to_bytes()),
('lower_model', lambda: self.model[1].to_bytes()),
('upper_model', lambda: self.model[2].to_bytes()),
('vocab', lambda: self.vocab.to_bytes()),
('moves', lambda: self.moves.to_bytes(strings=False)),
('cfg', lambda: ujson.dumps(self.cfg))
))
if 'model' in exclude:
exclude['tok2vec_model'] = True
exclude['lower_model'] = True
exclude['upper_model'] = True
exclude.pop('model')
@ -687,6 +850,7 @@ cdef class Parser:
('vocab', lambda b: self.vocab.from_bytes(b)),
('moves', lambda b: self.moves.from_bytes(b, strings=False)),
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
('tok2vec_model', lambda b: None),
('lower_model', lambda b: None),
('upper_model', lambda b: None)
))
@ -696,10 +860,12 @@ cdef class Parser:
self.model, cfg = self.Model(self.moves.n_moves)
else:
cfg = {}
if 'tok2vec_model' in msg:
self.model[0].from_bytes(msg['tok2vec_model'])
if 'lower_model' in msg:
self.model[0].from_bytes(msg['lower_model'])
self.model[1].from_bytes(msg['lower_model'])
if 'upper_model' in msg:
self.model[1].from_bytes(msg['upper_model'])
self.model[2].from_bytes(msg['upper_model'])
self.cfg.update(cfg)
return self
@ -762,3 +928,30 @@ cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actio
mode = i
score = scores[i]
return mode
# These are passed as callbacks to thinc.search.Beam
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
dest = <StateClass>_dest
src = <StateClass>_src
moves = <const Transition*>_moves
dest.clone(src)
moves[clas].do(dest.c, moves[clas].label)
cdef int _check_final_state(void* _state, void* extra_args) except -1:
return (<StateClass>_state).is_final()
def _cleanup(Beam beam):
for i in range(beam.width):
Py_XDECREF(<PyObject*>beam._states[i].content)
Py_XDECREF(<PyObject*>beam._parents[i].content)
cdef hash_t _hash_state(void* _state, void* _) except 0:
state = <StateClass>_state
if state.c.is_final():
return 1
else:
return state.c.hash()

View File

@ -99,6 +99,9 @@ cdef class TransitionSystem:
def preprocess_gold(self, GoldParse gold):
raise NotImplementedError
def is_gold_parse(self, StateClass state, GoldParse gold):
raise NotImplementedError
cdef Transition lookup_transition(self, object name) except *:
raise NotImplementedError
@ -107,6 +110,8 @@ cdef class TransitionSystem:
def is_valid(self, StateClass stcls, move_name):
action = self.lookup_transition(move_name)
if action.move == 0:
return False
return action.is_valid(stcls.c, action.label)
cdef int set_valid(self, int* is_valid, const StateC* st) nogil:
@ -137,6 +142,10 @@ cdef class TransitionSystem:
"the entity recognizer\n"
"The transition system has %d actions." % (self.n_moves))
def get_class_name(self, int clas):
act = self.c[clas]
return self.move_name(act.move, act.label)
def add_action(self, int action, label_name):
cdef attr_t label_id
if not isinstance(label_name, int):

View File

@ -11,9 +11,9 @@ from ..strings import StringStore
from .. import util
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb',
'nl', 'pl', 'pt', 'sv', 'xx']
_models = {'en': ['en_depent_web_sm', 'en_core_web_md'],
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id',
'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx']
_models = {'en': ['en_core_web_sm'],
'de': ['de_core_news_md'],
'fr': ['fr_depvec_web_lg'],
'xx': ['xx_ent_web_md']}
@ -86,6 +86,9 @@ def hu_tokenizer():
def fi_tokenizer():
return util.get_lang_class('fi').Defaults.create_tokenizer()
@pytest.fixture
def id_tokenizer():
return util.get_lang_class('id').Defaults.create_tokenizer()
@pytest.fixture
def sv_tokenizer():

View File

@ -2,12 +2,18 @@
from __future__ import unicode_literals
import pytest
from ....tokens.doc import Doc
@pytest.fixture
def en_lemmatizer(EN):
return EN.Defaults.create_lemmatizer()
@pytest.mark.models('en')
def test_doc_lemmatization(EN):
doc = Doc(EN.vocab, words=['bleed'])
doc[0].tag_ = 'VBP'
assert doc[0].lemma_ == 'bleed'
@pytest.mark.models('en')
@pytest.mark.parametrize('text,lemmas', [("aardwolves", ["aardwolf"]),
@ -19,6 +25,16 @@ def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
assert en_lemmatizer.noun(text) == set(lemmas)
@pytest.mark.models('en')
@pytest.mark.parametrize('text,lemmas', [("bleed", ["bleed"]),
("feed", ["feed"]),
("need", ["need"]),
("ring", ["ring"]),
("axes", ["axis", "axe", "ax"])])
def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
assert en_lemmatizer.noun(text) == set(lemmas)
@pytest.mark.xfail
@pytest.mark.models('en')
def test_en_lemmatizer_base_forms(en_lemmatizer):

View File

@ -25,7 +25,6 @@ def test_tag_names(EN):
doc = EN(text, disable=['parser'])
assert type(doc[2].pos) == int
assert isinstance(doc[2].pos_, six.text_type)
assert type(doc[2].dep) == int
assert isinstance(doc[2].dep_, six.text_type)
assert doc[2].tag_ == u'NNS'

View File

View File

@ -0,0 +1,115 @@
# coding: utf-8
"""Test that tokenizer prefixes, suffixes and infixes are handled correctly."""
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["(Ma'arif)"])
def test_tokenizer_splits_no_special(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["Ma'arif"])
def test_tokenizer_splits_no_punct(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 1
@pytest.mark.parametrize('text', ["(Ma'arif"])
def test_tokenizer_splits_prefix_punct(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["Ma'arif)"])
def test_tokenizer_splits_suffix_punct(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["(Ma'arif)"])
def test_tokenizer_splits_even_wrap(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["(Ma'arif?)"])
def test_tokenizer_splits_uneven_wrap(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 4
@pytest.mark.parametrize('text,length', [("S.Kom.", 1), ("SKom.", 2), ("(S.Kom.", 2)])
def test_tokenizer_splits_prefix_interact(id_tokenizer, text, length):
tokens = id_tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize('text', ["S.Kom.)"])
def test_tokenizer_splits_suffix_interact(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["(S.Kom.)"])
def test_tokenizer_splits_even_wrap_interact(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["(S.Kom.?)"])
def test_tokenizer_splits_uneven_wrap_interact(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 4
@pytest.mark.parametrize('text,length', [("gara-gara", 1), ("Jokowi-Ahok", 3), ("Sukarno-Hatta", 3)])
def test_tokenizer_splits_hyphens(id_tokenizer, text, length):
tokens = id_tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
def test_tokenizer_splits_numeric_range(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["ini.Budi", "Halo.Bandung"])
def test_tokenizer_splits_period_infix(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["Halo,Bandung", "satu,dua"])
def test_tokenizer_splits_comma_infix(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
assert tokens[0].text == text.split(",")[0]
assert tokens[1].text == ","
assert tokens[2].text == text.split(",")[1]
@pytest.mark.parametrize('text', ["halo...Bandung", "dia...pergi"])
def test_tokenizer_splits_ellipsis_infix(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
def test_tokenizer_splits_double_hyphen_infix(id_tokenizer):
tokens = id_tokenizer("Arsene Wenger--manajer Arsenal--melakukan konferensi pers.")
assert len(tokens) == 10
assert tokens[0].text == "Arsene"
assert tokens[1].text == "Wenger"
assert tokens[2].text == "--"
assert tokens[3].text == "manajer"
assert tokens[4].text == "Arsenal"
assert tokens[5].text == "--"
assert tokens[6].text == "melakukan"
assert tokens[7].text == "konferensi"
assert tokens[8].text == "pers"
assert tokens[9].text == "."

View File

@ -0,0 +1,10 @@
import spacy
import pytest
@pytest.mark.models
def test_beam_parse():
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Australia is a country', disable=['ner'])
ents = nlp.entity(doc, beam_width=2)
print(ents)

View File

@ -0,0 +1,73 @@
from __future__ import unicode_literals
import pytest
from ...vocab import Vocab
from ...syntax.ner import BiluoPushDown
from ...gold import GoldParse
from ...tokens import Doc
@pytest.fixture
def vocab():
return Vocab()
@pytest.fixture
def doc(vocab):
return Doc(vocab, words=['Casey', 'went', 'to', 'New', 'York', '.'])
@pytest.fixture
def entity_annots(doc):
casey = doc[0:1]
ny = doc[3:5]
return [(casey.start_char, casey.end_char, 'PERSON'),
(ny.start_char, ny.end_char, 'GPE')]
@pytest.fixture
def entity_types(entity_annots):
return sorted(set([label for (s, e, label) in entity_annots]))
@pytest.fixture
def tsys(vocab, entity_types):
actions = BiluoPushDown.get_actions(entity_types=entity_types)
return BiluoPushDown(vocab.strings, actions)
def test_get_oracle_moves(tsys, doc, entity_annots):
gold = GoldParse(doc, entities=entity_annots)
tsys.preprocess_gold(gold)
act_classes = tsys.get_oracle_sequence(doc, gold)
names = [tsys.get_class_name(act) for act in act_classes]
assert names == ['U-PERSON', 'O', 'O', 'B-GPE', 'L-GPE', 'O']
def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots):
entity_annots = [(s, e, '!' + label) for s, e, label in entity_annots]
gold = GoldParse(doc, entities=entity_annots)
for i, tag in enumerate(gold.ner):
if tag == 'L-!GPE':
gold.ner[i] = '-'
tsys.preprocess_gold(gold)
act_classes = tsys.get_oracle_sequence(doc, gold)
names = [tsys.get_class_name(act) for act in act_classes]
def test_get_oracle_moves_negative_entities2(tsys, vocab):
doc = Doc(vocab, words=['A', 'B', 'C', 'D'])
gold = GoldParse(doc, entities=[])
gold.ner = ['B-!PERSON', 'L-!PERSON', 'B-!PERSON', 'L-!PERSON']
tsys.preprocess_gold(gold)
act_classes = tsys.get_oracle_sequence(doc, gold)
names = [tsys.get_class_name(act) for act in act_classes]
def test_get_oracle_moves_negative_O(tsys, vocab):
doc = Doc(vocab, words=['A', 'B', 'C', 'D'])
gold = GoldParse(doc, entities=[])
gold.ner = ['O', '!O', 'O', '!O']
tsys.preprocess_gold(gold)
act_classes = tsys.get_oracle_sequence(doc, gold)
names = [tsys.get_class_name(act) for act in act_classes]

View File

@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
from thinc.neural import Model
from mock import Mock
import pytest
import numpy
@ -36,7 +35,7 @@ def parser(vocab, arc_eager):
@pytest.fixture
def model(arc_eager, tok2vec):
return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO)
return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO)[0]
@pytest.fixture
def doc(vocab):
@ -45,29 +44,50 @@ def doc(vocab):
@pytest.fixture
def gold(doc):
return GoldParse(doc, heads=[1, 1, 1], deps=['L', 'ROOT', 'R'])
def test_can_init_nn_parser(parser):
assert parser.model is None
def test_build_model(parser):
parser.model = Parser.Model(parser.moves.n_moves)
parser.model = Parser.Model(parser.moves.n_moves)[0]
assert parser.model is not None
@pytest.mark.xfail
def test_predict_doc(parser, tok2vec, model, doc):
doc.tensor = tok2vec([doc])
doc.tensor = tok2vec([doc])[0]
parser.model = model
parser(doc)
@pytest.mark.xfail
def test_update_doc(parser, tok2vec, model, doc, gold):
parser.model = model
tokvecs, bp_tokvecs = tok2vec.begin_update([doc])
d_tokvecs = parser.update((doc, tokvecs), gold)
assert d_tokvecs.shape == tokvecs.shape
d_tokvecs = parser.update(([doc], tokvecs), [gold])
assert d_tokvecs[0].shape == tokvecs[0].shape
def optimize(weights, gradient, key=None):
weights -= 0.001 * gradient
bp_tokvecs(d_tokvecs, sgd=optimize)
assert d_tokvecs.sum() == 0.
assert d_tokvecs[0].sum() == 0.
def test_predict_doc_beam(parser, tok2vec, model, doc):
doc.tensor = tok2vec([doc])[0]
parser.model = model
parser(doc, beam_width=32, beam_density=0.001)
for word in doc:
print(word.text, word.head, word.dep_)
def test_update_doc_beam(parser, tok2vec, model, doc, gold):
parser.model = model
tokvecs, bp_tokvecs = tok2vec.begin_update([doc])
d_tokvecs = parser.update_beam(([doc], tokvecs), [gold])
assert d_tokvecs[0].shape == tokvecs[0].shape
def optimize(weights, gradient, key=None):
weights -= 0.001 * gradient
bp_tokvecs(d_tokvecs, sgd=optimize)
assert d_tokvecs[0].sum() == 0.

View File

@ -0,0 +1,87 @@
from __future__ import unicode_literals
import pytest
import numpy
from thinc.api import layerize
from ...vocab import Vocab
from ...syntax.arc_eager import ArcEager
from ...tokens import Doc
from ...gold import GoldParse
from ...syntax._beam_utils import ParserBeam, update_beam
from ...syntax.stateclass import StateClass
@pytest.fixture
def vocab():
return Vocab()
@pytest.fixture
def moves(vocab):
aeager = ArcEager(vocab.strings, {})
aeager.add_action(2, 'nsubj')
aeager.add_action(3, 'dobj')
aeager.add_action(2, 'aux')
return aeager
@pytest.fixture
def docs(vocab):
return [Doc(vocab, words=['Rats', 'bite', 'things'])]
@pytest.fixture
def states(docs):
return [StateClass(doc) for doc in docs]
@pytest.fixture
def tokvecs(docs, vector_size):
output = []
for doc in docs:
vec = numpy.random.uniform(-0.1, 0.1, (len(doc), vector_size))
output.append(numpy.asarray(vec))
return output
@pytest.fixture
def golds(docs):
return [GoldParse(doc) for doc in docs]
@pytest.fixture
def batch_size(docs):
return len(docs)
@pytest.fixture
def beam_width():
return 4
@pytest.fixture
def vector_size():
return 6
@pytest.fixture
def beam(moves, states, golds, beam_width):
return ParserBeam(moves, states, golds, width=beam_width, density=0.0)
@pytest.fixture
def scores(moves, batch_size, beam_width):
return [
numpy.asarray(
numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)),
dtype='f')
for _ in range(batch_size)]
def test_create_beam(beam):
pass
def test_beam_advance(beam, scores):
beam.advance(scores)
def test_beam_advance_too_few_scores(beam, scores):
with pytest.raises(IndexError):
beam.advance(scores[:-1])

View File

@ -0,0 +1,12 @@
'''Test tokens compare correctly'''
from __future__ import unicode_literals
from ..util import get_doc
from ...vocab import Vocab
def test_issue1257():
doc1 = get_doc(Vocab(), ['a', 'b', 'c'])
doc2 = get_doc(Vocab(), ['a', 'c', 'e'])
assert doc1[0] != doc2[0]
assert not doc1[0] == doc2[0]

View File

@ -11,8 +11,8 @@ import pytest
def taggers(en_vocab):
tagger1 = Tagger(en_vocab)
tagger2 = Tagger(en_vocab)
tagger1.model = tagger1.Model(None, None)
tagger2.model = tagger2.Model(None, None)
tagger1.model = tagger1.Model(8, 8)
tagger2.model = tagger1.model
return (tagger1, tagger2)
@ -20,7 +20,6 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
tagger1, tagger2 = taggers
tagger1_b = tagger1.to_bytes()
tagger2_b = tagger2.to_bytes()
assert tagger1_b == tagger2_b
tagger1 = tagger1.from_bytes(tagger1_b)
assert tagger1.to_bytes() == tagger1_b
new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b)

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals
from ..util import get_doc
from ...attrs import ORTH, LENGTH
import pytest
@ -89,3 +90,19 @@ def test_spans_are_hashable(en_tokenizer):
span3 = tokens[0:2]
assert hash(span3) == hash(span1)
def test_spans_by_character(doc):
span1 = doc[1:-2]
span2 = doc.char_span(span1.start_char, span1.end_char, label='GPE')
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == 'GPE'
def test_span_to_array(doc):
span = doc[1:-2]
arr = span.to_array([ORTH, LENGTH])
assert arr.shape == (len(span), 2)
assert arr[0, 0] == span[0].orth
assert arr[0, 1] == len(span[0])

View File

@ -79,9 +79,9 @@ def add_vecs_to_vocab(vocab, vectors):
"""Add list of vector tuples to given vocab. All vectors need to have the
same length. Format: [("text", [1, 2, 3])]"""
length = len(vectors[0][1])
vocab.resize_vectors(length)
vocab.clear_vectors(length)
for word, vec in vectors:
vocab[word].vector = vec
vocab.set_vector(word, vec)
return vocab

View File

@ -14,10 +14,9 @@ def vectors():
@pytest.fixture()
def vocab(en_vocab, vectors):
#return add_vecs_to_vocab(en_vocab, vectors)
return None
add_vecs_to_vocab(en_vocab, vectors)
return en_vocab
@pytest.mark.xfail
def test_vectors_similarity_LL(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
lex1 = vocab[word1]
@ -31,7 +30,6 @@ def test_vectors_similarity_LL(vocab, vectors):
assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1))
@pytest.mark.xfail
def test_vectors_similarity_TT(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = get_doc(vocab, words=[word1, word2])
@ -44,21 +42,18 @@ def test_vectors_similarity_TT(vocab, vectors):
assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1]))
@pytest.mark.xfail
def test_vectors_similarity_TD(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = get_doc(vocab, words=[word1, word2])
assert doc.similarity(doc[0]) == doc[0].similarity(doc)
@pytest.mark.xfail
def test_vectors_similarity_DS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = get_doc(vocab, words=[word1, word2])
assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
@pytest.mark.xfail
def test_vectors_similarity_TS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = get_doc(vocab, words=[word1, word2])

View File

@ -2,6 +2,8 @@
from __future__ import unicode_literals
from ...vectors import Vectors
from ...tokenizer import Tokenizer
from ..util import add_vecs_to_vocab, get_doc
import numpy
import pytest
@ -11,22 +13,42 @@ import pytest
def strings():
return ["apple", "orange"]
@pytest.fixture
def vectors():
return [
("apple", [1, 2, 3]),
("orange", [-1, -2, -3]),
('and', [-1, -1, -1]),
('juice', [5, 5, 10]),
('pie', [7, 6.3, 8.9])]
@pytest.fixture
def data():
return numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype='f')
@pytest.fixture()
def vocab(en_vocab, vectors):
add_vecs_to_vocab(en_vocab, vectors)
return en_vocab
def test_init_vectors_with_data(strings, data):
v = Vectors(strings, data)
assert v.shape == data.shape
def test_init_vectors_with_width(strings):
v = Vectors(strings, 3)
for string in strings:
v.add(string)
assert v.shape == (len(strings), 3)
def test_get_vector(strings, data):
v = Vectors(strings, data)
for string in strings:
v.add(string)
assert list(v[strings[0]]) == list(data[0])
assert list(v[strings[0]]) != list(data[1])
assert list(v[strings[1]]) != list(data[0])
@ -35,6 +57,8 @@ def test_get_vector(strings, data):
def test_set_vector(strings, data):
orig = data.copy()
v = Vectors(strings, data)
for string in strings:
v.add(string)
assert list(v[strings[0]]) == list(orig[0])
assert list(v[strings[0]]) != list(orig[1])
v[strings[0]] = data[1]
@ -42,125 +66,111 @@ def test_set_vector(strings, data):
assert list(v[strings[0]]) != list(orig[0])
#
#@pytest.fixture()
#def tokenizer_v(vocab):
# return Tokenizer(vocab, {}, None, None, None)
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', ["apple and orange"])
#def test_vectors_token_vector(tokenizer_v, vectors, text):
# doc = tokenizer_v(text)
# assert vectors[0] == (doc[0].text, list(doc[0].vector))
# assert vectors[1] == (doc[2].text, list(doc[2].vector))
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', ["apple", "orange"])
#def test_vectors_lexeme_vector(vocab, text):
# lex = vocab[text]
# assert list(lex.vector)
# assert lex.vector_norm
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
#def test_vectors_doc_vector(vocab, text):
# doc = get_doc(vocab, text)
# assert list(doc.vector)
# assert doc.vector_norm
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
#def test_vectors_span_vector(vocab, text):
# span = get_doc(vocab, text)[0:2]
# assert list(span.vector)
# assert span.vector_norm
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', ["apple orange"])
#def test_vectors_token_token_similarity(tokenizer_v, text):
# doc = tokenizer_v(text)
# assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
# assert 0.0 < doc[0].similarity(doc[1]) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
#def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
# token = tokenizer_v(text1)
# lex = vocab[text2]
# assert token.similarity(lex) == lex.similarity(token)
# assert 0.0 < token.similarity(lex) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
#def test_vectors_token_span_similarity(vocab, text):
# doc = get_doc(vocab, text)
# assert doc[0].similarity(doc[1:3]) == doc[1:3].similarity(doc[0])
# assert 0.0 < doc[0].similarity(doc[1:3]) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
#def test_vectors_token_doc_similarity(vocab, text):
# doc = get_doc(vocab, text)
# assert doc[0].similarity(doc) == doc.similarity(doc[0])
# assert 0.0 < doc[0].similarity(doc) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
#def test_vectors_lexeme_span_similarity(vocab, text):
# doc = get_doc(vocab, text)
# lex = vocab[text[0]]
# assert lex.similarity(doc[1:3]) == doc[1:3].similarity(lex)
# assert 0.0 < doc.similarity(doc[1:3]) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
#def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
# lex1 = vocab[text1]
# lex2 = vocab[text2]
# assert lex1.similarity(lex2) == lex2.similarity(lex1)
# assert 0.0 < lex1.similarity(lex2) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
#def test_vectors_lexeme_doc_similarity(vocab, text):
# doc = get_doc(vocab, text)
# lex = vocab[text[0]]
# assert lex.similarity(doc) == doc.similarity(lex)
# assert 0.0 < lex.similarity(doc) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
#def test_vectors_span_span_similarity(vocab, text):
# doc = get_doc(vocab, text)
# assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
# assert 0.0 < doc[0:2].similarity(doc[1:3]) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
#def test_vectors_span_doc_similarity(vocab, text):
# doc = get_doc(vocab, text)
# assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
# assert 0.0 < doc[0:2].similarity(doc) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text1,text2', [
# (["apple", "and", "apple", "pie"], ["orange", "juice"])])
#def test_vectors_doc_doc_similarity(vocab, text1, text2):
# doc1 = get_doc(vocab, text1)
# doc2 = get_doc(vocab, text2)
# assert doc1.similarity(doc2) == doc2.similarity(doc1)
# assert 0.0 < doc1.similarity(doc2) < 1.0
@pytest.fixture()
def tokenizer_v(vocab):
return Tokenizer(vocab, {}, None, None, None)
@pytest.mark.parametrize('text', ["apple and orange"])
def test_vectors_token_vector(tokenizer_v, vectors, text):
doc = tokenizer_v(text)
assert vectors[0] == (doc[0].text, list(doc[0].vector))
assert vectors[1] == (doc[2].text, list(doc[2].vector))
@pytest.mark.parametrize('text', ["apple", "orange"])
def test_vectors_lexeme_vector(vocab, text):
lex = vocab[text]
assert list(lex.vector)
assert lex.vector_norm
@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
def test_vectors_doc_vector(vocab, text):
doc = get_doc(vocab, text)
assert list(doc.vector)
assert doc.vector_norm
@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
def test_vectors_span_vector(vocab, text):
span = get_doc(vocab, text)[0:2]
assert list(span.vector)
assert span.vector_norm
@pytest.mark.parametrize('text', ["apple orange"])
def test_vectors_token_token_similarity(tokenizer_v, text):
doc = tokenizer_v(text)
assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
assert -1. < doc[0].similarity(doc[1]) < 1.0
@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
token = tokenizer_v(text1)
lex = vocab[text2]
assert token.similarity(lex) == lex.similarity(token)
assert -1. < token.similarity(lex) < 1.0
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
def test_vectors_token_span_similarity(vocab, text):
doc = get_doc(vocab, text)
assert doc[0].similarity(doc[1:3]) == doc[1:3].similarity(doc[0])
assert -1. < doc[0].similarity(doc[1:3]) < 1.0
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
def test_vectors_token_doc_similarity(vocab, text):
doc = get_doc(vocab, text)
assert doc[0].similarity(doc) == doc.similarity(doc[0])
assert -1. < doc[0].similarity(doc) < 1.0
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
def test_vectors_lexeme_span_similarity(vocab, text):
doc = get_doc(vocab, text)
lex = vocab[text[0]]
assert lex.similarity(doc[1:3]) == doc[1:3].similarity(lex)
assert -1. < doc.similarity(doc[1:3]) < 1.0
@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
lex1 = vocab[text1]
lex2 = vocab[text2]
assert lex1.similarity(lex2) == lex2.similarity(lex1)
assert -1. < lex1.similarity(lex2) < 1.0
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
def test_vectors_lexeme_doc_similarity(vocab, text):
doc = get_doc(vocab, text)
lex = vocab[text[0]]
assert lex.similarity(doc) == doc.similarity(lex)
assert -1. < lex.similarity(doc) < 1.0
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
def test_vectors_span_span_similarity(vocab, text):
doc = get_doc(vocab, text)
assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
assert -1. < doc[0:2].similarity(doc[1:3]) < 1.0
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
def test_vectors_span_doc_similarity(vocab, text):
doc = get_doc(vocab, text)
assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
assert -1. < doc[0:2].similarity(doc) < 1.0
@pytest.mark.parametrize('text1,text2', [
(["apple", "and", "apple", "pie"], ["orange", "juice"])])
def test_vectors_doc_doc_similarity(vocab, text1, text2):
doc1 = get_doc(vocab, text1)
doc2 = get_doc(vocab, text2)
assert doc1.similarity(doc2) == doc2.similarity(doc1)
assert -1. < doc1.similarity(doc2) < 1.0

View File

@ -33,6 +33,7 @@ cdef class Doc:
cdef public object _vector_norm
cdef public object tensor
cdef public object cats
cdef public object user_data
cdef TokenC* c

View File

@ -117,6 +117,7 @@ cdef class Doc:
self.is_tagged = False
self.is_parsed = False
self.sentiment = 0.0
self.cats = {}
self.user_hooks = {}
self.user_token_hooks = {}
self.user_span_hooks = {}
@ -237,6 +238,29 @@ cdef class Doc:
def doc(self):
return self
def char_span(self, int start_idx, int end_idx, label=0, vector=None):
"""Create a `Span` object from the slice `doc.text[start : end]`.
doc (Doc): The parent document.
start (int): The index of the first character of the span.
end (int): The index of the first character after the span.
label (uint64 or string): A label to attach to the Span, e.g. for named entities.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
RETURNS (Span): The newly constructed object.
"""
if not isinstance(label, int):
label = self.vocab.strings.add(label)
cdef int start = token_by_start(self.c, self.length, start_idx)
if start == -1:
return None
cdef int end = token_by_end(self.c, self.length, end_idx)
if end == -1:
return None
# Currently we have the token index, we want the range-end index
end += 1
cdef Span span = Span(self, start, end, label=label, vector=vector)
return span
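# Usage sketch for char_span (the words and offsets are illustrative; compare
# the test_spans_by_character test added earlier in this diff):
from spacy.vocab import Vocab
from spacy.tokens import Doc
doc = Doc(Vocab(), words=['New', 'York', 'is', 'big'])
span = doc.char_span(0, 8, label='GPE')   # doc.text[0:8] == 'New York'
assert span.text == 'New York'
# Offsets that don't align with token boundaries return None instead of a Span.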
def similarity(self, other):
"""Make a semantic similarity estimate. The default estimate is cosine
similarity using an average of word vectors.
@ -279,8 +303,14 @@ cdef class Doc:
return self.user_hooks['vector'](self)
if self._vector is not None:
return self._vector
elif self.has_vector and len(self):
self._vector = sum(t.vector for t in self) / len(self)
elif not len(self):
self._vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
return self._vector
elif self.has_vector:
vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
for token in self.c[:self.length]:
vector += self.vocab.get_vector(token.lex.orth)
self._vector = vector / len(self)
return self._vector
elif self.tensor is not None:
self._vector = self.tensor.mean(axis=0)

View File

@ -15,5 +15,5 @@ cdef class Span:
cdef public _vector
cdef public _vector_norm
cpdef int _recalculate_indices(self) except -1
cpdef np.ndarray to_array(self, object features)

View File

@ -7,7 +7,7 @@ import numpy
import numpy.linalg
from libc.math cimport sqrt
from .doc cimport token_by_start, token_by_end
from .doc cimport token_by_start, token_by_end, get_token_attr
from ..structs cimport TokenC, LexemeC
from ..typedefs cimport flags_t, attr_t, hash_t
from ..attrs cimport attr_id_t
@ -135,6 +135,29 @@ cdef class Span:
return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
cpdef np.ndarray to_array(self, object py_attr_ids):
"""Given a list of M attribute IDs, export the tokens to a numpy
`ndarray` of shape `(N, M)`, where `N` is the length of the span.
The values will be 64-bit unsigned integers.
attr_ids (list[int]): A list of attribute ID ints.
RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
per word, and one column per attribute indicated in the input
`attr_ids`.
"""
cdef int i, j
cdef attr_id_t feature
cdef np.ndarray[attr_t, ndim=2] output
# Make an array from the attributes --- otherwise our inner loop is Python
# dict iteration.
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
cdef int length = self.end - self.start
output = numpy.ndarray(shape=(length, len(attr_ids)), dtype=numpy.uint64)
for i in range(self.start, self.end):
for j, feature in enumerate(attr_ids):
output[i-self.start, j] = get_token_attr(&self.doc.c[i], feature)
return output
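# Usage sketch matching the test_span_to_array test added earlier in this diff:
from spacy.attrs import ORTH, LENGTH
from spacy.vocab import Vocab
from spacy.tokens import Doc
doc = Doc(Vocab(), words=['give', 'it', 'back', 'please'])
arr = doc[1:3].to_array([ORTH, LENGTH])   # one row per token in the span
assert arr.shape == (2, 2)
assert arr[0, 1] == len(doc[1])           # LENGTH column holds the token length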
cpdef int _recalculate_indices(self) except -1:
if self.end > self.doc.length \
or self.doc.c[self.start].idx != self.start_char \

View File

@ -62,18 +62,26 @@ cdef class Token:
def __richcmp__(self, Token other, int op):
# http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html
cdef Doc my_doc = self.doc
cdef Doc other_doc = other.doc
my = self.idx
their = other.idx if other is not None else None
if op == 0:
return my < their
elif op == 2:
return my == their
if my_doc is other_doc:
return my == their
else:
return False
elif op == 4:
return my > their
elif op == 1:
return my <= their
elif op == 3:
return my != their
if my_doc is other_doc:
return my != their
else:
return True
elif op == 5:
return my >= their
else:

View File

@ -22,7 +22,7 @@ import ujson
from .symbols import ORTH
from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
from .compat import copy_array, normalize_string_keys, getattr_
from .compat import copy_array, normalize_string_keys, getattr_, import_file
LANGUAGES = {}
@ -112,15 +112,13 @@ def load_model(name, **overrides):
def load_model_from_link(name, **overrides):
"""Load a model from a shortcut link, or directory in spaCy data path."""
init_file = get_data_path() / name / '__init__.py'
spec = importlib.util.spec_from_file_location(name, init_file)
path = get_data_path() / name / '__init__.py'
try:
cls = importlib.util.module_from_spec(spec)
cls = import_file(name, path)
except AttributeError:
raise IOError(
"Cant' load '%s'. If you're using a shortcut link, make sure it "
"points to a valid model package (not just a data directory)." % name)
spec.loader.exec_module(cls)
return cls.load(**overrides)
@ -171,8 +169,8 @@ def get_model_meta(path):
raise IOError("Could not read meta.json from %s" % meta_path)
meta = read_json(meta_path)
for setting in ['lang', 'name', 'version']:
if setting not in meta:
raise ValueError('No %s setting found in model meta.json' % setting)
if setting not in meta or not meta[setting]:
raise ValueError("No valid '%s' setting found in model meta.json" % setting)
return meta

View File

@ -1,18 +1,25 @@
from __future__ import unicode_literals
from libc.stdint cimport int32_t, uint64_t
import numpy
from collections import OrderedDict
import msgpack
import msgpack_numpy
msgpack_numpy.patch()
cimport numpy as np
from .typedefs cimport attr_t
from .strings cimport StringStore
from . import util
from .compat import basestring_
cdef class Vectors:
'''Store, save and load word vectors.'''
cdef public object data
cdef readonly StringStore strings
cdef public object key2i
cdef public object key2row
cdef public object keys
cdef public int i
def __init__(self, strings, data_or_width):
self.strings = StringStore()
@ -21,10 +28,10 @@ cdef class Vectors:
dtype='f')
else:
data = data_or_width
self.i = 0
self.data = data
self.key2i = {}
for i, string in enumerate(strings):
self.key2i[self.strings.add(string)] = i
self.key2row = {}
self.keys = np.ndarray((self.data.shape[0],), dtype='uint64')
def __reduce__(self):
return (Vectors, (self.strings, self.data))
@ -32,7 +39,7 @@ cdef class Vectors:
def __getitem__(self, key):
if isinstance(key, basestring):
key = self.strings[key]
i = self.key2i[key]
i = self.key2row[key]
if i is None:
raise KeyError(key)
else:
@ -41,14 +48,36 @@ cdef class Vectors:
def __setitem__(self, key, vector):
if isinstance(key, basestring):
key = self.strings.add(key)
i = self.key2i[key]
i = self.key2row[key]
self.data[i] = vector
def __iter__(self):
yield from self.data
def __len__(self):
return len(self.strings)
return self.i
def __contains__(self, key):
if isinstance(key, basestring_):
key = self.strings[key]
return key in self.key2row
def add(self, key, vector=None):
if isinstance(key, basestring_):
key = self.strings.add(key)
if key not in self.key2row:
i = self.i
if i >= self.keys.shape[0]:
self.keys.resize((self.keys.shape[0]*2,))
self.data.resize((self.data.shape[0]*2, self.data.shape[1]))
self.key2row[key] = self.i
self.keys[self.i] = key
self.i += 1
else:
i = self.key2row[key]
if vector is not None:
self.data[i] = vector
return i
def items(self):
for i, string in enumerate(self.strings):
@ -61,34 +90,87 @@ cdef class Vectors:
def most_similar(self, key):
raise NotImplementedError
def to_disk(self, path):
raise NotImplementedError
def from_glove(self, path):
'''Load GloVe vectors from a directory. Assumes binary format,
that the vocab is in a vocab.txt, and that vectors are named
vectors.{size}.[fd].bin, e.g. vectors.128.f.bin for 128d float32
vectors, vectors.300.d.bin for 300d float64 (double) vectors, etc.
By default GloVe outputs 64-bit vectors.'''
path = util.ensure_path(path)
for name in path.iterdir():
if name.parts[-1].startswith('vectors'):
_, dims, dtype, _2 = name.parts[-1].split('.')
self.width = int(dims)
break
else:
raise IOError("Expected file named e.g. vectors.128.f.bin")
bin_loc = path / 'vectors.{dims}.{dtype}.bin'.format(dims=dims,
dtype=dtype)
with bin_loc.open('rb') as file_:
self.data = numpy.fromfile(file_, dtype='float64')
self.data = numpy.ascontiguousarray(self.data, dtype='float32')
n = 0
with (path / 'vocab.txt').open('r') as file_:
for line in file_:
self.add(line.strip())
n += 1
if (self.data.size % self.width) == 0:
self.data = self.data.reshape((self.data.size // self.width, self.width))
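# Directory layout expected by from_glove per the docstring above (a sketch; the
# directory name is illustrative, and vocab.txt is assumed to be row-aligned with
# the binary vectors file):
#
#     glove_dir/vocab.txt           one token per line
#     glove_dir/vectors.128.f.bin   128d float32 vectors
#     glove_dir/vectors.300.d.bin   or 300d float64 vectors, etc.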
def from_disk(self, path):
raise NotImplementedError
def to_disk(self, path, **exclude):
serializers = OrderedDict((
('vectors', lambda p: numpy.save(p.open('wb'), self.data, allow_pickle=False)),
('keys', lambda p: numpy.save(p.open('wb'), self.keys, allow_pickle=False)),
))
return util.to_disk(path, serializers, exclude)
def from_disk(self, path, **exclude):
def load_keys(path):
if path.exists():
self.keys = numpy.load(path)
for i, key in enumerate(self.keys):
self.keys[i] = key
self.key2row[key] = i
def load_vectors(path):
if path.exists():
self.data = numpy.load(path)
serializers = OrderedDict((
('keys', load_keys),
('vectors', load_vectors),
))
util.from_disk(path, serializers, exclude)
return self
def to_bytes(self, **exclude):
def serialize_weights():
if hasattr(self.weights, 'to_bytes'):
return self.weights.to_bytes()
if hasattr(self.data, 'to_bytes'):
return self.data.to_bytes()
else:
return msgpack.dumps(self.weights)
return msgpack.dumps(self.data)
serializers = OrderedDict((
('strings', lambda: self.strings.to_bytes()),
('weights', serialize_weights)
('keys', lambda: msgpack.dumps(self.keys)),
('vectors', serialize_weights)
))
return util.to_bytes(serializers, exclude)
def from_bytes(self, data, **exclude):
def deserialize_weights(b):
if hasattr(self.weights, 'from_bytes'):
self.weights.from_bytes()
if hasattr(self.data, 'from_bytes'):
self.data.from_bytes()
else:
self.weights = msgpack.loads(b)
self.data = msgpack.loads(b)
def load_keys(keys):
self.keys.resize((len(keys),))
for i, key in enumerate(keys):
self.keys[i] = key
self.key2row[key] = i
deserializers = OrderedDict((
('strings', lambda b: self.strings.from_bytes(b)),
('weights', deserialize_weights)
('keys', lambda b: load_keys(msgpack.loads(b))),
('vectors', deserialize_weights)
))
return util.from_bytes(deserializers, exclude)
util.from_bytes(data, deserializers, exclude)
return self
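# Sketch of the reworked Vectors API, following the vector tests added earlier
# in this diff (the words and values are illustrative):
import numpy
from spacy.vectors import Vectors
data = numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype='f')
v = Vectors(['apple', 'orange'], data)
v.add('apple')                      # registers the key; its row comes from `data`
v.add('orange')
assert 'apple' in v
assert list(v['apple']) == [0.0, 1.0, 2.0]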

View File

@ -4,6 +4,7 @@ from __future__ import unicode_literals
import bz2
import ujson
import re
import numpy
from libc.string cimport memset, memcpy
from libc.stdint cimport int32_t
@ -19,9 +20,10 @@ from .tokens.token cimport Token
from .attrs cimport PROB, LANG
from .structs cimport SerializedLexemeC
from .compat import copy_reg, pickle
from .compat import copy_reg, pickle, basestring_
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs
from .vectors import Vectors
from . import util
from . import attrs
from . import symbols
@ -63,6 +65,7 @@ cdef class Vocab:
self.strings.add(name)
self.lex_attr_getters = lex_attr_getters
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
self.vectors = Vectors(self.strings, 300)
property lang:
def __get__(self):
@ -242,13 +245,15 @@ cdef class Vocab:
@property
def vectors_length(self):
raise NotImplementedError
return self.vectors.data.shape[1]
def clear_vectors(self):
def clear_vectors(self, new_dim=None):
"""Drop the current vector table. Because all vectors must be the same
width, you have to call this to change the size of the vectors.
"""
raise NotImplementedError
if new_dim is None:
new_dim = self.vectors.data.shape[1]
self.vectors = Vectors(self.strings, new_dim)
def get_vector(self, orth):
"""Retrieve a vector for a word in the vocabulary.
@ -262,7 +267,12 @@ cdef class Vocab:
RAISES: If no vectors data is loaded, ValueError is raised.
"""
raise NotImplementedError
if isinstance(orth, basestring_):
orth = self.strings.add(orth)
if orth in self.vectors.key2row:
return self.vectors[orth]
else:
return numpy.zeros((self.vectors_length,), dtype='f')
def set_vector(self, orth, vector):
"""Set a vector for a word in the vocabulary.
@ -272,15 +282,19 @@ cdef class Vocab:
RETURNS:
None
"""
raise NotImplementedError
if not isinstance(orth, basestring_):
orth = self.strings[orth]
self.vectors.add(orth, vector=vector)
def has_vector(self, orth):
"""Check whether a word has a vector. Returns False if no
vectors have been loaded. Words can be looked up by string
or int ID."""
return False
if isinstance(orth, basestring_):
orth = self.strings.add(orth)
return orth in self.vectors
def to_disk(self, path):
def to_disk(self, path, **exclude):
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
@ -292,8 +306,10 @@ cdef class Vocab:
self.strings.to_disk(path / 'strings.json')
with (path / 'lexemes.bin').open('wb') as file_:
file_.write(self.lexemes_to_bytes())
if self.vectors is not None:
self.vectors.to_disk(path)
def from_disk(self, path):
def from_disk(self, path, **exclude):
"""Loads state from a directory. Modifies the object in place and
returns it.
@ -305,6 +321,8 @@ cdef class Vocab:
self.strings.from_disk(path / 'strings.json')
with (path / 'lexemes.bin').open('rb') as file_:
self.lexemes_from_bytes(file_.read())
if self.vectors is not None:
self.vectors.from_disk(path, exclude='strings.json')
return self
def to_bytes(self, **exclude):
@ -313,9 +331,16 @@ cdef class Vocab:
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `Vocab` object.
"""
def serialize_vectors():
if self.vectors is None:
return None
else:
return self.vectors.to_bytes(exclude='strings.json')
getters = OrderedDict((
('strings', lambda: self.strings.to_bytes()),
('lexemes', lambda: self.lexemes_to_bytes()),
('vectors', serialize_vectors)
))
return util.to_bytes(getters, exclude)
@ -326,9 +351,15 @@ cdef class Vocab:
**exclude: Named attributes to prevent from being loaded.
RETURNS (Vocab): The `Vocab` object.
"""
def deserialize_vectors(b):
if self.vectors is None:
return None
else:
return self.vectors.from_bytes(b, exclude='strings')
setters = OrderedDict((
('strings', lambda b: self.strings.from_bytes(b)),
('lexemes', lambda b: self.lexemes_from_bytes(b)),
('vectors', lambda b: deserialize_vectors(b))
))
util.from_bytes(bytes_data, setters, exclude)
return self
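# Sketch of the new vector methods on Vocab, following the updated
# add_vecs_to_vocab helper earlier in this diff (uses a blank English vocab;
# the words and values are illustrative):
import numpy
from spacy.lang.en import English
vocab = English().vocab
vocab.clear_vectors(3)                # drop the table and set the width to 3
vocab.set_vector('apple', numpy.asarray([1., 2., 3.], dtype='f'))
assert vocab.has_vector('apple')
assert not vocab.get_vector('orange').any()   # unknown words come back as zeros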

View File

@ -2,9 +2,8 @@
if [ "${VIA}" == "pypi" ]; then
rm -rf *
pip install spacy
python -m spacy.en.download
python -m spacy.de.download
pip install spacy-nightly
python -m spacy download en
fi
if [[ "${VIA}" == "sdist" && "${TRAVIS_PULL_REQUEST}" == "false" ]]; then

View File

@ -103,20 +103,20 @@ mixin button(url, trusted, ...style)
label - [string] aside title (optional or false for no label)
language - [string] language for syntax highlighting (default: "python")
supports basic relevant languages available for PrismJS
icon - [string] icon to display next to code block, mostly used for old/new
prompt - [string] prompt or icon to display next to code block, (mostly used for old/new)
height - [integer] optional height to clip code block to
mixin code(label, language, icon, height)
mixin code(label, language, prompt, height)
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes)
if label
h4.u-text-label.u-text-label--dark=label
- var icon = (prompt == 'accept' || prompt == 'reject')
if icon
- var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
.c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null)
+icon(icon, 18)
code.c-code-block__content
code.c-code-block__content(data-prompt=icon ? null : prompt)
block

View File

@ -112,6 +112,10 @@
.u-nowrap
white-space: nowrap
.u-break.u-break
word-wrap: break-word
white-space: initial
.u-no-border
border: none

View File

@ -35,6 +35,13 @@
font: normal normal 1.1rem/#{2} $font-code
padding: 1em 2em
&[data-prompt]:before
content: attr(data-prompt)
margin-right: 0.65em
display: inline-block
vertical-align: middle
opacity: 0.5
//- Inline code

Some files were not shown because too many files have changed in this diff.