diff --git a/setup.py b/setup.py index 0a3384ed5..6a22f4076 100755 --- a/setup.py +++ b/setup.py @@ -28,7 +28,9 @@ MOD_NAMES = [ 'spacy.pipeline', 'spacy.syntax.stateclass', 'spacy.syntax._state', + 'spacy.syntax._beam_utils', 'spacy.tokenizer', + 'spacy._cfile', 'spacy.syntax.parser', 'spacy.syntax.nn_parser', 'spacy.syntax.beam_parser', diff --git a/spacy/_cfile.pxd b/spacy/_cfile.pxd new file mode 100644 index 000000000..cb0077587 --- /dev/null +++ b/spacy/_cfile.pxd @@ -0,0 +1,26 @@ +from libc.stdio cimport fopen, fclose, fread, fwrite, FILE +from cymem.cymem cimport Pool + +cdef class CFile: + cdef FILE* fp + cdef bint is_open + cdef Pool mem + cdef int size # For compatibility with subclass + cdef int _capacity # For compatibility with subclass + + cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 + + cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1 + + cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except * + + + +cdef class StringCFile(CFile): + cdef unsigned char* data + + cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 + + cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1 + + cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except * diff --git a/spacy/_cfile.pyx b/spacy/_cfile.pyx new file mode 100644 index 000000000..ceebe2e59 --- /dev/null +++ b/spacy/_cfile.pyx @@ -0,0 +1,88 @@ +from libc.stdio cimport fopen, fclose, fread, fwrite, FILE +from libc.string cimport memcpy + + +cdef class CFile: + def __init__(self, loc, mode, on_open_error=None): + if isinstance(mode, unicode): + mode_str = mode.encode('ascii') + else: + mode_str = mode + if hasattr(loc, 'as_posix'): + loc = loc.as_posix() + self.mem = Pool() + cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc + self.fp = fopen(bytes_loc, mode_str) + if self.fp == NULL: + if on_open_error is not None: + on_open_error() + else: + raise IOError("Could not open binary file %s" % bytes_loc) + self.is_open = True + + def __dealloc__(self): + if self.is_open: + fclose(self.fp) + + def close(self): + fclose(self.fp) + self.is_open = False + + cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1: + st = fread(dest, elem_size, number, self.fp) + if st != number: + raise IOError + + cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1: + st = fwrite(src, elem_size, number, self.fp) + if st != number: + raise IOError + + cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *: + cdef void* dest = mem.alloc(number, elem_size) + self.read_into(dest, number, elem_size) + return dest + + def write_unicode(self, unicode value): + cdef bytes py_bytes = value.encode('utf8') + cdef char* chars = py_bytes + self.write(sizeof(char), len(py_bytes), chars) + + +cdef class StringCFile: + def __init__(self, mode, bytes data=b'', on_open_error=None): + self.mem = Pool() + self.is_open = 'w' in mode + self._capacity = max(len(data), 8) + self.size = len(data) + self.data = self.mem.alloc(1, self._capacity) + for i in range(len(data)): + self.data[i] = data[i] + + def close(self): + self.is_open = False + + def string_data(self): + return (self.data-self.size)[:self.size] + + cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1: + memcpy(dest, self.data, elem_size * number) + self.data += elem_size * number + + cdef int write_from(self, void* src, size_t 
elem_size, size_t number) except -1: + write_size = number * elem_size + if (self.size + write_size) >= self._capacity: + self._capacity = (self.size + write_size) * 2 + self.data = self.mem.realloc(self.data, self._capacity) + memcpy(&self.data[self.size], src, elem_size * number) + self.size += write_size + + cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *: + cdef void* dest = mem.alloc(number, elem_size) + self.read_into(dest, number, elem_size) + return dest + + def write_unicode(self, unicode value): + cdef bytes py_bytes = value.encode('utf8') + cdef char* chars = py_bytes + self.write(sizeof(char), len(py_bytes), chars) diff --git a/spacy/_ml.py b/spacy/_ml.py index f1ded666e..b3b0d3e46 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -5,10 +5,12 @@ from thinc.neural._classes.hash_embed import HashEmbed from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.util import get_array_module import random +import cytoolz from thinc.neural._classes.convolution import ExtractWindow from thinc.neural._classes.static_vectors import StaticVectors -from thinc.neural._classes.batchnorm import BatchNorm +from thinc.neural._classes.batchnorm import BatchNorm as BN +from thinc.neural._classes.layernorm import LayerNorm as LN from thinc.neural._classes.resnet import Residual from thinc.neural import ReLu from thinc.neural._classes.selu import SELU @@ -19,10 +21,12 @@ from thinc.api import FeatureExtracter, with_getitem from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool from thinc.neural._classes.attention import ParametricAttention from thinc.linear.linear import LinearModel -from thinc.api import uniqued, wrap +from thinc.api import uniqued, wrap, flatten_add_lengths + from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP from .tokens.doc import Doc +from . import util import numpy import io @@ -53,6 +57,27 @@ def _logistic(X, drop=0.): return Y, logistic_bwd +@layerize +def add_tuples(X, drop=0.): + """Give inputs of sequence pairs, where each sequence is (vals, length), + sum the values, returning a single sequence. + + If input is: + ((vals1, length), (vals2, length) + Output is: + (vals1+vals2, length) + + vals are a single tensor for the whole batch. + """ + (vals1, length1), (vals2, length2) = X + assert length1 == length2 + + def add_tuples_bwd(dY, sgd=None): + return (dY, dY) + + return (vals1+vals2, length), add_tuples_bwd + + def _zero_init(model): def _zero_init_impl(self, X, y): self.W.fill(0) @@ -61,6 +86,7 @@ def _zero_init(model): model.W.fill(0.) 
return model + @layerize def _preprocess_doc(docs, drop=0.): keys = [doc.to_array([LOWER]) for doc in docs] @@ -72,7 +98,6 @@ def _preprocess_doc(docs, drop=0.): return (keys, vals, lengths), None - def _init_for_precomputed(W, ops): if (W**2).sum() != 0.: return @@ -80,6 +105,7 @@ def _init_for_precomputed(W, ops): ops.xavier_uniform_init(reshaped) W[:] = reshaped.reshape(W.shape) + @describe.on_data(_set_dimensions_if_needed) @describe.attributes( nI=Dimension("Input size"), @@ -184,25 +210,36 @@ class PrecomputableMaxouts(Model): return Yfp, backward +def drop_layer(layer, factor=2.): + def drop_layer_fwd(X, drop=0.): + drop *= factor + mask = layer.ops.get_dropout_mask((1,), drop) + if mask is None or mask > 0: + return layer.begin_update(X, drop=drop) + else: + return X, lambda dX, sgd=None: dX + return wrap(drop_layer_fwd, layer) + + def Tok2Vec(width, embed_size, preprocess=None): - cols = [ID, NORM, PREFIX, SUFFIX, SHAPE] + cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}): - norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower') + norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower') prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix') suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix') shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape') - embed = (norm | prefix | suffix | shape ) + embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3)) tok2vec = ( with_flatten( asarray(Model.ops, dtype='uint64') - >> embed - >> Maxout(width, width*4, pieces=3) - >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)), - pad=4) + >> uniqued(embed, column=5) + >> drop_layer( + Residual( + (ExtractWindow(nW=1) >> BN(Maxout(width, width*3))) + ) + ) ** 4, pad=4 + ) ) if preprocess not in (False, None): tok2vec = preprocess >> tok2vec @@ -297,7 +334,8 @@ def zero_init(model): def doc2feats(cols=None): - cols = [ID, NORM, PREFIX, SUFFIX, SHAPE] + if cols is None: + cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] def forward(docs, drop=0.): feats = [] for doc in docs: @@ -323,6 +361,37 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.): return vectors, backward +def fine_tune(embedding, combine=None): + if combine is not None: + raise NotImplementedError( + "fine_tune currently only supports addition. 
Set combine=None") + def fine_tune_fwd(docs_tokvecs, drop=0.): + docs, tokvecs = docs_tokvecs + lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i') + + vecs, bp_vecs = embedding.begin_update(docs, drop=drop) + flat_tokvecs = embedding.ops.flatten(tokvecs) + flat_vecs = embedding.ops.flatten(vecs) + output = embedding.ops.unflatten( + (model.mix[0] * flat_vecs + model.mix[1] * flat_tokvecs), + lengths) + + def fine_tune_bwd(d_output, sgd=None): + bp_vecs(d_output, sgd=sgd) + flat_grad = model.ops.flatten(d_output) + model.d_mix[1] += flat_tokvecs.dot(flat_grad.T).sum() + model.d_mix[0] += flat_vecs.dot(flat_grad.T).sum() + if sgd is not None: + sgd(model._mem.weights, model._mem.gradient, key=model.id) + return d_output + return output, fine_tune_bwd + model = wrap(fine_tune_fwd, embedding) + model.mix = model._mem.add((model.id, 'mix'), (2,)) + model.mix.fill(1.) + model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix')) + return model + + @layerize def flatten(seqs, drop=0.): if isinstance(seqs[0], numpy.ndarray): @@ -369,6 +438,27 @@ def preprocess_doc(docs, drop=0.): vals = ops.allocate(keys.shape[0]) + 1 return (keys, vals, lengths), None +def getitem(i): + def getitem_fwd(X, drop=0.): + return X[i], None + return layerize(getitem_fwd) + +def build_tagger_model(nr_class, token_vector_width, **cfg): + embed_size = util.env_opt('embed_size', 7500) + with Model.define_operators({'>>': chain, '+': add}): + # Input: (doc, tensor) tuples + private_tok2vec = Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats()) + + model = ( + fine_tune(private_tok2vec) + >> with_flatten( + Maxout(token_vector_width, token_vector_width) + >> Softmax(nr_class, token_vector_width) + ) + ) + model.nI = None + return model + def build_text_classifier(nr_class, width=64, **cfg): nr_vector = cfg.get('nr_vector', 200) @@ -383,7 +473,7 @@ def build_text_classifier(nr_class, width=64, **cfg): >> _flatten_add_lengths >> with_getitem(0, uniqued( - (embed_lower | embed_prefix | embed_suffix | embed_shape) + (embed_lower | embed_prefix | embed_suffix | embed_shape) >> Maxout(width, width+(width//2)*3)) >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) @@ -404,7 +494,7 @@ def build_text_classifier(nr_class, width=64, **cfg): >> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0)) >> logistic ) - + model.lsuv = False return model diff --git a/spacy/about.py b/spacy/about.py index 9f62c769e..d494f8d31 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -3,7 +3,7 @@ # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py __title__ = 'spacy-nightly' -__version__ = '2.0.0a9' +__version__ = '2.0.0a10' __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' __uri__ = 'https://spacy.io' __author__ = 'Explosion AI' diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index a0a76e5ec..fef6753e6 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -21,10 +21,10 @@ CONVERTERS = { @plac.annotations( input_file=("input file", "positional", None, str), output_dir=("output directory for converted file", "positional", None, str), - n_sents=("Number of sentences per doc", "option", "n", float), + n_sents=("Number of sentences per doc", "option", "n", int), morphology=("Enable appending morphology to tags", "flag", "m", bool) ) -def convert(cmd, input_file, output_dir, n_sents, morphology): +def convert(cmd, input_file, output_dir, n_sents=1, morphology=False): 
""" Convert files into JSON format for use with train command and other experiment management functions. diff --git a/spacy/cli/train.py b/spacy/cli/train.py index af028dae5..04aac8319 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -91,15 +91,14 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, for batch in minibatch(train_docs, size=batch_sizes): docs, golds = zip(*batch) nlp.update(docs, golds, sgd=optimizer, - drop=next(dropout_rates), losses=losses) + drop=next(dropout_rates), losses=losses, + update_tensors=True) pbar.update(sum(len(doc) for doc in docs)) with nlp.use_params(optimizer.averages): util.set_env_log(False) epoch_model_path = output_path / ('model%d' % i) nlp.to_disk(epoch_model_path) - with (output_path / ('model%d.pickle' % i)).open('wb') as file_: - dill.dump(nlp, file_, -1) nlp_loaded = lang_class(pipeline=pipeline) nlp_loaded = nlp_loaded.from_disk(epoch_model_path) scorer = nlp_loaded.evaluate( diff --git a/spacy/compat.py b/spacy/compat.py index 4ef24cd8b..e6b7c066b 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -46,19 +46,21 @@ is_osx = sys.platform == 'darwin' if is_python2: + import imp bytes_ = str unicode_ = unicode basestring_ = basestring input_ = raw_input - json_dumps = lambda data: ujson.dumps(data, indent=2).decode('utf8') + json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False).decode('utf8') path2str = lambda path: str(path).decode('utf8') elif is_python3: + import importlib.util bytes_ = bytes unicode_ = str basestring_ = str input_ = input - json_dumps = lambda data: ujson.dumps(data, indent=2) + json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False) path2str = lambda path: str(path) @@ -102,3 +104,12 @@ def normalize_string_keys(old): return new +def import_file(name, loc): + loc = str(loc) + if is_python2: + return imp.load_source(name, loc) + else: + spec = importlib.util.spec_from_file_location(name, str(loc)) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module diff --git a/spacy/lang/da/examples.py b/spacy/lang/da/examples.py new file mode 100644 index 000000000..549f71fb5 --- /dev/null +++ b/spacy/lang/da/examples.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.da.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Apple overvejer at købe et britisk statup for 1 milliard dollar", + "Selvkørende biler flytter forsikringsansvaret over på producenterne", + "San Francisco overvejer at forbyde leverandørrobotter på fortov", + "London er en stor by i Storbritannien" +] diff --git a/spacy/lang/de/examples.py b/spacy/lang/de/examples.py new file mode 100644 index 000000000..49ac0e14b --- /dev/null +++ b/spacy/lang/de/examples.py @@ -0,0 +1,22 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. 
+ +>>> from spacy.lang.de.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen", + "Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz", + "Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz", + "Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion", + "San Francisco erwägt Verbot von Lieferrobotern", + "Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller", + "Wo bist du?", + "Was ist die Hauptstadt von Deutschland?" +] diff --git a/spacy/lang/en/examples.py b/spacy/lang/en/examples.py new file mode 100644 index 000000000..b92d4a65c --- /dev/null +++ b/spacy/lang/en/examples.py @@ -0,0 +1,22 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.en.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Apple is looking at buying U.K. startup for $1 billion", + "Autonomous cars shift insurance liability toward manufacturers", + "San Francisco considers banning sidewalk delivery robots", + "London is a big city in the United Kingdom.", + "Where are you?", + "Who is the president of France?", + "What is the capital of the United States?", + "When was Barack Obama born?" +] diff --git a/spacy/lang/es/examples.py b/spacy/lang/es/examples.py new file mode 100644 index 000000000..61fe8c9be --- /dev/null +++ b/spacy/lang/es/examples.py @@ -0,0 +1,22 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.es.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares", + "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes", + "San Francisco analiza prohibir los robots delivery", + "Londres es una gran ciudad del Reino Unido", + "El gato come pescado", + "Veo al hombre con el telescopio", + "La araña come moscas", + "El pingüino incuba en su nido" +] diff --git a/spacy/lang/fr/examples.py b/spacy/lang/fr/examples.py new file mode 100644 index 000000000..08409ea61 --- /dev/null +++ b/spacy/lang/fr/examples.py @@ -0,0 +1,26 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.fr.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Apple cherche a acheter une startup anglaise pour 1 milliard de dollard", + "Les voitures autonomes voient leur assurances décalées vers les constructeurs", + "San Francisco envisage d'interdire les robots coursiers", + "Londres est une grande ville du Royaume-Uni", + "L’Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d’Europe", + "Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon", + "La France ne devrait pas manquer d'électricité cet été, même en cas de canicule", + "Nouvelles attaques de Trump contre le maire de Londres", + "Où es-tu ?", + "Qui est le président de la France ?", + "Où est la capitale des Etats-Unis ?", + "Quand est né Barack Obama ?" 
+] diff --git a/spacy/lang/he/examples.py b/spacy/lang/he/examples.py new file mode 100644 index 000000000..f99f4814b --- /dev/null +++ b/spacy/lang/he/examples.py @@ -0,0 +1,28 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.he.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + 'סין מקימה קרן של 440 מיליון דולר להשקעה בהייטק בישראל', + 'רה"מ הודיע כי יחרים טקס בחסותו', + 'הכנסת צפויה לאשר איכון אוטומטי של שיחות למוקד 100', + 'תוכנית לאומית תהפוך את ישראל למעצמה דיגיטלית', + 'סע לשלום, המפתחות בפנים.', + 'מלצר, פעמיים טורקי!', + 'ואהבת לרעך כמוך.', + 'היום נעשה משהו בלתי נשכח.', + 'איפה הילד?', + 'מיהו נשיא צרפת?', + 'מהי בירת ארצות הברית?', + "איך קוראים בעברית לצ'ופצ'יק של הקומקום?", + 'מה הייתה הדקה?', + 'מי אומר שלום ראשון, זה שעולה או זה שיורד?' +] diff --git a/spacy/lang/it/examples.py b/spacy/lang/it/examples.py new file mode 100644 index 000000000..d35b9f834 --- /dev/null +++ b/spacy/lang/it/examples.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.it.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari", + "Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori", + "San Francisco prevede di bandire i robot di consegna porta a porta", + "Londra è una grande città del Regno Unito." +] diff --git a/spacy/lang/nb/examples.py b/spacy/lang/nb/examples.py new file mode 100644 index 000000000..0dc5c8144 --- /dev/null +++ b/spacy/lang/nb/examples.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.nb.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar", + "Selvkjørende biler flytter forsikringsansvaret over på produsentene ", + "San Francisco vurderer å forby robotbud på fortauene", + "London er en stor by i Storbritannia." +] diff --git a/spacy/lang/pl/examples.py b/spacy/lang/pl/examples.py new file mode 100644 index 000000000..af6c72af0 --- /dev/null +++ b/spacy/lang/pl/examples.py @@ -0,0 +1,20 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.pl.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Poczuł przyjemną woń mocnej kawy.", + "Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.", + "Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.", + "Nowy abonament pod lupą Komisji Europejskiej", + "Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?", + "Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”." +] diff --git a/spacy/lang/pt/examples.py b/spacy/lang/pt/examples.py new file mode 100644 index 000000000..239929215 --- /dev/null +++ b/spacy/lang/pt/examples.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. 
+ +>>> from spacy.lang.pt.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares", + "Carros autônomos empurram a responsabilidade do seguro para os fabricantes." + "São Francisco considera banir os robôs de entrega que andam pelas calçadas", + "Londres é a maior cidade do Reino Unido" +] diff --git a/spacy/lang/sv/examples.py b/spacy/lang/sv/examples.py new file mode 100644 index 000000000..be279c4bd --- /dev/null +++ b/spacy/lang/sv/examples.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.sv.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Apple överväger att köpa brittisk startup för 1 miljard dollar.", + "Självkörande bilar förskjuter försäkringsansvar mot tillverkare.", + "San Fransisco överväger förbud mot leveransrobotar på trottoarer.". + "London är en storstad i Storbritannien." +] diff --git a/spacy/language.py b/spacy/language.py index 0284c4636..ed880d9ca 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -95,7 +95,7 @@ class BaseDefaults(object): meta = nlp.meta if nlp is not None else {} # Resolve strings, like "cnn", "lstm", etc pipeline = [] - for entry in cls.pipeline: + for entry in meta.get('pipeline', []): if entry in disable or getattr(entry, 'name', entry) in disable: continue factory = cls.Defaults.factories[entry] @@ -277,7 +277,8 @@ class Language(object): def make_doc(self, text): return self.tokenizer(text) - def update(self, docs, golds, drop=0., sgd=None, losses=None): + def update(self, docs, golds, drop=0., sgd=None, losses=None, + update_tensors=False): """Update the models in the pipeline. docs (iterable): A batch of `Doc` objects. @@ -304,14 +305,17 @@ class Language(object): grads[key] = (W, dW) pipes = list(self.pipeline[1:]) random.shuffle(pipes) + tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop) + all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses] for proc in pipes: if not hasattr(proc, 'update'): continue - tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop) d_tokvecses = proc.update((docs, tokvecses), golds, drop=drop, sgd=get_grads, losses=losses) - if d_tokvecses is not None: - bp_tokvecses(d_tokvecses, sgd=sgd) + if update_tensors and d_tokvecses is not None: + for i, d_tv in enumerate(d_tokvecses): + all_d_tokvecses[i] += d_tv + bp_tokvecses(all_d_tokvecses, sgd=sgd) for key, (W, dW) in grads.items(): sgd(W, dW, key=key) # Clear the tensor variable, to free GPU memory. @@ -381,9 +385,18 @@ class Language(object): return optimizer def evaluate(self, docs_golds): - docs, golds = zip(*docs_golds) scorer = Scorer() - for doc, gold in zip(self.pipe(docs, batch_size=32), golds): + docs, golds = zip(*docs_golds) + docs = list(docs) + golds = list(golds) + for pipe in self.pipeline: + if not hasattr(pipe, 'pipe'): + for doc in docs: + pipe(doc) + else: + docs = list(pipe.pipe(docs)) + assert len(docs) == len(golds) + for doc, gold in zip(docs, golds): scorer.score(doc, gold) doc.tensor = None return scorer @@ -417,11 +430,16 @@ class Language(object): except StopIteration: pass - def pipe(self, texts, tuples=False, n_threads=2, batch_size=1000, disable=[]): + def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000, + disable=[]): """Process texts as a stream, and yield `Doc` objects in order. 
Supports GIL-free multi-threading. texts (iterator): A sequence of texts to process. + as_tuples (bool): + If set to True, inputs should be a sequence of + (text, context) tuples. Output will then be a sequence of + (doc, context) tuples. Defaults to False. n_threads (int): The number of worker threads to use. If -1, OpenMP will decide how many to use at run time. Default is 2. batch_size (int): The number of texts to buffer. @@ -433,7 +451,7 @@ class Language(object): >>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4): >>> assert doc.is_parsed """ - if tuples: + if as_tuples: text_context1, text_context2 = itertools.tee(texts) texts = (tc[0] for tc in text_context1) contexts = (tc[1] for tc in text_context2) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 947f0a1f1..634d3e4b5 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -42,7 +42,7 @@ from .compat import json_dumps from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats -from ._ml import build_text_classifier +from ._ml import build_text_classifier, build_tagger_model from .parts_of_speech import X @@ -138,7 +138,7 @@ class TokenVectorEncoder(BaseThincComponent): name = 'tensorizer' @classmethod - def Model(cls, width=128, embed_size=7500, **cfg): + def Model(cls, width=128, embed_size=4000, **cfg): """Create a new statistical model for the class. width (int): Output size of the model. @@ -253,23 +253,25 @@ class NeuralTagger(BaseThincComponent): self.cfg = dict(cfg) def __call__(self, doc): - tags = self.predict([doc.tensor]) + tags = self.predict(([doc], [doc.tensor])) self.set_annotations([doc], tags) return doc def pipe(self, stream, batch_size=128, n_threads=-1): for docs in cytoolz.partition_all(batch_size, stream): + docs = list(docs) tokvecs = [d.tensor for d in docs] - tag_ids = self.predict(tokvecs) + tag_ids = self.predict((docs, tokvecs)) self.set_annotations(docs, tag_ids) yield from docs - def predict(self, tokvecs): - scores = self.model(tokvecs) + def predict(self, docs_tokvecs): + scores = self.model(docs_tokvecs) scores = self.model.ops.flatten(scores) guesses = scores.argmax(axis=1) if not isinstance(guesses, numpy.ndarray): guesses = guesses.get() + tokvecs = docs_tokvecs[1] guesses = self.model.ops.unflatten(guesses, [tv.shape[0] for tv in tokvecs]) return guesses @@ -282,6 +284,8 @@ class NeuralTagger(BaseThincComponent): cdef Vocab vocab = self.vocab for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] + if hasattr(doc_tag_ids, 'get'): + doc_tag_ids = doc_tag_ids.get() for j, tag_id in enumerate(doc_tag_ids): # Don't clobber preset POS tags if doc.c[j].tag == 0 and doc.c[j].pos == 0: @@ -294,8 +298,7 @@ class NeuralTagger(BaseThincComponent): if self.model.nI is None: self.model.nI = tokvecs[0].shape[1] - - tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop) + tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop) loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd) @@ -346,10 +349,8 @@ class NeuralTagger(BaseThincComponent): @classmethod def Model(cls, n_tags, token_vector_width): - return with_flatten( - chain(Maxout(token_vector_width, token_vector_width), - Softmax(n_tags, token_vector_width))) - + return build_tagger_model(n_tags, token_vector_width) + def use_params(self, params): with self.model.use_params(params): yield @@ -432,7 +433,7 @@ class NeuralLabeller(NeuralTagger): @property def 
labels(self): - return self.cfg.get('labels', {}) + return self.cfg.setdefault('labels', {}) @labels.setter def labels(self, value): @@ -455,10 +456,8 @@ class NeuralLabeller(NeuralTagger): @classmethod def Model(cls, n_tags, token_vector_width): - return with_flatten( - chain(Maxout(token_vector_width, token_vector_width), - Softmax(n_tags, token_vector_width))) - + return build_tagger_model(n_tags, token_vector_width) + def get_loss(self, docs, golds, scores): scores = self.model.ops.flatten(scores) cdef int idx = 0 diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 2e42b9667..6f676c79a 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -215,7 +215,10 @@ cdef class StringStore: path = util.ensure_path(path) with path.open('r') as file_: strings = ujson.load(file_) + prev = list(self) self._reset_and_load(strings) + for word in prev: + self.add(word) return self def to_bytes(self, **exclude): @@ -234,7 +237,10 @@ cdef class StringStore: RETURNS (StringStore): The `StringStore` object. """ strings = ujson.loads(bytes_data) + prev = list(self) self._reset_and_load(strings) + for word in prev: + self.add(word) return self def set_frozen(self, bint is_frozen): diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx new file mode 100644 index 000000000..4d90fe23b --- /dev/null +++ b/spacy/syntax/_beam_utils.pyx @@ -0,0 +1,286 @@ +# cython: infer_types=True +# cython: profile=True +cimport numpy as np +import numpy +from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF +from thinc.extra.search cimport Beam +from thinc.extra.search import MaxViolation +from thinc.typedefs cimport hash_t, class_t +from thinc.extra.search cimport MaxViolation + +from .transition_system cimport TransitionSystem, Transition +from .stateclass cimport StateClass +from ..gold cimport GoldParse +from ..tokens.doc cimport Doc + + +# These are passed as callbacks to thinc.search.Beam +cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: + dest = _dest + src = _src + moves = _moves + dest.clone(src) + moves[clas].do(dest.c, moves[clas].label) + + +cdef int _check_final_state(void* _state, void* extra_args) except -1: + return (_state).is_final() + + +def _cleanup(Beam beam): + for i in range(beam.width): + Py_XDECREF(beam._states[i].content) + Py_XDECREF(beam._parents[i].content) + + +cdef hash_t _hash_state(void* _state, void* _) except 0: + state = _state + if state.c.is_final(): + return 1 + else: + return state.c.hash() + + +cdef class ParserBeam(object): + cdef public TransitionSystem moves + cdef public object states + cdef public object golds + cdef public object beams + cdef public object dones + + def __init__(self, TransitionSystem moves, states, golds, + int width, float density): + self.moves = moves + self.states = states + self.golds = golds + self.beams = [] + cdef Beam beam + cdef StateClass state, st + for state in states: + beam = Beam(self.moves.n_moves, width, density) + beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent) + for i in range(beam.width): + st = beam.at(i) + st.c.offset = state.c.offset + self.beams.append(beam) + self.dones = [False] * len(self.beams) + + def __dealloc__(self): + if self.beams is not None: + for beam in self.beams: + if beam is not None: + _cleanup(beam) + + @property + def is_done(self): + return all(b.is_done or self.dones[i] for i, b in enumerate(self.beams)) + + def __getitem__(self, i): + return self.beams[i] + + def __len__(self): + return len(self.beams) + + def 
advance(self, scores, follow_gold=False): + cdef Beam beam + for i, beam in enumerate(self.beams): + if beam.is_done or not scores[i].size or self.dones[i]: + continue + self._set_scores(beam, scores[i]) + if self.golds is not None: + self._set_costs(beam, self.golds[i], follow_gold=follow_gold) + if follow_gold: + beam.advance(_transition_state, NULL, self.moves.c) + else: + beam.advance(_transition_state, _hash_state, self.moves.c) + beam.check_done(_check_final_state, NULL) + if beam.is_done and self.golds is not None: + for j in range(beam.size): + state = beam.at(j) + if state.is_final(): + try: + if self.moves.is_gold_parse(state, self.golds[i]): + beam._states[j].loss = 0.0 + elif beam._states[j].loss == 0.0: + beam._states[j].loss = 1.0 + except NotImplementedError: + break + + def _set_scores(self, Beam beam, float[:, ::1] scores): + cdef float* c_scores = &scores[0, 0] + cdef int nr_state = min(scores.shape[0], beam.size) + cdef int nr_class = scores.shape[1] + for i in range(nr_state): + state = beam.at(i) + if not state.is_final(): + for j in range(nr_class): + beam.scores[i][j] = c_scores[i * nr_class + j] + self.moves.set_valid(beam.is_valid[i], state.c) + else: + for j in range(beam.nr_class): + beam.scores[i][j] = 0 + beam.costs[i][j] = 0 + + def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False): + for i in range(beam.size): + state = beam.at(i) + if not state.c.is_final(): + self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold) + if follow_gold: + for j in range(beam.nr_class): + if beam.costs[i][j] >= 1: + beam.is_valid[i][j] = 0 + + +def get_token_ids(states, int n_tokens): + cdef StateClass state + cdef np.ndarray ids = numpy.zeros((len(states), n_tokens), + dtype='int32', order='C') + c_ids = ids.data + for i, state in enumerate(states): + if not state.is_final(): + state.c.set_context_tokens(c_ids, n_tokens) + else: + ids[i] = -1 + c_ids += ids.shape[1] + return ids + +nr_update = 0 +def update_beam(TransitionSystem moves, int nr_feature, int max_steps, + states, tokvecs, golds, + state2vec, vec2scores, + int width, float density, + sgd=None, losses=None, drop=0.): + global nr_update + cdef MaxViolation violn + nr_update += 1 + pbeam = ParserBeam(moves, states, golds, + width=width, density=density) + gbeam = ParserBeam(moves, states, golds, + width=width, density=0.0) + cdef StateClass state + beam_maps = [] + backprops = [] + violns = [MaxViolation() for _ in range(len(states))] + for t in range(max_steps): + if pbeam.is_done and gbeam.is_done: + break + # The beam maps let us find the right row in the flattened scores + # arrays for each state. States are identified by (example id, history). + # We keep a different beam map for each step (since we'll have a flat + # scores array for each step). The beam map will let us take the per-state + # losses, and compute the gradient for each (step, state, class). + beam_maps.append({}) + # Gather all states from the two beams in a list. Some stats may occur + # in both beams. 
To figure out which beam each state belonged to, + # we keep two lists of indices, p_indices and g_indices + states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update) + if not states: + break + # Now that we have our flat list of states, feed them through the model + token_ids = get_token_ids(states, nr_feature) + vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop) + scores, bp_scores = vec2scores.begin_update(vectors, drop=drop) + + # Store the callbacks for the backward pass + backprops.append((token_ids, bp_vectors, bp_scores)) + + # Unpack the flat scores into lists for the two beams. The indices arrays + # tell us which example and state the scores-row refers to. + p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices] + g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices] + # Now advance the states in the beams. The gold beam is contrained to + # to follow only gold analyses. + pbeam.advance(p_scores) + gbeam.advance(g_scores, follow_gold=True) + # Track the "maximum violation", to use in the update. + for i, violn in enumerate(violns): + violn.check_crf(pbeam[i], gbeam[i]) + histories = [] + losses = [] + for violn in violns: + if violn.p_hist: + histories.append(violn.p_hist + violn.g_hist) + losses.append(violn.p_probs + violn.g_probs) + else: + histories.append([]) + losses.append([]) + states_d_scores = get_gradient(moves.n_moves, beam_maps, histories, losses) + return states_d_scores, backprops[:len(states_d_scores)] + + +def get_states(pbeams, gbeams, beam_map, nr_update): + seen = {} + states = [] + p_indices = [] + g_indices = [] + cdef Beam pbeam, gbeam + assert len(pbeams) == len(gbeams) + for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)): + p_indices.append([]) + g_indices.append([]) + for i in range(pbeam.size): + state = pbeam.at(i) + if not state.is_final(): + key = tuple([eg_id] + pbeam.histories[i]) + assert key not in seen, (key, seen) + seen[key] = len(states) + p_indices[-1].append(len(states)) + states.append(state) + beam_map.update(seen) + for i in range(gbeam.size): + state = gbeam.at(i) + if not state.is_final(): + key = tuple([eg_id] + gbeam.histories[i]) + if key in seen: + g_indices[-1].append(seen[key]) + else: + g_indices[-1].append(len(states)) + beam_map[key] = len(states) + states.append(state) + p_idx = [numpy.asarray(idx, dtype='i') for idx in p_indices] + g_idx = [numpy.asarray(idx, dtype='i') for idx in g_indices] + return states, p_idx, g_idx + + +def get_gradient(nr_class, beam_maps, histories, losses): + """ + The global model assigns a loss to each parse. The beam scores + are additive, so the same gradient is applied to each action + in the history. This gives the gradient of a single *action* + for a beam state -- so we have "the gradient of loss for taking + action i given history H." 
+ + Histories: Each hitory is a list of actions + Each candidate has a history + Each beam has multiple candidates + Each batch has multiple beams + So history is list of lists of lists of ints + """ + nr_step = len(beam_maps) + grads = [] + nr_step = 0 + for eg_id, hists in enumerate(histories): + for loss, hist in zip(losses[eg_id], hists): + if loss != 0.0 and not numpy.isnan(loss): + nr_step = max(nr_step, len(hist)) + for i in range(nr_step): + grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), dtype='f')) + assert len(histories) == len(losses) + for eg_id, hists in enumerate(histories): + for loss, hist in zip(losses[eg_id], hists): + if loss == 0.0 or numpy.isnan(loss): + continue + key = tuple([eg_id]) + # Adjust loss for length + avg_loss = loss / len(hist) + loss += avg_loss * (nr_step - len(hist)) + for j, clas in enumerate(hist): + i = beam_maps[j][key] + # In step j, at state i action clas + # resulted in loss + grads[j][i, clas] += loss + key = key + tuple([clas]) + return grads + + diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd index c06851978..3da9e5d4c 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/syntax/_state.pxd @@ -37,6 +37,7 @@ cdef cppclass StateC: this.shifted = calloc(length + (PADDING * 2), sizeof(bint)) this._sent = calloc(length + (PADDING * 2), sizeof(TokenC)) this._ents = calloc(length + (PADDING * 2), sizeof(Entity)) + this.offset = 0 cdef int i for i in range(length + (PADDING * 2)): this._ents[i].end = -1 @@ -73,7 +74,16 @@ cdef cppclass StateC: free(this.shifted - PADDING) void set_context_tokens(int* ids, int n) nogil: - if n == 13: + if n == 8: + ids[0] = this.B(0) + ids[1] = this.B(1) + ids[2] = this.S(0) + ids[3] = this.S(1) + ids[4] = this.H(this.S(0)) + ids[5] = this.L(this.B(0), 1) + ids[6] = this.L(this.S(0), 2) + ids[7] = this.R(this.S(0), 1) + elif n == 13: ids[0] = this.B(0) ids[1] = this.B(1) ids[2] = this.S(0) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 29e8de0aa..aab350d76 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -351,6 +351,20 @@ cdef class ArcEager(TransitionSystem): def __get__(self): return (SHIFT, REDUCE, LEFT, RIGHT, BREAK) + def is_gold_parse(self, StateClass state, GoldParse gold): + predicted = set() + truth = set() + for i in range(gold.length): + if gold.cand_to_gold[i] is None: + continue + if state.safe_get(i).dep: + predicted.add((i, state.H(i), self.strings[state.safe_get(i).dep])) + else: + predicted.add((i, state.H(i), 'ROOT')) + id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]] + truth.add((id_, head, dep)) + return truth == predicted + def has_gold(self, GoldParse gold, start=0, end=None): end = end or len(gold.heads) if all([tag is None for tag in gold.heads[start:end]]): @@ -385,6 +399,7 @@ cdef class ArcEager(TransitionSystem): for i in range(self.n_moves): if self.c[i].move == move and self.c[i].label == label: return self.c[i] + return Transition(clas=0, move=MISSING, label=0) def move_name(self, int move, attr_t label): label_str = self.strings[label] diff --git a/spacy/syntax/beam_parser.pyx b/spacy/syntax/beam_parser.pyx index e96e28fcf..68e9f27af 100644 --- a/spacy/syntax/beam_parser.pyx +++ b/spacy/syntax/beam_parser.pyx @@ -107,7 +107,7 @@ cdef class BeamParser(Parser): # The non-monotonic oracle makes it difficult to ensure final costs are # correct. 
Therefore do final correction for i in range(pred.size): - if is_gold(pred.at(i), gold_parse, self.moves.strings): + if self.moves.is_gold_parse(pred.at(i), gold_parse): pred._states[i].loss = 0.0 elif pred._states[i].loss == 0.0: pred._states[i].loss = 1.0 @@ -213,7 +213,7 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio if not pred._states[i].is_done or pred._states[i].loss == 0: continue state = pred.at(i) - if is_gold(state, gold_parse, moves.strings) == True: + if moves.is_gold_parse(state, gold_parse) == True: for dep in gold_parse.orig_annot: print(dep[1], dep[3], dep[4]) print("Cost", pred._states[i].loss) @@ -227,7 +227,7 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio if not gold._states[i].is_done: continue state = gold.at(i) - if is_gold(state, gold_parse, moves.strings) == False: + if moves.is_gold(state, gold_parse) == False: print("Truth") for dep in gold_parse.orig_annot: print(dep[1], dep[3], dep[4]) @@ -237,16 +237,3 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio raise Exception("Gold parse is not gold-standard") -def is_gold(StateClass state, GoldParse gold, StringStore strings): - predicted = set() - truth = set() - for i in range(gold.length): - if gold.cand_to_gold[i] is None: - continue - if state.safe_get(i).dep: - predicted.add((i, state.H(i), strings[state.safe_get(i).dep])) - else: - predicted.add((i, state.H(i), 'ROOT')) - id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]] - truth.add((id_, head, dep)) - return truth == predicted diff --git a/spacy/syntax/nn_parser.pxd b/spacy/syntax/nn_parser.pxd index 524718965..7ff4b9f9f 100644 --- a/spacy/syntax/nn_parser.pxd +++ b/spacy/syntax/nn_parser.pxd @@ -14,8 +14,4 @@ cdef class Parser: cdef readonly TransitionSystem moves cdef readonly object cfg - cdef void _parse_step(self, StateC* state, - const float* feat_weights, - int nr_class, int nr_feat, int nr_piece) nogil - #cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 0b39e2216..7412ebeee 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -37,14 +37,18 @@ from preshed.maps cimport MapStruct from preshed.maps cimport map_get from thinc.api import layerize, chain, noop, clone -from thinc.neural import Model, Affine, ELU, ReLu, Maxout +from thinc.neural import Model, Affine, ReLu, Maxout +from thinc.neural._classes.batchnorm import BatchNorm as BN +from thinc.neural._classes.selu import SELU +from thinc.neural._classes.layernorm import LayerNorm from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.util import get_array_module from .. import util from ..util import get_async, get_cuda_stream from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts -from .._ml import Tok2Vec, doc2feats, rebatch +from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune +from .._ml import Residual, drop_layer from ..compat import json_dumps from . import _parse_features @@ -59,8 +63,10 @@ from ..structs cimport TokenC from ..tokens.doc cimport Doc from ..strings cimport StringStore from ..gold cimport GoldParse -from ..attrs cimport TAG, DEP +from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG +from . import _beam_utils +USE_FINE_TUNE = True def get_templates(*args, **kwargs): return [] @@ -232,11 +238,14 @@ cdef class Parser: Base class of the DependencyParser and EntityRecognizer. 
""" @classmethod - def Model(cls, nr_class, token_vector_width=128, hidden_width=128, depth=1, **cfg): + def Model(cls, nr_class, token_vector_width=128, hidden_width=300, depth=1, **cfg): depth = util.env_opt('parser_hidden_depth', depth) token_vector_width = util.env_opt('token_vector_width', token_vector_width) hidden_width = util.env_opt('hidden_width', hidden_width) parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2) + embed_size = util.env_opt('embed_size', 4000) + tensors = fine_tune(Tok2Vec(token_vector_width, embed_size, + preprocess=doc2feats())) if parser_maxout_pieces == 1: lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class, nF=cls.nr_feature, @@ -248,15 +257,10 @@ cdef class Parser: nI=token_vector_width) with Model.use_device('cpu'): - if depth == 0: - upper = chain() - upper.is_noop = True - else: - upper = chain( - clone(Maxout(hidden_width), (depth-1)), - zero_init(Affine(nr_class, drop_factor=0.0)) - ) - upper.is_noop = False + upper = chain( + clone(Maxout(hidden_width), (depth-1)), + zero_init(Affine(nr_class, drop_factor=0.0)) + ) # TODO: This is an unfortunate hack atm! # Used to set input dimensions in network. lower.begin_training(lower.ops.allocate((500, token_vector_width))) @@ -268,7 +272,7 @@ cdef class Parser: 'hidden_width': hidden_width, 'maxout_pieces': parser_maxout_pieces } - return (lower, upper), cfg + return (tensors, lower, upper), cfg def __init__(self, Vocab vocab, moves=True, model=True, **cfg): """ @@ -294,6 +298,10 @@ cdef class Parser: self.moves = self.TransitionSystem(self.vocab.strings, {}) else: self.moves = moves + if 'beam_width' not in cfg: + cfg['beam_width'] = util.env_opt('beam_width', 1) + if 'beam_density' not in cfg: + cfg['beam_density'] = util.env_opt('beam_density', 0.0) self.cfg = cfg if 'actions' in self.cfg: for action, labels in self.cfg.get('actions', {}).items(): @@ -316,7 +324,7 @@ cdef class Parser: if beam_width is None: beam_width = self.cfg.get('beam_width', 1) if beam_density is None: - beam_density = self.cfg.get('beam_density', 0.001) + beam_density = self.cfg.get('beam_density', 0.0) cdef Beam beam if beam_width == 1: states = self.parse_batch([doc], [doc.tensor]) @@ -332,7 +340,7 @@ cdef class Parser: return output def pipe(self, docs, int batch_size=1000, int n_threads=2, - beam_width=1, beam_density=0.001): + beam_width=None, beam_density=None): """ Process a stream of documents. @@ -344,17 +352,23 @@ cdef class Parser: The number of threads with which to work on the buffer in parallel. Yields (Doc): Documents, in order. 
""" - cdef StateClass parse_state + if beam_width is None: + beam_width = self.cfg.get('beam_width', 1) + if beam_density is None: + beam_density = self.cfg.get('beam_density', 0.0) cdef Doc doc - queue = [] + cdef Beam beam for docs in cytoolz.partition_all(batch_size, docs): docs = list(docs) - tokvecs = [d.tensor for d in docs] + tokvecs = [doc.tensor for doc in docs] if beam_width == 1: parse_states = self.parse_batch(docs, tokvecs) else: - parse_states = self.beam_parse(docs, tokvecs, - beam_width=beam_width, beam_density=beam_density) + beams = self.beam_parse(docs, tokvecs, + beam_width=beam_width, beam_density=beam_density) + parse_states = [] + for beam in beams: + parse_states.append(beam.at(0)) self.set_annotations(docs, parse_states) yield from docs @@ -369,8 +383,12 @@ cdef class Parser: int nr_class, nr_feat, nr_piece, nr_dim, nr_state if isinstance(docs, Doc): docs = [docs] + if isinstance(tokvecses, np.ndarray): + tokvecses = [tokvecses] tokvecs = self.model[0].ops.flatten(tokvecses) + if USE_FINE_TUNE: + tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) nr_state = len(docs) nr_class = self.moves.n_moves @@ -394,27 +412,20 @@ cdef class Parser: cdef np.ndarray scores c_token_ids = token_ids.data c_is_valid = is_valid.data - cdef int has_hidden = not getattr(vec2scores, 'is_noop', False) while not next_step.empty(): - if not has_hidden: - for i in cython.parallel.prange( - next_step.size(), num_threads=6, nogil=True): - self._parse_step(next_step[i], - feat_weights, nr_class, nr_feat, nr_piece) - else: - for i in range(next_step.size()): - st = next_step[i] - st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat) - self.moves.set_valid(&c_is_valid[i*nr_class], st) - vectors = state2vec(token_ids[:next_step.size()]) - scores = vec2scores(vectors) - c_scores = scores.data - for i in range(next_step.size()): - st = next_step[i] - guess = arg_max_if_valid( - &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class) - action = self.moves.c[guess] - action.do(st, action.label) + for i in range(next_step.size()): + st = next_step[i] + st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat) + self.moves.set_valid(&c_is_valid[i*nr_class], st) + vectors = state2vec(token_ids[:next_step.size()]) + scores = vec2scores(vectors) + c_scores = scores.data + for i in range(next_step.size()): + st = next_step[i] + guess = arg_max_if_valid( + &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class) + action = self.moves.c[guess] + action.do(st, action.label) this_step, next_step = next_step, this_step next_step.clear() for st in this_step: @@ -422,18 +433,22 @@ cdef class Parser: next_step.push_back(st) return states - def beam_parse(self, docs, tokvecses, int beam_width=8, float beam_density=0.001): + def beam_parse(self, docs, tokvecses, int beam_width=3, float beam_density=0.001): cdef Beam beam cdef np.ndarray scores cdef Doc doc cdef int nr_class = self.moves.n_moves cdef StateClass stcls, output tokvecs = self.model[0].ops.flatten(tokvecses) + if USE_FINE_TUNE: + tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) cuda_stream = get_cuda_stream() state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs, cuda_stream, 0.0) beams = [] cdef int offset = 0 + cdef int j = 0 + cdef int k for doc in docs: beam = Beam(nr_class, beam_width, min_density=beam_density) beam.initialize(self.moves.init_beam_state, doc.length, doc.c) @@ -446,44 +461,32 @@ cdef class Parser: states = [] for i in range(beam.size): stcls = beam.at(i) - states.append(stcls) + 
# This way we avoid having to score finalized states + # We do have to take care to keep indexes aligned, though + if not stcls.is_final(): + states.append(stcls) token_ids = self.get_token_ids(states) vectors = state2vec(token_ids) scores = vec2scores(vectors) + j = 0 + c_scores = scores.data for i in range(beam.size): stcls = beam.at(i) if not stcls.is_final(): self.moves.set_valid(beam.is_valid[i], stcls.c) - for j in range(nr_class): - beam.scores[i][j] = scores[i, j] + for k in range(nr_class): + beam.scores[i][k] = c_scores[j * scores.shape[1] + k] + j += 1 beam.advance(_transition_state, _hash_state, self.moves.c) beam.check_done(_check_final_state, NULL) beams.append(beam) return beams - cdef void _parse_step(self, StateC* state, - const float* feat_weights, - int nr_class, int nr_feat, int nr_piece) nogil: - '''This only works with no hidden layers -- fast but inaccurate''' - #for i in cython.parallel.prange(next_step.size(), num_threads=4, nogil=True): - # self._parse_step(next_step[i], feat_weights, nr_class, nr_feat) - token_ids = calloc(nr_feat, sizeof(int)) - scores = calloc(nr_class * nr_piece, sizeof(float)) - is_valid = calloc(nr_class, sizeof(int)) - - state.set_context_tokens(token_ids, nr_feat) - sum_state_features(scores, - feat_weights, token_ids, 1, nr_feat, nr_class * nr_piece) - self.moves.set_valid(is_valid, state) - guess = arg_maxout_if_valid(scores, is_valid, nr_class, nr_piece) - action = self.moves.c[guess] - action.do(state, action.label) - - free(is_valid) - free(scores) - free(token_ids) - def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): + if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5: + return self.update_beam(docs_tokvecs, golds, + self.cfg['beam_width'], self.cfg['beam_density'], + drop=drop, sgd=sgd, losses=losses) if losses is not None and self.name not in losses: losses[self.name] = 0. docs, tokvec_lists = docs_tokvecs @@ -491,6 +494,10 @@ cdef class Parser: if isinstance(docs, Doc) and isinstance(golds, GoldParse): docs = [docs] golds = [golds] + if USE_FINE_TUNE: + my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) + my_tokvecs = self.model[0].ops.flatten(my_tokvecs) + tokvecs += my_tokvecs cuda_stream = get_cuda_stream() @@ -517,13 +524,14 @@ cdef class Parser: scores, bp_scores = vec2scores.begin_update(vector, drop=drop) d_scores = self.get_batch_loss(states, golds, scores) - d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd) + d_scores /= len(docs) + d_vector = bp_scores(d_scores, sgd=sgd) if drop != 0: d_vector *= mask if isinstance(self.model[0].ops, CupyOps) \ and not isinstance(token_ids, state2vec.ops.xp.ndarray): - # Move token_ids and d_vector to CPU, asynchronously + # Move token_ids and d_vector to GPU, asynchronously backprops.append(( get_async(cuda_stream, token_ids), get_async(cuda_stream, d_vector), @@ -540,7 +548,62 @@ cdef class Parser: break self._make_updates(d_tokvecs, backprops, sgd, cuda_stream) - return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) + d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) + if USE_FINE_TUNE: + bp_my_tokvecs(d_tokvecs, sgd=sgd) + return d_tokvecs + + def update_beam(self, docs_tokvecs, golds, width=None, density=None, + drop=0., sgd=None, losses=None): + if width is None: + width = self.cfg.get('beam_width', 2) + if density is None: + density = self.cfg.get('beam_density', 0.0) + if losses is not None and self.name not in losses: + losses[self.name] = 0. 
+ docs, tokvecs = docs_tokvecs + lengths = [len(d) for d in docs] + assert min(lengths) >= 1 + tokvecs = self.model[0].ops.flatten(tokvecs) + if USE_FINE_TUNE: + my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) + my_tokvecs = self.model[0].ops.flatten(my_tokvecs) + tokvecs += my_tokvecs + + states = self.moves.init_batch(docs) + for gold in golds: + self.moves.preprocess_gold(gold) + + cuda_stream = get_cuda_stream() + state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0) + + states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500, + states, tokvecs, golds, + state2vec, vec2scores, + width, density, + sgd=sgd, drop=drop, losses=losses) + backprop_lower = [] + cdef float batch_size = len(docs) + for i, d_scores in enumerate(states_d_scores): + d_scores /= batch_size + if losses is not None: + losses[self.name] += (d_scores**2).sum() + ids, bp_vectors, bp_scores = backprops[i] + d_vector = bp_scores(d_scores, sgd=sgd) + if isinstance(self.model[0].ops, CupyOps) \ + and not isinstance(ids, state2vec.ops.xp.ndarray): + backprop_lower.append(( + get_async(cuda_stream, ids), + get_async(cuda_stream, d_vector), + bp_vectors)) + else: + backprop_lower.append((ids, d_vector, bp_vectors)) + d_tokvecs = self.model[0].ops.allocate(tokvecs.shape) + self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream) + d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths) + if USE_FINE_TUNE: + bp_my_tokvecs(d_tokvecs, sgd=sgd) + return d_tokvecs def _init_gold_batch(self, whole_docs, whole_golds): """Make a square batch, of length equal to the shortest doc. A long @@ -585,14 +648,10 @@ cdef class Parser: xp = get_array_module(d_tokvecs) for ids, d_vector, bp_vector in backprops: d_state_features = bp_vector(d_vector, sgd=sgd) - active_feats = ids * (ids >= 0) - active_feats = active_feats.reshape((ids.shape[0], ids.shape[1], 1)) - if hasattr(xp, 'scatter_add'): - xp.scatter_add(d_tokvecs, - ids, d_state_features * active_feats) - else: - xp.add.at(d_tokvecs, - ids, d_state_features * active_feats) + mask = ids >= 0 + d_state_features *= mask.reshape(ids.shape + (1,)) + self.model[0].ops.scatter_add(d_tokvecs, ids * mask, + d_state_features) @property def move_names(self): @@ -603,12 +662,12 @@ cdef class Parser: return names def get_batch_model(self, batch_size, tokvecs, stream, dropout): - lower, upper = self.model + _, lower, upper = self.model state2vec = precompute_hiddens(batch_size, tokvecs, lower, stream, drop=dropout) return state2vec, upper - nr_feature = 13 + nr_feature = 8 def get_token_ids(self, states): cdef StateClass state @@ -693,10 +752,12 @@ cdef class Parser: def to_disk(self, path, **exclude): serializers = { - 'lower_model': lambda p: p.open('wb').write( + 'tok2vec_model': lambda p: p.open('wb').write( self.model[0].to_bytes()), - 'upper_model': lambda p: p.open('wb').write( + 'lower_model': lambda p: p.open('wb').write( self.model[1].to_bytes()), + 'upper_model': lambda p: p.open('wb').write( + self.model[2].to_bytes()), 'vocab': lambda p: self.vocab.to_disk(p), 'moves': lambda p: self.moves.to_disk(p, strings=False), 'cfg': lambda p: p.open('w').write(json_dumps(self.cfg)) @@ -717,24 +778,29 @@ cdef class Parser: self.model, cfg = self.Model(**self.cfg) else: cfg = {} - with (path / 'lower_model').open('rb') as file_: + with (path / 'tok2vec_model').open('rb') as file_: bytes_data = file_.read() self.model[0].from_bytes(bytes_data) - with (path / 'upper_model').open('rb') as file_: + with 
(path / 'lower_model').open('rb') as file_: bytes_data = file_.read() self.model[1].from_bytes(bytes_data) + with (path / 'upper_model').open('rb') as file_: + bytes_data = file_.read() + self.model[2].from_bytes(bytes_data) self.cfg.update(cfg) return self def to_bytes(self, **exclude): serializers = OrderedDict(( - ('lower_model', lambda: self.model[0].to_bytes()), - ('upper_model', lambda: self.model[1].to_bytes()), + ('tok2vec_model', lambda: self.model[0].to_bytes()), + ('lower_model', lambda: self.model[1].to_bytes()), + ('upper_model', lambda: self.model[2].to_bytes()), ('vocab', lambda: self.vocab.to_bytes()), ('moves', lambda: self.moves.to_bytes(strings=False)), ('cfg', lambda: ujson.dumps(self.cfg)) )) if 'model' in exclude: + exclude['tok2vec_model'] = True exclude['lower_model'] = True exclude['upper_model'] = True exclude.pop('model') @@ -745,6 +811,7 @@ cdef class Parser: ('vocab', lambda b: self.vocab.from_bytes(b)), ('moves', lambda b: self.moves.from_bytes(b, strings=False)), ('cfg', lambda b: self.cfg.update(ujson.loads(b))), + ('tok2vec_model', lambda b: None), ('lower_model', lambda b: None), ('upper_model', lambda b: None) )) @@ -754,10 +821,12 @@ cdef class Parser: self.model, cfg = self.Model(self.moves.n_moves) else: cfg = {} + if 'tok2vec_model' in msg: + self.model[0].from_bytes(msg['tok2vec_model']) if 'lower_model' in msg: - self.model[0].from_bytes(msg['lower_model']) + self.model[1].from_bytes(msg['lower_model']) if 'upper_model' in msg: - self.model[1].from_bytes(msg['upper_model']) + self.model[2].from_bytes(msg['upper_model']) self.cfg.update(cfg) return self diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 27b375bba..9cf82e0c7 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -99,6 +99,9 @@ cdef class TransitionSystem: def preprocess_gold(self, GoldParse gold): raise NotImplementedError + def is_gold_parse(self, StateClass state, GoldParse gold): + raise NotImplementedError + cdef Transition lookup_transition(self, object name) except *: raise NotImplementedError @@ -107,6 +110,8 @@ cdef class TransitionSystem: def is_valid(self, StateClass stcls, move_name): action = self.lookup_transition(move_name) + if action.move == 0: + return False return action.is_valid(stcls.c, action.label) cdef int set_valid(self, int* is_valid, const StateC* st) nogil: diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index 42b55745f..30a6367c8 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -78,3 +78,16 @@ def test_predict_doc_beam(parser, tok2vec, model, doc): parser(doc, beam_width=32, beam_density=0.001) for word in doc: print(word.text, word.head, word.dep_) + + +def test_update_doc_beam(parser, tok2vec, model, doc, gold): + parser.model = model + tokvecs, bp_tokvecs = tok2vec.begin_update([doc]) + d_tokvecs = parser.update_beam(([doc], tokvecs), [gold]) + assert d_tokvecs[0].shape == tokvecs[0].shape + def optimize(weights, gradient, key=None): + weights -= 0.001 * gradient + bp_tokvecs(d_tokvecs, sgd=optimize) + assert d_tokvecs[0].sum() == 0. 
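
The _make_updates hunk above replaces the xp.scatter_add / xp.add.at branch with a single masked ops.scatter_add call. Below is a minimal NumPy sketch of that accumulation, with numpy.add.at standing in for the ops method; the helper name and toy shapes are purely illustrative, not the parser's internals.

    import numpy

    def accumulate_gradients(d_tokvecs, ids, d_state_features):
        """Scatter per-state feature gradients back onto the token vectors.

        ids holds one token index per (state, feature) slot; -1 marks padding.
        """
        mask = ids >= 0
        # Zero the gradient of padded slots, then clamp their index to 0 so the
        # scatter stays in-bounds; the zeroed gradient makes those slots a no-op.
        d_state_features = d_state_features * mask[:, :, None]
        numpy.add.at(d_tokvecs, ids * mask, d_state_features)
        return d_tokvecs

    d_tokvecs = numpy.zeros((5, 4), dtype='f')   # 5 tokens, vector width 4
    ids = numpy.array([[0, 2, -1], [1, 1, 4]])   # 2 states x 3 features
    d_feats = numpy.ones((2, 3, 4), dtype='f')
    accumulate_gradients(d_tokvecs, ids, d_feats)
    assert d_tokvecs[1].sum() == 8.              # token 1 was used twice
    assert d_tokvecs[3].sum() == 0.              # token 3 never appears

Padded slots are clamped to index 0, but their gradient is zeroed first, so they contribute nothing; this is why the real code multiplies both ids and d_state_features by the mask.
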
+ + diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py new file mode 100644 index 000000000..ab8bf012b --- /dev/null +++ b/spacy/tests/parser/test_nn_beam.py @@ -0,0 +1,87 @@ +from __future__ import unicode_literals +import pytest +import numpy +from thinc.api import layerize + +from ...vocab import Vocab +from ...syntax.arc_eager import ArcEager +from ...tokens import Doc +from ...gold import GoldParse +from ...syntax._beam_utils import ParserBeam, update_beam +from ...syntax.stateclass import StateClass + + +@pytest.fixture +def vocab(): + return Vocab() + +@pytest.fixture +def moves(vocab): + aeager = ArcEager(vocab.strings, {}) + aeager.add_action(2, 'nsubj') + aeager.add_action(3, 'dobj') + aeager.add_action(2, 'aux') + return aeager + + +@pytest.fixture +def docs(vocab): + return [Doc(vocab, words=['Rats', 'bite', 'things'])] + +@pytest.fixture +def states(docs): + return [StateClass(doc) for doc in docs] + +@pytest.fixture +def tokvecs(docs, vector_size): + output = [] + for doc in docs: + vec = numpy.random.uniform(-0.1, 0.1, (len(doc), vector_size)) + output.append(numpy.asarray(vec)) + return output + + +@pytest.fixture +def golds(docs): + return [GoldParse(doc) for doc in docs] + + +@pytest.fixture +def batch_size(docs): + return len(docs) + + +@pytest.fixture +def beam_width(): + return 4 + + +@pytest.fixture +def vector_size(): + return 6 + + +@pytest.fixture +def beam(moves, states, golds, beam_width): + return ParserBeam(moves, states, golds, width=beam_width, density=0.0) + +@pytest.fixture +def scores(moves, batch_size, beam_width): + return [ + numpy.asarray( + numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)), + dtype='f') + for _ in range(batch_size)] + + +def test_create_beam(beam): + pass + + +def test_beam_advance(beam, scores): + beam.advance(scores) + + +def test_beam_advance_too_few_scores(beam, scores): + with pytest.raises(IndexError): + beam.advance(scores[:-1]) diff --git a/spacy/tests/regression/test_issue1257.py b/spacy/tests/regression/test_issue1257.py new file mode 100644 index 000000000..de6b014a6 --- /dev/null +++ b/spacy/tests/regression/test_issue1257.py @@ -0,0 +1,12 @@ +'''Test tokens compare correctly''' +from __future__ import unicode_literals + +from ..util import get_doc +from ...vocab import Vocab + + +def test_issue1257(): + doc1 = get_doc(Vocab(), ['a', 'b', 'c']) + doc2 = get_doc(Vocab(), ['a', 'c', 'e']) + assert doc1[0] != doc2[0] + assert not doc1[0] == doc2[0] diff --git a/spacy/tests/serialize/test_serialize_tagger.py b/spacy/tests/serialize/test_serialize_tagger.py index fa9a776bb..3154687c3 100644 --- a/spacy/tests/serialize/test_serialize_tagger.py +++ b/spacy/tests/serialize/test_serialize_tagger.py @@ -11,8 +11,8 @@ import pytest def taggers(en_vocab): tagger1 = Tagger(en_vocab) tagger2 = Tagger(en_vocab) - tagger1.model = tagger1.Model(None, None) - tagger2.model = tagger2.Model(None, None) + tagger1.model = tagger1.Model(8, 8) + tagger2.model = tagger1.model return (tagger1, tagger2) @@ -20,7 +20,6 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers): tagger1, tagger2 = taggers tagger1_b = tagger1.to_bytes() tagger2_b = tagger2.to_bytes() - assert tagger1_b == tagger2_b tagger1 = tagger1.from_bytes(tagger1_b) assert tagger1.to_bytes() == tagger1_b new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b) diff --git a/spacy/tests/spans/test_span.py b/spacy/tests/spans/test_span.py index d22fa52ae..7ed9333b8 100644 --- a/spacy/tests/spans/test_span.py +++ 
b/spacy/tests/spans/test_span.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from ..util import get_doc +from ...attrs import ORTH, LENGTH import pytest @@ -89,3 +90,19 @@ def test_spans_are_hashable(en_tokenizer): span3 = tokens[0:2] assert hash(span3) == hash(span1) + +def test_spans_by_character(doc): + span1 = doc[1:-2] + span2 = doc.char_span(span1.start_char, span1.end_char, label='GPE') + assert span1.start_char == span2.start_char + assert span1.end_char == span2.end_char + assert span2.label_ == 'GPE' + + +def test_span_to_array(doc): + span = doc[1:-2] + arr = span.to_array([ORTH, LENGTH]) + assert arr.shape == (len(span), 2) + assert arr[0, 0] == span[0].orth + assert arr[0, 1] == len(span[0]) + diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 56aeb5223..2f474a926 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -79,9 +79,9 @@ def add_vecs_to_vocab(vocab, vectors): """Add list of vector tuples to given vocab. All vectors need to have the same length. Format: [("text", [1, 2, 3])]""" length = len(vectors[0][1]) - vocab.resize_vectors(length) + vocab.clear_vectors(length) for word, vec in vectors: - vocab[word].vector = vec + vocab.set_vector(word, vec) return vocab diff --git a/spacy/tests/vectors/test_similarity.py b/spacy/tests/vectors/test_similarity.py index 1260728be..f9c18adca 100644 --- a/spacy/tests/vectors/test_similarity.py +++ b/spacy/tests/vectors/test_similarity.py @@ -14,10 +14,9 @@ def vectors(): @pytest.fixture() def vocab(en_vocab, vectors): - #return add_vecs_to_vocab(en_vocab, vectors) - return None + add_vecs_to_vocab(en_vocab, vectors) + return en_vocab -@pytest.mark.xfail def test_vectors_similarity_LL(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors lex1 = vocab[word1] @@ -31,7 +30,6 @@ def test_vectors_similarity_LL(vocab, vectors): assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1)) -@pytest.mark.xfail def test_vectors_similarity_TT(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors doc = get_doc(vocab, words=[word1, word2]) @@ -44,21 +42,18 @@ def test_vectors_similarity_TT(vocab, vectors): assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1])) -@pytest.mark.xfail def test_vectors_similarity_TD(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors doc = get_doc(vocab, words=[word1, word2]) assert doc.similarity(doc[0]) == doc[0].similarity(doc) -@pytest.mark.xfail def test_vectors_similarity_DS(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors doc = get_doc(vocab, words=[word1, word2]) assert doc.similarity(doc[:2]) == doc[:2].similarity(doc) -@pytest.mark.xfail def test_vectors_similarity_TS(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors doc = get_doc(vocab, words=[word1, word2]) diff --git a/spacy/tests/vectors/test_vectors.py b/spacy/tests/vectors/test_vectors.py index c42c3a4ce..798871edd 100644 --- a/spacy/tests/vectors/test_vectors.py +++ b/spacy/tests/vectors/test_vectors.py @@ -2,6 +2,8 @@ from __future__ import unicode_literals from ...vectors import Vectors +from ...tokenizer import Tokenizer +from ..util import add_vecs_to_vocab, get_doc import numpy import pytest @@ -11,22 +13,42 @@ import pytest def strings(): return ["apple", "orange"] +@pytest.fixture +def vectors(): + return [ + ("apple", [1, 2, 3]), + ("orange", [-1, -2, -3]), + ('and', [-1, -1, -1]), + ('juice', [5, 5, 10]), + ('pie', [7, 6.3, 8.9])] + + @pytest.fixture def data(): return numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype='f') +@pytest.fixture() +def 
vocab(en_vocab, vectors): + add_vecs_to_vocab(en_vocab, vectors) + return en_vocab + + def test_init_vectors_with_data(strings, data): v = Vectors(strings, data) assert v.shape == data.shape def test_init_vectors_with_width(strings): v = Vectors(strings, 3) + for string in strings: + v.add(string) assert v.shape == (len(strings), 3) def test_get_vector(strings, data): v = Vectors(strings, data) + for string in strings: + v.add(string) assert list(v[strings[0]]) == list(data[0]) assert list(v[strings[0]]) != list(data[1]) assert list(v[strings[1]]) != list(data[0]) @@ -35,6 +57,8 @@ def test_get_vector(strings, data): def test_set_vector(strings, data): orig = data.copy() v = Vectors(strings, data) + for string in strings: + v.add(string) assert list(v[strings[0]]) == list(orig[0]) assert list(v[strings[0]]) != list(orig[1]) v[strings[0]] = data[1] @@ -42,125 +66,111 @@ def test_set_vector(strings, data): assert list(v[strings[0]]) != list(orig[0]) -# -#@pytest.fixture() -#def tokenizer_v(vocab): -# return Tokenizer(vocab, {}, None, None, None) -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text', ["apple and orange"]) -#def test_vectors_token_vector(tokenizer_v, vectors, text): -# doc = tokenizer_v(text) -# assert vectors[0] == (doc[0].text, list(doc[0].vector)) -# assert vectors[1] == (doc[2].text, list(doc[2].vector)) -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text', ["apple", "orange"]) -#def test_vectors_lexeme_vector(vocab, text): -# lex = vocab[text] -# assert list(lex.vector) -# assert lex.vector_norm -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text', [["apple", "and", "orange"]]) -#def test_vectors_doc_vector(vocab, text): -# doc = get_doc(vocab, text) -# assert list(doc.vector) -# assert doc.vector_norm -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text', [["apple", "and", "orange"]]) -#def test_vectors_span_vector(vocab, text): -# span = get_doc(vocab, text)[0:2] -# assert list(span.vector) -# assert span.vector_norm -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text', ["apple orange"]) -#def test_vectors_token_token_similarity(tokenizer_v, text): -# doc = tokenizer_v(text) -# assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0]) -# assert 0.0 < doc[0].similarity(doc[1]) < 1.0 -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text1,text2', [("apple", "orange")]) -#def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2): -# token = tokenizer_v(text1) -# lex = vocab[text2] -# assert token.similarity(lex) == lex.similarity(token) -# assert 0.0 < token.similarity(lex) < 1.0 -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) -#def test_vectors_token_span_similarity(vocab, text): -# doc = get_doc(vocab, text) -# assert doc[0].similarity(doc[1:3]) == doc[1:3].similarity(doc[0]) -# assert 0.0 < doc[0].similarity(doc[1:3]) < 1.0 -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) -#def test_vectors_token_doc_similarity(vocab, text): -# doc = get_doc(vocab, text) -# assert doc[0].similarity(doc) == doc.similarity(doc[0]) -# assert 0.0 < doc[0].similarity(doc) < 1.0 -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) -#def test_vectors_lexeme_span_similarity(vocab, text): -# doc = get_doc(vocab, text) -# lex = vocab[text[0]] -# assert lex.similarity(doc[1:3]) == doc[1:3].similarity(lex) -# assert 0.0 < doc.similarity(doc[1:3]) < 1.0 -# -# -#@pytest.mark.xfail 
-#@pytest.mark.parametrize('text1,text2', [("apple", "orange")]) -#def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2): -# lex1 = vocab[text1] -# lex2 = vocab[text2] -# assert lex1.similarity(lex2) == lex2.similarity(lex1) -# assert 0.0 < lex1.similarity(lex2) < 1.0 -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) -#def test_vectors_lexeme_doc_similarity(vocab, text): -# doc = get_doc(vocab, text) -# lex = vocab[text[0]] -# assert lex.similarity(doc) == doc.similarity(lex) -# assert 0.0 < lex.similarity(doc) < 1.0 -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) -#def test_vectors_span_span_similarity(vocab, text): -# doc = get_doc(vocab, text) -# assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2]) -# assert 0.0 < doc[0:2].similarity(doc[1:3]) < 1.0 -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) -#def test_vectors_span_doc_similarity(vocab, text): -# doc = get_doc(vocab, text) -# assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2]) -# assert 0.0 < doc[0:2].similarity(doc) < 1.0 -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text1,text2', [ -# (["apple", "and", "apple", "pie"], ["orange", "juice"])]) -#def test_vectors_doc_doc_similarity(vocab, text1, text2): -# doc1 = get_doc(vocab, text1) -# doc2 = get_doc(vocab, text2) -# assert doc1.similarity(doc2) == doc2.similarity(doc1) -# assert 0.0 < doc1.similarity(doc2) < 1.0 + +@pytest.fixture() +def tokenizer_v(vocab): + return Tokenizer(vocab, {}, None, None, None) + + +@pytest.mark.parametrize('text', ["apple and orange"]) +def test_vectors_token_vector(tokenizer_v, vectors, text): + doc = tokenizer_v(text) + assert vectors[0] == (doc[0].text, list(doc[0].vector)) + assert vectors[1] == (doc[2].text, list(doc[2].vector)) + + +@pytest.mark.parametrize('text', ["apple", "orange"]) +def test_vectors_lexeme_vector(vocab, text): + lex = vocab[text] + assert list(lex.vector) + assert lex.vector_norm + + +@pytest.mark.parametrize('text', [["apple", "and", "orange"]]) +def test_vectors_doc_vector(vocab, text): + doc = get_doc(vocab, text) + assert list(doc.vector) + assert doc.vector_norm + + +@pytest.mark.parametrize('text', [["apple", "and", "orange"]]) +def test_vectors_span_vector(vocab, text): + span = get_doc(vocab, text)[0:2] + assert list(span.vector) + assert span.vector_norm + + +@pytest.mark.parametrize('text', ["apple orange"]) +def test_vectors_token_token_similarity(tokenizer_v, text): + doc = tokenizer_v(text) + assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0]) + assert -1. < doc[0].similarity(doc[1]) < 1.0 + + +@pytest.mark.parametrize('text1,text2', [("apple", "orange")]) +def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2): + token = tokenizer_v(text1) + lex = vocab[text2] + assert token.similarity(lex) == lex.similarity(token) + assert -1. < token.similarity(lex) < 1.0 + + +@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) +def test_vectors_token_span_similarity(vocab, text): + doc = get_doc(vocab, text) + assert doc[0].similarity(doc[1:3]) == doc[1:3].similarity(doc[0]) + assert -1. < doc[0].similarity(doc[1:3]) < 1.0 + + +@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) +def test_vectors_token_doc_similarity(vocab, text): + doc = get_doc(vocab, text) + assert doc[0].similarity(doc) == doc.similarity(doc[0]) + assert -1. 
< doc[0].similarity(doc) < 1.0 + + +@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) +def test_vectors_lexeme_span_similarity(vocab, text): + doc = get_doc(vocab, text) + lex = vocab[text[0]] + assert lex.similarity(doc[1:3]) == doc[1:3].similarity(lex) + assert -1. < doc.similarity(doc[1:3]) < 1.0 + + +@pytest.mark.parametrize('text1,text2', [("apple", "orange")]) +def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2): + lex1 = vocab[text1] + lex2 = vocab[text2] + assert lex1.similarity(lex2) == lex2.similarity(lex1) + assert -1. < lex1.similarity(lex2) < 1.0 + + +@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) +def test_vectors_lexeme_doc_similarity(vocab, text): + doc = get_doc(vocab, text) + lex = vocab[text[0]] + assert lex.similarity(doc) == doc.similarity(lex) + assert -1. < lex.similarity(doc) < 1.0 + + +@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) +def test_vectors_span_span_similarity(vocab, text): + doc = get_doc(vocab, text) + assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2]) + assert -1. < doc[0:2].similarity(doc[1:3]) < 1.0 + + +@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) +def test_vectors_span_doc_similarity(vocab, text): + doc = get_doc(vocab, text) + assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2]) + assert -1. < doc[0:2].similarity(doc) < 1.0 + + +@pytest.mark.parametrize('text1,text2', [ + (["apple", "and", "apple", "pie"], ["orange", "juice"])]) +def test_vectors_doc_doc_similarity(vocab, text1, text2): + doc1 = get_doc(vocab, text1) + doc2 = get_doc(vocab, text2) + assert doc1.similarity(doc2) == doc2.similarity(doc1) + assert -1. < doc1.similarity(doc2) < 1.0 diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 822a0152d..dd52c4cbf 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -238,6 +238,29 @@ cdef class Doc: def doc(self): return self + def char_span(self, int start_idx, int end_idx, label=0, vector=None): + """Create a `Span` object from the slice `doc.text[start : end]`. + + doc (Doc): The parent document. + start (int): The index of the first character of the span. + end (int): The index of the first character after the span. + label (uint64 or string): A label to attach to the Span, e.g. for named entities. + vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. + RETURNS (Span): The newly constructed object. + """ + if not isinstance(label, int): + label = self.vocab.strings.add(label) + cdef int start = token_by_start(self.c, self.length, start_idx) + if start == -1: + return None + cdef int end = token_by_end(self.c, self.length, end_idx) + if end == -1: + return None + # Currently we have the token index, we want the range-end index + end += 1 + cdef Span span = Span(self, start, end, label=label, vector=vector) + return span + def similarity(self, other): """Make a semantic similarity estimate. The default estimate is cosine similarity using an average of word vectors. 
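
The Doc.char_span method added above only returns a Span when both character offsets line up exactly with token boundaries; otherwise token_by_start or token_by_end returns -1 and the method returns None. A pure-Python sketch of that lookup, using toy offsets rather than the Cython helpers:

    def char_span(token_offsets, start_idx, end_idx):
        """Map a character slice to a (start_token, end_token) slice, or None.

        token_offsets is a list of (start_char, end_char) pairs, one per token.
        """
        starts = {start: i for i, (start, end) in enumerate(token_offsets)}
        ends = {end: i for i, (start, end) in enumerate(token_offsets)}
        if start_idx not in starts or end_idx not in ends:
            return None            # offsets don't line up with token boundaries
        # Convert the index of the last token into a range-end index, as the
        # `end += 1` in the implementation does.
        return starts[start_idx], ends[end_idx] + 1

    # 'I like New York' -> tokens 'I', 'like', 'New', 'York'
    offsets = [(0, 1), (2, 6), (7, 10), (11, 15)]
    assert char_span(offsets, 7, 15) == (2, 4)     # doc[2:4] == 'New York'
    assert char_span(offsets, 8, 15) is None       # 8 falls mid-token

With the patched Doc, the jade example added later in this diff, doc.char_span(7, 15, label=u'GPE'), corresponds to the first assertion.
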
diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd index 8d675c04f..9645189a5 100644 --- a/spacy/tokens/span.pxd +++ b/spacy/tokens/span.pxd @@ -15,5 +15,5 @@ cdef class Span: cdef public _vector cdef public _vector_norm - cpdef int _recalculate_indices(self) except -1 + cpdef np.ndarray to_array(self, object features) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 9f2115fe1..7e29cccf4 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -7,7 +7,7 @@ import numpy import numpy.linalg from libc.math cimport sqrt -from .doc cimport token_by_start, token_by_end +from .doc cimport token_by_start, token_by_end, get_token_attr from ..structs cimport TokenC, LexemeC from ..typedefs cimport flags_t, attr_t, hash_t from ..attrs cimport attr_id_t @@ -135,6 +135,29 @@ cdef class Span: return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) + cpdef np.ndarray to_array(self, object py_attr_ids): + """Given a list of M attribute IDs, export the tokens to a numpy + `ndarray` of shape `(N, M)`, where `N` is the length of the document. + The values will be 32-bit integers. + + attr_ids (list[int]): A list of attribute ID ints. + RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row + per word, and one column per attribute indicated in the input + `attr_ids`. + """ + cdef int i, j + cdef attr_id_t feature + cdef np.ndarray[attr_t, ndim=2] output + # Make an array from the attributes --- otherwise our inner loop is Python + # dict iteration. + cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64) + cdef int length = self.end - self.start + output = numpy.ndarray(shape=(length, len(attr_ids)), dtype=numpy.uint64) + for i in range(self.start, self.end): + for j, feature in enumerate(attr_ids): + output[i-self.start, j] = get_token_attr(&self.doc.c[i], feature) + return output + cpdef int _recalculate_indices(self) except -1: if self.end > self.doc.length \ or self.doc.c[self.start].idx != self.start_char \ diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 5b8c276d8..7b11d6efa 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -62,18 +62,26 @@ cdef class Token: def __richcmp__(self, Token other, int op): # http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html + cdef Doc my_doc = self.doc + cdef Doc other_doc = other.doc my = self.idx their = other.idx if other is not None else None if op == 0: return my < their elif op == 2: - return my == their + if my_doc is other_doc: + return my == their + else: + return False elif op == 4: return my > their elif op == 1: return my <= their elif op == 3: - return my != their + if my_doc is other_doc: + return my != their + else: + return True elif op == 5: return my >= their else: diff --git a/spacy/util.py b/spacy/util.py index d83fe3416..645f8b3f7 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -22,7 +22,7 @@ import ujson from .symbols import ORTH from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_ -from .compat import copy_array, normalize_string_keys, getattr_ +from .compat import copy_array, normalize_string_keys, getattr_, import_file LANGUAGES = {} @@ -112,15 +112,13 @@ def load_model(name, **overrides): def load_model_from_link(name, **overrides): """Load a model from a shortcut link, or directory in spaCy data path.""" - init_file = get_data_path() / name / '__init__.py' - spec = importlib.util.spec_from_file_location(name, str(init_file)) + path = 
get_data_path() / name / '__init__.py' try: - cls = importlib.util.module_from_spec(spec) + cls = import_file(name, path) except AttributeError: raise IOError( "Cant' load '%s'. If you're using a shortcut link, make sure it " "points to a valid model package (not just a data directory)." % name) - spec.loader.exec_module(cls) return cls.load(**overrides) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 35d4d17ab..72e30bd2f 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -1,18 +1,25 @@ +from __future__ import unicode_literals +from libc.stdint cimport int32_t, uint64_t import numpy from collections import OrderedDict import msgpack import msgpack_numpy msgpack_numpy.patch() +cimport numpy as np +from .typedefs cimport attr_t from .strings cimport StringStore from . import util +from .compat import basestring_ cdef class Vectors: '''Store, save and load word vectors.''' cdef public object data cdef readonly StringStore strings - cdef public object key2i + cdef public object key2row + cdef public object keys + cdef public int i def __init__(self, strings, data_or_width): self.strings = StringStore() @@ -21,10 +28,10 @@ cdef class Vectors: dtype='f') else: data = data_or_width + self.i = 0 self.data = data - self.key2i = {} - for i, string in enumerate(strings): - self.key2i[self.strings.add(string)] = i + self.key2row = {} + self.keys = np.ndarray((self.data.shape[0],), dtype='uint64') def __reduce__(self): return (Vectors, (self.strings, self.data)) @@ -32,7 +39,7 @@ cdef class Vectors: def __getitem__(self, key): if isinstance(key, basestring): key = self.strings[key] - i = self.key2i[key] + i = self.key2row[key] if i is None: raise KeyError(key) else: @@ -41,14 +48,36 @@ cdef class Vectors: def __setitem__(self, key, vector): if isinstance(key, basestring): key = self.strings.add(key) - i = self.key2i[key] + i = self.key2row[key] self.data[i] = vector def __iter__(self): yield from self.data def __len__(self): - return len(self.strings) + return self.i + + def __contains__(self, key): + if isinstance(key, basestring_): + key = self.strings[key] + return key in self.key2row + + def add(self, key, vector=None): + if isinstance(key, basestring_): + key = self.strings.add(key) + if key not in self.key2row: + i = self.i + if i >= self.keys.shape[0]: + self.keys.resize((self.keys.shape[0]*2,)) + self.data.resize((self.data.shape[0]*2, self.data.shape[1])) + self.key2row[key] = self.i + self.keys[self.i] = key + self.i += 1 + else: + i = self.key2row[key] + if vector is not None: + self.data[i] = vector + return i def items(self): for i, string in enumerate(self.strings): @@ -61,34 +90,60 @@ cdef class Vectors: def most_similar(self, key): raise NotImplementedError - def to_disk(self, path): - raise NotImplementedError + def to_disk(self, path, **exclude): + serializers = OrderedDict(( + ('vectors', lambda p: numpy.save(p.open('wb'), self.data, allow_pickle=False)), + ('keys', lambda p: numpy.save(p.open('wb'), self.keys, allow_pickle=False)), + )) + return util.to_disk(path, serializers, exclude) - def from_disk(self, path): - raise NotImplementedError + def from_disk(self, path, **exclude): + def load_keys(path): + if path.exists(): + self.keys = numpy.load(path) + for i, key in enumerate(self.keys): + self.keys[i] = key + self.key2row[key] = i + + def load_vectors(path): + if path.exists(): + self.data = numpy.load(path) + + serializers = OrderedDict(( + ('keys', load_keys), + ('vectors', load_vectors), + )) + util.from_disk(path, serializers, exclude) + return self def 
to_bytes(self, **exclude): def serialize_weights(): - if hasattr(self.weights, 'to_bytes'): - return self.weights.to_bytes() + if hasattr(self.data, 'to_bytes'): + return self.data.to_bytes() else: - return msgpack.dumps(self.weights) - + return msgpack.dumps(self.data) serializers = OrderedDict(( - ('strings', lambda: self.strings.to_bytes()), - ('weights', serialize_weights) + ('keys', lambda: msgpack.dumps(self.keys)), + ('vectors', serialize_weights) )) return util.to_bytes(serializers, exclude) def from_bytes(self, data, **exclude): def deserialize_weights(b): - if hasattr(self.weights, 'from_bytes'): - self.weights.from_bytes() + if hasattr(self.data, 'from_bytes'): + self.data.from_bytes() else: - self.weights = msgpack.loads(b) + self.data = msgpack.loads(b) + + def load_keys(keys): + self.keys.resize((len(keys),)) + for i, key in enumerate(keys): + self.keys[i] = key + self.key2row[key] = i deserializers = OrderedDict(( - ('strings', lambda b: self.strings.from_bytes(b)), - ('weights', deserialize_weights) + ('keys', lambda b: load_keys(msgpack.loads(b))), + ('vectors', deserialize_weights) )) - return util.from_bytes(deserializers, exclude) + util.from_bytes(data, deserializers, exclude) + return self diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 149317779..dc141552d 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -19,9 +19,10 @@ from .tokens.token cimport Token from .attrs cimport PROB, LANG from .structs cimport SerializedLexemeC -from .compat import copy_reg, pickle +from .compat import copy_reg, pickle, basestring_ from .lemmatizer import Lemmatizer from .attrs import intify_attrs +from .vectors import Vectors from . import util from . import attrs from . import symbols @@ -63,6 +64,7 @@ cdef class Vocab: self.strings.add(name) self.lex_attr_getters = lex_attr_getters self.morphology = Morphology(self.strings, tag_map, lemmatizer) + self.vectors = Vectors(self.strings, 300) property lang: def __get__(self): @@ -242,13 +244,15 @@ cdef class Vocab: @property def vectors_length(self): - raise NotImplementedError + return len(self.vectors) - def clear_vectors(self): + def clear_vectors(self, new_dim=None): """Drop the current vector table. Because all vectors must be the same width, you have to call this to change the size of the vectors. """ - raise NotImplementedError + if new_dim is None: + new_dim = self.vectors.data.shape[1] + self.vectors = Vectors(self.strings, new_dim) def get_vector(self, orth): """Retrieve a vector for a word in the vocabulary. @@ -262,7 +266,9 @@ cdef class Vocab: RAISES: If no vectors data is loaded, ValueError is raised. """ - raise NotImplementedError + if isinstance(orth, basestring_): + orth = self.strings.add(orth) + return self.vectors[orth] def set_vector(self, orth, vector): """Set a vector for a word in the vocabulary. @@ -272,15 +278,19 @@ cdef class Vocab: RETURNS: None """ - raise NotImplementedError + if not isinstance(orth, basestring_): + orth = self.strings[orth] + self.vectors.add(orth, vector=vector) def has_vector(self, orth): """Check whether a word has a vector. Returns False if no vectors have been loaded. Words can be looked up by string or int ID.""" - return False + if isinstance(orth, basestring_): + orth = self.strings.add(orth) + return orth in self.vectors - def to_disk(self, path): + def to_disk(self, path, **exclude): """Save the current state to a directory. 
path (unicode or Path): A path to a directory, which will be created if @@ -292,8 +302,10 @@ cdef class Vocab: self.strings.to_disk(path / 'strings.json') with (path / 'lexemes.bin').open('wb') as file_: file_.write(self.lexemes_to_bytes()) + if self.vectors is not None: + self.vectors.to_disk(path) - def from_disk(self, path): + def from_disk(self, path, **exclude): """Loads state from a directory. Modifies the object in place and returns it. @@ -305,6 +317,8 @@ cdef class Vocab: self.strings.from_disk(path / 'strings.json') with (path / 'lexemes.bin').open('rb') as file_: self.lexemes_from_bytes(file_.read()) + if self.vectors is not None: + self.vectors.from_disk(path, exclude='strings.json') return self def to_bytes(self, **exclude): @@ -313,9 +327,16 @@ cdef class Vocab: **exclude: Named attributes to prevent from being serialized. RETURNS (bytes): The serialized form of the `Vocab` object. """ + def deserialize_vectors(): + if self.vectors is None: + return None + else: + return self.vectors.to_bytes(exclude='strings.json') + getters = OrderedDict(( ('strings', lambda: self.strings.to_bytes()), ('lexemes', lambda: self.lexemes_to_bytes()), + ('vectors', deserialize_vectors) )) return util.to_bytes(getters, exclude) @@ -326,9 +347,15 @@ cdef class Vocab: **exclude: Named attributes to prevent from being loaded. RETURNS (Vocab): The `Vocab` object. """ + def serialize_vectors(b): + if self.vectors is None: + return None + else: + return self.vectors.from_bytes(b, exclude='strings') setters = OrderedDict(( ('strings', lambda b: self.strings.from_bytes(b)), ('lexemes', lambda b: self.lexemes_from_bytes(b)), + ('vectors', lambda b: serialize_vectors(b)) )) util.from_bytes(bytes_data, setters, exclude) return self diff --git a/website/assets/css/_base/_utilities.sass b/website/assets/css/_base/_utilities.sass index 2c40858a8..46c3e84d9 100644 --- a/website/assets/css/_base/_utilities.sass +++ b/website/assets/css/_base/_utilities.sass @@ -112,6 +112,10 @@ .u-nowrap white-space: nowrap +.u-break.u-break + word-wrap: break-word + white-space: initial + .u-no-border border: none diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade index 929985144..7fbbcce97 100644 --- a/website/docs/api/doc.jade +++ b/website/docs/api/doc.jade @@ -140,6 +140,43 @@ p Get the number of tokens in the document. +cell int +cell The number of tokens in the document. ++h(2, "char_span") Doc.char_span + +tag method + +tag-new(2) + +p Create a #[code Span] object from the slice #[code doc.text[start : end]]. + ++aside-code("Example"). + doc = nlp(u'I like New York') + span = doc.char_span(7, 15, label=u'GPE') + assert span.text == 'New York' + ++table(["Name", "Type", "Description"]) + +row + +cell #[code start] + +cell int + +cell The index of the first character of the span. + + +row + +cell #[code end] + +cell int + +cell The index of the first character after the span. + + +row + +cell #[code label] + +cell uint64 / unicode + +cell A label to attach to the Span, e.g. for named entities. + + +row + +cell #[code vector] + +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] + +cell A meaning representation of the span. + + +footrow + +cell returns + +cell #[code Span] + +cell The newly constructed object. + +h(2, "similarity") Doc.similarity +tag method +tag-model("vectors") @@ -211,12 +248,12 @@ p +table(["Name", "Type", "Description"]) +row +cell #[code attr_ids] - +cell ints + +cell list +cell A list of attribute ID ints. 
+footrow +cell returns - +cell #[code numpy.ndarray[ndim=2, dtype='int32']] + +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']] +cell | The exported attributes as a 2D numpy array, with one row per | token and one column per attribute. @@ -245,7 +282,7 @@ p +row +cell #[code array] - +cell #[code numpy.ndarray[ndim=2, dtype='int32']] + +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']] +cell The attribute values to load. +footrow @@ -509,7 +546,7 @@ p +table(["Name", "Type", "Description"]) +footrow +cell returns - +cell #[code numpy.ndarray[ndim=1, dtype='float32']] + +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] +cell A 1D numpy array representing the document's semantics. +h(2, "vector_norm") Doc.vector_norm diff --git a/website/docs/api/language.jade b/website/docs/api/language.jade index 9c26f506c..69665ee9d 100644 --- a/website/docs/api/language.jade +++ b/website/docs/api/language.jade @@ -111,6 +111,14 @@ p +cell - +cell A sequence of unicode objects. + +row + +cell #[code as_tuples] + +cell bool + +cell + | If set to #[code True], inputs should be a sequence of + | #[code (text, context)] tuples. Output will then be a sequence of + | #[code (doc, context)] tuples. Defaults to #[code False]. + +row +cell #[code n_threads] +cell int diff --git a/website/docs/api/lexeme.jade b/website/docs/api/lexeme.jade index a0487be9b..6e3f68493 100644 --- a/website/docs/api/lexeme.jade +++ b/website/docs/api/lexeme.jade @@ -129,7 +129,7 @@ p A real-valued meaning representation. +table(["Name", "Type", "Description"]) +footrow +cell returns - +cell #[code numpy.ndarray[ndim=1, dtype='float32']] + +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] +cell A 1D numpy array representing the lexeme's semantics. +h(2, "vector_norm") Lexeme.vector_norm diff --git a/website/docs/api/span.jade b/website/docs/api/span.jade index 542336714..2ca2d3ea9 100644 --- a/website/docs/api/span.jade +++ b/website/docs/api/span.jade @@ -37,7 +37,7 @@ p Create a Span object from the #[code slice doc[start : end]]. +row +cell #[code vector] - +cell #[code numpy.ndarray[ndim=1, dtype='float32']] + +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] +cell A meaning representation of the span. +footrow @@ -145,11 +145,47 @@ p +cell float +cell A scalar similarity score. Higher is more similar. ++h(2, "to_array") Span.to_array + +tag method + +tag-new(2) + +p + | Given a list of #[code M] attribute IDs, export the tokens to a numpy + | #[code ndarray] of shape #[code (N, M)], where #[code N] is the length of + | the document. The values will be 32-bit integers. + ++aside-code("Example"). + from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA + doc = nlp(u'I like New York in Autumn.') + span = doc[2:3] + # All strings mapped to integers, for easy export to numpy + np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code attr_ids] + +cell list + +cell A list of attribute ID ints. + + +footrow + +cell returns + +cell #[code.u-break numpy.ndarray[long, ndim=2]] + +cell + | A feature matrix, with one row per word, and one column per + | attribute indicated in the input #[code attr_ids]. + +h(2, "merge") Span.merge +tag method p Retokenize the document, such that the span is merged into a single token. ++aside-code("Example"). 
+    doc = nlp(u'I like New York in Autumn.')
+    span = doc[2:3]
+    span.merge()
+    assert len(doc) == 6
+    assert doc[2].text == 'New York'
+
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code **attributes]
@@ -270,7 +306,7 @@ p
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the span's semantics.
 
 +h(2, "vector_norm") Span.vector_norm
diff --git a/website/docs/api/token.jade b/website/docs/api/token.jade
index 87387e09d..db445d09b 100644
--- a/website/docs/api/token.jade
+++ b/website/docs/api/token.jade
@@ -250,7 +250,7 @@ p A real-valued meaning representation.
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the token's semantics.
 
 +h(2, "vector_norm") Span.vector_norm
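
The language.jade hunk earlier in this patch documents the new as_tuples flag for Language.pipe but adds no code example. A dependency-free sketch of the (text, context) plumbing the flag describes; this illustrates the pattern only and is not spaCy's implementation:

    def pipe_with_context(process, stream, as_tuples=False):
        """Apply process to a stream; with as_tuples=True, carry a context through."""
        if as_tuples:
            for text, context in stream:
                yield process(text), context
        else:
            for text in stream:
                yield process(text)

    data = [('One document.', {'id': 1}), ('Another one.', {'id': 2})]
    for doc, context in pipe_with_context(str.upper, data, as_tuples=True):
        print(doc, context['id'])

With a loaded pipeline, the call would be nlp.pipe(data, as_tuples=True), yielding (doc, context) pairs as the documented table states.
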