commit 1ee75ae337

    Merge remote-tracking branch 'origin/develop' into develop-irish
.gitignore
@@ -40,7 +40,6 @@ venv/
 
 # Distribution / packaging
 env/
-bin/
 build/
 develop-eggs/
 dist/

.travis.yml
@@ -14,8 +14,7 @@ os:
 env:
     - VIA=compile LC_ALL=en_US.ascii
     - VIA=compile
+    #- VIA=pypi_nightly
-    # - VIA=sdist
 
 install:
     - "./travis.sh"
@@ -23,7 +22,7 @@ install:
 script:
     - "pip install pytest pytest-timeout"
     - if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi
-    - if [[ "${VIA}" == "pypi" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(ospath.dirname(spacy.__file__)))"`; fi
+    - if [[ "${VIA}" == "pypi_nightly" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
     - if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
 
 notifications:

MANIFEST.in
@@ -1,3 +1,4 @@
 recursive-include include *.h
 include LICENSE
 include README.rst
+include bin/spacy

README.rst
@@ -229,7 +229,7 @@ Compile from source
 The other way to install spaCy is to clone its
 `GitHub repository <https://github.com/explosion/spaCy>`_ and build it from
 source. That is the common way if you want to make changes to the code base.
-You'll need to make sure that you have a development enviroment consisting of a
+You'll need to make sure that you have a development environment consisting of a
 Python distribution including header files, a compiler,
 `pip <https://pip.pypa.io/en/latest/installing/>`__, `virtualenv <https://virtualenv.pypa.io/>`_
 and `git <https://git-scm.com>`_ installed. The compiler part is the trickiest.

setup.py
@@ -28,7 +28,9 @@ MOD_NAMES = [
     'spacy.pipeline',
     'spacy.syntax.stateclass',
     'spacy.syntax._state',
+    'spacy.syntax._beam_utils',
     'spacy.tokenizer',
+    'spacy._cfile',
     'spacy.syntax.parser',
     'spacy.syntax.nn_parser',
     'spacy.syntax.beam_parser',
@@ -187,6 +189,7 @@ def setup_package():
         url=about['__uri__'],
         license=about['__license__'],
         ext_modules=ext_modules,
+        scripts=['bin/spacy'],
         install_requires=[
             'numpy>=1.7',
             'murmurhash>=0.28,<0.29',

spacy/__main__.py
@@ -3,15 +3,23 @@ from __future__ import print_function
 # NB! This breaks in plac on Python 2!!
 #from __future__ import unicode_literals
 
 
 if __name__ == '__main__':
     import plac
     import sys
-    from spacy.cli import download, link, info, package, train, convert
+    from spacy.cli import download, link, info, package, train, convert, model
+    from spacy.cli import profile
     from spacy.util import prints
 
-    commands = {'download': download, 'link': link, 'info': info, 'train': train,
-                'convert': convert, 'package': package}
+    commands = {
+        'download': download,
+        'link': link,
+        'info': info,
+        'train': train,
+        'convert': convert,
+        'package': package,
+        'model': model,
+        'profile': profile,
+    }
     if len(sys.argv) == 1:
         prints(', '.join(commands), title="Available commands", exits=1)
     command = sys.argv.pop(1)
@@ -19,5 +27,7 @@ if __name__ == '__main__':
     if command in commands:
         plac.call(commands[command])
     else:
-        prints("Available: %s" % ', '.join(commands),
-               title="Unknown command: %s" % command, exits=1)
+        prints(
+            "Available: %s" % ', '.join(commands),
+            title="Unknown command: %s" % command,
+            exits=1)
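
For illustration only (not part of the commit): the dispatch pattern spacy/__main__.py uses above, reduced to a minimal standalone sketch. A dict maps sub-command names to plain functions, the command name is popped off sys.argv, and plac builds the argument parser from the chosen function's signature. The command names mirror the diff; the two toy functions are invented here.

    import sys
    import plac

    def download(model):
        """Toy stand-in: pretend to download a model package."""
        print("downloading", model)

    def info():
        """Toy stand-in: pretend to print installation info."""
        print("spaCy info")

    commands = {
        'download': download,
        'info': info,
    }

    if __name__ == '__main__':
        if len(sys.argv) == 1:
            print("Available commands:", ', '.join(commands))
            sys.exit(1)
        command = sys.argv.pop(1)   # plac now only sees the sub-command's own args
        if command in commands:
            plac.call(commands[command])
        else:
            print("Unknown command: %s" % command)
            sys.exit(1)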

spacy/_cfile.pxd (new file)
@@ -0,0 +1,26 @@
+from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
+from cymem.cymem cimport Pool
+
+cdef class CFile:
+    cdef FILE* fp
+    cdef bint is_open
+    cdef Pool mem
+    cdef int size # For compatibility with subclass
+    cdef int _capacity # For compatibility with subclass
+
+    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
+
+    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
+
+    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *
+
+
+
+cdef class StringCFile(CFile):
+    cdef unsigned char* data
+
+    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
+
+    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
+
+    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *

spacy/_cfile.pyx (new file)
@@ -0,0 +1,88 @@
+from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
+from libc.string cimport memcpy
+
+
+cdef class CFile:
+    def __init__(self, loc, mode, on_open_error=None):
+        if isinstance(mode, unicode):
+            mode_str = mode.encode('ascii')
+        else:
+            mode_str = mode
+        if hasattr(loc, 'as_posix'):
+            loc = loc.as_posix()
+        self.mem = Pool()
+        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
+        self.fp = fopen(<char*>bytes_loc, mode_str)
+        if self.fp == NULL:
+            if on_open_error is not None:
+                on_open_error()
+            else:
+                raise IOError("Could not open binary file %s" % bytes_loc)
+        self.is_open = True
+
+    def __dealloc__(self):
+        if self.is_open:
+            fclose(self.fp)
+
+    def close(self):
+        fclose(self.fp)
+        self.is_open = False
+
+    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
+        st = fread(dest, elem_size, number, self.fp)
+        if st != number:
+            raise IOError
+
+    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1:
+        st = fwrite(src, elem_size, number, self.fp)
+        if st != number:
+            raise IOError
+
+    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
+        cdef void* dest = mem.alloc(number, elem_size)
+        self.read_into(dest, number, elem_size)
+        return dest
+
+    def write_unicode(self, unicode value):
+        cdef bytes py_bytes = value.encode('utf8')
+        cdef char* chars = <char*>py_bytes
+        self.write(sizeof(char), len(py_bytes), chars)
+
+
+cdef class StringCFile:
+    def __init__(self, mode, bytes data=b'', on_open_error=None):
+        self.mem = Pool()
+        self.is_open = 'w' in mode
+        self._capacity = max(len(data), 8)
+        self.size = len(data)
+        self.data = <unsigned char*>self.mem.alloc(1, self._capacity)
+        for i in range(len(data)):
+            self.data[i] = data[i]
+
+    def close(self):
+        self.is_open = False
+
+    def string_data(self):
+        return (self.data-self.size)[:self.size]
+
+    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
+        memcpy(dest, self.data, elem_size * number)
+        self.data += elem_size * number
+
+    cdef int write_from(self, void* src, size_t elem_size, size_t number) except -1:
+        write_size = number * elem_size
+        if (self.size + write_size) >= self._capacity:
+            self._capacity = (self.size + write_size) * 2
+            self.data = <unsigned char*>self.mem.realloc(self.data, self._capacity)
+        memcpy(&self.data[self.size], src, elem_size * number)
+        self.size += write_size
+
+    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
+        cdef void* dest = mem.alloc(number, elem_size)
+        self.read_into(dest, number, elem_size)
+        return dest
+
+    def write_unicode(self, unicode value):
+        cdef bytes py_bytes = value.encode('utf8')
+        cdef char* chars = <char*>py_bytes
+        self.write(sizeof(char), len(py_bytes), chars)
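
For illustration only (not part of the commit): a rough pure-Python analogue of the buffer-growth strategy StringCFile.write_from uses above. When a write would overflow the current capacity, the capacity is doubled past the required size and the new bytes are appended at the current offset. The class below is invented for illustration.

    class GrowableBuffer(object):
        def __init__(self, data=b''):
            self._capacity = max(len(data), 8)
            self.size = len(data)
            self.data = bytearray(self._capacity)
            self.data[:self.size] = data

        def write(self, payload):
            write_size = len(payload)
            if (self.size + write_size) >= self._capacity:
                # Grow past the required size, as in StringCFile.write_from.
                self._capacity = (self.size + write_size) * 2
                new_data = bytearray(self._capacity)
                new_data[:self.size] = self.data[:self.size]
                self.data = new_data
            self.data[self.size:self.size + write_size] = payload
            self.size += write_size

        def string_data(self):
            return bytes(self.data[:self.size])

    buf = GrowableBuffer()
    buf.write(b'hello ')
    buf.write(b'world')
    assert buf.string_data() == b'hello world'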

spacy/_ml.py
@@ -9,7 +9,7 @@ import cytoolz
 
 from thinc.neural._classes.convolution import ExtractWindow
 from thinc.neural._classes.static_vectors import StaticVectors
-from thinc.neural._classes.batchnorm import BatchNorm
+from thinc.neural._classes.batchnorm import BatchNorm as BN
 from thinc.neural._classes.layernorm import LayerNorm as LN
 from thinc.neural._classes.resnet import Residual
 from thinc.neural import ReLu
@@ -23,8 +23,10 @@ from thinc.neural._classes.attention import ParametricAttention
 from thinc.linear.linear import LinearModel
 from thinc.api import uniqued, wrap, flatten_add_lengths
 
-from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP
+from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER
 from .tokens.doc import Doc
+from . import util
 
 import numpy
 import io
@@ -208,25 +210,39 @@ class PrecomputableMaxouts(Model):
         return Yfp, backward
 
 
+def drop_layer(layer, factor=2.):
+    def drop_layer_fwd(X, drop=0.):
+        if drop <= 0.:
+            return layer.begin_update(X, drop=drop)
+        else:
+            coinflip = layer.ops.xp.random.random()
+            if (coinflip / factor) >= drop:
+                return layer.begin_update(X, drop=drop)
+            else:
+                return X, lambda dX, sgd=None: dX
+
+    model = wrap(drop_layer_fwd, layer)
+    model.predict = layer
+    return model
+
+
 def Tok2Vec(width, embed_size, preprocess=None):
     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
-        norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower')
-        prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix')
-        suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix')
-        shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape')
+        norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
+        prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix')
+        suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
+        shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
 
-        embed = (norm | prefix | suffix | shape )
+        embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3))
         tok2vec = (
             with_flatten(
                 asarray(Model.ops, dtype='uint64')
                 >> uniqued(embed, column=5)
-                >> LN(Maxout(width, width*4, pieces=3))
-                >> Residual(ExtractWindow(nW=1) >> SELU(width, width*3))
-                >> Residual(ExtractWindow(nW=1) >> SELU(width, width*3))
-                >> Residual(ExtractWindow(nW=1) >> SELU(width, width*3))
-                >> Residual(ExtractWindow(nW=1) >> SELU(width, width*3)),
-                pad=4)
+                >> Residual(
+                    (ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
+                ) ** 4, pad=4
+            )
         )
         if preprocess not in (False, None):
             tok2vec = preprocess >> tok2vec
@@ -321,6 +337,7 @@ def zero_init(model):
 
 
 def doc2feats(cols=None):
+    if cols is None:
         cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
     def forward(docs, drop=0.):
         feats = []
@@ -353,20 +370,37 @@ def fine_tune(embedding, combine=None):
             "fine_tune currently only supports addition. Set combine=None")
     def fine_tune_fwd(docs_tokvecs, drop=0.):
         docs, tokvecs = docs_tokvecs
 
         lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i')
 
         vecs, bp_vecs = embedding.begin_update(docs, drop=drop)
+        flat_tokvecs = embedding.ops.flatten(tokvecs)
+        flat_vecs = embedding.ops.flatten(vecs)
         output = embedding.ops.unflatten(
-            embedding.ops.flatten(tokvecs)
-            + embedding.ops.flatten(vecs),
-            lengths)
+            (model.mix[0] * flat_tokvecs + model.mix[1] * flat_vecs), lengths)
 
         def fine_tune_bwd(d_output, sgd=None):
-            bp_vecs(d_output, sgd=sgd)
-            return d_output
+            flat_grad = model.ops.flatten(d_output)
+            model.d_mix[0] += flat_tokvecs.dot(flat_grad.T).sum()
+            model.d_mix[1] += flat_vecs.dot(flat_grad.T).sum()
+
+            bp_vecs([d_o * model.mix[1] for d_o in d_output], sgd=sgd)
+            if sgd is not None:
+                sgd(model._mem.weights, model._mem.gradient, key=model.id)
+            return [d_o * model.mix[0] for d_o in d_output]
         return output, fine_tune_bwd
 
+    def fine_tune_predict(docs_tokvecs):
+        docs, tokvecs = docs_tokvecs
+        vecs = embedding(docs)
+        return [model.mix[0]*tv+model.mix[1]*v
+                for tv, v in zip(tokvecs, vecs)]
+
     model = wrap(fine_tune_fwd, embedding)
+    model.mix = model._mem.add((model.id, 'mix'), (2,))
+    model.mix.fill(0.5)
+    model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix'))
+    model.predict = fine_tune_predict
     return model
 
 
@@ -422,9 +456,10 @@ def getitem(i):
     return layerize(getitem_fwd)
 
 def build_tagger_model(nr_class, token_vector_width, **cfg):
+    embed_size = util.env_opt('embed_size', 7500)
     with Model.define_operators({'>>': chain, '+': add}):
         # Input: (doc, tensor) tuples
-        private_tok2vec = Tok2Vec(token_vector_width, 7500, preprocess=doc2feats())
+        private_tok2vec = Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats())
 
         model = (
             fine_tune(private_tok2vec)
@@ -437,30 +472,103 @@ def build_tagger_model(nr_class, token_vector_width, **cfg):
     return model
 
 
+@layerize
+def SpacyVectors(docs, drop=0.):
+    xp = get_array_module(docs[0].vocab.vectors.data)
+    width = docs[0].vocab.vectors.data.shape[1]
+    batch = []
+    for doc in docs:
+        indices = numpy.zeros((len(doc),), dtype='i')
+        for i, word in enumerate(doc):
+            if word.orth in doc.vocab.vectors.key2row:
+                indices[i] = doc.vocab.vectors.key2row[word.orth]
+            else:
+                indices[i] = 0
+        vectors = doc.vocab.vectors.data[indices]
+        batch.append(vectors)
+    return batch, None
+
+
+def foreach(layer, drop_factor=1.0):
+    '''Map a layer across elements in a list'''
+    def foreach_fwd(Xs, drop=0.):
+        drop *= drop_factor
+        ys = []
+        backprops = []
+        for X in Xs:
+            y, bp_y = layer.begin_update(X, drop=drop)
+            ys.append(y)
+            backprops.append(bp_y)
+        def foreach_bwd(d_ys, sgd=None):
+            d_Xs = []
+            for d_y, bp_y in zip(d_ys, backprops):
+                if bp_y is not None and bp_y is not None:
+                    d_Xs.append(d_y, sgd=sgd)
+                else:
+                    d_Xs.append(None)
+            return d_Xs
+        return ys, foreach_bwd
+    model = wrap(foreach_fwd, layer)
+    return model
+
+
 def build_text_classifier(nr_class, width=64, **cfg):
-    nr_vector = cfg.get('nr_vector', 200)
-    with Model.define_operators({'>>': chain, '+': add, '|': concatenate, '**': clone}):
-        embed_lower = HashEmbed(width, nr_vector, column=1)
-        embed_prefix = HashEmbed(width//2, nr_vector, column=2)
-        embed_suffix = HashEmbed(width//2, nr_vector, column=3)
-        embed_shape = HashEmbed(width//2, nr_vector, column=4)
+    nr_vector = cfg.get('nr_vector', 5000)
+    with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
+                                 '**': clone}):
+        if cfg.get('low_data'):
+            model = (
+                SpacyVectors
+                >> flatten_add_lengths
+                >> with_getitem(0,
+                    Affine(width, 300)
+                )
+                >> ParametricAttention(width)
+                >> Pooling(sum_pool)
+                >> Residual(ReLu(width, width)) ** 2
+                >> zero_init(Affine(nr_class, width, drop_factor=0.0))
+                >> logistic
+            )
+            return model
+
+
+        lower = HashEmbed(width, nr_vector, column=1)
+        prefix = HashEmbed(width//2, nr_vector, column=2)
+        suffix = HashEmbed(width//2, nr_vector, column=3)
+        shape = HashEmbed(width//2, nr_vector, column=4)
+
+        trained_vectors = (
+            FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
+            >> with_flatten(
+                uniqued(
+                    (lower | prefix | suffix | shape)
+                    >> LN(Maxout(width, width+(width//2)*3)),
+                    column=0
+                )
+            )
+        )
+
+        static_vectors = (
+            SpacyVectors
+            >> with_flatten(Affine(width, 300))
+        )
 
         cnn_model = (
-            FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE])
-            >> _flatten_add_lengths
-            >> with_getitem(0,
-                uniqued(
-                    (embed_lower | embed_prefix | embed_suffix | embed_shape)
-                    >> Maxout(width, width+(width//2)*3))
-                >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3))
-                >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3))
-                >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3))
+            # TODO Make concatenate support lists
+            concatenate_lists(trained_vectors, static_vectors)
+            >> with_flatten(
+                LN(Maxout(width, width*2))
+                >> Residual(
+                    (ExtractWindow(nW=1) >> zero_init(Maxout(width, width*3)))
+                ) ** 2, pad=2
             )
-            >> ParametricAttention(width,)
+            >> flatten_add_lengths
+            >> ParametricAttention(width)
             >> Pooling(sum_pool)
-            >> ReLu(width, width)
+            >> Residual(zero_init(Maxout(width, width)))
             >> zero_init(Affine(nr_class, width, drop_factor=0.0))
         )
 
         linear_model = (
            _preprocess_doc
            >> LinearModel(nr_class, drop_factor=0.)
@@ -475,3 +583,35 @@ def build_text_classifier(nr_class, width=64, **cfg):
     model.lsuv = False
     return model
 
+
+@layerize
+def flatten(seqs, drop=0.):
+    ops = Model.ops
+    lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
+    def finish_update(d_X, sgd=None):
+        return ops.unflatten(d_X, lengths, pad=0)
+    X = ops.flatten(seqs, pad=0)
+    return X, finish_update
+
+
+def concatenate_lists(*layers, **kwargs): # pragma: no cover
+    '''Compose two or more models `f`, `g`, etc, such that their outputs are
+    concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
+    '''
+    if not layers:
+        return noop()
+    drop_factor = kwargs.get('drop_factor', 1.0)
+    ops = layers[0].ops
+    layers = [chain(layer, flatten) for layer in layers]
+    concat = concatenate(*layers)
+    def concatenate_lists_fwd(Xs, drop=0.):
+        drop *= drop_factor
+        lengths = ops.asarray([len(X) for X in Xs], dtype='i')
+        flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
+        ys = ops.unflatten(flat_y, lengths)
+        def concatenate_lists_bwd(d_ys, sgd=None):
+            return bp_flat_y(ops.flatten(d_ys), sgd=sgd)
+        return ys, concatenate_lists_bwd
+    model = wrap(concatenate_lists_fwd, concat)
+    return model
+
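
For illustration only (not part of the commit): the weighted mixing that fine_tune now learns in spacy/_ml.py above, written out in plain numpy. mix[0] scales the incoming tensor (tokvecs) and mix[1] scales the freshly embedded vectors, both starting at 0.5 as in the diff; the shapes below are made up, and the gradient expressions simply mirror the ones in the diff.

    import numpy as np

    mix = np.array([0.5, 0.5])
    flat_tokvecs = np.random.rand(6, 4)   # 6 tokens, width 4 (made-up shape)
    flat_vecs = np.random.rand(6, 4)

    # Forward: a learned weighted sum of the two representations.
    output = mix[0] * flat_tokvecs + mix[1] * flat_vecs

    # Backward, mirroring the diff: each mixing weight accumulates a summed
    # product of its input stream with the output gradient, and the gradient
    # passed on to each stream is scaled by that stream's mixing weight.
    flat_grad = np.random.rand(6, 4)      # stand-in for d_loss/d_output
    d_mix = np.zeros(2)
    d_mix[0] += flat_tokvecs.dot(flat_grad.T).sum()
    d_mix[1] += flat_vecs.dot(flat_grad.T).sum()

    d_tokvecs = flat_grad * mix[0]
    d_vecs = flat_grad * mix[1]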

spacy/about.py
@@ -3,7 +3,7 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
 
 __title__ = 'spacy-nightly'
-__version__ = '2.0.0a7'
+__version__ = '2.0.0a13'
 __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
 __author__ = 'Explosion AI'

spacy/cli/__init__.py
@@ -2,5 +2,7 @@ from .download import download
 from .info import info
 from .link import link
 from .package import package
+from .profile import profile
 from .train import train
 from .convert import convert
+from .model import model

spacy/cli/convert.py
@@ -21,10 +21,10 @@ CONVERTERS = {
 @plac.annotations(
     input_file=("input file", "positional", None, str),
     output_dir=("output directory for converted file", "positional", None, str),
-    n_sents=("Number of sentences per doc", "option", "n", float),
+    n_sents=("Number of sentences per doc", "option", "n", int),
     morphology=("Enable appending morphology to tags", "flag", "m", bool)
 )
-def convert(cmd, input_file, output_dir, n_sents, morphology):
+def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
     """
     Convert files into JSON format for use with train command and other
     experiment management functions.

spacy/cli/download.py
@@ -8,7 +8,7 @@ import subprocess
 import sys
 
 from .link import link
-from ..util import prints
+from ..util import prints, get_package_path
 from .. import about
 
 
@@ -24,15 +24,20 @@ def download(cmd, model, direct=False):
     with version.
     """
     if direct:
-        download_model('{m}/{m}.tar.gz'.format(m=model))
+        dl = download_model('{m}/{m}.tar.gz'.format(m=model))
     else:
         shortcuts = get_json(about.__shortcuts__, "available shortcuts")
         model_name = shortcuts.get(model, model)
         compatibility = get_compatibility()
         version = get_version(model_name, compatibility)
-        download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
+        dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
+        if dl == 0:
             try:
-                link(None, model_name, model, force=True)
+                # Get package path here because link uses
+                # pip.get_installed_distributions() to check if model is a package,
+                # which fails if model was just installed via subprocess
+                package_path = get_package_path(model_name)
+                link(None, model_name, model, force=True, model_path=package_path)
             except:
                 # Dirty, but since spacy.download and the auto-linking is mostly
                 # a convenience wrapper, it's best to show a success message and
@@ -73,6 +78,6 @@ def get_version(model, comp):
 
 def download_model(filename):
     download_url = about.__download_url__ + '/' + filename
-    subprocess.call([sys.executable, '-m',
+    return subprocess.call([sys.executable, '-m',
                      'pip', 'install', '--no-cache-dir', download_url],
                     env=os.environ.copy())

spacy/cli/link.py
@@ -14,7 +14,7 @@ from .. import util
     link_name=("name of shortuct link to create", "positional", None, str),
     force=("force overwriting of existing link", "flag", "f", bool)
 )
-def link(cmd, origin, link_name, force=False):
+def link(cmd, origin, link_name, force=False, model_path=None):
     """
     Create a symlink for models within the spacy/data directory. Accepts
     either the name of a pip package, or the local path to the model data
@@ -23,7 +23,7 @@ def link(cmd, origin, link_name, force=False):
     if util.is_package(origin):
         model_path = util.get_package_path(origin)
     else:
-        model_path = Path(origin)
+        model_path = Path(origin) if model_path is None else Path(model_path)
     if not model_path.exists():
         prints("The data should be located in %s" % path2str(model_path),
                title="Can't locate model data", exits=1)

spacy/cli/model.py (new file)
@@ -0,0 +1,137 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import bz2
+import gzip
+import math
+from ast import literal_eval
+from pathlib import Path
+
+import numpy as np
+import spacy
+from preshed.counter import PreshCounter
+
+from .. import util
+from ..compat import fix_text
+
+
+def model(cmd, lang, model_dir, freqs_data, clusters_data, vectors_data,
+          min_doc_freq=5, min_word_freq=200):
+    model_path = Path(model_dir)
+    freqs_path = Path(freqs_data)
+    clusters_path = Path(clusters_data) if clusters_data else None
+    vectors_path = Path(vectors_data) if vectors_data else None
+
+    check_dirs(freqs_path, clusters_path, vectors_path)
+    vocab = util.get_lang_class(lang).Defaults.create_vocab()
+    nlp = spacy.blank(lang)
+    vocab = nlp.vocab
+    probs, oov_prob = read_probs(
+        freqs_path, min_doc_freq=int(min_doc_freq), min_freq=int(min_doc_freq))
+    clusters = read_clusters(clusters_path) if clusters_path else {}
+    populate_vocab(vocab, clusters, probs, oov_prob)
+    add_vectors(vocab, vectors_path)
+    create_model(model_path, nlp)
+
+
+def add_vectors(vocab, vectors_path):
+    with bz2.BZ2File(vectors_path.as_posix()) as f:
+        num_words, dim = next(f).split()
+        vocab.clear_vectors(int(dim))
+        for line in f:
+            word_w_vector = line.decode("utf8").strip().split(" ")
+            word = word_w_vector[0]
+            vector = np.array([float(val) for val in word_w_vector[1:]])
+            if word in vocab:
+                vocab.set_vector(word, vector)
+
+
+def create_model(model_path, model):
+    if not model_path.exists():
+        model_path.mkdir()
+    model.to_disk(model_path.as_posix())
+
+
+def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
+    counts = PreshCounter()
+    total = 0
+    freqs_file = check_unzip(freqs_path)
+    for i, line in enumerate(freqs_file):
+        freq, doc_freq, key = line.rstrip().split('\t', 2)
+        freq = int(freq)
+        counts.inc(i + 1, freq)
+        total += freq
+    counts.smooth()
+    log_total = math.log(total)
+    freqs_file = check_unzip(freqs_path)
+    probs = {}
+    for line in freqs_file:
+        freq, doc_freq, key = line.rstrip().split('\t', 2)
+        doc_freq = int(doc_freq)
+        freq = int(freq)
+        if doc_freq >= min_doc_freq and freq >= min_freq and len(
+                key) < max_length:
+            word = literal_eval(key)
+            smooth_count = counts.smoother(int(freq))
+            probs[word] = math.log(smooth_count) - log_total
+    oov_prob = math.log(counts.smoother(0)) - log_total
+    return probs, oov_prob
+
+
+def read_clusters(clusters_path):
+    clusters = {}
+    with clusters_path.open() as f:
+        for line in f:
+            try:
+                cluster, word, freq = line.split()
+                word = fix_text(word)
+            except ValueError:
+                continue
+            # If the clusterer has only seen the word a few times, its
+            # cluster is unreliable.
+            if int(freq) >= 3:
+                clusters[word] = cluster
+            else:
+                clusters[word] = '0'
+    # Expand clusters with re-casing
+    for word, cluster in list(clusters.items()):
+        if word.lower() not in clusters:
+            clusters[word.lower()] = cluster
+        if word.title() not in clusters:
+            clusters[word.title()] = cluster
+        if word.upper() not in clusters:
+            clusters[word.upper()] = cluster
+    return clusters
+
+
+def populate_vocab(vocab, clusters, probs, oov_prob):
+    for word, prob in reversed(
+            sorted(list(probs.items()), key=lambda item: item[1])):
+        lexeme = vocab[word]
+        lexeme.prob = prob
+        lexeme.is_oov = False
+        # Decode as a little-endian string, so that we can do & 15 to get
+        # the first 4 bits. See _parse_features.pyx
+        if word in clusters:
+            lexeme.cluster = int(clusters[word][::-1], 2)
+        else:
+            lexeme.cluster = 0
+
+
+def check_unzip(file_path):
+    file_path_str = file_path.as_posix()
+    if file_path_str.endswith('gz'):
+        return gzip.open(file_path_str)
+    else:
+        return file_path.open()
+
+
+def check_dirs(freqs_data, clusters_data, vectors_data):
+    if not freqs_data.is_file():
+        util.sys_exit(freqs_data.as_posix(), title="No frequencies file found")
+    if clusters_data and not clusters_data.is_file():
+        util.sys_exit(
+            clusters_data.as_posix(), title="No Brown clusters file found")
+    if vectors_data and not vectors_data.is_file():
+        util.sys_exit(
+            vectors_data.as_posix(), title="No word vectors file found")
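
For illustration only (not part of the commit): populate_vocab above stores each Brown cluster bit-string reversed before parsing it as binary, so that the first bits of the cluster path land in the lowest bits of the integer (see the "& 15" comment in the code). A worked example with a made-up cluster path:

    cluster = '1011100'              # Brown cluster path as a bit-string
    as_int = int(cluster[::-1], 2)   # reverse, then parse as binary: '0011101' -> 29
    print(as_int)                    # 29
    print(as_int & 15)               # 13 == 0b1101, i.e. the first four path bits '1011'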

spacy/cli/package.py
@@ -15,10 +15,11 @@ from .. import about
 @plac.annotations(
     input_dir=("directory with model data", "positional", None, str),
     output_dir=("output parent directory", "positional", None, str),
-    meta=("path to meta.json", "option", "m", str),
+    meta_path=("path to meta.json", "option", "m", str),
+    create_meta=("create meta.json, even if one exists in directory", "flag", "c", bool),
     force=("force overwriting of existing folder in output directory", "flag", "f", bool)
 )
-def package(cmd, input_dir, output_dir, meta=None, force=False):
+def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False):
     """
     Generate Python package for model data, including meta and required
     installation files. A new directory will be created in the specified
@@ -26,7 +27,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False):
     """
     input_path = util.ensure_path(input_dir)
     output_path = util.ensure_path(output_dir)
-    meta_path = util.ensure_path(meta)
+    meta_path = util.ensure_path(meta_path)
     if not input_path or not input_path.exists():
         prints(input_path, title="Model directory not found", exits=1)
     if not output_path or not output_path.exists():
@@ -38,7 +39,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False):
     template_manifest = get_template('MANIFEST.in')
     template_init = get_template('xx_model_name/__init__.py')
     meta_path = meta_path or input_path / 'meta.json'
-    if meta_path.is_file():
+    if not create_meta and meta_path.is_file():
         prints(meta_path, title="Reading meta.json from file")
         meta = util.read_json(meta_path)
     else:
@@ -100,7 +101,7 @@ def generate_meta():
 def generate_pipeline():
     prints("If set to 'True', the default pipeline is used. If set to 'False', "
            "the pipeline will be disabled. Components should be specified as a "
-           "comma-separated list of component names, e.g. vectorizer, tagger, "
+           "comma-separated list of component names, e.g. tensorizer, tagger, "
            "parser, ner. For more information, see the docs on processing pipelines.",
            title="Enter your model's pipeline components")
     pipeline = util.get_raw_input("Pipeline components", True)

spacy/cli/profile.py (new file)
@@ -0,0 +1,45 @@
+# coding: utf8
+from __future__ import unicode_literals, division, print_function
+
+import plac
+from pathlib import Path
+import ujson
+import cProfile
+import pstats
+
+import spacy
+import sys
+import tqdm
+import cytoolz
+
+
+def read_inputs(loc):
+    if loc is None:
+        file_ = sys.stdin
+        file_ = (line.encode('utf8') for line in file_)
+    else:
+        file_ = Path(loc).open()
+    for line in file_:
+        data = ujson.loads(line)
+        text = data['text']
+        yield text
+
+
+@plac.annotations(
+    lang=("model/language", "positional", None, str),
+    inputs=("Location of input file", "positional", None, read_inputs)
+)
+def profile(cmd, lang, inputs=None):
+    """
+    Profile a spaCy pipeline, to find out which functions take the most time.
+    """
+    nlp = spacy.load(lang)
+    texts = list(cytoolz.take(10000, inputs))
+    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
+    s = pstats.Stats("Profile.prof")
+    s.strip_dirs().sort_stats("time").print_stats()
+
+
+def parse_texts(nlp, texts):
+    for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=128):
+        pass
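
For illustration only (not part of the commit): the new profile command reads newline-delimited JSON with a "text" field (see read_inputs above). A tiny helper to produce such a file, using two of the sentences added in spacy/lang/en/examples.py below; the file name is made up. Assuming the CLI wiring in spacy/__main__.py above, it could then be run as `python -m spacy profile en inputs.jsonl`.

    import ujson

    texts = [
        "Apple is looking at buying U.K. startup for $1 billion",
        "Autonomous cars shift insurance liability toward manufacturers",
    ]
    with open("inputs.jsonl", "w") as f:
        for text in texts:
            f.write(ujson.dumps({"text": text}) + "\n")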

spacy/cli/train.py
@@ -32,10 +32,12 @@ from ..compat import json_dumps
     resume=("Whether to resume training", "flag", "R", bool),
     no_tagger=("Don't train tagger", "flag", "T", bool),
     no_parser=("Don't train parser", "flag", "P", bool),
-    no_entities=("Don't train NER", "flag", "N", bool)
+    no_entities=("Don't train NER", "flag", "N", bool),
+    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
 )
 def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
-          use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False):
+          use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False,
+          gold_preproc=False):
     """
     Train a model. Expects data in spaCy's JSON format.
     """
@@ -70,8 +72,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                                    util.env_opt('batch_compound', 1.001))
 
     if resume:
-        prints(output_path / 'model19.pickle', title="Resuming training")
-        nlp = dill.load((output_path / 'model19.pickle').open('rb'))
+        prints(output_path / 'model9.pickle', title="Resuming training")
+        nlp = dill.load((output_path / 'model9.pickle').open('rb'))
     else:
         nlp = lang_class(pipeline=pipeline)
     corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
@@ -85,28 +87,26 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
         if resume:
             i += 20
         with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
-            train_docs = corpus.train_docs(nlp, projectivize=True,
-                                           gold_preproc=False, max_length=0)
+            train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
+                                           gold_preproc=gold_preproc, max_length=0)
             losses = {}
             for batch in minibatch(train_docs, size=batch_sizes):
                 docs, golds = zip(*batch)
                 nlp.update(docs, golds, sgd=optimizer,
                            drop=next(dropout_rates), losses=losses,
-                           update_tensors=True)
+                           update_shared=True)
                 pbar.update(sum(len(doc) for doc in docs))
 
         with nlp.use_params(optimizer.averages):
             util.set_env_log(False)
             epoch_model_path = output_path / ('model%d' % i)
             nlp.to_disk(epoch_model_path)
-            with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
-                dill.dump(nlp, file_, -1)
             nlp_loaded = lang_class(pipeline=pipeline)
             nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
             scorer = nlp_loaded.evaluate(
                 corpus.dev_docs(
                     nlp_loaded,
-                    gold_preproc=False))
+                    gold_preproc=gold_preproc))
             acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
             with acc_loc.open('w') as file_:
                 file_.write(json_dumps(scorer.scores))

spacy/compat.py
@@ -46,19 +46,21 @@ is_osx = sys.platform == 'darwin'
 
 
 if is_python2:
+    import imp
     bytes_ = str
     unicode_ = unicode
     basestring_ = basestring
     input_ = raw_input
-    json_dumps = lambda data: ujson.dumps(data, indent=2).decode('utf8')
+    json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False).decode('utf8')
     path2str = lambda path: str(path).decode('utf8')
 
 elif is_python3:
+    import importlib.util
     bytes_ = bytes
     unicode_ = str
     basestring_ = str
     input_ = input
-    json_dumps = lambda data: ujson.dumps(data, indent=2)
+    json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False)
     path2str = lambda path: str(path)
 
 
@@ -102,3 +104,12 @@ def normalize_string_keys(old):
     return new
 
 
+def import_file(name, loc):
+    loc = str(loc)
+    if is_python2:
+        return imp.load_source(name, loc)
+    else:
+        spec = importlib.util.spec_from_file_location(name, str(loc))
+        module = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(module)
+        return module
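
For illustration only (not part of the commit): how the new compat.import_file helper can load a module from an arbitrary file path, via imp on Python 2 and importlib on Python 3. The temporary module written below is invented for the example.

    from spacy.compat import import_file

    with open('/tmp/example_module.py', 'w') as f:
        f.write("answer = 42\n")

    mod = import_file('example_module', '/tmp/example_module.py')
    print(mod.answer)   # 42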

spacy/deprecated.py
@@ -15,7 +15,7 @@ def depr_model_download(lang):
     lang (unicode): Language shortcut, 'en' or 'de'.
     """
     prints("The spacy.%s.download command is now deprecated. Please use "
-           "python -m spacy download [model name or shortcut] instead. For "
+           "spacy download [model name or shortcut] instead. For "
           "more info, see the documentation:" % lang,
           about.__docs_models__,
          "Downloading default '%s' model now..." % lang,

spacy/glossary.py
@@ -60,7 +60,7 @@ GLOSSARY = {
     'JJR': 'adjective, comparative',
     'JJS': 'adjective, superlative',
     'LS': 'list item marker',
-    'MD': 'verb, modal auxillary',
+    'MD': 'verb, modal auxiliary',
     'NIL': 'missing tag',
     'NN': 'noun, singular or mass',
     'NNP': 'noun, proper singular',
@@ -91,7 +91,7 @@ GLOSSARY = {
     'NFP': 'superfluous punctuation',
     'GW': 'additional word in multi-word expression',
     'XX': 'unknown',
-    'BES': 'auxillary "be"',
+    'BES': 'auxiliary "be"',
     'HVS': 'forms of "have"',
 
 

spacy/gold.pxd
@@ -9,6 +9,7 @@ cdef struct GoldParseC:
     int* tags
     int* heads
     int* has_dep
+    int* sent_start
     attr_t* labels
     int** brackets
     Transition* ner

spacy/gold.pyx
@@ -406,11 +406,11 @@ cdef class GoldParse:
         if tags is None:
             tags = [None for _ in doc]
         if heads is None:
-            heads = [token.i for token in doc]
+            heads = [None for token in doc]
         if deps is None:
             deps = [None for _ in doc]
         if entities is None:
-            entities = ['-' for _ in doc]
+            entities = [None for _ in doc]
         elif len(entities) == 0:
             entities = ['O' for _ in doc]
         elif not isinstance(entities[0], basestring):
@@ -426,6 +426,7 @@ cdef class GoldParse:
         self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
         self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
         self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
+        self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
         self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
 
         self.cats = list(cats)
@@ -482,6 +483,10 @@ cdef class GoldParse:
         """
         return not nonproj.is_nonproj_tree(self.heads)
 
+    @property
+    def sent_starts(self):
+        return [self.c.sent_start[i] for i in range(self.length)]
+
 
 def biluo_tags_from_offsets(doc, entities, missing='O'):
     """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out

spacy/lang/char_classes.py
@@ -27,7 +27,7 @@ ALPHA_UPPER = merge_char_classes(_upper + _uncased)
 
 _units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft '
           'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
-          'TB T G M K')
+          'TB T G M K %')
 _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
 _punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
 _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «'

spacy/lang/da/examples.py (new file)
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.da.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple overvejer at købe et britisk statup for 1 milliard dollar",
+    "Selvkørende biler flytter forsikringsansvaret over på producenterne",
+    "San Francisco overvejer at forbyde leverandørrobotter på fortov",
+    "London er en stor by i Storbritannien"
+]

spacy/lang/de/examples.py (new file)
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.de.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen",
+    "Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz",
+    "Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz",
+    "Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion",
+    "San Francisco erwägt Verbot von Lieferrobotern",
+    "Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller",
+    "Wo bist du?",
+    "Was ist die Hauptstadt von Deutschland?"
+]

spacy/lang/en/examples.py (new file)
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.en.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple is looking at buying U.K. startup for $1 billion",
+    "Autonomous cars shift insurance liability toward manufacturers",
+    "San Francisco considers banning sidewalk delivery robots",
+    "London is a big city in the United Kingdom.",
+    "Where are you?",
+    "Who is the president of France?",
+    "What is the capital of the United States?",
+    "When was Barack Obama born?"
+]

spacy/lang/en/morph_rules.py
@@ -59,7 +59,8 @@ MORPH_RULES = {
 
     "VBP": {
         "are": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
-        "'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}
+        "'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
+        "am": {LEMMA: "be", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind"},
     },
 
     "VBD": {

spacy/lang/en/tokenizer_exceptions.py
@@ -232,7 +232,10 @@ for verb_data in [
     {ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2},
     {ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"},
     {ORTH: "was", LEMMA: "be", NORM: "was"},
-    {ORTH: "were", LEMMA: "be", NORM: "were"}]:
+    {ORTH: "were", LEMMA: "be", NORM: "were"},
+    {ORTH: "have", NORM: "have"},
+    {ORTH: "has", LEMMA: "have", NORM: "has"},
+    {ORTH: "dare", NORM: "dare"}]:
     verb_data_tc = dict(verb_data)
     verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
     for data in [verb_data, verb_data_tc]:

spacy/lang/es/examples.py (new file)
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.es.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares",
+    "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes",
+    "San Francisco analiza prohibir los robots delivery",
+    "Londres es una gran ciudad del Reino Unido",
+    "El gato come pescado",
+    "Veo al hombre con el telescopio",
+    "La araña come moscas",
+    "El pingüino incuba en su nido"
+]
26
spacy/lang/fr/examples.py
Normal file
26
spacy/lang/fr/examples.py
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.fr.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Apple cherche a acheter une startup anglaise pour 1 milliard de dollard",
|
||||||
|
"Les voitures autonomes voient leur assurances décalées vers les constructeurs",
|
||||||
|
"San Francisco envisage d'interdire les robots coursiers",
|
||||||
|
"Londres est une grande ville du Royaume-Uni",
|
||||||
|
"L’Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d’Europe",
|
||||||
|
"Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon",
|
||||||
|
"La France ne devrait pas manquer d'électricité cet été, même en cas de canicule",
|
||||||
|
"Nouvelles attaques de Trump contre le maire de Londres",
|
||||||
|
"Où es-tu ?",
|
||||||
|
"Qui est le président de la France ?",
|
||||||
|
"Où est la capitale des Etats-Unis ?",
|
||||||
|
"Quand est né Barack Obama ?"
|
||||||
|
]
|
28
spacy/lang/he/examples.py
Normal file
28
spacy/lang/he/examples.py
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.he.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
'סין מקימה קרן של 440 מיליון דולר להשקעה בהייטק בישראל',
|
||||||
|
'רה"מ הודיע כי יחרים טקס בחסותו',
|
||||||
|
'הכנסת צפויה לאשר איכון אוטומטי של שיחות למוקד 100',
|
||||||
|
'תוכנית לאומית תהפוך את ישראל למעצמה דיגיטלית',
|
||||||
|
'סע לשלום, המפתחות בפנים.',
|
||||||
|
'מלצר, פעמיים טורקי!',
|
||||||
|
'ואהבת לרעך כמוך.',
|
||||||
|
'היום נעשה משהו בלתי נשכח.',
|
||||||
|
'איפה הילד?',
|
||||||
|
'מיהו נשיא צרפת?',
|
||||||
|
'מהי בירת ארצות הברית?',
|
||||||
|
"איך קוראים בעברית לצ'ופצ'יק של הקומקום?",
|
||||||
|
'מה הייתה הדקה?',
|
||||||
|
'מי אומר שלום ראשון, זה שעולה או זה שיורד?'
|
||||||
|
]
|
42
spacy/lang/id/__init__.py
Normal file
42
spacy/lang/id/__init__.py
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
from .norm_exceptions import NORM_EXCEPTIONS
|
||||||
|
from .lemmatizer import LOOKUP
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
from ...language import Language
|
||||||
|
from ...lemmatizerlookup import Lemmatizer
|
||||||
|
from ...attrs import LANG
|
||||||
|
from ...util import update_exc
|
||||||
|
|
||||||
|
|
||||||
|
class IndonesianDefaults(Language.Defaults):
|
||||||
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
|
lex_attr_getters[LANG] = lambda text: 'id'
|
||||||
|
|
||||||
|
lex_attr_getters.update(LEX_ATTRS)
|
||||||
|
|
||||||
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
|
stop_words = set(STOP_WORDS)
|
||||||
|
prefixes = tuple(TOKENIZER_PREFIXES)
|
||||||
|
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||||
|
infixes = tuple(TOKENIZER_INFIXES)
|
||||||
|
syntax_iterators = dict(SYNTAX_ITERATORS)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def create_lemmatizer(cls, nlp=None):
|
||||||
|
return Lemmatizer(LOOKUP)
|
||||||
|
|
||||||
|
|
||||||
|
class Indonesian(Language):
|
||||||
|
lang = 'id'
|
||||||
|
Defaults = IndonesianDefaults
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ['Indonesian']
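The new Indonesian subclass can be used directly, without a trained statistical model, since the tokenizer, tokenizer exceptions and lookup tables defined above are rule-based. A minimal sketch (the sample text is illustrative):

from spacy.lang.id import Indonesian

nlp = Indonesian()
doc = nlp(u'Jakarta adalah kota besar yang nyaris tidak pernah tidur.')
# The rule-based tokenizer works out of the box, no model required.
print([t.text for t in doc])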
|
3833
spacy/lang/id/_tokenizer_exceptions_list.py
Normal file
3833
spacy/lang/id/_tokenizer_exceptions_list.py
Normal file
File diff suppressed because it is too large
22
spacy/lang/id/examples.py
Normal file
22
spacy/lang/id/examples.py
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.en.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Al Qaidah mengklaim bom mobil yang menewaskan 60 Orang di Mali",
|
||||||
|
"Abu Sayyaf mengeksekusi sandera warga Filipina",
|
||||||
|
"Penyaluran pupuk berasal dari lima lokasi yakni Bontang, Kalimantan Timur, Surabaya, Banyuwangi, Semarang, dan Makassar.",
|
||||||
|
"PT Pupuk Kaltim telah menyalurkan 274.707 ton pupuk bersubsidi ke wilayah penyaluran di 14 provinsi.",
|
||||||
|
"Jakarta adalah kota besar yang nyaris tidak pernah tidur."
|
||||||
|
"Kamu ada di mana semalam?",
|
||||||
|
"Siapa yang membeli makanan ringan tersebut?",
|
||||||
|
"Siapa presiden pertama Republik Indonesia?"
|
||||||
|
]
|
36883
spacy/lang/id/lemmatizer.py
Normal file
36883
spacy/lang/id/lemmatizer.py
Normal file
File diff suppressed because it is too large
42
spacy/lang/id/lex_attrs.py
Normal file
42
spacy/lang/id/lex_attrs.py
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
|
||||||
|
_num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
|
||||||
|
'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen',
|
||||||
|
'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty',
|
||||||
|
'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety',
|
||||||
|
'hundred', 'thousand', 'million', 'billion', 'trillion', 'quadrillion',
|
||||||
|
'gajillion', 'bazillion',
|
||||||
|
'nol', 'satu', 'dua', 'tiga', 'empat', 'lima', 'enam', 'tujuh',
|
||||||
|
'delapan', 'sembilan', 'sepuluh', 'sebelas', 'duabelas', 'tigabelas',
|
||||||
|
'empatbelas', 'limabelas', 'enambelas', 'tujuhbelas', 'delapanbelas',
|
||||||
|
'sembilanbelas', 'duapuluh', 'seratus', 'seribu', 'sejuta',
|
||||||
|
'ribu', 'rb', 'juta', 'jt', 'miliar', 'biliun', 'triliun',
|
||||||
|
'kuadriliun', 'kuintiliun', 'sekstiliun', 'septiliun', 'oktiliun',
|
||||||
|
'noniliun', 'desiliun',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def like_num(text):
|
||||||
|
text = text.replace(',', '').replace('.', '')
|
||||||
|
if text.isdigit():
|
||||||
|
return True
|
||||||
|
if text.count('/') == 1:
|
||||||
|
num, denom = text.split('/')
|
||||||
|
if num.isdigit() and denom.isdigit():
|
||||||
|
return True
|
||||||
|
if text in _num_words:
|
||||||
|
return True
|
||||||
|
if text.count('-') == 1:
|
||||||
|
_, num = text.split('-')
|
||||||
|
if num.isdigit() or num in _num_words:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
LEX_ATTRS = {
|
||||||
|
LIKE_NUM: like_num
|
||||||
|
}
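The helper above accepts plain digits, digit groups with '.' or ',' separators, simple fractions, the listed number words, and hyphenated forms such as 'ke-5'. A few illustrative checks of the behaviour as defined:

# Expected behaviour of like_num() as defined above (illustrative checks).
assert like_num('12.000') is True      # thousands separator stripped
assert like_num('3/4') is True         # simple fraction
assert like_num('sebelas') is True     # listed number word
assert like_num('ke-5') is True        # hyphenated form with numeric tail
assert like_num('kota') is False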
|
17
spacy/lang/id/norm_exceptions.py
Normal file
17
spacy/lang/id/norm_exceptions.py
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
_exc = {
|
||||||
|
"Rp": "$",
|
||||||
|
"IDR": "$",
|
||||||
|
"RMB": "$",
|
||||||
|
"USD": "$",
|
||||||
|
"AUD": "$",
|
||||||
|
"GBP": "$",
|
||||||
|
}
|
||||||
|
|
||||||
|
NORM_EXCEPTIONS = {}
|
||||||
|
|
||||||
|
for string, norm in _exc.items():
|
||||||
|
NORM_EXCEPTIONS[string] = norm
|
||||||
|
NORM_EXCEPTIONS[string.title()] = norm
|
53
spacy/lang/id/punctuation.py
Normal file
53
spacy/lang/id/punctuation.py
Normal file
|
@ -0,0 +1,53 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||||
|
from ..char_classes import merge_chars, split_chars, _currency, _units
|
||||||
|
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
|
||||||
|
from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER, HYPHENS
|
||||||
|
|
||||||
|
_units = (_units + 's bit Gbps Mbps mbps Kbps kbps ƒ ppi px '
|
||||||
|
'Hz kHz MHz GHz mAh '
|
||||||
|
'ratus rb ribu ribuan '
|
||||||
|
'juta jt jutaan mill?iar million bil[l]?iun bilyun billion '
|
||||||
|
)
|
||||||
|
_currency = (_currency + r' USD Rp IDR RMB SGD S\$')
|
||||||
|
_months = ('Januari Februari Maret April Mei Juni Juli Agustus September '
|
||||||
|
'Oktober November Desember January February March May June '
|
||||||
|
'July August October December Jan Feb Mar Jun Jul Aug Sept '
|
||||||
|
'Oct Okt Nov Des ')
|
||||||
|
|
||||||
|
|
||||||
|
UNITS = merge_chars(_units)
|
||||||
|
CURRENCY = merge_chars(_currency)
|
||||||
|
HTML_PREFIX = r'<(b|strong|i|em|p|span|div|br)\s?/>|<a([^>]+)>'
|
||||||
|
HTML_SUFFIX = r'</(b|strong|i|em|p|span|div|a)>'
|
||||||
|
MONTHS = merge_chars(_months)
|
||||||
|
LIST_CURRENCY = split_chars(_currency)
|
||||||
|
|
||||||
|
TOKENIZER_PREFIXES.remove('#') # hashtag
|
||||||
|
_prefixes = TOKENIZER_PREFIXES + LIST_CURRENCY + [HTML_PREFIX] + ['/', '—']
|
||||||
|
|
||||||
|
_suffixes = TOKENIZER_SUFFIXES + [r'\-[Nn]ya', '-[KkMm]u', '[—-]'] + [
|
||||||
|
r'(?<={c})(?:[0-9]+)'.format(c=CURRENCY),
|
||||||
|
r'(?<=[0-9])(?:{u})'.format(u=UNITS),
|
||||||
|
r'(?<=[0-9])%',
|
||||||
|
r'(?<=[0-9{a}]{h})(?:[\.,:-])'.format(a=ALPHA, h=HTML_SUFFIX),
|
||||||
|
r'(?<=[0-9{a}])(?:{h})'.format(a=ALPHA, h=HTML_SUFFIX),
|
||||||
|
]
|
||||||
|
|
||||||
|
_infixes = TOKENIZER_INFIXES + [
|
||||||
|
r'(?<=[0-9])[\\/](?=[0-9%-])',
|
||||||
|
r'(?<=[0-9])%(?=[{a}0-9/])'.format(a=ALPHA),
|
||||||
|
r'(?<={u})[\/-](?=[0-9])'.format(u=UNITS),
|
||||||
|
r'(?<={m})[\/-](?=[0-9])'.format(m=MONTHS),
|
||||||
|
r'(?<=[0-9\)][\.,])"(?=[0-9])',
|
||||||
|
r'(?<=[{a}\)][\.,\'])["—](?=[{a}])'.format(a=ALPHA),
|
||||||
|
r'(?<=[{a}])-(?=[0-9])'.format(a=ALPHA),
|
||||||
|
r'(?<=[0-9])-(?=[{a}])'.format(a=ALPHA),
|
||||||
|
r'(?<=[{a}])[\/-](?={c}{a})'.format(a=ALPHA, c=CURRENCY),
|
||||||
|
]
|
||||||
|
|
||||||
|
TOKENIZER_PREFIXES = _prefixes
|
||||||
|
TOKENIZER_SUFFIXES = _suffixes
|
||||||
|
TOKENIZER_INFIXES = _infixes
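One detail worth illustrating: the extra suffix patterns only split the clitics '-nya' and '-ku'/'-mu' when they are written with a hyphen. A standalone sketch with plain re that mirrors the '\-[Nn]ya' entry above, without going through spaCy's tokenizer machinery:

import re

# Mirrors the '\-[Nn]ya' suffix entry added above (illustrative, standalone).
suffix_nya = re.compile(r'\-[Nn]ya$')
print(bool(suffix_nya.search('buku-nya')))   # True  -> '-nya' split off as a suffix
print(bool(suffix_nya.search('bukunya')))    # False -> fused form left intact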
|
763
spacy/lang/id/stop_words.py
Normal file
763
spacy/lang/id/stop_words.py
Normal file
|
@ -0,0 +1,763 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
STOP_WORDS = set("""
|
||||||
|
ada
|
||||||
|
adalah
|
||||||
|
adanya
|
||||||
|
adapun
|
||||||
|
agak
|
||||||
|
agaknya
|
||||||
|
agar
|
||||||
|
akan
|
||||||
|
akankah
|
||||||
|
akhir
|
||||||
|
akhiri
|
||||||
|
akhirnya
|
||||||
|
aku
|
||||||
|
akulah
|
||||||
|
amat
|
||||||
|
amatlah
|
||||||
|
anda
|
||||||
|
andalah
|
||||||
|
antar
|
||||||
|
antara
|
||||||
|
antaranya
|
||||||
|
apa
|
||||||
|
apaan
|
||||||
|
apabila
|
||||||
|
apakah
|
||||||
|
apalagi
|
||||||
|
apatah
|
||||||
|
artinya
|
||||||
|
asal
|
||||||
|
asalkan
|
||||||
|
atas
|
||||||
|
atau
|
||||||
|
ataukah
|
||||||
|
ataupun
|
||||||
|
awal
|
||||||
|
awalnya
|
||||||
|
bagai
|
||||||
|
bagaikan
|
||||||
|
bagaimana
|
||||||
|
bagaimanakah
|
||||||
|
bagaimanapun
|
||||||
|
bagi
|
||||||
|
bagian
|
||||||
|
bahkan
|
||||||
|
bahwa
|
||||||
|
bahwasanya
|
||||||
|
baik
|
||||||
|
bakal
|
||||||
|
bakalan
|
||||||
|
balik
|
||||||
|
banyak
|
||||||
|
bapak
|
||||||
|
baru
|
||||||
|
bawah
|
||||||
|
beberapa
|
||||||
|
begini
|
||||||
|
beginian
|
||||||
|
beginikah
|
||||||
|
beginilah
|
||||||
|
begitu
|
||||||
|
begitukah
|
||||||
|
begitulah
|
||||||
|
begitupun
|
||||||
|
bekerja
|
||||||
|
belakang
|
||||||
|
belakangan
|
||||||
|
belum
|
||||||
|
belumlah
|
||||||
|
benar
|
||||||
|
benarkah
|
||||||
|
benarlah
|
||||||
|
berada
|
||||||
|
berakhir
|
||||||
|
berakhirlah
|
||||||
|
berakhirnya
|
||||||
|
berapa
|
||||||
|
berapakah
|
||||||
|
berapalah
|
||||||
|
berapapun
|
||||||
|
berarti
|
||||||
|
berawal
|
||||||
|
berbagai
|
||||||
|
berdatangan
|
||||||
|
beri
|
||||||
|
berikan
|
||||||
|
berikut
|
||||||
|
berikutnya
|
||||||
|
berjumlah
|
||||||
|
berkali-kali
|
||||||
|
berkata
|
||||||
|
berkehendak
|
||||||
|
berkeinginan
|
||||||
|
berkenaan
|
||||||
|
berlainan
|
||||||
|
berlalu
|
||||||
|
berlangsung
|
||||||
|
berlebihan
|
||||||
|
bermacam
|
||||||
|
bermacam-macam
|
||||||
|
bermaksud
|
||||||
|
bermula
|
||||||
|
bersama
|
||||||
|
bersama-sama
|
||||||
|
bersiap
|
||||||
|
bersiap-siap
|
||||||
|
bertanya
|
||||||
|
bertanya-tanya
|
||||||
|
berturut
|
||||||
|
berturut-turut
|
||||||
|
bertutur
|
||||||
|
berujar
|
||||||
|
berupa
|
||||||
|
besar
|
||||||
|
betul
|
||||||
|
betulkah
|
||||||
|
biasa
|
||||||
|
biasanya
|
||||||
|
bila
|
||||||
|
bilakah
|
||||||
|
bisa
|
||||||
|
bisakah
|
||||||
|
boleh
|
||||||
|
bolehkah
|
||||||
|
bolehlah
|
||||||
|
buat
|
||||||
|
bukan
|
||||||
|
bukankah
|
||||||
|
bukanlah
|
||||||
|
bukannya
|
||||||
|
bulan
|
||||||
|
bung
|
||||||
|
cara
|
||||||
|
caranya
|
||||||
|
cukup
|
||||||
|
cukupkah
|
||||||
|
cukuplah
|
||||||
|
cuma
|
||||||
|
dahulu
|
||||||
|
dalam
|
||||||
|
dan
|
||||||
|
dapat
|
||||||
|
dari
|
||||||
|
daripada
|
||||||
|
datang
|
||||||
|
dekat
|
||||||
|
demi
|
||||||
|
demikian
|
||||||
|
demikianlah
|
||||||
|
dengan
|
||||||
|
depan
|
||||||
|
di
|
||||||
|
dia
|
||||||
|
diakhiri
|
||||||
|
diakhirinya
|
||||||
|
dialah
|
||||||
|
diantara
|
||||||
|
diantaranya
|
||||||
|
diberi
|
||||||
|
diberikan
|
||||||
|
diberikannya
|
||||||
|
dibuat
|
||||||
|
dibuatnya
|
||||||
|
didapat
|
||||||
|
didatangkan
|
||||||
|
digunakan
|
||||||
|
diibaratkan
|
||||||
|
diibaratkannya
|
||||||
|
diingat
|
||||||
|
diingatkan
|
||||||
|
diinginkan
|
||||||
|
dijawab
|
||||||
|
dijelaskan
|
||||||
|
dijelaskannya
|
||||||
|
dikarenakan
|
||||||
|
dikatakan
|
||||||
|
dikatakannya
|
||||||
|
dikerjakan
|
||||||
|
diketahui
|
||||||
|
diketahuinya
|
||||||
|
dikira
|
||||||
|
dilakukan
|
||||||
|
dilalui
|
||||||
|
dilihat
|
||||||
|
dimaksud
|
||||||
|
dimaksudkan
|
||||||
|
dimaksudkannya
|
||||||
|
dimaksudnya
|
||||||
|
diminta
|
||||||
|
dimintai
|
||||||
|
dimisalkan
|
||||||
|
dimulai
|
||||||
|
dimulailah
|
||||||
|
dimulainya
|
||||||
|
dimungkinkan
|
||||||
|
dini
|
||||||
|
dipastikan
|
||||||
|
diperbuat
|
||||||
|
diperbuatnya
|
||||||
|
dipergunakan
|
||||||
|
diperkirakan
|
||||||
|
diperlihatkan
|
||||||
|
diperlukan
|
||||||
|
diperlukannya
|
||||||
|
dipersoalkan
|
||||||
|
dipertanyakan
|
||||||
|
dipunyai
|
||||||
|
diri
|
||||||
|
dirinya
|
||||||
|
disampaikan
|
||||||
|
disebut
|
||||||
|
disebutkan
|
||||||
|
disebutkannya
|
||||||
|
disini
|
||||||
|
disinilah
|
||||||
|
ditambahkan
|
||||||
|
ditandaskan
|
||||||
|
ditanya
|
||||||
|
ditanyai
|
||||||
|
ditanyakan
|
||||||
|
ditegaskan
|
||||||
|
ditujukan
|
||||||
|
ditunjuk
|
||||||
|
ditunjuki
|
||||||
|
ditunjukkan
|
||||||
|
ditunjukkannya
|
||||||
|
ditunjuknya
|
||||||
|
dituturkan
|
||||||
|
dituturkannya
|
||||||
|
diucapkan
|
||||||
|
diucapkannya
|
||||||
|
diungkapkan
|
||||||
|
dong
|
||||||
|
dua
|
||||||
|
dulu
|
||||||
|
empat
|
||||||
|
enggak
|
||||||
|
enggaknya
|
||||||
|
entah
|
||||||
|
entahlah
|
||||||
|
guna
|
||||||
|
gunakan
|
||||||
|
hal
|
||||||
|
hampir
|
||||||
|
hanya
|
||||||
|
hanyalah
|
||||||
|
hari
|
||||||
|
harus
|
||||||
|
haruslah
|
||||||
|
harusnya
|
||||||
|
hendak
|
||||||
|
hendaklah
|
||||||
|
hendaknya
|
||||||
|
hingga
|
||||||
|
ia
|
||||||
|
ialah
|
||||||
|
ibarat
|
||||||
|
ibaratkan
|
||||||
|
ibaratnya
|
||||||
|
ibu
|
||||||
|
ikut
|
||||||
|
ingat
|
||||||
|
ingat-ingat
|
||||||
|
ingin
|
||||||
|
inginkah
|
||||||
|
inginkan
|
||||||
|
ini
|
||||||
|
inikah
|
||||||
|
inilah
|
||||||
|
itu
|
||||||
|
itukah
|
||||||
|
itulah
|
||||||
|
jadi
|
||||||
|
jadilah
|
||||||
|
jadinya
|
||||||
|
jangan
|
||||||
|
jangankan
|
||||||
|
janganlah
|
||||||
|
jauh
|
||||||
|
jawab
|
||||||
|
jawaban
|
||||||
|
jawabnya
|
||||||
|
jelas
|
||||||
|
jelaskan
|
||||||
|
jelaslah
|
||||||
|
jelasnya
|
||||||
|
jika
|
||||||
|
jikalau
|
||||||
|
juga
|
||||||
|
jumlah
|
||||||
|
jumlahnya
|
||||||
|
justru
|
||||||
|
kala
|
||||||
|
kalau
|
||||||
|
kalaulah
|
||||||
|
kalaupun
|
||||||
|
kalian
|
||||||
|
kami
|
||||||
|
kamilah
|
||||||
|
kamu
|
||||||
|
kamulah
|
||||||
|
kan
|
||||||
|
kapan
|
||||||
|
kapankah
|
||||||
|
kapanpun
|
||||||
|
karena
|
||||||
|
karenanya
|
||||||
|
kasus
|
||||||
|
kata
|
||||||
|
katakan
|
||||||
|
katakanlah
|
||||||
|
katanya
|
||||||
|
ke
|
||||||
|
keadaan
|
||||||
|
kebetulan
|
||||||
|
kecil
|
||||||
|
kedua
|
||||||
|
keduanya
|
||||||
|
keinginan
|
||||||
|
kelamaan
|
||||||
|
kelihatan
|
||||||
|
kelihatannya
|
||||||
|
kelima
|
||||||
|
keluar
|
||||||
|
kembali
|
||||||
|
kemudian
|
||||||
|
kemungkinan
|
||||||
|
kemungkinannya
|
||||||
|
kenapa
|
||||||
|
kepada
|
||||||
|
kepadanya
|
||||||
|
kesampaian
|
||||||
|
keseluruhan
|
||||||
|
keseluruhannya
|
||||||
|
keterlaluan
|
||||||
|
ketika
|
||||||
|
khususnya
|
||||||
|
kini
|
||||||
|
kinilah
|
||||||
|
kira
|
||||||
|
kira-kira
|
||||||
|
kiranya
|
||||||
|
kita
|
||||||
|
kitalah
|
||||||
|
kok
|
||||||
|
kurang
|
||||||
|
lagi
|
||||||
|
lagian
|
||||||
|
lah
|
||||||
|
lain
|
||||||
|
lainnya
|
||||||
|
lalu
|
||||||
|
lama
|
||||||
|
lamanya
|
||||||
|
lanjut
|
||||||
|
lanjutnya
|
||||||
|
lebih
|
||||||
|
lewat
|
||||||
|
lima
|
||||||
|
luar
|
||||||
|
macam
|
||||||
|
maka
|
||||||
|
makanya
|
||||||
|
makin
|
||||||
|
malah
|
||||||
|
malahan
|
||||||
|
mampu
|
||||||
|
mampukah
|
||||||
|
mana
|
||||||
|
manakala
|
||||||
|
manalagi
|
||||||
|
masa
|
||||||
|
masalah
|
||||||
|
masalahnya
|
||||||
|
masih
|
||||||
|
masihkah
|
||||||
|
masing
|
||||||
|
masing-masing
|
||||||
|
mau
|
||||||
|
maupun
|
||||||
|
melainkan
|
||||||
|
melakukan
|
||||||
|
melalui
|
||||||
|
melihat
|
||||||
|
melihatnya
|
||||||
|
memang
|
||||||
|
memastikan
|
||||||
|
memberi
|
||||||
|
memberikan
|
||||||
|
membuat
|
||||||
|
memerlukan
|
||||||
|
memihak
|
||||||
|
meminta
|
||||||
|
memintakan
|
||||||
|
memisalkan
|
||||||
|
memperbuat
|
||||||
|
mempergunakan
|
||||||
|
memperkirakan
|
||||||
|
memperlihatkan
|
||||||
|
mempersiapkan
|
||||||
|
mempersoalkan
|
||||||
|
mempertanyakan
|
||||||
|
mempunyai
|
||||||
|
memulai
|
||||||
|
memungkinkan
|
||||||
|
menaiki
|
||||||
|
menambahkan
|
||||||
|
menandaskan
|
||||||
|
menanti
|
||||||
|
menanti-nanti
|
||||||
|
menantikan
|
||||||
|
menanya
|
||||||
|
menanyai
|
||||||
|
menanyakan
|
||||||
|
mendapat
|
||||||
|
mendapatkan
|
||||||
|
mendatang
|
||||||
|
mendatangi
|
||||||
|
mendatangkan
|
||||||
|
menegaskan
|
||||||
|
mengakhiri
|
||||||
|
mengapa
|
||||||
|
mengatakan
|
||||||
|
mengatakannya
|
||||||
|
mengenai
|
||||||
|
mengerjakan
|
||||||
|
mengetahui
|
||||||
|
menggunakan
|
||||||
|
menghendaki
|
||||||
|
mengibaratkan
|
||||||
|
mengibaratkannya
|
||||||
|
mengingat
|
||||||
|
mengingatkan
|
||||||
|
menginginkan
|
||||||
|
mengira
|
||||||
|
mengucapkan
|
||||||
|
mengucapkannya
|
||||||
|
mengungkapkan
|
||||||
|
menjadi
|
||||||
|
menjawab
|
||||||
|
menjelaskan
|
||||||
|
menuju
|
||||||
|
menunjuk
|
||||||
|
menunjuki
|
||||||
|
menunjukkan
|
||||||
|
menunjuknya
|
||||||
|
menurut
|
||||||
|
menuturkan
|
||||||
|
menyampaikan
|
||||||
|
menyangkut
|
||||||
|
menyatakan
|
||||||
|
menyebutkan
|
||||||
|
menyeluruh
|
||||||
|
menyiapkan
|
||||||
|
merasa
|
||||||
|
mereka
|
||||||
|
merekalah
|
||||||
|
merupakan
|
||||||
|
meski
|
||||||
|
meskipun
|
||||||
|
meyakini
|
||||||
|
meyakinkan
|
||||||
|
minta
|
||||||
|
mirip
|
||||||
|
misal
|
||||||
|
misalkan
|
||||||
|
misalnya
|
||||||
|
mula
|
||||||
|
mulai
|
||||||
|
mulailah
|
||||||
|
mulanya
|
||||||
|
mungkin
|
||||||
|
mungkinkah
|
||||||
|
nah
|
||||||
|
naik
|
||||||
|
namun
|
||||||
|
nanti
|
||||||
|
nantinya
|
||||||
|
nyaris
|
||||||
|
nyatanya
|
||||||
|
oleh
|
||||||
|
olehnya
|
||||||
|
pada
|
||||||
|
padahal
|
||||||
|
padanya
|
||||||
|
pak
|
||||||
|
paling
|
||||||
|
panjang
|
||||||
|
pantas
|
||||||
|
para
|
||||||
|
pasti
|
||||||
|
pastilah
|
||||||
|
penting
|
||||||
|
pentingnya
|
||||||
|
per
|
||||||
|
percuma
|
||||||
|
perlu
|
||||||
|
perlukah
|
||||||
|
perlunya
|
||||||
|
pernah
|
||||||
|
persoalan
|
||||||
|
pertama
|
||||||
|
pertama-tama
|
||||||
|
pertanyaan
|
||||||
|
pertanyakan
|
||||||
|
pihak
|
||||||
|
pihaknya
|
||||||
|
pukul
|
||||||
|
pula
|
||||||
|
pun
|
||||||
|
punya
|
||||||
|
rasa
|
||||||
|
rasanya
|
||||||
|
rata
|
||||||
|
rupanya
|
||||||
|
saat
|
||||||
|
saatnya
|
||||||
|
saja
|
||||||
|
sajalah
|
||||||
|
saling
|
||||||
|
sama
|
||||||
|
sama-sama
|
||||||
|
sambil
|
||||||
|
sampai
|
||||||
|
sampai-sampai
|
||||||
|
sampaikan
|
||||||
|
sana
|
||||||
|
sangat
|
||||||
|
sangatlah
|
||||||
|
satu
|
||||||
|
saya
|
||||||
|
sayalah
|
||||||
|
se
|
||||||
|
sebab
|
||||||
|
sebabnya
|
||||||
|
sebagai
|
||||||
|
sebagaimana
|
||||||
|
sebagainya
|
||||||
|
sebagian
|
||||||
|
sebaik
|
||||||
|
sebaik-baiknya
|
||||||
|
sebaiknya
|
||||||
|
sebaliknya
|
||||||
|
sebanyak
|
||||||
|
sebegini
|
||||||
|
sebegitu
|
||||||
|
sebelum
|
||||||
|
sebelumnya
|
||||||
|
sebenarnya
|
||||||
|
seberapa
|
||||||
|
sebesar
|
||||||
|
sebetulnya
|
||||||
|
sebisanya
|
||||||
|
sebuah
|
||||||
|
sebut
|
||||||
|
sebutlah
|
||||||
|
sebutnya
|
||||||
|
secara
|
||||||
|
secukupnya
|
||||||
|
sedang
|
||||||
|
sedangkan
|
||||||
|
sedemikian
|
||||||
|
sedikit
|
||||||
|
sedikitnya
|
||||||
|
seenaknya
|
||||||
|
segala
|
||||||
|
segalanya
|
||||||
|
segera
|
||||||
|
seharusnya
|
||||||
|
sehingga
|
||||||
|
seingat
|
||||||
|
sejak
|
||||||
|
sejauh
|
||||||
|
sejenak
|
||||||
|
sejumlah
|
||||||
|
sekadar
|
||||||
|
sekadarnya
|
||||||
|
sekali
|
||||||
|
sekali-kali
|
||||||
|
sekalian
|
||||||
|
sekaligus
|
||||||
|
sekalipun
|
||||||
|
sekarang
|
||||||
|
sekarang
|
||||||
|
sekecil
|
||||||
|
seketika
|
||||||
|
sekiranya
|
||||||
|
sekitar
|
||||||
|
sekitarnya
|
||||||
|
sekurang-kurangnya
|
||||||
|
sekurangnya
|
||||||
|
sela
|
||||||
|
selain
|
||||||
|
selaku
|
||||||
|
selalu
|
||||||
|
selama
|
||||||
|
selama-lamanya
|
||||||
|
selamanya
|
||||||
|
selanjutnya
|
||||||
|
seluruh
|
||||||
|
seluruhnya
|
||||||
|
semacam
|
||||||
|
semakin
|
||||||
|
semampu
|
||||||
|
semampunya
|
||||||
|
semasa
|
||||||
|
semasih
|
||||||
|
semata
|
||||||
|
semata-mata
|
||||||
|
semaunya
|
||||||
|
sementara
|
||||||
|
semisal
|
||||||
|
semisalnya
|
||||||
|
sempat
|
||||||
|
semua
|
||||||
|
semuanya
|
||||||
|
semula
|
||||||
|
sendiri
|
||||||
|
sendirian
|
||||||
|
sendirinya
|
||||||
|
seolah
|
||||||
|
seolah-olah
|
||||||
|
seorang
|
||||||
|
sepanjang
|
||||||
|
sepantasnya
|
||||||
|
sepantasnyalah
|
||||||
|
seperlunya
|
||||||
|
seperti
|
||||||
|
sepertinya
|
||||||
|
sepihak
|
||||||
|
sering
|
||||||
|
seringnya
|
||||||
|
serta
|
||||||
|
serupa
|
||||||
|
sesaat
|
||||||
|
sesama
|
||||||
|
sesampai
|
||||||
|
sesegera
|
||||||
|
sesekali
|
||||||
|
seseorang
|
||||||
|
sesuatu
|
||||||
|
sesuatunya
|
||||||
|
sesudah
|
||||||
|
sesudahnya
|
||||||
|
setelah
|
||||||
|
setempat
|
||||||
|
setengah
|
||||||
|
seterusnya
|
||||||
|
setiap
|
||||||
|
setiba
|
||||||
|
setibanya
|
||||||
|
setidak-tidaknya
|
||||||
|
setidaknya
|
||||||
|
setinggi
|
||||||
|
seusai
|
||||||
|
sewaktu
|
||||||
|
siap
|
||||||
|
siapa
|
||||||
|
siapakah
|
||||||
|
siapapun
|
||||||
|
sini
|
||||||
|
sinilah
|
||||||
|
soal
|
||||||
|
soalnya
|
||||||
|
suatu
|
||||||
|
sudah
|
||||||
|
sudahkah
|
||||||
|
sudahlah
|
||||||
|
supaya
|
||||||
|
tadi
|
||||||
|
tadinya
|
||||||
|
tahu
|
||||||
|
tahun
|
||||||
|
tak
|
||||||
|
tambah
|
||||||
|
tambahnya
|
||||||
|
tampak
|
||||||
|
tampaknya
|
||||||
|
tandas
|
||||||
|
tandasnya
|
||||||
|
tanpa
|
||||||
|
tanya
|
||||||
|
tanyakan
|
||||||
|
tanyanya
|
||||||
|
tapi
|
||||||
|
tegas
|
||||||
|
tegasnya
|
||||||
|
telah
|
||||||
|
tempat
|
||||||
|
tengah
|
||||||
|
tentang
|
||||||
|
tentu
|
||||||
|
tentulah
|
||||||
|
tentunya
|
||||||
|
tepat
|
||||||
|
terakhir
|
||||||
|
terasa
|
||||||
|
terbanyak
|
||||||
|
terdahulu
|
||||||
|
terdapat
|
||||||
|
terdiri
|
||||||
|
terhadap
|
||||||
|
terhadapnya
|
||||||
|
teringat
|
||||||
|
teringat-ingat
|
||||||
|
terjadi
|
||||||
|
terjadilah
|
||||||
|
terjadinya
|
||||||
|
terkira
|
||||||
|
terlalu
|
||||||
|
terlebih
|
||||||
|
terlihat
|
||||||
|
termasuk
|
||||||
|
ternyata
|
||||||
|
tersampaikan
|
||||||
|
tersebut
|
||||||
|
tersebutlah
|
||||||
|
tertentu
|
||||||
|
tertuju
|
||||||
|
terus
|
||||||
|
terutama
|
||||||
|
tetap
|
||||||
|
tetapi
|
||||||
|
tiap
|
||||||
|
tiba
|
||||||
|
tiba-tiba
|
||||||
|
tidak
|
||||||
|
tidakkah
|
||||||
|
tidaklah
|
||||||
|
tiga
|
||||||
|
tinggi
|
||||||
|
toh
|
||||||
|
tunjuk
|
||||||
|
turut
|
||||||
|
tutur
|
||||||
|
tuturnya
|
||||||
|
ucap
|
||||||
|
ucapnya
|
||||||
|
ujar
|
||||||
|
ujarnya
|
||||||
|
umum
|
||||||
|
umumnya
|
||||||
|
ungkap
|
||||||
|
ungkapnya
|
||||||
|
untuk
|
||||||
|
usah
|
||||||
|
usai
|
||||||
|
waduh
|
||||||
|
wah
|
||||||
|
wahai
|
||||||
|
waktu
|
||||||
|
waktunya
|
||||||
|
walau
|
||||||
|
walaupun
|
||||||
|
wong
|
||||||
|
yaitu
|
||||||
|
yakin
|
||||||
|
yakni
|
||||||
|
yang
|
||||||
|
""".split())
|
42
spacy/lang/id/syntax_iterators.py
Normal file
42
spacy/lang/id/syntax_iterators.py
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ...symbols import NOUN, PROPN, PRON
|
||||||
|
|
||||||
|
|
||||||
|
def noun_chunks(obj):
|
||||||
|
"""
|
||||||
|
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||||
|
"""
|
||||||
|
labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
|
||||||
|
doc = obj.doc # Ensure works on both Doc and Span.
|
||||||
|
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||||
|
conj = doc.vocab.strings.add('conj')
|
||||||
|
np_label = doc.vocab.strings.add('NP')
|
||||||
|
seen = set()
|
||||||
|
for i, word in enumerate(obj):
|
||||||
|
if word.pos not in (NOUN, PROPN, PRON):
|
||||||
|
continue
|
||||||
|
# Prevent nested chunks from being produced
|
||||||
|
if word.i in seen:
|
||||||
|
continue
|
||||||
|
if word.dep in np_deps:
|
||||||
|
if any(w.i in seen for w in word.subtree):
|
||||||
|
continue
|
||||||
|
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
|
||||||
|
yield word.left_edge.i, word.right_edge.i+1, np_label
|
||||||
|
elif word.dep == conj:
|
||||||
|
head = word.head
|
||||||
|
while head.dep == conj and head.head.i < head.i:
|
||||||
|
head = head.head
|
||||||
|
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||||
|
if head.dep in np_deps:
|
||||||
|
if any(w.i in seen for w in word.subtree):
|
||||||
|
continue
|
||||||
|
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
|
||||||
|
yield word.left_edge.i, word.right_edge.i+1, np_label
|
||||||
|
|
||||||
|
|
||||||
|
SYNTAX_ITERATORS = {
|
||||||
|
'noun_chunks': noun_chunks
|
||||||
|
}
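SYNTAX_ITERATORS registers this generator as the noun_chunks iterator for Indonesian, so once a dependency parse is available the chunks can be read off Doc.noun_chunks. A minimal sketch, assuming a parsed Doc (no Indonesian parser ships at this point, so the doc here is hypothetical):

# Hypothetical usage once an Indonesian dependency parser is available.
def print_noun_chunks(doc):
    # doc.noun_chunks delegates to the noun_chunks() generator registered above.
    for chunk in doc.noun_chunks:
        print(chunk.text, chunk.root.dep_)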
|
50
spacy/lang/id/tokenizer_exceptions.py
Normal file
50
spacy/lang/id/tokenizer_exceptions.py
Normal file
|
@ -0,0 +1,50 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import regex as re
|
||||||
|
|
||||||
|
from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS
|
||||||
|
from ..tokenizer_exceptions import URL_PATTERN
|
||||||
|
from ...symbols import ORTH
|
||||||
|
|
||||||
|
|
||||||
|
_exc = {}
|
||||||
|
|
||||||
|
for orth in ID_BASE_EXCEPTIONS:
|
||||||
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
orth_title = orth.title()
|
||||||
|
_exc[orth_title] = [{ORTH: orth_title}]
|
||||||
|
|
||||||
|
orth_caps = orth.upper()
|
||||||
|
_exc[orth_caps] = [{ORTH: orth_caps}]
|
||||||
|
|
||||||
|
orth_lower = orth.lower()
|
||||||
|
_exc[orth_lower] = [{ORTH: orth_lower}]
|
||||||
|
|
||||||
|
if '-' in orth:
|
||||||
|
orth_title = '-'.join([part.title() for part in orth.split('-')])
|
||||||
|
_exc[orth_title] = [{ORTH: orth_title}]
|
||||||
|
|
||||||
|
orth_caps = '-'.join([part.upper() for part in orth.split('-')])
|
||||||
|
_exc[orth_caps] = [{ORTH: orth_caps}]
|
||||||
|
|
||||||
|
|
||||||
|
for orth in [
|
||||||
|
"'d", "a.m.", "Adm.", "Bros.", "co.", "Co.", "Corp.", "D.C.", "Dr.", "e.g.",
|
||||||
|
"E.g.", "E.G.", "Gen.", "Gov.", "i.e.", "I.e.", "I.E.", "Inc.", "Jr.",
|
||||||
|
"Ltd.", "Md.", "Messrs.", "Mo.", "Mont.", "Mr.", "Mrs.", "Ms.", "p.m.",
|
||||||
|
"Ph.D.", "Rep.", "Rev.", "Sen.", "St.", "vs.",
|
||||||
|
"B.A.", "B.Ch.E.", "B.Sc.", "Dr.", "Dra.", "Drs.", "Hj.", "Ka.", "Kp.",
|
||||||
|
"M.Ag.", "M.Hum.", "M.Kes,", "M.Kom.", "M.M.", "M.P.", "M.Pd.", "M.Sc.",
|
||||||
|
"M.Si.", "M.Sn.", "M.T.", "M.Th.", "No.", "Pjs.", "Plt.", "R.A.", "S.Ag.",
|
||||||
|
"S.E.", "S.H.", "S.Hut.", "S.K.M.", "S.Kedg.", "S.Kedh.", "S.Kom.",
|
||||||
|
"S.Pd.", "S.Pol.", "S.Psi.", "S.S.", "S.Sos.", "S.T.", "S.Tekp.", "S.Th.",
|
||||||
|
"a.l.", "a.n.", "a.s.", "b.d.", "d.a.", "d.l.", "d/h", "dkk.", "dll.",
|
||||||
|
"dr.", "drh.", "ds.", "dsb.", "dst.", "faks.", "fax.", "hlm.", "i/o",
|
||||||
|
"n.b.", "p.p." "pjs.", "s.d.", "tel.", "u.p.",
|
||||||
|
]:
|
||||||
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
TOKENIZER_EXCEPTIONS = dict(_exc)
|
||||||
|
|
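After the loops above, every exception string maps to a single-token analysis, with title- and upper-cased variants registered alongside the base form. A couple of illustrative lookups using keys from the abbreviation list above:

from spacy.symbols import ORTH
from spacy.lang.id.tokenizer_exceptions import TOKENIZER_EXCEPTIONS

# Each abbreviation is kept as one token rather than being split on '.'.
assert TOKENIZER_EXCEPTIONS["dll."] == [{ORTH: "dll."}]
assert "S.Kom." in TOKENIZER_EXCEPTIONS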
18
spacy/lang/it/examples.py
Normal file
18
spacy/lang/it/examples.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.it.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Apple vuole comprare una startup del Regno Unito per un miliardo di dollari",
|
||||||
|
"Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori",
|
||||||
|
"San Francisco prevede di bandire i robot di consegna porta a porta",
|
||||||
|
"Londra è una grande città del Regno Unito."
|
||||||
|
]
|
|
@ -137,6 +137,7 @@ LEX_ATTRS = {
|
||||||
attrs.IS_UPPER: lambda string: string.isupper(),
|
attrs.IS_UPPER: lambda string: string.isupper(),
|
||||||
attrs.IS_STOP: lambda string: False,
|
attrs.IS_STOP: lambda string: False,
|
||||||
attrs.IS_OOV: lambda string: True,
|
attrs.IS_OOV: lambda string: True,
|
||||||
|
attrs.PROB: lambda string: -20.,
|
||||||
attrs.LIKE_EMAIL: like_email,
|
attrs.LIKE_EMAIL: like_email,
|
||||||
attrs.LIKE_NUM: like_num,
|
attrs.LIKE_NUM: like_num,
|
||||||
attrs.IS_PUNCT: is_punct,
|
attrs.IS_PUNCT: is_punct,
|
||||||
|
|
18
spacy/lang/nb/examples.py
Normal file
18
spacy/lang/nb/examples.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.nb.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar",
|
||||||
|
"Selvkjørende biler flytter forsikringsansvaret over på produsentene ",
|
||||||
|
"San Francisco vurderer å forby robotbud på fortauene",
|
||||||
|
"London er en stor by i Storbritannia."
|
||||||
|
]
|
20
spacy/lang/pl/examples.py
Normal file
20
spacy/lang/pl/examples.py
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.pl.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Poczuł przyjemną woń mocnej kawy.",
|
||||||
|
"Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.",
|
||||||
|
"Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.",
|
||||||
|
"Nowy abonament pod lupą Komisji Europejskiej",
|
||||||
|
"Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
|
||||||
|
"Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”."
|
||||||
|
]
|
18
spacy/lang/pt/examples.py
Normal file
18
spacy/lang/pt/examples.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.pt.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares",
|
||||||
|
"Carros autônomos empurram a responsabilidade do seguro para os fabricantes."
|
||||||
|
"São Francisco considera banir os robôs de entrega que andam pelas calçadas",
|
||||||
|
"Londres é a maior cidade do Reino Unido"
|
||||||
|
]
|
18
spacy/lang/sv/examples.py
Normal file
18
spacy/lang/sv/examples.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.sv.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Apple överväger att köpa brittisk startup för 1 miljard dollar.",
|
||||||
|
"Självkörande bilar förskjuter försäkringsansvar mot tillverkare.",
|
||||||
|
"San Fransisco överväger förbud mot leveransrobotar på trottoarer.".
|
||||||
|
"London är en storstad i Storbritannien."
|
||||||
|
]
|
|
@ -95,7 +95,7 @@ class BaseDefaults(object):
|
||||||
meta = nlp.meta if nlp is not None else {}
|
meta = nlp.meta if nlp is not None else {}
|
||||||
# Resolve strings, like "cnn", "lstm", etc
|
# Resolve strings, like "cnn", "lstm", etc
|
||||||
pipeline = []
|
pipeline = []
|
||||||
for entry in cls.pipeline:
|
for entry in meta.get('pipeline', []):
|
||||||
if entry in disable or getattr(entry, 'name', entry) in disable:
|
if entry in disable or getattr(entry, 'name', entry) in disable:
|
||||||
continue
|
continue
|
||||||
factory = cls.Defaults.factories[entry]
|
factory = cls.Defaults.factories[entry]
|
||||||
|
@ -200,6 +200,7 @@ class Language(object):
|
||||||
else:
|
else:
|
||||||
flat_list.append(pipe)
|
flat_list.append(pipe)
|
||||||
self.pipeline = flat_list
|
self.pipeline = flat_list
|
||||||
|
self._optimizer = None
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def meta(self):
|
def meta(self):
|
||||||
|
@ -278,7 +279,7 @@ class Language(object):
|
||||||
return self.tokenizer(text)
|
return self.tokenizer(text)
|
||||||
|
|
||||||
def update(self, docs, golds, drop=0., sgd=None, losses=None,
|
def update(self, docs, golds, drop=0., sgd=None, losses=None,
|
||||||
update_tensors=False):
|
update_shared=False):
|
||||||
"""Update the models in the pipeline.
|
"""Update the models in the pipeline.
|
||||||
|
|
||||||
docs (iterable): A batch of `Doc` objects.
|
docs (iterable): A batch of `Doc` objects.
|
||||||
|
@ -298,6 +299,10 @@ class Language(object):
|
||||||
"Got: %d, %d" % (len(docs), len(golds)))
|
"Got: %d, %d" % (len(docs), len(golds)))
|
||||||
if len(docs) == 0:
|
if len(docs) == 0:
|
||||||
return
|
return
|
||||||
|
if sgd is None:
|
||||||
|
if self._optimizer is None:
|
||||||
|
self._optimizer = Adam(Model.ops, 0.001)
|
||||||
|
sgd = self._optimizer
|
||||||
tok2vec = self.pipeline[0]
|
tok2vec = self.pipeline[0]
|
||||||
feats = tok2vec.doc2feats(docs)
|
feats = tok2vec.doc2feats(docs)
|
||||||
grads = {}
|
grads = {}
|
||||||
|
@ -305,14 +310,18 @@ class Language(object):
|
||||||
grads[key] = (W, dW)
|
grads[key] = (W, dW)
|
||||||
pipes = list(self.pipeline[1:])
|
pipes = list(self.pipeline[1:])
|
||||||
random.shuffle(pipes)
|
random.shuffle(pipes)
|
||||||
|
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
|
||||||
|
all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses]
|
||||||
for proc in pipes:
|
for proc in pipes:
|
||||||
if not hasattr(proc, 'update'):
|
if not hasattr(proc, 'update'):
|
||||||
continue
|
continue
|
||||||
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
|
|
||||||
d_tokvecses = proc.update((docs, tokvecses), golds,
|
d_tokvecses = proc.update((docs, tokvecses), golds,
|
||||||
drop=drop, sgd=get_grads, losses=losses)
|
drop=drop, sgd=get_grads, losses=losses)
|
||||||
if update_tensors and d_tokvecses is not None:
|
if update_shared and d_tokvecses is not None:
|
||||||
bp_tokvecses(d_tokvecses, sgd=sgd)
|
for i, d_tv in enumerate(d_tokvecses):
|
||||||
|
all_d_tokvecses[i] += d_tv
|
||||||
|
if update_shared and bp_tokvecses is not None:
|
||||||
|
bp_tokvecses(all_d_tokvecses, sgd=sgd)
|
||||||
for key, (W, dW) in grads.items():
|
for key, (W, dW) in grads.items():
|
||||||
sgd(W, dW, key=key)
|
sgd(W, dW, key=key)
|
||||||
# Clear the tensor variable, to free GPU memory.
|
# Clear the tensor variable, to free GPU memory.
|
||||||
|
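The change above switches from backpropagating into the shared tok2vec model once per component to accumulating each component's gradient in a shared buffer and calling the tok2vec backprop a single time. A toy numpy sketch of that accumulation pattern (not spaCy's actual API, just the shape of the idea):

import numpy

# Toy illustration of the gradient-accumulation pattern used above.
tokvecs = [numpy.zeros((5, 8)), numpy.zeros((3, 8))]       # one array per doc
all_d_tokvecs = [numpy.zeros(tv.shape) for tv in tokvecs]  # shared gradient buffer

def component_update(tokvecs):
    # Stand-in for proc.update(): returns a gradient w.r.t. the shared tensors.
    return [numpy.ones(tv.shape) for tv in tokvecs]

for _ in range(2):                       # e.g. tagger and parser
    d_tokvecs = component_update(tokvecs)
    for i, d_tv in enumerate(d_tokvecs):
        all_d_tokvecs[i] += d_tv         # accumulate instead of backprop per component

# A single backward pass through tok2vec would now receive all_d_tokvecs.
print(all_d_tokvecs[0][0, 0])            # 2.0: both components contributed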
@ -375,11 +384,11 @@ class Language(object):
|
||||||
eps = util.env_opt('optimizer_eps', 1e-08)
|
eps = util.env_opt('optimizer_eps', 1e-08)
|
||||||
L2 = util.env_opt('L2_penalty', 1e-6)
|
L2 = util.env_opt('L2_penalty', 1e-6)
|
||||||
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
|
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
|
||||||
optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
|
self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
|
||||||
beta2=beta2, eps=eps)
|
beta2=beta2, eps=eps)
|
||||||
optimizer.max_grad_norm = max_grad_norm
|
self._optimizer.max_grad_norm = max_grad_norm
|
||||||
optimizer.device = device
|
self._optimizer.device = device
|
||||||
return optimizer
|
return self._optimizer
|
||||||
|
|
||||||
def evaluate(self, docs_golds):
|
def evaluate(self, docs_golds):
|
||||||
scorer = Scorer()
|
scorer = Scorer()
|
||||||
|
@ -427,11 +436,16 @@ class Language(object):
|
||||||
except StopIteration:
|
except StopIteration:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def pipe(self, texts, tuples=False, n_threads=2, batch_size=1000, disable=[]):
|
def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000,
|
||||||
|
disable=[]):
|
||||||
"""Process texts as a stream, and yield `Doc` objects in order. Supports
|
"""Process texts as a stream, and yield `Doc` objects in order. Supports
|
||||||
GIL-free multi-threading.
|
GIL-free multi-threading.
|
||||||
|
|
||||||
texts (iterator): A sequence of texts to process.
|
texts (iterator): A sequence of texts to process.
|
||||||
|
as_tuples (bool):
|
||||||
|
If set to True, inputs should be a sequence of
|
||||||
|
(text, context) tuples. Output will then be a sequence of
|
||||||
|
(doc, context) tuples. Defaults to False.
|
||||||
n_threads (int): The number of worker threads to use. If -1, OpenMP will
|
n_threads (int): The number of worker threads to use. If -1, OpenMP will
|
||||||
decide how many to use at run time. Default is 2.
|
decide how many to use at run time. Default is 2.
|
||||||
batch_size (int): The number of texts to buffer.
|
batch_size (int): The number of texts to buffer.
|
||||||
|
@ -443,7 +457,7 @@ class Language(object):
|
||||||
>>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
|
>>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
|
||||||
>>> assert doc.is_parsed
|
>>> assert doc.is_parsed
|
||||||
"""
|
"""
|
||||||
if tuples:
|
if as_tuples:
|
||||||
text_context1, text_context2 = itertools.tee(texts)
|
text_context1, text_context2 = itertools.tee(texts)
|
||||||
texts = (tc[0] for tc in text_context1)
|
texts = (tc[0] for tc in text_context1)
|
||||||
contexts = (tc[1] for tc in text_context2)
|
contexts = (tc[1] for tc in text_context2)
|
||||||
|
|
|
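The renamed as_tuples flag lets callers stream (text, context) pairs through the pipeline and get (doc, context) pairs back, which keeps IDs or metadata aligned with the processed docs. A minimal sketch (the data and model name are illustrative assumptions):

import spacy

nlp = spacy.load('en_core_web_sm')            # assumed model name
data = [(u"Apple is looking at U.K. startup", {'id': 1}),
        (u"Autonomous cars shift liability", {'id': 2})]

for doc, context in nlp.pipe(data, as_tuples=True):
    # The context dict travels with its doc, so alignment is preserved.
    print(context['id'], doc[0].text)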
@ -25,6 +25,7 @@ class Lemmatizer(object):
|
||||||
elif univ_pos == PUNCT:
|
elif univ_pos == PUNCT:
|
||||||
univ_pos = 'punct'
|
univ_pos = 'punct'
|
||||||
# See Issue #435 for example of where this logic is required.
|
# See Issue #435 for example of where this logic is required.
|
||||||
|
print("Check base form", string)
|
||||||
if self.is_base_form(univ_pos, morphology):
|
if self.is_base_form(univ_pos, morphology):
|
||||||
return set([string.lower()])
|
return set([string.lower()])
|
||||||
lemmas = lemmatize(string, self.index.get(univ_pos, {}),
|
lemmas = lemmatize(string, self.index.get(univ_pos, {}),
|
||||||
|
@ -38,12 +39,20 @@ class Lemmatizer(object):
|
||||||
avoid lemmatization entirely.
|
avoid lemmatization entirely.
|
||||||
"""
|
"""
|
||||||
morphology = {} if morphology is None else morphology
|
morphology = {} if morphology is None else morphology
|
||||||
others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
|
others = [key for key in morphology
|
||||||
|
if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')]
|
||||||
true_morph_key = morphology.get('morph', 0)
|
true_morph_key = morphology.get('morph', 0)
|
||||||
if univ_pos == 'noun' and morphology.get('Number') == 'sing':
|
if univ_pos == 'noun' and morphology.get('Number') == 'sing':
|
||||||
return True
|
return True
|
||||||
elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
|
elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
|
||||||
return True
|
return True
|
||||||
|
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
|
||||||
|
# morphology
|
||||||
|
elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \
|
||||||
|
morphology.get('Tense') == 'pres' and \
|
||||||
|
morphology.get('Number') is None and \
|
||||||
|
not others):
|
||||||
|
return True
|
||||||
elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
|
elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
|
||||||
return True
|
return True
|
||||||
elif VerbForm_inf in morphology:
|
elif VerbForm_inf in morphology:
|
||||||
|
|
|
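The new branch above treats a finite, present-tense verb with no Number feature (and no other morphological features) as already being in its base form, which catches VBP analyses like "are" and "have". A standalone re-statement of that check, for clarity (not the actual Lemmatizer class):

def looks_like_vbp_base(univ_pos, morphology):
    # Re-statement of the new VBP branch in is_base_form(), for illustration.
    others = [key for key in morphology
              if key not in ('POS', 'Number', 'VerbForm', 'Tense')]
    return (univ_pos == 'verb'
            and morphology.get('VerbForm') == 'fin'
            and morphology.get('Tense') == 'pres'
            and morphology.get('Number') is None
            and not others)

print(looks_like_vbp_base('verb', {'VerbForm': 'fin', 'Tense': 'pres'}))   # True
print(looks_like_vbp_base('verb', {'VerbForm': 'fin', 'Tense': 'past'}))   # False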
@ -171,6 +171,8 @@ cdef class Lexeme:
|
||||||
property rank:
|
property rank:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.id
|
return self.c.id
|
||||||
|
def __set__(self, value):
|
||||||
|
self.c.id = value
|
||||||
|
|
||||||
property sentiment:
|
property sentiment:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
|
|
@ -46,6 +46,43 @@ from ._ml import build_text_classifier, build_tagger_model
|
||||||
from .parts_of_speech import X
|
from .parts_of_speech import X
|
||||||
|
|
||||||
|
|
||||||
|
class SentenceSegmenter(object):
|
||||||
|
'''A simple spaCy hook, to allow custom sentence boundary detection logic
|
||||||
|
(that doesn't require the dependency parse).
|
||||||
|
|
||||||
|
To change the sentence boundary detection strategy, pass a generator
|
||||||
|
function `strategy` on initialization, or assign a new strategy to
|
||||||
|
the .strategy attribute.
|
||||||
|
|
||||||
|
Sentence detection strategies should be generators that take `Doc` objects
|
||||||
|
and yield `Span` objects for each sentence.
|
||||||
|
'''
|
||||||
|
name = 'sbd'
|
||||||
|
|
||||||
|
def __init__(self, vocab, strategy=None):
|
||||||
|
self.vocab = vocab
|
||||||
|
if strategy is None or strategy == 'on_punct':
|
||||||
|
strategy = self.split_on_punct
|
||||||
|
self.strategy = strategy
|
||||||
|
|
||||||
|
def __call__(self, doc):
|
||||||
|
doc.user_hooks['sents'] = self.strategy
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def split_on_punct(doc):
|
||||||
|
start = 0
|
||||||
|
seen_period = False
|
||||||
|
for i, word in enumerate(doc):
|
||||||
|
if seen_period and not word.is_punct:
|
||||||
|
yield doc[start : word.i]
|
||||||
|
start = word.i
|
||||||
|
seen_period = False
|
||||||
|
elif word.text in ['.', '!', '?']:
|
||||||
|
seen_period = True
|
||||||
|
if start < len(doc):
|
||||||
|
yield doc[start : len(doc)]
|
||||||
|
|
||||||
|
|
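The new SentenceSegmenter is a pipeline hook: calling it on a Doc installs the chosen strategy under doc.user_hooks['sents'], so doc.sents should come from simple punctuation splitting instead of the parser. A minimal usage sketch, assuming the class is importable from spacy.pipeline where it is defined and that Doc.sents picks up the user hook as with other hooks (the text is illustrative):

from spacy.lang.en import English
from spacy.pipeline import SentenceSegmenter

nlp = English()                                   # tokenizer-only pipeline
sbd = SentenceSegmenter(nlp.vocab)                # default 'on_punct' strategy
doc = nlp(u"This is a sentence. This is another one!")
sbd(doc)                                          # installs doc.user_hooks['sents']
print([sent.text for sent in doc.sents])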
||||||
class BaseThincComponent(object):
|
class BaseThincComponent(object):
|
||||||
name = None
|
name = None
|
||||||
|
|
||||||
|
@ -91,16 +128,21 @@ class BaseThincComponent(object):
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, **exclude):
|
||||||
serialize = OrderedDict((
|
serialize = OrderedDict((
|
||||||
|
('cfg', lambda: json_dumps(self.cfg)),
|
||||||
('model', lambda: self.model.to_bytes()),
|
('model', lambda: self.model.to_bytes()),
|
||||||
('vocab', lambda: self.vocab.to_bytes())
|
('vocab', lambda: self.vocab.to_bytes())
|
||||||
))
|
))
|
||||||
return util.to_bytes(serialize, exclude)
|
return util.to_bytes(serialize, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, **exclude):
|
def from_bytes(self, bytes_data, **exclude):
|
||||||
|
def load_model(b):
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model = self.Model()
|
self.model = self.Model(**self.cfg)
|
||||||
|
self.model.from_bytes(b)
|
||||||
|
|
||||||
deserialize = OrderedDict((
|
deserialize = OrderedDict((
|
||||||
('model', lambda b: self.model.from_bytes(b)),
|
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
|
||||||
|
('model', load_model),
|
||||||
('vocab', lambda b: self.vocab.from_bytes(b))
|
('vocab', lambda b: self.vocab.from_bytes(b))
|
||||||
))
|
))
|
||||||
util.from_bytes(bytes_data, deserialize, exclude)
|
util.from_bytes(bytes_data, deserialize, exclude)
|
||||||
|
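The reordering matters because the deserializers are applied in the order given, so the cfg must be restored before load_model() constructs Model(**self.cfg). A tiny self-contained sketch of that ordering constraint (stand-in functions, not spaCy's util):

from collections import OrderedDict
import json

cfg = {}
model = {}

def load_cfg(b):
    cfg.update(json.loads(b))

def load_model(b):
    # Needs cfg to already be populated, mirroring Model(**self.cfg) above.
    model['width'] = cfg['width']

deserialize = OrderedDict((('cfg', load_cfg), ('model', load_model)))
msg = {'cfg': json.dumps({'width': 128}), 'model': b''}
for key, setter in deserialize.items():     # stand-in for util.from_bytes()
    setter(msg[key])

print(model)    # {'width': 128} -- cfg was available when the model was built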
@ -108,19 +150,22 @@ class BaseThincComponent(object):
|
||||||
|
|
||||||
def to_disk(self, path, **exclude):
|
def to_disk(self, path, **exclude):
|
||||||
serialize = OrderedDict((
|
serialize = OrderedDict((
|
||||||
|
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
|
||||||
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
|
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
|
||||||
('vocab', lambda p: self.vocab.to_disk(p)),
|
('vocab', lambda p: self.vocab.to_disk(p))
|
||||||
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg)))
|
|
||||||
))
|
))
|
||||||
util.to_disk(path, serialize, exclude)
|
util.to_disk(path, serialize, exclude)
|
||||||
|
|
||||||
def from_disk(self, path, **exclude):
|
def from_disk(self, path, **exclude):
|
||||||
|
def load_model(p):
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model = self.Model()
|
self.model = self.Model(**self.cfg)
|
||||||
|
self.model.from_bytes(p.open('rb').read())
|
||||||
|
|
||||||
deserialize = OrderedDict((
|
deserialize = OrderedDict((
|
||||||
('model', lambda p: self.model.from_bytes(p.open('rb').read())),
|
('cfg', lambda p: self.cfg.update(_load_cfg(p))),
|
||||||
|
('model', load_model),
|
||||||
('vocab', lambda p: self.vocab.from_disk(p)),
|
('vocab', lambda p: self.vocab.from_disk(p)),
|
||||||
('cfg', lambda p: self.cfg.update(_load_cfg(p)))
|
|
||||||
))
|
))
|
||||||
util.from_disk(path, deserialize, exclude)
|
util.from_disk(path, deserialize, exclude)
|
||||||
return self
|
return self
|
||||||
|
@ -138,7 +183,7 @@ class TokenVectorEncoder(BaseThincComponent):
|
||||||
name = 'tensorizer'
|
name = 'tensorizer'
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, width=128, embed_size=7500, **cfg):
|
def Model(cls, width=128, embed_size=4000, **cfg):
|
||||||
"""Create a new statistical model for the class.
|
"""Create a new statistical model for the class.
|
||||||
|
|
||||||
width (int): Output size of the model.
|
width (int): Output size of the model.
|
||||||
|
@ -284,6 +329,8 @@ class NeuralTagger(BaseThincComponent):
|
||||||
cdef Vocab vocab = self.vocab
|
cdef Vocab vocab = self.vocab
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
doc_tag_ids = batch_tag_ids[i]
|
doc_tag_ids = batch_tag_ids[i]
|
||||||
|
if hasattr(doc_tag_ids, 'get'):
|
||||||
|
doc_tag_ids = doc_tag_ids.get()
|
||||||
for j, tag_id in enumerate(doc_tag_ids):
|
for j, tag_id in enumerate(doc_tag_ids):
|
||||||
# Don't clobber preset POS tags
|
# Don't clobber preset POS tags
|
||||||
if doc.c[j].tag == 0 and doc.c[j].pos == 0:
|
if doc.c[j].tag == 0 and doc.c[j].pos == 0:
|
||||||
|
@ -292,6 +339,8 @@ class NeuralTagger(BaseThincComponent):
|
||||||
doc.is_tagged = True
|
doc.is_tagged = True
|
||||||
|
|
||||||
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
||||||
|
if losses is not None and self.name not in losses:
|
||||||
|
losses[self.name] = 0.
|
||||||
docs, tokvecs = docs_tokvecs
|
docs, tokvecs = docs_tokvecs
|
||||||
|
|
||||||
if self.model.nI is None:
|
if self.model.nI is None:
|
||||||
|
@ -300,6 +349,8 @@ class NeuralTagger(BaseThincComponent):
|
||||||
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
|
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
|
||||||
|
|
||||||
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
|
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
|
||||||
|
if losses is not None:
|
||||||
|
losses[self.name] += loss
|
||||||
return d_tokvecs
|
return d_tokvecs
|
||||||
|
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, docs, golds, scores):
|
||||||
|
@ -366,7 +417,8 @@ class NeuralTagger(BaseThincComponent):
|
||||||
def from_bytes(self, bytes_data, **exclude):
|
def from_bytes(self, bytes_data, **exclude):
|
||||||
def load_model(b):
|
def load_model(b):
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
token_vector_width = util.env_opt('token_vector_width', 128)
|
token_vector_width = util.env_opt('token_vector_width',
|
||||||
|
self.cfg.get('token_vector_width', 128))
|
||||||
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
|
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
|
||||||
self.model.from_bytes(b)
|
self.model.from_bytes(b)
|
||||||
|
|
||||||
|
@ -400,7 +452,8 @@ class NeuralTagger(BaseThincComponent):
|
||||||
def from_disk(self, path, **exclude):
|
def from_disk(self, path, **exclude):
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
token_vector_width = util.env_opt('token_vector_width', 128)
|
token_vector_width = util.env_opt('token_vector_width',
|
||||||
|
self.cfg.get('token_vector_width', 128))
|
||||||
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
|
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
|
||||||
self.model.from_bytes(p.open('rb').read())
|
self.model.from_bytes(p.open('rb').read())
|
||||||
|
|
||||||
|
@ -595,12 +648,13 @@ class TextCategorizer(BaseThincComponent):
|
||||||
return mean_square_error, d_scores
|
return mean_square_error, d_scores
|
||||||
|
|
||||||
def begin_training(self, gold_tuples=tuple(), pipeline=None):
|
def begin_training(self, gold_tuples=tuple(), pipeline=None):
|
||||||
if pipeline:
|
if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer':
|
||||||
token_vector_width = pipeline[0].model.nO
|
token_vector_width = pipeline[0].model.nO
|
||||||
else:
|
else:
|
||||||
token_vector_width = 64
|
token_vector_width = 64
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model = self.Model(len(self.labels), token_vector_width)
|
self.model = self.Model(len(self.labels), token_vector_width,
|
||||||
|
**self.cfg)
|
||||||
|
|
||||||
|
|
||||||
cdef class EntityRecognizer(LinearParser):
|
cdef class EntityRecognizer(LinearParser):
|
||||||
|
|
|
@ -215,7 +215,10 @@ cdef class StringStore:
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
with path.open('r') as file_:
|
with path.open('r') as file_:
|
||||||
strings = ujson.load(file_)
|
strings = ujson.load(file_)
|
||||||
|
prev = list(self)
|
||||||
self._reset_and_load(strings)
|
self._reset_and_load(strings)
|
||||||
|
for word in prev:
|
||||||
|
self.add(word)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, **exclude):
|
||||||
|
@ -234,7 +237,10 @@ cdef class StringStore:
|
||||||
RETURNS (StringStore): The `StringStore` object.
|
RETURNS (StringStore): The `StringStore` object.
|
||||||
"""
|
"""
|
||||||
strings = ujson.loads(bytes_data)
|
strings = ujson.loads(bytes_data)
|
||||||
|
prev = list(self)
|
||||||
self._reset_and_load(strings)
|
self._reset_and_load(strings)
|
||||||
|
for word in prev:
|
||||||
|
self.add(word)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def set_frozen(self, bint is_frozen):
|
def set_frozen(self, bint is_frozen):
|
||||||
|
|
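The StringStore change above keeps any strings already in the store when new data is loaded: the previous contents are remembered and re-added after _reset_and_load(). A minimal sketch of the observable effect (the strings used are illustrative):

from spacy.strings import StringStore

store_a = StringStore([u'apple'])
store_b = StringStore([u'orange'])

data = store_a.to_bytes()
store_b.from_bytes(data)          # previously added strings survive the load

assert u'apple' in store_b
assert u'orange' in store_b       # kept thanks to the re-add loop above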
286
spacy/syntax/_beam_utils.pyx
Normal file
286
spacy/syntax/_beam_utils.pyx
Normal file
|
@ -0,0 +1,286 @@
|
||||||
|
# cython: infer_types=True
# cython: profile=True
cimport numpy as np
import numpy
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from thinc.extra.search cimport Beam
from thinc.extra.search import MaxViolation
from thinc.typedefs cimport hash_t, class_t
from thinc.extra.search cimport MaxViolation

from .transition_system cimport TransitionSystem, Transition
from .stateclass cimport StateClass
from ..gold cimport GoldParse
from ..tokens.doc cimport Doc


# These are passed as callbacks to thinc.search.Beam
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
    dest = <StateClass>_dest
    src = <StateClass>_src
    moves = <const Transition*>_moves
    dest.clone(src)
    moves[clas].do(dest.c, moves[clas].label)


cdef int _check_final_state(void* _state, void* extra_args) except -1:
    return (<StateClass>_state).is_final()


def _cleanup(Beam beam):
    for i in range(beam.width):
        Py_XDECREF(<PyObject*>beam._states[i].content)
        Py_XDECREF(<PyObject*>beam._parents[i].content)


cdef hash_t _hash_state(void* _state, void* _) except 0:
    state = <StateClass>_state
    if state.c.is_final():
        return 1
    else:
        return state.c.hash()


cdef class ParserBeam(object):
    cdef public TransitionSystem moves
    cdef public object states
    cdef public object golds
    cdef public object beams
    cdef public object dones

    def __init__(self, TransitionSystem moves, states, golds,
                 int width, float density):
        self.moves = moves
        self.states = states
        self.golds = golds
        self.beams = []
        cdef Beam beam
        cdef StateClass state, st
        for state in states:
            beam = Beam(self.moves.n_moves, width, density)
            beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent)
            for i in range(beam.width):
                st = <StateClass>beam.at(i)
                st.c.offset = state.c.offset
            self.beams.append(beam)
        self.dones = [False] * len(self.beams)

    def __dealloc__(self):
        if self.beams is not None:
            for beam in self.beams:
                if beam is not None:
                    _cleanup(beam)

    @property
    def is_done(self):
        return all(b.is_done or self.dones[i] for i, b in enumerate(self.beams))

    def __getitem__(self, i):
        return self.beams[i]

    def __len__(self):
        return len(self.beams)

    def advance(self, scores, follow_gold=False):
        cdef Beam beam
        for i, beam in enumerate(self.beams):
            if beam.is_done or not scores[i].size or self.dones[i]:
                continue
            self._set_scores(beam, scores[i])
            if self.golds is not None:
                self._set_costs(beam, self.golds[i], follow_gold=follow_gold)
            if follow_gold:
                beam.advance(_transition_state, NULL, <void*>self.moves.c)
            else:
                beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
            beam.check_done(_check_final_state, NULL)
            if beam.is_done and self.golds is not None:
                for j in range(beam.size):
                    state = <StateClass>beam.at(j)
                    if state.is_final():
                        try:
                            if self.moves.is_gold_parse(state, self.golds[i]):
                                beam._states[j].loss = 0.0
                            elif beam._states[j].loss == 0.0:
                                beam._states[j].loss = 1.0
                        except NotImplementedError:
                            break

    def _set_scores(self, Beam beam, float[:, ::1] scores):
        cdef float* c_scores = &scores[0, 0]
        cdef int nr_state = min(scores.shape[0], beam.size)
        cdef int nr_class = scores.shape[1]
        for i in range(nr_state):
            state = <StateClass>beam.at(i)
            if not state.is_final():
                for j in range(nr_class):
                    beam.scores[i][j] = c_scores[i * nr_class + j]
                self.moves.set_valid(beam.is_valid[i], state.c)
            else:
                for j in range(beam.nr_class):
                    beam.scores[i][j] = 0
                    beam.costs[i][j] = 0

    def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False):
        for i in range(beam.size):
            state = <StateClass>beam.at(i)
            if not state.c.is_final():
                self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold)
                if follow_gold:
                    for j in range(beam.nr_class):
                        if beam.costs[i][j] >= 1:
                            beam.is_valid[i][j] = 0


def get_token_ids(states, int n_tokens):
    cdef StateClass state
    cdef np.ndarray ids = numpy.zeros((len(states), n_tokens),
                                      dtype='int32', order='C')
    c_ids = <int*>ids.data
    for i, state in enumerate(states):
        if not state.is_final():
            state.c.set_context_tokens(c_ids, n_tokens)
        else:
            ids[i] = -1
        c_ids += ids.shape[1]
    return ids


nr_update = 0
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
                states, tokvecs, golds,
                state2vec, vec2scores,
                int width, float density,
                sgd=None, losses=None, drop=0.):
    global nr_update
    cdef MaxViolation violn
    nr_update += 1
    pbeam = ParserBeam(moves, states, golds,
                       width=width, density=density)
    gbeam = ParserBeam(moves, states, golds,
                       width=width, density=0.0)
    cdef StateClass state
    beam_maps = []
    backprops = []
    violns = [MaxViolation() for _ in range(len(states))]
    for t in range(max_steps):
        if pbeam.is_done and gbeam.is_done:
            break
        # The beam maps let us find the right row in the flattened scores
        # arrays for each state. States are identified by (example id, history).
        # We keep a different beam map for each step (since we'll have a flat
        # scores array for each step). The beam map will let us take the per-state
        # losses, and compute the gradient for each (step, state, class).
        beam_maps.append({})
        # Gather all states from the two beams in a list. Some states may occur
        # in both beams. To figure out which beam each state belonged to,
        # we keep two lists of indices, p_indices and g_indices.
        states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update)
        if not states:
            break
        # Now that we have our flat list of states, feed them through the model
        token_ids = get_token_ids(states, nr_feature)
        vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
        scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)

        # Store the callbacks for the backward pass
        backprops.append((token_ids, bp_vectors, bp_scores))

        # Unpack the flat scores into lists for the two beams. The indices arrays
        # tell us which example and state the scores-row refers to.
        p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices]
        g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices]
        # Now advance the states in the beams. The gold beam is constrained
        # to follow only gold analyses.
        pbeam.advance(p_scores)
        gbeam.advance(g_scores, follow_gold=True)
        # Track the "maximum violation", to use in the update.
        for i, violn in enumerate(violns):
            violn.check_crf(pbeam[i], gbeam[i])
    histories = []
    losses = []
    for violn in violns:
        if violn.p_hist:
            histories.append(violn.p_hist + violn.g_hist)
            losses.append(violn.p_probs + violn.g_probs)
        else:
            histories.append([])
            losses.append([])
    states_d_scores = get_gradient(moves.n_moves, beam_maps, histories, losses)
    return states_d_scores, backprops[:len(states_d_scores)]


def get_states(pbeams, gbeams, beam_map, nr_update):
    seen = {}
    states = []
    p_indices = []
    g_indices = []
    cdef Beam pbeam, gbeam
    assert len(pbeams) == len(gbeams)
    for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)):
        p_indices.append([])
        g_indices.append([])
        for i in range(pbeam.size):
            state = <StateClass>pbeam.at(i)
            if not state.is_final():
                key = tuple([eg_id] + pbeam.histories[i])
                assert key not in seen, (key, seen)
                seen[key] = len(states)
                p_indices[-1].append(len(states))
                states.append(state)
        beam_map.update(seen)
        for i in range(gbeam.size):
            state = <StateClass>gbeam.at(i)
            if not state.is_final():
                key = tuple([eg_id] + gbeam.histories[i])
                if key in seen:
                    g_indices[-1].append(seen[key])
                else:
                    g_indices[-1].append(len(states))
                    beam_map[key] = len(states)
                    states.append(state)
    p_idx = [numpy.asarray(idx, dtype='i') for idx in p_indices]
    g_idx = [numpy.asarray(idx, dtype='i') for idx in g_indices]
    return states, p_idx, g_idx


def get_gradient(nr_class, beam_maps, histories, losses):
    """
    The global model assigns a loss to each parse. The beam scores
    are additive, so the same gradient is applied to each action
    in the history. This gives the gradient of a single *action*
    for a beam state -- so we have "the gradient of loss for taking
    action i given history H."

    Histories: Each history is a list of actions
        Each candidate has a history
        Each beam has multiple candidates
        Each batch has multiple beams
    So history is a list of lists of lists of ints
    """
    nr_step = len(beam_maps)
    grads = []
    nr_step = 0
    for eg_id, hists in enumerate(histories):
        for loss, hist in zip(losses[eg_id], hists):
            if loss != 0.0 and not numpy.isnan(loss):
                nr_step = max(nr_step, len(hist))
    for i in range(nr_step):
        grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), dtype='f'))
    assert len(histories) == len(losses)
    for eg_id, hists in enumerate(histories):
        for loss, hist in zip(losses[eg_id], hists):
            if loss == 0.0 or numpy.isnan(loss):
                continue
            key = tuple([eg_id])
            # Adjust loss for length
            avg_loss = loss / len(hist)
            loss += avg_loss * (nr_step - len(hist))
            for j, clas in enumerate(hist):
                i = beam_maps[j][key]
                # In step j, at state i, action clas resulted in loss
                grads[j][i, clas] += loss
                key = key + tuple([clas])
    return grads
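Note: the beam-map bookkeeping in update_beam/get_gradient above is easier to see in plain Python. The sketch below is illustrative only (toy names, NumPy only, no Cython); it mirrors the padding-by-average-loss step but is not part of the patch.

import numpy

def toy_get_gradient(nr_class, beam_maps, histories, losses):
    # One gradient matrix per step: row = state id from that step's beam map,
    # column = action class.
    nr_step = 0
    for eg_losses, hists in zip(losses, histories):
        for loss, hist in zip(eg_losses, hists):
            if loss != 0.0 and not numpy.isnan(loss):
                nr_step = max(nr_step, len(hist))
    grads = [numpy.zeros((max(bm.values(), default=-1) + 1, nr_class), dtype='f')
             for bm in beam_maps[:nr_step]]
    for eg_id, hists in enumerate(histories):
        for loss, hist in zip(losses[eg_id], hists):
            if loss == 0.0 or numpy.isnan(loss):
                continue
            # Pad shorter histories by their average per-action loss, so every
            # surviving parse contributes at every step.
            loss += (loss / len(hist)) * (nr_step - len(hist))
            key = (eg_id,)
            for step, clas in enumerate(hist):
                grads[step][beam_maps[step][key], clas] += loss
                key = key + (clas,)
    return grads

# Toy call: one example with two candidate parses (histories of length 2 and 1).
beam_maps = [{(0,): 0}, {(0, 1): 0}]
print(toy_get_gradient(4, beam_maps, [[[1, 2], [3]]], [[0.5, 0.25]]))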
@@ -37,6 +37,7 @@ cdef cppclass StateC:
         this.shifted = <bint*>calloc(length + (PADDING * 2), sizeof(bint))
         this._sent = <TokenC*>calloc(length + (PADDING * 2), sizeof(TokenC))
         this._ents = <Entity*>calloc(length + (PADDING * 2), sizeof(Entity))
+        this.offset = 0
         cdef int i
         for i in range(length + (PADDING * 2)):
             this._ents[i].end = -1
@@ -73,7 +74,16 @@ cdef cppclass StateC:
         free(this.shifted - PADDING)

     void set_context_tokens(int* ids, int n) nogil:
-        if n == 13:
+        if n == 8:
+            ids[0] = this.B(0)
+            ids[1] = this.B(1)
+            ids[2] = this.S(0)
+            ids[3] = this.S(1)
+            ids[4] = this.H(this.S(0))
+            ids[5] = this.L(this.B(0), 1)
+            ids[6] = this.L(this.S(0), 2)
+            ids[7] = this.R(this.S(0), 1)
+        elif n == 13:
             ids[0] = this.B(0)
             ids[1] = this.B(1)
             ids[2] = this.S(0)
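Note: the new n == 8 branch is a reduced context window. As a reference only, the same template can be written against a hypothetical state object exposing B/S/H/L/R accessors (buffer, stack, head, left child, right child); this is an illustration, not code from the patch.

def context_tokens_8(state):
    # Hypothetical pure-Python mirror of the n == 8 branch: which tokens
    # the reduced feature set looks at.
    return [
        state.B(0), state.B(1),       # first two tokens on the buffer
        state.S(0), state.S(1),       # top two tokens on the stack
        state.H(state.S(0)),          # head of the stack top
        state.L(state.B(0), 1),       # leftmost child of the buffer head
        state.L(state.S(0), 2),       # second-leftmost child of the stack top
        state.R(state.S(0), 1),       # rightmost child of the stack top
    ]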
@@ -20,7 +20,7 @@ from .transition_system cimport do_func_t, get_cost_func_t
 from .transition_system cimport move_cost_func_t, label_cost_func_t
 from ..gold cimport GoldParse
 from ..gold cimport GoldParseC
-from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE
+from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE, IS_PUNCT
 from ..lexeme cimport Lexeme
 from ..structs cimport TokenC

@@ -286,7 +286,7 @@ cdef class Break:
         return 0

 cdef int _get_root(int word, const GoldParseC* gold) nogil:
-    while gold.heads[word] != word and not gold.has_dep[word] and word >= 0:
+    while gold.heads[word] != word and gold.has_dep[word] and word >= 0:
         word = gold.heads[word]
     if not gold.has_dep[word]:
         return -1
@@ -351,6 +351,20 @@ cdef class ArcEager(TransitionSystem):
         def __get__(self):
             return (SHIFT, REDUCE, LEFT, RIGHT, BREAK)

+    def is_gold_parse(self, StateClass state, GoldParse gold):
+        predicted = set()
+        truth = set()
+        for i in range(gold.length):
+            if gold.cand_to_gold[i] is None:
+                continue
+            if state.safe_get(i).dep:
+                predicted.add((i, state.H(i), self.strings[state.safe_get(i).dep]))
+            else:
+                predicted.add((i, state.H(i), 'ROOT'))
+            id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
+            truth.add((id_, head, dep))
+        return truth == predicted
+
     def has_gold(self, GoldParse gold, start=0, end=None):
         end = end or len(gold.heads)
         if all([tag is None for tag in gold.heads[start:end]]):
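Note: is_gold_parse reduces to a set comparison over (token, head, label) arcs. A minimal Python sketch of the idea, with illustrative names only:

def arcs_match(predicted_arcs, gold_arcs):
    # Each argument is an iterable of (token_index, head_index, dep_label)
    # triples; the candidate parse counts as gold only if the sets coincide.
    return set(predicted_arcs) == set(gold_arcs)

# Example: a single wrong attachment makes the parse non-gold.
pred = {(0, 1, 'nsubj'), (1, 1, 'ROOT'), (2, 1, 'dobj')}
gold = {(0, 1, 'nsubj'), (1, 1, 'ROOT'), (2, 0, 'dobj')}
assert not arcs_match(pred, gold)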
@@ -502,9 +516,11 @@ cdef class ArcEager(TransitionSystem):
                     "before training and after parsing. Either pass make_projective=True "
                     "to the GoldParse class, or use PseudoProjectivity.preprocess_training_data")
             else:
+                print(gold.orig_annot)
                 print(gold.words)
                 print(gold.heads)
                 print(gold.labels)
+                print(gold.sent_starts)
                 raise ValueError(
                     "Could not find a gold-standard action to supervise the dependency "
                     "parser.\n"
@ -107,7 +107,7 @@ cdef class BeamParser(Parser):
|
||||||
# The non-monotonic oracle makes it difficult to ensure final costs are
|
# The non-monotonic oracle makes it difficult to ensure final costs are
|
||||||
# correct. Therefore do final correction
|
# correct. Therefore do final correction
|
||||||
for i in range(pred.size):
|
for i in range(pred.size):
|
||||||
if is_gold(<StateClass>pred.at(i), gold_parse, self.moves.strings):
|
if self.moves.is_gold_parse(<StateClass>pred.at(i), gold_parse):
|
||||||
pred._states[i].loss = 0.0
|
pred._states[i].loss = 0.0
|
||||||
elif pred._states[i].loss == 0.0:
|
elif pred._states[i].loss == 0.0:
|
||||||
pred._states[i].loss = 1.0
|
pred._states[i].loss = 1.0
|
||||||
|
@ -213,7 +213,7 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio
|
||||||
if not pred._states[i].is_done or pred._states[i].loss == 0:
|
if not pred._states[i].is_done or pred._states[i].loss == 0:
|
||||||
continue
|
continue
|
||||||
state = <StateClass>pred.at(i)
|
state = <StateClass>pred.at(i)
|
||||||
if is_gold(state, gold_parse, moves.strings) == True:
|
if moves.is_gold_parse(state, gold_parse) == True:
|
||||||
for dep in gold_parse.orig_annot:
|
for dep in gold_parse.orig_annot:
|
||||||
print(dep[1], dep[3], dep[4])
|
print(dep[1], dep[3], dep[4])
|
||||||
print("Cost", pred._states[i].loss)
|
print("Cost", pred._states[i].loss)
|
||||||
|
@ -227,7 +227,7 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio
|
||||||
if not gold._states[i].is_done:
|
if not gold._states[i].is_done:
|
||||||
continue
|
continue
|
||||||
state = <StateClass>gold.at(i)
|
state = <StateClass>gold.at(i)
|
||||||
if is_gold(state, gold_parse, moves.strings) == False:
|
if moves.is_gold(state, gold_parse) == False:
|
||||||
print("Truth")
|
print("Truth")
|
||||||
for dep in gold_parse.orig_annot:
|
for dep in gold_parse.orig_annot:
|
||||||
print(dep[1], dep[3], dep[4])
|
print(dep[1], dep[3], dep[4])
|
||||||
|
@ -237,16 +237,3 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio
|
||||||
raise Exception("Gold parse is not gold-standard")
|
raise Exception("Gold parse is not gold-standard")
|
||||||
|
|
||||||
|
|
||||||
def is_gold(StateClass state, GoldParse gold, StringStore strings):
|
|
||||||
predicted = set()
|
|
||||||
truth = set()
|
|
||||||
for i in range(gold.length):
|
|
||||||
if gold.cand_to_gold[i] is None:
|
|
||||||
continue
|
|
||||||
if state.safe_get(i).dep:
|
|
||||||
predicted.add((i, state.H(i), strings[state.safe_get(i).dep]))
|
|
||||||
else:
|
|
||||||
predicted.add((i, state.H(i), 'ROOT'))
|
|
||||||
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
|
|
||||||
truth.add((id_, head, dep))
|
|
||||||
return truth == predicted
|
|
||||||
|
|
|
@@ -113,7 +113,7 @@ cdef class BiluoPushDown(TransitionSystem):

     def has_gold(self, GoldParse gold, start=0, end=None):
         end = end or len(gold.ner)
-        if all([tag == '-' for tag in gold.ner[start:end]]):
+        if all([tag in ('-', None) for tag in gold.ner[start:end]]):
             return False
         else:
             return True
@@ -14,4 +14,8 @@ cdef class Parser:
     cdef readonly TransitionSystem moves
     cdef readonly object cfg

+    cdef void _parse_step(self, StateC* state,
+            const float* feat_weights,
+            int nr_class, int nr_feat, int nr_piece) nogil
+
     #cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil
@ -36,15 +36,19 @@ from murmurhash.mrmr cimport hash64
|
||||||
from preshed.maps cimport MapStruct
|
from preshed.maps cimport MapStruct
|
||||||
from preshed.maps cimport map_get
|
from preshed.maps cimport map_get
|
||||||
|
|
||||||
from thinc.api import layerize, chain, noop, clone
|
from thinc.api import layerize, chain, noop, clone, with_flatten
|
||||||
from thinc.neural import Model, Affine, ELU, ReLu, Maxout
|
from thinc.neural import Model, Affine, ReLu, Maxout
|
||||||
|
from thinc.neural._classes.batchnorm import BatchNorm as BN
|
||||||
|
from thinc.neural._classes.selu import SELU
|
||||||
|
from thinc.neural._classes.layernorm import LayerNorm
|
||||||
from thinc.neural.ops import NumpyOps, CupyOps
|
from thinc.neural.ops import NumpyOps, CupyOps
|
||||||
from thinc.neural.util import get_array_module
|
from thinc.neural.util import get_array_module
|
||||||
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..util import get_async, get_cuda_stream
|
from ..util import get_async, get_cuda_stream
|
||||||
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
|
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
|
||||||
from .._ml import Tok2Vec, doc2feats, rebatch
|
from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
|
||||||
|
from .._ml import Residual, drop_layer
|
||||||
from ..compat import json_dumps
|
from ..compat import json_dumps
|
||||||
|
|
||||||
from . import _parse_features
|
from . import _parse_features
|
||||||
|
@ -59,8 +63,10 @@ from ..structs cimport TokenC
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from ..strings cimport StringStore
|
from ..strings cimport StringStore
|
||||||
from ..gold cimport GoldParse
|
from ..gold cimport GoldParse
|
||||||
from ..attrs cimport TAG, DEP
|
from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG
|
||||||
|
from . import _beam_utils
|
||||||
|
|
||||||
|
USE_FINE_TUNE = True
|
||||||
|
|
||||||
def get_templates(*args, **kwargs):
|
def get_templates(*args, **kwargs):
|
||||||
return []
|
return []
|
||||||
|
@ -232,12 +238,14 @@ cdef class Parser:
|
||||||
Base class of the DependencyParser and EntityRecognizer.
|
Base class of the DependencyParser and EntityRecognizer.
|
||||||
"""
|
"""
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, nr_class, token_vector_width=128, hidden_width=128, depth=1, **cfg):
|
def Model(cls, nr_class, token_vector_width=128, hidden_width=300, depth=1, **cfg):
|
||||||
depth = util.env_opt('parser_hidden_depth', depth)
|
depth = util.env_opt('parser_hidden_depth', depth)
|
||||||
token_vector_width = util.env_opt('token_vector_width', token_vector_width)
|
token_vector_width = util.env_opt('token_vector_width', token_vector_width)
|
||||||
hidden_width = util.env_opt('hidden_width', hidden_width)
|
hidden_width = util.env_opt('hidden_width', hidden_width)
|
||||||
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
|
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
|
||||||
tensors = Tok2Vec(token_vector_width, 7500, preprocess=doc2feats())
|
embed_size = util.env_opt('embed_size', 4000)
|
||||||
|
tensors = fine_tune(Tok2Vec(token_vector_width, embed_size,
|
||||||
|
preprocess=doc2feats()))
|
||||||
if parser_maxout_pieces == 1:
|
if parser_maxout_pieces == 1:
|
||||||
lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
|
lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
|
||||||
nF=cls.nr_feature,
|
nF=cls.nr_feature,
|
||||||
|
@ -249,10 +257,15 @@ cdef class Parser:
|
||||||
nI=token_vector_width)
|
nI=token_vector_width)
|
||||||
|
|
||||||
with Model.use_device('cpu'):
|
with Model.use_device('cpu'):
|
||||||
|
if depth == 0:
|
||||||
|
upper = chain()
|
||||||
|
upper.is_noop = True
|
||||||
|
else:
|
||||||
upper = chain(
|
upper = chain(
|
||||||
clone(Maxout(hidden_width), (depth-1)),
|
clone(Maxout(hidden_width), (depth-1)),
|
||||||
zero_init(Affine(nr_class, drop_factor=0.0))
|
zero_init(Affine(nr_class, drop_factor=0.0))
|
||||||
)
|
)
|
||||||
|
upper.is_noop = False
|
||||||
# TODO: This is an unfortunate hack atm!
|
# TODO: This is an unfortunate hack atm!
|
||||||
# Used to set input dimensions in network.
|
# Used to set input dimensions in network.
|
||||||
lower.begin_training(lower.ops.allocate((500, token_vector_width)))
|
lower.begin_training(lower.ops.allocate((500, token_vector_width)))
|
||||||
|
@ -290,6 +303,10 @@ cdef class Parser:
|
||||||
self.moves = self.TransitionSystem(self.vocab.strings, {})
|
self.moves = self.TransitionSystem(self.vocab.strings, {})
|
||||||
else:
|
else:
|
||||||
self.moves = moves
|
self.moves = moves
|
||||||
|
if 'beam_width' not in cfg:
|
||||||
|
cfg['beam_width'] = util.env_opt('beam_width', 1)
|
||||||
|
if 'beam_density' not in cfg:
|
||||||
|
cfg['beam_density'] = util.env_opt('beam_density', 0.0)
|
||||||
self.cfg = cfg
|
self.cfg = cfg
|
||||||
if 'actions' in self.cfg:
|
if 'actions' in self.cfg:
|
||||||
for action, labels in self.cfg.get('actions', {}).items():
|
for action, labels in self.cfg.get('actions', {}).items():
|
||||||
|
@ -312,7 +329,7 @@ cdef class Parser:
|
||||||
if beam_width is None:
|
if beam_width is None:
|
||||||
beam_width = self.cfg.get('beam_width', 1)
|
beam_width = self.cfg.get('beam_width', 1)
|
||||||
if beam_density is None:
|
if beam_density is None:
|
||||||
beam_density = self.cfg.get('beam_density', 0.001)
|
beam_density = self.cfg.get('beam_density', 0.0)
|
||||||
cdef Beam beam
|
cdef Beam beam
|
||||||
if beam_width == 1:
|
if beam_width == 1:
|
||||||
states = self.parse_batch([doc], [doc.tensor])
|
states = self.parse_batch([doc], [doc.tensor])
|
||||||
|
@ -328,7 +345,7 @@ cdef class Parser:
|
||||||
return output
|
return output
|
||||||
|
|
||||||
def pipe(self, docs, int batch_size=1000, int n_threads=2,
|
def pipe(self, docs, int batch_size=1000, int n_threads=2,
|
||||||
beam_width=1, beam_density=0.001):
|
beam_width=None, beam_density=None):
|
||||||
"""
|
"""
|
||||||
Process a stream of documents.
|
Process a stream of documents.
|
||||||
|
|
||||||
|
@ -340,15 +357,23 @@ cdef class Parser:
|
||||||
The number of threads with which to work on the buffer in parallel.
|
The number of threads with which to work on the buffer in parallel.
|
||||||
Yields (Doc): Documents, in order.
|
Yields (Doc): Documents, in order.
|
||||||
"""
|
"""
|
||||||
|
if beam_width is None:
|
||||||
|
beam_width = self.cfg.get('beam_width', 1)
|
||||||
|
if beam_density is None:
|
||||||
|
beam_density = self.cfg.get('beam_density', 0.0)
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
|
cdef Beam beam
|
||||||
for docs in cytoolz.partition_all(batch_size, docs):
|
for docs in cytoolz.partition_all(batch_size, docs):
|
||||||
docs = list(docs)
|
docs = list(docs)
|
||||||
tokvecs = [doc.tensor for doc in docs]
|
tokvecs = [doc.tensor for doc in docs]
|
||||||
if beam_width == 1:
|
if beam_width == 1:
|
||||||
parse_states = self.parse_batch(docs, tokvecs)
|
parse_states = self.parse_batch(docs, tokvecs)
|
||||||
else:
|
else:
|
||||||
parse_states = self.beam_parse(docs, tokvecs,
|
beams = self.beam_parse(docs, tokvecs,
|
||||||
beam_width=beam_width, beam_density=beam_density)
|
beam_width=beam_width, beam_density=beam_density)
|
||||||
|
parse_states = []
|
||||||
|
for beam in beams:
|
||||||
|
parse_states.append(<StateClass>beam.at(0))
|
||||||
self.set_annotations(docs, parse_states)
|
self.set_annotations(docs, parse_states)
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
|
@ -367,7 +392,8 @@ cdef class Parser:
|
||||||
tokvecses = [tokvecses]
|
tokvecses = [tokvecses]
|
||||||
|
|
||||||
tokvecs = self.model[0].ops.flatten(tokvecses)
|
tokvecs = self.model[0].ops.flatten(tokvecses)
|
||||||
tokvecs += self.model[0].ops.flatten(self.model[0](docs))
|
if USE_FINE_TUNE:
|
||||||
|
tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
|
||||||
|
|
||||||
nr_state = len(docs)
|
nr_state = len(docs)
|
||||||
nr_class = self.moves.n_moves
|
nr_class = self.moves.n_moves
|
||||||
|
@ -391,7 +417,14 @@ cdef class Parser:
|
||||||
cdef np.ndarray scores
|
cdef np.ndarray scores
|
||||||
c_token_ids = <int*>token_ids.data
|
c_token_ids = <int*>token_ids.data
|
||||||
c_is_valid = <int*>is_valid.data
|
c_is_valid = <int*>is_valid.data
|
||||||
|
cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
|
||||||
while not next_step.empty():
|
while not next_step.empty():
|
||||||
|
if not has_hidden:
|
||||||
|
for i in cython.parallel.prange(
|
||||||
|
next_step.size(), num_threads=6, nogil=True):
|
||||||
|
self._parse_step(next_step[i],
|
||||||
|
feat_weights, nr_class, nr_feat, nr_piece)
|
||||||
|
else:
|
||||||
for i in range(next_step.size()):
|
for i in range(next_step.size()):
|
||||||
st = next_step[i]
|
st = next_step[i]
|
||||||
st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
|
st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
|
||||||
|
@ -412,19 +445,22 @@ cdef class Parser:
|
||||||
next_step.push_back(st)
|
next_step.push_back(st)
|
||||||
return states
|
return states
|
||||||
|
|
||||||
def beam_parse(self, docs, tokvecses, int beam_width=8, float beam_density=0.001):
|
def beam_parse(self, docs, tokvecses, int beam_width=3, float beam_density=0.001):
|
||||||
cdef Beam beam
|
cdef Beam beam
|
||||||
cdef np.ndarray scores
|
cdef np.ndarray scores
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
cdef int nr_class = self.moves.n_moves
|
cdef int nr_class = self.moves.n_moves
|
||||||
cdef StateClass stcls, output
|
cdef StateClass stcls, output
|
||||||
tokvecs = self.model[0].ops.flatten(tokvecses)
|
tokvecs = self.model[0].ops.flatten(tokvecses)
|
||||||
tokvecs += self.model[0].ops.flatten(self.model[0](docs))
|
if USE_FINE_TUNE:
|
||||||
|
tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
|
||||||
cuda_stream = get_cuda_stream()
|
cuda_stream = get_cuda_stream()
|
||||||
state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
|
state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
|
||||||
cuda_stream, 0.0)
|
cuda_stream, 0.0)
|
||||||
beams = []
|
beams = []
|
||||||
cdef int offset = 0
|
cdef int offset = 0
|
||||||
|
cdef int j = 0
|
||||||
|
cdef int k
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
beam = Beam(nr_class, beam_width, min_density=beam_density)
|
beam = Beam(nr_class, beam_width, min_density=beam_density)
|
||||||
beam.initialize(self.moves.init_beam_state, doc.length, doc.c)
|
beam.initialize(self.moves.init_beam_state, doc.length, doc.c)
|
||||||
|
@ -437,22 +473,56 @@ cdef class Parser:
|
||||||
states = []
|
states = []
|
||||||
for i in range(beam.size):
|
for i in range(beam.size):
|
||||||
stcls = <StateClass>beam.at(i)
|
stcls = <StateClass>beam.at(i)
|
||||||
|
# This way we avoid having to score finalized states
|
||||||
|
# We do have to take care to keep indexes aligned, though
|
||||||
|
if not stcls.is_final():
|
||||||
states.append(stcls)
|
states.append(stcls)
|
||||||
token_ids = self.get_token_ids(states)
|
token_ids = self.get_token_ids(states)
|
||||||
vectors = state2vec(token_ids)
|
vectors = state2vec(token_ids)
|
||||||
scores = vec2scores(vectors)
|
scores = vec2scores(vectors)
|
||||||
|
j = 0
|
||||||
|
c_scores = <float*>scores.data
|
||||||
for i in range(beam.size):
|
for i in range(beam.size):
|
||||||
stcls = <StateClass>beam.at(i)
|
stcls = <StateClass>beam.at(i)
|
||||||
if not stcls.is_final():
|
if not stcls.is_final():
|
||||||
self.moves.set_valid(beam.is_valid[i], stcls.c)
|
self.moves.set_valid(beam.is_valid[i], stcls.c)
|
||||||
for j in range(nr_class):
|
for k in range(nr_class):
|
||||||
beam.scores[i][j] = scores[i, j]
|
beam.scores[i][k] = c_scores[j * scores.shape[1] + k]
|
||||||
|
j += 1
|
||||||
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
|
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
|
||||||
beam.check_done(_check_final_state, NULL)
|
beam.check_done(_check_final_state, NULL)
|
||||||
beams.append(beam)
|
beams.append(beam)
|
||||||
return beams
|
return beams
|
||||||
|
|
||||||
|
cdef void _parse_step(self, StateC* state,
|
||||||
|
const float* feat_weights,
|
||||||
|
int nr_class, int nr_feat, int nr_piece) nogil:
|
||||||
|
'''This only works with no hidden layers -- fast but inaccurate'''
|
||||||
|
#for i in cython.parallel.prange(next_step.size(), num_threads=4, nogil=True):
|
||||||
|
# self._parse_step(next_step[i], feat_weights, nr_class, nr_feat)
|
||||||
|
token_ids = <int*>calloc(nr_feat, sizeof(int))
|
||||||
|
scores = <float*>calloc(nr_class * nr_piece, sizeof(float))
|
||||||
|
is_valid = <int*>calloc(nr_class, sizeof(int))
|
||||||
|
|
||||||
|
state.set_context_tokens(token_ids, nr_feat)
|
||||||
|
sum_state_features(scores,
|
||||||
|
feat_weights, token_ids, 1, nr_feat, nr_class * nr_piece)
|
||||||
|
self.moves.set_valid(is_valid, state)
|
||||||
|
guess = arg_maxout_if_valid(scores, is_valid, nr_class, nr_piece)
|
||||||
|
action = self.moves.c[guess]
|
||||||
|
action.do(state, action.label)
|
||||||
|
|
||||||
|
free(is_valid)
|
||||||
|
free(scores)
|
||||||
|
free(token_ids)
|
||||||
|
|
||||||
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
||||||
|
if not any(self.moves.has_gold(gold) for gold in golds):
|
||||||
|
return None
|
||||||
|
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5:
|
||||||
|
return self.update_beam(docs_tokvecs, golds,
|
||||||
|
self.cfg['beam_width'], self.cfg['beam_density'],
|
||||||
|
drop=drop, sgd=sgd, losses=losses)
|
||||||
if losses is not None and self.name not in losses:
|
if losses is not None and self.name not in losses:
|
||||||
losses[self.name] = 0.
|
losses[self.name] = 0.
|
||||||
docs, tokvec_lists = docs_tokvecs
|
docs, tokvec_lists = docs_tokvecs
|
||||||
|
@ -460,9 +530,9 @@ cdef class Parser:
|
||||||
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
golds = [golds]
|
golds = [golds]
|
||||||
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs, drop=0.)
|
if USE_FINE_TUNE:
|
||||||
my_tokvecs = self.model[0].ops.flatten(my_tokvecs)
|
tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
|
||||||
tokvecs += my_tokvecs
|
tokvecs = self.model[0].ops.flatten(tokvecs)
|
||||||
|
|
||||||
cuda_stream = get_cuda_stream()
|
cuda_stream = get_cuda_stream()
|
||||||
|
|
||||||
|
@ -489,13 +559,14 @@ cdef class Parser:
|
||||||
scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
|
scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
|
||||||
|
|
||||||
d_scores = self.get_batch_loss(states, golds, scores)
|
d_scores = self.get_batch_loss(states, golds, scores)
|
||||||
d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd)
|
d_scores /= len(docs)
|
||||||
|
d_vector = bp_scores(d_scores, sgd=sgd)
|
||||||
if drop != 0:
|
if drop != 0:
|
||||||
d_vector *= mask
|
d_vector *= mask
|
||||||
|
|
||||||
if isinstance(self.model[0].ops, CupyOps) \
|
if isinstance(self.model[0].ops, CupyOps) \
|
||||||
and not isinstance(token_ids, state2vec.ops.xp.ndarray):
|
and not isinstance(token_ids, state2vec.ops.xp.ndarray):
|
||||||
# Move token_ids and d_vector to CPU, asynchronously
|
# Move token_ids and d_vector to GPU, asynchronously
|
||||||
backprops.append((
|
backprops.append((
|
||||||
get_async(cuda_stream, token_ids),
|
get_async(cuda_stream, token_ids),
|
||||||
get_async(cuda_stream, d_vector),
|
get_async(cuda_stream, d_vector),
|
||||||
|
@ -513,7 +584,63 @@ cdef class Parser:
|
||||||
self._make_updates(d_tokvecs,
|
self._make_updates(d_tokvecs,
|
||||||
backprops, sgd, cuda_stream)
|
backprops, sgd, cuda_stream)
|
||||||
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
|
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
|
||||||
#bp_my_tokvecs(d_tokvecs, sgd=sgd)
|
if USE_FINE_TUNE:
|
||||||
|
d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
|
||||||
|
return d_tokvecs
|
||||||
|
|
||||||
|
def update_beam(self, docs_tokvecs, golds, width=None, density=None,
|
||||||
|
drop=0., sgd=None, losses=None):
|
||||||
|
if not any(self.moves.has_gold(gold) for gold in golds):
|
||||||
|
return None
|
||||||
|
if not golds:
|
||||||
|
return None
|
||||||
|
if width is None:
|
||||||
|
width = self.cfg.get('beam_width', 2)
|
||||||
|
if density is None:
|
||||||
|
density = self.cfg.get('beam_density', 0.0)
|
||||||
|
if losses is not None and self.name not in losses:
|
||||||
|
losses[self.name] = 0.
|
||||||
|
docs, tokvecs = docs_tokvecs
|
||||||
|
lengths = [len(d) for d in docs]
|
||||||
|
assert min(lengths) >= 1
|
||||||
|
tokvecs = self.model[0].ops.flatten(tokvecs)
|
||||||
|
if USE_FINE_TUNE:
|
||||||
|
tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
|
||||||
|
tokvecs = self.model[0].ops.flatten(tokvecs)
|
||||||
|
|
||||||
|
states = self.moves.init_batch(docs)
|
||||||
|
for gold in golds:
|
||||||
|
self.moves.preprocess_gold(gold)
|
||||||
|
|
||||||
|
cuda_stream = get_cuda_stream()
|
||||||
|
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0)
|
||||||
|
|
||||||
|
states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
|
||||||
|
states, tokvecs, golds,
|
||||||
|
state2vec, vec2scores,
|
||||||
|
width, density,
|
||||||
|
sgd=sgd, drop=drop, losses=losses)
|
||||||
|
backprop_lower = []
|
||||||
|
cdef float batch_size = len(docs)
|
||||||
|
for i, d_scores in enumerate(states_d_scores):
|
||||||
|
d_scores /= batch_size
|
||||||
|
if losses is not None:
|
||||||
|
losses[self.name] += (d_scores**2).sum()
|
||||||
|
ids, bp_vectors, bp_scores = backprops[i]
|
||||||
|
d_vector = bp_scores(d_scores, sgd=sgd)
|
||||||
|
if isinstance(self.model[0].ops, CupyOps) \
|
||||||
|
and not isinstance(ids, state2vec.ops.xp.ndarray):
|
||||||
|
backprop_lower.append((
|
||||||
|
get_async(cuda_stream, ids),
|
||||||
|
get_async(cuda_stream, d_vector),
|
||||||
|
bp_vectors))
|
||||||
|
else:
|
||||||
|
backprop_lower.append((ids, d_vector, bp_vectors))
|
||||||
|
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
|
||||||
|
self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream)
|
||||||
|
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths)
|
||||||
|
if USE_FINE_TUNE:
|
||||||
|
d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
|
||||||
return d_tokvecs
|
return d_tokvecs
|
||||||
|
|
||||||
def _init_gold_batch(self, whole_docs, whole_golds):
|
def _init_gold_batch(self, whole_docs, whole_golds):
|
||||||
|
@@ -559,14 +686,10 @@ cdef class Parser:
         xp = get_array_module(d_tokvecs)
         for ids, d_vector, bp_vector in backprops:
             d_state_features = bp_vector(d_vector, sgd=sgd)
-            active_feats = ids * (ids >= 0)
-            active_feats = active_feats.reshape((ids.shape[0], ids.shape[1], 1))
-            if hasattr(xp, 'scatter_add'):
-                xp.scatter_add(d_tokvecs,
-                    ids, d_state_features * active_feats)
-            else:
-                xp.add.at(d_tokvecs,
-                    ids, d_state_features * active_feats)
+            mask = ids >= 0
+            d_state_features *= mask.reshape(ids.shape + (1,))
+            self.model[0].ops.scatter_add(d_tokvecs, ids * mask,
+                                          d_state_features)
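Note: the rewritten _make_updates relies on ops.scatter_add with a mask over padded (-1) feature ids. A NumPy-only sketch of the same accumulation, assuming toy shapes and using numpy.add.at in place of the backend's scatter_add:

import numpy

def accumulate_token_gradients(d_tokvecs, ids, d_state_features):
    # ids: (n_states, n_feats) token indices, -1 for unused feature slots.
    # d_state_features: (n_states, n_feats, width) gradients per slot.
    mask = ids >= 0
    d_state_features = d_state_features * mask[..., None]
    # Clamp padded ids to 0; their rows are already zeroed by the mask.
    numpy.add.at(d_tokvecs, ids * mask, d_state_features)
    return d_tokvecs

d_tokvecs = numpy.zeros((5, 3), dtype='f')
ids = numpy.array([[0, 2, -1]])
d_state_features = numpy.ones((1, 3, 3), dtype='f')
accumulate_token_gradients(d_tokvecs, ids, d_state_features)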
@property
|
@property
|
||||||
def move_names(self):
|
def move_names(self):
|
||||||
|
@ -582,7 +705,7 @@ cdef class Parser:
|
||||||
lower, stream, drop=dropout)
|
lower, stream, drop=dropout)
|
||||||
return state2vec, upper
|
return state2vec, upper
|
||||||
|
|
||||||
nr_feature = 13
|
nr_feature = 8
|
||||||
|
|
||||||
def get_token_ids(self, states):
|
def get_token_ids(self, states):
|
||||||
cdef StateClass state
|
cdef StateClass state
|
||||||
|
|
|
@ -99,6 +99,9 @@ cdef class TransitionSystem:
|
||||||
def preprocess_gold(self, GoldParse gold):
|
def preprocess_gold(self, GoldParse gold):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def is_gold_parse(self, StateClass state, GoldParse gold):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
cdef Transition lookup_transition(self, object name) except *:
|
cdef Transition lookup_transition(self, object name) except *:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@ -145,7 +148,7 @@ cdef class TransitionSystem:
|
||||||
|
|
||||||
def add_action(self, int action, label_name):
|
def add_action(self, int action, label_name):
|
||||||
cdef attr_t label_id
|
cdef attr_t label_id
|
||||||
if not isinstance(label_name, int):
|
if not isinstance(label_name, (int, long)):
|
||||||
label_id = self.strings.add(label_name)
|
label_id = self.strings.add(label_name)
|
||||||
else:
|
else:
|
||||||
label_id = label_name
|
label_id = label_name
|
||||||
|
|
|
@ -11,9 +11,9 @@ from ..strings import StringStore
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb',
|
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id',
|
||||||
'nl', 'pl', 'pt', 'sv', 'xx']
|
'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx']
|
||||||
_models = {'en': ['en_depent_web_sm', 'en_core_web_md'],
|
_models = {'en': ['en_core_web_sm'],
|
||||||
'de': ['de_core_news_md'],
|
'de': ['de_core_news_md'],
|
||||||
'fr': ['fr_depvec_web_lg'],
|
'fr': ['fr_depvec_web_lg'],
|
||||||
'xx': ['xx_ent_web_md']}
|
'xx': ['xx_ent_web_md']}
|
||||||
|
@ -86,6 +86,9 @@ def hu_tokenizer():
|
||||||
def fi_tokenizer():
|
def fi_tokenizer():
|
||||||
return util.get_lang_class('fi').Defaults.create_tokenizer()
|
return util.get_lang_class('fi').Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def id_tokenizer():
|
||||||
|
return util.get_lang_class('id').Defaults.create_tokenizer()
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def sv_tokenizer():
|
def sv_tokenizer():
|
||||||
|
|
|
@ -2,12 +2,18 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
from ....tokens.doc import Doc
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def en_lemmatizer(EN):
|
def en_lemmatizer(EN):
|
||||||
return EN.Defaults.create_lemmatizer()
|
return EN.Defaults.create_lemmatizer()
|
||||||
|
|
||||||
|
@pytest.mark.models('en')
|
||||||
|
def test_doc_lemmatization(EN):
|
||||||
|
doc = Doc(EN.vocab, words=['bleed'])
|
||||||
|
doc[0].tag_ = 'VBP'
|
||||||
|
assert doc[0].lemma_ == 'bleed'
|
||||||
|
|
||||||
@pytest.mark.models('en')
|
@pytest.mark.models('en')
|
||||||
@pytest.mark.parametrize('text,lemmas', [("aardwolves", ["aardwolf"]),
|
@pytest.mark.parametrize('text,lemmas', [("aardwolves", ["aardwolf"]),
|
||||||
|
@ -19,6 +25,16 @@ def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
|
||||||
assert en_lemmatizer.noun(text) == set(lemmas)
|
assert en_lemmatizer.noun(text) == set(lemmas)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models('en')
|
||||||
|
@pytest.mark.parametrize('text,lemmas', [("bleed", ["bleed"]),
|
||||||
|
("feed", ["feed"]),
|
||||||
|
("need", ["need"]),
|
||||||
|
("ring", ["ring"]),
|
||||||
|
("axes", ["axis", "axe", "ax"])])
|
||||||
|
def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
|
||||||
|
assert en_lemmatizer.noun(text) == set(lemmas)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
@pytest.mark.xfail
|
||||||
@pytest.mark.models('en')
|
@pytest.mark.models('en')
|
||||||
def test_en_lemmatizer_base_forms(en_lemmatizer):
|
def test_en_lemmatizer_base_forms(en_lemmatizer):
|
||||||
|
|
|
@ -25,7 +25,6 @@ def test_tag_names(EN):
|
||||||
doc = EN(text, disable=['parser'])
|
doc = EN(text, disable=['parser'])
|
||||||
assert type(doc[2].pos) == int
|
assert type(doc[2].pos) == int
|
||||||
assert isinstance(doc[2].pos_, six.text_type)
|
assert isinstance(doc[2].pos_, six.text_type)
|
||||||
assert type(doc[2].dep) == int
|
|
||||||
assert isinstance(doc[2].dep_, six.text_type)
|
assert isinstance(doc[2].dep_, six.text_type)
|
||||||
assert doc[2].tag_ == u'NNS'
|
assert doc[2].tag_ == u'NNS'
|
||||||
|
|
||||||
|
|
0
spacy/tests/lang/id/__init__.py
Normal file
0
spacy/tests/lang/id/__init__.py
Normal file
115
spacy/tests/lang/id/test_prefix_suffix_infix.py
Normal file
115
spacy/tests/lang/id/test_prefix_suffix_infix.py
Normal file
|
@ -0,0 +1,115 @@
|
||||||
|
# coding: utf-8
|
||||||
|
"""Test that tokenizer prefixes, suffixes and infixes are handled correctly."""
|
||||||
|
|
||||||
|
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["(Ma'arif)"])
|
||||||
|
def test_tokenizer_splits_no_special(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["Ma'arif"])
|
||||||
|
def test_tokenizer_splits_no_punct(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 1
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["(Ma'arif"])
|
||||||
|
def test_tokenizer_splits_prefix_punct(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 2
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["Ma'arif)"])
|
||||||
|
def test_tokenizer_splits_suffix_punct(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 2
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["(Ma'arif)"])
|
||||||
|
def test_tokenizer_splits_even_wrap(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["(Ma'arif?)"])
|
||||||
|
def test_tokenizer_splits_uneven_wrap(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 4
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text,length', [("S.Kom.", 1), ("SKom.", 2), ("(S.Kom.", 2)])
|
||||||
|
def test_tokenizer_splits_prefix_interact(id_tokenizer, text, length):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == length
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["S.Kom.)"])
|
||||||
|
def test_tokenizer_splits_suffix_interact(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 2
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["(S.Kom.)"])
|
||||||
|
def test_tokenizer_splits_even_wrap_interact(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["(S.Kom.?)"])
|
||||||
|
def test_tokenizer_splits_uneven_wrap_interact(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 4
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text,length', [("gara-gara", 1), ("Jokowi-Ahok", 3), ("Sukarno-Hatta", 3)])
|
||||||
|
def test_tokenizer_splits_hyphens(id_tokenizer, text, length):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == length
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
|
||||||
|
def test_tokenizer_splits_numeric_range(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["ini.Budi", "Halo.Bandung"])
|
||||||
|
def test_tokenizer_splits_period_infix(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["Halo,Bandung", "satu,dua"])
|
||||||
|
def test_tokenizer_splits_comma_infix(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
assert tokens[0].text == text.split(",")[0]
|
||||||
|
assert tokens[1].text == ","
|
||||||
|
assert tokens[2].text == text.split(",")[1]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["halo...Bandung", "dia...pergi"])
|
||||||
|
def test_tokenizer_splits_ellipsis_infix(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_tokenizer_splits_double_hyphen_infix(id_tokenizer):
|
||||||
|
tokens = id_tokenizer("Arsene Wenger--manajer Arsenal--melakukan konferensi pers.")
|
||||||
|
assert len(tokens) == 10
|
||||||
|
assert tokens[0].text == "Arsene"
|
||||||
|
assert tokens[1].text == "Wenger"
|
||||||
|
assert tokens[2].text == "--"
|
||||||
|
assert tokens[3].text == "manajer"
|
||||||
|
assert tokens[4].text == "Arsenal"
|
||||||
|
assert tokens[5].text == "--"
|
||||||
|
assert tokens[6].text == "melakukan"
|
||||||
|
assert tokens[7].text == "konferensi"
|
||||||
|
assert tokens[8].text == "pers"
|
||||||
|
assert tokens[9].text == "."
|
|
@ -78,3 +78,16 @@ def test_predict_doc_beam(parser, tok2vec, model, doc):
|
||||||
parser(doc, beam_width=32, beam_density=0.001)
|
parser(doc, beam_width=32, beam_density=0.001)
|
||||||
for word in doc:
|
for word in doc:
|
||||||
print(word.text, word.head, word.dep_)
|
print(word.text, word.head, word.dep_)
|
||||||
|
|
||||||
|
|
||||||
|
def test_update_doc_beam(parser, tok2vec, model, doc, gold):
|
||||||
|
parser.model = model
|
||||||
|
tokvecs, bp_tokvecs = tok2vec.begin_update([doc])
|
||||||
|
d_tokvecs = parser.update_beam(([doc], tokvecs), [gold])
|
||||||
|
assert d_tokvecs[0].shape == tokvecs[0].shape
|
||||||
|
def optimize(weights, gradient, key=None):
|
||||||
|
weights -= 0.001 * gradient
|
||||||
|
bp_tokvecs(d_tokvecs, sgd=optimize)
|
||||||
|
assert d_tokvecs[0].sum() == 0.
|
||||||
|
|
||||||
|
|
||||||
|
|
87
spacy/tests/parser/test_nn_beam.py
Normal file
87
spacy/tests/parser/test_nn_beam.py
Normal file
|
@ -0,0 +1,87 @@
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
import pytest
|
||||||
|
import numpy
|
||||||
|
from thinc.api import layerize
|
||||||
|
|
||||||
|
from ...vocab import Vocab
|
||||||
|
from ...syntax.arc_eager import ArcEager
|
||||||
|
from ...tokens import Doc
|
||||||
|
from ...gold import GoldParse
|
||||||
|
from ...syntax._beam_utils import ParserBeam, update_beam
|
||||||
|
from ...syntax.stateclass import StateClass
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def vocab():
|
||||||
|
return Vocab()
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def moves(vocab):
|
||||||
|
aeager = ArcEager(vocab.strings, {})
|
||||||
|
aeager.add_action(2, 'nsubj')
|
||||||
|
aeager.add_action(3, 'dobj')
|
||||||
|
aeager.add_action(2, 'aux')
|
||||||
|
return aeager
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def docs(vocab):
|
||||||
|
return [Doc(vocab, words=['Rats', 'bite', 'things'])]
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def states(docs):
|
||||||
|
return [StateClass(doc) for doc in docs]
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def tokvecs(docs, vector_size):
|
||||||
|
output = []
|
||||||
|
for doc in docs:
|
||||||
|
vec = numpy.random.uniform(-0.1, 0.1, (len(doc), vector_size))
|
||||||
|
output.append(numpy.asarray(vec))
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def golds(docs):
|
||||||
|
return [GoldParse(doc) for doc in docs]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def batch_size(docs):
|
||||||
|
return len(docs)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def beam_width():
|
||||||
|
return 4
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def vector_size():
|
||||||
|
return 6
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def beam(moves, states, golds, beam_width):
|
||||||
|
return ParserBeam(moves, states, golds, width=beam_width, density=0.0)
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def scores(moves, batch_size, beam_width):
|
||||||
|
return [
|
||||||
|
numpy.asarray(
|
||||||
|
numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)),
|
||||||
|
dtype='f')
|
||||||
|
for _ in range(batch_size)]
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_beam(beam):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def test_beam_advance(beam, scores):
|
||||||
|
beam.advance(scores)
|
||||||
|
|
||||||
|
|
||||||
|
def test_beam_advance_too_few_scores(beam, scores):
|
||||||
|
with pytest.raises(IndexError):
|
||||||
|
beam.advance(scores[:-1])
|
12
spacy/tests/regression/test_issue1257.py
Normal file
12
spacy/tests/regression/test_issue1257.py
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
'''Test tokens compare correctly'''
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ..util import get_doc
|
||||||
|
from ...vocab import Vocab
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue1257():
|
||||||
|
doc1 = get_doc(Vocab(), ['a', 'b', 'c'])
|
||||||
|
doc2 = get_doc(Vocab(), ['a', 'c', 'e'])
|
||||||
|
assert doc1[0] != doc2[0]
|
||||||
|
assert not doc1[0] == doc2[0]
|
8
spacy/tests/regression/test_issue1305.py
Normal file
8
spacy/tests/regression/test_issue1305.py
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
@pytest.mark.models('en')
|
||||||
|
def test_issue1305(EN):
|
||||||
|
'''Test lemmatization of English VBZ'''
|
||||||
|
assert EN.vocab.morphology.lemmatizer('works', 'verb') == set(['work'])
|
||||||
|
doc = EN(u'This app works well')
|
||||||
|
assert doc[2].lemma_ == 'work'
|
|
@ -13,7 +13,10 @@ def test_issue429(EN):
|
||||||
return None
|
return None
|
||||||
spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches]
|
spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches]
|
||||||
for ent_id, label, span in spans:
|
for ent_id, label, span in spans:
|
||||||
span.merge('NNP' if label else span.root.tag_, span.text, EN.vocab.strings[label])
|
span.merge(
|
||||||
|
tag=('NNP' if label else span.root.tag_),
|
||||||
|
lemma=span.text,
|
||||||
|
label='PERSON')
|
||||||
|
|
||||||
doc = EN('a')
|
doc = EN('a')
|
||||||
matcher = Matcher(EN.vocab)
|
matcher = Matcher(EN.vocab)
|
||||||
|
|
|
@ -11,8 +11,8 @@ import pytest
|
||||||
def taggers(en_vocab):
|
def taggers(en_vocab):
|
||||||
tagger1 = Tagger(en_vocab)
|
tagger1 = Tagger(en_vocab)
|
||||||
tagger2 = Tagger(en_vocab)
|
tagger2 = Tagger(en_vocab)
|
||||||
tagger1.model = tagger1.Model(None, None)
|
tagger1.model = tagger1.Model(8, 8)
|
||||||
tagger2.model = tagger2.Model(None, None)
|
tagger2.model = tagger1.model
|
||||||
return (tagger1, tagger2)
|
return (tagger1, tagger2)
|
||||||
|
|
||||||
|
|
||||||
|
@ -20,7 +20,6 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
|
||||||
tagger1, tagger2 = taggers
|
tagger1, tagger2 = taggers
|
||||||
tagger1_b = tagger1.to_bytes()
|
tagger1_b = tagger1.to_bytes()
|
||||||
tagger2_b = tagger2.to_bytes()
|
tagger2_b = tagger2.to_bytes()
|
||||||
assert tagger1_b == tagger2_b
|
|
||||||
tagger1 = tagger1.from_bytes(tagger1_b)
|
tagger1 = tagger1.from_bytes(tagger1_b)
|
||||||
assert tagger1.to_bytes() == tagger1_b
|
assert tagger1.to_bytes() == tagger1_b
|
||||||
new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b)
|
new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b)
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..util import get_doc
|
from ..util import get_doc
|
||||||
|
from ...attrs import ORTH, LENGTH
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
@ -89,3 +90,19 @@ def test_spans_are_hashable(en_tokenizer):
|
||||||
span3 = tokens[0:2]
|
span3 = tokens[0:2]
|
||||||
assert hash(span3) == hash(span1)
|
assert hash(span3) == hash(span1)
|
||||||
|
|
||||||
|
|
||||||
|
def test_spans_by_character(doc):
|
||||||
|
span1 = doc[1:-2]
|
||||||
|
span2 = doc.char_span(span1.start_char, span1.end_char, label='GPE')
|
||||||
|
assert span1.start_char == span2.start_char
|
||||||
|
assert span1.end_char == span2.end_char
|
||||||
|
assert span2.label_ == 'GPE'
|
||||||
|
|
||||||
|
|
||||||
|
def test_span_to_array(doc):
|
||||||
|
span = doc[1:-2]
|
||||||
|
arr = span.to_array([ORTH, LENGTH])
|
||||||
|
assert arr.shape == (len(span), 2)
|
||||||
|
assert arr[0, 0] == span[0].orth
|
||||||
|
assert arr[0, 1] == len(span[0])
|
||||||
|
|
||||||
|
|
|
@ -79,9 +79,9 @@ def add_vecs_to_vocab(vocab, vectors):
|
||||||
"""Add list of vector tuples to given vocab. All vectors need to have the
|
"""Add list of vector tuples to given vocab. All vectors need to have the
|
||||||
same length. Format: [("text", [1, 2, 3])]"""
|
same length. Format: [("text", [1, 2, 3])]"""
|
||||||
length = len(vectors[0][1])
|
length = len(vectors[0][1])
|
||||||
vocab.resize_vectors(length)
|
vocab.clear_vectors(length)
|
||||||
for word, vec in vectors:
|
for word, vec in vectors:
|
||||||
vocab[word].vector = vec
|
vocab.set_vector(word, vec)
|
||||||
return vocab
|
return vocab
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -14,10 +14,9 @@ def vectors():
|
||||||
|
|
||||||
@pytest.fixture()
|
@pytest.fixture()
|
||||||
def vocab(en_vocab, vectors):
|
def vocab(en_vocab, vectors):
|
||||||
#return add_vecs_to_vocab(en_vocab, vectors)
|
add_vecs_to_vocab(en_vocab, vectors)
|
||||||
return None
|
return en_vocab
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_vectors_similarity_LL(vocab, vectors):
|
def test_vectors_similarity_LL(vocab, vectors):
|
||||||
[(word1, vec1), (word2, vec2)] = vectors
|
[(word1, vec1), (word2, vec2)] = vectors
|
||||||
lex1 = vocab[word1]
|
lex1 = vocab[word1]
|
||||||
|
@ -31,7 +30,6 @@ def test_vectors_similarity_LL(vocab, vectors):
|
||||||
assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1))
|
assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1))
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_vectors_similarity_TT(vocab, vectors):
|
def test_vectors_similarity_TT(vocab, vectors):
|
||||||
[(word1, vec1), (word2, vec2)] = vectors
|
[(word1, vec1), (word2, vec2)] = vectors
|
||||||
doc = get_doc(vocab, words=[word1, word2])
|
doc = get_doc(vocab, words=[word1, word2])
|
||||||
|
@ -44,21 +42,18 @@ def test_vectors_similarity_TT(vocab, vectors):
|
||||||
assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1]))
|
assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1]))
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_vectors_similarity_TD(vocab, vectors):
|
def test_vectors_similarity_TD(vocab, vectors):
|
||||||
[(word1, vec1), (word2, vec2)] = vectors
|
[(word1, vec1), (word2, vec2)] = vectors
|
||||||
doc = get_doc(vocab, words=[word1, word2])
|
doc = get_doc(vocab, words=[word1, word2])
|
||||||
assert doc.similarity(doc[0]) == doc[0].similarity(doc)
|
assert doc.similarity(doc[0]) == doc[0].similarity(doc)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_vectors_similarity_DS(vocab, vectors):
|
def test_vectors_similarity_DS(vocab, vectors):
|
||||||
[(word1, vec1), (word2, vec2)] = vectors
|
[(word1, vec1), (word2, vec2)] = vectors
|
||||||
doc = get_doc(vocab, words=[word1, word2])
|
doc = get_doc(vocab, words=[word1, word2])
|
||||||
assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
|
assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_vectors_similarity_TS(vocab, vectors):
|
def test_vectors_similarity_TS(vocab, vectors):
|
||||||
[(word1, vec1), (word2, vec2)] = vectors
|
[(word1, vec1), (word2, vec2)] = vectors
|
||||||
doc = get_doc(vocab, words=[word1, word2])
|
doc = get_doc(vocab, words=[word1, word2])
|
||||||
|
|
|
@ -2,6 +2,8 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...vectors import Vectors
|
from ...vectors import Vectors
|
||||||
|
from ...tokenizer import Tokenizer
|
||||||
|
from ..util import add_vecs_to_vocab, get_doc
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
import pytest
|
import pytest
|
||||||
|
@ -11,22 +13,42 @@ import pytest
|
||||||
def strings():
|
def strings():
|
||||||
return ["apple", "orange"]
|
return ["apple", "orange"]
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def vectors():
|
||||||
|
return [
|
||||||
|
("apple", [1, 2, 3]),
|
||||||
|
("orange", [-1, -2, -3]),
|
||||||
|
('and', [-1, -1, -1]),
|
||||||
|
('juice', [5, 5, 10]),
|
||||||
|
('pie', [7, 6.3, 8.9])]
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def data():
|
def data():
|
||||||
return numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype='f')
|
return numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype='f')
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def vocab(en_vocab, vectors):
|
||||||
|
add_vecs_to_vocab(en_vocab, vectors)
|
||||||
|
return en_vocab
|
||||||
|
|
||||||
|
|
||||||
def test_init_vectors_with_data(strings, data):
|
def test_init_vectors_with_data(strings, data):
|
||||||
v = Vectors(strings, data)
|
v = Vectors(strings, data)
|
||||||
assert v.shape == data.shape
|
assert v.shape == data.shape
|
||||||
|
|
||||||
def test_init_vectors_with_width(strings):
|
def test_init_vectors_with_width(strings):
|
||||||
v = Vectors(strings, 3)
|
v = Vectors(strings, 3)
|
||||||
|
for string in strings:
|
||||||
|
v.add(string)
|
||||||
assert v.shape == (len(strings), 3)
|
assert v.shape == (len(strings), 3)
|
||||||
|
|
||||||
|
|
||||||
def test_get_vector(strings, data):
|
def test_get_vector(strings, data):
|
||||||
v = Vectors(strings, data)
|
v = Vectors(strings, data)
|
||||||
|
for string in strings:
|
||||||
|
v.add(string)
|
||||||
assert list(v[strings[0]]) == list(data[0])
|
assert list(v[strings[0]]) == list(data[0])
|
||||||
assert list(v[strings[0]]) != list(data[1])
|
assert list(v[strings[0]]) != list(data[1])
|
||||||
assert list(v[strings[1]]) != list(data[0])
|
assert list(v[strings[1]]) != list(data[0])
|
||||||
|
@ -35,6 +57,8 @@ def test_get_vector(strings, data):
|
||||||
def test_set_vector(strings, data):
|
def test_set_vector(strings, data):
|
||||||
orig = data.copy()
|
orig = data.copy()
|
||||||
v = Vectors(strings, data)
|
v = Vectors(strings, data)
|
||||||
|
for string in strings:
|
||||||
|
v.add(string)
|
||||||
assert list(v[strings[0]]) == list(orig[0])
|
assert list(v[strings[0]]) == list(orig[0])
|
||||||
assert list(v[strings[0]]) != list(orig[1])
|
assert list(v[strings[0]]) != list(orig[1])
|
||||||
v[strings[0]] = data[1]
|
v[strings[0]] = data[1]
|
||||||
|
@ -42,125 +66,111 @@ def test_set_vector(strings, data):
|
||||||
assert list(v[strings[0]]) != list(orig[0])
|
assert list(v[strings[0]]) != list(orig[0])
|
||||||
|
|
||||||
|
|
||||||
#
|
|
||||||
#@pytest.fixture()
|
@pytest.fixture()
|
||||||
#def tokenizer_v(vocab):
|
def tokenizer_v(vocab):
|
||||||
# return Tokenizer(vocab, {}, None, None, None)
|
return Tokenizer(vocab, {}, None, None, None)
|
||||||
#
|
|
||||||
#
|
|
||||||
#@pytest.mark.xfail
|
@pytest.mark.parametrize('text', ["apple and orange"])
|
||||||
#@pytest.mark.parametrize('text', ["apple and orange"])
|
def test_vectors_token_vector(tokenizer_v, vectors, text):
|
||||||
#def test_vectors_token_vector(tokenizer_v, vectors, text):
|
doc = tokenizer_v(text)
|
||||||
# doc = tokenizer_v(text)
|
assert vectors[0] == (doc[0].text, list(doc[0].vector))
|
||||||
# assert vectors[0] == (doc[0].text, list(doc[0].vector))
|
assert vectors[1] == (doc[2].text, list(doc[2].vector))
|
||||||
# assert vectors[1] == (doc[2].text, list(doc[2].vector))
|
|
||||||
#
|
|
||||||
#
|
@pytest.mark.parametrize('text', ["apple", "orange"])
|
||||||
#@pytest.mark.xfail
|
def test_vectors_lexeme_vector(vocab, text):
|
||||||
#@pytest.mark.parametrize('text', ["apple", "orange"])
|
lex = vocab[text]
|
||||||
#def test_vectors_lexeme_vector(vocab, text):
|
assert list(lex.vector)
|
||||||
# lex = vocab[text]
|
assert lex.vector_norm
|
||||||
# assert list(lex.vector)
|
|
||||||
# assert lex.vector_norm
|
|
||||||
#
|
@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
|
||||||
#
|
def test_vectors_doc_vector(vocab, text):
|
||||||
#@pytest.mark.xfail
|
doc = get_doc(vocab, text)
|
||||||
#@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
|
assert list(doc.vector)
|
||||||
#def test_vectors_doc_vector(vocab, text):
|
assert doc.vector_norm
|
||||||
# doc = get_doc(vocab, text)
|
|
||||||
# assert list(doc.vector)
|
|
||||||
# assert doc.vector_norm
|
@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
|
||||||
#
|
def test_vectors_span_vector(vocab, text):
|
||||||
#
|
span = get_doc(vocab, text)[0:2]
|
||||||
#@pytest.mark.xfail
|
assert list(span.vector)
|
||||||
#@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
|
assert span.vector_norm
|
||||||
#def test_vectors_span_vector(vocab, text):
|
|
||||||
# span = get_doc(vocab, text)[0:2]
|
|
||||||
# assert list(span.vector)
|
@pytest.mark.parametrize('text', ["apple orange"])
|
||||||
# assert span.vector_norm
|
def test_vectors_token_token_similarity(tokenizer_v, text):
|
||||||
#
|
doc = tokenizer_v(text)
|
||||||
#
|
assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
|
||||||
#@pytest.mark.xfail
|
assert -1. < doc[0].similarity(doc[1]) < 1.0
|
||||||
#@pytest.mark.parametrize('text', ["apple orange"])
|
|
||||||
#def test_vectors_token_token_similarity(tokenizer_v, text):
|
|
||||||
# doc = tokenizer_v(text)
|
@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
|
||||||
# assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
|
def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
|
||||||
# assert 0.0 < doc[0].similarity(doc[1]) < 1.0
|
token = tokenizer_v(text1)
|
||||||
#
|
lex = vocab[text2]
|
||||||
#
|
assert token.similarity(lex) == lex.similarity(token)
|
||||||
#@pytest.mark.xfail
|
assert -1. < token.similarity(lex) < 1.0
|
||||||
#@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
|
|
||||||
#def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
|
|
||||||
# token = tokenizer_v(text1)
|
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||||
# lex = vocab[text2]
|
def test_vectors_token_span_similarity(vocab, text):
|
||||||
# assert token.similarity(lex) == lex.similarity(token)
|
doc = get_doc(vocab, text)
|
||||||
# assert 0.0 < token.similarity(lex) < 1.0
|
assert doc[0].similarity(doc[1:3]) == doc[1:3].similarity(doc[0])
|
||||||
#
|
assert -1. < doc[0].similarity(doc[1:3]) < 1.0
|
||||||
#
|
|
||||||
#@pytest.mark.xfail
|
|
||||||
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||||
#def test_vectors_token_span_similarity(vocab, text):
|
def test_vectors_token_doc_similarity(vocab, text):
|
||||||
# doc = get_doc(vocab, text)
|
doc = get_doc(vocab, text)
|
||||||
# assert doc[0].similarity(doc[1:3]) == doc[1:3].similarity(doc[0])
|
assert doc[0].similarity(doc) == doc.similarity(doc[0])
|
||||||
# assert 0.0 < doc[0].similarity(doc[1:3]) < 1.0
|
assert -1. < doc[0].similarity(doc) < 1.0
|
||||||
#
|
|
||||||
#
|
|
||||||
#@pytest.mark.xfail
|
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||||
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
def test_vectors_lexeme_span_similarity(vocab, text):
|
||||||
#def test_vectors_token_doc_similarity(vocab, text):
|
doc = get_doc(vocab, text)
|
||||||
# doc = get_doc(vocab, text)
|
lex = vocab[text[0]]
|
||||||
# assert doc[0].similarity(doc) == doc.similarity(doc[0])
|
assert lex.similarity(doc[1:3]) == doc[1:3].similarity(lex)
|
||||||
# assert 0.0 < doc[0].similarity(doc) < 1.0
|
assert -1. < doc.similarity(doc[1:3]) < 1.0
|
||||||
#
|
|
||||||
#
|
|
||||||
#@pytest.mark.xfail
|
@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
|
||||||
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
|
||||||
#def test_vectors_lexeme_span_similarity(vocab, text):
|
lex1 = vocab[text1]
|
||||||
# doc = get_doc(vocab, text)
|
lex2 = vocab[text2]
|
||||||
# lex = vocab[text[0]]
|
assert lex1.similarity(lex2) == lex2.similarity(lex1)
|
||||||
# assert lex.similarity(doc[1:3]) == doc[1:3].similarity(lex)
|
assert -1. < lex1.similarity(lex2) < 1.0
|
||||||
# assert 0.0 < doc.similarity(doc[1:3]) < 1.0
|
|
||||||
#
|
|
||||||
#
|
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||||
#@pytest.mark.xfail
|
def test_vectors_lexeme_doc_similarity(vocab, text):
|
||||||
#@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
|
doc = get_doc(vocab, text)
|
||||||
#def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
|
lex = vocab[text[0]]
|
||||||
# lex1 = vocab[text1]
|
assert lex.similarity(doc) == doc.similarity(lex)
|
||||||
# lex2 = vocab[text2]
|
assert -1. < lex.similarity(doc) < 1.0
|
||||||
# assert lex1.similarity(lex2) == lex2.similarity(lex1)
|
|
||||||
# assert 0.0 < lex1.similarity(lex2) < 1.0
|
|
||||||
#
|
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||||
#
|
def test_vectors_span_span_similarity(vocab, text):
|
||||||
#@pytest.mark.xfail
|
doc = get_doc(vocab, text)
|
||||||
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
|
||||||
#def test_vectors_lexeme_doc_similarity(vocab, text):
|
assert -1. < doc[0:2].similarity(doc[1:3]) < 1.0
|
||||||
# doc = get_doc(vocab, text)
|
|
||||||
# lex = vocab[text[0]]
|
|
||||||
# assert lex.similarity(doc) == doc.similarity(lex)
|
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||||
# assert 0.0 < lex.similarity(doc) < 1.0
|
def test_vectors_span_doc_similarity(vocab, text):
|
||||||
#
|
doc = get_doc(vocab, text)
|
||||||
#
|
assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
|
||||||
#@pytest.mark.xfail
|
assert -1. < doc[0:2].similarity(doc) < 1.0
|
||||||
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
|
||||||
#def test_vectors_span_span_similarity(vocab, text):
|
|
||||||
# doc = get_doc(vocab, text)
|
@pytest.mark.parametrize('text1,text2', [
|
||||||
# assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
|
(["apple", "and", "apple", "pie"], ["orange", "juice"])])
|
||||||
# assert 0.0 < doc[0:2].similarity(doc[1:3]) < 1.0
|
def test_vectors_doc_doc_similarity(vocab, text1, text2):
|
||||||
#
|
doc1 = get_doc(vocab, text1)
|
||||||
#
|
doc2 = get_doc(vocab, text2)
|
||||||
#@pytest.mark.xfail
|
assert doc1.similarity(doc2) == doc2.similarity(doc1)
|
||||||
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
assert -1. < doc1.similarity(doc2) < 1.0
|
||||||
#def test_vectors_span_doc_similarity(vocab, text):
|
|
||||||
# doc = get_doc(vocab, text)
|
|
||||||
# assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
|
|
||||||
# assert 0.0 < doc[0:2].similarity(doc) < 1.0
|
|
||||||
#
|
|
||||||
#
|
|
||||||
#@pytest.mark.xfail
|
|
||||||
#@pytest.mark.parametrize('text1,text2', [
|
|
||||||
# (["apple", "and", "apple", "pie"], ["orange", "juice"])])
|
|
||||||
#def test_vectors_doc_doc_similarity(vocab, text1, text2):
|
|
||||||
# doc1 = get_doc(vocab, text1)
|
|
||||||
# doc2 = get_doc(vocab, text2)
|
|
||||||
# assert doc1.similarity(doc2) == doc2.similarity(doc1)
|
|
||||||
# assert 0.0 < doc1.similarity(doc2) < 1.0
|
|
||||||
|
|
|
@ -238,6 +238,29 @@ cdef class Doc:
|
||||||
def doc(self):
|
def doc(self):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
def char_span(self, int start_idx, int end_idx, label=0, vector=None):
|
||||||
|
"""Create a `Span` object from the slice `doc.text[start : end]`.
|
||||||
|
|
||||||
|
doc (Doc): The parent document.
|
||||||
|
start (int): The index of the first character of the span.
|
||||||
|
end (int): The index of the first character after the span.
|
||||||
|
label (uint64 or string): A label to attach to the Span, e.g. for named entities.
|
||||||
|
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
|
||||||
|
RETURNS (Span): The newly constructed object.
|
||||||
|
"""
|
||||||
|
if not isinstance(label, int):
|
||||||
|
label = self.vocab.strings.add(label)
|
||||||
|
cdef int start = token_by_start(self.c, self.length, start_idx)
|
||||||
|
if start == -1:
|
||||||
|
return None
|
||||||
|
cdef int end = token_by_end(self.c, self.length, end_idx)
|
||||||
|
if end == -1:
|
||||||
|
return None
|
||||||
|
# Currently we have the token index, we want the range-end index
|
||||||
|
end += 1
|
||||||
|
cdef Span span = Span(self, start, end, label=label, vector=vector)
|
||||||
|
return span
|
||||||
|
|
||||||
def similarity(self, other):
|
def similarity(self, other):
|
||||||
"""Make a semantic similarity estimate. The default estimate is cosine
|
"""Make a semantic similarity estimate. The default estimate is cosine
|
||||||
similarity using an average of word vectors.
|
similarity using an average of word vectors.
|
||||||
|
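The hunk above introduces Doc.char_span, which maps character offsets onto token boundaries. A minimal usage sketch, assuming an `nlp` pipeline is already loaded (the pipeline object itself is not part of this diff):

    # Doc.char_span returns None if the offsets don't align with token boundaries
    doc = nlp(u'I like New York')
    span = doc.char_span(7, 15, label=u'GPE')   # characters 7..15 cover "New York"
    if span is not None:
        assert span.text == u'New York'
        assert span.label_ == u'GPE'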
@ -280,8 +303,14 @@ cdef class Doc:
|
||||||
return self.user_hooks['vector'](self)
|
return self.user_hooks['vector'](self)
|
||||||
if self._vector is not None:
|
if self._vector is not None:
|
||||||
return self._vector
|
return self._vector
|
||||||
elif self.has_vector and len(self):
|
elif not len(self):
|
||||||
self._vector = sum(t.vector for t in self) / len(self)
|
self._vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
|
||||||
|
return self._vector
|
||||||
|
elif self.has_vector:
|
||||||
|
vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
|
||||||
|
for token in self.c[:self.length]:
|
||||||
|
vector += self.vocab.get_vector(token.lex.orth)
|
||||||
|
self._vector = vector / len(self)
|
||||||
return self._vector
|
return self._vector
|
||||||
elif self.tensor is not None:
|
elif self.tensor is not None:
|
||||||
self._vector = self.tensor.mean(axis=0)
|
self._vector = self.tensor.mean(axis=0)
|
||||||
|
|
|
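The rewritten Doc.vector branch above averages per-token vectors pulled from the vocab and falls back to a zero vector for empty docs. A standalone numpy sketch of that averaging, with a toy dict standing in for the vocab lookups:

    import numpy

    # toy stand-in for vocab.get_vector lookups
    toy_vectors = {u'apple': numpy.asarray([1., 2., 3.], dtype='f'),
                   u'orange': numpy.asarray([-1., -2., -3.], dtype='f')}
    words = [u'apple', u'orange']

    if not words:
        doc_vector = numpy.zeros((3,), dtype='f')        # empty doc -> zero vector
    else:
        vector = numpy.zeros((3,), dtype='f')
        for word in words:
            vector += toy_vectors.get(word, numpy.zeros((3,), dtype='f'))
        doc_vector = vector / len(words)                 # mean of the token vectors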
@ -15,5 +15,5 @@ cdef class Span:
|
||||||
cdef public _vector
|
cdef public _vector
|
||||||
cdef public _vector_norm
|
cdef public _vector_norm
|
||||||
|
|
||||||
|
|
||||||
cpdef int _recalculate_indices(self) except -1
|
cpdef int _recalculate_indices(self) except -1
|
||||||
|
cpdef np.ndarray to_array(self, object features)
|
||||||
|
|
|
@ -7,7 +7,7 @@ import numpy
|
||||||
import numpy.linalg
|
import numpy.linalg
|
||||||
from libc.math cimport sqrt
|
from libc.math cimport sqrt
|
||||||
|
|
||||||
from .doc cimport token_by_start, token_by_end
|
from .doc cimport token_by_start, token_by_end, get_token_attr
|
||||||
from ..structs cimport TokenC, LexemeC
|
from ..structs cimport TokenC, LexemeC
|
||||||
from ..typedefs cimport flags_t, attr_t, hash_t
|
from ..typedefs cimport flags_t, attr_t, hash_t
|
||||||
from ..attrs cimport attr_id_t
|
from ..attrs cimport attr_id_t
|
||||||
|
@ -135,6 +135,29 @@ cdef class Span:
|
||||||
return 0.0
|
return 0.0
|
||||||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||||
|
|
||||||
|
cpdef np.ndarray to_array(self, object py_attr_ids):
|
||||||
|
"""Given a list of M attribute IDs, export the tokens to a numpy
|
||||||
|
`ndarray` of shape `(N, M)`, where `N` is the length of the document.
|
||||||
|
The values will be 32-bit integers.
|
||||||
|
|
||||||
|
attr_ids (list[int]): A list of attribute ID ints.
|
||||||
|
RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
|
||||||
|
per word, and one column per attribute indicated in the input
|
||||||
|
`attr_ids`.
|
||||||
|
"""
|
||||||
|
cdef int i, j
|
||||||
|
cdef attr_id_t feature
|
||||||
|
cdef np.ndarray[attr_t, ndim=2] output
|
||||||
|
# Make an array from the attributes --- otherwise our inner loop is Python
|
||||||
|
# dict iteration.
|
||||||
|
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
|
||||||
|
cdef int length = self.end - self.start
|
||||||
|
output = numpy.ndarray(shape=(length, len(attr_ids)), dtype=numpy.uint64)
|
||||||
|
for i in range(self.start, self.end):
|
||||||
|
for j, feature in enumerate(attr_ids):
|
||||||
|
output[i-self.start, j] = get_token_attr(&self.doc.c[i], feature)
|
||||||
|
return output
|
||||||
|
|
||||||
cpdef int _recalculate_indices(self) except -1:
|
cpdef int _recalculate_indices(self) except -1:
|
||||||
if self.end > self.doc.length \
|
if self.end > self.doc.length \
|
||||||
or self.doc.c[self.start].idx != self.start_char \
|
or self.doc.c[self.start].idx != self.start_char \
|
||||||
|
|
|
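The Span.to_array method added above mirrors Doc.to_array. A usage sketch taken from the new test earlier in this diff; `doc` is assumed to be an existing Doc of a few tokens:

    from spacy.attrs import ORTH, LENGTH

    span = doc[1:-2]
    arr = span.to_array([ORTH, LENGTH])
    assert arr.shape == (len(span), 2)      # one row per token, one column per attribute
    assert arr[0, 0] == span[0].orth        # first column holds ORTH ids
    assert arr[0, 1] == len(span[0])        # second column holds token lengths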
@ -62,18 +62,26 @@ cdef class Token:
|
||||||
|
|
||||||
def __richcmp__(self, Token other, int op):
|
def __richcmp__(self, Token other, int op):
|
||||||
# http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html
|
# http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html
|
||||||
|
cdef Doc my_doc = self.doc
|
||||||
|
cdef Doc other_doc = other.doc
|
||||||
my = self.idx
|
my = self.idx
|
||||||
their = other.idx if other is not None else None
|
their = other.idx if other is not None else None
|
||||||
if op == 0:
|
if op == 0:
|
||||||
return my < their
|
return my < their
|
||||||
elif op == 2:
|
elif op == 2:
|
||||||
|
if my_doc is other_doc:
|
||||||
return my == their
|
return my == their
|
||||||
|
else:
|
||||||
|
return False
|
||||||
elif op == 4:
|
elif op == 4:
|
||||||
return my > their
|
return my > their
|
||||||
elif op == 1:
|
elif op == 1:
|
||||||
return my <= their
|
return my <= their
|
||||||
elif op == 3:
|
elif op == 3:
|
||||||
|
if my_doc is other_doc:
|
||||||
return my != their
|
return my != their
|
||||||
|
else:
|
||||||
|
return True
|
||||||
elif op == 5:
|
elif op == 5:
|
||||||
return my >= their
|
return my >= their
|
||||||
else:
|
else:
|
||||||
|
|
|
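The __richcmp__ change above makes token equality depend on the owning Doc as well as the offset. A small sketch of the resulting semantics, assuming an `nlp` pipeline is loaded:

    doc_a = nlp(u'hello world')
    doc_b = nlp(u'hello world')
    assert doc_a[0] == doc_a[0]     # same Doc, same offset -> equal
    assert doc_a[0] != doc_b[0]     # same text, but different Doc -> not equal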
@ -22,7 +22,7 @@ import ujson
|
||||||
|
|
||||||
from .symbols import ORTH
|
from .symbols import ORTH
|
||||||
from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
|
from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
|
||||||
from .compat import copy_array, normalize_string_keys, getattr_
|
from .compat import copy_array, normalize_string_keys, getattr_, import_file
|
||||||
|
|
||||||
|
|
||||||
LANGUAGES = {}
|
LANGUAGES = {}
|
||||||
|
@ -112,15 +112,13 @@ def load_model(name, **overrides):
|
||||||
|
|
||||||
def load_model_from_link(name, **overrides):
|
def load_model_from_link(name, **overrides):
|
||||||
"""Load a model from a shortcut link, or directory in spaCy data path."""
|
"""Load a model from a shortcut link, or directory in spaCy data path."""
|
||||||
init_file = get_data_path() / name / '__init__.py'
|
path = get_data_path() / name / '__init__.py'
|
||||||
spec = importlib.util.spec_from_file_location(name, init_file)
|
|
||||||
try:
|
try:
|
||||||
cls = importlib.util.module_from_spec(spec)
|
cls = import_file(name, path)
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
raise IOError(
|
raise IOError(
|
||||||
"Cant' load '%s'. If you're using a shortcut link, make sure it "
|
"Cant' load '%s'. If you're using a shortcut link, make sure it "
|
||||||
"points to a valid model package (not just a data directory)." % name)
|
"points to a valid model package (not just a data directory)." % name)
|
||||||
spec.loader.exec_module(cls)
|
|
||||||
return cls.load(**overrides)
|
return cls.load(**overrides)
|
||||||
|
|
||||||
|
|
||||||
|
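compat.import_file itself is not shown in this diff. A hedged sketch of what such a helper could look like on Python 3.5+, based on the importlib calls the old code used inline; the real spacy.compat version may differ (e.g. to support Python 2):

    import importlib.util

    def import_file(name, loc):
        # load a Python module from an arbitrary file path
        spec = importlib.util.spec_from_file_location(name, str(loc))
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        return module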
@ -171,8 +169,8 @@ def get_model_meta(path):
|
||||||
raise IOError("Could not read meta.json from %s" % meta_path)
|
raise IOError("Could not read meta.json from %s" % meta_path)
|
||||||
meta = read_json(meta_path)
|
meta = read_json(meta_path)
|
||||||
for setting in ['lang', 'name', 'version']:
|
for setting in ['lang', 'name', 'version']:
|
||||||
if setting not in meta:
|
if setting not in meta or not meta[setting]:
|
||||||
raise ValueError('No %s setting found in model meta.json' % setting)
|
raise ValueError("No valid '%s' setting found in model meta.json" % setting)
|
||||||
return meta
|
return meta
|
||||||
|
|
||||||
|
|
||||||
|
|
|
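The stricter check above now rejects both missing and empty values. A small sketch of the validation against an illustrative meta dict (the field values are made up):

    meta = {'lang': 'en', 'name': 'core_web_sm', 'version': '2.0.0'}   # illustrative values
    for setting in ['lang', 'name', 'version']:
        if setting not in meta or not meta[setting]:
            raise ValueError("No valid '%s' setting found in model meta.json" % setting)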
@ -1,18 +1,25 @@
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
from libc.stdint cimport int32_t, uint64_t
|
||||||
import numpy
|
import numpy
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
import msgpack
|
import msgpack
|
||||||
import msgpack_numpy
|
import msgpack_numpy
|
||||||
msgpack_numpy.patch()
|
msgpack_numpy.patch()
|
||||||
|
cimport numpy as np
|
||||||
|
|
||||||
|
from .typedefs cimport attr_t
|
||||||
from .strings cimport StringStore
|
from .strings cimport StringStore
|
||||||
from . import util
|
from . import util
|
||||||
|
from .compat import basestring_
|
||||||
|
|
||||||
|
|
||||||
cdef class Vectors:
|
cdef class Vectors:
|
||||||
'''Store, save and load word vectors.'''
|
'''Store, save and load word vectors.'''
|
||||||
cdef public object data
|
cdef public object data
|
||||||
cdef readonly StringStore strings
|
cdef readonly StringStore strings
|
||||||
cdef public object key2i
|
cdef public object key2row
|
||||||
|
cdef public object keys
|
||||||
|
cdef public int i
|
||||||
|
|
||||||
def __init__(self, strings, data_or_width):
|
def __init__(self, strings, data_or_width):
|
||||||
self.strings = StringStore()
|
self.strings = StringStore()
|
||||||
|
@ -21,10 +28,10 @@ cdef class Vectors:
|
||||||
dtype='f')
|
dtype='f')
|
||||||
else:
|
else:
|
||||||
data = data_or_width
|
data = data_or_width
|
||||||
|
self.i = 0
|
||||||
self.data = data
|
self.data = data
|
||||||
self.key2i = {}
|
self.key2row = {}
|
||||||
for i, string in enumerate(strings):
|
self.keys = np.ndarray((self.data.shape[0],), dtype='uint64')
|
||||||
self.key2i[self.strings.add(string)] = i
|
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (Vectors, (self.strings, self.data))
|
return (Vectors, (self.strings, self.data))
|
||||||
|
@ -32,7 +39,7 @@ cdef class Vectors:
|
||||||
def __getitem__(self, key):
|
def __getitem__(self, key):
|
||||||
if isinstance(key, basestring):
|
if isinstance(key, basestring):
|
||||||
key = self.strings[key]
|
key = self.strings[key]
|
||||||
i = self.key2i[key]
|
i = self.key2row[key]
|
||||||
if i is None:
|
if i is None:
|
||||||
raise KeyError(key)
|
raise KeyError(key)
|
||||||
else:
|
else:
|
||||||
|
@ -41,14 +48,36 @@ cdef class Vectors:
|
||||||
def __setitem__(self, key, vector):
|
def __setitem__(self, key, vector):
|
||||||
if isinstance(key, basestring):
|
if isinstance(key, basestring):
|
||||||
key = self.strings.add(key)
|
key = self.strings.add(key)
|
||||||
i = self.key2i[key]
|
i = self.key2row[key]
|
||||||
self.data[i] = vector
|
self.data[i] = vector
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
yield from self.data
|
yield from self.data
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return len(self.strings)
|
return self.i
|
||||||
|
|
||||||
|
def __contains__(self, key):
|
||||||
|
if isinstance(key, basestring_):
|
||||||
|
key = self.strings[key]
|
||||||
|
return key in self.key2row
|
||||||
|
|
||||||
|
def add(self, key, vector=None):
|
||||||
|
if isinstance(key, basestring_):
|
||||||
|
key = self.strings.add(key)
|
||||||
|
if key not in self.key2row:
|
||||||
|
i = self.i
|
||||||
|
if i >= self.keys.shape[0]:
|
||||||
|
self.keys.resize((self.keys.shape[0]*2,))
|
||||||
|
self.data.resize((self.data.shape[0]*2, self.data.shape[1]))
|
||||||
|
self.key2row[key] = self.i
|
||||||
|
self.keys[self.i] = key
|
||||||
|
self.i += 1
|
||||||
|
else:
|
||||||
|
i = self.key2row[key]
|
||||||
|
if vector is not None:
|
||||||
|
self.data[i] = vector
|
||||||
|
return i
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
for i, string in enumerate(self.strings):
|
for i, string in enumerate(self.strings):
|
||||||
|
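A usage sketch of the reworked Vectors table, mirroring the unit tests earlier in this diff; the API is still work in progress, so treat this as illustrative rather than final:

    import numpy
    from spacy.vectors import Vectors

    strings = [u'apple', u'orange']
    data = numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype='f')
    v = Vectors(strings, data)
    for string in strings:
        v.add(string)                           # registers the key in key2row
    assert u'apple' in v                        # __contains__ checks key2row
    assert list(v[u'apple']) == list(data[0])   # __getitem__ maps key -> row
    v[u'apple'] = data[1]                       # __setitem__ overwrites the row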
@ -61,34 +90,87 @@ cdef class Vectors:
|
||||||
def most_similar(self, key):
|
def most_similar(self, key):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def to_disk(self, path):
|
def from_glove(self, path):
|
||||||
raise NotImplementedError
|
'''Load GloVe vectors from a directory. Assumes binary format,
|
||||||
|
that the vocab is in a vocab.txt, and that vectors are named
|
||||||
|
vectors.{size}.[fd].bin, e.g. vectors.128.f.bin for 128d float32
|
||||||
|
vectors, vectors.300.d.bin for 300d float64 (double) vectors, etc.
|
||||||
|
By default GloVe outputs 64-bit vectors.'''
|
||||||
|
path = util.ensure_path(path)
|
||||||
|
for name in path.iterdir():
|
||||||
|
if name.parts[-1].startswith('vectors'):
|
||||||
|
_, dims, dtype, _2 = name.parts[-1].split('.')
|
||||||
|
self.width = int(dims)
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
raise IOError("Expected file named e.g. vectors.128.f.bin")
|
||||||
|
bin_loc = path / 'vectors.{dims}.{dtype}.bin'.format(dims=dims,
|
||||||
|
dtype=dtype)
|
||||||
|
with bin_loc.open('rb') as file_:
|
||||||
|
self.data = numpy.fromfile(file_, dtype='float64')
|
||||||
|
self.data = numpy.ascontiguousarray(self.data, dtype='float32')
|
||||||
|
n = 0
|
||||||
|
with (path / 'vocab.txt').open('r') as file_:
|
||||||
|
for line in file_:
|
||||||
|
self.add(line.strip())
|
||||||
|
n += 1
|
||||||
|
if (self.data.size % self.width) == 0:
|
||||||
|
self.data
|
||||||
|
|
||||||
def from_disk(self, path):
|
def to_disk(self, path, **exclude):
|
||||||
raise NotImplementedError
|
serializers = OrderedDict((
|
||||||
|
('vectors', lambda p: numpy.save(p.open('wb'), self.data, allow_pickle=False)),
|
||||||
|
('keys', lambda p: numpy.save(p.open('wb'), self.keys, allow_pickle=False)),
|
||||||
|
))
|
||||||
|
return util.to_disk(path, serializers, exclude)
|
||||||
|
|
||||||
|
def from_disk(self, path, **exclude):
|
||||||
|
def load_keys(path):
|
||||||
|
if path.exists():
|
||||||
|
self.keys = numpy.load(path)
|
||||||
|
for i, key in enumerate(self.keys):
|
||||||
|
self.keys[i] = key
|
||||||
|
self.key2row[key] = i
|
||||||
|
|
||||||
|
def load_vectors(path):
|
||||||
|
if path.exists():
|
||||||
|
self.data = numpy.load(path)
|
||||||
|
|
||||||
|
serializers = OrderedDict((
|
||||||
|
('keys', load_keys),
|
||||||
|
('vectors', load_vectors),
|
||||||
|
))
|
||||||
|
util.from_disk(path, serializers, exclude)
|
||||||
|
return self
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, **exclude):
|
||||||
def serialize_weights():
|
def serialize_weights():
|
||||||
if hasattr(self.weights, 'to_bytes'):
|
if hasattr(self.data, 'to_bytes'):
|
||||||
return self.weights.to_bytes()
|
return self.data.to_bytes()
|
||||||
else:
|
else:
|
||||||
return msgpack.dumps(self.weights)
|
return msgpack.dumps(self.data)
|
||||||
|
|
||||||
serializers = OrderedDict((
|
serializers = OrderedDict((
|
||||||
('strings', lambda: self.strings.to_bytes()),
|
('keys', lambda: msgpack.dumps(self.keys)),
|
||||||
('weights', serialize_weights)
|
('vectors', serialize_weights)
|
||||||
))
|
))
|
||||||
return util.to_bytes(serializers, exclude)
|
return util.to_bytes(serializers, exclude)
|
||||||
|
|
||||||
def from_bytes(self, data, **exclude):
|
def from_bytes(self, data, **exclude):
|
||||||
def deserialize_weights(b):
|
def deserialize_weights(b):
|
||||||
if hasattr(self.weights, 'from_bytes'):
|
if hasattr(self.data, 'from_bytes'):
|
||||||
self.weights.from_bytes()
|
self.data.from_bytes()
|
||||||
else:
|
else:
|
||||||
self.weights = msgpack.loads(b)
|
self.data = msgpack.loads(b)
|
||||||
|
|
||||||
|
def load_keys(keys):
|
||||||
|
self.keys.resize((len(keys),))
|
||||||
|
for i, key in enumerate(keys):
|
||||||
|
self.keys[i] = key
|
||||||
|
self.key2row[key] = i
|
||||||
|
|
||||||
deserializers = OrderedDict((
|
deserializers = OrderedDict((
|
||||||
('strings', lambda b: self.strings.from_bytes(b)),
|
('keys', lambda b: load_keys(msgpack.loads(b))),
|
||||||
('weights', deserialize_weights)
|
('vectors', deserialize_weights)
|
||||||
))
|
))
|
||||||
return util.from_bytes(deserializers, exclude)
|
util.from_bytes(data, deserializers, exclude)
|
||||||
|
return self
|
||||||
|
|
|
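The from_glove docstring above describes a binary GloVe layout. A standalone sketch of reading that layout with plain numpy, to make the format concrete; the paths and the reshape are assumptions, not part of this diff:

    import numpy

    # hypothetical directory: /path/to/glove/vocab.txt and /path/to/glove/vectors.128.f.bin
    dims = 128
    with open('/path/to/glove/vectors.128.f.bin', 'rb') as file_:
        data = numpy.fromfile(file_, dtype='float32').reshape((-1, dims))
    with open('/path/to/glove/vocab.txt', 'r') as file_:
        words = [line.strip() for line in file_]
    assert len(words) == data.shape[0]          # one row per vocab entry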
@ -4,6 +4,7 @@ from __future__ import unicode_literals
|
||||||
import bz2
|
import bz2
|
||||||
import ujson
|
import ujson
|
||||||
import re
|
import re
|
||||||
|
import numpy
|
||||||
|
|
||||||
from libc.string cimport memset, memcpy
|
from libc.string cimport memset, memcpy
|
||||||
from libc.stdint cimport int32_t
|
from libc.stdint cimport int32_t
|
||||||
|
@ -19,9 +20,10 @@ from .tokens.token cimport Token
|
||||||
from .attrs cimport PROB, LANG
|
from .attrs cimport PROB, LANG
|
||||||
from .structs cimport SerializedLexemeC
|
from .structs cimport SerializedLexemeC
|
||||||
|
|
||||||
from .compat import copy_reg, pickle
|
from .compat import copy_reg, pickle, basestring_
|
||||||
from .lemmatizer import Lemmatizer
|
from .lemmatizer import Lemmatizer
|
||||||
from .attrs import intify_attrs
|
from .attrs import intify_attrs
|
||||||
|
from .vectors import Vectors
|
||||||
from . import util
|
from . import util
|
||||||
from . import attrs
|
from . import attrs
|
||||||
from . import symbols
|
from . import symbols
|
||||||
|
@ -63,6 +65,7 @@ cdef class Vocab:
|
||||||
self.strings.add(name)
|
self.strings.add(name)
|
||||||
self.lex_attr_getters = lex_attr_getters
|
self.lex_attr_getters = lex_attr_getters
|
||||||
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
|
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
|
||||||
|
self.vectors = Vectors(self.strings, 300)
|
||||||
|
|
||||||
property lang:
|
property lang:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
@ -242,13 +245,15 @@ cdef class Vocab:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def vectors_length(self):
|
def vectors_length(self):
|
||||||
raise NotImplementedError
|
return self.vectors.data.shape[1]
|
||||||
|
|
||||||
def clear_vectors(self):
|
def clear_vectors(self, new_dim=None):
|
||||||
"""Drop the current vector table. Because all vectors must be the same
|
"""Drop the current vector table. Because all vectors must be the same
|
||||||
width, you have to call this to change the size of the vectors.
|
width, you have to call this to change the size of the vectors.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
if new_dim is None:
|
||||||
|
new_dim = self.vectors.data.shape[1]
|
||||||
|
self.vectors = Vectors(self.strings, new_dim)
|
||||||
|
|
||||||
def get_vector(self, orth):
|
def get_vector(self, orth):
|
||||||
"""Retrieve a vector for a word in the vocabulary.
|
"""Retrieve a vector for a word in the vocabulary.
|
||||||
|
@ -262,7 +267,12 @@ cdef class Vocab:
|
||||||
|
|
||||||
RAISES: If no vectors data is loaded, ValueError is raised.
|
RAISES: If no vectors data is loaded, ValueError is raised.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
if isinstance(orth, basestring_):
|
||||||
|
orth = self.strings.add(orth)
|
||||||
|
if orth in self.vectors.key2row:
|
||||||
|
return self.vectors[orth]
|
||||||
|
else:
|
||||||
|
return numpy.zeros((self.vectors_length,), dtype='f')
|
||||||
|
|
||||||
def set_vector(self, orth, vector):
|
def set_vector(self, orth, vector):
|
||||||
"""Set a vector for a word in the vocabulary.
|
"""Set a vector for a word in the vocabulary.
|
||||||
|
@ -272,15 +282,19 @@ cdef class Vocab:
|
||||||
RETURNS:
|
RETURNS:
|
||||||
None
|
None
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
if not isinstance(orth, basestring_):
|
||||||
|
orth = self.strings[orth]
|
||||||
|
self.vectors.add(orth, vector=vector)
|
||||||
|
|
||||||
def has_vector(self, orth):
|
def has_vector(self, orth):
|
||||||
"""Check whether a word has a vector. Returns False if no
|
"""Check whether a word has a vector. Returns False if no
|
||||||
vectors have been loaded. Words can be looked up by string
|
vectors have been loaded. Words can be looked up by string
|
||||||
or int ID."""
|
or int ID."""
|
||||||
return False
|
if isinstance(orth, basestring_):
|
||||||
|
orth = self.strings.add(orth)
|
||||||
|
return orth in self.vectors
|
||||||
|
|
||||||
def to_disk(self, path):
|
def to_disk(self, path, **exclude):
|
||||||
"""Save the current state to a directory.
|
"""Save the current state to a directory.
|
||||||
|
|
||||||
path (unicode or Path): A path to a directory, which will be created if
|
path (unicode or Path): A path to a directory, which will be created if
|
||||||
|
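Taken together with clear_vectors above, the new Vocab helpers can be exercised as in the test utilities updated earlier in this diff; `vocab` is assumed to be an existing Vocab:

    import numpy

    vec = numpy.asarray([1., 2., 3.], dtype='f')
    vocab.clear_vectors(len(vec))            # resize the vector table to 3 dimensions
    vocab.set_vector(u'apple', vec)          # adds the key and stores the row
    assert vocab.has_vector(u'apple')
    assert list(vocab.get_vector(u'apple')) == [1.0, 2.0, 3.0]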
@ -292,8 +306,10 @@ cdef class Vocab:
|
||||||
self.strings.to_disk(path / 'strings.json')
|
self.strings.to_disk(path / 'strings.json')
|
||||||
with (path / 'lexemes.bin').open('wb') as file_:
|
with (path / 'lexemes.bin').open('wb') as file_:
|
||||||
file_.write(self.lexemes_to_bytes())
|
file_.write(self.lexemes_to_bytes())
|
||||||
|
if self.vectors is not None:
|
||||||
|
self.vectors.to_disk(path)
|
||||||
|
|
||||||
def from_disk(self, path):
|
def from_disk(self, path, **exclude):
|
||||||
"""Loads state from a directory. Modifies the object in place and
|
"""Loads state from a directory. Modifies the object in place and
|
||||||
returns it.
|
returns it.
|
||||||
|
|
||||||
|
@ -305,6 +321,8 @@ cdef class Vocab:
|
||||||
self.strings.from_disk(path / 'strings.json')
|
self.strings.from_disk(path / 'strings.json')
|
||||||
with (path / 'lexemes.bin').open('rb') as file_:
|
with (path / 'lexemes.bin').open('rb') as file_:
|
||||||
self.lexemes_from_bytes(file_.read())
|
self.lexemes_from_bytes(file_.read())
|
||||||
|
if self.vectors is not None:
|
||||||
|
self.vectors.from_disk(path, exclude='strings.json')
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, **exclude):
|
||||||
|
@ -313,9 +331,16 @@ cdef class Vocab:
|
||||||
**exclude: Named attributes to prevent from being serialized.
|
**exclude: Named attributes to prevent from being serialized.
|
||||||
RETURNS (bytes): The serialized form of the `Vocab` object.
|
RETURNS (bytes): The serialized form of the `Vocab` object.
|
||||||
"""
|
"""
|
||||||
|
def deserialize_vectors():
|
||||||
|
if self.vectors is None:
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
return self.vectors.to_bytes(exclude='strings.json')
|
||||||
|
|
||||||
getters = OrderedDict((
|
getters = OrderedDict((
|
||||||
('strings', lambda: self.strings.to_bytes()),
|
('strings', lambda: self.strings.to_bytes()),
|
||||||
('lexemes', lambda: self.lexemes_to_bytes()),
|
('lexemes', lambda: self.lexemes_to_bytes()),
|
||||||
|
('vectors', deserialize_vectors)
|
||||||
))
|
))
|
||||||
return util.to_bytes(getters, exclude)
|
return util.to_bytes(getters, exclude)
|
||||||
|
|
||||||
|
@ -326,9 +351,15 @@ cdef class Vocab:
|
||||||
**exclude: Named attributes to prevent from being loaded.
|
**exclude: Named attributes to prevent from being loaded.
|
||||||
RETURNS (Vocab): The `Vocab` object.
|
RETURNS (Vocab): The `Vocab` object.
|
||||||
"""
|
"""
|
||||||
|
def serialize_vectors(b):
|
||||||
|
if self.vectors is None:
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
return self.vectors.from_bytes(b, exclude='strings')
|
||||||
setters = OrderedDict((
|
setters = OrderedDict((
|
||||||
('strings', lambda b: self.strings.from_bytes(b)),
|
('strings', lambda b: self.strings.from_bytes(b)),
|
||||||
('lexemes', lambda b: self.lexemes_from_bytes(b)),
|
('lexemes', lambda b: self.lexemes_from_bytes(b)),
|
||||||
|
('vectors', lambda b: serialize_vectors(b))
|
||||||
))
|
))
|
||||||
util.from_bytes(bytes_data, setters, exclude)
|
util.from_bytes(bytes_data, setters, exclude)
|
||||||
return self
|
return self
|
||||||
|
|
|
@ -2,9 +2,8 @@
|
||||||
|
|
||||||
if [ "${VIA}" == "pypi" ]; then
|
if [ "${VIA}" == "pypi" ]; then
|
||||||
rm -rf *
|
rm -rf *
|
||||||
pip install spacy
|
pip install spacy-nightly
|
||||||
python -m spacy.en.download
|
python -m spacy download en
|
||||||
python -m spacy.de.download
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ "${VIA}" == "sdist" && "${TRAVIS_PULL_REQUEST}" == "false" ]]; then
|
if [[ "${VIA}" == "sdist" && "${TRAVIS_PULL_REQUEST}" == "false" ]]; then
|
||||||
|
|
|
@ -103,20 +103,20 @@ mixin button(url, trusted, ...style)
|
||||||
label - [string] aside title (optional or false for no label)
|
label - [string] aside title (optional or false for no label)
|
||||||
language - [string] language for syntax highlighting (default: "python")
|
language - [string] language for syntax highlighting (default: "python")
|
||||||
supports basic relevant languages available for PrismJS
|
supports basic relevant languages available for PrismJS
|
||||||
icon - [string] icon to display next to code block, mostly used for old/new
|
prompt - [string] prompt or icon to display next to code block, (mostly used for old/new)
|
||||||
height - [integer] optional height to clip code block to
|
height - [integer] optional height to clip code block to
|
||||||
|
|
||||||
mixin code(label, language, icon, height)
|
mixin code(label, language, prompt, height)
|
||||||
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes)
|
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes)
|
||||||
if label
|
if label
|
||||||
h4.u-text-label.u-text-label--dark=label
|
h4.u-text-label.u-text-label--dark=label
|
||||||
|
- var icon = (prompt == 'accept' || prompt == 'reject')
|
||||||
if icon
|
if icon
|
||||||
- var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
|
- var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
|
||||||
.c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null)
|
.c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null)
|
||||||
+icon(icon, 18)
|
+icon(icon, 18)
|
||||||
|
|
||||||
code.c-code-block__content
|
code.c-code-block__content(data-prompt=icon ? null : prompt)
|
||||||
block
|
block
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -112,6 +112,10 @@
|
||||||
.u-nowrap
|
.u-nowrap
|
||||||
white-space: nowrap
|
white-space: nowrap
|
||||||
|
|
||||||
|
.u-break.u-break
|
||||||
|
word-wrap: break-word
|
||||||
|
white-space: initial
|
||||||
|
|
||||||
.u-no-border
|
.u-no-border
|
||||||
border: none
|
border: none
|
||||||
|
|
||||||
|
|
|
@ -35,6 +35,13 @@
|
||||||
font: normal normal 1.1rem/#{2} $font-code
|
font: normal normal 1.1rem/#{2} $font-code
|
||||||
padding: 1em 2em
|
padding: 1em 2em
|
||||||
|
|
||||||
|
&[data-prompt]:before,
|
||||||
|
content: attr(data-prompt)
|
||||||
|
margin-right: 0.65em
|
||||||
|
display: inline-block
|
||||||
|
vertical-align: middle
|
||||||
|
opacity: 0.5
|
||||||
|
|
||||||
|
|
||||||
//- Inline code
|
//- Inline code
|
||||||
|
|
||||||
|
|
|
@ -21,7 +21,7 @@ p
|
||||||
+pos-row("$", "SYM", "SymType=currency", "symbol, currency")
|
+pos-row("$", "SYM", "SymType=currency", "symbol, currency")
|
||||||
+pos-row("ADD", "X", "", "email")
|
+pos-row("ADD", "X", "", "email")
|
||||||
+pos-row("AFX", "ADJ", "Hyph=yes", "affix")
|
+pos-row("AFX", "ADJ", "Hyph=yes", "affix")
|
||||||
+pos-row("BES", "VERB", "", 'auxillary "be"')
|
+pos-row("BES", "VERB", "", 'auxiliary "be"')
|
||||||
+pos-row("CC", "CONJ", "ConjType=coor", "conjunction, coordinating")
|
+pos-row("CC", "CONJ", "ConjType=coor", "conjunction, coordinating")
|
||||||
+pos-row("CD", "NUM", "NumType=card", "cardinal number")
|
+pos-row("CD", "NUM", "NumType=card", "cardinal number")
|
||||||
+pos-row("DT", "DET", "determiner")
|
+pos-row("DT", "DET", "determiner")
|
||||||
|
@ -35,7 +35,7 @@ p
|
||||||
+pos-row("JJR", "ADJ", "Degree=comp", "adjective, comparative")
|
+pos-row("JJR", "ADJ", "Degree=comp", "adjective, comparative")
|
||||||
+pos-row("JJS", "ADJ", "Degree=sup", "adjective, superlative")
|
+pos-row("JJS", "ADJ", "Degree=sup", "adjective, superlative")
|
||||||
+pos-row("LS", "PUNCT", "NumType=ord", "list item marker")
|
+pos-row("LS", "PUNCT", "NumType=ord", "list item marker")
|
||||||
+pos-row("MD", "VERB", "VerbType=mod", "verb, modal auxillary")
|
+pos-row("MD", "VERB", "VerbType=mod", "verb, modal auxiliary")
|
||||||
+pos-row("NFP", "PUNCT", "", "superfluous punctuation")
|
+pos-row("NFP", "PUNCT", "", "superfluous punctuation")
|
||||||
+pos-row("NIL", "", "", "missing tag")
|
+pos-row("NIL", "", "", "missing tag")
|
||||||
+pos-row("NN", "NOUN", "Number=sing", "noun, singular or mass")
|
+pos-row("NN", "NOUN", "Number=sing", "noun, singular or mass")
|
||||||
|
|
|
@ -5,16 +5,7 @@ include ../../_includes/_mixins
|
||||||
p
|
p
|
||||||
| As of v1.7.0, spaCy comes with new command line helpers to download and
|
| As of v1.7.0, spaCy comes with new command line helpers to download and
|
||||||
| link models and show useful debugging information. For a list of available
|
| link models and show useful debugging information. For a list of available
|
||||||
| commands, type #[code python -m spacy]. To make the command even more
|
| commands, type #[code spacy --help].
|
||||||
| convenient, we recommend
|
|
||||||
| #[+a("https://askubuntu.com/questions/17536/how-do-i-create-a-permanent-bash-alias/17537#17537") creating an alias]
|
|
||||||
| mapping #[code python -m spacy] to #[code spacy].
|
|
||||||
|
|
||||||
+aside("Why python -m?")
|
|
||||||
| The problem with a global entry point is that it's resolved by looking up
|
|
||||||
| entries in your #[code PATH] environment variable. This can give you
|
|
||||||
| unexpected results, like executing the wrong spaCy installation.
|
|
||||||
| #[code python -m] prevents fallbacks to system modules.
|
|
||||||
|
|
||||||
+infobox("⚠️ Deprecation note")
|
+infobox("⚠️ Deprecation note")
|
||||||
| As of spaCy 2.0, the #[code model] command to initialise a model data
|
| As of spaCy 2.0, the #[code model] command to initialise a model data
|
||||||
|
@ -33,8 +24,8 @@ p
|
||||||
| Direct downloads don't perform any compatibility checks and require the
|
| Direct downloads don't perform any compatibility checks and require the
|
||||||
| model name to be specified with its version (e.g., #[code en_core_web_sm-1.2.0]).
|
| model name to be specified with its version (e.g., #[code en_core_web_sm-1.2.0]).
|
||||||
|
|
||||||
+code(false, "bash").
|
+code(false, "bash", "$").
|
||||||
python -m spacy download [model] [--direct]
|
spacy download [model] [--direct]
|
||||||
|
|
||||||
+table(["Argument", "Type", "Description"])
|
+table(["Argument", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
@ -80,8 +71,8 @@ p
|
||||||
| or use the #[+api("cli#package") #[code package]] command to create a
|
| or use the #[+api("cli#package") #[code package]] command to create a
|
||||||
| model package.
|
| model package.
|
||||||
|
|
||||||
+code(false, "bash").
|
+code(false, "bash", "$").
|
||||||
python -m spacy link [origin] [link_name] [--force]
|
spacy link [origin] [link_name] [--force]
|
||||||
|
|
||||||
+table(["Argument", "Type", "Description"])
|
+table(["Argument", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
@ -112,8 +103,8 @@ p
|
||||||
| markup to copy-paste into #[+a(gh("spacy") + "/issues") GitHub issues].
|
| markup to copy-paste into #[+a(gh("spacy") + "/issues") GitHub issues].
|
||||||
|
|
||||||
+code(false, "bash").
|
+code(false, "bash").
|
||||||
python -m spacy info [--markdown]
|
spacy info [--markdown]
|
||||||
python -m spacy info [model] [--markdown]
|
spacy info [model] [--markdown]
|
||||||
|
|
||||||
+table(["Argument", "Type", "Description"])
|
+table(["Argument", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
@ -139,8 +130,8 @@ p
|
||||||
| functions. The right converter is chosen based on the file extension of
|
| functions. The right converter is chosen based on the file extension of
|
||||||
| the input file. Currently only supports #[code .conllu].
|
| the input file. Currently only supports #[code .conllu].
|
||||||
|
|
||||||
+code(false, "bash").
|
+code(false, "bash", "$").
|
||||||
python -m spacy convert [input_file] [output_dir] [--n-sents] [--morphology]
|
spacy convert [input_file] [output_dir] [--n-sents] [--morphology]
|
||||||
|
|
||||||
+table(["Argument", "Type", "Description"])
|
+table(["Argument", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
@ -174,8 +165,8 @@ p
|
||||||
| Train a model. Expects data in spaCy's
|
| Train a model. Expects data in spaCy's
|
||||||
| #[+a("/docs/api/annotation#json-input") JSON format].
|
| #[+a("/docs/api/annotation#json-input") JSON format].
|
||||||
|
|
||||||
+code(false, "bash").
|
+code(false, "bash", "$").
|
||||||
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]
|
spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]
|
||||||
|
|
||||||
+table(["Argument", "Type", "Description"])
|
+table(["Argument", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
@ -345,8 +336,8 @@ p
|
||||||
| sure you're always using the latest versions. This means you need to be
|
| sure you're always using the latest versions. This means you need to be
|
||||||
| connected to the internet to use this command.
|
| connected to the internet to use this command.
|
||||||
|
|
||||||
+code(false, "bash").
|
+code(false, "bash", "$").
|
||||||
python -m spacy package [input_dir] [output_dir] [--meta] [--force]
|
spacy package [input_dir] [output_dir] [--meta] [--force]
|
||||||
|
|
||||||
+table(["Argument", "Type", "Description"])
|
+table(["Argument", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
@ -360,10 +351,17 @@ p
|
||||||
+cell Directory to create package folder in.
|
+cell Directory to create package folder in.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code meta]
|
+cell #[code --meta-path], #[code -m]
|
||||||
+cell option
|
+cell option
|
||||||
+cell Path to meta.json file (optional).
|
+cell Path to meta.json file (optional).
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --create-meta], #[code -c]
|
||||||
|
+cell flag
|
||||||
|
+cell
|
||||||
|
| Create a meta.json file on the command line, even if one already
|
||||||
|
| exists in the directory.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code --force], #[code -f]
|
+cell #[code --force], #[code -f]
|
||||||
+cell flag
|
+cell flag
|
||||||
|
|
|
@ -140,6 +140,43 @@ p Get the number of tokens in the document.
|
||||||
+cell int
|
+cell int
|
||||||
+cell The number of tokens in the document.
|
+cell The number of tokens in the document.
|
||||||
|
|
||||||
|
+h(2, "char_span") Doc.char_span
|
||||||
|
+tag method
|
||||||
|
+tag-new(2)
|
||||||
|
|
||||||
|
p Create a #[code Span] object from the slice #[code doc.text[start : end]].
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
doc = nlp(u'I like New York')
|
||||||
|
span = doc.char_span(7, 15, label=u'GPE')
|
||||||
|
assert span.text == 'New York'
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code start]
|
||||||
|
+cell int
|
||||||
|
+cell The index of the first character of the span.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code end]
|
||||||
|
+cell int
|
||||||
|
+cell The index of the first character after the span.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code label]
|
||||||
|
+cell uint64 / unicode
|
||||||
|
+cell A label to attach to the Span, e.g. for named entities.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code vector]
|
||||||
|
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
|
||||||
|
+cell A meaning representation of the span.
|
||||||
|
|
||||||
|
+footrow
|
||||||
|
+cell returns
|
||||||
|
+cell #[code Span]
|
||||||
|
+cell The newly constructed object.
|
||||||
|
|
||||||
+h(2, "similarity") Doc.similarity
|
+h(2, "similarity") Doc.similarity
|
||||||
+tag method
|
+tag method
|
||||||
+tag-model("vectors")
|
+tag-model("vectors")
|
||||||
|
@ -211,12 +248,12 @@ p
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
+cell #[code attr_ids]
|
+cell #[code attr_ids]
|
||||||
+cell ints
|
+cell list
|
||||||
+cell A list of attribute ID ints.
|
+cell A list of attribute ID ints.
|
||||||
|
|
||||||
+footrow
|
+footrow
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
|
+cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
|
||||||
+cell
|
+cell
|
||||||
| The exported attributes as a 2D numpy array, with one row per
|
| The exported attributes as a 2D numpy array, with one row per
|
||||||
| token and one column per attribute.
|
| token and one column per attribute.
|
||||||
|
@ -245,7 +282,7 @@ p
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code array]
|
+cell #[code array]
|
||||||
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
|
+cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
|
||||||
+cell The attribute values to load.
|
+cell The attribute values to load.
|
||||||
|
|
||||||
+footrow
|
+footrow
|
||||||
|
@ -509,7 +546,7 @@ p
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+footrow
|
+footrow
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
|
||||||
+cell A 1D numpy array representing the document's semantics.
|
+cell A 1D numpy array representing the document's semantics.
|
||||||
|
|
||||||
+h(2, "vector_norm") Doc.vector_norm
|
+h(2, "vector_norm") Doc.vector_norm
|
||||||
|
|
|
@ -8,9 +8,9 @@ p
|
||||||
|
|
||||||
|
|
||||||
+aside-code("Download language models", "bash").
|
+aside-code("Download language models", "bash").
|
||||||
python -m spacy download en
|
spacy download en
|
||||||
python -m spacy download de
|
spacy download de
|
||||||
python -m spacy download fr
|
spacy download fr
|
||||||
|
|
||||||
+table([ "Language", "Token", "SBD", "Lemma", "POS", "NER", "Dep", "Vector", "Sentiment"])
|
+table([ "Language", "Token", "SBD", "Lemma", "POS", "NER", "Dep", "Vector", "Sentiment"])
|
||||||
+row
|
+row
|
||||||
|
|
|
@ -111,6 +111,14 @@ p
|
||||||
+cell -
|
+cell -
|
||||||
+cell A sequence of unicode objects.
|
+cell A sequence of unicode objects.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code as_tuples]
|
||||||
|
+cell bool
|
||||||
|
+cell
|
||||||
|
| If set to #[code True], inputs should be a sequence of
|
||||||
|
| #[code (text, context)] tuples. Output will then be a sequence of
|
||||||
|
| #[code (doc, context)] tuples. Defaults to #[code False].
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code n_threads]
|
+cell #[code n_threads]
|
||||||
+cell int
|
+cell int
|
||||||
|
|
|
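A short sketch of the as_tuples flag documented in the rows above, assuming an `nlp` pipeline is already loaded; the context dicts are arbitrary user data:

    data = [(u'A first text.', {'id': 1}),
            (u'A second text.', {'id': 2})]
    for doc, context in nlp.pipe(data, as_tuples=True):
        print(context['id'], doc.text)       # context is passed through untouched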
@ -129,7 +129,7 @@ p A real-valued meaning representation.
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+footrow
|
+footrow
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
|
||||||
+cell A 1D numpy array representing the lexeme's semantics.
|
+cell A 1D numpy array representing the lexeme's semantics.
|
||||||
|
|
||||||
+h(2, "vector_norm") Lexeme.vector_norm
|
+h(2, "vector_norm") Lexeme.vector_norm
|
||||||
|
|
|
@ -37,7 +37,7 @@ p Create a Span object from the #[code slice doc[start : end]].
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code vector]
|
+cell #[code vector]
|
||||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
|
||||||
+cell A meaning representation of the span.
|
+cell A meaning representation of the span.
|
||||||
|
|
||||||
+footrow
|
+footrow
|
||||||
|
@ -145,11 +145,47 @@ p
|
||||||
+cell float
|
+cell float
|
||||||
+cell A scalar similarity score. Higher is more similar.
|
+cell A scalar similarity score. Higher is more similar.
|
||||||
|
|
||||||
|
+h(2, "to_array") Span.to_array
|
||||||
|
+tag method
|
||||||
|
+tag-new(2)
|
||||||
|
|
||||||
|
p
|
||||||
|
| Given a list of #[code M] attribute IDs, export the tokens to a numpy
|
||||||
|
| #[code ndarray] of shape #[code (N, M)], where #[code N] is the length of
|
||||||
|
| the document. The values will be 32-bit integers.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
|
||||||
|
doc = nlp(u'I like New York in Autumn.')
|
||||||
|
span = doc[2:3]
|
||||||
|
# All strings mapped to integers, for easy export to numpy
|
||||||
|
np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code attr_ids]
|
||||||
|
+cell list
|
||||||
|
+cell A list of attribute ID ints.
|
||||||
|
|
||||||
|
+footrow
|
||||||
|
+cell returns
|
||||||
|
+cell #[code.u-break numpy.ndarray[long, ndim=2]]
|
||||||
|
+cell
|
||||||
|
| A feature matrix, with one row per word, and one column per
|
||||||
|
| attribute indicated in the input #[code attr_ids].
|
||||||
|
|
||||||
+h(2, "merge") Span.merge
|
+h(2, "merge") Span.merge
|
||||||
+tag method
|
+tag method
|
||||||
|
|
||||||
p Retokenize the document, such that the span is merged into a single token.
|
p Retokenize the document, such that the span is merged into a single token.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
doc = nlp(u'I like New York in Autumn.')
|
||||||
|
span = doc[2:3]
|
||||||
|
span.merge()
|
||||||
|
assert len(doc) == 6
|
||||||
|
assert doc[2].text == 'New York'
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
+cell #[code **attributes]
|
+cell #[code **attributes]
|
||||||
|
@ -169,7 +205,7 @@ p Retokenize the document, such that the span is merged into a single token.
|
||||||
|
|
||||||
p
|
p
|
||||||
| The token within the span that's highest in the parse tree. If there's a
|
| The token within the span that's highest in the parse tree. If there's a
|
||||||
| tie, the earlist is prefered.
|
| tie, the earliest is preferred.
|
||||||
|
|
||||||
+aside-code("Example").
|
+aside-code("Example").
|
||||||
doc = nlp(u'I like New York in Autumn.')
|
doc = nlp(u'I like New York in Autumn.')
|
||||||
|
@ -270,7 +306,7 @@ p
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+footrow
|
+footrow
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
|
||||||
+cell A 1D numpy array representing the span's semantics.
|
+cell A 1D numpy array representing the span's semantics.
|
||||||
|
|
||||||
+h(2, "vector_norm") Span.vector_norm
|
+h(2, "vector_norm") Span.vector_norm
|
||||||
|
|
|
@@ -250,7 +250,7 @@ p A real-valued meaning representation.
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the token's semantics.
 
 +h(2, "vector_norm") Span.vector_norm
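The returns rows above only fix the type markup, but the documented behaviour of the vector attribute is easy to check in code. A hedged sketch, assuming `nlp` is a loaded model whose vocabulary carries word vectors:

    doc = nlp(u'I like apples')
    span = doc[1:3]
    vec = span.vector
    print(vec.ndim, vec.dtype)       # 1, float32: a 1D semantic vector
    print(span.vector_norm)          # the L2 norm of that vector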
@@ -205,7 +205,7 @@ p
 
 +infobox("Why lazy-loading?")
     | Some languages contain large volumes of custom data, like lemmatizer
-    | loopup tables, or complex regular expression that are expensive to
+    | lookup tables, or complex regular expression that are expensive to
     | compute. As of spaCy v2.0, #[code Language] classes are not imported on
     | initialisation and are only loaded when you import them directly, or load
     | a model that requires a language to be loaded. To lazy-load languages in
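For reference, the two loading paths the infobox alludes to look roughly like this under the v2-era API (a sketch; `get_lang_class` lives in `spacy.util`):

    # Importing the class directly loads the language data eagerly
    from spacy.lang.en import English
    nlp = English()

    # Looking the class up by language code defers the import until needed
    from spacy.util import get_lang_class
    cls = get_lang_class('en')
    nlp = cls()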
@@ -789,4 +789,4 @@ p
     | model use the using spaCy's #[+api("cli#train") #[code train]] command:
 
 +code(false, "bash").
-    python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]
+    spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]
@@ -39,7 +39,7 @@ p
 +h(2, "special-cases") Adding special case tokenization rules
 
 p
-    | Most domains have at least some idiosyncracies that require custom
+    | Most domains have at least some idiosyncrasies that require custom
     | tokenization rules. This could be very certain expressions, or
     | abbreviations only used in this specific field.
 
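As a concrete illustration of such a rule, special cases go through the tokenizer's `add_special_case` method. A sketch against the v2-era API, assuming `nlp` is a loaded pipeline; the example string and attributes are arbitrary:

    from spacy.attrs import ORTH, LEMMA

    # Split "gimme" into two tokens, each with its own lemma
    nlp.tokenizer.add_special_case(u'gimme', [
        {ORTH: u'gim', LEMMA: u'give'},
        {ORTH: u'me'}])
    print([t.text for t in nlp(u'gimme that')])   # ['gim', 'me', 'that']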
@@ -32,10 +32,10 @@ p
     +qs({package: 'source'}) pip install -r requirements.txt
     +qs({package: 'source'}) pip install -e .
 
-    +qs({model: 'en'}) python -m spacy download en
-    +qs({model: 'de'}) python -m spacy download de
-    +qs({model: 'fr'}) python -m spacy download fr
-    +qs({model: 'es'}) python -m spacy download es
+    +qs({model: 'en'}) spacy download en
+    +qs({model: 'de'}) spacy download de
+    +qs({model: 'fr'}) spacy download fr
+    +qs({model: 'es'}) spacy download es
 
 +h(2, "installation") Installation instructions
 
@@ -52,7 +52,7 @@ p Using pip, spaCy releases are currently only available as source packages.
     | and available models, see the #[+a("/docs/usage/models") docs on models].
 
 +code.o-no-block.
-    python -m spacy download en
+    spacy download en
 
     >>> import spacy
     >>> nlp = spacy.load('en')
@@ -109,7 +109,7 @@ p
     | The other way to install spaCy is to clone its
     | #[+a(gh("spaCy")) GitHub repository] and build it from source. That is
     | the common way if you want to make changes to the code base. You'll need to
-    | make sure that you have a development enviroment consisting of a Python
+    | make sure that you have a development environment consisting of a Python
     | distribution including header files, a compiler,
     | #[+a("https://pip.pypa.io/en/latest/installing/") pip],
     | #[+a("https://virtualenv.pypa.io/") virtualenv] and
@@ -312,7 +312,9 @@ p
     | This error may occur when running the #[code spacy] command from the
     | command line. spaCy does not currently add an entry to our #[code PATH]
     | environment variable, as this can lead to unexpected results, especially
-    | when using #[code virtualenv]. Run the command with #[code python -m],
+    | when using #[code virtualenv]. Instead, spaCy adds an auto-alias that
+    | maps #[code spacy] to #[code python -m spacy]. If this is not working as
+    | expected, run the command with #[code python -m], yourself –
     | for example #[code python -m spacy download en]. For more info on this,
     | see #[+api("cli#download") download].
@@ -10,8 +10,8 @@ p
 +h(2, "models") Install models and process text
 
 +code(false, "bash").
-    python -m spacy download en
-    python -m spacy download de
+    spacy download en
+    spacy download de
 
 +code.
     import spacy