Merge branch 'develop' into support-danish

2025-12-01 23:36:02 +03:00 · 2017-10-24 11:53:19 +02:00 · 2017-10-24 11:53:19 +02:00 · facf77e541
commit facf77e541
parent 85144835da ccd2ab1a62
452 changed files with 59179 additions and 12048 deletions
--- a/.appveyor.yml
+++ b/.appveyor.yml
@ -1 +1,55 @@
+environment:
+
+  matrix:
+
+    # For Python versions available on Appveyor, see
+    # http://www.appveyor.com/docs/installed-software#python
+    # The list here is complete (excluding Python 2.6, which
+    # isn't covered by this document) at the time of writing.
+
+    - PYTHON: "C:\\Python27"
+    #- PYTHON: "C:\\Python33"
+    #- PYTHON: "C:\\Python34"
+    #- PYTHON: "C:\\Python35"
+    #- PYTHON: "C:\\Python27-x64"
+    #- PYTHON: "C:\\Python33-x64"
+    #- DISTUTILS_USE_SDK: "1"
+    #- PYTHON: "C:\\Python34-x64"
+    #- DISTUTILS_USE_SDK: "1"
+    #- PYTHON: "C:\\Python35-x64"
+    - PYTHON: "C:\\Python36-x64"
+
+install:
+  # We need wheel installed to build wheels
+  - "%PYTHON%\\python.exe -m pip install wheel"
+  - "%PYTHON%\\python.exe -m pip install cython"
+  - "%PYTHON%\\python.exe -m pip install -r requirements.txt"
+  - "%PYTHON%\\python.exe -m pip install -e ."
+
 build: off
+
+test_script:
+  # Put your test command here.
+  # The "build.cmd" wrapper is not used in front of the command below: it is
+  # only needed to build C extensions on 64-bit Python 3.3 or 3.4, and neither
+  # of those interpreters is enabled in the matrix above.
+  # Note that you must use the environment variable %PYTHON% to refer to
+  # the interpreter you're using - Appveyor does not do anything special
+  # to put the Python version you want to use on PATH.
+  - "%PYTHON%\\python.exe -m pytest spacy/"
+
+after_test:
+  # This step builds your wheels.
+  # Again, you only need build.cmd if you're building C extensions for
+  # 64-bit Python 3.3/3.4. And you need to use %PYTHON% to get the correct
+  # interpreter
+  - "%PYTHON%\\python.exe setup.py bdist_wheel"
+
+artifacts:
+  # bdist_wheel puts your built wheel in the dist directory
+  - path: dist\*
+
+#on_success:
+#  You can use this step to upload your artifacts to a public website.
+#  See Appveyor's documentation for more details. Or you can simply
+#  access your wheels from the Appveyor "artifacts" tab for your build.
--- a/.buildkite/sdist.yml
+++ b/.buildkite/sdist.yml
@ -0,0 +1,11 @@
+steps:
+  -
+    command: "fab env clean make test sdist"
+    label: ":dizzy: :python:"
+    artifact_paths: "dist/*.tar.gz"
+  - wait
+  - trigger: "spacy-sdist-against-models"
+    label: ":dizzy: :hammer:"
+    build:
+      env:
+        SPACY_VERSION: "{$SPACY_VERSION}"
--- a/.github/contributors/ramananbalakrishnan.md
+++ b/.github/contributors/ramananbalakrishnan.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your 
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Ramanan Balakrishnan |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 2017-10-19           |
+| GitHub username                | ramananbalakrishnan  |
+| Website (optional)             |                      |
--- a/.gitignore
+++ b/.gitignore
@ -1,14 +1,12 @@
 # spaCy
 spacy/data/
 corpora/
-models/
+/models/
 keys/

 # Website
 website/www/
 website/_deploy.sh
-website/package.json
-website/announcement.jade
 website/.gitignore

 # Cython / C extensions
@ -40,7 +38,6 @@ venv/

 # Distribution / packaging
 env/
-bin/
 build/
 develop-eggs/
 dist/
--- a/.travis.yml
+++ b/.travis.yml
@ -14,8 +14,7 @@ os:
 env:
  - VIA=compile LC_ALL=en_US.ascii 
  - VIA=compile
-
-#  - VIA=sdist
+  #- VIA=pypi_nightly

 install:
  - "./travis.sh"
@ -23,7 +22,7 @@ install:
 script:
  - "pip install pytest pytest-timeout"
  - if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi
-  - if [[ "${VIA}" == "pypi" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(ospath.dirname(spacy.__file__)))"`; fi
+  - if [[ "${VIA}" == "pypi_nightly" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
  - if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
  
 notifications:
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1,3 +1,4 @@
 recursive-include include *.h
 include LICENSE
 include README.rst
+include bin/spacy
--- a/README.rst
+++ b/README.rst
@ -229,7 +229,7 @@ Compile from source
 The other way to install spaCy is to clone its
 `GitHub repository <https://github.com/explosion/spaCy>`_ and build it from
 source. That is the common way if you want to make changes to the code base.
-You'll need to make sure that you have a development enviroment consisting of a
+You'll need to make sure that you have a development environment consisting of a
 Python distribution including header files, a compiler,
 `pip <https://pip.pypa.io/en/latest/installing/>`__, `virtualenv <https://virtualenv.pypa.io/>`_
 and `git <https://git-scm.com>`_ installed. The compiler part is the trickiest.
--- a/bin/spacy
+++ b/bin/spacy
@ -0,0 +1 @@
+python -m spacy "$@"
--- a/examples/chainer_sentiment.py
+++ b/examples/chainer_sentiment.py
@ -1,322 +0,0 @@
-'''WIP --- Doesn't work well yet'''
-import plac
-import random
-import six
-
-import cProfile
-import pstats
-
-import pathlib
-import cPickle as pickle
-from itertools import izip
-
-import spacy
-
-import cytoolz
-import cupy as xp
-import cupy.cuda
-import chainer.cuda
-
-import chainer.links as L
-import chainer.functions as F
-from chainer import Chain, Variable, report
-import chainer.training
-import chainer.optimizers
-from chainer.training import extensions
-from chainer.iterators import SerialIterator
-from chainer.datasets import TupleDataset
-
-
-class SentimentAnalyser(object):
-    @classmethod
-    def load(cls, path, nlp, max_length=100):
-        raise NotImplementedError
-        #with (path / 'config.json').open() as file_:
-        #    model = model_from_json(file_.read())
-        #with (path / 'model').open('rb') as file_:
-        #    lstm_weights = pickle.load(file_)
-        #embeddings = get_embeddings(nlp.vocab)
-        #model.set_weights([embeddings] + lstm_weights)
-        #return cls(model, max_length=max_length)
-
-    def __init__(self, model, max_length=100):
-        self._model = model
-        self.max_length = max_length
-
-    def __call__(self, doc):
-        X = get_features([doc], self.max_length)
-        y = self._model.predict(X)
-        self.set_sentiment(doc, y)
-
-    def pipe(self, docs, batch_size=1000, n_threads=2):
-        for minibatch in cytoolz.partition_all(batch_size, docs):
-            minibatch = list(minibatch)
-            sentences = []
-            for doc in minibatch:
-                sentences.extend(doc.sents)
-            Xs = get_features(sentences, self.max_length)
-            ys = self._model.predict(Xs)
-            for sent, label in zip(sentences, ys):
-                sent.doc.sentiment += label - 0.5
-            for doc in minibatch:
-                yield doc
-
-    def set_sentiment(self, doc, y):
-        doc.sentiment = float(y[0])
-        # Sentiment has a native slot for a single float.
-        # For arbitrary data storage, there's:
-        # doc.user_data['my_data'] = y
-
-
-class Classifier(Chain):
-    def __init__(self, predictor):
-        super(Classifier, self).__init__(predictor=predictor)
-
-    def __call__(self, x, t):
-        y = self.predictor(x)
-        loss = F.softmax_cross_entropy(y, t)
-        accuracy = F.accuracy(y, t)
-        report({'loss': loss, 'accuracy': accuracy}, self)
-        return loss
-
-
-class SentimentModel(Chain):
-    def __init__(self, nlp, shape, **settings):
-        Chain.__init__(self,
-            embed=_Embed(shape['nr_vector'], shape['nr_dim'], shape['nr_hidden'],
-                set_vectors=lambda arr: set_vectors(arr, nlp.vocab)),
-            encode=_Encode(shape['nr_hidden'], shape['nr_hidden']),
-            attend=_Attend(shape['nr_hidden'], shape['nr_hidden']),
-            predict=_Predict(shape['nr_hidden'], shape['nr_class']))
-        self.to_gpu(0)
-
-    def __call__(self, sentence):
-        return self.predict(
-                  self.attend(
-                      self.encode(
-                          self.embed(sentence))))
-
-
-class _Embed(Chain):
-    def __init__(self, nr_vector, nr_dim, nr_out, set_vectors=None):
-        Chain.__init__(self,
-            embed=L.EmbedID(nr_vector, nr_dim, initialW=set_vectors),
-            project=L.Linear(None, nr_out, nobias=True))
-        self.embed.W.volatile = False
-
-    def __call__(self, sentence):
-        return [self.project(self.embed(ts)) for ts in F.transpose(sentence)]
-
-
-class _Encode(Chain):
-    def __init__(self, nr_in, nr_out):
-        Chain.__init__(self,
-            fwd=L.LSTM(nr_in, nr_out),
-            bwd=L.LSTM(nr_in, nr_out),
-            mix=L.Bilinear(nr_out, nr_out, nr_out))
-
-    def __call__(self, sentence):
-        self.fwd.reset_state()
-        fwds = map(self.fwd, sentence)
-        self.bwd.reset_state()
-        bwds = reversed(map(self.bwd, reversed(sentence)))
-        return [F.elu(self.mix(f, b)) for f, b in zip(fwds, bwds)]
-
-
-class _Attend(Chain):
-    def __init__(self, nr_in, nr_out):
-        Chain.__init__(self)
-
-    def __call__(self, sentence):
-        sent = sum(sentence)
-        return sent
-
-
-class _Predict(Chain):
-    def __init__(self, nr_in, nr_out):
-        Chain.__init__(self,
-            l1=L.Linear(nr_in, nr_in),
-            l2=L.Linear(nr_in, nr_out))
-
-    def __call__(self, vector):
-        vector = self.l1(vector)
-        vector = F.elu(vector)
-        vector = self.l2(vector)
-        return vector
-
-
-class SentenceDataset(TupleDataset):
-    def __init__(self, nlp, texts, labels, max_length):
-        self.max_length = max_length
-        sents, labels = self._get_labelled_sentences(
-            nlp.pipe(texts, batch_size=5000, n_threads=3),
-            labels)
-        TupleDataset.__init__(self,
-            get_features(sents, max_length),
-            labels)
-
-    def __getitem__(self, index):
-        batches = [dataset[index] for dataset in self._datasets]
-        if isinstance(index, slice):
-            length = len(batches[0])
-            returns = [tuple([batch[i] for batch in batches])
-                       for i in six.moves.range(length)]
-            return returns
-        else:
-            return tuple(batches)
-
-    def _get_labelled_sentences(self, docs, doc_labels):
-        labels = []
-        sentences = []
-        for doc, y in izip(docs, doc_labels):
-            for sent in doc.sents:
-                sentences.append(sent)
-                labels.append(y)
-        return sentences, xp.asarray(labels, dtype='i')
-
-
-class DocDataset(TupleDataset):
-    def __init__(self, nlp, texts, labels):
-        self.max_length = max_length
-        DatasetMixin.__init__(self,
-            get_features(
-                nlp.pipe(texts, batch_size=5000, n_threads=3), self.max_length),
-            labels)
-
-def read_data(data_dir, limit=0):
-    examples = []
-    for subdir, label in (('pos', 1), ('neg', 0)):
-        for filename in (data_dir / subdir).iterdir():
-            with filename.open() as file_:
-                text = file_.read()
-            examples.append((text, label))
-    random.shuffle(examples)
-    if limit >= 1:
-        examples = examples[:limit]
-    return zip(*examples) # Unzips into two lists
-
-
-def get_features(docs, max_length):
-    docs = list(docs)
-    Xs = xp.zeros((len(docs), max_length), dtype='i')
-    for i, doc in enumerate(docs):
-        j = 0
-        for token in doc:
-            if token.has_vector and not token.is_punct and not token.is_space:
-                Xs[i, j] = token.norm
-                j += 1
-                if j >= max_length:
-                    break
-    return Xs
-
-
-def set_vectors(vectors, vocab):
-    for lex in vocab:
-        if lex.has_vector and (lex.rank+1) < vectors.shape[0]:
-            lex.norm = lex.rank+1
-            vectors[lex.rank + 1] = lex.vector
-        else:
-            lex.norm = 0
-    return vectors
-
-
-def train(train_texts, train_labels, dev_texts, dev_labels,
-        lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5,
-        by_sentence=True):
-    nlp = spacy.load('en', entity=False)
-    if 'nr_vector' not in lstm_shape:
-        lstm_shape['nr_vector'] = max(lex.rank+1 for lex in nlp.vocab if lex.has_vector)
-    if 'nr_dim' not in lstm_shape:
-        lstm_shape['nr_dim'] = nlp.vocab.vectors_length
-    print("Make model")
-    model = Classifier(SentimentModel(nlp, lstm_shape, **lstm_settings))
-    print("Parsing texts...")
-    if by_sentence:
-        train_data = SentenceDataset(nlp, train_texts, train_labels, lstm_shape['max_length'])
-        dev_data = SentenceDataset(nlp, dev_texts, dev_labels, lstm_shape['max_length'])
-    else:
-        train_data = DocDataset(nlp, train_texts, train_labels)
-        dev_data = DocDataset(nlp, dev_texts, dev_labels)
-    train_iter = SerialIterator(train_data, batch_size=batch_size,
-                                shuffle=True, repeat=True)
-    dev_iter = SerialIterator(dev_data, batch_size=batch_size,
-                              shuffle=False, repeat=False)
-    optimizer = chainer.optimizers.Adam()
-    optimizer.setup(model)
-    updater = chainer.training.StandardUpdater(train_iter, optimizer, device=0)
-    trainer = chainer.training.Trainer(updater, (1, 'epoch'), out='result')
-
-    trainer.extend(extensions.Evaluator(dev_iter, model, device=0))
-    trainer.extend(extensions.LogReport())
-    trainer.extend(extensions.PrintReport([
-        'epoch', 'main/accuracy', 'validation/main/accuracy']))
-    trainer.extend(extensions.ProgressBar())
-    
-    trainer.run()
-
-
-def evaluate(model_dir, texts, labels, max_length=100):
-    def create_pipeline(nlp):
-        '''
-        This could be a lambda, but named functions are easier to read in Python.
-        '''
-        return [nlp.tagger, nlp.parser, SentimentAnalyser.load(model_dir, nlp,
-                                                               max_length=max_length)]
-    
-    nlp = spacy.load('en')
-    nlp.pipeline = create_pipeline(nlp)
-
-    correct = 0
-    i = 0 
-    for doc in nlp.pipe(texts, batch_size=1000, n_threads=4):
-        correct += bool(doc.sentiment >= 0.5) == bool(labels[i])
-        i += 1
-    return float(correct) / i
-
-
-@plac.annotations(
-    train_dir=("Location of training file or directory"),
-    dev_dir=("Location of development file or directory"),
-    model_dir=("Location of output model directory",),
-    is_runtime=("Demonstrate run-time usage", "flag", "r", bool),
-    nr_hidden=("Number of hidden units", "option", "H", int),
-    max_length=("Maximum sentence length", "option", "L", int),
-    dropout=("Dropout", "option", "d", float),
-    learn_rate=("Learn rate", "option", "e", float),
-    nb_epoch=("Number of training epochs", "option", "i", int),
-    batch_size=("Size of minibatches for training LSTM", "option", "b", int),
-    nr_examples=("Limit to N examples", "option", "n", int)
-)
-def main(model_dir, train_dir, dev_dir,
-         is_runtime=False,
-         nr_hidden=64, max_length=100, # Shape
-         dropout=0.5, learn_rate=0.001, # General NN config
-         nb_epoch=5, batch_size=32, nr_examples=-1):  # Training params
-    model_dir = pathlib.Path(model_dir)
-    train_dir = pathlib.Path(train_dir)
-    dev_dir = pathlib.Path(dev_dir)
-    if is_runtime:
-        dev_texts, dev_labels = read_data(dev_dir)
-        acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length)
-        print(acc)
-    else:
-        print("Read data")
-        train_texts, train_labels = read_data(train_dir, limit=nr_examples)
-        dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples)
-        print("Using GPU 0")
-        #chainer.cuda.get_device(0).use()
-        train_labels = xp.asarray(train_labels, dtype='i')
-        dev_labels = xp.asarray(dev_labels, dtype='i')
-        lstm = train(train_texts, train_labels, dev_texts, dev_labels,
-                     {'nr_hidden': nr_hidden, 'max_length': max_length, 'nr_class': 2,
-                      'nr_vector': 5000},
-                      {'dropout': 0.5, 'lr': learn_rate},
-                      {},
-                      nb_epoch=nb_epoch, batch_size=batch_size)
-
-
-if __name__ == '__main__':
-    #cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
-    #s = pstats.Stats("Profile.prof")
-    #s.strip_dirs().sort_stats("time").print_stats()
-    plac.call(main)
--- a/examples/nn_text_class.py
+++ b/examples/nn_text_class.py
@ -1,281 +0,0 @@
-"""This script expects something like a binary sentiment data set, such as
- that available here: `http://www.cs.cornell.edu/people/pabo/movie-review-data/`
-
-It expects a directory structure like: `data_dir/train/{pos|neg}`
- and `data_dir/test/{pos|neg}`. Put (say) 90% of the files in the former
- and the remainder in the latter.
-"""
-
-from __future__ import unicode_literals
-from __future__ import print_function
-from __future__ import division
-
-from collections import defaultdict
-from pathlib import Path
-import numpy
-import plac
-
-import spacy.en
-
-
-def read_data(nlp, data_dir):
-    for subdir, label in (('pos', 1), ('neg', 0)):
-        for filename in (data_dir / subdir).iterdir():
-            text = filename.open().read()
-            doc = nlp(text)
-            if len(doc) >= 1:
-                yield doc, label
-
-
-def partition(examples, split_size):
-    examples = list(examples)
-    numpy.random.shuffle(examples)
-    n_docs = len(examples)
-    split = int(n_docs * split_size)
-    return examples[:split], examples[split:]
-
-
-def minibatch(data, bs=24):
-    for i in range(0, len(data), bs):
-        yield data[i:i+bs]
-
-
-class Extractor(object):
-    def __init__(self, nlp, vector_length, dropout=0.3):
-        self.nlp = nlp
-        self.dropout = dropout
-        self.vector = numpy.zeros((vector_length, ))
-
-    def doc2bow(self, doc, dropout=None):
-        if dropout is None:
-            dropout = self.dropout
-        bow = defaultdict(int)
-        all_words = defaultdict(int)
-        for word in doc:
-            if numpy.random.random() >= dropout and not word.is_punct:
-                bow[word.lower] += 1
-            all_words[word.lower] += 1
-        if sum(bow.values()) >= 1:
-            return bow
-        else:
-            return all_words
-
-    def bow2vec(self, bow, E):
-        self.vector.fill(0)
-        n = 0
-        for orth_id, freq in bow.items():
-            self.vector += self.nlp.vocab[self.nlp.vocab.strings[orth_id]].vector * freq
-            # Apply the fine-tuning we've learned
-            if orth_id < E.shape[0]:
-                self.vector += E[orth_id] * freq
-            n += freq
-        return self.vector / n
-
-
-class NeuralNetwork(object):
-    def __init__(self, depth, width, n_classes, n_vocab, extracter, optimizer):
-        self.depth = depth
-        self.width = width
-        self.n_classes = n_classes
-        self.weights = Params.random(depth, width, width, n_classes, n_vocab)
-        self.doc2bow = extracter.doc2bow
-        self.bow2vec = extracter.bow2vec
-        self.optimizer = optimizer
-        self._gradient = Params.zero(depth, width, width, n_classes, n_vocab)
-        self._activity = numpy.zeros((depth, width))
-
-    def train(self, batch):
-        activity = self._activity
-        gradient = self._gradient
-        activity.fill(0)
-        gradient.data.fill(0)
-        loss = 0
-        word_freqs = defaultdict(int)
-        for doc, label in batch:
-            word_ids = self.doc2bow(doc)
-            vector = self.bow2vec(word_ids, self.weights.E)
-            self.forward(activity, vector)
-            loss += self.backprop(vector, gradient, activity, word_ids, label)
-            for w, freq in word_ids.items():
-                word_freqs[w] += freq
-        self.optimizer(self.weights, gradient, len(batch), word_freqs)
-        return loss
-
-    def predict(self, doc):
-        actv = self._activity
-        actv.fill(0)
-        W = self.weights.W
-        b = self.weights.b
-        E = self.weights.E
-        
-        vector = self.bow2vec(self.doc2bow(doc, dropout=0.0), E)
-        self.forward(actv, vector)
-        return numpy.argmax(softmax(actv[-1], W[-1], b[-1]))
-
-    def forward(self, actv, in_):
-        actv.fill(0)
-        W = self.weights.W; b = self.weights.b
-        actv[0] = relu(in_, W[0], b[0])
-        for i in range(1, self.depth):
-            actv[i] = relu(actv[i-1], W[i], b[i])
-
-    def backprop(self, input_vector, gradient, activity, ids, label):
-        W = self.weights.W
-        b = self.weights.b
-
-        target = numpy.zeros(self.n_classes)
-        target[label] = 1.0
-        pred = softmax(activity[-1], W[-1], b[-1])
-        delta = pred - target
-
-        for i in range(self.depth, 0, -1):
-            gradient.b[i] += delta
-            gradient.W[i] += numpy.outer(delta, activity[i-1])
-            delta = d_relu(activity[i-1]) * W[i].T.dot(delta)
-
-        gradient.b[0] += delta
-        gradient.W[0] += numpy.outer(delta, input_vector)
-        tuning = W[0].T.dot(delta).reshape((self.width,)) / len(ids)
-        for w, freq in ids.items():
-            if w < gradient.E.shape[0]:
-                gradient.E[w] += tuning * freq
-        return -sum(target * numpy.log(pred))
-
-
-def softmax(actvn, W, b):
-    w = W.dot(actvn) + b
-    ew = numpy.exp(w - max(w))
-    return (ew / sum(ew)).ravel()
-
-
-def relu(actvn, W, b):
-    x = W.dot(actvn) + b
-    return x * (x > 0)
-
-
-def d_relu(x):
-    return x > 0
-
-
-class Adagrad(object):
-    def __init__(self, lr, rho):
-        self.eps = 1e-3
-        # initial learning rate
-        self.learning_rate = lr
-        self.rho = rho
-        # stores sum of squared gradients 
-        #self.h = numpy.zeros(self.dim)
-        #self._curr_rate = numpy.zeros(self.h.shape)
-        self.h = None
-        self._curr_rate = None
-    
-    def __call__(self, weights, gradient, batch_size, word_freqs):
-        if self.h is None:
-            self.h = numpy.zeros(gradient.data.shape)
-            self._curr_rate = numpy.zeros(gradient.data.shape)
-        self.L2_penalty(gradient, weights, word_freqs)
-        update = self.rescale(gradient.data / batch_size)
-        weights.data -= update
-
-    def rescale(self, gradient):
-        if self.h is None:
-            self.h = numpy.zeros(gradient.data.shape)
-            self._curr_rate = numpy.zeros(gradient.data.shape)
-        self._curr_rate.fill(0)
-        self.h += gradient ** 2
-        self._curr_rate = self.learning_rate / (numpy.sqrt(self.h) + self.eps)
-        return self._curr_rate * gradient
-
-    def L2_penalty(self, gradient, weights, word_freqs):
-        # L2 Regularization
-        for i in range(len(weights.W)):
-            gradient.W[i] += weights.W[i] * self.rho
-            gradient.b[i] += weights.b[i] * self.rho
-        for w, freq in word_freqs.items():
-            if w < gradient.E.shape[0]:
-                gradient.E[w] += weights.E[w] * self.rho
-
-
-class Params(object):
-    @classmethod
-    def zero(cls, depth, n_embed, n_hidden, n_labels, n_vocab):
-        return cls(depth, n_embed, n_hidden, n_labels, n_vocab, lambda x: numpy.zeros((x,)))
-
-    @classmethod
-    def random(cls, depth, nE, nH, nL, nV):
-        return cls(depth, nE, nH, nL, nV, lambda x: (numpy.random.rand(x) * 2 - 1) * 0.08)
-
-    def __init__(self, depth, n_embed, n_hidden, n_labels, n_vocab, initializer):
-        nE = n_embed; nH = n_hidden; nL = n_labels; nV = n_vocab
-        n_weights = sum([
-            (nE * nH) + nH, 
-            (nH * nH  + nH) * depth,
-            (nH * nL) + nL,
-            (nV * nE)
-        ])
-        self.data = initializer(n_weights)
-        self.W = []
-        self.b = []
-        i = self._add_layer(0, nE, nH)
-        for _ in range(1, depth):
-            i = self._add_layer(i, nH, nH)
-        i = self._add_layer(i, nL, nH)
-        self.E = self.data[i : i + (nV * nE)].reshape((nV, nE))
-        self.E.fill(0)
-
-    def _add_layer(self, start, x, y):
-        end = start + (x * y)
-        self.W.append(self.data[start : end].reshape((x, y)))
-        self.b.append(self.data[end : end + x].reshape((x, )))
-        return end + x
-
-
-@plac.annotations(
-    data_dir=("Data directory", "positional", None, Path),
-    n_iter=("Number of iterations (epochs)", "option", "i", int),
-    width=("Size of hidden layers", "option", "H", int),
-    depth=("Depth", "option", "d", int),
-    dropout=("Drop-out rate", "option", "r", float),
-    rho=("Regularization penalty", "option", "p", float),
-    eta=("Learning rate", "option", "e", float),
-    batch_size=("Batch size", "option", "b", int),
-    vocab_size=("Number of words to fine-tune", "option", "w", int),
-)
-def main(data_dir, depth=3, width=300, n_iter=5, vocab_size=40000,
-         batch_size=24, dropout=0.3, rho=1e-5, eta=0.005):
-    n_classes = 2
-    print("Loading")
-    nlp = spacy.en.English(parser=False)
-    train_data, dev_data = partition(read_data(nlp, data_dir / 'train'), 0.8)
-    print("Begin training")
-    extracter = Extractor(nlp, width, dropout=0.3)
-    optimizer = Adagrad(eta, rho)
-    model = NeuralNetwork(depth, width, n_classes, vocab_size, extracter, optimizer)
-    prev_best = 0
-    best_weights = None
-    for epoch in range(n_iter):
-        numpy.random.shuffle(train_data)
-        train_loss = 0.0
-        for batch in minibatch(train_data, bs=batch_size):
-            train_loss += model.train(batch)
-        n_correct = sum(model.predict(x) == y for x, y in dev_data)
-        print(epoch, train_loss, n_correct / len(dev_data))
-        if n_correct >= prev_best:
-            best_weights = model.weights.data.copy()
-            prev_best = n_correct
-
-    model.weights.data = best_weights
-    print("Evaluating")
-    eval_data = list(read_data(nlp, data_dir / 'test'))
-    n_correct = sum(model.predict(x) == y for x, y in eval_data)
-    print(n_correct / len(eval_data))
- 
-
-
-if __name__ == '__main__':
-    #import cProfile
-    #import pstats
-    #cProfile.runctx("main(Path('data/aclImdb'))", globals(), locals(), "Profile.prof")
-    #s = pstats.Stats("Profile.prof")
-    #s.strip_dirs().sort_stats("time").print_stats(100)
-    plac.call(main)
--- a/examples/multi_word_matches.py
+++ b/examples/multi_word_matches.py
@ -20,72 +20,72 @@ The algorithm is O(n) at run-time for document of length n because we're only ev
 matching over the tag patterns. So no matter how many phrases we're looking for,
 our pattern set stays very small (exact size depends on the maximum length we're
 looking for, as the query language currently has no quantifiers)
+
+The example expects a .bz2 file from the Reddit corpus, and a patterns file
+in JSONL format (one JSON object per line), with entries like this:
+
+{"text":"Anchorage"}
+{"text":"Angola"}
+{"text":"Ann Arbor"}
+{"text":"Annapolis"}
+{"text":"Appalachia"}
+{"text":"Argentina"}
 """
 from __future__ import print_function, unicode_literals, division
-from ast import literal_eval
 from bz2 import BZ2File
 import time
 import math
 import codecs

 import plac
+import ujson

-from preshed.maps import PreshMap
-from preshed.counter import PreshCounter
-from spacy.strings import hash_string
-from spacy.en import English
 from spacy.matcher import PhraseMatcher
+import spacy


 def read_gazetteer(tokenizer, loc, n=-1):
    for i, line in enumerate(open(loc)):
-        phrase = literal_eval('u' + line.strip())
-        if ' (' in phrase and phrase.endswith(')'):
-            phrase = phrase.split(' (', 1)[0]
-        if i >= n:
-            break
-        phrase = tokenizer(phrase)
-        if all((t.is_lower and t.prob >= -10) for t in phrase):
-            continue
+        data = ujson.loads(line.strip())
+        phrase = tokenizer(data['text'])
+        for w in phrase:
+            _ = tokenizer.vocab[w.text]
        if len(phrase) >= 2:
            yield phrase


-def read_text(bz2_loc):
+def read_text(bz2_loc, n=10000):
    with BZ2File(bz2_loc) as file_:
-        for line in file_:
-            yield line.decode('utf8')
+        for i, line in enumerate(file_):
+            data = ujson.loads(line)
+            yield data['body']
+            if i >= n:
+                break


 def get_matches(tokenizer, phrases, texts, max_length=6):
-    matcher = PhraseMatcher(tokenizer.vocab, phrases, max_length=max_length)
-    print("Match")
+    matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length)
+    matcher.add('Phrase', None, *phrases)
    for text in texts:
        doc = tokenizer(text)
+        for w in doc:
+            _ = doc.vocab[w.text]
        matches = matcher(doc)
-        for mwe in doc.ents:
-            yield mwe
+        for ent_id, start, end in matches:
+            yield (ent_id, doc[start:end].text)


-def main(patterns_loc, text_loc, counts_loc, n=10000000):
-    nlp = English(parser=False, tagger=False, entity=False)
-    print("Make matcher")
-    phrases = read_gazetteer(nlp.tokenizer, patterns_loc, n=n)
-    counts = PreshCounter()
+def main(patterns_loc, text_loc, n=10000):
+    nlp = spacy.blank('en')
+    nlp.vocab.lex_attr_getters = {}
+    phrases = read_gazetteer(nlp.tokenizer, patterns_loc)
+    count = 0
    t1 = time.time()
-    for mwe in get_matches(nlp.tokenizer, phrases, read_text(text_loc)):
-        counts.inc(hash_string(mwe.text), 1)
+    for ent_id, text in get_matches(nlp.tokenizer, phrases, read_text(text_loc, n=n)):
+        count += 1
    t2 = time.time()
-    print("10m tokens in %d s" % (t2 - t1))
-    
-    with codecs.open(counts_loc, 'w', 'utf8') as file_:
-        for phrase in read_gazetteer(nlp.tokenizer, patterns_loc, n=n):
-            text = phrase.string
-            key = hash_string(text)
-            count = counts[key]
-            if count != 0:
-                file_.write('%d\t%s\n' % (count, text))
-    
+    print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count))
+

 if __name__ == '__main__':
    if False:
--- a/examples/pipeline/custom_attr_methods.py
+++ b/examples/pipeline/custom_attr_methods.py
@ -0,0 +1,52 @@
+# coding: utf-8
+"""This example contains several snippets of methods that can be set via custom
+Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like
+they're "bound" to the object and are partially applied – i.e. the object
+they're called on is passed in as the first argument."""
+from __future__ import unicode_literals
+
+from spacy.lang.en import English
+from spacy.tokens import Doc, Span
+from spacy import displacy
+from pathlib import Path
+
+
+def to_html(doc, output='/tmp', style='dep'):
+    """Doc method extension for saving the current state as a displaCy
+    visualization.
+    """
+    # generate filename from the non-punct tokens among the first six tokens
+    file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html'
+    output_path = Path(output) / file_name
+    html = displacy.render(doc, style=style, page=True)  # render markup
+    output_path.open('w', encoding='utf-8').write(html)  # save to file
+    print('Saved HTML to {}'.format(output_path))
+
+
+Doc.set_extension('to_html', method=to_html)
+
+nlp = English()
+doc = nlp(u"This is a sentence about Apple.")
+# add entity manually for demo purposes, to make it work without a model
+doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])]
+doc._.to_html(style='ent')
+
+
+def overlap_tokens(doc, other_doc):
+    """Get the tokens from the original Doc that are also in the comparison Doc.
+    """
+    overlap = []
+    other_tokens = [token.text for token in other_doc]
+    for token in doc:
+        if token.text in other_tokens:
+            overlap.append(token)
+    return overlap
+
+
+Doc.set_extension('overlap', method=overlap_tokens)
+
+nlp = English()
+doc1 = nlp(u"Peach emoji is where it has always been.")
+doc2 = nlp(u"Peach is the superior emoji.")
+tokens = doc1._.overlap(doc2)
+print(tokens)
--- a/examples/pipeline/custom_component_countries_api.py
+++ b/examples/pipeline/custom_component_countries_api.py
@ -0,0 +1,108 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import requests
+
+from spacy.lang.en import English
+from spacy.matcher import PhraseMatcher
+from spacy.tokens import Doc, Span, Token
+
+
+class RESTCountriesComponent(object):
+    """Example of a spaCy v2.0 pipeline component that requests all countries
+    via the REST Countries API, merges country names into one token, assigns
+    entity labels and sets attributes on country tokens, e.g. the capital and
+    lat/lng coordinates. Can be extended with more details from the API.
+
+    REST Countries API: https://restcountries.eu
+    API License: Mozilla Public License MPL 2.0
+    """
+    name = 'rest_countries' # component name, will show up in the pipeline
+
+    def __init__(self, nlp, label='GPE'):
+        """Initialise the pipeline component. The shared nlp instance is used
+        to initialise the matcher with the shared vocab, get the label ID and
+        generate Doc objects as phrase match patterns.
+        """
+        # Make request once on initialisation and store the data
+        r = requests.get('https://restcountries.eu/rest/v2/all')
+        r.raise_for_status()  # make sure requests raises an error if it fails
+        countries = r.json()
+
+        # Convert API response to dict keyed by country name for easy lookup
+        # This could also be extended using the alternative and foreign language
+        # names provided by the API
+        self.countries = {c['name']: c for c in countries}
+        self.label = nlp.vocab.strings[label]  # get entity label ID
+
+        # Set up the PhraseMatcher with Doc patterns for each country name
+        patterns = [nlp(c) for c in self.countries.keys()]
+        self.matcher = PhraseMatcher(nlp.vocab)
+        self.matcher.add('COUNTRIES', None, *patterns)
+
+        # Register attribute on the Token. We'll be overwriting this based on
+        # the matches, so we're only setting a default value, not a getter.
+        # If no default value is set, it defaults to None.
+        Token.set_extension('is_country', default=False)
+        Token.set_extension('country_capital')
+        Token.set_extension('country_latlng')
+        Token.set_extension('country_flag')
+
+        # Register attributes on Doc and Span via a getter that checks if one of
+        # the contained tokens is set to is_country == True.
+        Doc.set_extension('has_country', getter=self.has_country)
+        Span.set_extension('has_country', getter=self.has_country)
+
+
+    def __call__(self, doc):
+        """Apply the pipeline component on a Doc object and modify it if matches
+        are found. Return the Doc, so it can be processed by the next component
+        in the pipeline, if available.
+        """
+        matches = self.matcher(doc)
+        spans = []  # keep the spans for later so we can merge them afterwards
+        for _, start, end in matches:
+            # Generate Span representing the entity & set label
+            entity = Span(doc, start, end, label=self.label)
+            spans.append(entity)
+            # Set custom attribute on each token of the entity
+            # Can be extended with other data returned by the API, like
+            # currencies, country code, flag, calling code etc.
+            for token in entity:
+                token._.set('is_country', True)
+                token._.set('country_capital', self.countries[entity.text]['capital'])
+                token._.set('country_latlng', self.countries[entity.text]['latlng'])
+                token._.set('country_flag', self.countries[entity.text]['flag'])
+            # Add entity to the existing doc.ents – don't overwrite the others!
+            doc.ents = list(doc.ents) + [entity]
+        for span in spans:
+            # Iterate over all spans and merge them into one token. This is done
+            # after setting the entities – otherwise, it would cause mismatched
+            # indices!
+            span.merge()
+        return doc  # don't forget to return the Doc!
+
+    def has_country(self, tokens):
+        """Getter for Doc and Span attributes. Returns True if one of the tokens
+        is a country. Since the getter is only called when we access the
+        attribute, we can refer to the Token's 'is_country' attribute here,
+        which is already set in the processing step."""
+        return any([t._.get('is_country') for t in tokens])
+
+
+# For simplicity, we start off with only the blank English Language class and
+# no model or pre-defined pipeline loaded.
+
+nlp = English()
+rest_countries = RESTCountriesComponent(nlp)  # initialise component
+nlp.add_pipe(rest_countries) # add it to the pipeline
+
+doc = nlp(u"Some text about Colombia and the Czech Republic")
+
+print('Pipeline', nlp.pipe_names)  # pipeline contains component name
+print('Doc has countries', doc._.has_country)  # Doc contains countries
+for token in doc:
+    if token._.is_country:
+        print(token.text, token._.country_capital, token._.country_latlng,
+              token._.country_flag)  # country data
+print('Entities', [(e.text, e.label_) for e in doc.ents])  # all countries are entities
--- a/examples/pipeline/custom_component_entities.py
+++ b/examples/pipeline/custom_component_entities.py
@ -0,0 +1,85 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from spacy.lang.en import English
+from spacy.matcher import PhraseMatcher
+from spacy.tokens import Doc, Span, Token
+
+
+class TechCompanyRecognizer(object):
+    """Example of a spaCy v2.0 pipeline component that sets entity annotations
+    based on a list of single or multiple-word company names. Companies are
+    labelled as ORG and their spans are merged into one token. Additionally,
+    ._.has_tech_org and ._.is_tech_org are set on the Doc/Span and Token
+    respectively."""
+    name = 'tech_companies'  # component name, will show up in the pipeline
+
+    def __init__(self, nlp, companies=tuple(), label='ORG'):
+        """Initialise the pipeline component. The shared nlp instance is used
+        to initialise the matcher with the shared vocab, get the label ID and
+        generate Doc objects as phrase match patterns.
+        """
+        self.label = nlp.vocab.strings[label]  # get entity label ID
+
+        # Set up the PhraseMatcher – it can now take Doc objects as patterns,
+        # so even if the list of companies is long, it's very efficient
+        patterns = [nlp(org) for org in companies]
+        self.matcher = PhraseMatcher(nlp.vocab)
+        self.matcher.add('TECH_ORGS', None, *patterns)
+
+        # Register attribute on the Token. We'll be overwriting this based on
+        # the matches, so we're only setting a default value, not a getter.
+        Token.set_extension('is_tech_org', default=False)
+
+        # Register attributes on Doc and Span via a getter that checks if one of
+        # the contained tokens is set to is_tech_org == True.
+        Doc.set_extension('has_tech_org', getter=self.has_tech_org)
+        Span.set_extension('has_tech_org', getter=self.has_tech_org)
+
+    def __call__(self, doc):
+        """Apply the pipeline component on a Doc object and modify it if matches
+        are found. Return the Doc, so it can be processed by the next component
+        in the pipeline, if available.
+        """
+        matches = self.matcher(doc)
+        spans = []  # keep the spans for later so we can merge them afterwards
+        for _, start, end in matches:
+            # Generate Span representing the entity & set label
+            entity = Span(doc, start, end, label=self.label)
+            spans.append(entity)
+            # Set custom attribute on each token of the entity
+            for token in entity:
+                token._.set('is_tech_org', True)
+            # Add entity to the existing doc.ents – don't overwrite the others!
+            doc.ents = list(doc.ents) + [entity]
+        for span in spans:
+            # Iterate over all spans and merge them into one token. This is done
+            # after setting the entities – otherwise, it would cause mismatched
+            # indices!
+            span.merge()
+        return doc  # don't forget to return the Doc!
+
+    def has_tech_org(self, tokens):
+        """Getter for Doc and Span attributes. Returns True if one of the tokens
+        is a tech org. Since the getter is only called when we access the
+        attribute, we can refer to the Token's 'is_tech_org' attribute here,
+        which is already set in the processing step."""
+        return any([t._.get('is_tech_org') for t in tokens])
+
+
+# For simplicity, we start off with only the blank English Language class and
+# no model or pre-defined pipeline loaded.
+
+nlp = English()
+companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple']  # etc.
+component = TechCompanyRecognizer(nlp, companies)  # initialise component
+nlp.add_pipe(component, last=True)  # add it to the pipeline as the last element
+
+doc = nlp(u"Alphabet Inc. is the company behind Google.")
+
+print('Pipeline', nlp.pipe_names)  # pipeline contains component name
+print('Tokens', [t.text for t in doc])  # company names from the list are merged
+print('Doc has_tech_org', doc._.has_tech_org)  # Doc contains tech orgs
+print('Token 0 is_tech_org', doc[0]._.is_tech_org)  # "Alphabet Inc." is a tech org
+print('Token 1 is_tech_org', doc[1]._.is_tech_org)  # "is" is not
+print('Entities', [(e.text, e.label_) for e in doc.ents])  # all orgs are entities
--- a/examples/training/train_ner_standalone.py
+++ b/examples/training/train_ner_standalone.py
@ -6,31 +6,36 @@ To achieve that, it duplicates some of spaCy's internal functionality.

 Specifically, in this example, we don't use spaCy's built-in Language class to
 wire together the Vocab, Tokenizer and EntityRecognizer. Instead, we write
-our own simle Pipeline class, so that it's easier to see how the pieces
+our own simple Pipeline class, so that it's easier to see how the pieces
 interact.

 Input data:
 https://www.lt.informatik.tu-darmstadt.de/fileadmin/user_upload/Group_LangTech/data/GermEval2014_complete_data.zip

 Developed for: spaCy 1.7.1
-Last tested for: spaCy 1.7.1
+Last tested for: spaCy 2.0.0a13
 '''
 from __future__ import unicode_literals, print_function
 import plac
 from pathlib import Path
 import random
 import json
+import tqdm
+
+from thinc.neural.optimizers import Adam
+from thinc.neural.ops import NumpyOps

-import spacy.orth as orth_funcs
 from spacy.vocab import Vocab
-from spacy.pipeline import BeamEntityRecognizer
-from spacy.pipeline import EntityRecognizer
+from spacy.pipeline import TokenVectorEncoder, NeuralEntityRecognizer
 from spacy.tokenizer import Tokenizer
 from spacy.tokens import Doc
 from spacy.attrs import *
 from spacy.gold import GoldParse
-from spacy.gold import _iob_to_biluo as iob_to_biluo
+from spacy.gold import iob_to_biluo
+from spacy.gold import minibatch
 from spacy.scorer import Scorer
+import spacy.util
+

 try:
    unicode
@ -38,96 +43,38 @@ except NameError:
    unicode = str


+spacy.util.set_env_log(True)
+
+
 def init_vocab():
    return Vocab(
        lex_attr_getters={
            LOWER: lambda string: string.lower(),
-            SHAPE: orth_funcs.word_shape,
+            NORM: lambda string: string.lower(),
            PREFIX: lambda string: string[0],
            SUFFIX: lambda string: string[-3:],
-            CLUSTER: lambda string: 0,
-            IS_ALPHA: orth_funcs.is_alpha,
-            IS_ASCII: orth_funcs.is_ascii,
-            IS_DIGIT: lambda string: string.isdigit(),
-            IS_LOWER: orth_funcs.is_lower,
-            IS_PUNCT: orth_funcs.is_punct,
-            IS_SPACE: lambda string: string.isspace(),
-            IS_TITLE: orth_funcs.is_title,
-            IS_UPPER: orth_funcs.is_upper,
-            IS_STOP: lambda string: False,
-            IS_OOV: lambda string: True
        })


-def save_vocab(vocab, path):
-    path = Path(path)
-    if not path.exists():
-        path.mkdir()
-    elif not path.is_dir():
-        raise IOError("Can't save vocab to %s\nNot a directory" % path)
-    with (path / 'strings.json').open('w') as file_:
-        vocab.strings.dump(file_)
-    vocab.dump((path / 'lexemes.bin').as_posix())
-
-
-def load_vocab(path):
-    path = Path(path)
-    if not path.exists():
-        raise IOError("Cannot load vocab from %s\nDoes not exist" % path)
-    if not path.is_dir():
-        raise IOError("Cannot load vocab from %s\nNot a directory" % path)
-    return Vocab.load(path)
-
-
-def init_ner_model(vocab, features=None):
-    if features is None:
-        features = tuple(EntityRecognizer.feature_templates)
-    return EntityRecognizer(vocab, features=features)
-
-
-def save_ner_model(model, path):
-    path = Path(path)
-    if not path.exists():
-        path.mkdir()
-    if not path.is_dir():
-        raise IOError("Can't save model to %s\nNot a directory" % path)
-    model.model.dump((path / 'model').as_posix())
-    with (path / 'config.json').open('w') as file_:
-        data = json.dumps(model.cfg)
-        if not isinstance(data, unicode):
-            data = data.decode('utf8')
-        file_.write(data)
-
-
-def load_ner_model(vocab, path):
-    return EntityRecognizer.load(path, vocab)
-
-
 class Pipeline(object):
-    @classmethod
-    def load(cls, path):
-        path = Path(path)
-        if not path.exists():
-            raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
-        if not path.is_dir():
-            raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
-        vocab = load_vocab(path)
-        tokenizer = Tokenizer(vocab, {}, None, None, None)
-        ner_model = load_ner_model(vocab, path / 'ner')
-        return cls(vocab, tokenizer, ner_model)
-
    def __init__(self, vocab=None, tokenizer=None, entity=None):
        if vocab is None:
            vocab = init_vocab()
        if tokenizer is None:
            tokenizer = Tokenizer(vocab, {}, None, None, None)
        if entity is None:
-            entity = init_ner_model(self.vocab)
+            entity = NeuralEntityRecognizer(vocab)
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.entity = entity
        self.pipeline = [self.entity]

+    def begin_training(self):
+        for model in self.pipeline:
+            model.begin_training([])
+        optimizer = Adam(NumpyOps(), 0.001)
+        return optimizer
+
    def __call__(self, input_):
        doc = self.make_doc(input_)
        for process in self.pipeline:
@ -147,14 +94,16 @@ class Pipeline(object):
        gold = GoldParse(doc, entities=annotations)
        return gold

-    def update(self, input_, annot):
-        doc = self.make_doc(input_)
-        gold = self.make_gold(input_, annot)
-        for ner in gold.ner:
-            if ner not in (None, '-', 'O'):
-                action, label = ner.split('-', 1)
-                self.entity.add_label(label)
-        return self.entity.update(doc, gold)
+    def update(self, inputs, annots, sgd, losses=None, drop=0.):
+        if losses is None:
+            losses = {}
+        docs = [self.make_doc(input_) for input_ in inputs]
+        golds = [self.make_gold(input_, annot) for input_, annot in
+                 zip(inputs, annots)]
+
+        self.entity.update(docs, golds, drop=drop,
+                           sgd=sgd, losses=losses)
+        return losses

    def evaluate(self, examples):
        scorer = Scorer()
@ -164,43 +113,44 @@ class Pipeline(object):
            scorer.score(doc, gold)
        return scorer.scores

-    def average_weights(self):
-        self.entity.model.end_training()
-
-    def save(self, path):
+    def to_disk(self, path):
        path = Path(path)
        if not path.exists():
            path.mkdir()
        elif not path.is_dir():
            raise IOError("Can't save pipeline to %s\nNot a directory" % path)
-        save_vocab(self.vocab, path / 'vocab')
-        save_ner_model(self.entity, path / 'ner')
+        self.vocab.to_disk(path / 'vocab')
+        self.entity.to_disk(path / 'ner')
+
+    def from_disk(self, path):
+        path = Path(path)
+        if not path.exists():
+            raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
+        if not path.is_dir():
+            raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
+        self.vocab = self.vocab.from_disk(path / 'vocab')
+        self.entity = self.entity.from_disk(path / 'ner')


-def train(nlp, train_examples, dev_examples, ctx, nr_epoch=5):
-    next_epoch = train_examples
+def train(nlp, train_examples, dev_examples, nr_epoch=5):
+    sgd = nlp.begin_training()
    print("Iter", "Loss", "P", "R", "F")
    for i in range(nr_epoch):
-        this_epoch = next_epoch
-        next_epoch = []
-        loss = 0
-        for input_, annot in this_epoch:
-            loss += nlp.update(input_, annot)
-            if (i+1) < nr_epoch:
-                next_epoch.append((input_, annot))
-        random.shuffle(next_epoch)
+        random.shuffle(train_examples)
+        losses = {}
+        for batch in minibatch(tqdm.tqdm(train_examples, leave=False), size=8):
+            inputs, annots = zip(*batch)
+            nlp.update(list(inputs), list(annots), sgd, losses=losses)
        scores = nlp.evaluate(dev_examples)
-        report_scores(i, loss, scores)
-    nlp.average_weights()
-    scores = nlp.evaluate(dev_examples)
-    report_scores(channels, i+1, loss, scores)
+        report_scores(i+1, losses['ner'], scores)


 def report_scores(i, loss, scores):
    precision = '%.2f' % scores['ents_p']
    recall = '%.2f' % scores['ents_r']
    f_measure = '%.2f' % scores['ents_f']
-    print('%d %s %s %s' % (int(loss), precision, recall, f_measure))
+    print('Epoch %d: %d %s %s %s' % (
+        i, int(loss), precision, recall, f_measure))


 def read_examples(path):
@ -208,7 +158,8 @@ def read_examples(path):
    with path.open() as file_:
        sents = file_.read().strip().split('\n\n')
        for sent in sents:
-            if not sent.strip():
+            sent = sent.strip()
+            if not sent:
                continue
            tokens = sent.split('\n')
            while tokens and tokens[0].startswith('#'):
@ -217,28 +168,39 @@ def read_examples(path):
            iob = []
            for token in tokens:
                if token.strip():
-                    pieces = token.split()
+                    pieces = token.split('\t')
                    words.append(pieces[1])
                    iob.append(pieces[2])
            yield words, iob_to_biluo(iob)


+def get_labels(examples):
+    labels = set()
+    for words, tags in examples:
+        for tag in tags:
+            if '-' in tag:
+                labels.add(tag.split('-')[1])
+    return sorted(labels)
+
+
@plac.annotations(
    model_dir=("Path to save the model", "positional", None, Path),
    train_loc=("Path to your training data", "positional", None, Path),
    dev_loc=("Path to your development data", "positional", None, Path),
 )
-def main(model_dir=Path('/home/matt/repos/spaCy/spacy/data/de-1.0.0'),
-        train_loc=None, dev_loc=None, nr_epoch=30):
-    
-    train_examples = read_examples(train_loc)
+def main(model_dir, train_loc, dev_loc, nr_epoch=30):
+    print(model_dir, train_loc, dev_loc)
+    train_examples = list(read_examples(train_loc))
    dev_examples = read_examples(dev_loc)
-    nlp = Pipeline.load(model_dir)
+    nlp = Pipeline()
+    for label in get_labels(train_examples):
+        nlp.entity.add_label(label)
+        print("Add label", label)

-    train(nlp, train_examples, list(dev_examples), ctx, nr_epoch)
+    train(nlp, train_examples, list(dev_examples), nr_epoch)

-    nlp.save(model_dir)
+    nlp.to_disk(model_dir)


 if __name__ == '__main__':
-    main()
+    plac.call(main)
--- a/examples/training/train_new_entity_type.py
+++ b/examples/training/train_new_entity_type.py
@ -25,7 +25,7 @@ For more details, see the documentation:
 * Saving and loading models: https://spacy.io/docs/usage/saving-loading

 Developed for: spaCy 1.7.6
-Last tested for: spaCy 1.7.6
+Last updated for: spaCy 2.0.0a13
 """
 from __future__ import unicode_literals, print_function

@ -34,55 +34,40 @@ from pathlib import Path
 import random

 import spacy
-from spacy.gold import GoldParse
-from spacy.tagger import Tagger
+from spacy.gold import GoldParse, minibatch
+from spacy.pipeline import NeuralEntityRecognizer
+from spacy.pipeline import TokenVectorEncoder


+def get_gold_parses(tokenizer, train_data):
+    '''Shuffle train_data in place and yield (doc, gold) pairs.'''
+    random.shuffle(train_data)
+    for raw_text, entity_offsets in train_data:
+        doc = tokenizer(raw_text)
+        gold = GoldParse(doc, entities=entity_offsets)
+        yield doc, gold
+
+ 
 def train_ner(nlp, train_data, output_dir):
-    # Add new words to vocab
-    for raw_text, _ in train_data:
-        doc = nlp.make_doc(raw_text)
-        for word in doc:
-            _ = nlp.vocab[word.orth]
    random.seed(0)
-    # You may need to change the learning rate. It's generally difficult to
-    # guess what rate you should set, especially when you have limited data.
-    nlp.entity.model.learn_rate = 0.001
-    for itn in range(1000):
-        random.shuffle(train_data)
-        loss = 0.
-        for raw_text, entity_offsets in train_data:
-            gold = GoldParse(doc, entities=entity_offsets)
-            # By default, the GoldParse class assumes that the entities
-            # described by offset are complete, and all other words should
-            # have the tag 'O'. You can tell it to make no assumptions
-            # about the tag of a word by giving it the tag '-'.
-            # However, this allows a trivial solution to the current
-            # learning problem: if words are either 'any tag' or 'ANIMAL',
-            # the model can learn that all words can be tagged 'ANIMAL'.
-            #for i in range(len(gold.ner)):
-                #if not gold.ner[i].endswith('ANIMAL'):
-                #    gold.ner[i] = '-'
-            doc = nlp.make_doc(raw_text)
-            nlp.tagger(doc)
-            # As of 1.9, spaCy's parser now lets you supply a dropout probability
-            # This might help the model generalize better from only a few
-            # examples.
-            loss += nlp.entity.update(doc, gold, drop=0.9)
-        if loss == 0:
-            break
-    # This step averages the model's weights. This may or may not be good for
-    # your situation --- it's empirical.
-    nlp.end_training()
-    if output_dir:
-        if not output_dir.exists():
-            output_dir.mkdir()
-        nlp.save_to_directory(output_dir)
+    optimizer = nlp.begin_training(lambda: [])
+    nlp.meta['name'] = 'en_ent_animal'
+    for itn in range(50):
+        losses = {}
+        for batch in minibatch(get_gold_parses(nlp.make_doc, train_data), size=3):
+            docs, golds = zip(*batch)
+            nlp.update(docs, golds, losses=losses, sgd=optimizer, drop=0.35)
+        print(losses)
+    if not output_dir:
+        return
+    elif not output_dir.exists():
+        output_dir.mkdir()
+    nlp.to_disk(output_dir)


 def main(model_name, output_directory=None):
-    print("Loading initial model", model_name)
-    nlp = spacy.load(model_name)
+    print("Creating initial model", model_name)
+    nlp = spacy.blank(model_name)
    if output_directory is not None:
        output_directory = Path(output_directory)

@ -91,6 +76,11 @@ def main(model_name, output_directory=None):
            "Horses are too tall and they pretend to care about your feelings",
            [(0, 6, 'ANIMAL')],
        ),
+        (
+            "Do they bite?", 
+            [],
+        ),
+ 
        (
            "horses are too tall and they pretend to care about your feelings",
            [(0, 6, 'ANIMAL')]
@ -109,18 +99,21 @@ def main(model_name, output_directory=None):
        )

    ]
-    nlp.entity.add_label('ANIMAL')
+    nlp.add_pipe(TokenVectorEncoder(nlp.vocab))
+    ner = NeuralEntityRecognizer(nlp.vocab)
+    ner.add_label('ANIMAL')
+    nlp.add_pipe(ner)
    train_ner(nlp, train_data, output_directory)

    # Test that the entity is recognized
-    doc = nlp('Do you like horses?')
+    text = 'Do you like horses?'
    print("Ents in 'Do you like horses?':")
+    doc = nlp(text)
    for ent in doc.ents:
        print(ent.label_, ent.text)
    if output_directory:
        print("Loading from", output_directory)
-        nlp2 = spacy.load('en', path=output_directory)
-        nlp2.entity.add_label('ANIMAL')
+        nlp2 = spacy.load(output_directory)
        doc2 = nlp2('Do you like horses?')
        for ent in doc2.ents:
            print(ent.label_, ent.text)
--- a/examples/training/train_textcat.py
+++ b/examples/training/train_textcat.py
@ -0,0 +1,121 @@
+'''Train a multi-label convolutional neural network text classifier,
+using the spacy.pipeline.TextCategorizer component. The model is then added
+to spacy.pipeline, and predictions are available at `doc.cats`.
+'''
+from __future__ import unicode_literals
+import plac
+import random
+import tqdm
+
+from thinc.neural.optimizers import Adam
+from thinc.neural.ops import NumpyOps
+import thinc.extra.datasets
+
+import spacy.lang.en
+from spacy.gold import GoldParse, minibatch
+from spacy.util import compounding
+from spacy.pipeline import TextCategorizer
+
+# TODO: Remove this once we're not supporting models trained with thinc <6.9.0
+import thinc.neural._classes.layernorm
+thinc.neural._classes.layernorm.set_compat_six_eight(False)
+
+
+def train_textcat(tokenizer, textcat,
+                  train_texts, train_cats, dev_texts, dev_cats,
+                  n_iter=20):
+    '''
+    Train the TextCategorizer without associated pipeline.
+    '''
+    textcat.begin_training()
+    optimizer = Adam(NumpyOps(), 0.001)
+    train_docs = [tokenizer(text) for text in train_texts]
+    train_gold = [GoldParse(doc, cats=cats) for doc, cats in
+                  zip(train_docs, train_cats)]
+    train_data = list(zip(train_docs, train_gold))
+    batch_sizes = compounding(4., 128., 1.001)
+    for i in range(n_iter):
+        losses = {}
+        # Progress bar and minibatching
+        batches = minibatch(tqdm.tqdm(train_data, leave=False), size=batch_sizes)
+        for batch in batches:
+            docs, golds = zip(*batch)
+            textcat.update(docs, golds, sgd=optimizer, drop=0.2,
+                losses=losses)
+        with textcat.model.use_params(optimizer.averages):
+            scores = evaluate(tokenizer, textcat, dev_texts, dev_cats)
+        yield losses['textcat'], scores
+
+
+def evaluate(tokenizer, textcat, texts, cats):
+    docs = (tokenizer(text) for text in texts)
+    tp = 1e-8 # True positives
+    fp = 1e-8 # False positives
+    fn = 1e-8 # False negatives
+    tn = 1e-8 # True negatives
+    for i, doc in enumerate(textcat.pipe(docs)):
+        gold = cats[i]
+        for label, score in doc.cats.items():
+            if label not in gold:
+                continue
+            if score >= 0.5 and gold[label] >= 0.5:
+                tp += 1.
+            elif score >= 0.5 and gold[label] < 0.5:
+                fp += 1.
+            elif score < 0.5 and gold[label] < 0.5:
+                tn += 1
+            elif score < 0.5 and gold[label] >= 0.5:
+                fn += 1
+    precis = tp / (tp + fp)
+    recall = tp / (tp + fn)
+    fscore = 2 * (precis * recall) / (precis + recall)
+    return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore}
+
+
+def load_data(limit=0):
+    # Partition off part of the train data --- avoid running experiments
+    # against test.
+    train_data, _ = thinc.extra.datasets.imdb()
+
+    random.shuffle(train_data)
+    train_data = train_data[-limit:]
+
+    texts, labels = zip(*train_data)
+    cats = [{'POSITIVE': bool(y)} for y in labels]
+
+    split = int(len(train_data) * 0.8)
+
+    train_texts = texts[:split]
+    train_cats = cats[:split]
+    dev_texts = texts[split:]
+    dev_cats = cats[split:]
+    return (train_texts, train_cats), (dev_texts, dev_cats)
+
+
+def main(model_loc=None):
+    # Train a 'POSITIVE' text classifier on IMDB data, print per-iteration
+    # loss and dev scores, and, if `model_loc` is given, save the pipeline
+    # there, reload it and print the categories for a sample sentence.
+    nlp = spacy.lang.en.English()
+    tokenizer = nlp.tokenizer
+    textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE'])
+
+    print("Load IMDB data")
+    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=2000)
+
+    print("Itn.\tLoss\tP\tR\tF")
+    progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}'
+
+    # train_textcat is a generator: each step runs one training iteration
+    # and yields that iteration's loss and dev scores.
+    for i, (loss, scores) in enumerate(train_textcat(tokenizer, textcat,
+                                       train_texts, train_cats,
+                                       dev_texts, dev_cats, n_iter=20)):
+        print(progress.format(i=i, loss=loss, **scores))
+    # How to save, load and use
+    # The classifier was trained standalone, so it is only added to the
+    # pipeline here, before saving.
+    nlp.pipeline.append(textcat)
+    if model_loc is not None:
+        nlp.to_disk(model_loc)
+
+        nlp = spacy.load(model_loc)
+        doc = nlp(u'This movie sucked!')
+        print(doc.cats)
+
+
+if __name__ == '__main__':
+    plac.call(main)
--- a/examples/vectors_fast_text.py
+++ b/examples/vectors_fast_text.py
@ -0,0 +1,30 @@
+'''Load vectors for a language trained using FastText
+
+https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
+'''
+from __future__ import unicode_literals
+import plac
+import numpy
+
+import spacy.language
+
+
+def main(vectors_loc):
+    nlp = spacy.language.Language()
+
+    with open(vectors_loc, 'rb') as file_:
+        header = file_.readline()
+        nr_row, nr_dim = header.split()
+        nlp.vocab.clear_vectors(int(nr_dim))
+        for line in file_:
+            line = line.decode('utf8')
+            pieces = line.split() 
+            word = pieces[0]
+            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
+            nlp.vocab.set_vector(word, vector)
+    doc = nlp(u'class colspan')
+    print(doc[0].similarity(doc[1]))
+
+
+if __name__ == '__main__':
+    plac.call(main)
--- a/fabfile.py
+++ b/fabfile.py
@ -14,6 +14,7 @@ VENV_DIR = path.join(PWD, ENV)
 def env(lang='python2.7'):
    if path.exists(VENV_DIR):
        local('rm -rf {env}'.format(env=VENV_DIR))
+    local('pip install virtualenv')
    local('python -m virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR))


@ -32,6 +33,10 @@ def make():
            local('pip install -r requirements.txt')
            local('python setup.py build_ext --inplace')

+def sdist():
+    # Run `python setup.py sdist` from this file's directory, inside the
+    # virtualenv located at VENV_DIR.
+    with virtualenv(VENV_DIR):
+        with lcd(path.dirname(__file__)):
+            local('python setup.py sdist')

 def clean():
    with lcd(path.dirname(__file__)):
--- a/requirements.txt
+++ b/requirements.txt
@ -1,9 +1,9 @@
-cython<0.24
+cython>=0.24,<0.27.0
 pathlib
 numpy>=1.7
 cymem>=1.30,<1.32
 preshed>=1.0.0,<2.0.0
-thinc>=6.7.3,<6.8.0
+thinc>=6.9.0,<6.10.0
 murmurhash>=0.28,<0.29
 plac<1.0.0,>=0.9.6
 six
@ -13,7 +13,7 @@ requests>=2.13.0,<3.0.0
 regex==2017.4.5
 ftfy>=4.4.2,<5.0.0
 pytest>=3.0.6,<4.0.0
-pip>=9.0.0,<10.0.0
 mock>=2.0.0,<3.0.0
 msgpack-python
 msgpack-numpy
+html5lib==1.0b8
--- a/setup.py
+++ b/setup.py
@ -28,7 +28,9 @@ MOD_NAMES = [
    'spacy.pipeline',
    'spacy.syntax.stateclass',
    'spacy.syntax._state',
+    'spacy.syntax._beam_utils',
    'spacy.tokenizer',
+    'spacy._cfile',
    'spacy.syntax.parser',
    'spacy.syntax.nn_parser',
    'spacy.syntax.beam_parser',
@ -51,7 +53,8 @@ MOD_NAMES = [
 COMPILE_OPTIONS =  {
    'msvc': ['/Ox', '/EHsc'],
    'mingw32' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function'],
-    'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function']
+    'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function',
+               '-march=native']
 }


@ -187,14 +190,14 @@ def setup_package():
            url=about['__uri__'],
            license=about['__license__'],
            ext_modules=ext_modules,
+            scripts=['bin/spacy'],
            install_requires=[
                'numpy>=1.7',
                'murmurhash>=0.28,<0.29',
                'cymem>=1.30,<1.32',
                'preshed>=1.0.0,<2.0.0',
-                'thinc>=6.7.3,<6.8.0',
+                'thinc>=6.9.0,<6.10.0',
                'plac<1.0.0,>=0.9.6',
-                'pip>=9.0.0,<10.0.0',
                'six',
                'pathlib',
                'ujson>=1.35',
--- a/spacy/init.py
+++ b/spacy/init.py
@ -4,14 +4,21 @@ from __future__ import unicode_literals
 from .cli.info import info as cli_info
 from .glossary import explain
 from .deprecated import resolve_load_name
+#from .about import __version__
 from .about import __version__
 from . import util


 def load(name, **overrides):
+    from .deprecated import resolve_load_name
    name = resolve_load_name(name, **overrides)
    return util.load_model(name, **overrides)


+def blank(name, **kwargs):
+    LangClass = util.get_lang_class(name)
+    return LangClass(**kwargs)
+
+
 def info(model=None, markdown=False):
    return cli_info(None, model, markdown)
--- a/spacy/main.py
+++ b/spacy/main.py
@ -3,15 +3,25 @@ from __future__ import print_function
 # NB! This breaks in plac on Python 2!!
 #from __future__ import unicode_literals

-
 if __name__ == '__main__':
    import plac
    import sys
-    from spacy.cli import download, link, info, package, train, convert
+    from spacy.cli import download, link, info, package, train, convert, model
+    from spacy.cli import profile, evaluate, validate
    from spacy.util import prints

-    commands = {'download': download, 'link': link, 'info': info, 'train': train,
-                'convert': convert, 'package': package}
+    commands = {
+        'download': download,
+        'link': link,
+        'info': info,
+        'train': train,
+        'evaluate': evaluate,
+        'convert': convert,
+        'package': package,
+        'model': model,
+        'profile': profile,
+        'validate': validate
+    }
    if len(sys.argv) == 1:
        prints(', '.join(commands), title="Available commands", exits=1)
    command = sys.argv.pop(1)
@ -19,5 +29,7 @@ if __name__ == '__main__':
    if command in commands:
        plac.call(commands[command])
    else:
-        prints("Available: %s" % ', '.join(commands),
-               title="Unknown command: %s" % command, exits=1)
+        prints(
+            "Available: %s" % ', '.join(commands),
+            title="Unknown command: %s" % command,
+            exits=1)
--- a/spacy/_cfile.pxd
+++ b/spacy/_cfile.pxd
@ -0,0 +1,26 @@
+from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
+from cymem.cymem cimport Pool
+
+cdef class CFile:
+    cdef FILE* fp
+    cdef bint is_open
+    cdef Pool mem
+    cdef int size # For compatibility with subclass
+    cdef int _capacity # For compatibility with subclass
+
+    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
+
+    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
+
+    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *
+
+
+
+cdef class StringCFile(CFile):
+    cdef unsigned char* data
+ 
+    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
+
+    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
+    
+    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *
--- a/spacy/_cfile.pyx
+++ b/spacy/_cfile.pyx
@ -0,0 +1,88 @@
+from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
+from libc.string cimport memcpy
+
+
+cdef class CFile:
+    '''Thin wrapper around a C `FILE*` opened with `fopen`.
+
+    loc: Path to open; a unicode string, bytes, or an object with `as_posix()`.
+    mode: The `fopen` mode string (unicode or bytes).
+    on_open_error: Optional callable invoked, instead of raising IOError,
+        when the file cannot be opened.
+    '''
+    def __init__(self, loc, mode, on_open_error=None):
+        if isinstance(mode, unicode):
+            mode_str = mode.encode('ascii')
+        else:
+            mode_str = mode
+        if hasattr(loc, 'as_posix'):
+            loc = loc.as_posix()
+        self.mem = Pool()
+        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
+        self.fp = fopen(<char*>bytes_loc, mode_str)
+        # Only mark the file as open when fopen succeeded, so that neither
+        # close() nor __dealloc__ ever passes a NULL pointer to fclose.
+        self.is_open = self.fp != NULL
+        if self.fp == NULL:
+            if on_open_error is not None:
+                on_open_error()
+            else:
+                raise IOError("Could not open binary file %s" % bytes_loc)
+
+    def __dealloc__(self):
+        if self.is_open:
+            fclose(self.fp)
+
+    def close(self):
+        '''Close the underlying file. Calling it more than once is a no-op.'''
+        if self.is_open:
+            fclose(self.fp)
+            self.fp = NULL
+            self.is_open = False
+
+    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
+        '''Read `number` items of `elem_size` bytes into `dest`; raise IOError
+        on a short read.'''
+        st = fread(dest, elem_size, number, self.fp)
+        if st != number:
+            raise IOError
+
+    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1:
+        '''Write `number` items of `elem_size` bytes from `src`; raise IOError
+        on a short write.'''
+        st = fwrite(src, elem_size, number, self.fp)
+        if st != number:
+            raise IOError
+
+    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
+        '''Allocate a buffer from `mem` and fill it from the file.'''
+        cdef void* dest = mem.alloc(number, elem_size)
+        self.read_into(dest, number, elem_size)
+        return dest
+
+    def write_unicode(self, unicode value):
+        '''Write `value` to the file, encoded as UTF-8.'''
+        cdef bytes py_bytes = value.encode('utf8')
+        cdef char* chars = <char*>py_bytes
+        # The class has no `write` method; route through write_from.
+        self.write_from(chars, len(py_bytes), sizeof(char))
+
+
+cdef class StringCFile:
+    '''In-memory counterpart of CFile, backed by a growable byte buffer
+    allocated from a cymem Pool.
+
+    mode: Mode string; the object counts as open only if it contains 'w'.
+    data: Initial buffer contents.
+    on_open_error: Accepted for signature compatibility; not used.
+    '''
+    def __init__(self, mode, bytes data=b'', on_open_error=None):
+        self.mem = Pool()
+        self.is_open = 'w' in mode
+        self._capacity = max(len(data), 8)
+        self.size = len(data)
+        self.data = <unsigned char*>self.mem.alloc(1, self._capacity)
+        for i in range(len(data)):
+            self.data[i] = data[i]
+
+    def close(self):
+        self.is_open = False
+
+    def string_data(self):
+        # NOTE(review): this rewinds `self.data` by `self.size`, which only
+        # points at the start of the buffer if read_into has advanced the
+        # pointer by exactly `size` bytes. After writes alone the pointer
+        # has not moved -- confirm the intended usage.
+        return (self.data-self.size)[:self.size]
+
+    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
+        # Copies from the current position and advances the data pointer
+        # itself; no bounds check is made against `size`.
+        memcpy(dest, self.data, elem_size * number)
+        self.data += elem_size * number
+
+    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1:
+        # Parameter order matches the declaration in _cfile.pxd. Only the
+        # product of the two sizes is used, so results are unchanged.
+        write_size = number * elem_size
+        if (self.size + write_size) >= self._capacity:
+            # Grow to twice the required size to amortise reallocations.
+            self._capacity = (self.size + write_size) * 2
+            self.data = <unsigned char*>self.mem.realloc(self.data, self._capacity)
+        memcpy(&self.data[self.size], src, elem_size * number)
+        self.size += write_size
+
+    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
+        cdef void* dest = mem.alloc(number, elem_size)
+        self.read_into(dest, number, elem_size)
+        return dest
+
+    def write_unicode(self, unicode value):
+        '''Append `value` to the buffer, encoded as UTF-8.'''
+        cdef bytes py_bytes = value.encode('utf8')
+        cdef char* chars = <char*>py_bytes
+        # The class has no `write` method; route through write_from.
+        self.write_from(chars, len(py_bytes), sizeof(char))
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@ -1,24 +1,106 @@
 import ujson
-from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
-from thinc.neural import Model, Maxout, Softmax, Affine
-from thinc.neural._classes.hash_embed import HashEmbed
-from thinc.neural.ops import NumpyOps, CupyOps
+from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
+from thinc.i2v import HashEmbed, StaticVectors
+from thinc.t2t import ExtractWindow, ParametricAttention
+from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool
+from thinc.misc import Residual
+from thinc.misc import BatchNorm as BN
+from thinc.misc import LayerNorm as LN
+
+from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
+from thinc.api import FeatureExtracter, with_getitem
+from thinc.api import uniqued, wrap, flatten_add_lengths, noop
+
+from thinc.linear.linear import LinearModel
+from thinc.neural.ops import NumpyOps, CupyOps
+from thinc.neural.util import get_array_module
+
+import random
+import cytoolz

-from thinc.neural._classes.convolution import ExtractWindow
-from thinc.neural._classes.static_vectors import StaticVectors
-from thinc.neural._classes.batchnorm import BatchNorm
-from thinc.neural._classes.resnet import Residual
-from thinc.neural import ReLu
 from thinc import describe
 from thinc.describe import Dimension, Synapses, Biases, Gradient
 from thinc.neural._classes.affine import _set_dimensions_if_needed
+import thinc.extra.load_nlp

-from .attrs import ID, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP
+from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER
 from .tokens.doc import Doc
+from . import util

 import numpy
 import io

+# TODO: Unset this once we no longer want to support previous models.
+import thinc.neural._classes.layernorm
+thinc.neural._classes.layernorm.set_compat_six_eight(False)
+
+VECTORS_KEY = 'spacy_pretrained_vectors'
+
+@layerize
+def _flatten_add_lengths(seqs, pad=0, drop=0.):
+    ops = Model.ops
+    lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
+    def finish_update(d_X, sgd=None):
+        return ops.unflatten(d_X, lengths, pad=pad)
+    X = ops.flatten(seqs, pad=pad)
+    return (X, lengths), finish_update
+
+
+@layerize
+def _logistic(X, drop=0.):
+    xp = get_array_module(X)
+    if not isinstance(X, xp.ndarray):
+        X = xp.asarray(X)
+    # Clip to range (-10, 10)
+    X = xp.minimum(X, 10., X)
+    X = xp.maximum(X, -10., X)
+    Y = 1. / (1. + xp.exp(-X))
+    def logistic_bwd(dY, sgd=None):
+        dX = dY * (Y * (1-Y))
+        return dX
+    return Y, logistic_bwd
+
+
+@layerize
+def add_tuples(X, drop=0.):
+    """Give inputs of sequence pairs, where each sequence is (vals, length),
+    sum the values, returning a single sequence.
+
+    If input is:
+    ((vals1, length), (vals2, length)
+    Output is:
+    (vals1+vals2, length)
+
+    vals are a single tensor for the whole batch.
+    """
+    (vals1, length1), (vals2, length2) = X
+    assert length1 == length2
+
+    def add_tuples_bwd(dY, sgd=None):
+        return (dY, dY)
+
+    return (vals1+vals2, length), add_tuples_bwd
+
+
+def _zero_init(model):
+    # Make `model` start from all-zero weights: register a data hook that
+    # zeroes `W`, and also zero `W` right away if it is already allocated.
+    # Returns the same model object.
+    def _zero_init_impl(self, X, y):
+        self.W.fill(0)
+    model.on_data_hooks.append(_zero_init_impl)
+    if model.W is not None:
+        model.W.fill(0.)
+    return model
+
+
+@layerize
+def _preprocess_doc(docs, drop=0.):
+    keys = [doc.to_array([LOWER]) for doc in docs]
+    keys = [a[:, 0] for a in keys]
+    ops = Model.ops
+    lengths = ops.asarray([arr.shape[0] for arr in keys])
+    keys = ops.xp.concatenate(keys)
+    vals = ops.allocate(keys.shape[0]) + 1
+    return (keys, vals, lengths), None
+

 def _init_for_precomputed(W, ops):
    if (W**2).sum() != 0.:
@ -27,6 +109,7 @@ def _init_for_precomputed(W, ops):
    ops.xavier_uniform_init(reshaped)
    W[:] = reshaped.reshape(W.shape)

+
@describe.on_data(_set_dimensions_if_needed)
@describe.attributes(
    nI=Dimension("Input size"),
@ -130,34 +213,163 @@ class PrecomputableMaxouts(Model):
            return dXf
        return Yfp, backward

-def Tok2Vec(width, embed_size, preprocess=None):
-    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
-    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
-        norm = get_col(cols.index(NORM))   >> HashEmbed(width, embed_size, name='embed_lower')
-        prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix')
-        suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix')
-        shape = get_col(cols.index(SHAPE))   >> HashEmbed(width, embed_size//2, name='embed_shape')
+# Thinc's Embed class is a bit broken atm, so drop this here.
+from thinc import describe
+from thinc.neural._classes.embed import _uniform_init

-        embed = (norm | prefix | suffix | shape )
-        tok2vec = (
-            with_flatten(
-                asarray(Model.ops, dtype='uint64')
-                >> embed
-                >> Maxout(width, width*4, pieces=3)
-                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
-                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
-                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
-                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)),
-            pad=4)
+
+@describe.attributes(
+    nV=describe.Dimension("Number of vectors"),
+    nO=describe.Dimension("Size of output"),
+    vectors=describe.Weights("Embedding table",
+        lambda obj: (obj.nV, obj.nO),
+        _uniform_init(-0.1, 0.1)
+    ),
+    d_vectors=describe.Gradient("vectors")
+)
+class Embed(Model):
+    # Embedding-table layer: maps integer ids to rows of `vectors`.
+    # For 2-d input, the ids are taken from the column given by the
+    # `column` keyword argument (default 0).
+    name = 'embed'
+
+    def __init__(self, nO, nV=None, **kwargs):
+        # One extra row is added to the requested number of vectors.
+        if nV is not None:
+            nV += 1
+        Model.__init__(self, **kwargs)
+        if 'name' in kwargs:
+            self.name = kwargs['name']
+        self.column = kwargs.get('column', 0)
+        self.nO = nO
+        self.nV = nV
+
+    def predict(self, ids):
+        # Look up the rows for `ids` and return them as a contiguous
+        # float array.
+        if ids.ndim == 2:
+            ids = ids[:, self.column]
+        return self.ops.xp.ascontiguousarray(self.vectors[ids], dtype='f')
+
+    def begin_update(self, ids, drop=0.):
+        # Same lookup as predict, plus a callback that accumulates the
+        # gradient into `d_vectors`. `drop` is not used here.
+        if ids.ndim == 2:
+            ids = ids[:, self.column]
+        vectors = self.ops.xp.ascontiguousarray(self.vectors[ids], dtype='f')
+        def backprop_embed(d_vectors, sgd=None):
+            # NOTE(review): n_vectors is assigned but never used.
+            n_vectors = d_vectors.shape[0]
+            self.ops.scatter_add(self.d_vectors, ids, d_vectors)
+            if sgd is not None:
+                sgd(self._mem.weights, self._mem.gradient, key=self.id)
+            # Ids are inputs, not activations, so no gradient flows back.
+            return None
+        return vectors, backprop_embed
+
+
+def HistoryFeatures(nr_class, hist_size=8, nr_dim=8):
+    '''Wrap a model, adding features representing action history.'''
+    if hist_size == 0:
+        return layerize(noop())
+    # One embedding table per history position, each reading its own column.
+    # NOTE(review): 'embed%d' is never formatted with `i`, so every table
+    # gets the same literal name 'embed%d'. Confirm whether per-position
+    # names were intended before changing it.
+    embed_tables = [Embed(nr_dim, nr_class, column=i, name='embed%d')
+                    for i in range(hist_size)]
+    embed = chain(concatenate(*embed_tables),
+                  LN(Maxout(hist_size*nr_dim, hist_size*nr_dim)))
+    ops = embed.ops
+    def add_history_fwd(vectors_hists, drop=0.):
+        # Input is a (vectors, history ids) pair; the embedded history is
+        # appended column-wise to the vectors.
+        vectors, hist_ids = vectors_hists
+        hist_feats, bp_hists = embed.begin_update(hist_ids, drop=drop)
+        outputs = ops.xp.hstack((vectors, hist_feats))
+
+        def add_history_bwd(d_outputs, sgd=None):
+            # Split the gradient back into the part for the vectors and
+            # the part for the history features.
+            d_vectors = d_outputs[:, :vectors.shape[1]]
+            d_hists = d_outputs[:, vectors.shape[1]:]
+            bp_hists(d_hists, sgd=sgd)
+            return embed.ops.xp.ascontiguousarray(d_vectors)
+        return outputs, add_history_bwd
+    return wrap(add_history_fwd, embed)
+
+
+def drop_layer(layer, factor=2.):
+    def drop_layer_fwd(X, drop=0.):
+        if drop <= 0.:
+            return layer.begin_update(X, drop=drop)
+        else:
+            coinflip = layer.ops.xp.random.random()
+            if (coinflip / factor) >= drop:
+                return layer.begin_update(X, drop=drop)
+            else:
+                return X, lambda dX, sgd=None: dX
+
+    model = wrap(drop_layer_fwd, layer)
+    model.predict = layer
+    return model
+
+def link_vectors_to_models(vocab):
+    vectors = vocab.vectors
+    ops = Model.ops
+    for word in vocab:
+        if word.orth in vectors.key2row:
+            word.rank = vectors.key2row[word.orth]
+        else:
+            word.rank = 0
+    data = ops.asarray(vectors.data)
+    # Set an entry here, so that vectors are accessed by StaticVectors
+    # (unideal, I know)
+    thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data
+
+def Tok2Vec(width, embed_size, **kwargs):
+    pretrained_dims = kwargs.get('pretrained_dims', 0)
+    cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
+    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
+    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
+                                 '*': reapply}):
+        norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
+        prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix')
+        suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
+        shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
+        if pretrained_dims is not None and pretrained_dims >= 1:
+            glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID))
+
+            embed = uniqued(
+                (glove | norm | prefix | suffix | shape)
+                >> LN(Maxout(width, width*5, pieces=3)), column=5)
+        else:
+            embed = uniqued(
+                (norm | prefix | suffix | shape)
+                >> LN(Maxout(width, width*4, pieces=3)), column=5)
+
+
+        convolution = Residual(
+            ExtractWindow(nW=1)
+            >> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
        )
-        if preprocess not in (False, None):
-            tok2vec = preprocess >> tok2vec
+
+        tok2vec = (
+            FeatureExtracter(cols)
+            >> with_flatten(
+                embed >> (convolution ** 4), pad=4)
+        )
+
        # Work around thinc API limitations :(. TODO: Revise in Thinc 7
        tok2vec.nO = width
        tok2vec.embed = embed
    return tok2vec


+def reapply(layer, n_times):
+    # Wrap `layer` so that the forward pass applies it `n_times` in a row,
+    # feeding each output back in as the next input.
+    def reapply_fwd(X, drop=0.):
+        backprops = []
+        for i in range(n_times):
+            Y, backprop = layer.begin_update(X, drop=drop)
+            X = Y
+            backprops.append(backprop)
+        def reapply_bwd(dY, sgd=None):
+            # Run the stored callbacks in reverse order, chaining dY
+            # through them.
+            # NOTE(review): the returned gradient is the SUM of dY after
+            # every step, not just the last one, and the first `dX = dY`
+            # aliases the array that `+=` later modifies. Confirm that
+            # this accumulation is intended.
+            dX = None
+            for backprop in reversed(backprops):
+                dY = backprop(dY, sgd=sgd)
+                if dX is None:
+                    dX = dY
+                else:
+                    dX += dY
+            return dX
+        return Y, reapply_bwd
+    return wrap(reapply_fwd, layer)
+
+
+
+
 def asarray(ops, dtype):
    def forward(X, drop=0.):
        return ops.asarray(X, dtype=dtype), None
@ -243,7 +455,8 @@ def zero_init(model):


 def doc2feats(cols=None):
-    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
+    if cols is None:
+        cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    def forward(docs, drop=0.):
        feats = []
        for doc in docs:
@ -269,6 +482,46 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):
    return vectors, backward


+def fine_tune(embedding, combine=None):
+    # Wrap `embedding` in a layer that takes (docs, tokvecs) and returns a
+    # weighted sum of the given token vectors and the embedding's output
+    # for the docs. The two weights live in `model.mix` and start at 0.5.
+    # `combine` must be None; any other value raises NotImplementedError.
+    if combine is not None:
+        raise NotImplementedError(
+            "fine_tune currently only supports addition. Set combine=None")
+    def fine_tune_fwd(docs_tokvecs, drop=0.):
+        docs, tokvecs = docs_tokvecs
+
+        # `model` is bound further down, after wrap(); it exists by the
+        # time this function is called.
+        lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i')
+
+        vecs, bp_vecs = embedding.begin_update(docs, drop=drop)
+        flat_tokvecs = embedding.ops.flatten(tokvecs)
+        flat_vecs = embedding.ops.flatten(vecs)
+        # Mix on the flattened arrays, then split back into per-doc pieces.
+        output = embedding.ops.unflatten(
+                   (model.mix[0] * flat_tokvecs + model.mix[1] * flat_vecs), lengths)
+
+        def fine_tune_bwd(d_output, sgd=None):
+            flat_grad = model.ops.flatten(d_output)
+            # NOTE(review): these sum over the full (rows x rows) product
+            # of inputs and gradients -- confirm this is the intended
+            # gradient for the mixing weights.
+            model.d_mix[0] += flat_tokvecs.dot(flat_grad.T).sum()
+            model.d_mix[1] += flat_vecs.dot(flat_grad.T).sum()
+
+            # Each branch receives the gradient scaled by its mix weight.
+            bp_vecs([d_o * model.mix[1] for d_o in d_output], sgd=sgd)
+            if sgd is not None:
+                sgd(model._mem.weights, model._mem.gradient, key=model.id)
+            return [d_o * model.mix[0] for d_o in d_output]
+        return output, fine_tune_bwd
+
+    def fine_tune_predict(docs_tokvecs):
+        # Inference path: the same weighted sum, computed per doc.
+        docs, tokvecs = docs_tokvecs
+        vecs = embedding(docs)
+        return [model.mix[0]*tv+model.mix[1]*v
+                for tv, v in zip(tokvecs, vecs)]
+
+    model = wrap(fine_tune_fwd, embedding)
+    # Allocate the two mixing weights and their gradient in the model's
+    # parameter memory.
+    model.mix = model._mem.add((model.id, 'mix'), (2,))
+    model.mix.fill(0.5)
+    model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix'))
+    model.predict = fine_tune_predict
+    return model
+
+
@layerize
 def flatten(seqs, drop=0.):
    if isinstance(seqs[0], numpy.ndarray):
@ -282,3 +535,211 @@ def flatten(seqs, drop=0.):
        return ops.unflatten(d_X, lengths)
    X = ops.xp.vstack(seqs)
    return X, finish_update
+
+
+@layerize
+def logistic(X, drop=0.):
+    xp = get_array_module(X)
+    if not isinstance(X, xp.ndarray):
+        X = xp.asarray(X)
+    # Clip to range (-10, 10)
+    X = xp.minimum(X, 10., X)
+    X = xp.maximum(X, -10., X)
+    Y = 1. / (1. + xp.exp(-X))
+    def logistic_bwd(dY, sgd=None):
+        dX = dY * (Y * (1-Y))
+        return dX
+    return Y, logistic_bwd
+
+
+def zero_init(model):
+    # Register a data hook on `model` that fills its weights `W` with
+    # zeros, then return the same model.
+    def _zero_init_impl(self, X, y):
+        self.W.fill(0)
+    model.on_data_hooks.append(_zero_init_impl)
+    return model
+
+@layerize
+def preprocess_doc(docs, drop=0.):
+    keys = [doc.to_array([LOWER]) for doc in docs]
+    keys = [a[:, 0] for a in keys]
+    ops = Model.ops
+    lengths = ops.asarray([arr.shape[0] for arr in keys])
+    keys = ops.xp.concatenate(keys)
+    vals = ops.allocate(keys.shape[0]) + 1
+    return (keys, vals, lengths), None
+
+def getitem(i):
+    # Build a layer that returns element `i` of its input. It has no
+    # backward pass (the callback slot is None).
+    def getitem_fwd(X, drop=0.):
+        return X[i], None
+    return layerize(getitem_fwd)
+
+def build_tagger_model(nr_class, **cfg):
+    embed_size = util.env_opt('embed_size', 7000)
+    if 'token_vector_width' in cfg:
+        token_vector_width = cfg['token_vector_width']
+    else:
+        token_vector_width = util.env_opt('token_vector_width', 128)
+    pretrained_dims = cfg.get('pretrained_dims', 0)
+    with Model.define_operators({'>>': chain, '+': add}):
+        if 'tok2vec' in cfg:
+            tok2vec = cfg['tok2vec']
+        else:
+            tok2vec = Tok2Vec(token_vector_width, embed_size,
+                              pretrained_dims=pretrained_dims)
+        model = (
+            tok2vec
+            >> with_flatten(Softmax(nr_class, token_vector_width))
+        )
+    model.nI = None
+    model.tok2vec = tok2vec
+    return model
+
+
+@layerize
+def SpacyVectors(docs, drop=0.):
+    xp = get_array_module(docs[0].vocab.vectors.data)
+    width = docs[0].vocab.vectors.data.shape[1]
+    batch = []
+    for doc in docs:
+        indices = numpy.zeros((len(doc),), dtype='i')
+        for i, word in enumerate(doc):
+            if word.orth in doc.vocab.vectors.key2row:
+                indices[i] = doc.vocab.vectors.key2row[word.orth]
+            else:
+                indices[i] = 0
+        vectors = doc.vocab.vectors.data[indices]
+        batch.append(vectors)
+    return batch, None
+
+
+def foreach(layer, drop_factor=1.0):
+    '''Map a layer across elements in a list'''
+    def foreach_fwd(Xs, drop=0.):
+        drop *= drop_factor
+        ys = []
+        backprops = []
+        for X in Xs:
+            y, bp_y = layer.begin_update(X, drop=drop)
+            ys.append(y)
+            backprops.append(bp_y)
+        def foreach_bwd(d_ys, sgd=None):
+            d_Xs = []
+            for d_y, bp_y in zip(d_ys, backprops):
+                if bp_y is not None and bp_y is not None:
+                    d_Xs.append(d_y, sgd=sgd)
+                else:
+                    d_Xs.append(None)
+            return d_Xs
+        return ys, foreach_bwd
+    model = wrap(foreach_fwd, layer)
+    return model
+
+
+def build_text_classifier(nr_class, width=64, **cfg):
+    # Build the text classification network for `nr_class` labels.
+    # Recognised `cfg` keys: 'nr_vector' (default 5000), 'pretrained_dims'
+    # (default 0) and 'low_data'. With 'low_data' set, a smaller model based
+    # only on the vocab's vectors is returned. Otherwise the result combines
+    # a linear model with a CNN model.
+    nr_vector = cfg.get('nr_vector', 5000)
+    pretrained_dims = cfg.get('pretrained_dims', 0)
+    with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
+                                 '**': clone}):
+        if cfg.get('low_data'):
+            # NOTE(review): this branch returns before `model.nO` and
+            # `model.lsuv` are set at the bottom of the function -- confirm
+            # that callers do not rely on them for low_data models.
+            model = (
+                SpacyVectors
+                >> flatten_add_lengths
+                >> with_getitem(0,
+                    Affine(width, pretrained_dims)
+                )
+                >> ParametricAttention(width)
+                >> Pooling(sum_pool)
+                >> Residual(ReLu(width, width)) ** 2
+                >> zero_init(Affine(nr_class, width, drop_factor=0.0))
+                >> logistic
+            )
+            return model
+
+
+        # Column indices refer to the attribute list handed to
+        # FeatureExtracter below: 1=LOWER, 2=PREFIX, 3=SUFFIX, 4=SHAPE.
+        lower = HashEmbed(width, nr_vector, column=1)
+        prefix = HashEmbed(width//2, nr_vector, column=2)
+        suffix = HashEmbed(width//2, nr_vector, column=3)
+        shape = HashEmbed(width//2, nr_vector, column=4)
+
+        trained_vectors = (
+            FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
+            >> with_flatten(
+                uniqued(
+                    (lower | prefix | suffix | shape)
+                    >> LN(Maxout(width, width+(width//2)*3)),
+                    column=0
+                )
+            )
+        )
+
+        if pretrained_dims:
+            static_vectors = (
+                SpacyVectors
+                >> with_flatten(Affine(width, pretrained_dims))
+            )
+            # TODO Make concatenate support lists
+            vectors = concatenate_lists(trained_vectors, static_vectors)
+            vectors_width = width*2
+        else:
+            vectors = trained_vectors
+            vectors_width = width
+            static_vectors = None
+        cnn_model = (
+            vectors
+            >> with_flatten(
+                LN(Maxout(width, vectors_width))
+                >> Residual(
+                    (ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
+                ) ** 2, pad=2
+            )
+            >> flatten_add_lengths
+            >> ParametricAttention(width)
+            >> Pooling(sum_pool)
+            >> Residual(zero_init(Maxout(width, width)))
+            >> zero_init(Affine(nr_class, width, drop_factor=0.0))
+        )
+
+        linear_model = (
+            _preprocess_doc
+            >> LinearModel(nr_class, drop_factor=0.)
+        )
+
+        # Both sub-models emit nr_class values; concatenate them and let a
+        # final affine layer combine them before the logistic.
+        model = (
+            (linear_model | cnn_model)
+            >> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
+            >> logistic
+        )
+    model.nO = nr_class
+    model.lsuv = False
+    return model
+
+@layerize
+def flatten(seqs, drop=0.):
+    ops = Model.ops
+    lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
+    def finish_update(d_X, sgd=None):
+        return ops.unflatten(d_X, lengths, pad=0)
+    X = ops.flatten(seqs, pad=0)
+    return X, finish_update
+
+
+def concatenate_lists(*layers, **kwargs): # pragma: no cover
+    '''Compose two or more models `f`, `g`, etc, such that their outputs are
+    concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
+    '''
+    if not layers:
+        return noop()
+    drop_factor = kwargs.get('drop_factor', 1.0)
+    ops = layers[0].ops
+    layers = [chain(layer, flatten) for layer in layers]
+    concat = concatenate(*layers)
+    def concatenate_lists_fwd(Xs, drop=0.):
+        drop *= drop_factor
+        lengths = ops.asarray([len(X) for X in Xs], dtype='i')
+        flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
+        ys = ops.unflatten(flat_y, lengths)
+        def concatenate_lists_bwd(d_ys, sgd=None):
+            return bp_flat_y(ops.flatten(d_ys), sgd=sgd)
+        return ys, concatenate_lists_bwd
+    model = wrap(concatenate_lists_fwd, concat)
+    return model
--- a/spacy/about.py
+++ b/spacy/about.py
@ -3,14 +3,15 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

 __title__ = 'spacy-nightly'
-__version__ = '2.0.0a1'
+__version__ = '2.0.0a17'
 __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
 __author__ = 'Explosion AI'
 __email__ = 'contact@explosion.ai'
 __license__ = 'MIT'
+__release__ = False

-__docs_models__ = 'https://spacy.io/docs/usage/models'
+__docs_models__ = 'https://alpha.spacy.io/usage/models'
 __download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
 __compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json'
 __shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts.json'
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@ -1,5 +1,5 @@
 # Reserve 64 values for flag features
-cpdef enum attr_id_t:
+cdef enum attr_id_t:
    NULL_ATTR
    IS_ALPHA
    IS_ASCII
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@ -94,6 +94,7 @@ IDS = {

 # ATTR IDs, in order of the symbol
 NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
+locals().update(IDS)


 def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
--- a/spacy/cli/init.py
+++ b/spacy/cli/init.py
@ -2,5 +2,9 @@ from .download import download
 from .info import info
 from .link import link
 from .package import package
+from .profile import profile
 from .train import train
+from .evaluate import evaluate
 from .convert import convert
+from .model import model
+from .validate import validate
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import plac
 from pathlib import Path

-from .converters import conllu2json, iob2json
+from .converters import conllu2json, iob2json, conll_ner2json
 from ..util import prints

 # Converters are matched by file extension. To add a converter, add a new entry
@ -12,19 +12,22 @@ from ..util import prints
 # from /converters.

 CONVERTERS = {
-    '.conllu': conllu2json,
-    '.conll': conllu2json,
-    '.iob': iob2json
+    'conllu': conllu2json,
+    'conll': conllu2json,
+    'ner': conll_ner2json,
+    'iob': iob2json,
 }


@plac.annotations(
    input_file=("input file", "positional", None, str),
    output_dir=("output directory for converted file", "positional", None, str),
-    n_sents=("Number of sentences per doc", "option", "n", float),
+    n_sents=("Number of sentences per doc", "option", "n", int),
+    converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
    morphology=("Enable appending morphology to tags", "flag", "m", bool)
 )
-def convert(cmd, input_file, output_dir, n_sents, morphology):
+def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
+            converter='auto'):
    """
    Convert files into JSON format for use with train command and other
    experiment management functions.
@ -35,9 +38,11 @@ def convert(cmd, input_file, output_dir, n_sents, morphology):
        prints(input_path, title="Input file not found", exits=1)
    if not output_path.exists():
        prints(output_path, title="Output directory not found", exits=1)
-    file_ext = input_path.suffix
-    if not file_ext in CONVERTERS:
-        prints("Can't find converter for %s" % input_path.parts[-1],
-               title="Unknown format", exits=1)
-    CONVERTERS[file_ext](input_path, output_path,
-            n_sents=n_sents, use_morphology=morphology)
+    if converter == 'auto':
+        converter = input_path.suffix[1:]
+    if not converter in CONVERTERS:
+            prints("Can't find converter for %s" % converter,
+                title="Unknown format", exits=1)
+    func = CONVERTERS[converter]
+    func(input_path, output_path,
+         n_sents=n_sents, use_morphology=morphology)
--- a/spacy/cli/converters/init.py
+++ b/spacy/cli/converters/init.py
@ -1,2 +1,3 @@
 from .conllu2json import conllu2json
 from .iob2json import iob2json
+from .conll_ner2json import conll_ner2json
--- a/spacy/cli/converters/conll_ner2json.py
+++ b/spacy/cli/converters/conll_ner2json.py
@ -0,0 +1,50 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...compat import json_dumps, path2str
+from ...util import prints
+from ...gold import iob_to_biluo
+
+
+def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False):
+    """
+    Convert files in the CoNLL-2003 NER format into JSON format for use with train cli.
+
+    input_path: path object of the file to convert.
+    output_path: path object of the directory the JSON file is written to.
+        The file name is the input name with ".conll" removed and ".json"
+        appended.
+    n_sents, use_morphology: accepted so the signature matches the other
+        converters; neither is used by this converter.
+    """
+    docs = read_conll_ner(input_path)
+
+    output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
+    output_file = output_path / output_filename
+    with output_file.open('w', encoding='utf-8') as f:
+        f.write(json_dumps(docs))
+    prints("Created %d documents" % len(docs),
+           title="Generated output file %s" % path2str(output_file))
+
+
+def read_conll_ner(input_path):
+    """
+    Read a CoNLL-2003 style NER file and return a list of document dicts.
+
+    Documents are separated by the '-DOCSTART- -X- O O' marker and
+    sentences by blank lines. Every token line must hold exactly four
+    whitespace-separated columns (word, tag, chunk, IOB entity); any other
+    column count raises ValueError. The chunk column is read but not
+    exported. Entity tags are passed through iob_to_biluo.
+
+    input_path: path object of the file to read (decoded as UTF-8).
+    RETURNS (list): dicts with an 'id' (running index) and one paragraph
+        holding the document's sentences.
+    """
+    # Close the handle deterministically instead of leaving it to the GC.
+    with input_path.open('r', encoding='utf-8') as file_:
+        text = file_.read()
+    delimit_docs = '-DOCSTART- -X- O O'
+    output_docs = []
+    for doc in text.strip().split(delimit_docs):
+        doc = doc.strip()
+        if not doc:
+            continue
+        output_doc = []
+        for sent in doc.split('\n\n'):
+            sent = sent.strip()
+            if not sent:
+                continue
+            lines = [line.strip() for line in sent.split('\n') if line.strip()]
+            words, tags, chunks, iob_ents = zip(*[line.split() for line in lines])
+            biluo_ents = iob_to_biluo(iob_ents)
+            output_doc.append({'tokens': [
+                {'orth': w, 'tag': tag, 'ner': ent} for (w, tag, ent) in
+                zip(words, tags, biluo_ents)
+            ]})
+        output_docs.append({
+            'id': len(output_docs),
+            'paragraphs': [{'sentences': output_doc}]
+        })
+    return output_docs
--- a/spacy/cli/converters/iob2json.py
+++ b/spacy/cli/converters/iob2json.py
@ -1,5 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
+from cytoolz import partition_all, concat

 from ...compat import json_dumps, path2str
 from ...util import prints
@ -10,11 +11,9 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k):
    """
    Convert IOB files into JSON format for use with train cli.
    """
-    # TODO: This isn't complete yet -- need to map from IOB to
-    # BILUO
    with input_path.open('r', encoding='utf8') as file_:
-        docs = read_iob(file_)
-
+        sentences = read_iob(file_)
+    docs = merge_sentences(sentences, n_sents)
    output_filename = input_path.parts[-1].replace(".iob", ".json")
    output_file = output_path / output_filename
    with output_file.open('w', encoding='utf-8') as f:
@ -23,9 +22,9 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k):
           title="Generated output file %s" % path2str(output_file))


-def read_iob(file_):
+def read_iob(raw_sents):
    sentences = []
-    for line in file_:
+    for line in raw_sents:
        if not line.strip():
            continue
        tokens = [t.split('|') for t in line.split()]
@ -43,3 +42,15 @@ def read_iob(file_):
    paragraphs = [{'sentences': [sent]} for sent in sentences]
    docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs]
    return docs
+
+def merge_sentences(docs, n_sents):
+    """
+    Merge consecutive docs into groups of up to `n_sents` docs.
+
+    The first doc of each group is reused as the merged doc: the sentences
+    of the first paragraph of every other doc in the group are appended to
+    its first paragraph. The first doc of each group is modified in place.
+
+    docs (iterable): doc dicts as returned by read_iob.
+    n_sents (int): maximum number of docs merged into one.
+    RETURNS (list): the merged docs, one per group.
+    """
+    merged = []
+    for group in partition_all(n_sents, docs):
+        group = list(group)
+        first = group.pop(0)
+        to_extend = first['paragraphs'][0]['sentences']
+        # `first` has already been popped, so every remaining doc must be
+        # merged in; slicing with [1:] here would drop the second doc.
+        for doc in group:
+            to_extend.extend(doc['paragraphs'][0]['sentences'])
+        merged.append(first)
+    return merged
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@ -8,7 +8,7 @@ import subprocess
 import sys

 from .link import link
-from ..util import prints
+from ..util import prints, get_package_path
 from .. import about


@ -24,24 +24,29 @@ def download(cmd, model, direct=False):
    with version.
    """
    if direct:
-        download_model('{m}/{m}.tar.gz'.format(m=model))
+        dl = download_model('{m}/{m}.tar.gz'.format(m=model))
    else:
        shortcuts = get_json(about.__shortcuts__, "available shortcuts")
        model_name = shortcuts.get(model, model)
        compatibility = get_compatibility()
        version = get_version(model_name, compatibility)
-        download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
-        try:
-            link(None, model_name, model, force=True)
-        except:
-            # Dirty, but since spacy.download and the auto-linking is mostly
-            # a convenience wrapper, it's best to show a success message and
-            # loading instructions, even if linking fails.
-            prints("Creating a shortcut link for 'en' didn't work (maybe you "
-                   "don't have admin permissions?), but you can still load "
-                   "the model via its full package name:",
-                   "nlp = spacy.load('%s')" % model_name,
-                   title="Download successful")
+        dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
+        if dl == 0:
+            try:
+                # Get package path here because link uses
+                # pip.get_installed_distributions() to check if model is a package,
+                # which fails if model was just installed via subprocess
+                package_path = get_package_path(model_name)
+                link(None, model_name, model, force=True, model_path=package_path)
+            except:
+                # Dirty, but since spacy.download and the auto-linking is mostly
+                # a convenience wrapper, it's best to show a success message and
+                # loading instructions, even if linking fails.
+                prints("Creating a shortcut link for 'en' didn't work (maybe you "
+                    "don't have admin permissions?), but you can still load "
+                    "the model via its full package name:",
+                    "nlp = spacy.load('%s')" % model_name,
+                    title="Download successful")


 def get_json(url, desc):
@ -73,6 +78,6 @@ def get_version(model, comp):

 def download_model(filename):
    download_url = about.__download_url__ + '/' + filename
-    subprocess.call([sys.executable, '-m',
+    return subprocess.call([sys.executable, '-m',
        'pip', 'install', '--no-cache-dir', download_url],
        env=os.environ.copy())
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@ -0,0 +1,120 @@
+# coding: utf8
+from __future__ import unicode_literals, division, print_function
+
+import plac
+import json
+from collections import defaultdict
+import cytoolz
+from pathlib import Path
+import dill
+import tqdm
+from thinc.neural._classes.model import Model
+from thinc.neural.optimizers import linear_decay
+from timeit import default_timer as timer
+import random
+import numpy.random
+
+from ..tokens.doc import Doc
+from ..scorer import Scorer
+from ..gold import GoldParse, merge_sents
+from ..gold import GoldCorpus, minibatch
+from ..util import prints
+from .. import util
+from .. import about
+from .. import displacy
+from ..compat import json_dumps
+
+random.seed(0)
+numpy.random.seed(0)
+
+
+@plac.annotations(
+    model=("Model name or path", "positional", None, str),
+    data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
+    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
+    gpu_id=("Use GPU", "option", "g", int),
+    displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
+    displacy_limit=("Limit of parses to render as HTML", "option", "dl", int)
+)
+def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
+             displacy_path=None, displacy_limit=25):
+    """
+    Evaluate a model. To render a sample of parses in a HTML file, set an output
+    directory as the displacy_path argument.
+    """
+    if gpu_id >= 0:
+        util.use_gpu(gpu_id)
+    util.set_env_log(False)
+    data_path = util.ensure_path(data_path)
+    displacy_path = util.ensure_path(displacy_path)
+    if not data_path.exists():
+        prints(data_path, title="Evaluation data not found", exits=1)
+    if displacy_path and not displacy_path.exists():
+        prints(displacy_path, title="Visualization output directory not found", exits=1)
+    # The evaluation data is passed as both corpus locations; only the dev
+    # docs are read below.
+    corpus = GoldCorpus(data_path, data_path)
+    nlp = util.load_model(model)
+    dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
+    # Only the call to nlp.evaluate is timed.
+    begin = timer()
+    scorer = nlp.evaluate(dev_docs, verbose=False)
+    elapsed = timer() - begin
+    nwords = sum(len(pair[0]) for pair in dev_docs)
+    print_results(scorer, time=elapsed, words=nwords, wps=nwords / elapsed)
+    if not displacy_path:
+        return
+    docs, golds = zip(*dev_docs)
+    pipeline = nlp.meta.get('pipeline', [])
+    render_parses(docs, displacy_path, model_name=model, limit=displacy_limit,
+                  deps='parser' in pipeline, ents='ner' in pipeline)
+    prints(displacy_path, title="Generated %s parses as HTML" % displacy_limit)
+
+
+def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=True):
+    """
+    Render the first `limit` docs with displaCy and write the HTML pages
+    into `output_path`.
+
+    docs (sequence): docs to render; must not be empty, because the title
+        is stored in the user_data of the first doc.
+    output_path: path object of an existing output directory.
+    model_name (unicode): stored as user_data['title'] on the first doc.
+    limit (int): maximum number of docs to render.
+    deps (bool): write dependency parses to 'parses.html'.
+    ents (bool): write entities to 'entities.html'.
+    """
+    docs[0].user_data['title'] = model_name
+    sample = docs[:limit]
+    # Render before opening, so a failed render leaves no empty file behind.
+    # Write UTF-8 explicitly: the platform default encoding may not be able
+    # to represent every character of the rendered markup.
+    if ents:
+        html = displacy.render(sample, style='ent', page=True)
+        with (output_path / 'entities.html').open('w', encoding='utf8') as file_:
+            file_.write(html)
+    if deps:
+        html = displacy.render(sample, style='dep', page=True,
+                               options={'compact': True})
+        with (output_path / 'parses.html').open('w', encoding='utf8') as file_:
+            file_.write(html)
+
+
+def print_progress(itn, losses, dev_scores, wps=0.0):
+    """
+    Print one tab-separated row of training progress.
+
+    itn (int): iteration number, printed first.
+    losses (dict): losses keyed by 'parser', 'ner' and 'tagger'; missing
+        keys count as 0.0.
+    dev_scores (dict): scores that override the 0.0 defaults (and the
+        losses, if they use the same keys).
+    wps (float): words per second, printed last.
+    """
+    scores = dict.fromkeys(
+        ('dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
+         'ents_p', 'ents_r', 'ents_f', 'wps'), 0.0)
+    scores.update(
+        dep_loss=losses.get('parser', 0.0),
+        ner_loss=losses.get('ner', 0.0),
+        tag_loss=losses.get('tagger', 0.0))
+    scores.update(dev_scores)
+    # wps is set last so that dev_scores cannot override it.
+    scores['wps'] = wps
+    columns = [
+        '{:d}',
+        '{dep_loss:.3f}',
+        '{ner_loss:.3f}',
+        '{uas:.3f}',
+        '{ents_p:.3f}',
+        '{ents_r:.3f}',
+        '{ents_f:.3f}',
+        '{tags_acc:.3f}',
+        '{token_acc:.3f}',
+        '{wps:.1f}',
+    ]
+    print('\t'.join(columns).format(itn, **scores))
+
+
+def print_results(scorer, time, words, wps):
+    """
+    Print the evaluation summary as a table titled "Results".
+
+    scorer: object providing token_acc, tags_acc, uas, las, ents_p, ents_r
+        and ents_f; each is formatted with two decimals.
+    time (float): elapsed seconds.
+    words (int): number of words evaluated.
+    wps (float): words per second, printed without decimals.
+    """
+    results = {
+        'Time': '%.2f s' % time,
+        'Words': words,
+        'Words/s': '%.0f' % wps,
+    }
+    score_columns = (
+        ('TOK', 'token_acc'),
+        ('POS', 'tags_acc'),
+        ('UAS', 'uas'),
+        ('LAS', 'las'),
+        ('NER P', 'ents_p'),
+        ('NER R', 'ents_r'),
+        ('NER F', 'ents_f'),
+    )
+    for label, attr in score_columns:
+        results[label] = '%.2f' % getattr(scorer, attr)
+    util.print_table(results, title="Results")
--- a/spacy/cli/link.py
+++ b/spacy/cli/link.py
@ -14,7 +14,7 @@ from .. import util
    link_name=("name of shortuct link to create", "positional", None, str),
    force=("force overwriting of existing link", "flag", "f", bool)
 )
-def link(cmd, origin, link_name, force=False):
+def link(cmd, origin, link_name, force=False, model_path=None):
    """
    Create a symlink for models within the spacy/data directory. Accepts
    either the name of a pip package, or the local path to the model data
@ -23,10 +23,17 @@ def link(cmd, origin, link_name, force=False):
    if util.is_package(origin):
        model_path = util.get_package_path(origin)
    else:
-        model_path = Path(origin)
+        model_path = Path(origin) if model_path is None else Path(model_path)
    if not model_path.exists():
        prints("The data should be located in %s" % path2str(model_path),
               title="Can't locate model data", exits=1)
+    data_path = util.get_data_path()
+    if not data_path or not data_path.exists():
+        spacy_loc = Path(__file__).parent.parent
+        prints("Make sure a directory `/data` exists within your spaCy "
+               "installation and try again. The data directory should be "
+               "located here:", path2str(spacy_loc), exits=1,
+               title="Can't find the spaCy data path to create model symlink")
    link_path = util.get_data_path() / link_name
    if link_path.exists() and not force:
        prints("To overwrite an existing link, use the --force flag.",
--- a/spacy/cli/model.py
+++ b/spacy/cli/model.py
@ -0,0 +1,137 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import bz2
+import gzip
+import math
+from ast import literal_eval
+from pathlib import Path
+
+import numpy as np
+import spacy
+from preshed.counter import PreshCounter
+
+from .. import util
+from ..compat import fix_text
+
+
+def model(cmd, lang, model_dir, freqs_data, clusters_data, vectors_data,
+          min_doc_freq=5, min_word_freq=200):
+    """
+    Build a blank model with a populated vocab and save it to model_dir.
+
+    lang: language code passed to spacy.blank.
+    model_dir: output directory for the model.
+    freqs_data: path to the word frequencies file (required).
+    clusters_data: path to a Brown clusters file, or a false value to skip.
+    vectors_data: path to a bz2-compressed vectors file, or a false value
+        to skip.
+    min_doc_freq: minimum document frequency for a word to be kept.
+    min_word_freq: minimum word frequency for a word to be kept.
+    """
+    model_path = Path(model_dir)
+    freqs_path = Path(freqs_data)
+    clusters_path = Path(clusters_data) if clusters_data else None
+    vectors_path = Path(vectors_data) if vectors_data else None
+
+    check_dirs(freqs_path, clusters_path, vectors_path)
+    nlp = spacy.blank(lang)
+    vocab = nlp.vocab
+    # min_freq must come from min_word_freq; it was previously fed
+    # min_doc_freq, so the word-frequency threshold was ignored.
+    probs, oov_prob = read_probs(
+        freqs_path, min_doc_freq=int(min_doc_freq), min_freq=int(min_word_freq))
+    clusters = read_clusters(clusters_path) if clusters_path else {}
+    populate_vocab(vocab, clusters, probs, oov_prob)
+    # Vectors are optional, like clusters: skip them when no path is given.
+    if vectors_path:
+        add_vectors(vocab, vectors_path)
+    create_model(model_path, nlp)
+
+
+def add_vectors(vocab, vectors_path):
+    """
+    Load vectors from a bz2-compressed text file into `vocab`.
+
+    The first line holds two fields, of which the second is the vector
+    width. Every following line is a word followed by its space-separated
+    values. Only words already contained in `vocab` receive a vector.
+    """
+    with bz2.BZ2File(vectors_path.as_posix()) as vectors_file:
+        num_words, dim = next(vectors_file).split()
+        vocab.clear_vectors(int(dim))
+        for raw_line in vectors_file:
+            pieces = raw_line.decode("utf8").strip().split(" ")
+            word, values = pieces[0], pieces[1:]
+            vector = np.array([float(val) for val in values])
+            if word not in vocab:
+                continue
+            vocab.set_vector(word, vector)
+
+
+def create_model(model_path, model):
+    """Save `model` to `model_path`, creating the directory if it is missing."""
+    directory_missing = not model_path.exists()
+    if directory_missing:
+        model_path.mkdir()
+    model.to_disk(model_path.as_posix())
+
+
+def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
+    """
+    Read word frequencies and turn them into log probabilities.
+
+    Each line of the file holds tab-separated `freq`, `doc_freq` and `key`,
+    where `key` is a Python literal of the word. The file is read twice:
+    once to collect and smooth the counts, once to compute the
+    probabilities of the words that pass the thresholds.
+
+    freqs_path: path object of the frequencies file (optionally gzipped).
+    max_length (int): keys of this length or longer are skipped.
+    min_doc_freq (int): minimum document frequency for a word to be kept.
+    min_freq (int): minimum frequency for a word to be kept.
+    RETURNS (tuple): dict of word -> log probability, and the log
+        probability derived from the smoothed count of 0.
+    """
+    counts = PreshCounter()
+    total = 0
+    # First pass: accumulate the counts. The handle is closed on exit.
+    with check_unzip(freqs_path) as freqs_file:
+        for i, line in enumerate(freqs_file):
+            freq, doc_freq, key = line.rstrip().split('\t', 2)
+            freq = int(freq)
+            counts.inc(i + 1, freq)
+            total += freq
+    counts.smooth()
+    log_total = math.log(total)
+    probs = {}
+    # Second pass: compute probabilities for the words that are kept.
+    with check_unzip(freqs_path) as freqs_file:
+        for line in freqs_file:
+            freq, doc_freq, key = line.rstrip().split('\t', 2)
+            doc_freq = int(doc_freq)
+            freq = int(freq)
+            if doc_freq >= min_doc_freq and freq >= min_freq and len(
+                    key) < max_length:
+                word = literal_eval(key)
+                smooth_count = counts.smoother(int(freq))
+                probs[word] = math.log(smooth_count) - log_total
+    oov_prob = math.log(counts.smoother(0)) - log_total
+    return probs, oov_prob
+
+
+def read_clusters(clusters_path):
+    """
+    Read a clusters file with one `cluster word freq` triple per line.
+
+    Lines that do not split into exactly three fields are skipped. Words
+    with a frequency below 3 are assigned the cluster '0'. Lower-, title-
+    and upper-cased variants of every word are added when they are not
+    already present.
+
+    RETURNS (dict): word -> cluster string.
+    """
+    clusters = {}
+    with clusters_path.open() as f:
+        for line in f:
+            try:
+                cluster, word, freq = line.split()
+                word = fix_text(word)
+            except ValueError:
+                continue
+            # If the clusterer has only seen the word a few times, its
+            # cluster is unreliable.
+            clusters[word] = cluster if int(freq) >= 3 else '0'
+    # Expand clusters with re-casing
+    for word, cluster in list(clusters.items()):
+        for variant in (word.lower(), word.title(), word.upper()):
+            clusters.setdefault(variant, cluster)
+    return clusters
+
+
+def populate_vocab(vocab, clusters, probs, oov_prob):
+    """
+    Set `prob`, `is_oov` and `cluster` on the lexeme of every word in
+    `probs`, visiting the words from highest to lowest probability.
+
+    vocab: mapping from word to lexeme, indexed with `vocab[word]`.
+    clusters (dict): word -> cluster as a string of binary digits.
+    probs (dict): word -> log probability.
+    oov_prob: accepted but not used here.
+    """
+    by_prob = sorted(probs.items(), key=lambda item: item[1])
+    for word, prob in reversed(by_prob):
+        lexeme = vocab[word]
+        lexeme.prob = prob
+        lexeme.is_oov = False
+        # Decode as a little-endian string, so that we can do & 15 to get
+        # the first 4 bits. See _parse_features.pyx
+        lexeme.cluster = int(clusters[word][::-1], 2) if word in clusters else 0
+
+
+def check_unzip(file_path):
+    """
+    Open `file_path` for reading and return the file object.
+
+    Paths ending in 'gz' are opened with gzip.open; everything else is
+    opened with the path's own open().
+    """
+    path_str = file_path.as_posix()
+    if not path_str.endswith('gz'):
+        return file_path.open()
+    return gzip.open(path_str)
+
+
+def check_dirs(freqs_data, clusters_data, vectors_data):
+    """
+    Check that the input files exist, calling util.sys_exit otherwise.
+
+    The frequencies file is mandatory. The clusters and vectors files are
+    only checked when a path is given.
+    """
+    if not freqs_data.is_file():
+        util.sys_exit(freqs_data.as_posix(), title="No frequencies file found")
+    optional_inputs = (
+        (clusters_data, "No Brown clusters file found"),
+        (vectors_data, "No word vectors file found"),
+    )
+    for path, title in optional_inputs:
+        if path and not path.is_file():
+            util.sys_exit(path.as_posix(), title=title)
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@ -15,10 +15,11 @@ from .. import about
@plac.annotations(
    input_dir=("directory with model data", "positional", None, str),
    output_dir=("output parent directory", "positional", None, str),
-    meta=("path to meta.json", "option", "m", str),
+    meta_path=("path to meta.json", "option", "m", str),
+    create_meta=("create meta.json, even if one exists in directory", "flag", "c", bool),
    force=("force overwriting of existing folder in output directory", "flag", "f", bool)
 )
-def package(cmd, input_dir, output_dir, meta=None, force=False):
+def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False):
    """
    Generate Python package for model data, including meta and required
    installation files. A new directory will be created in the specified
@ -26,7 +27,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False):
    """
    input_path = util.ensure_path(input_dir)
    output_path = util.ensure_path(output_dir)
-    meta_path = util.ensure_path(meta)
+    meta_path = util.ensure_path(meta_path)
    if not input_path or not input_path.exists():
        prints(input_path, title="Model directory not found", exits=1)
    if not output_path or not output_path.exists():
@ -38,7 +39,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False):
    template_manifest = get_template('MANIFEST.in')
    template_init = get_template('xx_model_name/__init__.py')
    meta_path = meta_path or input_path / 'meta.json'
-    if meta_path.is_file():
+    if not create_meta and meta_path.is_file():
        prints(meta_path, title="Reading meta.json from file")
        meta = util.read_json(meta_path)
    else:
@ -100,12 +101,15 @@ def generate_meta():
 def generate_pipeline():
    prints("If set to 'True', the default pipeline is used. If set to 'False', "
           "the pipeline will be disabled. Components should be specified as a "
-           "comma-separated list of component names, e.g. vectorizer, tagger, "
+           "comma-separated list of component names, e.g. tensorizer, tagger, "
           "parser, ner. For more information, see the docs on processing pipelines.",
           title="Enter your model's pipeline components")
    pipeline = util.get_raw_input("Pipeline components", True)
-    replace = {'True': True, 'False': False}
-    return replace[pipeline] if pipeline in replace else pipeline.split(', ')
+    subs = {'True': True, 'False': False}
+    if pipeline in subs:
+        return subs[pipeline]
+    else:
+        return [p.strip() for p in pipeline.split(',')]


 def validate_meta(meta, keys):
--- a/spacy/cli/profile.py
+++ b/spacy/cli/profile.py
@ -0,0 +1,45 @@
+# coding: utf8
+from __future__ import unicode_literals, division, print_function
+
+import plac
+from pathlib import Path
+import ujson
+import cProfile
+import pstats
+
+import spacy
+import sys
+import tqdm
+import cytoolz
+
+
+def read_inputs(loc):
+    """
+    Yield the 'text' field of every JSON line in the input.
+
+    loc: location of a file with one JSON object per line, or None to read
+        the lines from stdin.
+    YIELDS: the value stored under 'text' in each object.
+    """
+    if loc is None:
+        file_ = (line.encode('utf8') for line in sys.stdin)
+        for line in file_:
+            yield ujson.loads(line)['text']
+        return
+    # Close the file once it is exhausted or the generator is closed.
+    with Path(loc).open() as file_:
+        for line in file_:
+            yield ujson.loads(line)['text']
+
+
+@plac.annotations(
+    lang=("model/language", "positional", None, str),
+    inputs=("Location of input file", "positional", None, read_inputs)
+)
+def profile(cmd, lang, inputs=None):
+    """
+    Profile a spaCy pipeline, to find out which functions take the most time.
+    """
+    if inputs is None:
+        # With no location given, `inputs` is still the None default rather
+        # than a generator of texts, so read from stdin explicitly.
+        inputs = read_inputs(None)
+    nlp = spacy.load(lang)
+    # Profile at most the first 10000 texts.
+    texts = list(cytoolz.take(10000, inputs))
+    # The stats are written to "Profile.prof" in the working directory and
+    # then printed sorted by internal time.
+    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
+    s = pstats.Stats("Profile.prof")
+    s.strip_dirs().sort_stats("time").print_stats()
+
+
+def parse_texts(nlp, texts):
+    """
+    Run all `texts` through `nlp.pipe` in batches of 128 with a progress
+    bar, discarding the resulting docs.
+    """
+    docs = nlp.pipe(tqdm.tqdm(texts), batch_size=128)
+    for _ in docs:
+        pass
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -8,8 +8,11 @@ import cytoolz
 from pathlib import Path
 import dill
 import tqdm
+from thinc.neural._classes.model import Model
 from thinc.neural.optimizers import linear_decay
 from timeit import default_timer as timer
+import random
+import numpy.random

 from ..tokens.doc import Doc
 from ..scorer import Scorer
@ -17,9 +20,13 @@ from ..gold import GoldParse, merge_sents
 from ..gold import GoldCorpus, minibatch
 from ..util import prints
 from .. import util
+from .. import about
 from .. import displacy
 from ..compat import json_dumps

+random.seed(0)
+numpy.random.seed(0)
+

@plac.annotations(
    lang=("model language", "positional", None, str),
@ -29,13 +36,17 @@ from ..compat import json_dumps
    n_iter=("number of iterations", "option", "n", int),
    n_sents=("number of sentences", "option", "ns", int),
    use_gpu=("Use GPU", "option", "g", int),
-    resume=("Whether to resume training", "flag", "R", bool),
+    vectors=("Model to load vectors from", "option", "v"),
    no_tagger=("Don't train tagger", "flag", "T", bool),
    no_parser=("Don't train parser", "flag", "P", bool),
-    no_entities=("Don't train NER", "flag", "N", bool)
+    no_entities=("Don't train NER", "flag", "N", bool),
+    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
+    version=("Model version", "option", "V", str),
+    meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path)
 )
-def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
-          use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False):
+def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
+          use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False,
+          gold_preproc=False, version="0.0.0", meta_path=None):
    """
    Train a model. Expects data in spaCy's JSON format.
    """
@ -44,19 +55,26 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
    output_path = util.ensure_path(output_dir)
    train_path = util.ensure_path(train_data)
    dev_path = util.ensure_path(dev_data)
+    meta_path = util.ensure_path(meta_path)
    if not output_path.exists():
        output_path.mkdir()
    if not train_path.exists():
        prints(train_path, title="Training data not found", exits=1)
    if dev_path and not dev_path.exists():
        prints(dev_path, title="Development data not found", exits=1)
+    if meta_path is not None and not meta_path.exists():
+        prints(meta_path, title="meta.json not found", exits=1)
+    meta = util.read_json(meta_path) if meta_path else {}
+    if not isinstance(meta, dict):
+        prints("Expected dict but got: {}".format(type(meta)),
+               title="Not a valid meta.json format", exits=1)
+    meta.setdefault('lang', lang)
+    meta.setdefault('name', 'unnamed')

-    lang_class = util.get_lang_class(lang)
-
-    pipeline = ['token_vectors', 'tags', 'dependencies', 'entities']
-    if no_tagger and 'tags' in pipeline: pipeline.remove('tags')
-    if no_parser and 'dependencies' in pipeline: pipeline.remove('dependencies')
-    if no_entities and 'entities' in pipeline: pipeline.remove('entities')
+    pipeline = ['tagger', 'parser', 'ner']
+    if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger')
+    if no_parser and 'parser' in pipeline: pipeline.remove('parser')
+    if no_entities and 'ner' in pipeline: pipeline.remove('ner')

    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
@ -66,27 +84,29 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                                  util.env_opt('dropout_to', 0.2),
                                  util.env_opt('dropout_decay', 0.0))
    batch_sizes = util.compounding(util.env_opt('batch_from', 1),
-                                   util.env_opt('batch_to', 64),
+                                   util.env_opt('batch_to', 16),
                                   util.env_opt('batch_compound', 1.001))
-
-    if resume:
-        prints(output_path / 'model19.pickle', title="Resuming training")
-        nlp = dill.load((output_path / 'model19.pickle').open('rb'))
-    else:
-        nlp = lang_class(pipeline=pipeline)
    corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
    n_train_words = corpus.count_train()

+    lang_class = util.get_lang_class(lang)
+    nlp = lang_class()
+    meta['pipeline'] = pipeline
+    nlp.meta.update(meta)
+    if vectors:
+        util.load_model(vectors, vocab=nlp.vocab)
+    for name in pipeline:
+        nlp.add_pipe(nlp.create_pipe(name), name=name)
    optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
+    nlp._optimizer = None

-    print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
+    print("Itn.\tP.Loss\tN.Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
    try:
+        train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
+                                       gold_preproc=gold_preproc, max_length=0)
+        train_docs = list(train_docs)
        for i in range(n_iter):
-            if resume:
-                i += 20
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
-                train_docs = corpus.train_docs(nlp, projectivize=True,
-                                               gold_preproc=False, max_length=0)
                losses = {}
                for batch in minibatch(train_docs, size=batch_sizes):
                    docs, golds = zip(*batch)
@ -98,24 +118,51 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                util.set_env_log(False)
                epoch_model_path = output_path / ('model%d' % i)
                nlp.to_disk(epoch_model_path)
-                with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
-                    dill.dump(nlp, file_, -1)
-                nlp_loaded = lang_class(pipeline=pipeline)
-                nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
-                scorer = nlp_loaded.evaluate(
-                            corpus.dev_docs(
+                nlp_loaded = util.load_model_from_path(epoch_model_path)
+                dev_docs = list(corpus.dev_docs(
                                nlp_loaded,
-                                gold_preproc=False))
+                                gold_preproc=gold_preproc))
+                nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
+                start_time = timer()
+                scorer = nlp_loaded.evaluate(dev_docs)
+                end_time = timer()
+                if use_gpu < 0:
+                    gpu_wps = None
+                    cpu_wps = nwords/(end_time-start_time)
+                else:
+                    gpu_wps = nwords/(end_time-start_time)
+                    with Model.use_device('cpu'):
+                        nlp_loaded = util.load_model_from_path(epoch_model_path)
+                        dev_docs = list(corpus.dev_docs(
+                                        nlp_loaded, gold_preproc=gold_preproc))
+                        start_time = timer()
+                        scorer = nlp_loaded.evaluate(dev_docs)
+                        end_time = timer()
+                        cpu_wps = nwords/(end_time-start_time)
                acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
                with acc_loc.open('w') as file_:
                    file_.write(json_dumps(scorer.scores))
+                meta_loc = output_path / ('model%d' % i) / 'meta.json'
+                meta['accuracy'] = scorer.scores
+                meta['speed'] = {'nwords': nwords, 'cpu':cpu_wps, 'gpu': gpu_wps}
+                meta['lang'] = nlp.lang
+                meta['pipeline'] = pipeline
+                meta['spacy_version'] = '>=%s' % about.__version__
+                meta.setdefault('name', 'model%d' % i)
+                meta.setdefault('version', version)
+
+                with meta_loc.open('w') as file_:
+                    file_.write(json_dumps(meta))
                util.set_env_log(True)
-            print_progress(i, losses, scorer.scores)
+            print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps)
    finally:
        print("Saving model...")
-        with (output_path / 'model-final.pickle').open('wb') as file_:
-            with nlp.use_params(optimizer.averages):
-                dill.dump(nlp, file_, -1)
+        try:
+            with (output_path / 'model-final.pickle').open('wb') as file_:
+                with nlp.use_params(optimizer.averages):
+                    dill.dump(nlp, file_, -1)
+        except:
+            print("Error saving model")


 def _render_parses(i, to_render):
@ -128,25 +175,30 @@ def _render_parses(i, to_render):
        file_.write(html)


-def print_progress(itn, losses, dev_scores, wps=0.0):
+def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
    scores = {}
    for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
-                'ents_p', 'ents_r', 'ents_f', 'wps']:
+                'ents_p', 'ents_r', 'ents_f', 'cpu_wps', 'gpu_wps']:
        scores[col] = 0.0
    scores['dep_loss'] = losses.get('parser', 0.0)
+    scores['ner_loss'] = losses.get('ner', 0.0)
    scores['tag_loss'] = losses.get('tagger', 0.0)
    scores.update(dev_scores)
-    scores['wps'] = wps
+    scores['cpu_wps'] = cpu_wps
+    scores['gpu_wps'] = gpu_wps or 0.0
    tpl = '\t'.join((
        '{:d}',
        '{dep_loss:.3f}',
+        '{ner_loss:.3f}',
        '{uas:.3f}',
        '{ents_p:.3f}',
        '{ents_r:.3f}',
        '{ents_f:.3f}',
        '{tags_acc:.3f}',
        '{token_acc:.3f}',
-        '{wps:.1f}'))
+        '{cpu_wps:.1f}',
+        '{gpu_wps:.1f}',
+    ))
    print(tpl.format(itn, **scores))


--- a/spacy/cli/validate.py
+++ b/spacy/cli/validate.py
@ -0,0 +1,123 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import requests
+import pkg_resources
+from pathlib import Path
+
+from ..compat import path2str, locale_escape
+from ..util import prints, get_data_path, read_json
+from .. import about
+
+
+def validate(cmd):
+    """Validate that the currently installed version of spaCy is compatible
+    with the installed models. Should be run after `pip install -U spacy`.
+
+    cmd: Not used in the body. NOTE(review): presumably the command name
+        handed over by the CLI dispatcher -- confirm against the caller.
+    """
+    r = requests.get(about.__compatibility__)
+    if r.status_code != 200:
+        prints("Couldn't fetch compatibility table.",
+               title="Server error (%d)" % r.status_code, exits=1)
+    compat = r.json()['spacy']
+    all_models = set()
+    # Loop over a shallow copy: the version lists stored in `compat` are
+    # replaced by their reformatted counterparts while iterating.
+    for spacy_v, models in dict(compat).items():
+        all_models.update(models.keys())
+        for model, model_vs in models.items():
+            compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
+
+    # The running version (e.g. a development build) may be missing from the
+    # table; report that the same way as the server error above instead of
+    # failing with a bare KeyError.
+    current_compat = compat.get(about.__version__)
+    if current_compat is None:
+        prints(about.__compatibility__, exits=1,
+               title="Can't find spaCy v{} in compatibility table"
+               .format(about.__version__))
+    model_links = get_model_links(current_compat)
+    model_pkgs = get_model_pkgs(current_compat, all_models)
+    incompat_links = {l for l, d in model_links.items() if not d['compat']}
+    incompat_models = {d['name'] for _, d in model_pkgs.items() if not d['compat']}
+    incompat_models.update([d['name'] for _, d in model_links.items() if not d['compat']])
+    # Incompatible models either have no release for this spaCy version at
+    # all, or can be fixed by downloading a compatible release.
+    na_models = [m for m in incompat_models if m not in current_compat]
+    update_models = [m for m in incompat_models if m in current_compat]
+
+    prints(path2str(Path(__file__).parent.parent),
+           title="Installed models (spaCy v{})".format(about.__version__))
+    if model_links or model_pkgs:
+        print(get_row('TYPE', 'NAME', 'MODEL', 'VERSION', ''))
+        for name, data in model_pkgs.items():
+            print(get_model_row(current_compat, name, data, 'package'))
+        for name, data in model_links.items():
+            print(get_model_row(current_compat, name, data, 'link'))
+    else:
+        prints("No models found in your current environment.", exits=0)
+
+    if update_models:
+        # Separate name so the `cmd` argument is not shadowed.
+        download_cmd = '    python -m spacy download {}'
+        print("\n    Use the following commands to update the model packages:")
+        print('\n'.join([download_cmd.format(pkg) for pkg in update_models]))
+
+    if na_models:
+        prints("The following models are not available for spaCy v{}: {}"
+               .format(about.__version__, ', '.join(na_models)))
+
+    if incompat_links:
+        prints("You may also want to overwrite the incompatible links using "
+               "the `spacy link` command with `--force`, or remove them from "
+               "the data directory. Data path: {}"
+               .format(path2str(get_data_path())))
+
+
+def get_model_links(compat):
+    """Collect the model directories found in the data path.
+
+    compat (dict): Compatibility entries for the current spaCy version,
+        passed through to `is_compat`.
+    RETURNS (dict): Maps each directory name to a dict with the keys
+        'name', 'version' and 'compat'. Empty if there is no data path.
+    """
+    data_path = get_data_path()
+    if not data_path:
+        return {}
+    links = {}
+    candidates = [path for path in data_path.iterdir() if is_model_path(path)]
+    for model_dir in candidates:
+        meta_file = Path(model_dir) / 'meta.json'
+        # Directories without a meta.json are silently left out.
+        if meta_file.exists():
+            meta = read_json(meta_file)
+            link_name = model_dir.parts[-1]
+            full_name = meta['lang'] + '_' + meta['name']
+            version = meta['version']
+            links[link_name] = {
+                'name': full_name,
+                'version': version,
+                'compat': is_compat(compat, full_name, version)}
+    return links
+
+
+def get_model_pkgs(compat, all_models):
+    """Collect installed packages whose name matches a known model.
+
+    compat (dict): Compatibility entries for the current spaCy version,
+        passed through to `is_compat`.
+    all_models (set): Names of all models listed in the compatibility table.
+    RETURNS (dict): Maps each matching package key to a dict with the keys
+        'name', 'version' and 'compat'.
+    """
+    found = {}
+    installed = pkg_resources.working_set.by_key
+    for dist_name, dist in installed.items():
+        # Package keys use dashes, model names use underscores.
+        model_name = dist_name.replace('-', '_')
+        if model_name not in all_models:
+            continue
+        dist_version = dist.version
+        found[dist_name] = {
+            'name': model_name,
+            'version': dist_version,
+            'compat': is_compat(compat, model_name, dist_version)}
+    return found
+
+
+def get_model_row(compat, name, data, type='package'):
+    """Format one table row describing an installed model.
+
+    compat (dict): Maps model names to the list of versions compatible with
+        the current spaCy version.
+    name (unicode): Package or link name shown in the NAME column.
+    data (dict): Entry with the keys 'name', 'version' and 'compat'.
+    type (unicode): Label for the TYPE column, e.g. 'package' or 'link'.
+    RETURNS (unicode): The formatted row.
+    """
+    tpl_red = '\x1b[38;5;1m{}\x1b[0m'
+    tpl_green = '\x1b[38;5;2m{}\x1b[0m'
+    if data['compat']:
+        comp = tpl_green.format(locale_escape('✔', errors='ignore'))
+        version = tpl_green.format(data['version'])
+    else:
+        # Point at the first listed compatible version; fall back to 'n/a'
+        # when the model is unknown or its version list is empty.
+        versions = compat.get(data['name']) or ['n/a']
+        comp = '--> {}'.format(versions[0])
+        version = tpl_red.format(data['version'])
+    return get_row(type, name, data['name'], version, comp)
+
+
+def get_row(*args):
+    """Lay out five values as one indented table row.
+
+    The first column is left-aligned to 10 characters, the remaining four
+    to 20 characters each.
+    """
+    return '    {:<10}  {:<20}  {:<20}  {:<20}  {:<20}'.format(*args)
+
+
+def is_model_path(model_path):
+    """Check whether a path in the data directory may hold a model.
+
+    model_path (Path): The path to check.
+    RETURNS (bool): True for directories that are neither cache folders
+        nor hidden (dot-prefixed).
+    """
+    dirname = model_path.parts[-1]
+    if not model_path.is_dir():
+        return False
+    if dirname in ('cache', 'pycache', '__pycache__'):
+        return False
+    return not dirname.startswith('.')
+
+
+def is_compat(compat, name, version):
+    """Check whether `version` is listed for model `name` in `compat`.
+
+    RETURNS (bool): False if the model is unknown or the version is not
+        among its listed versions.
+    """
+    if name not in compat:
+        return False
+    return version in compat[name]
+
+
+def reformat_version(version):
+    """Rewrite a '-alpha' version tag in the short 'aN' form.
+
+    A trailing '-alpha' without a number becomes 'a0'; otherwise '-alpha'
+    is replaced by 'a' and any number after it is kept.
+    """
+    replacement = 'a0' if version.endswith('-alpha') else 'a'
+    return version.replace('-alpha', replacement)
--- a/spacy/compat.py
+++ b/spacy/compat.py
@ -5,6 +5,8 @@ import six
 import ftfy
 import sys
 import ujson
+import itertools
+import locale

 from thinc.neural.util import copy_array

@ -35,6 +37,7 @@ CudaStream = CudaStream
 cupy = cupy
 fix_text = ftfy.fix_text
 copy_array = copy_array
+izip = getattr(itertools, 'izip', zip)

 is_python2 = six.PY2
 is_python3 = six.PY3
@ -44,21 +47,31 @@ is_osx = sys.platform == 'darwin'


 if is_python2:
+    import imp
    bytes_ = str
    unicode_ = unicode
    basestring_ = basestring
    input_ = raw_input
-    json_dumps = lambda data: ujson.dumps(data, indent=2).decode('utf8')
+    json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False).decode('utf8')
    path2str = lambda path: str(path).decode('utf8')

 elif is_python3:
+    import importlib.util
    bytes_ = bytes
    unicode_ = str
    basestring_ = str
    input_ = input
-    json_dumps = lambda data: ujson.dumps(data, indent=2)
+    json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False)
    path2str = lambda path: str(path)

+
+def b_to_str(b_str):
+    """Convert a byte string to the native `str` type.
+
+    On Python 2 the value is returned unchanged; otherwise it is decoded
+    as UTF-8.
+    """
+    if not is_python2:
+        # important: if no encoding is set, string becomes "b'...'"
+        return str(b_str, encoding='utf8')
+    return b_str
+
+
 def getattr_(obj, name, *default):
    if is_python3 and isinstance(name, bytes):
        name = name.decode('utf8')
@ -92,3 +105,21 @@ def normalize_string_keys(old):
    return new


+def import_file(name, loc):
+    """Import a module from a file path.
+
+    name (unicode): Name to give the loaded module.
+    loc (unicode / Path): Location of the source file; converted with str().
+    RETURNS: The loaded module.
+    """
+    path = str(loc)
+    if not is_python2:
+        spec = importlib.util.spec_from_file_location(name, path)
+        loaded = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(loaded)
+        return loaded
+    return imp.load_source(name, path)
+
+
+def locale_escape(string, errors='replace'):
+    '''
+    Mangle characters that the preferred locale encoding cannot represent,
+    so the result can be printed on terminals with a limited encoding.
+
+    string (unicode): The text to escape.
+    errors (unicode): Codec error handler applied to unsupported
+        characters, e.g. 'replace' or 'ignore'.
+    RETURNS (unicode): The text restricted to the locale's character set.
+    '''
+    encoding = locale.getpreferredencoding()
+    # Decode with the same codec that was used for encoding. Decoding as
+    # UTF-8 breaks on non-UTF-8 locales such as latin-1 or cp1252.
+    string = string.encode(encoding, errors).decode(encoding)
+    return string
--- a/spacy/deprecated.py
+++ b/spacy/deprecated.py
@ -15,7 +15,7 @@ def depr_model_download(lang):
    lang (unicode): Language shortcut, 'en' or 'de'.
    """
    prints("The spacy.%s.download command is now deprecated. Please use "
-           "python -m spacy download [model name or shortcut] instead. For "
+           "spacy download [model name or shortcut] instead. For "
           "more info, see the documentation:" % lang,
           about.__docs_models__,
           "Downloading default '%s' model now..." % lang,
--- a/spacy/displacy/init.py
+++ b/spacy/displacy/init.py
@ -3,6 +3,7 @@ from __future__ import unicode_literals

 from .render import DependencyRenderer, EntityRenderer
 from ..tokens import Doc
+from ..compat import b_to_str
 from ..util import prints, is_in_jupyter


@ -65,7 +66,9 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,


 def app(environ, start_response):
-    start_response('200 OK', [('Content-type', 'text/html; charset=utf-8')])
+    # headers and status need to be bytes in Python 2, see #1227
+    headers = [(b_to_str(b'Content-type'), b_to_str(b'text/html; charset=utf-8'))]
+    start_response(b_to_str(b'200 OK'), headers)
    res = _html['parsed'].encode(encoding='utf-8')
    return [res]

--- a/spacy/glossary.py
+++ b/spacy/glossary.py
@ -60,7 +60,7 @@ GLOSSARY = {
    'JJR':          'adjective, comparative',
    'JJS':          'adjective, superlative',
    'LS':           'list item marker',
-    'MD':           'verb, modal auxillary',
+    'MD':           'verb, modal auxiliary',
    'NIL':          'missing tag',
    'NN':           'noun, singular or mass',
    'NNP':          'noun, proper singular',
@ -91,7 +91,7 @@ GLOSSARY = {
    'NFP':          'superfluous punctuation',
    'GW':           'additional word in multi-word expression',
    'XX':           'unknown',
-    'BES':          'auxillary "be"',
+    'BES':          'auxiliary "be"',
    'HVS':          'forms of "have"',


--- a/spacy/gold.pxd
+++ b/spacy/gold.pxd
@ -9,6 +9,7 @@ cdef struct GoldParseC:
    int* tags
    int* heads
    int* has_dep
+    int* sent_start
    attr_t* labels
    int** brackets
    Transition* ner
@ -29,6 +30,7 @@ cdef class GoldParse:
    cdef public list ner
    cdef public list ents
    cdef public dict brackets
+    cdef public object cats

    cdef readonly list cand_to_gold
    cdef readonly list gold_to_cand
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -7,6 +7,7 @@ import re
 import ujson
 import random
 import cytoolz
+import itertools

 from .syntax import nonproj
 from .util import ensure_path
@ -146,9 +147,13 @@ def minibatch(items, size=8):
    '''Iterate over batches of items. `size` may be an iterator,
    so that batch-size can vary on each step.
    '''
+    if isinstance(size, int):
+        # Repeat the requested fixed batch size, not a hard-coded 8.
+        size_ = itertools.repeat(size)
+    else:
+        size_ = size
    items = iter(items)
    while True:
-        batch_size = next(size) #if hasattr(size, '__next__') else size
+        batch_size = next(size_)
        batch = list(cytoolz.take(int(batch_size), items))
        if len(batch) == 0:
            break
@ -208,7 +213,7 @@ class GoldCorpus(object):
        train_tuples = self.train_tuples
        if projectivize:
            train_tuples = nonproj.preprocess_training_data(
-                               self.train_tuples)
+                               self.train_tuples, label_freq_cutoff=100)
        random.shuffle(train_tuples)
        gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
                                        max_length=max_length,
@ -381,7 +386,8 @@ cdef class GoldParse:
                   make_projective=make_projective)

    def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
-                 deps=None, entities=None, make_projective=False):
+                 deps=None, entities=None, make_projective=False,
+                 cats=None):
        """Create a GoldParse.

        doc (Doc): The document the annotations refer to.
@ -392,6 +398,15 @@ cdef class GoldParse:
        entities (iterable): A sequence of named entity annotations, either as
            BILUO tag strings, or as `(start_char, end_char, label)` tuples,
            representing the entity positions.
+        cats (dict): Labels for text classification. Each key in the dictionary
+            may be a string or an int, or a `(start_char, end_char, label)`
+            tuple, indicating that the label is applied to only part of the
+            document (usually a sentence). Unlike entity annotations, label
+            annotations can overlap, i.e. a single word can be covered by
+            multiple labelled spans. The TextCategorizer component expects
+            true examples of a label to have the value 1.0, and negative examples
+            of a label to have the value 0.0. Labels not in the dictionary are
+            treated as missing -- the gradient for those labels will be zero.
        RETURNS (GoldParse): The newly constructed object.
        """
        if words is None:
@ -399,11 +414,11 @@ cdef class GoldParse:
        if tags is None:
            tags = [None for _ in doc]
        if heads is None:
-            heads = [token.i for token in doc]
+            heads = [None for token in doc]
        if deps is None:
            deps = [None for _ in doc]
        if entities is None:
-            entities = ['-' for _ in doc]
+            entities = [None for _ in doc]
        elif len(entities) == 0:
            entities = ['O' for _ in doc]
        elif not isinstance(entities[0], basestring):
@ -419,8 +434,10 @@ cdef class GoldParse:
        self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
        self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
        self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
+        self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
        self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))

+        self.cats = {} if cats is None else dict(cats)
        self.words = [None] * len(doc)
        self.tags = [None] * len(doc)
        self.heads = [None] * len(doc)
@ -474,8 +491,12 @@ cdef class GoldParse:
        """
        return not nonproj.is_nonproj_tree(self.heads)

+    @property
+    def sent_starts(self):
+        """Values of the C-level `sent_start` array as a Python list.
+
+        RETURNS (list): One entry per index in `range(self.length)`.
+        NOTE(review): the meaning of the individual values is not visible
+        here -- confirm where `self.c.sent_start` is filled in.
+        """
+        return [self.c.sent_start[i] for i in range(self.length)]

-def biluo_tags_from_offsets(doc, entities):
+
+def biluo_tags_from_offsets(doc, entities, missing='O'):
    """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
    scheme (BILUO).

@ -527,7 +548,7 @@ def biluo_tags_from_offsets(doc, entities):
            if i in entity_chars:
                break
        else:
-            biluo[token.i] = 'O'
+            biluo[token.i] = missing
    return biluo


--- a/spacy/lang/bn/init.py
+++ b/spacy/lang/bn/init.py
@ -16,15 +16,13 @@ from ...util import update_exc
 class BengaliDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'bn'
-
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    tag_map = TAG_MAP
    stop_words = STOP_WORDS
    lemma_rules = LEMMA_RULES
-
-    prefixes = tuple(TOKENIZER_PREFIXES)
-    suffixes = tuple(TOKENIZER_SUFFIXES)
-    infixes = tuple(TOKENIZER_INFIXES)
+    prefixes = TOKENIZER_PREFIXES
+    suffixes = TOKENIZER_SUFFIXES
+    infixes = TOKENIZER_INFIXES


 class Bengali(Language):
--- a/spacy/lang/char_classes.py
+++ b/spacy/lang/char_classes.py
@ -27,13 +27,21 @@ ALPHA_UPPER = merge_char_classes(_upper + _uncased)

 _units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft '
          'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
-          'TB T G M K')
+          'TB T G M K %')
 _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
-_punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
-_quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «'
-_hyphens = '- – — -- ---'
+
+# These expressions contain various unicode variations, including characters
+# used in Chinese (see #1333, #1340, #1351) – unless there are cross-language
+# conflicts, spaCy's base tokenizer should handle all of those by default
+_punct = r'… …… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ？ ！ ， 、 ； ： ～ · ।'
+_quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 （ ） 〔 〕 【 】 《 》 〈 〉'
+_hyphens = '- – — -- --- —— ~'
+
+# Various symbols like dingbats, but also emoji
+# Details: https://www.compart.com/en/unicode/category/So
 _other_symbols = r'[\p{So}]'

+
 UNITS = merge_chars(_units)
 CURRENCY = merge_chars(_currency)
 QUOTES = merge_chars(_quotes)
--- a/spacy/lang/da/init.py
+++ b/spacy/lang/da/init.py
@ -19,11 +19,10 @@ class DanishDefaults(Language.Defaults):
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: 'da'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
-
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    #morph_rules = dict(MORPH_RULES)
-    tag_map = dict(TAG_MAP)
-    stop_words = set(STOP_WORDS)
+    # morph_rules = MORPH_RULES
+    tag_map = TAG_MAP
+    stop_words = STOP_WORDS


 class Danish(Language):
--- a/spacy/lang/da/examples.py
+++ b/spacy/lang/da/examples.py
@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.da.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple overvejer at købe et britisk startup for 1 milliard dollar",
+    "Selvkørende biler flytter forsikringsansvaret over på producenterne",
+    "San Francisco overvejer at forbyde leverandørrobotter på fortov",
+    "London er en stor by i Storbritannien"
+]
--- a/spacy/lang/de/init.py
+++ b/spacy/lang/de/init.py
@ -3,6 +3,7 @@ from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .norm_exceptions import NORM_EXCEPTIONS
+from .punctuation import TOKENIZER_INFIXES
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lemmatizer import LOOKUP
@ -11,7 +12,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@ -21,15 +21,12 @@ class GermanDefaults(Language.Defaults):
    lex_attr_getters[LANG] = lambda text: 'de'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
                                         NORM_EXCEPTIONS, BASE_NORMS)
-
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    tag_map = dict(TAG_MAP)
-    stop_words = set(STOP_WORDS)
-    syntax_iterators = dict(SYNTAX_ITERATORS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    infixes = TOKENIZER_INFIXES
+    tag_map = TAG_MAP
+    stop_words = STOP_WORDS
+    syntax_iterators = SYNTAX_ITERATORS
+    lemma_lookup = LOOKUP


 class German(Language):
--- a/spacy/lang/de/examples.py
+++ b/spacy/lang/de/examples.py
@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.de.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen",
+    "Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz",
+    "Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz",
+    "Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion",
+    "San Francisco erwägt Verbot von Lieferrobotern",
+    "Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller",
+    "Wo bist du?",
+    "Was ist die Hauptstadt von Deutschland?"
+]
--- a/spacy/lang/de/punctuation.py
+++ b/spacy/lang/de/punctuation.py
@ -0,0 +1,20 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+
+
+_quotes = QUOTES.replace("'", '')
+
+_infixes = (LIST_ELLIPSES + LIST_ICONS +
+            [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
+             r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
+             r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
+             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
+             r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes),
+             r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
+             r'(?<=[0-9])-(?=[0-9])'])
+
+
+TOKENIZER_INFIXES = _infixes
--- a/spacy/lang/de/tag_map.py
+++ b/spacy/lang/de/tag_map.py
@ -62,5 +62,5 @@ TAG_MAP = {
    "VVIZU":    {POS: VERB, "VerbForm": "inf"},
    "VVPP":     {POS: VERB, "Aspect": "perf", "VerbForm": "part"},
    "XY":       {POS: X},
-    "SP":       {POS: SPACE}
+    "_SP":      {POS: SPACE}
 }
--- a/spacy/lang/en/init.py
+++ b/spacy/lang/en/init.py
@ -7,7 +7,7 @@ from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .morph_rules import MORPH_RULES
-from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
+from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC, LOOKUP
 from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
@ -16,22 +16,24 @@ from ...language import Language
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

+def _return_en(_):
+    # Language-code getter assigned to lex_attr_getters[LANG]: ignores its
+    # argument and always returns 'en'.
+    # NOTE(review): presumably a named function rather than a lambda so the
+    # getter can be pickled -- confirm.
+    return 'en'

 class EnglishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: 'en'
+    lex_attr_getters[LANG] = _return_en
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
                                         BASE_NORMS, NORM_EXCEPTIONS)
-
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    tag_map = dict(TAG_MAP)
-    stop_words = set(STOP_WORDS)
-    morph_rules = dict(MORPH_RULES)
-    lemma_rules = dict(LEMMA_RULES)
-    lemma_index = dict(LEMMA_INDEX)
-    lemma_exc = dict(LEMMA_EXC)
-    syntax_iterators = dict(SYNTAX_ITERATORS)
+    tag_map = TAG_MAP
+    stop_words = STOP_WORDS
+    morph_rules = MORPH_RULES
+    lemma_rules = LEMMA_RULES
+    lemma_index = LEMMA_INDEX
+    lemma_exc = LEMMA_EXC
+    lemma_lookup = LOOKUP
+    syntax_iterators = SYNTAX_ITERATORS


 class English(Language):
--- a/spacy/lang/en/examples.py
+++ b/spacy/lang/en/examples.py
@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.en.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple is looking at buying U.K. startup for $1 billion",
+    "Autonomous cars shift insurance liability toward manufacturers",
+    "San Francisco considers banning sidewalk delivery robots",
+    "London is a big city in the United Kingdom.",
+    "Where are you?",
+    "Who is the president of France?",
+    "What is the capital of the United States?",
+    "When was Barack Obama born?"
+]
--- a/spacy/lang/en/morph_rules.py
+++ b/spacy/lang/en/morph_rules.py
@ -59,7 +59,8 @@ MORPH_RULES = {

    "VBP": {
        "are":          {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
-        "'re":          {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}
+        "'re":          {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
+        "am":           {LEMMA: "be", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind"},
    },

    "VBD": {
--- a/spacy/lang/en/syntax_iterators.py
+++ b/spacy/lang/en/syntax_iterators.py
@ -8,7 +8,7 @@ def noun_chunks(obj):
    """
    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
    """
-    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
+    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'dative', 'appos',
              'attr', 'ROOT']
    doc = obj.doc # Ensure works on both Doc and Span.
    np_deps = [doc.vocab.strings.add(label) for label in labels]
--- a/spacy/lang/en/tag_map.py
+++ b/spacy/lang/en/tag_map.py
@ -55,11 +55,11 @@ TAG_MAP = {
    "WP":       {POS: NOUN, "PronType": "int|rel"},
    "WP$":      {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
    "WRB":      {POS: ADV, "PronType": "int|rel"},
-    "SP":       {POS: SPACE},
    "ADD":      {POS: X},
    "NFP":      {POS: PUNCT},
    "GW":       {POS: X},
    "XX":       {POS: X},
    "BES":      {POS: VERB},
-    "HVS":      {POS: VERB}
+    "HVS":      {POS: VERB},
+    "_SP":       {POS: SPACE},
 }
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@ -232,7 +232,10 @@ for verb_data in [
    {ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2},
    {ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"},
    {ORTH: "was", LEMMA: "be", NORM: "was"},
-    {ORTH: "were", LEMMA: "be", NORM: "were"}]:
+    {ORTH: "were", LEMMA: "be", NORM: "were"},
+    {ORTH: "have", NORM: "have"},
+    {ORTH: "has", LEMMA: "have", NORM: "has"},
+    {ORTH: "dare", NORM: "dare"}]:
    verb_data_tc = dict(verb_data)
    verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
    for data in [verb_data, verb_data_tc]:
--- a/spacy/lang/es/init.py
+++ b/spacy/lang/es/init.py
@ -10,7 +10,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@ -19,15 +18,11 @@ class SpanishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'es'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
-
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    tag_map = dict(TAG_MAP)
-    stop_words = set(STOP_WORDS)
-    sytax_iterators = dict(SYNTAX_ITERATORS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    tag_map = TAG_MAP
+    stop_words = STOP_WORDS
+    syntax_iterators = SYNTAX_ITERATORS
+    lemma_lookup = LOOKUP


 class Spanish(Language):
--- a/spacy/lang/es/examples.py
+++ b/spacy/lang/es/examples.py
@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.es.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares",
+    "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes",
+    "San Francisco analiza prohibir los robots delivery",
+    "Londres es una gran ciudad del Reino Unido",
+    "El gato come pescado",
+    "Veo al hombre con el telescopio",
+    "La araña come moscas",
+    "El pingüino incuba en su nido"
+]
--- a/spacy/lang/es/tag_map.py
+++ b/spacy/lang/es/tag_map.py
@ -303,5 +303,5 @@ TAG_MAP = {
    "VERB__VerbForm=Ger": {"morph": "VerbForm=Ger", "pos": "VERB"},
    "VERB__VerbForm=Inf": {"morph": "VerbForm=Inf", "pos": "VERB"},
    "X___": {"morph": "_", "pos": "X"},
-    "SP": {"morph": "_", "pos": "SPACE"},
+    "_SP": {"morph": "_", "pos": "SPACE"},
 }
--- a/spacy/lang/fi/init.py
+++ b/spacy/lang/fi/init.py
@ -15,9 +15,8 @@ class FinnishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'fi'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
-
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
+    stop_words = STOP_WORDS


 class Finnish(Language):
--- a/spacy/lang/fr/init.py
+++ b/spacy/lang/fr/init.py
@ -4,32 +4,29 @@ from __future__ import unicode_literals
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
 from .lemmatizer import LOOKUP
 from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups


 class FrenchDefaults(Language.Defaults):
+    """Defaults for French: lexical attribute getters and tokenizer data."""
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    # French-specific getters are merged in before LANG and NORM are assigned,
+    # so the two assignments below win over any LANG/NORM entry in LEX_ATTRS.
+    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: 'fr'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
-
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
-    infixes = tuple(TOKENIZER_INFIXES)
-    suffixes = tuple(TOKENIZER_SUFFIXES)
+    # The module-level objects are assigned directly, without copying.
+    stop_words = STOP_WORDS
+    infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
    token_match = TOKEN_MATCH
-    syntax_iterators = dict(SYNTAX_ITERATORS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    syntax_iterators = SYNTAX_ITERATORS
+    # Replaces the removed create_lemmatizer override: the lookup table is
+    # now exposed as a plain class attribute.
+    lemma_lookup = LOOKUP


 class French(Language):
--- a/spacy/lang/fr/examples.py
+++ b/spacy/lang/fr/examples.py
@ -0,0 +1,26 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.fr.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+# French example sentences; see the usage snippet in the module docstring.
+sentences = [
+    "Apple cherche à acheter une startup anglaise pour 1 milliard de dollars",
+    "Les voitures autonomes voient leurs assurances décalées vers les constructeurs",
+    "San Francisco envisage d'interdire les robots coursiers",
+    "Londres est une grande ville du Royaume-Uni",
+    "L’Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d’Europe",
+    "Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon",
+    "La France ne devrait pas manquer d'électricité cet été, même en cas de canicule",
+    "Nouvelles attaques de Trump contre le maire de Londres",
+    "Où es-tu ?",
+    "Qui est le président de la France ?",
+    "Où est la capitale des Etats-Unis ?",
+    "Quand est né Barack Obama ?"
+]
--- a/spacy/lang/fr/lex_attrs.py
+++ b/spacy/lang/fr/lex_attrs.py
@ -0,0 +1,41 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...attrs import LIKE_NUM
+
+
+# Cardinal number words accepted by like_num.
+_num_words = set("""
+zero zéro un deux trois quatre cinq six sept huit neuf dix
+onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf
+vingt trente quarante cinquante soixante septante quatre-vingt huitante nonante
+cent mille mil million milliard billion quadrillion quintillion
+sextillion septillion octillion nonillion decillion
+""".split())
+
+# Ordinal number words.
+# NOTE(review): like_num below does not consult this set -- confirm whether
+# ordinals are meant to count as number-like.
+_ordinal_words = set("""
+premier deuxième second troisième quatrième cinquième sixième septième huitième neuvième dixième
+onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neuvième
+vingtième trentième quarantième cinquantième soixantième septantième quatre-vingtième huitantième nonantième
+centième millième millionnième milliardième billionnième quadrillionnième quintillionnième
+sextillionnième septillionnième octillionnième nonillionnième decillionnième
+""".split())
+
+
+def like_num(text):
+    """Return True if `text` looks like a number.
+
+    A token is number-like when, after dropping ',' and '.', it consists only
+    of digits, is a fraction of two digit runs separated by a single '/', or
+    is one of the cardinal words in `_num_words` (matched case-sensitively).
+    """
+    # Might require more work?
+    # See this discussion: https://github.com/explosion/spaCy/pull/1161
+    text = text.replace(',', '').replace('.', '')
+    if text.isdigit():
+        return True
+    if text.count('/') == 1:
+        num, denom = text.split('/')
+        if num.isdigit() and denom.isdigit():
+            return True
+    if text in _num_words:
+        return True
+    return False
+
+
+# French-specific lexical attribute getters: maps the LIKE_NUM attribute ID
+# to this module's like_num function.
+LEX_ATTRS = {
+    LIKE_NUM: like_num
+}
--- a/spacy/lang/he/init.py
+++ b/spacy/lang/he/init.py
@ -12,9 +12,8 @@ from ...util import update_exc
 class HebrewDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'he'
-
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
+    stop_words = STOP_WORDS


 class Hebrew(Language):
--- a/spacy/lang/he/examples.py
+++ b/spacy/lang/he/examples.py
@ -0,0 +1,28 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.he.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    'סין מקימה קרן של 440 מיליון דולר להשקעה בהייטק בישראל',
+    'רה"מ הודיע כי יחרים טקס בחסותו',
+    'הכנסת צפויה לאשר איכון אוטומטי של שיחות למוקד 100',
+    'תוכנית לאומית תהפוך את ישראל למעצמה דיגיטלית',
+    'סע לשלום, המפתחות בפנים.',
+    'מלצר, פעמיים טורקי!',
+    'ואהבת לרעך כמוך.',
+    'היום נעשה משהו בלתי נשכח.',
+    'איפה הילד?',
+    'מיהו נשיא צרפת?',
+    'מהי בירת ארצות הברית?',
+    "איך קוראים בעברית לצ'ופצ'יק של הקומקום?",
+    'מה הייתה הדקה?',
+    'מי אומר שלום ראשון, זה שעולה או זה שיורד?'
+]
--- a/spacy/lang/hi/init.py
+++ b/spacy/lang/hi/init.py
@ -0,0 +1,24 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
+
+from ..norm_exceptions import BASE_NORMS
+from ...language import Language
+from ...attrs import LANG
+
+
+class HindiDefaults(Language.Defaults):
+    """Defaults for Hindi: lexical attribute getters and stop words."""
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    # Hindi-specific getters are merged in first; LANG is then set explicitly.
+    lex_attr_getters.update(LEX_ATTRS)
+    lex_attr_getters[LANG] = lambda text: 'hi'
+    stop_words = STOP_WORDS
+
+
+class Hindi(Language):
+    """Language subclass for Hindi (code 'hi') configured by HindiDefaults."""
+    lang = 'hi'
+    Defaults = HindiDefaults
+
+
+__all__ = ['Hindi']
--- a/spacy/lang/hi/lex_attrs.py
+++ b/spacy/lang/hi/lex_attrs.py
@ -0,0 +1,38 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..norm_exceptions import BASE_NORMS
+from ...attrs import NORM
+from ...util import add_lookups
+
+
+_stem_suffixes = [
+    ["ो","े","ू","ु","ी","ि","ा"],
+    ["कर","ाओ","िए","ाई","ाए","ने","नी","ना","ते","ीं","ती","ता","ाँ","ां","ों","ें"],
+    ["ाकर","ाइए","ाईं","ाया","ेगी","ेगा","ोगी","ोगे","ाने","ाना","ाते","ाती","ाता","तीं","ाओं","ाएं","ुओं","ुएं","ुआं"],
+    ["ाएगी","ाएगा","ाओगी","ाओगे","एंगी","ेंगी","एंगे","ेंगे","ूंगी","ूंगा","ातीं","नाओं","नाएं","ताओं","ताएं","ियाँ","ियों","ियां"],
+    ["ाएंगी","ाएंगे","ाऊंगी","ाऊंगा","ाइयाँ","ाइयों","ाइयां"]
+]
+
+
+def norm(string):
+    """Return the norm for `string`: a base exception or a stemmed form.
+
+    If `string` is a key of BASE_NORMS, the mapped value is returned.
+    Otherwise the suffix groups in `_stem_suffixes` are tried from the longest
+    suffixes to the shortest, and the first matching suffix is stripped. A
+    group is only tried when the word is strictly longer than its suffixes,
+    so a match never empties the word. With no match, `string` is returned
+    unchanged.
+    """
+    # normalise base exceptions, e.g. punctuation or currency symbols
+    if string in BASE_NORMS:
+        return BASE_NORMS[string]
+    # set stem word as norm, if available, adapted from:
+    # http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf
+    # http://research.variancia.com/hindi_stemmer/
+    # https://github.com/taranjeet/hindi-tokenizer/blob/master/HindiTokenizer.py#L142
+    for suffix_group in reversed(_stem_suffixes):
+        # The group's suffix length is taken from its first entry; this
+        # assumes every suffix in a group has the same length.
+        length = len(suffix_group[0])
+        # Word too short for this group: skip it, but keep trying the
+        # shorter groups. Leaving the loop here would disable stemming for
+        # every word no longer than the longest suffixes.
+        if len(string) <= length:
+            continue
+        if string.endswith(tuple(suffix_group)):
+            return string[:-length]
+    return string
+
+
+# Hindi-specific lexical attribute getters: maps the NORM attribute ID to
+# this module's norm function.
+LEX_ATTRS = {
+    NORM: norm
+}
--- a/spacy/lang/hi/stop_words.py
+++ b/spacy/lang/hi/stop_words.py
@ -0,0 +1,177 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+# Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt
+
+STOP_WORDS = set("""
+अत
+अपना
+अपनी
+अपने
+अभी
+अंदर
+आदि
+आप
+इत्यादि
+इन
+इनका
+इन्हीं
+इन्हें
+इन्हों
+इस
+इसका
+इसकी
+इसके
+इसमें
+इसी
+इसे
+उन
+उनका
+उनकी
+उनके
+उनको
+उन्हीं
+उन्हें
+उन्हों
+उस
+उसके
+उसी
+उसे
+एक
+एवं
+एस
+ऐसे
+और
+कई
+कर
+करता
+करते
+करना
+करने
+करें
+कहते
+कहा
+का
+काफ़ी
+कि
+कितना
+किन्हें
+किन्हों
+किया
+किर
+किस
+किसी
+किसे
+की
+कुछ
+कुल
+के
+को
+कोई
+कौन
+कौनसा
+गया
+घर
+जब
+जहाँ
+जा
+जितना
+जिन
+जिन्हें
+जिन्हों
+जिस
+जिसे
+जीधर
+जैसा
+जैसे
+जो
+तक
+तब
+तरह
+तिन
+तिन्हें
+तिन्हों
+तिस
+तिसे
+तो
+था
+थी
+थे
+दबारा
+दिया
+दुसरा
+दूसरे
+दो
+द्वारा
+न
+नके
+नहीं
+ना
+निहायत
+नीचे
+ने
+पर
+पहले
+पूरा
+पे
+फिर
+बनी
+बही
+बहुत
+बाद
+बाला
+बिलकुल
+भी
+भीतर
+मगर
+मानो
+मे
+में
+यदि
+यह
+यहाँ
+यही
+या
+यिह
+ये
+रखें
+रहा
+रहे
+ऱ्वासा
+लिए
+लिये
+लेकिन
+व
+वग़ैरह
+वर्ग
+वह
+वहाँ
+वहीं
+वाले
+वुह
+वे
+सकता
+सकते
+सबसे
+सभी
+साथ
+साबुत
+साभ
+सारा
+से
+सो
+संग
+ही
+हुआ
+हुई
+हुए
+है
+हैं
+हो
+होता
+होती
+होते
+होना
+होने
+""".split())
--- a/spacy/lang/hu/init.py
+++ b/spacy/lang/hu/init.py
@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@ -18,17 +17,13 @@ class HungarianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'hu'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
-
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
-    prefixes = tuple(TOKENIZER_PREFIXES)
-    suffixes = tuple(TOKENIZER_SUFFIXES)
-    infixes = tuple(TOKENIZER_INFIXES)
+    stop_words = STOP_WORDS
+    prefixes = TOKENIZER_PREFIXES
+    suffixes = TOKENIZER_SUFFIXES
+    infixes = TOKENIZER_INFIXES
    token_match = TOKEN_MATCH
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_lookup = LOOKUP


 class Hungarian(Language):
--- a/spacy/lang/hu/examples.py
+++ b/spacy/lang/hu/examples.py
@ -0,0 +1,17 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.hu.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Az Apple egy brit startup vásárlását tervezi 1 milliárd dollár értékben.",
+    "San Francisco vezetése mérlegeli a járdát használó szállító robotok betiltását.",
+    "London az Egyesült Királyság egy nagy városa."
+]
--- a/spacy/lang/id/init.py
+++ b/spacy/lang/id/init.py
@ -0,0 +1,36 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .stop_words import STOP_WORDS
+from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .norm_exceptions import NORM_EXCEPTIONS
+from .lemmatizer import LOOKUP
+from .lex_attrs import LEX_ATTRS
+from .syntax_iterators import SYNTAX_ITERATORS
+
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...language import Language
+from ...attrs import LANG
+from ...util import update_exc
+
+
+class IndonesianDefaults(Language.Defaults):
+    """Defaults for Indonesian: lexical attributes and tokenizer data."""
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'id'
+    # NOTE(review): NORM_EXCEPTIONS is imported by this module but not used
+    # in this class, so no Indonesian-specific NORM getter is set here --
+    # confirm whether that is intended.
+    lex_attr_getters.update(LEX_ATTRS)
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = STOP_WORDS
+    prefixes = TOKENIZER_PREFIXES
+    suffixes = TOKENIZER_SUFFIXES
+    infixes = TOKENIZER_INFIXES
+    syntax_iterators = SYNTAX_ITERATORS
+    lemma_lookup = LOOKUP
+
+
+class Indonesian(Language):
+    """Language subclass for Indonesian (code 'id') using IndonesianDefaults."""
+    lang = 'id'
+    Defaults = IndonesianDefaults
+
+
+__all__ = ['Indonesian']
--- a/spacy/lang/id/_tokenizer_exceptions_list.py
+++ b/spacy/lang/id/_tokenizer_exceptions_list.py
--- a/spacy/lang/id/examples.py
+++ b/spacy/lang/id/examples.py
@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.id.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+# Indonesian example sentences, one list entry per sentence.
+sentences = [
+    "Al Qaidah mengklaim bom mobil yang menewaskan 60 Orang di Mali",
+    "Abu Sayyaf mengeksekusi sandera warga Filipina",
+    "Penyaluran pupuk berasal dari lima lokasi yakni Bontang, Kalimantan Timur, Surabaya, Banyuwangi, Semarang, dan Makassar.",
+    "PT Pupuk Kaltim telah menyalurkan 274.707 ton pupuk bersubsidi ke wilayah penyaluran di 14 provinsi.",
+    "Jakarta adalah kota besar yang nyaris tidak pernah tidur.",
+    "Kamu ada di mana semalam?",
+    "Siapa yang membeli makanan ringan tersebut?",
+    "Siapa presiden pertama Republik Indonesia?"
+]
--- a/spacy/lang/id/lemmatizer.py
+++ b/spacy/lang/id/lemmatizer.py
--- a/spacy/lang/id/lex_attrs.py
+++ b/spacy/lang/id/lex_attrs.py
@ -0,0 +1,41 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...attrs import LIKE_NUM
+
+
+# Number words accepted by like_num: English and Indonesian cardinals plus
+# magnitude words and abbreviations. Stored as a set because it is only used
+# for membership tests.
+_num_words = {
+    'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
+    'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen',
+    'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty',
+    'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety',
+    'hundred', 'thousand', 'million', 'billion', 'trillion', 'quadrillion',
+    'gajillion', 'bazillion',
+    'nol', 'satu', 'dua', 'tiga', 'empat', 'lima', 'enam', 'tujuh',
+    'delapan', 'sembilan', 'sepuluh', 'sebelas', 'duabelas', 'tigabelas',
+    'empatbelas', 'limabelas', 'enambelas', 'tujuhbelas', 'delapanbelas',
+    'sembilanbelas', 'duapuluh', 'seratus', 'seribu', 'sejuta',
+    'ribu', 'rb', 'juta', 'jt', 'miliar', 'biliun', 'triliun',
+    'kuadriliun', 'kuintiliun', 'sekstiliun', 'septiliun', 'oktiliun',
+    'noniliun', 'desiliun',
+}
+
+
+def like_num(text):
+    """Return True if `text` looks like a number.
+
+    After dropping ',' and '.', a token is number-like when it is all
+    digits, is a fraction of two digit runs separated by a single '/', is a
+    word in `_num_words`, or contains exactly one '-' whose right-hand part
+    is all digits or a word in `_num_words` (e.g. 'ke-2').
+    """
+    text = text.replace(',', '').replace('.', '')
+    if text.isdigit():
+        return True
+    if text.count('/') == 1:
+        num, denom = text.split('/')
+        if num.isdigit() and denom.isdigit():
+            return True
+    if text in _num_words:
+        return True
+    if text.count('-') == 1:
+        # Only the part after the hyphen decides; the prefix is ignored.
+        _, num = text.split('-')
+        if num.isdigit() or num in _num_words:
+            return True
+    return False
+
+
+# Indonesian-specific lexical attribute getters: maps the LIKE_NUM attribute
+# ID to this module's like_num function.
+LEX_ATTRS = {
+    LIKE_NUM: like_num
+}
--- a/spacy/lang/id/norm_exceptions.py
+++ b/spacy/lang/id/norm_exceptions.py
@ -0,0 +1,17 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+# Currency codes and abbreviations that all normalise to the "$" symbol.
+_exc = {
+    "Rp": "$",
+    "IDR": "$",
+    "RMB": "$",
+    "USD": "$",
+    "AUD": "$",
+    "GBP": "$",
+}
+
+NORM_EXCEPTIONS = {}
+
+# Register each entry under its original spelling and its title-cased form.
+for currency, symbol in _exc.items():
+    for spelling in (currency, currency.title()):
+        NORM_EXCEPTIONS[spelling] = symbol
--- a/spacy/lang/id/punctuation.py
+++ b/spacy/lang/id/punctuation.py
@ -0,0 +1,53 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from ..char_classes import merge_chars, split_chars, _currency, _units
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
+from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER, HYPHENS
+
+# Extend the shared unit and currency character classes with extra units,
+# Indonesian magnitude words and additional currency codes.
+# NOTE(review): unlike the `_currency` extension below, the appended string
+# does not start with a space, so its first entry ('s') is joined directly
+# onto whatever the imported `_units` ends with -- confirm that is intended.
+_units = (_units + 's bit Gbps Mbps mbps Kbps kbps ƒ ppi px '
+          'Hz kHz MHz GHz mAh '
+          'ratus rb ribu ribuan '
+          'juta jt jutaan mill?iar million bil[l]?iun bilyun billion '
+          )
+_currency = (_currency + r' USD Rp IDR RMB SGD S\$')
+_months = ('Januari Februari Maret April Mei Juni Juli Agustus September '
+           'Oktober November Desember January February March May June '
+           'July August October December Jan Feb Mar Jun Jul Aug Sept '
+           'Oct Okt Nov Des ')
+
+
+UNITS = merge_chars(_units)
+CURRENCY = merge_chars(_currency)
+HTML_PREFIX = r'<(b|strong|i|em|p|span|div|br)\s?/>|<a([^>]+)>'
+HTML_SUFFIX = r'</(b|strong|i|em|p|span|div|a)>'
+MONTHS = merge_chars(_months)
+LIST_CURRENCY = split_chars(_currency)
+
+# Drop the hashtag prefix from a private copy. TOKENIZER_PREFIXES is the list
+# object imported from the shared base punctuation module, so removing the
+# entry in place would also change it for every other importer of that list.
+_base_prefixes = list(TOKENIZER_PREFIXES)
+_base_prefixes.remove('#')  # hashtag
+_prefixes = _base_prefixes + LIST_CURRENCY + [HTML_PREFIX] + ['/', '—']
+
+_suffixes = TOKENIZER_SUFFIXES + [r'\-[Nn]ya', '-[KkMm]u', '[—-]'] + [
+        r'(?<={c})(?:[0-9]+)'.format(c=CURRENCY),
+        r'(?<=[0-9])(?:{u})'.format(u=UNITS),
+        r'(?<=[0-9])%',
+        r'(?<=[0-9{a}]{h})(?:[\.,:-])'.format(a=ALPHA, h=HTML_SUFFIX),
+        r'(?<=[0-9{a}])(?:{h})'.format(a=ALPHA, h=HTML_SUFFIX),
+    ]
+
+_infixes = TOKENIZER_INFIXES + [
+    r'(?<=[0-9])[\\/](?=[0-9%-])',
+    r'(?<=[0-9])%(?=[{a}0-9/])'.format(a=ALPHA),
+    r'(?<={u})[\/-](?=[0-9])'.format(u=UNITS),
+    r'(?<={m})[\/-](?=[0-9])'.format(m=MONTHS),
+    r'(?<=[0-9\)][\.,])"(?=[0-9])',
+    r'(?<=[{a}\)][\.,\'])["—](?=[{a}])'.format(a=ALPHA),
+    r'(?<=[{a}])-(?=[0-9])'.format(a=ALPHA),
+    r'(?<=[0-9])-(?=[{a}])'.format(a=ALPHA),
+    r'(?<=[{a}])[\/-](?={c}{a})'.format(a=ALPHA, c=CURRENCY),
+]
+
+# Rebind the public names to the Indonesian-specific lists; this only changes
+# the names in this module, not the lists in the base module.
+TOKENIZER_PREFIXES = _prefixes
+TOKENIZER_SUFFIXES = _suffixes
+TOKENIZER_INFIXES = _infixes
--- a/spacy/lang/id/stop_words.py
+++ b/spacy/lang/id/stop_words.py
@ -0,0 +1,763 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+STOP_WORDS = set("""
+ada
+adalah
+adanya
+adapun
+agak
+agaknya
+agar
+akan
+akankah
+akhir
+akhiri
+akhirnya
+aku
+akulah
+amat
+amatlah
+anda
+andalah
+antar
+antara
+antaranya
+apa
+apaan
+apabila
+apakah
+apalagi
+apatah
+artinya
+asal
+asalkan
+atas
+atau
+ataukah
+ataupun
+awal
+awalnya
+bagai
+bagaikan
+bagaimana
+bagaimanakah
+bagaimanapun
+bagi
+bagian
+bahkan
+bahwa
+bahwasanya
+baik
+bakal
+bakalan
+balik
+banyak
+bapak
+baru
+bawah
+beberapa
+begini
+beginian
+beginikah
+beginilah
+begitu
+begitukah
+begitulah
+begitupun
+bekerja
+belakang
+belakangan
+belum
+belumlah
+benar
+benarkah
+benarlah
+berada
+berakhir
+berakhirlah
+berakhirnya
+berapa
+berapakah
+berapalah
+berapapun
+berarti
+berawal
+berbagai
+berdatangan
+beri
+berikan
+berikut
+berikutnya
+berjumlah
+berkali-kali
+berkata
+berkehendak
+berkeinginan
+berkenaan
+berlainan
+berlalu
+berlangsung
+berlebihan
+bermacam
+bermacam-macam
+bermaksud
+bermula
+bersama
+bersama-sama
+bersiap
+bersiap-siap
+bertanya
+bertanya-tanya
+berturut
+berturut-turut
+bertutur
+berujar
+berupa
+besar
+betul
+betulkah
+biasa
+biasanya
+bila
+bilakah
+bisa
+bisakah
+boleh
+bolehkah
+bolehlah
+buat
+bukan
+bukankah
+bukanlah
+bukannya
+bulan
+bung
+cara
+caranya
+cukup
+cukupkah
+cukuplah
+cuma
+dahulu
+dalam
+dan
+dapat
+dari
+daripada
+datang
+dekat
+demi
+demikian
+demikianlah
+dengan
+depan
+di
+dia
+diakhiri
+diakhirinya
+dialah
+diantara
+diantaranya
+diberi
+diberikan
+diberikannya
+dibuat
+dibuatnya
+didapat
+didatangkan
+digunakan
+diibaratkan
+diibaratkannya
+diingat
+diingatkan
+diinginkan
+dijawab
+dijelaskan
+dijelaskannya
+dikarenakan
+dikatakan
+dikatakannya
+dikerjakan
+diketahui
+diketahuinya
+dikira
+dilakukan
+dilalui
+dilihat
+dimaksud
+dimaksudkan
+dimaksudkannya
+dimaksudnya
+diminta
+dimintai
+dimisalkan
+dimulai
+dimulailah
+dimulainya
+dimungkinkan
+dini
+dipastikan
+diperbuat
+diperbuatnya
+dipergunakan
+diperkirakan
+diperlihatkan
+diperlukan
+diperlukannya
+dipersoalkan
+dipertanyakan
+dipunyai
+diri
+dirinya
+disampaikan
+disebut
+disebutkan
+disebutkannya
+disini
+disinilah
+ditambahkan
+ditandaskan
+ditanya
+ditanyai
+ditanyakan
+ditegaskan
+ditujukan
+ditunjuk
+ditunjuki
+ditunjukkan
+ditunjukkannya
+ditunjuknya
+dituturkan
+dituturkannya
+diucapkan
+diucapkannya
+diungkapkan
+dong
+dua
+dulu
+empat
+enggak
+enggaknya
+entah
+entahlah
+guna
+gunakan
+hal
+hampir
+hanya
+hanyalah
+hari
+harus
+haruslah
+harusnya
+hendak
+hendaklah
+hendaknya
+hingga
+ia
+ialah
+ibarat
+ibaratkan
+ibaratnya
+ibu
+ikut
+ingat
+ingat-ingat
+ingin
+inginkah
+inginkan
+ini
+inikah
+inilah
+itu
+itukah
+itulah
+jadi
+jadilah
+jadinya
+jangan
+jangankan
+janganlah
+jauh
+jawab
+jawaban
+jawabnya
+jelas
+jelaskan
+jelaslah
+jelasnya
+jika
+jikalau
+juga
+jumlah
+jumlahnya
+justru
+kala
+kalau
+kalaulah
+kalaupun
+kalian
+kami
+kamilah
+kamu
+kamulah
+kan
+kapan
+kapankah
+kapanpun
+karena
+karenanya
+kasus
+kata
+katakan
+katakanlah
+katanya
+ke
+keadaan
+kebetulan
+kecil
+kedua
+keduanya
+keinginan
+kelamaan
+kelihatan
+kelihatannya
+kelima
+keluar
+kembali
+kemudian
+kemungkinan
+kemungkinannya
+kenapa
+kepada
+kepadanya
+kesampaian
+keseluruhan
+keseluruhannya
+keterlaluan
+ketika
+khususnya
+kini
+kinilah
+kira
+kira-kira
+kiranya
+kita
+kitalah
+kok
+kurang
+lagi
+lagian
+lah
+lain
+lainnya
+lalu
+lama
+lamanya
+lanjut
+lanjutnya
+lebih
+lewat
+lima
+luar
+macam
+maka
+makanya
+makin
+malah
+malahan
+mampu
+mampukah
+mana
+manakala
+manalagi
+masa
+masalah
+masalahnya
+masih
+masihkah
+masing
+masing-masing
+mau
+maupun
+melainkan
+melakukan
+melalui
+melihat
+melihatnya
+memang
+memastikan
+memberi
+memberikan
+membuat
+memerlukan
+memihak
+meminta
+memintakan
+memisalkan
+memperbuat
+mempergunakan
+memperkirakan
+memperlihatkan
+mempersiapkan
+mempersoalkan
+mempertanyakan
+mempunyai
+memulai
+memungkinkan
+menaiki
+menambahkan
+menandaskan
+menanti
+menanti-nanti
+menantikan
+menanya
+menanyai
+menanyakan
+mendapat
+mendapatkan
+mendatang
+mendatangi
+mendatangkan
+menegaskan
+mengakhiri
+mengapa
+mengatakan
+mengatakannya
+mengenai
+mengerjakan
+mengetahui
+menggunakan
+menghendaki
+mengibaratkan
+mengibaratkannya
+mengingat
+mengingatkan
+menginginkan
+mengira
+mengucapkan
+mengucapkannya
+mengungkapkan
+menjadi
+menjawab
+menjelaskan
+menuju
+menunjuk
+menunjuki
+menunjukkan
+menunjuknya
+menurut
+menuturkan
+menyampaikan
+menyangkut
+menyatakan
+menyebutkan
+menyeluruh
+menyiapkan
+merasa
+mereka
+merekalah
+merupakan
+meski
+meskipun
+meyakini
+meyakinkan
+minta
+mirip
+misal
+misalkan
+misalnya
+mula
+mulai
+mulailah
+mulanya
+mungkin
+mungkinkah
+nah
+naik
+namun
+nanti
+nantinya
+nyaris
+nyatanya
+oleh
+olehnya
+pada
+padahal
+padanya
+pak
+paling
+panjang
+pantas
+para
+pasti
+pastilah
+penting
+pentingnya
+per
+percuma
+perlu
+perlukah
+perlunya
+pernah
+persoalan
+pertama
+pertama-tama
+pertanyaan
+pertanyakan
+pihak
+pihaknya
+pukul
+pula
+pun
+punya
+rasa
+rasanya
+rata
+rupanya
+saat
+saatnya
+saja
+sajalah
+saling
+sama
+sama-sama
+sambil
+sampai
+sampai-sampai
+sampaikan
+sana
+sangat
+sangatlah
+satu
+saya
+sayalah
+se
+sebab
+sebabnya
+sebagai
+sebagaimana
+sebagainya
+sebagian
+sebaik
+sebaik-baiknya
+sebaiknya
+sebaliknya
+sebanyak
+sebegini
+sebegitu
+sebelum
+sebelumnya
+sebenarnya
+seberapa
+sebesar
+sebetulnya
+sebisanya
+sebuah
+sebut
+sebutlah
+sebutnya
+secara
+secukupnya
+sedang
+sedangkan
+sedemikian
+sedikit
+sedikitnya
+seenaknya
+segala
+segalanya
+segera
+seharusnya
+sehingga
+seingat
+sejak
+sejauh
+sejenak
+sejumlah
+sekadar
+sekadarnya
+sekali
+sekali-kali
+sekalian
+sekaligus
+sekalipun
+sekarang
+sekarang
+sekecil
+seketika
+sekiranya
+sekitar
+sekitarnya
+sekurang-kurangnya
+sekurangnya
+sela
+selain
+selaku
+selalu
+selama
+selama-lamanya
+selamanya
+selanjutnya
+seluruh
+seluruhnya
+semacam
+semakin
+semampu
+semampunya
+semasa
+semasih
+semata
+semata-mata
+semaunya
+sementara
+semisal
+semisalnya
+sempat
+semua
+semuanya
+semula
+sendiri
+sendirian
+sendirinya
+seolah
+seolah-olah
+seorang
+sepanjang
+sepantasnya
+sepantasnyalah
+seperlunya
+seperti
+sepertinya
+sepihak
+sering
+seringnya
+serta
+serupa
+sesaat
+sesama
+sesampai
+sesegera
+sesekali
+seseorang
+sesuatu
+sesuatunya
+sesudah
+sesudahnya
+setelah
+setempat
+setengah
+seterusnya
+setiap
+setiba
+setibanya
+setidak-tidaknya
+setidaknya
+setinggi
+seusai
+sewaktu
+siap
+siapa
+siapakah
+siapapun
+sini
+sinilah
+soal
+soalnya
+suatu
+sudah
+sudahkah
+sudahlah
+supaya
+tadi
+tadinya
+tahu
+tahun
+tak
+tambah
+tambahnya
+tampak
+tampaknya
+tandas
+tandasnya
+tanpa
+tanya
+tanyakan
+tanyanya
+tapi
+tegas
+tegasnya
+telah
+tempat
+tengah
+tentang
+tentu
+tentulah
+tentunya
+tepat
+terakhir
+terasa
+terbanyak
+terdahulu
+terdapat
+terdiri
+terhadap
+terhadapnya
+teringat
+teringat-ingat
+terjadi
+terjadilah
+terjadinya
+terkira
+terlalu
+terlebih
+terlihat
+termasuk
+ternyata
+tersampaikan
+tersebut
+tersebutlah
+tertentu
+tertuju
+terus
+terutama
+tetap
+tetapi
+tiap
+tiba
+tiba-tiba
+tidak
+tidakkah
+tidaklah
+tiga
+tinggi
+toh
+tunjuk
+turut
+tutur
+tuturnya
+ucap
+ucapnya
+ujar
+ujarnya
+umum
+umumnya
+ungkap
+ungkapnya
+untuk
+usah
+usai
+waduh
+wah
+wahai
+waktu
+waktunya
+walau
+walaupun
+wong
+yaitu
+yakin
+yakni
+yang
+""".split())
--- a/spacy/lang/id/syntax_iterators.py
+++ b/spacy/lang/id/syntax_iterators.py
@ -0,0 +1,42 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import NOUN, PROPN, PRON
+
+
+def noun_chunks(obj):
+    """
+    Yield base noun phrases from a dependency parse as
+    (start, end, label) triples. Accepts either a Doc or a Span.
+    """
+    labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
+    doc = obj.doc  # Both Doc and Span expose the parent document here.
+    strings = doc.vocab.strings
+    np_deps = [strings[label] for label in labels]
+    conj = strings.add('conj')
+    np_label = strings.add('NP')
+    seen = set()
+    for word in obj:
+        if word.pos not in (NOUN, PROPN, PRON):
+            continue
+        # Tokens already covered by an emitted chunk never start a new one.
+        if word.i in seen:
+            continue
+        if word.dep in np_deps:
+            heads_np = True
+        elif word.dep == conj:
+            # Walk leftwards up the coordination chain to its first conjunct.
+            head = word.head
+            while head.dep == conj and head.head.i < head.i:
+                head = head.head
+            # Coordinated with an NP head, so this word is an NP as well.
+            heads_np = head.dep in np_deps
+        else:
+            heads_np = False
+        if not heads_np:
+            continue
+        # Skip candidates whose subtree overlaps an earlier chunk.
+        if any(w.i in seen for w in word.subtree):
+            continue
+        start = word.left_edge.i
+        end = word.right_edge.i + 1
+        seen.update(range(start, end))
+        yield start, end, np_label
+
+
+# Syntax iterators provided by this module, keyed by name.
+SYNTAX_ITERATORS = {
+    'noun_chunks': noun_chunks
+}
--- a/spacy/lang/id/tokenizer_exceptions.py
+++ b/spacy/lang/id/tokenizer_exceptions.py
@ -0,0 +1,50 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import regex as re
+
+from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS
+from ..tokenizer_exceptions import URL_PATTERN
+from ...symbols import ORTH
+
+
+_exc = {}
+
+# Register every base exception in its original, title-cased, upper-cased and
+# lower-cased spelling. str.title() and str.upper() already handle each
+# hyphen-separated part, so hyphenated entries need no separate treatment.
+for orth in ID_BASE_EXCEPTIONS:
+    for variant in (orth, orth.title(), orth.upper(), orth.lower()):
+        _exc[variant] = [{ORTH: variant}]
+
+
+# Abbreviations registered verbatim as single-token exceptions.
+for orth in [
+    "'d", "a.m.", "Adm.", "Bros.", "co.", "Co.", "Corp.", "D.C.", "Dr.", "e.g.",
+    "E.g.", "E.G.", "Gen.", "Gov.", "i.e.", "I.e.", "I.E.", "Inc.", "Jr.",
+    "Ltd.", "Md.", "Messrs.", "Mo.", "Mont.", "Mr.", "Mrs.", "Ms.", "p.m.",
+    "Ph.D.", "Rep.", "Rev.", "Sen.", "St.", "vs.",
+    "B.A.", "B.Ch.E.", "B.Sc.", "Dr.", "Dra.", "Drs.", "Hj.", "Ka.", "Kp.",
+    "M.Ag.", "M.Hum.", "M.Kes.", "M.Kom.", "M.M.", "M.P.", "M.Pd.", "M.Sc.",
+    "M.Si.", "M.Sn.", "M.T.", "M.Th.", "No.", "Pjs.", "Plt.", "R.A.", "S.Ag.",
+    "S.E.", "S.H.", "S.Hut.", "S.K.M.", "S.Kedg.", "S.Kedh.", "S.Kom.",
+    "S.Pd.", "S.Pol.", "S.Psi.", "S.S.", "S.Sos.", "S.T.", "S.Tekp.", "S.Th.",
+    "a.l.", "a.n.", "a.s.", "b.d.", "d.a.", "d.l.", "d/h", "dkk.", "dll.",
+    "dr.", "drh.", "ds.", "dsb.", "dst.", "faks.", "fax.", "hlm.", "i/o",
+    "n.b.", "p.p.", "pjs.", "s.d.", "tel.", "u.p.",
+    ]:
+    _exc[orth] = [{ORTH: orth}]
+
+TOKENIZER_EXCEPTIONS = dict(_exc)
+
--- a/spacy/lang/it/init.py
+++ b/spacy/lang/it/init.py
@ -7,7 +7,6 @@ from .lemmatizer import LOOKUP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@ -16,13 +15,9 @@ class ItalianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'it'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
-
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    stop_words = STOP_WORDS
+    lemma_lookup = LOOKUP


 class Italian(Language):
--- a/spacy/lang/it/examples.py
+++ b/spacy/lang/it/examples.py
@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.it.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari",
+    "Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori",
+    "San Francisco prevede di bandire i robot di consegna porta a porta",
+    "Londra è una grande città del Regno Unito."
+]
--- a/spacy/lang/ja/init.py
+++ b/spacy/lang/ja/init.py
@ -4,18 +4,36 @@ from __future__ import unicode_literals, print_function
 from ...language import Language
 from ...attrs import LANG
 from ...tokens import Doc
+from ...tokenizer import Tokenizer
+
+
+class JapaneseTokenizer(object):
+    """Tokenizer for Japanese that delegates word segmentation to Janome."""
+
+    def __init__(self, cls, nlp=None):
+        """Set up the vocab and the Janome tokenizer.
+
+        cls: object providing `create_vocab`, used when `nlp` is None.
+        nlp: optional object whose `vocab` attribute is reused.
+        Raises ImportError if the Janome library cannot be imported.
+        """
+        if nlp is not None:
+            self.vocab = nlp.vocab
+        else:
+            self.vocab = cls.create_vocab(nlp)
+        try:
+            # Aliased locally so it is not confused with the module-level
+            # Tokenizer import.
+            from janome.tokenizer import Tokenizer as JanomeTokenizer
+        except ImportError:
+            raise ImportError("The Japanese tokenizer requires the Janome "
+                              "library: https://github.com/mocobeta/janome")
+        self.tokenizer = JanomeTokenizer()
+
+    def __call__(self, text):
+        """Segment `text` and return a Doc built from the surface forms."""
+        surfaces = [morpheme.surface
+                    for morpheme in self.tokenizer.tokenize(text)]
+        # Every token is created without trailing whitespace.
+        no_spaces = [False] * len(surfaces)
+        return Doc(self.vocab, words=surfaces, spaces=no_spaces)
+
+
+class JapaneseDefaults(Language.Defaults):
+    """Defaults for Japanese; supplies a JapaneseTokenizer as the tokenizer."""
+    @classmethod
+    def create_tokenizer(cls, nlp=None):
+        # Pass this defaults class along so the tokenizer can call
+        # cls.create_vocab when no nlp object is given.
+        return JapaneseTokenizer(cls, nlp)


 class Japanese(Language):
    lang = 'ja'
+    Defaults = JapaneseDefaults

    def make_doc(self, text):
-        try:
-            from janome.tokenizer import Tokenizer
-        except ImportError:
-            raise ImportError("The Japanese tokenizer requires the Janome library: "
-                              "https://github.com/mocobeta/janome")
-        words = [x.surface for x in Tokenizer().tokenize(text)]
-        return Doc(self.vocab, words=words, spaces=[False]*len(words))
+        # JapaneseTokenizer.__call__ already returns a Doc; don't wrap it again.
+        return self.tokenizer(text)


--- a/spacy/lang/lex_attrs.py
+++ b/spacy/lang/lex_attrs.py
@ -122,21 +122,35 @@ def word_shape(text):
            shape.append(shape_char)
    return ''.join(shape)

+# Named module-level functions replace the former lambdas in LEX_ATTRS.
+
+def lower(string):
+    """Return the lowercased form of the string."""
+    return string.lower()
+
+
+def prefix(string):
+    """Return the first character (raises IndexError on an empty string)."""
+    return string[0]
+
+
+def suffix(string):
+    """Return the last three characters, or the whole string if shorter."""
+    return string[-3:]
+
+
+def cluster(string):
+    """Return the default cluster ID, 0."""
+    return 0
+
+
+def is_alpha(string):
+    """Return whether the string is non-empty and fully alphabetic."""
+    return string.isalpha()
+
+
+def is_digit(string):
+    """Return whether the string is non-empty and consists only of digits."""
+    return string.isdigit()
+
+
+def is_lower(string):
+    """Return whether the string is lowercase."""
+    return string.islower()
+
+
+def is_space(string):
+    """Return whether the string is non-empty and all whitespace."""
+    return string.isspace()
+
+
+def is_title(string):
+    """Return whether the string is titlecased."""
+    return string.istitle()
+
+
+def is_upper(string):
+    """Return whether the string is uppercase."""
+    return string.isupper()
+
+
+def is_stop(string, stops=frozenset()):
+    """Return whether the string is in `stops` (empty by default).
+
+    The default is an immutable frozenset, so it cannot be modified and
+    shared between calls.
+    """
+    return string in stops
+
+
+def is_oov(string):
+    """Return the default out-of-vocabulary flag, True."""
+    return True
+
+
+def get_prob(string):
+    """Return the default log probability, -20.0."""
+    return -20.

 LEX_ATTRS = {
-    attrs.LOWER: lambda string: string.lower(),
-    attrs.NORM: lambda string: string.lower(),
-    attrs.PREFIX: lambda string: string[0],
-    attrs.SUFFIX: lambda string: string[-3:],
-    attrs.CLUSTER: lambda string: 0,
-    attrs.IS_ALPHA: lambda string: string.isalpha(),
-    attrs.IS_DIGIT: lambda string: string.isdigit(),
-    attrs.IS_LOWER: lambda string: string.islower(),
-    attrs.IS_SPACE: lambda string: string.isspace(),
-    attrs.IS_TITLE: lambda string: string.istitle(),
-    attrs.IS_UPPER: lambda string: string.isupper(),
-    attrs.IS_STOP: lambda string: False,
-    attrs.IS_OOV: lambda string: True,
+    attrs.LOWER: lower,
+    attrs.NORM: lower,
+    attrs.PREFIX: prefix,
+    attrs.SUFFIX: suffix,
+    attrs.CLUSTER: cluster,
+    attrs.IS_ALPHA: is_alpha,
+    attrs.IS_DIGIT: is_digit,
+    attrs.IS_LOWER: is_lower,
+    attrs.IS_SPACE: is_space,
+    attrs.IS_TITLE: is_title,
+    attrs.IS_UPPER: is_upper,
+    attrs.IS_STOP: is_stop,
+    attrs.IS_OOV: is_oov,
+    attrs.PROB: get_prob,
    attrs.LIKE_EMAIL: like_email,
    attrs.LIKE_NUM: like_num,
    attrs.IS_PUNCT: is_punct,
--- a/spacy/lang/nb/init.py
+++ b/spacy/lang/nb/init.py
@ -16,9 +16,8 @@ class NorwegianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'nb'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
-
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
+    stop_words = STOP_WORDS


 class Norwegian(Language):
--- a/spacy/lang/nb/examples.py
+++ b/spacy/lang/nb/examples.py
@ -0,0 +1,18 @@
+# coding: utf8
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.nb.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+# The docstring precedes the __future__ import so that it is the module's
+# first statement and therefore becomes its __doc__.
+from __future__ import unicode_literals
+
+
+sentences = [
+    "Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar",
+    "Selvkjørende biler flytter forsikringsansvaret over på produsentene",
+    "San Francisco vurderer å forby robotbud på fortauene",
+    "London er en stor by i Storbritannia.",
+]
--- a/spacy/lang/nl/init.py
+++ b/spacy/lang/nl/init.py
@ -2,6 +2,7 @@
 from __future__ import unicode_literals

 from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
@ -12,11 +13,11 @@ from ...util import update_exc, add_lookups

 class DutchDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: 'nl'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
-
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
+    stop_words = STOP_WORDS


 class Dutch(Language):
--- a/spacy/lang/nl/lex_attrs.py
+++ b/spacy/lang/nl/lex_attrs.py
@ -0,0 +1,48 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...attrs import LIKE_NUM
+
+
+_num_words = set("""
+nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien
+veertien vijftien zestien zeventien achttien negentien twintig dertig veertig
+vijftig zestig zeventig tachtig negentig honderd duizend miljoen miljard
+biljoen biljard triljoen triljard
+""".split())
+
+_ordinal_words = set("""
+eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde
+twaalfde dertiende veertiende vijftiende zestiende zeventiende achttiende
+negentiende twintigste dertigste veertigste vijftigste zestigste zeventigste
+tachtigste negentigste honderdste duizendste miljoenste miljardste
+biljoenste biljardste triljoenste triljardste
+""".split())
+
+
+def like_num(text):
+    """Return True if `text` looks like a number.
+
+    Matches digit strings (ignoring ',' and '.' separators), simple
+    fractions such as "1/2", and the cardinal and ordinal number words
+    listed in this module, compared case-insensitively.
+    """
+    # This only does the most basic check for whether a token is a digit
+    # or matches one of the number words. In order to handle numbers like
+    # "drieëntwintig", more work is required.
+    # See this discussion: https://github.com/explosion/spaCy/pull/1177
+    text = text.replace(',', '').replace('.', '')
+    if text.isdigit():
+        return True
+    if text.count('/') == 1:
+        num, denom = text.split('/')
+        if num.isdigit() and denom.isdigit():
+            return True
+    # Lowercase so capitalised forms such as "Tien" still match.
+    word = text.lower()
+    return word in _num_words or word in _ordinal_words
+
+
+LEX_ATTRS = {
+    LIKE_NUM: like_num
+}
--- a/spacy/lang/norm_exceptions.py
+++ b/spacy/lang/norm_exceptions.py
@ -31,11 +31,21 @@ BASE_NORMS = {
    "„": '"',
    "»": '"',
    "«": '"',
+    "‘‘": '"',
+    "’’": '"',
+    "？": "?",
+    "！": "!",
+    "，": ",",
+    "；": ";",
+    "：": ":",
+    "。": ".",
+    "।": ".",
    "…": "...",
    "—": "-",
    "–": "-",
    "--": "-",
    "---": "-",
+    "——": "-",
    "€": "$",
    "£": "$",
    "¥": "$",
--- a/spacy/lang/pl/init.py
+++ b/spacy/lang/pl/init.py
@ -15,9 +15,8 @@ class PolishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'pl'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
-
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
+    stop_words = STOP_WORDS


 class Polish(Language):
--- a/spacy/lang/pl/examples.py
+++ b/spacy/lang/pl/examples.py
@ -0,0 +1,20 @@
+# coding: utf8
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.pl.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+# The docstring precedes the __future__ import so that it is the module's
+# first statement and therefore becomes its __doc__.
+from __future__ import unicode_literals
+
+
+sentences = [
+    "Poczuł przyjemną woń mocnej kawy.",
+    "Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.",
+    "Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.",
+    "Nowy abonament pod lupą Komisji Europejskiej",
+    "Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
+    "Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”.",
+]
--- a/spacy/lang/pt/init.py
+++ b/spacy/lang/pt/init.py
@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@ -19,13 +18,9 @@ class PortugueseDefaults(Language.Defaults):
    lex_attr_getters[LANG] = lambda text: 'pt'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
    lex_attr_getters.update(LEX_ATTRS)
-
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    stop_words = STOP_WORDS
+    lemma_lookup = LOOKUP


 class Portuguese(Language):
--- a/spacy/lang/pt/examples.py
+++ b/spacy/lang/pt/examples.py
@ -0,0 +1,18 @@
+# coding: utf8
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.pt.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+# The docstring precedes the __future__ import so that it is the module's
+# first statement and therefore becomes its __doc__.
+from __future__ import unicode_literals
+
+
+sentences = [
+    "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares",
+    "Carros autônomos empurram a responsabilidade do seguro para os fabricantes.",
+    "São Francisco considera banir os robôs de entrega que andam pelas calçadas",
+    "Londres é a maior cidade do Reino Unido",
+]
--- a/spacy/lang/sv/init.py
+++ b/spacy/lang/sv/init.py
@ -9,7 +9,6 @@ from .lemmatizer import LEMMA_RULES, LOOKUP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@ -18,13 +17,10 @@ class SwedishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'sv'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
-
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    stop_words = STOP_WORDS
+    lemma_rules = LEMMA_RULES
+    lemma_lookup = LOOKUP


 class Swedish(Language):
--- a/Show More
+++ b/Show More