Mirror of https://github.com/explosion/spaCy.git, synced 2025-10-30 23:47:31 +03:00

Commit c6cd81f192: Wrap try/except around model saving
AppVeyor CI configuration:

@@ -1 +1,55 @@
+environment:
+
+  matrix:
+
+    # For Python versions available on Appveyor, see
+    # http://www.appveyor.com/docs/installed-software#python
+    # The list here is complete (excluding Python 2.6, which
+    # isn't covered by this document) at the time of writing.
+
+    - PYTHON: "C:\\Python27"
+    #- PYTHON: "C:\\Python33"
+    #- PYTHON: "C:\\Python34"
+    #- PYTHON: "C:\\Python35"
+    #- PYTHON: "C:\\Python27-x64"
+    #- PYTHON: "C:\\Python33-x64"
+    #- DISTUTILS_USE_SDK: "1"
+    #- PYTHON: "C:\\Python34-x64"
+    #- DISTUTILS_USE_SDK: "1"
+    #- PYTHON: "C:\\Python35-x64"
+    - PYTHON: "C:\\Python36-x64"
+
+install:
+  # We need wheel installed to build wheels
+  - "%PYTHON%\\python.exe -m pip install wheel"
+  - "%PYTHON%\\python.exe -m pip install cython"
+  - "%PYTHON%\\python.exe -m pip install -r requirements.txt"
+  - "%PYTHON%\\python.exe -m pip install -e ."
+
 build: off
+
+test_script:
+  # Put your test command here.
+  # If you don't need to build C extensions on 64-bit Python 3.3 or 3.4,
+  # you can remove "build.cmd" from the front of the command, as it's
+  # only needed to support those cases.
+  # Note that you must use the environment variable %PYTHON% to refer to
+  # the interpreter you're using - Appveyor does not do anything special
+  # to put the Python version you want to use on PATH.
+  - "%PYTHON%\\python.exe -m pytest spacy/"
+
+after_test:
+  # This step builds your wheels.
+  # Again, you only need build.cmd if you're building C extensions for
+  # 64-bit Python 3.3/3.4. And you need to use %PYTHON% to get the correct
+  # interpreter
+  - "%PYTHON%\\python.exe setup.py bdist_wheel"
+
+artifacts:
+  # bdist_wheel puts your built wheel in the dist directory
+  - path: dist\*
+
+#on_success:
+#  You can use this step to upload your artifacts to a public website.
+#  See Appveyor's documentation for more details. Or you can simply
+#  access your wheels from the Appveyor "artifacts" tab for your build.
.buildkite/sdist.yml (new file, 11 lines)

@@ -0,0 +1,11 @@
+steps:
+  -
+    command: "fab env clean make test sdist"
+    label: ":dizzy: :python:"
+    artifact_paths: "dist/*.tar.gz"
+  - wait
+  - trigger: "spacy-sdist-against-models"
+    label: ":dizzy: :hammer:"
+    build:
+      env:
+        SPACY_VERSION: "{$SPACY_VERSION}"
.gitignore (vendored, 4 changes)

@@ -1,14 +1,12 @@
 # spaCy
 spacy/data/
 corpora/
-models/
+/models/
 keys/
 
 # Website
 website/www/
 website/_deploy.sh
-website/package.json
-website/announcement.jade
 website/.gitignore
 
 # Cython / C extensions
Deleted example script (Chainer-based sentiment model, marked 'WIP'):

@@ -1,322 +0,0 @@
-'''WIP --- Doesn't work well yet'''
-import plac
-import random
-import six
-
-import cProfile
-import pstats
-
-import pathlib
-import cPickle as pickle
-from itertools import izip
-
-import spacy
-
-import cytoolz
-import cupy as xp
-import cupy.cuda
-import chainer.cuda
-
-import chainer.links as L
-import chainer.functions as F
-from chainer import Chain, Variable, report
-import chainer.training
-import chainer.optimizers
-from chainer.training import extensions
-from chainer.iterators import SerialIterator
-from chainer.datasets import TupleDataset
-
-
-class SentimentAnalyser(object):
-    @classmethod
-    def load(cls, path, nlp, max_length=100):
-        raise NotImplementedError
-        #with (path / 'config.json').open() as file_:
-        #    model = model_from_json(file_.read())
-        #with (path / 'model').open('rb') as file_:
-        #    lstm_weights = pickle.load(file_)
-        #embeddings = get_embeddings(nlp.vocab)
-        #model.set_weights([embeddings] + lstm_weights)
-        #return cls(model, max_length=max_length)
-
-    def __init__(self, model, max_length=100):
-        self._model = model
-        self.max_length = max_length
-
-    def __call__(self, doc):
-        X = get_features([doc], self.max_length)
-        y = self._model.predict(X)
-        self.set_sentiment(doc, y)
-
-    def pipe(self, docs, batch_size=1000, n_threads=2):
-        for minibatch in cytoolz.partition_all(batch_size, docs):
-            minibatch = list(minibatch)
-            sentences = []
-            for doc in minibatch:
-                sentences.extend(doc.sents)
-            Xs = get_features(sentences, self.max_length)
-            ys = self._model.predict(Xs)
-            for sent, label in zip(sentences, ys):
-                sent.doc.sentiment += label - 0.5
-            for doc in minibatch:
-                yield doc
-
-    def set_sentiment(self, doc, y):
-        doc.sentiment = float(y[0])
-        # Sentiment has a native slot for a single float.
-        # For arbitrary data storage, there's:
-        # doc.user_data['my_data'] = y
-
-
-class Classifier(Chain):
-    def __init__(self, predictor):
-        super(Classifier, self).__init__(predictor=predictor)
-
-    def __call__(self, x, t):
-        y = self.predictor(x)
-        loss = F.softmax_cross_entropy(y, t)
-        accuracy = F.accuracy(y, t)
-        report({'loss': loss, 'accuracy': accuracy}, self)
-        return loss
-
-
-class SentimentModel(Chain):
-    def __init__(self, nlp, shape, **settings):
-        Chain.__init__(self,
-            embed=_Embed(shape['nr_vector'], shape['nr_dim'], shape['nr_hidden'],
-                set_vectors=lambda arr: set_vectors(arr, nlp.vocab)),
-            encode=_Encode(shape['nr_hidden'], shape['nr_hidden']),
-            attend=_Attend(shape['nr_hidden'], shape['nr_hidden']),
-            predict=_Predict(shape['nr_hidden'], shape['nr_class']))
-        self.to_gpu(0)
-
-    def __call__(self, sentence):
-        return self.predict(
-                  self.attend(
-                      self.encode(
-                          self.embed(sentence))))
-
-
-class _Embed(Chain):
-    def __init__(self, nr_vector, nr_dim, nr_out, set_vectors=None):
-        Chain.__init__(self,
-            embed=L.EmbedID(nr_vector, nr_dim, initialW=set_vectors),
-            project=L.Linear(None, nr_out, nobias=True))
-        self.embed.W.volatile = False
-
-    def __call__(self, sentence):
-        return [self.project(self.embed(ts)) for ts in F.transpose(sentence)]
-
-
-class _Encode(Chain):
-    def __init__(self, nr_in, nr_out):
-        Chain.__init__(self,
-            fwd=L.LSTM(nr_in, nr_out),
-            bwd=L.LSTM(nr_in, nr_out),
-            mix=L.Bilinear(nr_out, nr_out, nr_out))
-
-    def __call__(self, sentence):
-        self.fwd.reset_state()
-        fwds = map(self.fwd, sentence)
-        self.bwd.reset_state()
-        bwds = reversed(map(self.bwd, reversed(sentence)))
-        return [F.elu(self.mix(f, b)) for f, b in zip(fwds, bwds)]
-
-
-class _Attend(Chain):
-    def __init__(self, nr_in, nr_out):
-        Chain.__init__(self)
-
-    def __call__(self, sentence):
-        sent = sum(sentence)
-        return sent
-
-
-class _Predict(Chain):
-    def __init__(self, nr_in, nr_out):
-        Chain.__init__(self,
-            l1=L.Linear(nr_in, nr_in),
-            l2=L.Linear(nr_in, nr_out))
-
-    def __call__(self, vector):
-        vector = self.l1(vector)
-        vector = F.elu(vector)
-        vector = self.l2(vector)
-        return vector
-
-
-class SentenceDataset(TupleDataset):
-    def __init__(self, nlp, texts, labels, max_length):
-        self.max_length = max_length
-        sents, labels = self._get_labelled_sentences(
-            nlp.pipe(texts, batch_size=5000, n_threads=3),
-            labels)
-        TupleDataset.__init__(self,
-            get_features(sents, max_length),
-            labels)
-
-    def __getitem__(self, index):
-        batches = [dataset[index] for dataset in self._datasets]
-        if isinstance(index, slice):
-            length = len(batches[0])
-            returns = [tuple([batch[i] for batch in batches])
-                       for i in six.moves.range(length)]
-            return returns
-        else:
-            return tuple(batches)
-
-    def _get_labelled_sentences(self, docs, doc_labels):
-        labels = []
-        sentences = []
-        for doc, y in izip(docs, doc_labels):
-            for sent in doc.sents:
-                sentences.append(sent)
-                labels.append(y)
-        return sentences, xp.asarray(labels, dtype='i')
-
-
-class DocDataset(TupleDataset):
-    def __init__(self, nlp, texts, labels):
-        self.max_length = max_length
-        DatasetMixin.__init__(self,
-            get_features(
-                nlp.pipe(texts, batch_size=5000, n_threads=3), self.max_length),
-            labels)
-
-def read_data(data_dir, limit=0):
-    examples = []
-    for subdir, label in (('pos', 1), ('neg', 0)):
-        for filename in (data_dir / subdir).iterdir():
-            with filename.open() as file_:
-                text = file_.read()
-            examples.append((text, label))
-    random.shuffle(examples)
-    if limit >= 1:
-        examples = examples[:limit]
-    return zip(*examples) # Unzips into two lists
-
-
-def get_features(docs, max_length):
-    docs = list(docs)
-    Xs = xp.zeros((len(docs), max_length), dtype='i')
-    for i, doc in enumerate(docs):
-        j = 0
-        for token in doc:
-            if token.has_vector and not token.is_punct and not token.is_space:
-                Xs[i, j] = token.norm
-                j += 1
-                if j >= max_length:
-                    break
-    return Xs
-
-
-def set_vectors(vectors, vocab):
-    for lex in vocab:
-        if lex.has_vector and (lex.rank+1) < vectors.shape[0]:
-            lex.norm = lex.rank+1
-            vectors[lex.rank + 1] = lex.vector
-        else:
-            lex.norm = 0
-    return vectors
-
-
-def train(train_texts, train_labels, dev_texts, dev_labels,
-        lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5,
-        by_sentence=True):
-    nlp = spacy.load('en', entity=False)
-    if 'nr_vector' not in lstm_shape:
-        lstm_shape['nr_vector'] = max(lex.rank+1 for lex in nlp.vocab if lex.has_vector)
-    if 'nr_dim' not in lstm_shape:
-        lstm_shape['nr_dim'] = nlp.vocab.vectors_length
-    print("Make model")
-    model = Classifier(SentimentModel(nlp, lstm_shape, **lstm_settings))
-    print("Parsing texts...")
-    if by_sentence:
-        train_data = SentenceDataset(nlp, train_texts, train_labels, lstm_shape['max_length'])
-        dev_data = SentenceDataset(nlp, dev_texts, dev_labels, lstm_shape['max_length'])
-    else:
-        train_data = DocDataset(nlp, train_texts, train_labels)
-        dev_data = DocDataset(nlp, dev_texts, dev_labels)
-    train_iter = SerialIterator(train_data, batch_size=batch_size,
-                                shuffle=True, repeat=True)
-    dev_iter = SerialIterator(dev_data, batch_size=batch_size,
-                              shuffle=False, repeat=False)
-    optimizer = chainer.optimizers.Adam()
-    optimizer.setup(model)
-    updater = chainer.training.StandardUpdater(train_iter, optimizer, device=0)
-    trainer = chainer.training.Trainer(updater, (1, 'epoch'), out='result')
-
-    trainer.extend(extensions.Evaluator(dev_iter, model, device=0))
-    trainer.extend(extensions.LogReport())
-    trainer.extend(extensions.PrintReport([
-        'epoch', 'main/accuracy', 'validation/main/accuracy']))
-    trainer.extend(extensions.ProgressBar())
-
-    trainer.run()
-
-
-def evaluate(model_dir, texts, labels, max_length=100):
-    def create_pipeline(nlp):
-        '''
-        This could be a lambda, but named functions are easier to read in Python.
-        '''
-        return [nlp.tagger, nlp.parser, SentimentAnalyser.load(model_dir, nlp,
-                                                               max_length=max_length)]
-
-    nlp = spacy.load('en')
-    nlp.pipeline = create_pipeline(nlp)
-
-    correct = 0
-    i = 0
-    for doc in nlp.pipe(texts, batch_size=1000, n_threads=4):
-        correct += bool(doc.sentiment >= 0.5) == bool(labels[i])
-        i += 1
-    return float(correct) / i
-
-
-@plac.annotations(
-    train_dir=("Location of training file or directory"),
-    dev_dir=("Location of development file or directory"),
-    model_dir=("Location of output model directory",),
-    is_runtime=("Demonstrate run-time usage", "flag", "r", bool),
-    nr_hidden=("Number of hidden units", "option", "H", int),
-    max_length=("Maximum sentence length", "option", "L", int),
-    dropout=("Dropout", "option", "d", float),
-    learn_rate=("Learn rate", "option", "e", float),
-    nb_epoch=("Number of training epochs", "option", "i", int),
-    batch_size=("Size of minibatches for training LSTM", "option", "b", int),
-    nr_examples=("Limit to N examples", "option", "n", int)
-)
-def main(model_dir, train_dir, dev_dir,
-         is_runtime=False,
-         nr_hidden=64, max_length=100, # Shape
-         dropout=0.5, learn_rate=0.001, # General NN config
-         nb_epoch=5, batch_size=32, nr_examples=-1):  # Training params
-    model_dir = pathlib.Path(model_dir)
-    train_dir = pathlib.Path(train_dir)
-    dev_dir = pathlib.Path(dev_dir)
-    if is_runtime:
-        dev_texts, dev_labels = read_data(dev_dir)
-        acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length)
-        print(acc)
-    else:
-        print("Read data")
-        train_texts, train_labels = read_data(train_dir, limit=nr_examples)
-        dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples)
-        print("Using GPU 0")
-        #chainer.cuda.get_device(0).use()
-        train_labels = xp.asarray(train_labels, dtype='i')
-        dev_labels = xp.asarray(dev_labels, dtype='i')
-        lstm = train(train_texts, train_labels, dev_texts, dev_labels,
-                     {'nr_hidden': nr_hidden, 'max_length': max_length, 'nr_class': 2,
-                      'nr_vector': 5000},
-                      {'dropout': 0.5, 'lr': learn_rate},
-                      {},
-                      nb_epoch=nb_epoch, batch_size=batch_size)
-
-
-if __name__ == '__main__':
-    #cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
-    #s = pstats.Stats("Profile.prof")
-    #s.strip_dirs().sort_stats("time").print_stats()
-    plac.call(main)
Updated example script (PhraseMatcher over a Reddit corpus):

@@ -20,71 +20,71 @@ The algorithm is O(n) at run-time for document of length n because we're only ev
 matching over the tag patterns. So no matter how many phrases we're looking for,
 our pattern set stays very small (exact size depends on the maximum length we're
 looking for, as the query language currently has no quantifiers)
+
+The example expects a .bz2 file from the Reddit corpus, and a patterns file,
+formatted in jsonl as a sequence of entries like this:
+
+{"text":"Anchorage"}
+{"text":"Angola"}
+{"text":"Ann Arbor"}
+{"text":"Annapolis"}
+{"text":"Appalachia"}
+{"text":"Argentina"}
 """
 from __future__ import print_function, unicode_literals, division
-from ast import literal_eval
 from bz2 import BZ2File
 import time
 import math
 import codecs
 
 import plac
+import ujson
 
-from preshed.maps import PreshMap
-from preshed.counter import PreshCounter
-from spacy.strings import hash_string
-from spacy.en import English
 from spacy.matcher import PhraseMatcher
+import spacy
 
 
 def read_gazetteer(tokenizer, loc, n=-1):
     for i, line in enumerate(open(loc)):
-        phrase = literal_eval('u' + line.strip())
-        if ' (' in phrase and phrase.endswith(')'):
-            phrase = phrase.split(' (', 1)[0]
-        if i >= n:
-            break
-        phrase = tokenizer(phrase)
-        if all((t.is_lower and t.prob >= -10) for t in phrase):
-            continue
+        data = ujson.loads(line.strip())
+        phrase = tokenizer(data['text'])
+        for w in phrase:
+            _ = tokenizer.vocab[w.text]
         if len(phrase) >= 2:
             yield phrase
 
 
-def read_text(bz2_loc):
+def read_text(bz2_loc, n=10000):
     with BZ2File(bz2_loc) as file_:
-        for line in file_:
-            yield line.decode('utf8')
+        for i, line in enumerate(file_):
+            data = ujson.loads(line)
+            yield data['body']
+            if i >= n:
+                break
 
 
 def get_matches(tokenizer, phrases, texts, max_length=6):
-    matcher = PhraseMatcher(tokenizer.vocab, phrases, max_length=max_length)
-    print("Match")
+    matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length)
+    matcher.add('Phrase', None, *phrases)
     for text in texts:
         doc = tokenizer(text)
+        for w in doc:
+            _ = doc.vocab[w.text]
         matches = matcher(doc)
-        for mwe in doc.ents:
-            yield mwe
+        for ent_id, start, end in matches:
+            yield (ent_id, doc[start:end].text)
 
 
-def main(patterns_loc, text_loc, counts_loc, n=10000000):
-    nlp = English(parser=False, tagger=False, entity=False)
-    print("Make matcher")
-    phrases = read_gazetteer(nlp.tokenizer, patterns_loc, n=n)
-    counts = PreshCounter()
+def main(patterns_loc, text_loc, n=10000):
+    nlp = spacy.blank('en')
+    nlp.vocab.lex_attr_getters = {}
+    phrases = read_gazetteer(nlp.tokenizer, patterns_loc)
+    count = 0
     t1 = time.time()
-    for mwe in get_matches(nlp.tokenizer, phrases, read_text(text_loc)):
-        counts.inc(hash_string(mwe.text), 1)
+    for ent_id, text in get_matches(nlp.tokenizer, phrases, read_text(text_loc, n=n)):
+        count += 1
     t2 = time.time()
-    print("10m tokens in %d s" % (t2 - t1))
-
-    with codecs.open(counts_loc, 'w', 'utf8') as file_:
-        for phrase in read_gazetteer(nlp.tokenizer, patterns_loc, n=n):
-            text = phrase.string
-            key = hash_string(text)
-            count = counts[key]
-            if count != 0:
-                file_.write('%d\t%s\n' % (count, text))
+    print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count))
 
 
 if __name__ == '__main__':
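The docstring and the rewritten get_matches above describe the spaCy 2 PhraseMatcher flow: patterns are pre-tokenized Docs registered under a single rule key, and matches come back as (match_id, start, end) token offsets. The following is a minimal sketch of that flow, assuming the spaCy 2.0 alpha API this commit targets; the inline two-entry pattern list and the sample sentence are made up for illustration and stand in for the jsonl gazetteer file.

import ujson
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank('en')

# One jsonl entry per pattern, e.g. {"text": "Ann Arbor"} (hypothetical inline data)
lines = ['{"text": "Ann Arbor"}', '{"text": "Angola"}']
patterns = [nlp.tokenizer(ujson.loads(line)['text']) for line in lines]

matcher = PhraseMatcher(nlp.vocab, max_length=6)
matcher.add('Phrase', None, *patterns)        # one rule key covering all patterns

doc = nlp.tokenizer('She moved from Angola to Ann Arbor last year.')
for match_id, start, end in matcher(doc):     # matches are token-offset spans
    print(nlp.vocab.strings[match_id], doc[start:end].text)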
Updated example script (GermEval 2014 German NER training):

@@ -13,24 +13,29 @@ Input data:
 https://www.lt.informatik.tu-darmstadt.de/fileadmin/user_upload/Group_LangTech/data/GermEval2014_complete_data.zip
 
 Developed for: spaCy 1.7.1
-Last tested for: spaCy 1.7.1
+Last tested for: spaCy 2.0.0a13
 '''
 from __future__ import unicode_literals, print_function
 import plac
 from pathlib import Path
 import random
 import json
+import tqdm
+
+from thinc.neural.optimizers import Adam
+from thinc.neural.ops import NumpyOps
 
-import spacy.orth as orth_funcs
 from spacy.vocab import Vocab
-from spacy.pipeline import BeamEntityRecognizer
-from spacy.pipeline import EntityRecognizer
+from spacy.pipeline import TokenVectorEncoder, NeuralEntityRecognizer
 from spacy.tokenizer import Tokenizer
 from spacy.tokens import Doc
 from spacy.attrs import *
 from spacy.gold import GoldParse
-from spacy.gold import _iob_to_biluo as iob_to_biluo
+from spacy.gold import iob_to_biluo
+from spacy.gold import minibatch
 from spacy.scorer import Scorer
+import spacy.util
+
 
 try:
     unicode
@@ -38,96 +43,38 @@ except NameError:
     unicode = str
 
 
+spacy.util.set_env_log(True)
+
+
 def init_vocab():
     return Vocab(
         lex_attr_getters={
             LOWER: lambda string: string.lower(),
-            SHAPE: orth_funcs.word_shape,
+            NORM: lambda string: string.lower(),
             PREFIX: lambda string: string[0],
             SUFFIX: lambda string: string[-3:],
-            CLUSTER: lambda string: 0,
-            IS_ALPHA: orth_funcs.is_alpha,
-            IS_ASCII: orth_funcs.is_ascii,
-            IS_DIGIT: lambda string: string.isdigit(),
-            IS_LOWER: orth_funcs.is_lower,
-            IS_PUNCT: orth_funcs.is_punct,
-            IS_SPACE: lambda string: string.isspace(),
-            IS_TITLE: orth_funcs.is_title,
-            IS_UPPER: orth_funcs.is_upper,
-            IS_STOP: lambda string: False,
-            IS_OOV: lambda string: True
         })
 
 
-def save_vocab(vocab, path):
-    path = Path(path)
-    if not path.exists():
-        path.mkdir()
-    elif not path.is_dir():
-        raise IOError("Can't save vocab to %s\nNot a directory" % path)
-    with (path / 'strings.json').open('w') as file_:
-        vocab.strings.dump(file_)
-    vocab.dump((path / 'lexemes.bin').as_posix())
-
-
-def load_vocab(path):
-    path = Path(path)
-    if not path.exists():
-        raise IOError("Cannot load vocab from %s\nDoes not exist" % path)
-    if not path.is_dir():
-        raise IOError("Cannot load vocab from %s\nNot a directory" % path)
-    return Vocab.load(path)
-
-
-def init_ner_model(vocab, features=None):
-    if features is None:
-        features = tuple(EntityRecognizer.feature_templates)
-    return EntityRecognizer(vocab, features=features)
-
-
-def save_ner_model(model, path):
-    path = Path(path)
-    if not path.exists():
-        path.mkdir()
-    if not path.is_dir():
-        raise IOError("Can't save model to %s\nNot a directory" % path)
-    model.model.dump((path / 'model').as_posix())
-    with (path / 'config.json').open('w') as file_:
-        data = json.dumps(model.cfg)
-        if not isinstance(data, unicode):
-            data = data.decode('utf8')
-        file_.write(data)
-
-
-def load_ner_model(vocab, path):
-    return EntityRecognizer.load(path, vocab)
-
-
 class Pipeline(object):
-    @classmethod
-    def load(cls, path):
-        path = Path(path)
-        if not path.exists():
-            raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
-        if not path.is_dir():
-            raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
-        vocab = load_vocab(path)
-        tokenizer = Tokenizer(vocab, {}, None, None, None)
-        ner_model = load_ner_model(vocab, path / 'ner')
-        return cls(vocab, tokenizer, ner_model)
-
     def __init__(self, vocab=None, tokenizer=None, entity=None):
         if vocab is None:
             vocab = init_vocab()
         if tokenizer is None:
             tokenizer = Tokenizer(vocab, {}, None, None, None)
         if entity is None:
-            entity = init_ner_model(self.vocab)
+            entity = NeuralEntityRecognizer(vocab)
         self.vocab = vocab
         self.tokenizer = tokenizer
         self.entity = entity
         self.pipeline = [self.entity]
 
+    def begin_training(self):
+        for model in self.pipeline:
+            model.begin_training([])
+        optimizer = Adam(NumpyOps(), 0.001)
+        return optimizer
+
     def __call__(self, input_):
         doc = self.make_doc(input_)
         for process in self.pipeline:
@@ -147,14 +94,16 @@ class Pipeline(object):
         gold = GoldParse(doc, entities=annotations)
         return gold
 
-    def update(self, input_, annot):
-        doc = self.make_doc(input_)
-        gold = self.make_gold(input_, annot)
-        for ner in gold.ner:
-            if ner not in (None, '-', 'O'):
-                action, label = ner.split('-', 1)
-                self.entity.add_label(label)
-        return self.entity.update(doc, gold)
+    def update(self, inputs, annots, sgd, losses=None, drop=0.):
+        if losses is None:
+            losses = {}
+        docs = [self.make_doc(input_) for input_ in inputs]
+        golds = [self.make_gold(input_, annot) for input_, annot in
+                 zip(inputs, annots)]
+
+        self.entity.update(docs, golds, drop=drop,
+                           sgd=sgd, losses=losses)
+        return losses
 
     def evaluate(self, examples):
         scorer = Scorer()
@@ -164,34 +113,36 @@ class Pipeline(object):
             scorer.score(doc, gold)
         return scorer.scores
 
-    def average_weights(self):
-        self.entity.model.end_training()
-
-    def save(self, path):
+    def to_disk(self, path):
         path = Path(path)
         if not path.exists():
             path.mkdir()
         elif not path.is_dir():
             raise IOError("Can't save pipeline to %s\nNot a directory" % path)
-        save_vocab(self.vocab, path / 'vocab')
-        save_ner_model(self.entity, path / 'ner')
+        self.vocab.to_disk(path / 'vocab')
+        self.entity.to_disk(path / 'ner')
+
+    def from_disk(self, path):
+        path = Path(path)
+        if not path.exists():
+            raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
+        if not path.is_dir():
+            raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
+        self.vocab = self.vocab.from_disk(path / 'vocab')
+        self.entity = self.entity.from_disk(path / 'ner')
 
 
-def train(nlp, train_examples, dev_examples, ctx, nr_epoch=5):
-    next_epoch = train_examples
+def train(nlp, train_examples, dev_examples, nr_epoch=5):
+    sgd = nlp.begin_training()
     print("Iter", "Loss", "P", "R", "F")
     for i in range(nr_epoch):
-        this_epoch = next_epoch
-        next_epoch = []
-        loss = 0
-        for input_, annot in this_epoch:
-            loss += nlp.update(input_, annot)
-            if (i+1) < nr_epoch:
-                next_epoch.append((input_, annot))
-        random.shuffle(next_epoch)
+        random.shuffle(train_examples)
+        losses = {}
+        for batch in minibatch(tqdm.tqdm(train_examples, leave=False), size=8):
+            inputs, annots = zip(*batch)
+            nlp.update(list(inputs), list(annots), sgd, losses=losses)
         scores = nlp.evaluate(dev_examples)
-        report_scores(i, loss, scores)
-    nlp.average_weights()
+        report_scores(i, losses['ner'], scores)
     scores = nlp.evaluate(dev_examples)
     report_scores(channels, i+1, loss, scores)
 
@@ -208,7 +159,8 @@ def read_examples(path):
     with path.open() as file_:
         sents = file_.read().strip().split('\n\n')
         for sent in sents:
-            if not sent.strip():
+            sent = sent.strip()
+            if not sent:
                 continue
             tokens = sent.split('\n')
            while tokens and tokens[0].startswith('#'):
@@ -217,28 +169,39 @@ def read_examples(path):
             iob = []
             for token in tokens:
                 if token.strip():
-                    pieces = token.split()
+                    pieces = token.split('\t')
                     words.append(pieces[1])
                     iob.append(pieces[2])
             yield words, iob_to_biluo(iob)
 
 
+def get_labels(examples):
+    labels = set()
+    for words, tags in examples:
+        for tag in tags:
+            if '-' in tag:
+                labels.add(tag.split('-')[1])
+    return sorted(labels)
+
+
 @plac.annotations(
     model_dir=("Path to save the model", "positional", None, Path),
     train_loc=("Path to your training data", "positional", None, Path),
     dev_loc=("Path to your development data", "positional", None, Path),
 )
-def main(model_dir=Path('/home/matt/repos/spaCy/spacy/data/de-1.0.0'),
-        train_loc=None, dev_loc=None, nr_epoch=30):
-
-    train_examples = read_examples(train_loc)
+def main(model_dir, train_loc, dev_loc, nr_epoch=30):
+    print(model_dir, train_loc, dev_loc)
+    train_examples = list(read_examples(train_loc))
     dev_examples = read_examples(dev_loc)
-    nlp = Pipeline.load(model_dir)
+    nlp = Pipeline()
+    for label in get_labels(train_examples):
+        nlp.entity.add_label(label)
+        print("Add label", label)
 
-    train(nlp, train_examples, list(dev_examples), ctx, nr_epoch)
+    train(nlp, train_examples, list(dev_examples), nr_epoch)
 
-    nlp.save(model_dir)
+    nlp.to_disk(model_dir)
 
 
 if __name__ == '__main__':
-    main()
+    plac.call(main)
Updated example script (training an additional 'ANIMAL' entity type):

@@ -25,7 +25,7 @@ For more details, see the documentation:
 * Saving and loading models: https://spacy.io/docs/usage/saving-loading
 
 Developed for: spaCy 1.7.6
-Last tested for: spaCy 1.7.6
+Last updated for: spaCy 2.0.0a13
 """
 from __future__ import unicode_literals, print_function
 
@@ -34,55 +34,41 @@ from pathlib import Path
 import random
 
 import spacy
-from spacy.gold import GoldParse
-from spacy.tagger import Tagger
+from spacy.gold import GoldParse, minibatch
+from spacy.pipeline import NeuralEntityRecognizer
+from spacy.pipeline import TokenVectorEncoder
+
+
+def get_gold_parses(tokenizer, train_data):
+    '''Shuffle and create GoldParse objects'''
+    random.shuffle(train_data)
+    for raw_text, entity_offsets in train_data:
+        doc = tokenizer(raw_text)
+        gold = GoldParse(doc, entities=entity_offsets)
+        yield doc, gold
 
 
 def train_ner(nlp, train_data, output_dir):
-    # Add new words to vocab
-    for raw_text, _ in train_data:
-        doc = nlp.make_doc(raw_text)
-        for word in doc:
-            _ = nlp.vocab[word.orth]
     random.seed(0)
-    # You may need to change the learning rate. It's generally difficult to
-    # guess what rate you should set, especially when you have limited data.
-    nlp.entity.model.learn_rate = 0.001
-    for itn in range(1000):
-        random.shuffle(train_data)
-        loss = 0.
-        for raw_text, entity_offsets in train_data:
-            gold = GoldParse(doc, entities=entity_offsets)
-            # By default, the GoldParse class assumes that the entities
-            # described by offset are complete, and all other words should
-            # have the tag 'O'. You can tell it to make no assumptions
-            # about the tag of a word by giving it the tag '-'.
-            # However, this allows a trivial solution to the current
-            # learning problem: if words are either 'any tag' or 'ANIMAL',
-            # the model can learn that all words can be tagged 'ANIMAL'.
-            #for i in range(len(gold.ner)):
-                #if not gold.ner[i].endswith('ANIMAL'):
-                #    gold.ner[i] = '-'
-            doc = nlp.make_doc(raw_text)
-            nlp.tagger(doc)
-            # As of 1.9, spaCy's parser now lets you supply a dropout probability
-            # This might help the model generalize better from only a few
-            # examples.
-            loss += nlp.entity.update(doc, gold, drop=0.9)
-        if loss == 0:
-            break
-    # This step averages the model's weights. This may or may not be good for
-    # your situation --- it's empirical.
-    nlp.end_training()
-    if output_dir:
-        if not output_dir.exists():
-            output_dir.mkdir()
-        nlp.save_to_directory(output_dir)
+    optimizer = nlp.begin_training(lambda: [])
+    nlp.meta['name'] = 'en_ent_animal'
+    for itn in range(50):
+        losses = {}
+        for batch in minibatch(get_gold_parses(nlp.make_doc, train_data), size=3):
+            docs, golds = zip(*batch)
+            nlp.update(docs, golds, losses=losses, sgd=optimizer, update_shared=True,
+                       drop=0.35)
+        print(losses)
+    if not output_dir:
+        return
+    elif not output_dir.exists():
+        output_dir.mkdir()
+    nlp.to_disk(output_dir)
 
 
 def main(model_name, output_directory=None):
-    print("Loading initial model", model_name)
-    nlp = spacy.load(model_name)
+    print("Creating initial model", model_name)
+    nlp = spacy.blank(model_name)
     if output_directory is not None:
         output_directory = Path(output_directory)
 
@@ -91,6 +77,11 @@ def main(model_name, output_directory=None):
             "Horses are too tall and they pretend to care about your feelings",
             [(0, 6, 'ANIMAL')],
         ),
+        (
+            "Do they bite?",
+            [],
+        ),
+
         (
             "horses are too tall and they pretend to care about your feelings",
             [(0, 6, 'ANIMAL')]
@@ -109,18 +100,20 @@ def main(model_name, output_directory=None):
         )
 
     ]
-    nlp.entity.add_label('ANIMAL')
+    nlp.pipeline.append(TokenVectorEncoder(nlp.vocab))
+    nlp.pipeline.append(NeuralEntityRecognizer(nlp.vocab))
+    nlp.pipeline[-1].add_label('ANIMAL')
     train_ner(nlp, train_data, output_directory)
 
     # Test that the entity is recognized
-    doc = nlp('Do you like horses?')
+    text = 'Do you like horses?'
     print("Ents in 'Do you like horses?':")
+    doc = nlp(text)
     for ent in doc.ents:
         print(ent.label_, ent.text)
     if output_directory:
         print("Loading from", output_directory)
-        nlp2 = spacy.load('en', path=output_directory)
-        nlp2.entity.add_label('ANIMAL')
+        nlp2 = spacy.load(output_directory)
         doc2 = nlp2('Do you like horses?')
         for ent in doc2.ents:
             print(ent.label_, ent.text)
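The comments removed above describe how GoldParse handles entity annotations: character offsets are converted to per-token BILUO tags, unannotated words default to 'O', and a '-' tag tells the model to make no assumption about a word. A minimal sketch of that conversion, assuming the spaCy 2.0 alpha API used by this example; the sentence and the 'ANIMAL' label are taken from its toy training data.

import spacy
from spacy.gold import GoldParse

nlp = spacy.blank('en')
doc = nlp.make_doc('Horses are too tall')
gold = GoldParse(doc, entities=[(0, 6, 'ANIMAL')])
# The character span (0, 6) covers the single token 'Horses';
# words outside the annotated span default to 'O'.
print(gold.ner)   # something like ['U-ANIMAL', 'O', 'O', 'O']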
Updated example script (TextCategorizer training on IMDB):

@@ -1,3 +1,7 @@
+'''Train a multi-label convolutional neural network text classifier,
+using the spacy.pipeline.TextCategorizer component. The model is then added
+to spacy.pipeline, and predictions are available at `doc.cats`.
+'''
 from __future__ import unicode_literals
 import plac
 import random
@@ -12,6 +16,11 @@ from spacy.gold import GoldParse, minibatch
 from spacy.util import compounding
 from spacy.pipeline import TextCategorizer
 
+# TODO: Remove this once we're not supporting models trained with thinc <6.9.0
+import thinc.neural._classes.layernorm
+thinc.neural._classes.layernorm.set_compat_six_eight(False)
+
+
 
 def train_textcat(tokenizer, textcat,
                   train_texts, train_cats, dev_texts, dev_cats,
@@ -24,14 +33,15 @@ def train_textcat(tokenizer, textcat,
     train_docs = [tokenizer(text) for text in train_texts]
     train_gold = [GoldParse(doc, cats=cats) for doc, cats in
                   zip(train_docs, train_cats)]
-    train_data = zip(train_docs, train_gold)
+    train_data = list(zip(train_docs, train_gold))
     batch_sizes = compounding(4., 128., 1.001)
     for i in range(n_iter):
         losses = {}
-        train_data = tqdm.tqdm(train_data, leave=False) # Progress bar
-        for batch in minibatch(train_data, size=batch_sizes):
+        # Progress bar and minibatching
+        batches = minibatch(tqdm.tqdm(train_data, leave=False), size=batch_sizes)
+        for batch in batches:
             docs, golds = zip(*batch)
-            textcat.update((docs, None), golds, sgd=optimizer, drop=0.2,
+            textcat.update(docs, golds, sgd=optimizer, drop=0.2,
                 losses=losses)
         with textcat.model.use_params(optimizer.averages):
             scores = evaluate(tokenizer, textcat, dev_texts, dev_cats)
@@ -61,12 +71,13 @@ def evaluate(tokenizer, textcat, texts, cats):
     return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore}
 
 
-def load_data():
+def load_data(limit=0):
     # Partition off part of the train data --- avoid running experiments
     # against test.
     train_data, _ = thinc.extra.datasets.imdb()
 
     random.shuffle(train_data)
+    train_data = train_data[-limit:]
 
     texts, labels = zip(*train_data)
     cats = [(['POSITIVE'] if y else []) for y in labels]
@@ -86,7 +97,7 @@ def main(model_loc=None):
     textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE'])
 
     print("Load IMDB data")
-    (train_texts, train_cats), (dev_texts, dev_cats) = load_data()
+    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=1000)
 
     print("Itn.\tLoss\tP\tR\tF")
     progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}'
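The batching change above pairs minibatch with compounding(4., 128., 1.001), so batch sizes start small and grow geometrically towards 128 as training proceeds. A small sketch of that behaviour, assuming the spacy.util.compounding and spacy.gold.minibatch helpers the example imports; the toy training list below is made up.

from spacy.gold import minibatch
from spacy.util import compounding

train_data = [('text %d' % i, {'POSITIVE': bool(i % 2)}) for i in range(20)]

batch_sizes = compounding(4., 128., 1.001)   # infinite series: 4.0, 4.004, 4.008, ...
for batch in minibatch(train_data, size=batch_sizes):
    # minibatch draws the next size from the generator for every batch,
    # so early batches are small and later ones gradually larger.
    print(len(batch))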
							
								
								
									
examples/vectors_fast_text.py (new file, 30 lines)
@@ -0,0 +1,30 @@
+'''Load vectors for a language trained using FastText
+
+https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
+'''
+from __future__ import unicode_literals
+import plac
+import numpy
+
+import spacy.language
+
+
+def main(vectors_loc):
+    nlp = spacy.language.Language()
+
+    with open(vectors_loc, 'rb') as file_:
+        header = file_.readline()
+        nr_row, nr_dim = header.split()
+        nlp.vocab.clear_vectors(int(nr_dim))
+        for line in file_:
+            line = line.decode('utf8')
+            pieces = line.split()
+            word = pieces[0]
+            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
+            nlp.vocab.set_vector(word, vector)
+    doc = nlp(u'class colspan')
+    print(doc[0].similarity(doc[1]))
+
+
+if __name__ == '__main__':
+    plac.call(main)
							
								
								
									
fabfile.py (vendored, 5 changes)
@@ -14,6 +14,7 @@ VENV_DIR = path.join(PWD, ENV)
 def env(lang='python2.7'):
     if path.exists(VENV_DIR):
         local('rm -rf {env}'.format(env=VENV_DIR))
+    local('pip install virtualenv')
     local('python -m virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR))
 
 
@@ -32,6 +33,10 @@ def make():
             local('pip install -r requirements.txt')
             local('python setup.py build_ext --inplace')
 
+def sdist():
+    with virtualenv(VENV_DIR):
+        with lcd(path.dirname(__file__)):
+            local('python setup.py sdist')
 
 def clean():
     with lcd(path.dirname(__file__)):
Requirements:

@@ -1,9 +1,9 @@
-cython<0.24
+cython>=0.24,<0.27.0
 pathlib
 numpy>=1.7
 cymem>=1.30,<1.32
 preshed>=1.0.0,<2.0.0
-thinc>=6.8.0,<6.9.0
+thinc>=6.9.0,<6.10.0
 murmurhash>=0.28,<0.29
 plac<1.0.0,>=0.9.6
 six
@@ -13,7 +13,7 @@ requests>=2.13.0,<3.0.0
 regex==2017.4.5
 ftfy>=4.4.2,<5.0.0
 pytest>=3.0.6,<4.0.0
-pip>=9.0.0,<10.0.0
 mock>=2.0.0,<3.0.0
 msgpack-python
 msgpack-numpy
+html5lib==1.0b8
setup.py (3 lines changed)
|  | @ -195,9 +195,8 @@ def setup_package(): | ||||||
|                 'murmurhash>=0.28,<0.29', |                 'murmurhash>=0.28,<0.29', | ||||||
|                 'cymem>=1.30,<1.32', |                 'cymem>=1.30,<1.32', | ||||||
|                 'preshed>=1.0.0,<2.0.0', |                 'preshed>=1.0.0,<2.0.0', | ||||||
|                 'thinc>=6.8.0,<6.9.0', |                 'thinc>=6.9.0,<6.10.0', | ||||||
|                 'plac<1.0.0,>=0.9.6', |                 'plac<1.0.0,>=0.9.6', | ||||||
|                 'pip>=9.0.0,<10.0.0', |  | ||||||
|                 'six', |                 'six', | ||||||
|                 'pathlib', |                 'pathlib', | ||||||
|                 'ujson>=1.35', |                 'ujson>=1.35', | ||||||
|  |  | ||||||
|  | @ -4,11 +4,13 @@ from __future__ import unicode_literals | ||||||
| from .cli.info import info as cli_info | from .cli.info import info as cli_info | ||||||
| from .glossary import explain | from .glossary import explain | ||||||
| from .deprecated import resolve_load_name | from .deprecated import resolve_load_name | ||||||
|  | #from .about import __version__ | ||||||
| from .about import __version__ | from .about import __version__ | ||||||
| from . import util | from . import util | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def load(name, **overrides): | def load(name, **overrides): | ||||||
|  |     from .deprecated import resolve_load_name | ||||||
|     name = resolve_load_name(name, **overrides) |     name = resolve_load_name(name, **overrides) | ||||||
|     return util.load_model(name, **overrides) |     return util.load_model(name, **overrides) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -7,7 +7,7 @@ if __name__ == '__main__': | ||||||
|     import plac |     import plac | ||||||
|     import sys |     import sys | ||||||
|     from spacy.cli import download, link, info, package, train, convert, model |     from spacy.cli import download, link, info, package, train, convert, model | ||||||
|     from spacy.cli import profile |     from spacy.cli import profile, evaluate | ||||||
|     from spacy.util import prints |     from spacy.util import prints | ||||||
| 
 | 
 | ||||||
|     commands = { |     commands = { | ||||||
|  | @ -15,6 +15,7 @@ if __name__ == '__main__': | ||||||
|         'link': link, |         'link': link, | ||||||
|         'info': info, |         'info': info, | ||||||
|         'train': train, |         'train': train, | ||||||
|  |         'evaluate': evaluate, | ||||||
|         'convert': convert, |         'convert': convert, | ||||||
|         'package': package, |         'package': package, | ||||||
|         'model': model, |         'model': model, | ||||||
|  |  | ||||||
spacy/_ml.py (161 lines changed)
|  | @ -1,28 +1,27 @@ | ||||||
| import ujson | import ujson | ||||||
|  | from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU | ||||||
|  | from thinc.i2v import HashEmbed, StaticVectors | ||||||
|  | from thinc.t2t import ExtractWindow, ParametricAttention | ||||||
|  | from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool | ||||||
|  | from thinc.misc import Residual | ||||||
|  | from thinc.misc import BatchNorm as BN | ||||||
|  | from thinc.misc import LayerNorm as LN | ||||||
|  | 
 | ||||||
| from thinc.api import add, layerize, chain, clone, concatenate, with_flatten | from thinc.api import add, layerize, chain, clone, concatenate, with_flatten | ||||||
| from thinc.neural import Model, Maxout, Softmax, Affine | from thinc.api import FeatureExtracter, with_getitem | ||||||
| from thinc.neural._classes.hash_embed import HashEmbed | from thinc.api import uniqued, wrap, flatten_add_lengths, noop | ||||||
|  | 
 | ||||||
|  | from thinc.linear.linear import LinearModel | ||||||
| from thinc.neural.ops import NumpyOps, CupyOps | from thinc.neural.ops import NumpyOps, CupyOps | ||||||
| from thinc.neural.util import get_array_module | from thinc.neural.util import get_array_module | ||||||
|  | 
 | ||||||
| import random | import random | ||||||
| import cytoolz | import cytoolz | ||||||
| 
 | 
 | ||||||
| from thinc.neural._classes.convolution import ExtractWindow |  | ||||||
| from thinc.neural._classes.static_vectors import StaticVectors |  | ||||||
| from thinc.neural._classes.batchnorm import BatchNorm as BN |  | ||||||
| from thinc.neural._classes.layernorm import LayerNorm as LN |  | ||||||
| from thinc.neural._classes.resnet import Residual |  | ||||||
| from thinc.neural import ReLu |  | ||||||
| from thinc.neural._classes.selu import SELU |  | ||||||
| from thinc import describe | from thinc import describe | ||||||
| from thinc.describe import Dimension, Synapses, Biases, Gradient | from thinc.describe import Dimension, Synapses, Biases, Gradient | ||||||
| from thinc.neural._classes.affine import _set_dimensions_if_needed | from thinc.neural._classes.affine import _set_dimensions_if_needed | ||||||
| from thinc.api import FeatureExtracter, with_getitem | import thinc.extra.load_nlp | ||||||
| from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool |  | ||||||
| from thinc.neural._classes.attention import ParametricAttention |  | ||||||
| from thinc.linear.linear import LinearModel |  | ||||||
| from thinc.api import uniqued, wrap, flatten_add_lengths |  | ||||||
| 
 |  | ||||||
| 
 | 
 | ||||||
| from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER | from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER | ||||||
| from .tokens.doc import Doc | from .tokens.doc import Doc | ||||||
|  | @ -31,6 +30,11 @@ from . import util | ||||||
| import numpy | import numpy | ||||||
| import io | import io | ||||||
| 
 | 
 | ||||||
|  | # TODO: Unset this once we don't want to support previous models. | ||||||
|  | import thinc.neural._classes.layernorm | ||||||
|  | thinc.neural._classes.layernorm.set_compat_six_eight(True) | ||||||
|  | 
 | ||||||
|  | VECTORS_KEY = 'spacy_pretrained_vectors' | ||||||
| 
 | 
 | ||||||
| @layerize | @layerize | ||||||
| def _flatten_add_lengths(seqs, pad=0, drop=0.): | def _flatten_add_lengths(seqs, pad=0, drop=0.): | ||||||
|  | @ -225,33 +229,80 @@ def drop_layer(layer, factor=2.): | ||||||
|     model.predict = layer |     model.predict = layer | ||||||
|     return model |     return model | ||||||
| 
 | 
 | ||||||
|  | def link_vectors_to_models(vocab): | ||||||
|  |     vectors = vocab.vectors | ||||||
|  |     ops = Model.ops | ||||||
|  |     for word in vocab: | ||||||
|  |         if word.orth in vectors.key2row: | ||||||
|  |             word.rank = vectors.key2row[word.orth] | ||||||
|  |         else: | ||||||
|  |             word.rank = 0 | ||||||
|  |     data = ops.asarray(vectors.data) | ||||||
|  |     # Set an entry here, so that vectors are accessed by StaticVectors | ||||||
|  |     # (unideal, I know) | ||||||
|  |     thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data | ||||||
| 
 | 
 | ||||||
| def Tok2Vec(width, embed_size, preprocess=None): | def Tok2Vec(width, embed_size, **kwargs): | ||||||
|  |     pretrained_dims = kwargs.get('pretrained_dims', 0) | ||||||
|  |     cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3) | ||||||
|     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] |     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] | ||||||
|     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}): |     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add, | ||||||
|  |                                  '*': reapply}): | ||||||
|         norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm') |         norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm') | ||||||
|         prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix') |         prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix') | ||||||
|         suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix') |         suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix') | ||||||
|         shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape') |         shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape') | ||||||
|  |         if pretrained_dims is not None and pretrained_dims >= 1: | ||||||
|  |             glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID)) | ||||||
| 
 | 
 | ||||||
|         embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3)) |             embed = uniqued( | ||||||
|         tok2vec = ( |                 (glove | norm | prefix | suffix | shape) | ||||||
|             with_flatten( |                 >> LN(Maxout(width, width*5, pieces=3)), column=5) | ||||||
|                 asarray(Model.ops, dtype='uint64') |         else: | ||||||
|                 >> uniqued(embed, column=5) |             embed = uniqued( | ||||||
|                 >> Residual( |                 (norm | prefix | suffix | shape) | ||||||
|                     (ExtractWindow(nW=1) >> LN(Maxout(width, width*3))) |                 >> LN(Maxout(width, width*4, pieces=3)), column=5) | ||||||
|                 ) ** 4, pad=4 | 
 | ||||||
|             ) | 
 | ||||||
|  |         convolution = Residual( | ||||||
|  |             ExtractWindow(nW=1) | ||||||
|  |             >> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces)) | ||||||
|         ) |         ) | ||||||
|         if preprocess not in (False, None): | 
 | ||||||
|             tok2vec = preprocess >> tok2vec |         tok2vec = ( | ||||||
|  |             FeatureExtracter(cols) | ||||||
|  |             >> with_flatten( | ||||||
|  |                 embed >> (convolution ** 4), pad=4) | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|         # Work around thinc API limitations :(. TODO: Revise in Thinc 7 |         # Work around thinc API limitations :(. TODO: Revise in Thinc 7 | ||||||
|         tok2vec.nO = width |         tok2vec.nO = width | ||||||
|         tok2vec.embed = embed |         tok2vec.embed = embed | ||||||
|     return tok2vec |     return tok2vec | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def reapply(layer, n_times): | ||||||
|  |     def reapply_fwd(X, drop=0.): | ||||||
|  |         backprops = [] | ||||||
|  |         for i in range(n_times): | ||||||
|  |             Y, backprop = layer.begin_update(X, drop=drop) | ||||||
|  |             X = Y | ||||||
|  |             backprops.append(backprop) | ||||||
|  |         def reapply_bwd(dY, sgd=None): | ||||||
|  |             dX = None | ||||||
|  |             for backprop in reversed(backprops): | ||||||
|  |                 dY = backprop(dY, sgd=sgd) | ||||||
|  |                 if dX is None: | ||||||
|  |                     dX = dY | ||||||
|  |                 else: | ||||||
|  |                     dX += dY | ||||||
|  |             return dX | ||||||
|  |         return Y, reapply_bwd | ||||||
|  |     return wrap(reapply_fwd, layer) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def asarray(ops, dtype): | def asarray(ops, dtype): | ||||||
|     def forward(X, drop=0.): |     def forward(X, drop=0.): | ||||||
|         return ops.asarray(X, dtype=dtype), None |         return ops.asarray(X, dtype=dtype), None | ||||||
|  | @ -455,20 +506,25 @@ def getitem(i): | ||||||
|         return X[i], None |         return X[i], None | ||||||
|     return layerize(getitem_fwd) |     return layerize(getitem_fwd) | ||||||
| 
 | 
 | ||||||
| def build_tagger_model(nr_class, token_vector_width, **cfg): | def build_tagger_model(nr_class, **cfg): | ||||||
|     embed_size = util.env_opt('embed_size', 7500) |     embed_size = util.env_opt('embed_size', 7000) | ||||||
|  |     if 'token_vector_width' in cfg: | ||||||
|  |         token_vector_width = cfg['token_vector_width'] | ||||||
|  |     else: | ||||||
|  |         token_vector_width = util.env_opt('token_vector_width', 128) | ||||||
|  |     pretrained_dims = cfg.get('pretrained_dims', 0) | ||||||
|     with Model.define_operators({'>>': chain, '+': add}): |     with Model.define_operators({'>>': chain, '+': add}): | ||||||
|         # Input: (doc, tensor) tuples |         if 'tok2vec' in cfg: | ||||||
|         private_tok2vec = Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats()) |             tok2vec = cfg['tok2vec'] | ||||||
| 
 |         else: | ||||||
|  |             tok2vec = Tok2Vec(token_vector_width, embed_size, | ||||||
|  |                               pretrained_dims=pretrained_dims) | ||||||
|         model = ( |         model = ( | ||||||
|             fine_tune(private_tok2vec) |             tok2vec | ||||||
|             >> with_flatten( |             >> with_flatten(Softmax(nr_class, token_vector_width)) | ||||||
|                 Maxout(token_vector_width, token_vector_width) |  | ||||||
|                 >> Softmax(nr_class, token_vector_width) |  | ||||||
|             ) |  | ||||||
|         ) |         ) | ||||||
|     model.nI = None |     model.nI = None | ||||||
|  |     model.tok2vec = tok2vec | ||||||
|     return model |     return model | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -514,6 +570,7 @@ def foreach(layer, drop_factor=1.0): | ||||||
| 
 | 
 | ||||||
| def build_text_classifier(nr_class, width=64, **cfg): | def build_text_classifier(nr_class, width=64, **cfg): | ||||||
|     nr_vector = cfg.get('nr_vector', 5000) |     nr_vector = cfg.get('nr_vector', 5000) | ||||||
|  |     pretrained_dims = cfg.get('pretrained_dims', 0) | ||||||
|     with Model.define_operators({'>>': chain, '+': add, '|': concatenate, |     with Model.define_operators({'>>': chain, '+': add, '|': concatenate, | ||||||
|                                  '**': clone}): |                                  '**': clone}): | ||||||
|         if cfg.get('low_data'): |         if cfg.get('low_data'): | ||||||
|  | @ -521,7 +578,7 @@ def build_text_classifier(nr_class, width=64, **cfg): | ||||||
|                 SpacyVectors |                 SpacyVectors | ||||||
|                 >> flatten_add_lengths |                 >> flatten_add_lengths | ||||||
|                 >> with_getitem(0, |                 >> with_getitem(0, | ||||||
|                     Affine(width, 300) |                     Affine(width, pretrained_dims) | ||||||
|                 ) |                 ) | ||||||
|                 >> ParametricAttention(width) |                 >> ParametricAttention(width) | ||||||
|                 >> Pooling(sum_pool) |                 >> Pooling(sum_pool) | ||||||
|  | @ -548,18 +605,24 @@ def build_text_classifier(nr_class, width=64, **cfg): | ||||||
|             ) |             ) | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|         static_vectors = ( |         if pretrained_dims: | ||||||
|             SpacyVectors |             static_vectors = ( | ||||||
|             >> with_flatten(Affine(width, 300)) |                 SpacyVectors | ||||||
|         ) |                 >> with_flatten(Affine(width, pretrained_dims)) | ||||||
| 
 |             ) | ||||||
|         cnn_model = ( |  | ||||||
|             # TODO Make concatenate support lists |             # TODO Make concatenate support lists | ||||||
|             concatenate_lists(trained_vectors, static_vectors) |             vectors = concatenate_lists(trained_vectors, static_vectors) | ||||||
|  |             vectors_width = width*2 | ||||||
|  |         else: | ||||||
|  |             vectors = trained_vectors | ||||||
|  |             vectors_width = width | ||||||
|  |             static_vectors = None | ||||||
|  |         cnn_model = ( | ||||||
|  |             vectors | ||||||
|             >> with_flatten( |             >> with_flatten( | ||||||
|                 LN(Maxout(width, width*2)) |                 LN(Maxout(width, vectors_width)) | ||||||
|                 >> Residual( |                 >> Residual( | ||||||
|                     (ExtractWindow(nW=1) >> zero_init(Maxout(width, width*3))) |                     (ExtractWindow(nW=1) >> LN(Maxout(width, width*3))) | ||||||
|                 ) ** 2, pad=2 |                 ) ** 2, pad=2 | ||||||
|             ) |             ) | ||||||
|             >> flatten_add_lengths |             >> flatten_add_lengths | ||||||
|  | @ -579,7 +642,7 @@ def build_text_classifier(nr_class, width=64, **cfg): | ||||||
|             >> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0)) |             >> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0)) | ||||||
|             >> logistic |             >> logistic | ||||||
|         ) |         ) | ||||||
| 
 |     model.nO = nr_class | ||||||
|     model.lsuv = False |     model.lsuv = False | ||||||
|     return model |     return model | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -3,14 +3,15 @@ | ||||||
| # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py | # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py | ||||||
| 
 | 
 | ||||||
| __title__ = 'spacy-nightly' | __title__ = 'spacy-nightly' | ||||||
| __version__ = '2.0.0a13' | __version__ = '2.0.0a16' | ||||||
| __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' | __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' | ||||||
| __uri__ = 'https://spacy.io' | __uri__ = 'https://spacy.io' | ||||||
| __author__ = 'Explosion AI' | __author__ = 'Explosion AI' | ||||||
| __email__ = 'contact@explosion.ai' | __email__ = 'contact@explosion.ai' | ||||||
| __license__ = 'MIT' | __license__ = 'MIT' | ||||||
|  | __release__ = True | ||||||
| 
 | 
 | ||||||
| __docs_models__ = 'https://spacy.io/docs/usage/models' | __docs_models__ = 'https://alpha.spacy.io/usage/models' | ||||||
| __download_url__ = 'https://github.com/explosion/spacy-models/releases/download' | __download_url__ = 'https://github.com/explosion/spacy-models/releases/download' | ||||||
| __compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json' | __compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json' | ||||||
| __shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts.json' | __shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts.json' | ||||||
|  |  | ||||||
|  | @ -1,5 +1,5 @@ | ||||||
| # Reserve 64 values for flag features | # Reserve 64 values for flag features | ||||||
| cpdef enum attr_id_t: | cdef enum attr_id_t: | ||||||
|     NULL_ATTR |     NULL_ATTR | ||||||
|     IS_ALPHA |     IS_ALPHA | ||||||
|     IS_ASCII |     IS_ASCII | ||||||
|  |  | ||||||
|  | @ -94,6 +94,7 @@ IDS = { | ||||||
| 
 | 
 | ||||||
| # ATTR IDs, in order of the symbol | # ATTR IDs, in order of the symbol | ||||||
| NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] | NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] | ||||||
|  | locals().update(IDS) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): | def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): | ||||||
|  |  | ||||||
|  | @ -4,5 +4,6 @@ from .link import link | ||||||
| from .package import package | from .package import package | ||||||
| from .profile import profile | from .profile import profile | ||||||
| from .train import train | from .train import train | ||||||
|  | from .evaluate import evaluate | ||||||
| from .convert import convert | from .convert import convert | ||||||
| from .model import model | from .model import model | ||||||
|  |  | ||||||
|  | @ -14,7 +14,7 @@ from ..util import prints | ||||||
| CONVERTERS = { | CONVERTERS = { | ||||||
|     '.conllu': conllu2json, |     '.conllu': conllu2json, | ||||||
|     '.conll': conllu2json, |     '.conll': conllu2json, | ||||||
|     '.iob': iob2json |     '.iob': iob2json, | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -1,5 +1,6 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
|  | from cytoolz import partition_all, concat | ||||||
| 
 | 
 | ||||||
| from ...compat import json_dumps, path2str | from ...compat import json_dumps, path2str | ||||||
| from ...util import prints | from ...util import prints | ||||||
|  | @ -10,11 +11,9 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k): | ||||||
|     """ |     """ | ||||||
|     Convert IOB files into JSON format for use with train cli. |     Convert IOB files into JSON format for use with train cli. | ||||||
|     """ |     """ | ||||||
|     # TODO: This isn't complete yet -- need to map from IOB to |  | ||||||
|     # BILUO |  | ||||||
|     with input_path.open('r', encoding='utf8') as file_: |     with input_path.open('r', encoding='utf8') as file_: | ||||||
|         docs = read_iob(file_) |         sentences = read_iob(file_) | ||||||
| 
 |     docs = merge_sentences(sentences, n_sents) | ||||||
|     output_filename = input_path.parts[-1].replace(".iob", ".json") |     output_filename = input_path.parts[-1].replace(".iob", ".json") | ||||||
|     output_file = output_path / output_filename |     output_file = output_path / output_filename | ||||||
|     with output_file.open('w', encoding='utf-8') as f: |     with output_file.open('w', encoding='utf-8') as f: | ||||||
|  | @ -23,9 +22,9 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k): | ||||||
|            title="Generated output file %s" % path2str(output_file)) |            title="Generated output file %s" % path2str(output_file)) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def read_iob(file_): | def read_iob(raw_sents): | ||||||
|     sentences = [] |     sentences = [] | ||||||
|     for line in file_: |     for line in raw_sents: | ||||||
|         if not line.strip(): |         if not line.strip(): | ||||||
|             continue |             continue | ||||||
|         tokens = [t.split('|') for t in line.split()] |         tokens = [t.split('|') for t in line.split()] | ||||||
|  | @ -43,3 +42,15 @@ def read_iob(file_): | ||||||
|     paragraphs = [{'sentences': [sent]} for sent in sentences] |     paragraphs = [{'sentences': [sent]} for sent in sentences] | ||||||
|     docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs] |     docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs] | ||||||
|     return docs |     return docs | ||||||
|  | 
 | ||||||
|  | def merge_sentences(docs, n_sents): | ||||||
|  |     counter = 0 | ||||||
|  |     merged = [] | ||||||
|  |     for group in partition_all(n_sents, docs): | ||||||
|  |         group = list(group) | ||||||
|  |         first = group.pop(0) | ||||||
|  |         to_extend = first['paragraphs'][0]['sentences'] | ||||||
|  |         for sent in group: | ||||||
|  |             to_extend.extend(sent['paragraphs'][0]['sentences']) | ||||||
|  |         merged.append(first) | ||||||
|  |     return merged | ||||||
|  |  | ||||||
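For reference, a hypothetical input line in the format read_iob expects: one sentence per line, tokens separated by whitespace, and each token's fields joined with '|':

    I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O Berlin|NNP|B-GPE .|.|O

A sketch of the matching CLI call, assuming the input file uses the .iob extension so the converter table in spacy/cli/convert.py dispatches to iob2json:

    python -m spacy convert /path/to/data.iob /output/dir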
spacy/cli/evaluate.py (new file, 119 lines)
|  | @ -0,0 +1,119 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals, division, print_function | ||||||
|  | 
 | ||||||
|  | import plac | ||||||
|  | import json | ||||||
|  | from collections import defaultdict | ||||||
|  | import cytoolz | ||||||
|  | from pathlib import Path | ||||||
|  | import dill | ||||||
|  | import tqdm | ||||||
|  | from thinc.neural._classes.model import Model | ||||||
|  | from thinc.neural.optimizers import linear_decay | ||||||
|  | from timeit import default_timer as timer | ||||||
|  | import random | ||||||
|  | import numpy.random | ||||||
|  | 
 | ||||||
|  | from ..tokens.doc import Doc | ||||||
|  | from ..scorer import Scorer | ||||||
|  | from ..gold import GoldParse, merge_sents | ||||||
|  | from ..gold import GoldCorpus, minibatch | ||||||
|  | from ..util import prints | ||||||
|  | from .. import util | ||||||
|  | from .. import about | ||||||
|  | from .. import displacy | ||||||
|  | from ..compat import json_dumps | ||||||
|  | 
 | ||||||
|  | random.seed(0) | ||||||
|  | numpy.random.seed(0) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @plac.annotations( | ||||||
|  |     model=("Model name or path", "positional", None, str), | ||||||
|  |     data_path=("Location of JSON-formatted evaluation data", "positional", None, str), | ||||||
|  |     gold_preproc=("Use gold preprocessing", "flag", "G", bool), | ||||||
|  |     gpu_id=("Use GPU", "option", "g", int), | ||||||
|  |     displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str), | ||||||
|  |     displacy_limit=("Limit of parses to render as HTML", "option", "dl", int) | ||||||
|  | ) | ||||||
|  | def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False, | ||||||
|  |              displacy_path=None, displacy_limit=25): | ||||||
|  |     """ | ||||||
|  |     Evaluate a model. To render a sample of parses in an HTML file, set an output | ||||||
|  |     directory as the displacy_path argument. | ||||||
|  |     """ | ||||||
|  |     util.use_gpu(gpu_id) | ||||||
|  |     util.set_env_log(False) | ||||||
|  |     data_path = util.ensure_path(data_path) | ||||||
|  |     displacy_path = util.ensure_path(displacy_path) | ||||||
|  |     if not data_path.exists(): | ||||||
|  |         prints(data_path, title="Evaluation data not found", exits=1) | ||||||
|  |     if displacy_path and not displacy_path.exists(): | ||||||
|  |         prints(displacy_path, title="Visualization output directory not found", exits=1) | ||||||
|  |     corpus = GoldCorpus(data_path, data_path) | ||||||
|  |     nlp = util.load_model(model) | ||||||
|  |     dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc)) | ||||||
|  |     begin = timer() | ||||||
|  |     scorer = nlp.evaluate(dev_docs, verbose=False) | ||||||
|  |     end = timer() | ||||||
|  |     nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) | ||||||
|  |     print_results(scorer, time=end - begin, words=nwords, | ||||||
|  |                   wps=nwords / (end - begin)) | ||||||
|  |     if displacy_path: | ||||||
|  |         docs, golds = zip(*dev_docs) | ||||||
|  |         render_deps = 'parser' in nlp.meta.get('pipeline', []) | ||||||
|  |         render_ents = 'ner' in nlp.meta.get('pipeline', []) | ||||||
|  |         render_parses(docs, displacy_path, model_name=model, limit=displacy_limit, | ||||||
|  |                       deps=render_deps, ents=render_ents) | ||||||
|  |         prints(displacy_path, title="Generated %s parses as HTML" % displacy_limit) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=True): | ||||||
|  |     docs[0].user_data['title'] = model_name | ||||||
|  |     if ents: | ||||||
|  |         with (output_path / 'entities.html').open('w') as file_: | ||||||
|  |             html = displacy.render(docs[:limit], style='ent', page=True) | ||||||
|  |             file_.write(html) | ||||||
|  |     if deps: | ||||||
|  |         with (output_path / 'parses.html').open('w') as file_: | ||||||
|  |             html = displacy.render(docs[:limit], style='dep', page=True, options={'compact': True}) | ||||||
|  |             file_.write(html) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def print_progress(itn, losses, dev_scores, wps=0.0): | ||||||
|  |     scores = {} | ||||||
|  |     for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc', | ||||||
|  |                 'ents_p', 'ents_r', 'ents_f', 'wps']: | ||||||
|  |         scores[col] = 0.0 | ||||||
|  |     scores['dep_loss'] = losses.get('parser', 0.0) | ||||||
|  |     scores['ner_loss'] = losses.get('ner', 0.0) | ||||||
|  |     scores['tag_loss'] = losses.get('tagger', 0.0) | ||||||
|  |     scores.update(dev_scores) | ||||||
|  |     scores['wps'] = wps | ||||||
|  |     tpl = '\t'.join(( | ||||||
|  |         '{:d}', | ||||||
|  |         '{dep_loss:.3f}', | ||||||
|  |         '{ner_loss:.3f}', | ||||||
|  |         '{uas:.3f}', | ||||||
|  |         '{ents_p:.3f}', | ||||||
|  |         '{ents_r:.3f}', | ||||||
|  |         '{ents_f:.3f}', | ||||||
|  |         '{tags_acc:.3f}', | ||||||
|  |         '{token_acc:.3f}', | ||||||
|  |         '{wps:.1f}')) | ||||||
|  |     print(tpl.format(itn, **scores)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def print_results(scorer, time, words, wps): | ||||||
|  |     results = { | ||||||
|  |         'Time': '%.2f s' % time, | ||||||
|  |         'Words': words, | ||||||
|  |         'Words/s': '%.0f' % wps, | ||||||
|  |         'TOK': '%.2f' % scorer.token_acc, | ||||||
|  |         'POS': '%.2f' % scorer.tags_acc, | ||||||
|  |         'UAS': '%.2f' % scorer.uas, | ||||||
|  |         'LAS': '%.2f' % scorer.las, | ||||||
|  |         'NER P': '%.2f' % scorer.ents_p, | ||||||
|  |         'NER R': '%.2f' % scorer.ents_r, | ||||||
|  |         'NER F': '%.2f' % scorer.ents_f} | ||||||
|  |     util.print_table(results, title="Results") | ||||||
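A hypothetical invocation of the new command, assuming a trained model (package name or directory) and a JSON-formatted development set:

    python -m spacy evaluate /path/to/model /path/to/dev.json

Passing a directory through the displacy_path option additionally writes entities.html and/or parses.html there, depending on which components the model's pipeline metadata declares.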
|  | @ -105,8 +105,11 @@ def generate_pipeline(): | ||||||
|            "parser, ner. For more information, see the docs on processing pipelines.", |            "parser, ner. For more information, see the docs on processing pipelines.", | ||||||
|            title="Enter your model's pipeline components") |            title="Enter your model's pipeline components") | ||||||
|     pipeline = util.get_raw_input("Pipeline components", True) |     pipeline = util.get_raw_input("Pipeline components", True) | ||||||
|     replace = {'True': True, 'False': False} |     subs = {'True': True, 'False': False} | ||||||
|     return replace[pipeline] if pipeline in replace else pipeline.split(', ') |     if pipeline in subs: | ||||||
|  |         return subs[pipeline] | ||||||
|  |     else: | ||||||
|  |         return [p.strip() for p in pipeline.split(',')] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def validate_meta(meta, keys): | def validate_meta(meta, keys): | ||||||
|  |  | ||||||
|  | @ -8,8 +8,11 @@ import cytoolz | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| import dill | import dill | ||||||
| import tqdm | import tqdm | ||||||
|  | from thinc.neural._classes.model import Model | ||||||
| from thinc.neural.optimizers import linear_decay | from thinc.neural.optimizers import linear_decay | ||||||
| from timeit import default_timer as timer | from timeit import default_timer as timer | ||||||
|  | import random | ||||||
|  | import numpy.random | ||||||
| 
 | 
 | ||||||
| from ..tokens.doc import Doc | from ..tokens.doc import Doc | ||||||
| from ..scorer import Scorer | from ..scorer import Scorer | ||||||
|  | @ -17,9 +20,13 @@ from ..gold import GoldParse, merge_sents | ||||||
| from ..gold import GoldCorpus, minibatch | from ..gold import GoldCorpus, minibatch | ||||||
| from ..util import prints | from ..util import prints | ||||||
| from .. import util | from .. import util | ||||||
|  | from .. import about | ||||||
| from .. import displacy | from .. import displacy | ||||||
| from ..compat import json_dumps | from ..compat import json_dumps | ||||||
| 
 | 
 | ||||||
|  | random.seed(0) | ||||||
|  | numpy.random.seed(0) | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| @plac.annotations( | @plac.annotations( | ||||||
|     lang=("model language", "positional", None, str), |     lang=("model language", "positional", None, str), | ||||||
|  | @ -29,15 +36,17 @@ from ..compat import json_dumps | ||||||
|     n_iter=("number of iterations", "option", "n", int), |     n_iter=("number of iterations", "option", "n", int), | ||||||
|     n_sents=("number of sentences", "option", "ns", int), |     n_sents=("number of sentences", "option", "ns", int), | ||||||
|     use_gpu=("Use GPU", "option", "g", int), |     use_gpu=("Use GPU", "option", "g", int), | ||||||
|     resume=("Whether to resume training", "flag", "R", bool), |     vectors=("Model to load vectors from", "option", "v"), | ||||||
|     no_tagger=("Don't train tagger", "flag", "T", bool), |     no_tagger=("Don't train tagger", "flag", "T", bool), | ||||||
|     no_parser=("Don't train parser", "flag", "P", bool), |     no_parser=("Don't train parser", "flag", "P", bool), | ||||||
|     no_entities=("Don't train NER", "flag", "N", bool), |     no_entities=("Don't train NER", "flag", "N", bool), | ||||||
|     gold_preproc=("Use gold preprocessing", "flag", "G", bool), |     gold_preproc=("Use gold preprocessing", "flag", "G", bool), | ||||||
|  |     version=("Model version", "option", "V", str), | ||||||
|  |     meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path) | ||||||
| ) | ) | ||||||
| def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, | def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0, | ||||||
|           use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False, |           use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False, | ||||||
|           gold_preproc=False): |           gold_preproc=False, version="0.0.0", meta_path=None): | ||||||
|     """ |     """ | ||||||
|     Train a model. Expects data in spaCy's JSON format. |     Train a model. Expects data in spaCy's JSON format. | ||||||
|     """ |     """ | ||||||
|  | @ -46,19 +55,24 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, | ||||||
|     output_path = util.ensure_path(output_dir) |     output_path = util.ensure_path(output_dir) | ||||||
|     train_path = util.ensure_path(train_data) |     train_path = util.ensure_path(train_data) | ||||||
|     dev_path = util.ensure_path(dev_data) |     dev_path = util.ensure_path(dev_data) | ||||||
|  |     meta_path = util.ensure_path(meta_path) | ||||||
|     if not output_path.exists(): |     if not output_path.exists(): | ||||||
|         output_path.mkdir() |         output_path.mkdir() | ||||||
|     if not train_path.exists(): |     if not train_path.exists(): | ||||||
|         prints(train_path, title="Training data not found", exits=1) |         prints(train_path, title="Training data not found", exits=1) | ||||||
|     if dev_path and not dev_path.exists(): |     if dev_path and not dev_path.exists(): | ||||||
|         prints(dev_path, title="Development data not found", exits=1) |         prints(dev_path, title="Development data not found", exits=1) | ||||||
|  |     if meta_path is not None and not meta_path.exists(): | ||||||
|  |         prints(meta_path, title="meta.json not found", exits=1) | ||||||
|  |     meta = util.read_json(meta_path) if meta_path else {} | ||||||
|  |     if not isinstance(meta, dict): | ||||||
|  |         prints("Expected dict but got: {}".format(type(meta)), | ||||||
|  |                title="Not a valid meta.json format", exits=1) | ||||||
| 
 | 
 | ||||||
|     lang_class = util.get_lang_class(lang) |     pipeline = ['tagger', 'parser', 'ner'] | ||||||
| 
 |     if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger') | ||||||
|     pipeline = ['token_vectors', 'tags', 'dependencies', 'entities'] |     if no_parser and 'parser' in pipeline: pipeline.remove('parser') | ||||||
|     if no_tagger and 'tags' in pipeline: pipeline.remove('tags') |     if no_entities and 'ner' in pipeline: pipeline.remove('ner') | ||||||
|     if no_parser and 'dependencies' in pipeline: pipeline.remove('dependencies') |  | ||||||
|     if no_entities and 'entities' in pipeline: pipeline.remove('entities') |  | ||||||
| 
 | 
 | ||||||
|     # Take dropout and batch size as generators of values -- dropout |     # Take dropout and batch size as generators of values -- dropout | ||||||
|     # starts high and decays sharply, to force the optimizer to explore. |     # starts high and decays sharply, to force the optimizer to explore. | ||||||
|  | @ -68,33 +82,30 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, | ||||||
|                                   util.env_opt('dropout_to', 0.2), |                                   util.env_opt('dropout_to', 0.2), | ||||||
|                                   util.env_opt('dropout_decay', 0.0)) |                                   util.env_opt('dropout_decay', 0.0)) | ||||||
|     batch_sizes = util.compounding(util.env_opt('batch_from', 1), |     batch_sizes = util.compounding(util.env_opt('batch_from', 1), | ||||||
|                                    util.env_opt('batch_to', 64), |                                    util.env_opt('batch_to', 16), | ||||||
|                                    util.env_opt('batch_compound', 1.001)) |                                    util.env_opt('batch_compound', 1.001)) | ||||||
| 
 |  | ||||||
|     if resume: |  | ||||||
|         prints(output_path / 'model9.pickle', title="Resuming training") |  | ||||||
|         nlp = dill.load((output_path / 'model9.pickle').open('rb')) |  | ||||||
|     else: |  | ||||||
|         nlp = lang_class(pipeline=pipeline) |  | ||||||
|     corpus = GoldCorpus(train_path, dev_path, limit=n_sents) |     corpus = GoldCorpus(train_path, dev_path, limit=n_sents) | ||||||
|     n_train_words = corpus.count_train() |     n_train_words = corpus.count_train() | ||||||
| 
 | 
 | ||||||
|  |     lang_class = util.get_lang_class(lang) | ||||||
|  |     nlp = lang_class(pipeline=pipeline) | ||||||
|  |     if vectors: | ||||||
|  |         util.load_model(vectors, vocab=nlp.vocab) | ||||||
|     optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) |     optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) | ||||||
|  |     nlp._optimizer = None | ||||||
| 
 | 
 | ||||||
|     print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %") |     print("Itn.\tP.Loss\tN.Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %") | ||||||
|     try: |     try: | ||||||
|  |         train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0, | ||||||
|  |                                        gold_preproc=gold_preproc, max_length=0) | ||||||
|  |         train_docs = list(train_docs) | ||||||
|         for i in range(n_iter): |         for i in range(n_iter): | ||||||
|             if resume: |  | ||||||
|                 i += 20 |  | ||||||
|             with tqdm.tqdm(total=n_train_words, leave=False) as pbar: |             with tqdm.tqdm(total=n_train_words, leave=False) as pbar: | ||||||
|                 train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0, |  | ||||||
|                                                gold_preproc=gold_preproc, max_length=0) |  | ||||||
|                 losses = {} |                 losses = {} | ||||||
|                 for batch in minibatch(train_docs, size=batch_sizes): |                 for batch in minibatch(train_docs, size=batch_sizes): | ||||||
|                     docs, golds = zip(*batch) |                     docs, golds = zip(*batch) | ||||||
|                     nlp.update(docs, golds, sgd=optimizer, |                     nlp.update(docs, golds, sgd=optimizer, | ||||||
|                                drop=next(dropout_rates), losses=losses, |                                drop=next(dropout_rates), losses=losses) | ||||||
|                                update_shared=True) |  | ||||||
|                     pbar.update(sum(len(doc) for doc in docs)) |                     pbar.update(sum(len(doc) for doc in docs)) | ||||||
| 
 | 
 | ||||||
|             with nlp.use_params(optimizer.averages): |             with nlp.use_params(optimizer.averages): | ||||||
|  | @ -104,12 +115,22 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, | ||||||
|                 nlp_loaded = lang_class(pipeline=pipeline) |                 nlp_loaded = lang_class(pipeline=pipeline) | ||||||
|                 nlp_loaded = nlp_loaded.from_disk(epoch_model_path) |                 nlp_loaded = nlp_loaded.from_disk(epoch_model_path) | ||||||
|                 scorer = nlp_loaded.evaluate( |                 scorer = nlp_loaded.evaluate( | ||||||
|                             corpus.dev_docs( |                             list(corpus.dev_docs( | ||||||
|                                 nlp_loaded, |                                 nlp_loaded, | ||||||
|                                 gold_preproc=gold_preproc)) |                                 gold_preproc=gold_preproc))) | ||||||
|                 acc_loc =(output_path / ('model%d' % i) / 'accuracy.json') |                 acc_loc =(output_path / ('model%d' % i) / 'accuracy.json') | ||||||
|                 with acc_loc.open('w') as file_: |                 with acc_loc.open('w') as file_: | ||||||
|                     file_.write(json_dumps(scorer.scores)) |                     file_.write(json_dumps(scorer.scores)) | ||||||
|  |                 meta_loc = output_path / ('model%d' % i) / 'meta.json' | ||||||
|  |                 meta['accuracy'] = scorer.scores | ||||||
|  |                 meta['lang'] = nlp.lang | ||||||
|  |                 meta['pipeline'] = pipeline | ||||||
|  |                 meta['spacy_version'] = '>=%s' % about.__version__ | ||||||
|  |                 meta.setdefault('name', 'model%d' % i) | ||||||
|  |                 meta.setdefault('version', version) | ||||||
|  | 
 | ||||||
|  |                 with meta_loc.open('w') as file_: | ||||||
|  |                     file_.write(json_dumps(meta)) | ||||||
|                 util.set_env_log(True) |                 util.set_env_log(True) | ||||||
|             print_progress(i, losses, scorer.scores) |             print_progress(i, losses, scorer.scores) | ||||||
|     finally: |     finally: | ||||||
|  | @ -138,12 +159,14 @@ def print_progress(itn, losses, dev_scores, wps=0.0): | ||||||
|                 'ents_p', 'ents_r', 'ents_f', 'wps']: |                 'ents_p', 'ents_r', 'ents_f', 'wps']: | ||||||
|         scores[col] = 0.0 |         scores[col] = 0.0 | ||||||
|     scores['dep_loss'] = losses.get('parser', 0.0) |     scores['dep_loss'] = losses.get('parser', 0.0) | ||||||
|  |     scores['ner_loss'] = losses.get('ner', 0.0) | ||||||
|     scores['tag_loss'] = losses.get('tagger', 0.0) |     scores['tag_loss'] = losses.get('tagger', 0.0) | ||||||
|     scores.update(dev_scores) |     scores.update(dev_scores) | ||||||
|     scores['wps'] = wps |     scores['wps'] = wps | ||||||
|     tpl = '\t'.join(( |     tpl = '\t'.join(( | ||||||
|         '{:d}', |         '{:d}', | ||||||
|         '{dep_loss:.3f}', |         '{dep_loss:.3f}', | ||||||
|  |         '{ner_loss:.3f}', | ||||||
|         '{uas:.3f}', |         '{uas:.3f}', | ||||||
|         '{ents_p:.3f}', |         '{ents_p:.3f}', | ||||||
|         '{ents_r:.3f}', |         '{ents_r:.3f}', | ||||||
|  |  | ||||||
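A sketch of how the reworked training options might be combined; all names and paths below are placeholders. Here -n sets the iteration count, -v loads vectors from another model, -m merges an existing meta.json into the per-epoch output, and -V stamps the model version:

    python -m spacy train en /output /data/train.json /data/dev.json -n 10 -v /path/to/vectors_model -m meta.json -V 0.1.0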
|  | @ -7,6 +7,7 @@ import re | ||||||
| import ujson | import ujson | ||||||
| import random | import random | ||||||
| import cytoolz | import cytoolz | ||||||
|  | import itertools | ||||||
| 
 | 
 | ||||||
| from .syntax import nonproj | from .syntax import nonproj | ||||||
| from .util import ensure_path | from .util import ensure_path | ||||||
|  | @ -146,9 +147,13 @@ def minibatch(items, size=8): | ||||||
|     '''Iterate over batches of items. `size` may be an iterator, |     '''Iterate over batches of items. `size` may be an iterator, | ||||||
|     so that batch-size can vary on each step. |     so that batch-size can vary on each step. | ||||||
|     ''' |     ''' | ||||||
|  |     if isinstance(size, int): | ||||||
|  |         size_ = itertools.repeat(size) | ||||||
|  |     else: | ||||||
|  |         size_ = size | ||||||
|     items = iter(items) |     items = iter(items) | ||||||
|     while True: |     while True: | ||||||
|         batch_size = next(size) #if hasattr(size, '__next__') else size |         batch_size = next(size_) | ||||||
|         batch = list(cytoolz.take(int(batch_size), items)) |         batch = list(cytoolz.take(int(batch_size), items)) | ||||||
|         if len(batch) == 0: |         if len(batch) == 0: | ||||||
|             break |             break | ||||||
|  |  | ||||||
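A small sketch of the two call styles the revised minibatch helper accepts, using toy data; the compounding schedule mirrors the batch-size generator set up in spacy/cli/train.py:

    from spacy.gold import minibatch
    from spacy import util

    items = list(range(10))
    # An int batch size is wrapped in an infinite iterator internally.
    for batch in minibatch(items, size=4):
        print(len(batch))
    # Any iterator of sizes also works, e.g. a compounding schedule.
    for batch in minibatch(items, size=util.compounding(1., 4., 1.5)):
        print(len(batch))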
|  | @ -29,9 +29,9 @@ _units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm | ||||||
|           'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb ' |           'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb ' | ||||||
|           'TB T G M K %') |           'TB T G M K %') | ||||||
| _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$' | _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$' | ||||||
| _punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &' | _punct = r'… …… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ ·' | ||||||
| _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «' | _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «' | ||||||
| _hyphens = '- – — -- ---' | _hyphens = '- – — -- --- —— ~' | ||||||
| _other_symbols = r'[\p{So}]' | _other_symbols = r'[\p{So}]' | ||||||
| 
 | 
 | ||||||
| UNITS = merge_chars(_units) | UNITS = merge_chars(_units) | ||||||
|  |  | ||||||
|  | @ -3,6 +3,7 @@ from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||||
| from .norm_exceptions import NORM_EXCEPTIONS | from .norm_exceptions import NORM_EXCEPTIONS | ||||||
|  | from .punctuation import TOKENIZER_INFIXES | ||||||
| from .tag_map import TAG_MAP | from .tag_map import TAG_MAP | ||||||
| from .stop_words import STOP_WORDS | from .stop_words import STOP_WORDS | ||||||
| from .lemmatizer import LOOKUP | from .lemmatizer import LOOKUP | ||||||
|  | @ -23,6 +24,7 @@ class GermanDefaults(Language.Defaults): | ||||||
|                                          NORM_EXCEPTIONS, BASE_NORMS) |                                          NORM_EXCEPTIONS, BASE_NORMS) | ||||||
| 
 | 
 | ||||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||||
|  |     infixes = tuple(TOKENIZER_INFIXES) | ||||||
|     tag_map = dict(TAG_MAP) |     tag_map = dict(TAG_MAP) | ||||||
|     stop_words = set(STOP_WORDS) |     stop_words = set(STOP_WORDS) | ||||||
|     syntax_iterators = dict(SYNTAX_ITERATORS) |     syntax_iterators = dict(SYNTAX_ITERATORS) | ||||||
|  |  | ||||||
spacy/lang/de/punctuation.py (new file, 20 lines)
|  | @ -0,0 +1,20 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | from ..char_classes import LIST_ELLIPSES, LIST_ICONS | ||||||
|  | from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | _quotes = QUOTES.replace("'", '') | ||||||
|  | 
 | ||||||
|  | _infixes = (LIST_ELLIPSES + LIST_ICONS + | ||||||
|  |             [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), | ||||||
|  |              r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA), | ||||||
|  |              r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA), | ||||||
|  |              r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), | ||||||
|  |              r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes), | ||||||
|  |              r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA), | ||||||
|  |              r'(?<=[0-9])-(?=[0-9])']) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | TOKENIZER_INFIXES = _infixes | ||||||
|  | @ -4,6 +4,7 @@ from __future__ import unicode_literals | ||||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH | ||||||
| from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES | from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES | ||||||
| from .stop_words import STOP_WORDS | from .stop_words import STOP_WORDS | ||||||
|  | from .lex_attrs import LEX_ATTRS | ||||||
| from .lemmatizer import LOOKUP | from .lemmatizer import LOOKUP | ||||||
| from .syntax_iterators import SYNTAX_ITERATORS | from .syntax_iterators import SYNTAX_ITERATORS | ||||||
| 
 | 
 | ||||||
|  | @ -17,6 +18,7 @@ from ...util import update_exc, add_lookups | ||||||
| 
 | 
 | ||||||
| class FrenchDefaults(Language.Defaults): | class FrenchDefaults(Language.Defaults): | ||||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||||
|  |     lex_attr_getters.update(LEX_ATTRS) | ||||||
|     lex_attr_getters[LANG] = lambda text: 'fr' |     lex_attr_getters[LANG] = lambda text: 'fr' | ||||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) |     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
spacy/lang/fr/lex_attrs.py (new file, 41 lines)
|  | @ -0,0 +1,41 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | from ...attrs import LIKE_NUM | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | _num_words = set(""" | ||||||
|  | zero un deux trois quatre cinq six sept huit neuf dix | ||||||
|  | onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf | ||||||
|  | vingt trente quarante cinquante soixante septante quatre-vingt huitante nonante | ||||||
|  | cent mille mil million milliard billion quadrillion quintillion | ||||||
|  | sextillion septillion octillion nonillion decillion | ||||||
|  | """.split()) | ||||||
|  | 
 | ||||||
|  | _ordinal_words = set(""" | ||||||
|  | premier deuxième second troisième quatrième cinquième sixième septième huitième neuvième dixième | ||||||
|  | onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neuvième | ||||||
|  | vingtième trentième quarantième cinquantième soixantième septantième quatre-vingtième huitantième nonantième | ||||||
|  | centième millième millionnième milliardième billionnième quadrillionnième quintillionnième | ||||||
|  | sextillionnième septillionnième octillionnième nonillionnième decillionnième | ||||||
|  | """.split()) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def like_num(text): | ||||||
|  |     # Might require more work? | ||||||
|  |     # See this discussion: https://github.com/explosion/spaCy/pull/1161 | ||||||
|  |     text = text.replace(',', '').replace('.', '') | ||||||
|  |     if text.isdigit(): | ||||||
|  |         return True | ||||||
|  |     if text.count('/') == 1: | ||||||
|  |         num, denom = text.split('/') | ||||||
|  |         if num.isdigit() and denom.isdigit(): | ||||||
|  |             return True | ||||||
|  |     if text in _num_words: | ||||||
|  |         return True | ||||||
|  |     return False | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | LEX_ATTRS = { | ||||||
|  |     LIKE_NUM: like_num | ||||||
|  | } | ||||||
|  | @ -2,6 +2,7 @@ | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from .stop_words import STOP_WORDS | from .stop_words import STOP_WORDS | ||||||
|  | from .lex_attrs import LEX_ATTRS | ||||||
| 
 | 
 | ||||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||||
| from ..norm_exceptions import BASE_NORMS | from ..norm_exceptions import BASE_NORMS | ||||||
|  | @ -12,6 +13,7 @@ from ...util import update_exc, add_lookups | ||||||
| 
 | 
 | ||||||
| class DutchDefaults(Language.Defaults): | class DutchDefaults(Language.Defaults): | ||||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||||
|  |     lex_attr_getters.update(LEX_ATTRS) | ||||||
|     lex_attr_getters[LANG] = lambda text: 'nl' |     lex_attr_getters[LANG] = lambda text: 'nl' | ||||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) |     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
spacy/lang/nl/lex_attrs.py (new file, 40 lines)
|  | @ -0,0 +1,40 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | from ...attrs import LIKE_NUM | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | _num_words = set(""" | ||||||
|  | nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien | ||||||
|  | veertien twintig dertig veertig vijftig zestig zeventig tachtig negentig honderd | ||||||
|  | duizend miljoen miljard biljoen biljard triljoen triljard | ||||||
|  | """.split()) | ||||||
|  | 
 | ||||||
|  | _ordinal_words = set(""" | ||||||
|  | eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde | ||||||
|  | twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste | ||||||
|  | zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste | ||||||
|  | miljardste biljoenste biljardste triljoenste triljardste | ||||||
|  | """.split()) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def like_num(text): | ||||||
|  |     # This only does the most basic check for whether a token is a digit | ||||||
|  |     # or matches one of the number words. In order to handle numbers like | ||||||
|  |     # "drieëntwintig", more work is required. | ||||||
|  |     # See this discussion: https://github.com/explosion/spaCy/pull/1177 | ||||||
|  |     text = text.replace(',', '').replace('.', '') | ||||||
|  |     if text.isdigit(): | ||||||
|  |         return True | ||||||
|  |     if text.count('/') == 1: | ||||||
|  |         num, denom = text.split('/') | ||||||
|  |         if num.isdigit() and denom.isdigit(): | ||||||
|  |             return True | ||||||
|  |     if text in _num_words: | ||||||
|  |         return True | ||||||
|  |     return False | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | LEX_ATTRS = { | ||||||
|  |     LIKE_NUM: like_num | ||||||
|  | } | ||||||
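
A quick sketch of how the new Dutch lex_attrs hook surfaces on tokens once it is registered on the language defaults (see the nl/__init__.py hunk above). It assumes a blank Dutch pipeline and an arbitrary example sentence:

    from spacy.lang.nl import Dutch

    nlp = Dutch()
    doc = nlp(u'Hij heeft drie appels gekocht in 1992')
    # like_num consults the LIKE_NUM getter above, so both the number word
    # 'drie' and the digit string '1992' should come back as number-like.
    print([(token.text, token.like_num) for token in doc])
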
							
								
								
									
										35  spacy/lang/th/__init__.py  Normal file
							|  | @ -0,0 +1,35 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||||
|  | from .tag_map import TAG_MAP | ||||||
|  | from .stop_words import STOP_WORDS | ||||||
|  | 
 | ||||||
|  | from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||||
|  | from ...tokens import Doc | ||||||
|  | from ..norm_exceptions import BASE_NORMS | ||||||
|  | from ...language import Language | ||||||
|  | from ...attrs import LANG, NORM | ||||||
|  | from ...util import update_exc, add_lookups | ||||||
|  | 
 | ||||||
|  | class ThaiDefaults(Language.Defaults): | ||||||
|  |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||||
|  |     lex_attr_getters[LANG] = lambda text: 'th' | ||||||
|  |     tokenizer_exceptions = TOKENIZER_EXCEPTIONS | ||||||
|  |     tag_map = dict(TAG_MAP) | ||||||
|  |     stop_words = set(STOP_WORDS) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class Thai(Language): | ||||||
|  |     lang = 'th' | ||||||
|  |     Defaults = ThaiDefaults | ||||||
|  |     def make_doc(self, text): | ||||||
|  |         try: | ||||||
|  |             from pythainlp.tokenize import word_tokenize | ||||||
|  |         except ImportError: | ||||||
|  |             raise ImportError("The Thai tokenizer requires the PyThaiNLP library: " | ||||||
|  |                               "https://github.com/wannaphongcom/pythainlp/") | ||||||
|  |         words = list(word_tokenize(text, "newmm")) | ||||||
|  |         return Doc(self.vocab, words=words, spaces=[False]*len(words)) | ||||||
|  | 
 | ||||||
|  | __all__ = ['Thai'] | ||||||
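
A minimal usage sketch for the new Thai entry point. It assumes PyThaiNLP is installed, and the exact segmentation depends on its "newmm" tokenizer:

    from spacy.lang.th import Thai

    nlp = Thai()
    # make_doc() defers to pythainlp's word_tokenize, so a missing library
    # raises ImportError with a pointer to PyThaiNLP.
    doc = nlp(u'นี่คือประโยคภาษาไทย')
    print([token.text for token in doc])
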
							
								
								
									
										62  spacy/lang/th/stop_words.py  Normal file
							|  | @ -0,0 +1,62 @@ | ||||||
|  | # encoding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | # data from https://github.com/wannaphongcom/pythainlp/blob/dev/pythainlp/corpus/stopwords-th.txt | ||||||
|  | # stop words as whitespace-separated list | ||||||
|  | STOP_WORDS = set(""" | ||||||
|  | นี้	นํา	นั้น	นัก	นอกจาก	ทุก	ที่สุด	ที่	ทําให้	ทํา	ทาง	ทั้งนี้	ดัง	ซึ่ง	ช่วง	จาก	จัด	จะ	คือ	ความ	ครั้ง	คง	ขึ้น	ของ | ||||||
|  | ขอ	รับ	ระหว่าง	รวม	ยัง	มี	มาก	มา	พร้อม	พบ	ผ่าน	ผล	บาง	น่า	เปิดเผย	เปิด	เนื่องจาก	เดียวกัน	เดียว	เช่น	เฉพาะ	เข้า	ถ้า | ||||||
|  | ถูก	ถึง	ต้อง	ต่างๆ	ต่าง	ต่อ	ตาม	ตั้งแต่	ตั้ง	ด้าน	ด้วย	อีก	อาจ	ออก	อย่าง	อะไร	อยู่	อยาก	หาก	หลาย	หลังจาก	แต่	เอง	เห็น | ||||||
|  | เลย	เริ่ม	เรา	เมื่อ	เพื่อ	เพราะ	เป็นการ	เป็น	หลัง	หรือ	หนึ่ง	ส่วน	ส่ง	สุด	สําหรับ	ว่า	ลง	ร่วม	ราย	ขณะ	ก่อน	ก็	การ	กับ	กัน | ||||||
|  | กว่า	กล่าว	จึง	ไว้	ไป	ได้	ให้	ใน	โดย	แห่ง	แล้ว	และ	แรก	แบบ	ๆ	ทั้ง	วัน	เขา	เคย	ไม่	อยาก	เกิน	เกินๆ	เกี่ยวกัน	เกี่ยวกับ | ||||||
|  | เกี่ยวข้อง	เกี่ยวเนื่อง	เกี่ยวๆ	เกือบ	เกือบจะ	เกือบๆ	แก	แก่	แก้ไข	ใกล้	ใกล้ๆ	ไกล	ไกลๆ	ขณะเดียวกัน	ขณะใด	ขณะใดๆ	ขณะที่	ขณะนั้น	ขณะนี้	ขณะหนึ่ง	ขวาง | ||||||
|  | ขวางๆ	ขั้น	ใคร	ใคร่	ใคร่จะ	ใครๆ	ง่าย	ง่ายๆ	ไง	จง	จด	จน	จนกระทั่ง	จนกว่า	จนขณะนี้	จนตลอด	จนถึง	จนทั่ว	จนบัดนี้	จนเมื่อ	จนแม้	จนแม้น | ||||||
|  | จรด	จรดกับ	จริง	จริงจัง	จริงๆ	จริงๆจังๆ	จวน	จวนจะ	จวนเจียน	จวบ	ซึ่งก็	ซึ่งก็คือ	ซึ่งกัน	ซึ่งกันและกัน	ซึ่งได้แก่	ซึ่งๆ	ณ	ด้วย	ด้วยกัน	ด้วยเช่นกัน	ด้วยที่	ด้วยประการฉะนี้ | ||||||
|  | ด้วยเพราะ	ด้วยว่า	ด้วยเหตุที่	ด้วยเหตุนั้น	ด้วยเหตุนี้	ด้วยเหตุเพราะ	ด้วยเหตุว่า	ด้วยเหมือนกัน	ดั่ง	ดังกล่าว	ดังกับ	ดั่งกับ	ดังกับว่า	ดั่งกับว่า	ดังเก่า | ||||||
|  | ดั่งเก่า	ดังเคย	ใดๆ	ได้	ได้แก่	ได้แต่	ได้ที่	ได้มา	ได้รับ	ตน	ตนเอง	ตนฯ	ตรง	ตรงๆ	ตลอด	ตลอดกาล	ตลอดกาลนาน	ตลอดจน	ตลอดถึง	ตลอดทั้ง | ||||||
|  | ตลอดทั่ว	ตลอดทั่วถึง	ตลอดทั่วทั้ง	ตลอดปี	ตลอดไป	ตลอดมา	ตลอดระยะเวลา	ตลอดวัน	ตลอดเวลา	ตลอดศก	ต่อ	ต่อกัน	ถึงแก่	ถึงจะ	ถึงบัดนั้น	ถึงบัดนี้ | ||||||
|  | ถึงเมื่อ	ถึงเมื่อใด	ถึงเมื่อไร	ถึงแม้	ถึงแม้จะ	ถึงแม้ว่า	ถึงอย่างไร	ถือ	ถือว่า	ถูกต้อง	ถูกๆ	เถอะ	เถิด	ทรง	ทว่า	ทั้งคน	ทั้งตัว	ทั้งที	ทั้งที่	ทั้งนั้น	ทั้งนั้นด้วย	ทั้งนั้นเพราะ | ||||||
|  | นอก	นอกจากที่	นอกจากนั้น	นอกจากนี้	นอกจากว่า	นอกนั้น	นอกเหนือ	นอกเหนือจาก	น้อย	น้อยกว่า	น้อยๆ	นะ	น่ะ	นักๆ	นั่น	นั่นไง	นั่นเป็น	นั่นแหละ | ||||||
|  | นั่นเอง	นั้นๆ	นับ	นับจากนั้น	นับจากนี้	นับตั้งแต่	นับแต่	นับแต่ที่	นับแต่นั้น	เป็นต้น	เป็นต้นไป	เป็นต้นมา	เป็นแต่	เป็นแต่เพียง	เป็นที	เป็นที่	เป็นที่สุด	เป็นเพราะ | ||||||
|  | เป็นเพราะว่า	เป็นเพียง	เป็นเพียงว่า	เป็นเพื่อ	เป็นอัน	เป็นอันมาก	เป็นอันว่า	เป็นอันๆ	เป็นอาทิ	เป็นๆ	เปลี่ยน	เปลี่ยนแปลง	เปิด	เปิดเผย	ไป่	ผ่าน	ผ่านๆ | ||||||
|  | ผิด	ผิดๆ	ผู้	เพียงเพื่อ	เพียงไร	เพียงไหน	เพื่อที่	เพื่อที่จะ	เพื่อว่า	เพื่อให้	ภาค	ภาคฯ	ภาย	ภายใต้	ภายนอก	ภายใน	ภายภาค	ภายภาคหน้า	ภายหน้า	ภายหลัง | ||||||
|  | มอง	มองว่า	มัก	มักจะ	มัน	มันๆ	มั้ย	มั้ยนะ	มั้ยนั่น	มั้ยเนี่ย	มั้ยล่ะ	ยืนนาน	ยืนยง	ยืนยัน	ยืนยาว	เยอะ	เยอะแยะ	เยอะๆ	แยะ	แยะๆ	รวด	รวดเร็ว	ร่วม	รวมกัน	ร่วมกัน | ||||||
|  | รวมด้วย	ร่วมด้วย	รวมถึง	รวมทั้ง	ร่วมมือ	รวมๆ	ระยะ	ระยะๆ	ระหว่าง	รับรอง	รึ	รึว่า	รือ	รือว่า	สิ้นกาลนาน	สืบเนื่อง	สุดๆ	สู่	สูง	สูงกว่า	สูงส่ง	สูงสุด	สูงๆ	เสมือนกับ | ||||||
|  | เสมือนว่า	เสร็จ	เสร็จกัน	เสร็จแล้ว	เสร็จสมบูรณ์	เสร็จสิ้น	เสีย	เสียก่อน	เสียจน	เสียจนกระทั่ง	เสียจนถึง	เสียด้วย	เสียนั่น	เสียนั่นเอง	เสียนี่	เสียนี่กระไร	เสียยิ่ง | ||||||
|  | เสียยิ่งนัก	เสียแล้ว	ใหญ่ๆ	ให้ดี	ให้แด่	ให้ไป	ใหม่	ให้มา	ใหม่ๆ	ไหน	ไหนๆ	อดีต	อนึ่ง	อย่าง	อย่างเช่น	อย่างดี	อย่างเดียว	อย่างใด	อย่างที่	อย่างน้อย	อย่างนั้น | ||||||
|  | อย่างนี้	อย่างโน้น	ก็คือ	ก็แค่	ก็จะ	ก็ดี	ก็ได้	ก็ต่อเมื่อ	ก็ตาม	ก็ตามแต่	ก็ตามที	ก็แล้วแต่	กระทั่ง	กระทำ	กระนั้น	กระผม	กลับ	กล่าวคือ	กลุ่ม	กลุ่มก้อน | ||||||
|  | กลุ่มๆ	กว้าง	กว้างขวาง	กว้างๆ	ก่อนหน้า	ก่อนหน้านี้	ก่อนๆ	กันดีกว่า	กันดีไหม	กันเถอะ	กันนะ	กันและกัน	กันไหม	กันเอง	กำลัง	กำลังจะ	กำหนด	กู	เก็บ | ||||||
|  | เกิด	เกี่ยวข้อง	แก่	แก้ไข	ใกล้	ใกล้ๆ	ข้า	ข้าง	ข้างเคียง	ข้างต้น	ข้างบน	ข้างล่าง	ข้างๆ	ขาด	ข้าพเจ้า	ข้าฯ	เข้าใจ	เขียน	คงจะ	คงอยู่	ครบ	ครบครัน	ครบถ้วน | ||||||
|  | ครั้งกระนั้น	ครั้งก่อน	ครั้งครา	ครั้งคราว	ครั้งใด	ครั้งที่	ครั้งนั้น	ครั้งนี้	ครั้งละ	ครั้งหนึ่ง	ครั้งหลัง	ครั้งหลังสุด	ครั้งไหน	ครั้งๆ	ครัน	ครับ	ครา	คราใด	คราที่	ครานั้น	ครานี้	คราหนึ่ง | ||||||
|  | คราไหน	คราว	คราวก่อน	คราวใด	คราวที่	คราวนั้น	คราวนี้	คราวโน้น	คราวละ	คราวหน้า	คราวหนึ่ง	คราวหลัง	คราวไหน	คราวๆ	คล้าย	คล้ายกัน	คล้ายกันกับ | ||||||
|  | คล้ายกับ	คล้ายกับว่า	คล้ายว่า	ควร	ค่อน	ค่อนข้าง	ค่อนข้างจะ	ค่อยไปทาง	ค่อนมาทาง	ค่อย	ค่อยๆ	คะ	ค่ะ	คำ	คิด	คิดว่า	คุณ	คุณๆ | ||||||
|  | เคยๆ	แค่	แค่จะ	แค่นั้น	แค่นี้	แค่เพียง	แค่ว่า	แค่ไหน	ใคร่	ใคร่จะ	ง่าย	ง่ายๆ	จนกว่า	จนแม้	จนแม้น	จังๆ	จวบกับ	จวบจน	จ้ะ	จ๊ะ	จะได้	จัง	จัดการ	จัดงาน	จัดแจง | ||||||
|  | จัดตั้ง	จัดทำ	จัดหา	จัดให้	จับ	จ้า	จ๋า	จากนั้น	จากนี้ 	จากนี้ไป	จำ	จำเป็น 	จำพวก	จึงจะ	จึงเป็น	จู่ๆ	ฉะนั้น	ฉะนี้	ฉัน	เฉกเช่น	เฉย	เฉยๆ	ไฉน	ช่วงก่อน | ||||||
|  | ช่วงต่อไป	ช่วงถัดไป	ช่วงท้าย	ช่วงที่	ช่วงนั้น	ช่วงนี้	ช่วงระหว่าง	ช่วงแรก	ช่วงหน้า	ช่วงหลัง	ช่วงๆ	ช่วย	ช้า	ช้านาน	ชาว	ช้าๆ	เช่นก่อน	เช่นกัน	เช่นเคย | ||||||
|  | เช่นดัง	เช่นดังก่อน	เช่นดังเก่า	เช่นดังที่	เช่นดังว่า	เช่นเดียวกัน	เช่นเดียวกับ	เช่นใด	เช่นที่	เช่นที่เคย	เช่นที่ว่า	เช่นนั้น	เช่นนั้นเอง	เช่นนี้	เช่นเมื่อ	เช่นไร	เชื่อ | ||||||
|  | เชื่อถือ	เชื่อมั่น	เชื่อว่า	ใช่	ใช่ไหม	ใช้	ซะ	ซะก่อน	ซะจน	ซะจนกระทั่ง	ซะจนถึง	ซึ่งได้แก่	ด้วยกัน	ด้วยเช่นกัน	ด้วยที่	ด้วยเพราะ	ด้วยว่า	ด้วยเหตุที่	ด้วยเหตุนั้น | ||||||
|  | ด้วยเหตุนี้	ด้วยเหตุเพราะ	ด้วยเหตุว่า	ด้วยเหมือนกัน	ดังกล่าว	ดังกับว่า	ดั่งกับว่า	ดังเก่า	ดั่งเก่า	ดั่งเคย	ต่างก็	ต่างหาก	ตามด้วย	ตามแต่	ตามที่ | ||||||
|  | ตามๆ	เต็มไปด้วย	เต็มไปหมด	เต็มๆ	แต่ก็	แต่ก่อน	แต่จะ	แต่เดิม	แต่ต้อง	แต่ถ้า	แต่ทว่า	แต่ที่	แต่นั้น	แต่เพียง	แต่เมื่อ	แต่ไร	แต่ละ	แต่ว่า	แต่ไหน	แต่อย่างใด	โต | ||||||
|  | โตๆ	ใต้	ถ้าจะ	ถ้าหาก	ถึงแก่	ถึงแม้	ถึงแม้จะ	ถึงแม้ว่า	ถึงอย่างไร	ถือว่า	ถูกต้อง	ทว่า	ทั้งนั้นด้วย	ทั้งปวง	ทั้งเป็น	ทั้งมวล	ทั้งสิ้น	ทั้งหมด	ทั้งหลาย	ทั้งๆ	ทัน | ||||||
|  | ทันใดนั้น	ทันที	ทันทีทันใด	ทั่ว	ทำไม	ทำไร	ทำให้	ทำๆ	ที	ที่จริง	ที่ซึ่ง	ทีเดียว	ทีใด	ที่ใด	ที่ได้	ทีเถอะ	ที่แท้	ที่แท้จริง	ที่นั้น	ที่นี้	ทีไร	ทีละ	ที่ละ | ||||||
|  | ที่แล้ว	ที่ว่า	ที่แห่งนั้น	ที่ไหน	ทีๆ	ที่ๆ	ทุกคน	ทุกครั้ง	ทุกครา	ทุกคราว	ทุกชิ้น	ทุกตัว	ทุกทาง	ทุกที	ทุกที่	ทุกเมื่อ	ทุกวัน	ทุกวันนี้	ทุกสิ่ง	ทุกหน	ทุกแห่ง	ทุกอย่าง | ||||||
|  | ทุกอัน	ทุกๆ	เท่า	เท่ากัน	เท่ากับ	เท่าใด	เท่าที่	เท่านั้น	เท่านี้	เท่าไร	เท่าไหร่	แท้	แท้จริง	เธอ	นอกจากว่า	น้อย	น้อยกว่า	น้อยๆ	น่ะ	นั้นไว	นับแต่นี้	นาง | ||||||
|  | นางสาว	น่าจะ	นาน	นานๆ	นาย	นำ	นำพา	นำมา	นิด	นิดหน่อย	นิดๆ	นี่	นี่ไง	นี่นา	นี่แน่ะ	นี่แหละ	นี้แหล่	นี่เอง	นี้เอง	นู่น	นู้น	เน้น	เนี่ย | ||||||
|  | เนี่ยเอง	ในช่วง	ในที่	ในเมื่อ	ในระหว่าง	บน	บอก	บอกแล้ว	บอกว่า	บ่อย	บ่อยกว่า	บ่อยครั้ง	บ่อยๆ	บัดดล	บัดเดี๋ยวนี้	บัดนั้น	บัดนี้	บ้าง	บางกว่า | ||||||
|  | บางขณะ	บางครั้ง	บางครา	บางคราว	บางที	บางที่	บางแห่ง	บางๆ	ปฏิบัติ	ประกอบ	ประการ	ประการฉะนี้	ประการใด	ประการหนึ่ง	ประมาณ	ประสบ	ปรับ | ||||||
|  | ปรากฏ	ปรากฏว่า	ปัจจุบัน	ปิด	เป็นด้วย	เป็นดัง	เป็นต้น	เป็นแต่	เป็นเพื่อ	เป็นอัน	เป็นอันมาก	เป็นอาทิ	ผ่านๆ	ผู้	ผู้ใด	เผื่อ	เผื่อจะ	เผื่อที่	เผื่อว่า	ฝ่าย | ||||||
|  | ฝ่ายใด	พบว่า	พยายาม	พร้อมกัน	พร้อมกับ	พร้อมด้วย	พร้อมทั้ง	พร้อมที่	พร้อมเพียง	พวก	พวกกัน	พวกกู	พวกแก	พวกเขา	พวกคุณ	พวกฉัน	พวกท่าน | ||||||
|  | พวกที่	พวกเธอ	พวกนั้น	พวกนี้	พวกนู้น	พวกโน้น	พวกมัน	พวกมึง	พอ	พอกัน	พอควร	พอจะ	พอดี	พอตัว	พอที	พอที่	พอเพียง	พอแล้ว	พอสม	พอสมควร | ||||||
|  | พอเหมาะ	พอๆ	พา	พึง	พึ่ง	พื้นๆ	พูด	เพราะฉะนั้น	เพราะว่า	เพิ่ง	เพิ่งจะ	เพิ่ม	เพิ่มเติม	เพียง	เพียงแค่	เพียงใด	เพียงแต่	เพียงพอ	เพียงเพราะ | ||||||
|  | เพื่อว่า	เพื่อให้	ภายใต้	มองว่า	มั๊ย	มากกว่า	มากมาย	มิ	มิฉะนั้น	มิใช่	มิได้	มีแต่	มึง	มุ่ง	มุ่งเน้น	มุ่งหมาย	เมื่อก่อน	เมื่อครั้ง	เมื่อครั้งก่อน | ||||||
|  | เมื่อคราวก่อน	เมื่อคราวที่	เมื่อคราว	เมื่อคืน	เมื่อเช้า	เมื่อใด	เมื่อนั้น	เมื่อนี้	เมื่อเย็น	เมื่อไร	เมื่อวันวาน	เมื่อวาน	เมื่อไหร่	แม้	แม้กระทั่ง	แม้แต่	แม้นว่า	แม้ว่า | ||||||
|  | ไม่ค่อย	ไม่ค่อยจะ	ไม่ค่อยเป็น	ไม่ใช่	ไม่เป็นไร	ไม่ว่า	ยก	ยกให้	ยอม	ยอมรับ	ย่อม	ย่อย	ยังคง	ยังงั้น	ยังงี้	ยังโง้น	ยังไง	ยังจะ	ยังแต่	ยาก | ||||||
|  | ยาว	ยาวนาน	ยิ่ง	ยิ่งกว่า	ยิ่งขึ้น	ยิ่งขึ้นไป	ยิ่งจน	ยิ่งจะ	ยิ่งนัก	ยิ่งเมื่อ	ยิ่งแล้ว	ยิ่งใหญ่	ร่วมกัน	รวมด้วย	ร่วมด้วย	รือว่า	เร็ว	เร็วๆ	เราๆ	เรียก	เรียบ	เรื่อย | ||||||
|  | เรื่อยๆ	ไร	ล้วน	ล้วนจน	ล้วนแต่	ละ	ล่าสุด	เล็ก	เล็กน้อย	เล็กๆ	เล่าว่า	แล้วกัน	แล้วแต่	แล้วเสร็จ	วันใด	วันนั้น	วันนี้	วันไหน	สบาย	สมัย	สมัยก่อน | ||||||
|  | สมัยนั้น	สมัยนี้	สมัยโน้น	ส่วนเกิน	ส่วนด้อย	ส่วนดี	ส่วนใด	ส่วนที่	ส่วนน้อย	ส่วนนั้น	ส่วนมาก	ส่วนใหญ่	สั้น	สั้นๆ	สามารถ	สำคัญ	สิ่ง | ||||||
|  | สิ่งใด	สิ่งนั้น	สิ่งนี้	สิ่งไหน	สิ้น	เสร็จแล้ว	เสียด้วย	เสียแล้ว	แสดง	แสดงว่า	หน	หนอ	หนอย	หน่อย	หมด	หมดกัน	หมดสิ้น	หรือไง	หรือเปล่า	หรือไม่	หรือยัง | ||||||
|  | หรือไร	หากแม้	หากแม้น	หากแม้นว่า	หากว่า	หาความ	หาใช่	หารือ	เหตุ	เหตุผล	เหตุนั้น	เหตุนี้	เหตุไร	เห็นแก่	เห็นควร	เห็นจะ	เห็นว่า	เหลือ	เหลือเกิน	เหล่า | ||||||
|  | เหล่านั้น	เหล่านี้	แห่งใด	แห่งนั้น	แห่งนี้	แห่งโน้น	แห่งไหน	แหละ	ให้แก่	ใหญ่	ใหญ่โต	อย่างเช่น	อย่างดี	อย่างเดียว	อย่างใด	อย่างที่	อย่างน้อย	อย่างนั้น	อย่างนี้ | ||||||
|  | อย่างโน้น	อย่างมาก	อย่างยิ่ง	อย่างไร	อย่างไรก็	อย่างไรก็ได้	อย่างไรเสีย	อย่างละ	อย่างหนึ่ง	อย่างไหน	อย่างๆ	อัน	อันจะ	อันใด	อันได้แก่	อันที่ | ||||||
|  | อันที่จริง	อันที่จะ	อันเนื่องมาจาก	อันละ	อันไหน	อันๆ	อาจจะ	อาจเป็น	อาจเป็นด้วย	อื่น	อื่นๆ	เอ็ง	เอา	ฯ	ฯล	ฯลฯ | ||||||
|  | """.split()) | ||||||
							
								
								
									
										81  spacy/lang/th/tag_map.py  Normal file
							|  | @ -0,0 +1,81 @@ | ||||||
|  | # encoding: utf8 | ||||||
|  | # data from Korakot Chaovavanich (https://www.facebook.com/photo.php?fbid=390564854695031&set=p.390564854695031&type=3&permPage=1&ifg=1) | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | from ...symbols import * | ||||||
|  | 
 | ||||||
|  | TAG_MAP = { | ||||||
|  |     #NOUN | ||||||
|  |     "NOUN":     {POS: NOUN}, | ||||||
|  |     "NCMN":     {POS: NOUN}, | ||||||
|  |     "NTTL":     {POS: NOUN}, | ||||||
|  |     "CNIT":     {POS: NOUN}, | ||||||
|  |     "CLTV":     {POS: NOUN}, | ||||||
|  |     "CMTR":     {POS: NOUN}, | ||||||
|  |     "CFQC":     {POS: NOUN}, | ||||||
|  |     "CVBL":     {POS: NOUN}, | ||||||
|  |     #PRON | ||||||
|  |     "PRON":     {POS: PRON}, | ||||||
|  |     "NPRP":     {POS: PRON}, | ||||||
|  |     # ADJ | ||||||
|  |     "ADJ":      {POS: ADJ}, | ||||||
|  |     "NONM":      {POS: ADJ}, | ||||||
|  |     "VATT":      {POS: ADJ}, | ||||||
|  |     "DONM":      {POS: ADJ}, | ||||||
|  |     # ADV | ||||||
|  |     "ADV":      {POS: ADV}, | ||||||
|  |     "ADVN":      {POS: ADV}, | ||||||
|  |     "ADVI":      {POS: ADV}, | ||||||
|  |     "ADVP":      {POS: ADV}, | ||||||
|  |     "ADVS":      {POS: ADV}, | ||||||
|  |     # INTJ | ||||||
|  |     "INT":      {POS: INTJ}, | ||||||
|  |     # PROPN | ||||||
|  |     "PROPN":    {POS: PROPN}, | ||||||
|  |     "PPRS":    {POS: PROPN}, | ||||||
|  |     "PDMN":    {POS: PROPN}, | ||||||
|  |     "PNTR":    {POS: PROPN}, | ||||||
|  |     # DET | ||||||
|  |     "DET":      {POS: DET}, | ||||||
|  |     "DDAN":      {POS: DET}, | ||||||
|  |     "DDAC":      {POS: DET}, | ||||||
|  |     "DDBQ":      {POS: DET}, | ||||||
|  |     "DDAQ":      {POS: DET}, | ||||||
|  |     "DIAC":      {POS: DET}, | ||||||
|  |     "DIBQ":      {POS: DET}, | ||||||
|  |     "DIAQ":      {POS: DET}, | ||||||
|  |     "DCNM":      {POS: DET},  # note: repeated under NUM below; the later entry wins | ||||||
|  |     # NUM | ||||||
|  |     "NUM":      {POS: NUM}, | ||||||
|  |     "NCNM":      {POS: NUM}, | ||||||
|  |     "NLBL":      {POS: NUM}, | ||||||
|  |     "DCNM":      {POS: NUM}, | ||||||
|  |     # AUX | ||||||
|  |     "AUX":      {POS: AUX}, | ||||||
|  |     "XVBM":      {POS: AUX}, | ||||||
|  |     "XVAM":      {POS: AUX}, | ||||||
|  |     "XVMM":      {POS: AUX}, | ||||||
|  |     "XVBB":      {POS: AUX}, | ||||||
|  |     "XVAE":      {POS: AUX}, | ||||||
|  |     # ADP | ||||||
|  |     "ADP":      {POS: ADP}, | ||||||
|  |     "RPRE":      {POS: ADP}, | ||||||
|  |     # CCONJ | ||||||
|  |     "CCONJ":    {POS: CCONJ}, | ||||||
|  |     "JCRG":    {POS: CCONJ}, | ||||||
|  |     # SCONJ | ||||||
|  |     "SCONJ":    {POS: SCONJ}, | ||||||
|  |     "PREL":    {POS: SCONJ}, | ||||||
|  |     "JSBR":    {POS: SCONJ}, | ||||||
|  |     "JCMP":    {POS: SCONJ}, | ||||||
|  |     # PART | ||||||
|  |     "PART":    {POS: PART}, | ||||||
|  |     "FIXN":    {POS: PART}, | ||||||
|  |     "FIXV":    {POS: PART}, | ||||||
|  |     "EAFF":    {POS: PART}, | ||||||
|  |     "AITT":    {POS: PART}, | ||||||
|  |     "NEG":    {POS: PART}, | ||||||
|  |     # PUNCT | ||||||
|  |     "PUNCT":    {POS: PUNCT}, | ||||||
|  |     "PUNC":    {POS: PUNCT} | ||||||
|  | } | ||||||
							
								
								
									
										43  spacy/lang/th/tokenizer_exceptions.py  Normal file
							|  | @ -0,0 +1,43 @@ | ||||||
|  | # encoding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | from ...symbols import * | ||||||
|  | 
 | ||||||
|  | TOKENIZER_EXCEPTIONS = { | ||||||
|  |     "ม.ค.": [ | ||||||
|  |         {ORTH: "ม.ค.", LEMMA: "มกราคม"} | ||||||
|  |     ], | ||||||
|  |     "ก.พ.": [ | ||||||
|  |         {ORTH: "ก.พ.", LEMMA: "กุมภาพันธ์"} | ||||||
|  |     ], | ||||||
|  |     "มี.ค.": [ | ||||||
|  |         {ORTH: "มี.ค.", LEMMA: "มีนาคม"} | ||||||
|  |     ], | ||||||
|  |     "เม.ย.": [ | ||||||
|  |         {ORTH: "เม.ย.", LEMMA: "เมษายน"} | ||||||
|  |     ], | ||||||
|  |     "พ.ค.": [ | ||||||
|  |         {ORTH: "พ.ค.", LEMMA: "พฤษภาคม"} | ||||||
|  |     ], | ||||||
|  |     "มิ.ย.": [ | ||||||
|  |         {ORTH: "มิ.ย.", LEMMA: "มิถุนายน"} | ||||||
|  |     ], | ||||||
|  |     "ก.ค.": [ | ||||||
|  |         {ORTH: "ก.ค.", LEMMA: "กรกฎาคม"} | ||||||
|  |     ], | ||||||
|  |     "ส.ค.": [ | ||||||
|  |         {ORTH: "ส.ค.", LEMMA: "สิงหาคม"} | ||||||
|  |     ], | ||||||
|  |     "ก.ย.": [ | ||||||
|  |         {ORTH: "ก.ย.", LEMMA: "กันยายน"} | ||||||
|  |     ], | ||||||
|  |     "ต.ค.": [ | ||||||
|  |         {ORTH: "ต.ค.", LEMMA: "ตุลาคม"} | ||||||
|  |     ], | ||||||
|  |     "พ.ย.": [ | ||||||
|  |         {ORTH: "พ.ย.", LEMMA: "พฤศจิกายน"} | ||||||
|  |     ], | ||||||
|  |     "ธ.ค.": [ | ||||||
|  |         {ORTH: "ธ.ค.", LEMMA: "ธันวาคม"} | ||||||
|  |     ] | ||||||
|  | } | ||||||
|  | @ -14,8 +14,8 @@ class Chinese(Language): | ||||||
|         except ImportError: |         except ImportError: | ||||||
|             raise ImportError("The Chinese tokenizer requires the Jieba library: " |             raise ImportError("The Chinese tokenizer requires the Jieba library: " | ||||||
|                               "https://github.com/fxsjy/jieba") |                               "https://github.com/fxsjy/jieba") | ||||||
|         words = list(jieba.cut(text, cut_all=True)) |         words = list(jieba.cut(text, cut_all=False)) | ||||||
|         words=[x for x in words if x] |         words = [x for x in words if x] | ||||||
|         return Doc(self.vocab, words=words, spaces=[False]*len(words)) |         return Doc(self.vocab, words=words, spaces=[False]*len(words)) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
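
The zh/__init__.py hunk above switches jieba from full mode (cut_all=True) to accurate mode (cut_all=False). A small standalone sketch of the difference, using jieba directly with an arbitrary example sentence:

    import jieba

    text = u'我来到北京清华大学'
    # Full mode emits every dictionary word it can find, so segments overlap.
    print(list(jieba.cut(text, cut_all=True)))
    # Accurate mode picks a single, non-overlapping segmentation, which is
    # what a Doc with one token per word actually needs.
    print(list(jieba.cut(text, cut_all=False)))
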
|  | @ -34,6 +34,7 @@ from .lang.tag_map import TAG_MAP | ||||||
| from .lang.lex_attrs import LEX_ATTRS | from .lang.lex_attrs import LEX_ATTRS | ||||||
| from . import util | from . import util | ||||||
| from .scorer import Scorer | from .scorer import Scorer | ||||||
|  | from ._ml import link_vectors_to_models | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class BaseDefaults(object): | class BaseDefaults(object): | ||||||
|  | @ -278,8 +279,7 @@ class Language(object): | ||||||
|     def make_doc(self, text): |     def make_doc(self, text): | ||||||
|         return self.tokenizer(text) |         return self.tokenizer(text) | ||||||
| 
 | 
 | ||||||
|     def update(self, docs, golds, drop=0., sgd=None, losses=None, |     def update(self, docs, golds, drop=0., sgd=None, losses=None): | ||||||
|             update_shared=False): |  | ||||||
|         """Update the models in the pipeline. |         """Update the models in the pipeline. | ||||||
| 
 | 
 | ||||||
|         docs (iterable): A batch of `Doc` objects. |         docs (iterable): A batch of `Doc` objects. | ||||||
|  | @ -303,32 +303,17 @@ class Language(object): | ||||||
|             if self._optimizer is None: |             if self._optimizer is None: | ||||||
|                 self._optimizer = Adam(Model.ops, 0.001) |                 self._optimizer = Adam(Model.ops, 0.001) | ||||||
|             sgd = self._optimizer |             sgd = self._optimizer | ||||||
|         tok2vec = self.pipeline[0] |  | ||||||
|         feats = tok2vec.doc2feats(docs) |  | ||||||
|         grads = {} |         grads = {} | ||||||
|         def get_grads(W, dW, key=None): |         def get_grads(W, dW, key=None): | ||||||
|             grads[key] = (W, dW) |             grads[key] = (W, dW) | ||||||
|         pipes = list(self.pipeline[1:]) |         pipes = list(self.pipeline) | ||||||
|         random.shuffle(pipes) |         random.shuffle(pipes) | ||||||
|         tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop) |  | ||||||
|         all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses] |  | ||||||
|         for proc in pipes: |         for proc in pipes: | ||||||
|             if not hasattr(proc, 'update'): |             if not hasattr(proc, 'update'): | ||||||
|                 continue |                 continue | ||||||
|             d_tokvecses = proc.update((docs, tokvecses), golds, |             proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses) | ||||||
|                                       drop=drop, sgd=get_grads, losses=losses) |  | ||||||
|             if update_shared and d_tokvecses is not None: |  | ||||||
|                 for i, d_tv in enumerate(d_tokvecses): |  | ||||||
|                     all_d_tokvecses[i] += d_tv |  | ||||||
|         if update_shared and bp_tokvecses is not None: |  | ||||||
|             bp_tokvecses(all_d_tokvecses, sgd=sgd) |  | ||||||
|         for key, (W, dW) in grads.items(): |         for key, (W, dW) in grads.items(): | ||||||
|             sgd(W, dW, key=key) |             sgd(W, dW, key=key) | ||||||
|         # Clear the tensor variable, to free GPU memory. |  | ||||||
|         # If we don't do this, the memory leak gets pretty |  | ||||||
|         # bad, because we may be holding part of a batch. |  | ||||||
|         for doc in docs: |  | ||||||
|             doc.tensor = None |  | ||||||
| 
 | 
 | ||||||
|     def preprocess_gold(self, docs_golds): |     def preprocess_gold(self, docs_golds): | ||||||
|         """Can be called before training to pre-process gold data. By default, |         """Can be called before training to pre-process gold data. By default, | ||||||
|  | @ -343,36 +328,49 @@ class Language(object): | ||||||
|         for doc, gold in docs_golds: |         for doc, gold in docs_golds: | ||||||
|             yield doc, gold |             yield doc, gold | ||||||
| 
 | 
 | ||||||
|     def begin_training(self, get_gold_tuples, **cfg): |     def resume_training(self, **cfg): | ||||||
|  |         if cfg.get('device', -1) >= 0: | ||||||
|  |             device = util.use_gpu(cfg['device']) | ||||||
|  |             if self.vocab.vectors.data.shape[1] >= 1: | ||||||
|  |                 self.vocab.vectors.data = Model.ops.asarray( | ||||||
|  |                     self.vocab.vectors.data) | ||||||
|  |         else: | ||||||
|  |             device = None | ||||||
|  |         learn_rate = util.env_opt('learn_rate', 0.001) | ||||||
|  |         beta1 = util.env_opt('optimizer_B1', 0.9) | ||||||
|  |         beta2 = util.env_opt('optimizer_B2', 0.999) | ||||||
|  |         eps = util.env_opt('optimizer_eps', 1e-08) | ||||||
|  |         L2 = util.env_opt('L2_penalty', 1e-6) | ||||||
|  |         max_grad_norm = util.env_opt('grad_norm_clip', 1.) | ||||||
|  |         self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1, | ||||||
|  |                               beta2=beta2, eps=eps) | ||||||
|  |         self._optimizer.max_grad_norm = max_grad_norm | ||||||
|  |         self._optimizer.device = device | ||||||
|  |         return self._optimizer | ||||||
|  | 
 | ||||||
|  |     def begin_training(self, get_gold_tuples=None, **cfg): | ||||||
|         """Allocate models, pre-process training data and acquire a trainer and |         """Allocate models, pre-process training data and acquire a trainer and | ||||||
|         optimizer. Used as a contextmanager. |         optimizer. Used as a contextmanager. | ||||||
| 
 | 
 | ||||||
|         gold_tuples (iterable): Gold-standard training data. |         get_gold_tuples (function): Function returning gold data | ||||||
|         **cfg: Config parameters. |         **cfg: Config parameters. | ||||||
|         YIELDS (tuple): A trainer and an optimizer. |         RETURNS: An optimizer. | ||||||
| 
 |  | ||||||
|         EXAMPLE: |  | ||||||
|             >>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer): |  | ||||||
|             >>>    for epoch in trainer.epochs(gold): |  | ||||||
|             >>>        for docs, golds in epoch: |  | ||||||
|             >>>            state = nlp.update(docs, golds, sgd=optimizer) |  | ||||||
|         """ |         """ | ||||||
|         if self.parser: |  | ||||||
|             self.pipeline.append(NeuralLabeller(self.vocab)) |  | ||||||
|         # Populate vocab |         # Populate vocab | ||||||
|         for _, annots_brackets in get_gold_tuples(): |         if get_gold_tuples is not None: | ||||||
|             for annots, _ in annots_brackets: |             for _, annots_brackets in get_gold_tuples(): | ||||||
|                 for word in annots[1]: |                 for annots, _ in annots_brackets: | ||||||
|                     _ = self.vocab[word] |                     for word in annots[1]: | ||||||
|  |                         _ = self.vocab[word] | ||||||
|         contexts = [] |         contexts = [] | ||||||
|         if cfg.get('device', -1) >= 0: |         if cfg.get('device', -1) >= 0: | ||||||
|             import cupy.cuda.device |             device = util.use_gpu(cfg['device']) | ||||||
|             device = cupy.cuda.device.Device(cfg['device']) |             if self.vocab.vectors.data.shape[1] >= 1: | ||||||
|             device.use() |                 self.vocab.vectors.data = Model.ops.asarray( | ||||||
|             Model.ops = CupyOps() |                     self.vocab.vectors.data) | ||||||
|             Model.Ops = CupyOps |  | ||||||
|         else: |         else: | ||||||
|             device = None |             device = None | ||||||
|  |         link_vectors_to_models(self.vocab) | ||||||
|         for proc in self.pipeline: |         for proc in self.pipeline: | ||||||
|             if hasattr(proc, 'begin_training'): |             if hasattr(proc, 'begin_training'): | ||||||
|                 context = proc.begin_training(get_gold_tuples(), |                 context = proc.begin_training(get_gold_tuples(), | ||||||
|  | @ -390,7 +388,7 @@ class Language(object): | ||||||
|         self._optimizer.device = device |         self._optimizer.device = device | ||||||
|         return self._optimizer |         return self._optimizer | ||||||
| 
 | 
 | ||||||
|     def evaluate(self, docs_golds): |     def evaluate(self, docs_golds, verbose=False): | ||||||
|         scorer = Scorer() |         scorer = Scorer() | ||||||
|         docs, golds = zip(*docs_golds) |         docs, golds = zip(*docs_golds) | ||||||
|         docs = list(docs) |         docs = list(docs) | ||||||
|  | @ -403,8 +401,9 @@ class Language(object): | ||||||
|                 docs = list(pipe.pipe(docs)) |                 docs = list(pipe.pipe(docs)) | ||||||
|         assert len(docs) == len(golds) |         assert len(docs) == len(golds) | ||||||
|         for doc, gold in zip(docs, golds): |         for doc, gold in zip(docs, golds): | ||||||
|             scorer.score(doc, gold) |             if verbose: | ||||||
|             doc.tensor = None |                 print(doc) | ||||||
|  |             scorer.score(doc, gold, verbose=verbose) | ||||||
|         return scorer |         return scorer | ||||||
| 
 | 
 | ||||||
|     @contextmanager |     @contextmanager | ||||||
|  | @ -493,7 +492,6 @@ class Language(object): | ||||||
|         """ |         """ | ||||||
|         path = util.ensure_path(path) |         path = util.ensure_path(path) | ||||||
|         serializers = OrderedDict(( |         serializers = OrderedDict(( | ||||||
|             ('vocab', lambda p: self.vocab.to_disk(p)), |  | ||||||
|             ('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)), |             ('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)), | ||||||
|             ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) |             ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) | ||||||
|         )) |         )) | ||||||
|  | @ -505,6 +503,7 @@ class Language(object): | ||||||
|             if not hasattr(proc, 'to_disk'): |             if not hasattr(proc, 'to_disk'): | ||||||
|                 continue |                 continue | ||||||
|             serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False) |             serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False) | ||||||
|  |         serializers['vocab'] = lambda p: self.vocab.to_disk(p) | ||||||
|         util.to_disk(path, serializers, {p: False for p in disable}) |         util.to_disk(path, serializers, {p: False for p in disable}) | ||||||
| 
 | 
 | ||||||
|     def from_disk(self, path, disable=tuple()): |     def from_disk(self, path, disable=tuple()): | ||||||
|  |  | ||||||
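
Taken together, the language.py hunks replace the trainer context manager with an optimizer returned by begin_training(), drop the shared tok2vec bookkeeping from update(), and add resume_training() plus a verbose flag on evaluate(). A rough sketch of the resulting loop; nlp, get_gold_tuples, train_batches and dev_data are placeholders, not names from this diff:

    import random

    optimizer = nlp.begin_training(get_gold_tuples)   # returns an Adam optimizer
    for epoch in range(10):
        random.shuffle(train_batches)                 # list of (docs, golds) batches
        losses = {}
        for docs, golds in train_batches:
            # update() now iterates over the whole pipeline; there is no
            # update_shared argument and no manual tensor clean-up.
            nlp.update(docs, golds, drop=0.2, sgd=optimizer, losses=losses)
        scorer = nlp.evaluate(dev_data, verbose=False)
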
|  | @ -38,7 +38,8 @@ class Lemmatizer(object): | ||||||
|         avoid lemmatization entirely. |         avoid lemmatization entirely. | ||||||
|         """ |         """ | ||||||
|         morphology = {} if morphology is None else morphology |         morphology = {} if morphology is None else morphology | ||||||
|         others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')] |         others = [key for key in morphology | ||||||
|  |                   if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')] | ||||||
|         true_morph_key = morphology.get('morph', 0) |         true_morph_key = morphology.get('morph', 0) | ||||||
|         if univ_pos == 'noun' and morphology.get('Number') == 'sing': |         if univ_pos == 'noun' and morphology.get('Number') == 'sing': | ||||||
|             return True |             return True | ||||||
|  | @ -47,7 +48,9 @@ class Lemmatizer(object): | ||||||
|         # This maps 'VBP' to base form -- probably just need 'IS_BASE' |         # This maps 'VBP' to base form -- probably just need 'IS_BASE' | ||||||
|         # morphology |         # morphology | ||||||
|         elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \ |         elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \ | ||||||
|                                      morphology.get('Tense') == 'pres'): |                                      morphology.get('Tense') == 'pres' and \ | ||||||
|  |                                      morphology.get('Number') is None and \ | ||||||
|  |                                      not others): | ||||||
|             return True |             return True | ||||||
|         elif univ_pos == 'adj' and morphology.get('Degree') == 'pos': |         elif univ_pos == 'adj' and morphology.get('Degree') == 'pos': | ||||||
|             return True |             return True | ||||||
|  |  | ||||||
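
The lemmatizer hunk tightens the base-form shortcut for verbs: besides VerbForm=fin and Tense=pres, the morphology must now carry no Number feature and no other features at all. A hedged illustration, assuming the surrounding method is the lemmatizer's is_base_form() check and using hypothetical inputs not taken from this diff:

    lemmatizer.is_base_form('verb', {'VerbForm': 'fin', 'Tense': 'pres'})
    # -> True: a finite present form with no extra features is left alone
    lemmatizer.is_base_form('verb', {'VerbForm': 'fin', 'Tense': 'pres', 'Number': 'sing'})
    # -> now False: the Number feature forces a proper lemma lookup
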
|  | @ -421,47 +421,69 @@ cdef class PhraseMatcher: | ||||||
|     cdef int max_length |     cdef int max_length | ||||||
|     cdef attr_t* _phrase_key |     cdef attr_t* _phrase_key | ||||||
| 
 | 
 | ||||||
|     def __init__(self, Vocab vocab, phrases, max_length=10): |     cdef public object _callbacks | ||||||
|  |     cdef public object _patterns | ||||||
|  | 
 | ||||||
|  |     def __init__(self, Vocab vocab, max_length=10): | ||||||
|         self.mem = Pool() |         self.mem = Pool() | ||||||
|         self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t)) |         self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t)) | ||||||
|         self.max_length = max_length |         self.max_length = max_length | ||||||
|         self.vocab = vocab |         self.vocab = vocab | ||||||
|         self.matcher = Matcher(self.vocab, {}) |         self.matcher = Matcher(self.vocab) | ||||||
|         self.phrase_ids = PreshMap() |         self.phrase_ids = PreshMap() | ||||||
|         for phrase in phrases: |  | ||||||
|             if len(phrase) < max_length: |  | ||||||
|                 self.add(phrase) |  | ||||||
| 
 |  | ||||||
|         abstract_patterns = [] |         abstract_patterns = [] | ||||||
|         for length in range(1, max_length): |         for length in range(1, max_length): | ||||||
|             abstract_patterns.append([{tag: True} for tag in get_bilou(length)]) |             abstract_patterns.append([{tag: True} for tag in get_bilou(length)]) | ||||||
|         self.matcher.add('Candidate', 'MWE', {}, abstract_patterns, acceptor=self.accept_match) |         self.matcher.add('Candidate', None, *abstract_patterns) | ||||||
|  |         self._callbacks = {} | ||||||
| 
 | 
 | ||||||
|     def add(self, Doc tokens): |     def __len__(self): | ||||||
|         cdef int length = tokens.length |         raise NotImplementedError | ||||||
|         assert length < self.max_length |  | ||||||
|         tags = get_bilou(length) |  | ||||||
|         assert len(tags) == length, length |  | ||||||
| 
 | 
 | ||||||
|  |     def __contains__(self, key): | ||||||
|  |         raise NotImplementedError | ||||||
|  | 
 | ||||||
|  |     def __reduce__(self): | ||||||
|  |         return (self.__class__, (self.vocab,), None, None) | ||||||
|  | 
 | ||||||
|  |     def add(self, key, on_match, *docs): | ||||||
|  |         cdef Doc doc | ||||||
|  |         for doc in docs: | ||||||
|  |             if len(doc) >= self.max_length: | ||||||
|  |                 msg = ( | ||||||
|  |                     "Pattern length (%d) >= phrase_matcher.max_length (%d). " | ||||||
|  |                     "Length can be set on initialization, up to 10." | ||||||
|  |                 ) | ||||||
|  |                 raise ValueError(msg % (len(doc), self.max_length)) | ||||||
|  |         cdef hash_t ent_id = self.matcher._normalize_key(key) | ||||||
|  |         self._callbacks[ent_id] = on_match | ||||||
|  | 
 | ||||||
|  |         cdef int length | ||||||
|         cdef int i |         cdef int i | ||||||
|         for i in range(self.max_length): |         cdef hash_t phrase_hash | ||||||
|             self._phrase_key[i] = 0 |         for doc in docs: | ||||||
|         for i, tag in enumerate(tags): |             length = doc.length | ||||||
|             lexeme = self.vocab[tokens.c[i].lex.orth] |             tags = get_bilou(length) | ||||||
|             lexeme.set_flag(tag, True) |             for i in range(self.max_length): | ||||||
|             self._phrase_key[i] = lexeme.orth |                 self._phrase_key[i] = 0 | ||||||
|         cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0) |             for i, tag in enumerate(tags): | ||||||
|         self.phrase_ids[key] = True |                 lexeme = self.vocab[doc.c[i].lex.orth] | ||||||
|  |                 lexeme.set_flag(tag, True) | ||||||
|  |                 self._phrase_key[i] = lexeme.orth | ||||||
|  |             phrase_hash = hash64(self._phrase_key, | ||||||
|  |                                  self.max_length * sizeof(attr_t), 0) | ||||||
|  |             self.phrase_ids.set(phrase_hash, <void*>ent_id) | ||||||
| 
 | 
 | ||||||
|     def __call__(self, Doc doc): |     def __call__(self, Doc doc): | ||||||
|         matches = [] |         matches = [] | ||||||
|         for ent_id, label, start, end in self.matcher(doc): |         for _, start, end in self.matcher(doc): | ||||||
|             cand = doc[start : end] |             ent_id = self.accept_match(doc, start, end) | ||||||
|             start = cand[0].idx |             if ent_id is not None: | ||||||
|             end = cand[-1].idx + len(cand[-1]) |                 matches.append((ent_id, start, end)) | ||||||
|             matches.append((start, end, cand.root.tag_, cand.text, 'MWE')) |         for i, (ent_id, start, end) in enumerate(matches): | ||||||
|         for match in matches: |             on_match = self._callbacks.get(ent_id) | ||||||
|             doc.merge(*match) |             if on_match is not None: | ||||||
|  |                 on_match(self, doc, i, matches) | ||||||
|         return matches |         return matches | ||||||
| 
 | 
 | ||||||
|     def pipe(self, stream, batch_size=1000, n_threads=2): |     def pipe(self, stream, batch_size=1000, n_threads=2): | ||||||
|  | @ -469,7 +491,7 @@ cdef class PhraseMatcher: | ||||||
|             self(doc) |             self(doc) | ||||||
|             yield doc |             yield doc | ||||||
| 
 | 
 | ||||||
|     def accept_match(self, Doc doc, attr_t ent_id, attr_t label, int start, int end): |     def accept_match(self, Doc doc, int start, int end): | ||||||
|         assert (end - start) < self.max_length |         assert (end - start) < self.max_length | ||||||
|         cdef int i, j |         cdef int i, j | ||||||
|         for i in range(self.max_length): |         for i in range(self.max_length): | ||||||
|  | @ -477,7 +499,8 @@ cdef class PhraseMatcher: | ||||||
|         for i, j in enumerate(range(start, end)): |         for i, j in enumerate(range(start, end)): | ||||||
|             self._phrase_key[i] = doc.c[j].lex.orth |             self._phrase_key[i] = doc.c[j].lex.orth | ||||||
|         cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0) |         cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0) | ||||||
|         if self.phrase_ids.get(key): |         ent_id = <hash_t>self.phrase_ids.get(key) | ||||||
|             return (ent_id, label, start, end) |         if ent_id == 0: | ||||||
|  |             return None | ||||||
|         else: |         else: | ||||||
|             return False |             return ent_id | ||||||
|  |  | ||||||
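
The PhraseMatcher now mirrors the Matcher API: patterns are Doc objects added under a key with an optional on_match callback, and calling the matcher returns (match_id, start, end) tuples instead of merging spans in place. A usage sketch, assuming nlp is an already loaded pipeline:

    from spacy.matcher import PhraseMatcher

    matcher = PhraseMatcher(nlp.vocab)
    patterns = [nlp(text) for text in (u'Barack Obama', u'Angela Merkel')]
    matcher.add('PEOPLE', None, *patterns)        # key, on_match callback, *docs

    doc = nlp(u'Barack Obama met Angela Merkel in Berlin.')
    for match_id, start, end in matcher(doc):
        # match_id is the hash of the key passed to add()
        print(nlp.vocab.strings[match_id], doc[start:end].text)
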
|  | @ -146,6 +146,8 @@ cdef class Morphology: | ||||||
|                 self.add_special_case(tag_str, form_str, attrs) |                 self.add_special_case(tag_str, form_str, attrs) | ||||||
| 
 | 
 | ||||||
|     def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology): |     def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology): | ||||||
|  |         if orth not in self.strings: | ||||||
|  |             return orth | ||||||
|         cdef unicode py_string = self.strings[orth] |         cdef unicode py_string = self.strings[orth] | ||||||
|         if self.lemmatizer is None: |         if self.lemmatizer is None: | ||||||
|             return self.strings.add(py_string.lower()) |             return self.strings.add(py_string.lower()) | ||||||
|  |  | ||||||
|  | @ -4,7 +4,6 @@ | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from thinc.api import chain, layerize, with_getitem | from thinc.api import chain, layerize, with_getitem | ||||||
| from thinc.neural import Model, Softmax |  | ||||||
| import numpy | import numpy | ||||||
| cimport numpy as np | cimport numpy as np | ||||||
| import cytoolz | import cytoolz | ||||||
|  | @ -14,17 +13,18 @@ import ujson | ||||||
| import msgpack | import msgpack | ||||||
| 
 | 
 | ||||||
| from thinc.api import add, layerize, chain, clone, concatenate, with_flatten | from thinc.api import add, layerize, chain, clone, concatenate, with_flatten | ||||||
| from thinc.neural import Model, Maxout, Softmax, Affine | from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU | ||||||
| from thinc.neural._classes.hash_embed import HashEmbed | from thinc.i2v import HashEmbed | ||||||
|  | from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool | ||||||
|  | from thinc.t2t import ExtractWindow, ParametricAttention | ||||||
|  | from thinc.misc import Residual | ||||||
|  | from thinc.misc import BatchNorm as BN | ||||||
|  | from thinc.misc import LayerNorm as LN | ||||||
|  | 
 | ||||||
| from thinc.neural.util import to_categorical | from thinc.neural.util import to_categorical | ||||||
| 
 | 
 | ||||||
| from thinc.neural.pooling import Pooling, max_pool, mean_pool |  | ||||||
| from thinc.neural._classes.difference import Siamese, CauchySimilarity | from thinc.neural._classes.difference import Siamese, CauchySimilarity | ||||||
| 
 | 
 | ||||||
| from thinc.neural._classes.convolution import ExtractWindow |  | ||||||
| from thinc.neural._classes.resnet import Residual |  | ||||||
| from thinc.neural._classes.batchnorm import BatchNorm as BN |  | ||||||
| 
 |  | ||||||
| from .tokens.doc cimport Doc | from .tokens.doc cimport Doc | ||||||
| from .syntax.parser cimport Parser as LinearParser | from .syntax.parser cimport Parser as LinearParser | ||||||
| from .syntax.nn_parser cimport Parser as NeuralParser | from .syntax.nn_parser cimport Parser as NeuralParser | ||||||
|  | @ -41,13 +41,14 @@ from .syntax import nonproj | ||||||
| from .compat import json_dumps | from .compat import json_dumps | ||||||
| 
 | 
 | ||||||
| from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS | from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS | ||||||
| from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats | from ._ml import rebatch, Tok2Vec, flatten | ||||||
| from ._ml import build_text_classifier, build_tagger_model | from ._ml import build_text_classifier, build_tagger_model | ||||||
|  | from ._ml import link_vectors_to_models | ||||||
| from .parts_of_speech import X | from .parts_of_speech import X | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class SentenceSegmenter(object): | class SentenceSegmenter(object): | ||||||
|     '''A simple spaCy hook, to allow custom sentence boundary detection logic |     """A simple spaCy hook, to allow custom sentence boundary detection logic | ||||||
|     (that doesn't require the dependency parse). |     (that doesn't require the dependency parse). | ||||||
| 
 | 
 | ||||||
|     To change the sentence boundary detection strategy, pass a generator |     To change the sentence boundary detection strategy, pass a generator | ||||||
|  | @ -56,7 +57,7 @@ class SentenceSegmenter(object): | ||||||
| 
 | 
 | ||||||
|     Sentence detection strategies should be generators that take `Doc` objects |     Sentence detection strategies should be generators that take `Doc` objects | ||||||
|     and yield `Span` objects for each sentence. |     and yield `Span` objects for each sentence. | ||||||
|     ''' |     """ | ||||||
|     name = 'sbd' |     name = 'sbd' | ||||||
| 
 | 
 | ||||||
|     def __init__(self, vocab, strategy=None): |     def __init__(self, vocab, strategy=None): | ||||||
|  | @ -88,17 +89,30 @@ class BaseThincComponent(object): | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def Model(cls, *shape, **kwargs): |     def Model(cls, *shape, **kwargs): | ||||||
|  |         """Initialize a model for the pipe.""" | ||||||
|         raise NotImplementedError |         raise NotImplementedError | ||||||
| 
 | 
 | ||||||
|     def __init__(self, vocab, model=True, **cfg): |     def __init__(self, vocab, model=True, **cfg): | ||||||
|  |         """Create a new pipe instance.""" | ||||||
|         raise NotImplementedError |         raise NotImplementedError | ||||||
| 
 | 
 | ||||||
|     def __call__(self, doc): |     def __call__(self, doc): | ||||||
|  |         """Apply the pipe to one document. The document is | ||||||
|  |         modified in-place, and returned. | ||||||
|  | 
 | ||||||
|  |         Both __call__ and pipe should delegate to the `predict()` | ||||||
|  |         and `set_annotations()` methods. | ||||||
|  |         """ | ||||||
|         scores = self.predict([doc]) |         scores = self.predict([doc]) | ||||||
|         self.set_annotations([doc], scores) |         self.set_annotations([doc], scores) | ||||||
|         return doc |         return doc | ||||||
| 
 | 
 | ||||||
|     def pipe(self, stream, batch_size=128, n_threads=-1): |     def pipe(self, stream, batch_size=128, n_threads=-1): | ||||||
|  |         """Apply the pipe to a stream of documents. | ||||||
|  | 
 | ||||||
|  |         Both __call__ and pipe should delegate to the `predict()` | ||||||
|  |         and `set_annotations()` methods. | ||||||
|  |         """ | ||||||
|         for docs in cytoolz.partition_all(batch_size, stream): |         for docs in cytoolz.partition_all(batch_size, stream): | ||||||
|             docs = list(docs) |             docs = list(docs) | ||||||
|             scores = self.predict(docs) |             scores = self.predict(docs) | ||||||
|  | @ -106,27 +120,43 @@ class BaseThincComponent(object): | ||||||
|             yield from docs |             yield from docs | ||||||
| 
 | 
 | ||||||
|     def predict(self, docs): |     def predict(self, docs): | ||||||
|  |         """Apply the pipeline's model to a batch of docs, without | ||||||
|  |         modifying them. | ||||||
|  |         """ | ||||||
|         raise NotImplementedError |         raise NotImplementedError | ||||||
| 
 | 
 | ||||||
|     def set_annotations(self, docs, scores): |     def set_annotations(self, docs, scores): | ||||||
|  |         """Modify a batch of documents, using pre-computed scores.""" | ||||||
|         raise NotImplementedError |         raise NotImplementedError | ||||||
| 
 | 
 | ||||||
|     def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None): |     def update(self, docs, golds, drop=0., sgd=None, losses=None): | ||||||
|  |         """Learn from a batch of documents and gold-standard information, | ||||||
|  |         updating the pipe's model. | ||||||
|  | 
 | ||||||
|  |         Delegates to predict() and get_loss(). | ||||||
|  |         """ | ||||||
|         raise NotImplementedError |         raise NotImplementedError | ||||||
| 
 | 
 | ||||||
|     def get_loss(self, docs, golds, scores): |     def get_loss(self, docs, golds, scores): | ||||||
|  |         """Find the loss and gradient of loss for the batch of | ||||||
|  |         documents and their predicted scores.""" | ||||||
|         raise NotImplementedError |         raise NotImplementedError | ||||||
| 
 | 
 | ||||||
|     def begin_training(self, gold_tuples=tuple(), pipeline=None): |     def begin_training(self, gold_tuples=tuple(), pipeline=None): | ||||||
|         token_vector_width = pipeline[0].model.nO |         """Initialize the pipe for training, using data examples if available. | ||||||
|  |         If no model has been initialized yet, the model is added.""" | ||||||
|         if self.model is True: |         if self.model is True: | ||||||
|             self.model = self.Model(1, token_vector_width) |             self.model = self.Model(**self.cfg) | ||||||
|  |         link_vectors_to_models(self.vocab) | ||||||
| 
 | 
 | ||||||
|     def use_params(self, params): |     def use_params(self, params): | ||||||
|  |         """Modify the pipe's model, to use the given parameter values. | ||||||
|  |         """ | ||||||
|         with self.model.use_params(params): |         with self.model.use_params(params): | ||||||
|             yield |             yield | ||||||
| 
 | 
 | ||||||
|     def to_bytes(self, **exclude): |     def to_bytes(self, **exclude): | ||||||
|  |         """Serialize the pipe to a bytestring.""" | ||||||
|         serialize = OrderedDict(( |         serialize = OrderedDict(( | ||||||
|             ('cfg', lambda: json_dumps(self.cfg)), |             ('cfg', lambda: json_dumps(self.cfg)), | ||||||
|             ('model', lambda: self.model.to_bytes()), |             ('model', lambda: self.model.to_bytes()), | ||||||
|  | @ -135,37 +165,42 @@ class BaseThincComponent(object): | ||||||
|         return util.to_bytes(serialize, exclude) |         return util.to_bytes(serialize, exclude) | ||||||
| 
 | 
 | ||||||
|     def from_bytes(self, bytes_data, **exclude): |     def from_bytes(self, bytes_data, **exclude): | ||||||
|  |         """Load the pipe from a bytestring.""" | ||||||
|         def load_model(b): |         def load_model(b): | ||||||
|             if self.model is True: |             if self.model is True: | ||||||
|  |                 self.cfg['pretrained_dims'] = self.vocab.vectors_length | ||||||
|                 self.model = self.Model(**self.cfg) |                 self.model = self.Model(**self.cfg) | ||||||
|             self.model.from_bytes(b) |             self.model.from_bytes(b) | ||||||
| 
 | 
 | ||||||
|         deserialize = OrderedDict(( |         deserialize = OrderedDict(( | ||||||
|             ('cfg', lambda b: self.cfg.update(ujson.loads(b))), |             ('cfg', lambda b: self.cfg.update(ujson.loads(b))), | ||||||
|  |             ('vocab', lambda b: self.vocab.from_bytes(b)), | ||||||
|             ('model', load_model), |             ('model', load_model), | ||||||
|             ('vocab', lambda b: self.vocab.from_bytes(b)) |  | ||||||
|         )) |         )) | ||||||
|         util.from_bytes(bytes_data, deserialize, exclude) |         util.from_bytes(bytes_data, deserialize, exclude) | ||||||
|         return self |         return self | ||||||
| 
 | 
 | ||||||
|     def to_disk(self, path, **exclude): |     def to_disk(self, path, **exclude): | ||||||
|  |         """Serialize the pipe to disk.""" | ||||||
|         serialize = OrderedDict(( |         serialize = OrderedDict(( | ||||||
|             ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))), |             ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))), | ||||||
|  |             ('vocab', lambda p: self.vocab.to_disk(p)), | ||||||
|             ('model', lambda p: p.open('wb').write(self.model.to_bytes())), |             ('model', lambda p: p.open('wb').write(self.model.to_bytes())), | ||||||
|             ('vocab', lambda p: self.vocab.to_disk(p)) |  | ||||||
|         )) |         )) | ||||||
|         util.to_disk(path, serialize, exclude) |         util.to_disk(path, serialize, exclude) | ||||||
| 
 | 
 | ||||||
|     def from_disk(self, path, **exclude): |     def from_disk(self, path, **exclude): | ||||||
|  |         """Load the pipe from disk.""" | ||||||
|         def load_model(p): |         def load_model(p): | ||||||
|             if self.model is True: |             if self.model is True: | ||||||
|  |                 self.cfg['pretrained_dims'] = self.vocab.vectors_length | ||||||
|                 self.model = self.Model(**self.cfg) |                 self.model = self.Model(**self.cfg) | ||||||
|             self.model.from_bytes(p.open('rb').read()) |             self.model.from_bytes(p.open('rb').read()) | ||||||
| 
 | 
 | ||||||
|         deserialize = OrderedDict(( |         deserialize = OrderedDict(( | ||||||
|             ('cfg', lambda p: self.cfg.update(_load_cfg(p))), |             ('cfg', lambda p: self.cfg.update(_load_cfg(p))), | ||||||
|             ('model', load_model), |  | ||||||
|             ('vocab', lambda p: self.vocab.from_disk(p)), |             ('vocab', lambda p: self.vocab.from_disk(p)), | ||||||
|  |             ('model', load_model), | ||||||
|         )) |         )) | ||||||
|         util.from_disk(path, deserialize, exclude) |         util.from_disk(path, deserialize, exclude) | ||||||
|         return self |         return self | ||||||
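
The new docstrings spell out the contract for these pipes: __call__ and pipe delegate to predict() and set_annotations(), and deserialization now restores the vocab before the model. A minimal, hypothetical subclass sketch of the non-training half of that contract (it assumes BaseThincComponent is importable from spacy.pipeline; nothing here is part of this diff):

    from spacy.pipeline import BaseThincComponent

    class TokenCounter(BaseThincComponent):
        """Toy pipe: 'predicts' the token count and stores it on the doc."""
        name = 'token_counter'

        def __init__(self, vocab, model=True, **cfg):
            self.vocab = vocab
            self.model = model
            self.cfg = dict(cfg)

        def predict(self, docs):
            # No real model here; a statistical pipe would call self.model(docs).
            return [len(doc) for doc in docs]

        def set_annotations(self, docs, scores):
            for doc, n_tokens in zip(docs, scores):
                doc.user_data['n_tokens'] = n_tokens
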
|  | @ -193,7 +228,7 @@ class TokenVectorEncoder(BaseThincComponent): | ||||||
|         """ |         """ | ||||||
|         width = util.env_opt('token_vector_width', width) |         width = util.env_opt('token_vector_width', width) | ||||||
|         embed_size = util.env_opt('embed_size', embed_size) |         embed_size = util.env_opt('embed_size', embed_size) | ||||||
|         return Tok2Vec(width, embed_size, preprocess=None) |         return Tok2Vec(width, embed_size, **cfg) | ||||||
| 
 | 
 | ||||||
|     def __init__(self, vocab, model=True, **cfg): |     def __init__(self, vocab, model=True, **cfg): | ||||||
|         """Construct a new statistical model. Weights are not allocated on |         """Construct a new statistical model. Weights are not allocated on | ||||||
|  | @ -210,9 +245,10 @@ class TokenVectorEncoder(BaseThincComponent): | ||||||
|             >>> tok2vec.model = tok2vec.Model(128, 5000) |             >>> tok2vec.model = tok2vec.Model(128, 5000) | ||||||
|         """ |         """ | ||||||
|         self.vocab = vocab |         self.vocab = vocab | ||||||
|         self.doc2feats = doc2feats() |  | ||||||
|         self.model = model |         self.model = model | ||||||
|         self.cfg = dict(cfg) |         self.cfg = dict(cfg) | ||||||
|  |         self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1] | ||||||
|  |         self.cfg.setdefault('cnn_maxout_pieces', 3) | ||||||
| 
 | 
 | ||||||
|     def __call__(self, doc): |     def __call__(self, doc): | ||||||
|         """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM |         """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM | ||||||
|  | @ -245,8 +281,7 @@ class TokenVectorEncoder(BaseThincComponent): | ||||||
|         docs (iterable): A sequence of `Doc` objects. |         docs (iterable): A sequence of `Doc` objects. | ||||||
|         RETURNS (object): Vector representations for each token in the documents. |         RETURNS (object): Vector representations for each token in the documents. | ||||||
|         """ |         """ | ||||||
|         feats = self.doc2feats(docs) |         tokvecs = self.model(docs) | ||||||
|         tokvecs = self.model(feats) |  | ||||||
|         return tokvecs |         return tokvecs | ||||||
| 
 | 
 | ||||||
|     def set_annotations(self, docs, tokvecses): |     def set_annotations(self, docs, tokvecses): | ||||||
|  | @ -270,8 +305,7 @@ class TokenVectorEncoder(BaseThincComponent): | ||||||
|         """ |         """ | ||||||
|         if isinstance(docs, Doc): |         if isinstance(docs, Doc): | ||||||
|             docs = [docs] |             docs = [docs] | ||||||
|         feats = self.doc2feats(docs) |         tokvecs, bp_tokvecs = self.model.begin_update(docs, drop=drop) | ||||||
|         tokvecs, bp_tokvecs = self.model.begin_update(feats, drop=drop) |  | ||||||
|         return tokvecs, bp_tokvecs |         return tokvecs, bp_tokvecs | ||||||
| 
 | 
 | ||||||
|     def get_loss(self, docs, golds, scores): |     def get_loss(self, docs, golds, scores): | ||||||
|  | @ -285,9 +319,10 @@ class TokenVectorEncoder(BaseThincComponent): | ||||||
|         gold_tuples (iterable): Gold-standard training data. |         gold_tuples (iterable): Gold-standard training data. | ||||||
|         pipeline (list): The pipeline the model is part of. |         pipeline (list): The pipeline the model is part of. | ||||||
|         """ |         """ | ||||||
|         self.doc2feats = doc2feats() |  | ||||||
|         if self.model is True: |         if self.model is True: | ||||||
|             self.model = self.Model() |             self.cfg['pretrained_dims'] = self.vocab.vectors_length | ||||||
|  |             self.model = self.Model(**self.cfg) | ||||||
|  |         link_vectors_to_models(self.vocab) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class NeuralTagger(BaseThincComponent): | class NeuralTagger(BaseThincComponent): | ||||||
|  | @ -296,29 +331,29 @@ class NeuralTagger(BaseThincComponent): | ||||||
|         self.vocab = vocab |         self.vocab = vocab | ||||||
|         self.model = model |         self.model = model | ||||||
|         self.cfg = dict(cfg) |         self.cfg = dict(cfg) | ||||||
|  |         self.cfg.setdefault('cnn_maxout_pieces', 2) | ||||||
|  |         self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1]) | ||||||
| 
 | 
 | ||||||
|     def __call__(self, doc): |     def __call__(self, doc): | ||||||
|         tags = self.predict(([doc], [doc.tensor])) |         tags = self.predict([doc]) | ||||||
|         self.set_annotations([doc], tags) |         self.set_annotations([doc], tags) | ||||||
|         return doc |         return doc | ||||||
| 
 | 
 | ||||||
|     def pipe(self, stream, batch_size=128, n_threads=-1): |     def pipe(self, stream, batch_size=128, n_threads=-1): | ||||||
|         for docs in cytoolz.partition_all(batch_size, stream): |         for docs in cytoolz.partition_all(batch_size, stream): | ||||||
|             docs = list(docs) |             docs = list(docs) | ||||||
|             tokvecs = [d.tensor for d in docs] |             tag_ids = self.predict(docs) | ||||||
|             tag_ids = self.predict((docs, tokvecs)) |  | ||||||
|             self.set_annotations(docs, tag_ids) |             self.set_annotations(docs, tag_ids) | ||||||
|             yield from docs |             yield from docs | ||||||
| 
 | 
 | ||||||
|     def predict(self, docs_tokvecs): |     def predict(self, docs): | ||||||
|         scores = self.model(docs_tokvecs) |         scores = self.model(docs) | ||||||
|         scores = self.model.ops.flatten(scores) |         scores = self.model.ops.flatten(scores) | ||||||
|         guesses = scores.argmax(axis=1) |         guesses = scores.argmax(axis=1) | ||||||
|         if not isinstance(guesses, numpy.ndarray): |         if not isinstance(guesses, numpy.ndarray): | ||||||
|             guesses = guesses.get() |             guesses = guesses.get() | ||||||
|         tokvecs = docs_tokvecs[1] |  | ||||||
|         guesses = self.model.ops.unflatten(guesses, |         guesses = self.model.ops.unflatten(guesses, | ||||||
|                     [tv.shape[0] for tv in tokvecs]) |                     [len(d) for d in docs]) | ||||||
|         return guesses |         return guesses | ||||||
| 
 | 
 | ||||||
|     def set_annotations(self, docs, batch_tag_ids): |     def set_annotations(self, docs, batch_tag_ids): | ||||||
|  | @ -338,20 +373,16 @@ class NeuralTagger(BaseThincComponent): | ||||||
|                 idx += 1 |                 idx += 1 | ||||||
|         doc.is_tagged = True |         doc.is_tagged = True | ||||||
| 
 | 
 | ||||||
|     def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): |     def update(self, docs, golds, drop=0., sgd=None, losses=None): | ||||||
|         if losses is not None and self.name not in losses: |         if losses is not None and self.name not in losses: | ||||||
|             losses[self.name] = 0. |             losses[self.name] = 0. | ||||||
|         docs, tokvecs = docs_tokvecs |  | ||||||
| 
 | 
 | ||||||
|         if self.model.nI is None: |         tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop) | ||||||
|             self.model.nI = tokvecs[0].shape[1] |  | ||||||
|         tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop) |  | ||||||
|         loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) |         loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) | ||||||
|  |         bp_tag_scores(d_tag_scores, sgd=sgd) | ||||||
| 
 | 
 | ||||||
|         d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd) |  | ||||||
|         if losses is not None: |         if losses is not None: | ||||||
|             losses[self.name] += loss |             losses[self.name] += loss | ||||||
|         return d_tokvecs |  | ||||||
| 
 | 
 | ||||||
|     def get_loss(self, docs, golds, scores): |     def get_loss(self, docs, golds, scores): | ||||||
|         scores = self.model.ops.flatten(scores) |         scores = self.model.ops.flatten(scores) | ||||||
|  | @ -392,13 +423,14 @@ class NeuralTagger(BaseThincComponent): | ||||||
|             vocab.morphology = Morphology(vocab.strings, new_tag_map, |             vocab.morphology = Morphology(vocab.strings, new_tag_map, | ||||||
|                                           vocab.morphology.lemmatizer, |                                           vocab.morphology.lemmatizer, | ||||||
|                                           exc=vocab.morphology.exc) |                                           exc=vocab.morphology.exc) | ||||||
|         token_vector_width = pipeline[0].model.nO |  | ||||||
|         if self.model is True: |         if self.model is True: | ||||||
|             self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width) |             self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1] | ||||||
|  |             self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) | ||||||
|  |         link_vectors_to_models(self.vocab) | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def Model(cls, n_tags, token_vector_width): |     def Model(cls, n_tags, **cfg): | ||||||
|         return build_tagger_model(n_tags, token_vector_width) |         return build_tagger_model(n_tags, **cfg) | ||||||
| 
 | 
 | ||||||
|     def use_params(self, params): |     def use_params(self, params): | ||||||
|         with self.model.use_params(params): |         with self.model.use_params(params): | ||||||
|  | @ -419,7 +451,7 @@ class NeuralTagger(BaseThincComponent): | ||||||
|             if self.model is True: |             if self.model is True: | ||||||
|                 token_vector_width = util.env_opt('token_vector_width', |                 token_vector_width = util.env_opt('token_vector_width', | ||||||
|                         self.cfg.get('token_vector_width', 128)) |                         self.cfg.get('token_vector_width', 128)) | ||||||
|                 self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width) |                 self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) | ||||||
|             self.model.from_bytes(b) |             self.model.from_bytes(b) | ||||||
| 
 | 
 | ||||||
|         def load_tag_map(b): |         def load_tag_map(b): | ||||||
|  | @ -438,6 +470,7 @@ class NeuralTagger(BaseThincComponent): | ||||||
|         return self |         return self | ||||||
| 
 | 
 | ||||||
|     def to_disk(self, path, **exclude): |     def to_disk(self, path, **exclude): | ||||||
|  |         self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1] | ||||||
|         serialize = OrderedDict(( |         serialize = OrderedDict(( | ||||||
|             ('vocab', lambda p: self.vocab.to_disk(p)), |             ('vocab', lambda p: self.vocab.to_disk(p)), | ||||||
|             ('tag_map', lambda p: p.open('wb').write(msgpack.dumps( |             ('tag_map', lambda p: p.open('wb').write(msgpack.dumps( | ||||||
|  | @ -452,9 +485,7 @@ class NeuralTagger(BaseThincComponent): | ||||||
|     def from_disk(self, path, **exclude): |     def from_disk(self, path, **exclude): | ||||||
|         def load_model(p): |         def load_model(p): | ||||||
|             if self.model is True: |             if self.model is True: | ||||||
|                 token_vector_width = util.env_opt('token_vector_width', |                 self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) | ||||||
|                         self.cfg.get('token_vector_width', 128)) |  | ||||||
|                 self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width) |  | ||||||
|             self.model.from_bytes(p.open('rb').read()) |             self.model.from_bytes(p.open('rb').read()) | ||||||
| 
 | 
 | ||||||
|         def load_tag_map(p): |         def load_tag_map(p): | ||||||
|  | @ -466,10 +497,10 @@ class NeuralTagger(BaseThincComponent): | ||||||
|                 exc=self.vocab.morphology.exc) |                 exc=self.vocab.morphology.exc) | ||||||
| 
 | 
 | ||||||
|         deserialize = OrderedDict(( |         deserialize = OrderedDict(( | ||||||
|  |             ('cfg', lambda p: self.cfg.update(_load_cfg(p))), | ||||||
|             ('vocab', lambda p: self.vocab.from_disk(p)), |             ('vocab', lambda p: self.vocab.from_disk(p)), | ||||||
|             ('tag_map', load_tag_map), |             ('tag_map', load_tag_map), | ||||||
|             ('model', load_model), |             ('model', load_model), | ||||||
|             ('cfg', lambda p: self.cfg.update(_load_cfg(p))) |  | ||||||
|         )) |         )) | ||||||
|         util.from_disk(path, deserialize, exclude) |         util.from_disk(path, deserialize, exclude) | ||||||
|         return self |         return self | ||||||
|  | @ -477,10 +508,28 @@ class NeuralTagger(BaseThincComponent): | ||||||
| 
 | 
 | ||||||
| class NeuralLabeller(NeuralTagger): | class NeuralLabeller(NeuralTagger): | ||||||
|     name = 'nn_labeller' |     name = 'nn_labeller' | ||||||
|     def __init__(self, vocab, model=True, **cfg): |     def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg): | ||||||
|         self.vocab = vocab |         self.vocab = vocab | ||||||
|         self.model = model |         self.model = model | ||||||
|  |         if target == 'dep': | ||||||
|  |             self.make_label = self.make_dep | ||||||
|  |         elif target == 'tag': | ||||||
|  |             self.make_label = self.make_tag | ||||||
|  |         elif target == 'ent': | ||||||
|  |             self.make_label = self.make_ent | ||||||
|  |         elif target == 'dep_tag_offset': | ||||||
|  |             self.make_label = self.make_dep_tag_offset | ||||||
|  |         elif target == 'ent_tag': | ||||||
|  |             self.make_label = self.make_ent_tag | ||||||
|  |         elif hasattr(target, '__call__'): | ||||||
|  |             self.make_label = target | ||||||
|  |         else: | ||||||
|  |             raise ValueError( | ||||||
|  |                 "NeuralLabeller target should be a function or one of " | ||||||
|  |                 "['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']") | ||||||
|         self.cfg = dict(cfg) |         self.cfg = dict(cfg) | ||||||
|  |         self.cfg.setdefault('cnn_maxout_pieces', 2) | ||||||
|  |         self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1]) | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|     def labels(self): |     def labels(self): | ||||||
|  | @ -493,41 +542,79 @@ class NeuralLabeller(NeuralTagger): | ||||||
|     def set_annotations(self, docs, dep_ids): |     def set_annotations(self, docs, dep_ids): | ||||||
|         pass |         pass | ||||||
| 
 | 
 | ||||||
|     def begin_training(self, gold_tuples=tuple(), pipeline=None): |     def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None): | ||||||
|         gold_tuples = nonproj.preprocess_training_data(gold_tuples) |         gold_tuples = nonproj.preprocess_training_data(gold_tuples) | ||||||
|         for raw_text, annots_brackets in gold_tuples: |         for raw_text, annots_brackets in gold_tuples: | ||||||
|             for annots, brackets in annots_brackets: |             for annots, brackets in annots_brackets: | ||||||
|                 ids, words, tags, heads, deps, ents = annots |                 ids, words, tags, heads, deps, ents = annots | ||||||
|                 for dep in deps: |                 for i in range(len(ids)): | ||||||
|                     if dep not in self.labels: |                     label = self.make_label(i, words, tags, heads, deps, ents) | ||||||
|                         self.labels[dep] = len(self.labels) |                     if label is not None and label not in self.labels: | ||||||
|         token_vector_width = pipeline[0].model.nO |                         self.labels[label] = len(self.labels) | ||||||
|  |         # print(len(self.labels))  # stray debug output | ||||||
|         if self.model is True: |         if self.model is True: | ||||||
|             self.model = self.Model(len(self.labels), token_vector_width) |             token_vector_width = util.env_opt('token_vector_width') | ||||||
|  |             self.model = chain( | ||||||
|  |                 tok2vec, | ||||||
|  |                 Softmax(len(self.labels), token_vector_width) | ||||||
|  |             ) | ||||||
|  |         link_vectors_to_models(self.vocab) | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def Model(cls, n_tags, token_vector_width): |     def Model(cls, n_tags, tok2vec=None, **cfg): | ||||||
|         return build_tagger_model(n_tags, token_vector_width) |         return build_tagger_model(n_tags, tok2vec=tok2vec, **cfg) | ||||||
| 
 | 
 | ||||||
|     def get_loss(self, docs, golds, scores): |     def get_loss(self, docs, golds, scores): | ||||||
|         scores = self.model.ops.flatten(scores) |  | ||||||
|         cdef int idx = 0 |         cdef int idx = 0 | ||||||
|         correct = numpy.zeros((scores.shape[0],), dtype='i') |         correct = numpy.zeros((scores.shape[0],), dtype='i') | ||||||
|         guesses = scores.argmax(axis=1) |         guesses = scores.argmax(axis=1) | ||||||
|         for gold in golds: |         for gold in golds: | ||||||
|             for tag in gold.labels: |             for i in range(len(gold.labels)): | ||||||
|                 if tag is None or tag not in self.labels: |                 label = self.make_label(i, gold.words, gold.tags, gold.heads, | ||||||
|  |                                         gold.labels, gold.ents) | ||||||
|  |                 if label is None or label not in self.labels: | ||||||
|                     correct[idx] = guesses[idx] |                     correct[idx] = guesses[idx] | ||||||
|                 else: |                 else: | ||||||
|                     correct[idx] = self.labels[tag] |                     correct[idx] = self.labels[label] | ||||||
|                 idx += 1 |                 idx += 1 | ||||||
|         correct = self.model.ops.xp.array(correct, dtype='i') |         correct = self.model.ops.xp.array(correct, dtype='i') | ||||||
|         d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) |         d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) | ||||||
|         d_scores /= d_scores.shape[0] |         d_scores /= d_scores.shape[0] | ||||||
|         loss = (d_scores**2).sum() |         loss = (d_scores**2).sum() | ||||||
|         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) |  | ||||||
|         return float(loss), d_scores |         return float(loss), d_scores | ||||||
| 
 | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def make_dep(i, words, tags, heads, deps, ents): | ||||||
|  |         if deps[i] is None or heads[i] is None: | ||||||
|  |             return None | ||||||
|  |         return deps[i] | ||||||
|  | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def make_tag(i, words, tags, heads, deps, ents): | ||||||
|  |         return tags[i] | ||||||
|  | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def make_ent(i, words, tags, heads, deps, ents): | ||||||
|  |         if ents is None: | ||||||
|  |             return None | ||||||
|  |         return ents[i] | ||||||
|  | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def make_dep_tag_offset(i, words, tags, heads, deps, ents): | ||||||
|  |         if deps[i] is None or heads[i] is None: | ||||||
|  |             return None | ||||||
|  |         offset = heads[i] - i | ||||||
|  |         offset = min(offset, 2) | ||||||
|  |         offset = max(offset, -2) | ||||||
|  |         return '%s-%s:%d' % (deps[i], tags[i], offset) | ||||||
|  | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def make_ent_tag(i, words, tags, heads, deps, ents): | ||||||
|  |         if ents is None or ents[i] is None: | ||||||
|  |             return None | ||||||
|  |         else: | ||||||
|  |             return '%s-%s' % (tags[i], ents[i]) | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
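Editor's note on the hunk above: each make_* helper maps one token's gold annotations to an auxiliary label string, or None when the annotation is missing, and begin_training collects the distinct strings into self.labels. A minimal standalone sketch of the 'dep_tag_offset' scheme; the toy sentence and its tags, heads and deps below are invented for illustration:

def make_dep_tag_offset(i, words, tags, heads, deps, ents):
    # combine dependency label, fine-grained tag and a head offset clipped to [-2, 2]
    if deps[i] is None or heads[i] is None:
        return None
    offset = max(min(heads[i] - i, 2), -2)
    return '%s-%s:%d' % (deps[i], tags[i], offset)

words = ['She', 'ate', 'pizza']
tags = ['PRP', 'VBD', 'NN']
heads = [1, 1, 1]                      # every token attaches to 'ate'
deps = ['nsubj', 'ROOT', 'dobj']
print([make_dep_tag_offset(i, words, tags, heads, deps, None) for i in range(3)])
# ['nsubj-PRP:1', 'ROOT-VBD:0', 'dobj-NN:-1']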
| class SimilarityHook(BaseThincComponent): | class SimilarityHook(BaseThincComponent): | ||||||
|     """ |     """ | ||||||
|  | @ -555,7 +642,7 @@ class SimilarityHook(BaseThincComponent): | ||||||
|         return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length)) |         return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length)) | ||||||
| 
 | 
 | ||||||
|     def __call__(self, doc): |     def __call__(self, doc): | ||||||
|         '''Install similarity hook''' |         """Install similarity hook""" | ||||||
|         doc.user_hooks['similarity'] = self.predict |         doc.user_hooks['similarity'] = self.predict | ||||||
|         return doc |         return doc | ||||||
| 
 | 
 | ||||||
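For context on the hunk above: installing self.predict under doc.user_hooks['similarity'] makes Doc.similarity defer to the hook. A small sketch of the same mechanism with a dummy scoring function; nlp here is assumed to be any loaded spaCy pipeline, and the scorer is a toy stand-in, not the Siamese model used by SimilarityHook:

def overlap_similarity(doc1, doc2):
    # toy score: shared lowercased tokens over total tokens
    a = set(t.lower_ for t in doc1)
    b = set(t.lower_ for t in doc2)
    return 2.0 * len(a & b) / (len(doc1) + len(doc2))

doc1 = nlp(u'I like pizza')              # nlp: placeholder for a loaded pipeline
doc2 = nlp(u'I like pasta')
doc1.user_hooks['similarity'] = overlap_similarity
print(doc1.similarity(doc2))             # now routed through the hook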
|  | @ -564,15 +651,10 @@ class SimilarityHook(BaseThincComponent): | ||||||
|             yield self(doc) |             yield self(doc) | ||||||
| 
 | 
 | ||||||
|     def predict(self, doc1, doc2): |     def predict(self, doc1, doc2): | ||||||
|         return self.model.predict([(doc1.tensor, doc2.tensor)]) |         return self.model.predict([(doc1, doc2)]) | ||||||
| 
 | 
 | ||||||
|     def update(self, doc1_tensor1_doc2_tensor2, golds, sgd=None, drop=0.): |     def update(self, doc1_doc2, golds, sgd=None, drop=0.): | ||||||
|         doc1s, tensor1s, doc2s, tensor2s = doc1_tensor1_doc2_tensor2 |         sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop) | ||||||
|         sims, bp_sims = self.model.begin_update(zip(tensor1s, tensor2s), |  | ||||||
|                                                 drop=drop) |  | ||||||
|         d_tensor1s, d_tensor2s = bp_sims(golds, sgd=sgd) |  | ||||||
| 
 |  | ||||||
|         return d_tensor1s, d_tensor2s |  | ||||||
| 
 | 
 | ||||||
|     def begin_training(self, _=tuple(), pipeline=None): |     def begin_training(self, _=tuple(), pipeline=None): | ||||||
|         """ |         """ | ||||||
|  | @ -583,6 +665,7 @@ class SimilarityHook(BaseThincComponent): | ||||||
|         """ |         """ | ||||||
|         if self.model is True: |         if self.model is True: | ||||||
|             self.model = self.Model(pipeline[0].model.nO) |             self.model = self.Model(pipeline[0].model.nO) | ||||||
|  |             link_vectors_to_models(self.vocab) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class TextCategorizer(BaseThincComponent): | class TextCategorizer(BaseThincComponent): | ||||||
|  | @ -627,15 +710,13 @@ class TextCategorizer(BaseThincComponent): | ||||||
|             for j, label in enumerate(self.labels): |             for j, label in enumerate(self.labels): | ||||||
|                 doc.cats[label] = float(scores[i, j]) |                 doc.cats[label] = float(scores[i, j]) | ||||||
| 
 | 
 | ||||||
|     def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None): |     def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None): | ||||||
|         docs, tensors = docs_tensors |  | ||||||
|         scores, bp_scores = self.model.begin_update(docs, drop=drop) |         scores, bp_scores = self.model.begin_update(docs, drop=drop) | ||||||
|         loss, d_scores = self.get_loss(docs, golds, scores) |         loss, d_scores = self.get_loss(docs, golds, scores) | ||||||
|         d_tensors = bp_scores(d_scores, sgd=sgd) |         bp_scores(d_scores, sgd=sgd) | ||||||
|         if losses is not None: |         if losses is not None: | ||||||
|             losses.setdefault(self.name, 0.0) |             losses.setdefault(self.name, 0.0) | ||||||
|             losses[self.name] += loss |             losses[self.name] += loss | ||||||
|         return d_tensors |  | ||||||
| 
 | 
 | ||||||
|     def get_loss(self, docs, golds, scores): |     def get_loss(self, docs, golds, scores): | ||||||
|         truths = numpy.zeros((len(golds), len(self.labels)), dtype='f') |         truths = numpy.zeros((len(golds), len(self.labels)), dtype='f') | ||||||
|  | @ -653,8 +734,10 @@ class TextCategorizer(BaseThincComponent): | ||||||
|         else: |         else: | ||||||
|             token_vector_width = 64 |             token_vector_width = 64 | ||||||
|         if self.model is True: |         if self.model is True: | ||||||
|  |             self.cfg['pretrained_dims'] = self.vocab.vectors_length | ||||||
|             self.model = self.Model(len(self.labels), token_vector_width, |             self.model = self.Model(len(self.labels), token_vector_width, | ||||||
|                                     **self.cfg) |                                     **self.cfg) | ||||||
|  |             link_vectors_to_models(self.vocab) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef class EntityRecognizer(LinearParser): | cdef class EntityRecognizer(LinearParser): | ||||||
|  | @ -695,6 +778,14 @@ cdef class NeuralDependencyParser(NeuralParser): | ||||||
|     name = 'parser' |     name = 'parser' | ||||||
|     TransitionSystem = ArcEager |     TransitionSystem = ArcEager | ||||||
| 
 | 
 | ||||||
|  |     def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): | ||||||
|  |         for target in []: | ||||||
|  |             labeller = NeuralLabeller(self.vocab, target=target) | ||||||
|  |             tok2vec = self.model[0] | ||||||
|  |             labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec) | ||||||
|  |             pipeline.append(labeller) | ||||||
|  |             self._multitasks.append(labeller) | ||||||
|  | 
 | ||||||
|     def __reduce__(self): |     def __reduce__(self): | ||||||
|         return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None) |         return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None) | ||||||
| 
 | 
 | ||||||
|  | @ -705,13 +796,13 @@ cdef class NeuralEntityRecognizer(NeuralParser): | ||||||
| 
 | 
 | ||||||
|     nr_feature = 6 |     nr_feature = 6 | ||||||
| 
 | 
 | ||||||
|     def predict_confidences(self, docs): |     def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): | ||||||
|         tensors = [d.tensor for d in docs] |         for target in []: | ||||||
|         samples = [] |             labeller = NeuralLabeller(self.vocab, target=target) | ||||||
|         for i in range(10): |             tok2vec = self.model[0] | ||||||
|             states = self.parse_batch(docs, tensors, drop=0.3) |             labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec) | ||||||
|             for state in states: |             pipeline.append(labeller) | ||||||
|                 samples.append(self._get_entities(state)) |             self._multitasks.append(labeller) | ||||||
| 
 | 
 | ||||||
|     def __reduce__(self): |     def __reduce__(self): | ||||||
|         return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None) |         return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None) | ||||||
|  |  | ||||||
|  | @ -1,4 +1,4 @@ | ||||||
| cpdef enum symbol_t: | cdef enum symbol_t: | ||||||
|     NIL |     NIL | ||||||
|     IS_ALPHA |     IS_ALPHA | ||||||
|     IS_ASCII |     IS_ASCII | ||||||
|  |  | ||||||
|  | @ -1,4 +1,6 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
|  | #cython: optimize.unpack_method_calls=False | ||||||
|  | 
 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| IDS = { | IDS = { | ||||||
|  | @ -458,4 +460,11 @@ IDS = { | ||||||
|     "xcomp": xcomp |     "xcomp": xcomp | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| NAMES = [it[0] for it in sorted(IDS.items(), key=lambda it: it[1])] | def sort_nums(x): | ||||||
|  |     return x[1] | ||||||
|  | 
 | ||||||
|  | NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)] | ||||||
|  | # Unfortunate hack here, to work around a problem with the long cpdef enum | ||||||
|  | # (which generates an enormous amount of C++ in Cython 0.24+). | ||||||
|  | # We keep the enum cdef, and just make sure the names are available to Python. | ||||||
|  | locals().update(IDS) | ||||||
|  |  | ||||||
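A quick illustration of the bookkeeping the comment above describes, using a made-up three-entry table rather than spaCy's real symbol enum:

IDS = {'NIL': 0, 'IS_ALPHA': 1, 'IS_ASCII': 2}   # stand-in for the generated enum values

def sort_nums(x):
    return x[1]

# NAMES[i] recovers the name for symbol ID i because entries are sorted by value
NAMES = [name for name, _ in sorted(IDS.items(), key=sort_nums)]
assert NAMES[IDS['IS_ASCII']] == 'IS_ASCII'

# the locals().update(IDS) line in the hunk above does the same thing at module
# import time: each symbol name becomes an integer constant visible from Python
globals().update(IDS)
assert IS_ALPHA == 1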
|  | @ -147,10 +147,10 @@ def get_token_ids(states, int n_tokens): | ||||||
| 
 | 
 | ||||||
| nr_update = 0 | nr_update = 0 | ||||||
| def update_beam(TransitionSystem moves, int nr_feature, int max_steps, | def update_beam(TransitionSystem moves, int nr_feature, int max_steps, | ||||||
|                 states, tokvecs, golds, |                 states, golds, | ||||||
|                 state2vec, vec2scores,  |                 state2vec, vec2scores,  | ||||||
|                 int width, float density, |                 int width, float density, | ||||||
|                 sgd=None, losses=None, drop=0.): |                 losses=None, drop=0.): | ||||||
|     global nr_update |     global nr_update | ||||||
|     cdef MaxViolation violn |     cdef MaxViolation violn | ||||||
|     nr_update += 1 |     nr_update += 1 | ||||||
|  |  | ||||||
|  | @ -101,9 +101,10 @@ cdef cppclass StateC: | ||||||
|         elif n == 6: |         elif n == 6: | ||||||
|             if this.B(0) >= 0: |             if this.B(0) >= 0: | ||||||
|                 ids[0] = this.B(0) |                 ids[0] = this.B(0) | ||||||
|  |                 ids[1] = this.B(0)-1 | ||||||
|             else: |             else: | ||||||
|                 ids[0] = -1 |                 ids[0] = -1 | ||||||
|             ids[1] = this.B(0) |                 ids[1] = -1 | ||||||
|             ids[2] = this.B(1) |             ids[2] = this.B(1) | ||||||
|             ids[3] = this.E(0) |             ids[3] = this.E(0) | ||||||
|             if ids[3] >= 1: |             if ids[3] >= 1: | ||||||
|  | @ -120,6 +121,8 @@ cdef cppclass StateC: | ||||||
|         for i in range(n): |         for i in range(n): | ||||||
|             if ids[i] >= 0: |             if ids[i] >= 0: | ||||||
|                 ids[i] += this.offset |                 ids[i] += this.offset | ||||||
|  |             else: | ||||||
|  |                 ids[i] = -1 | ||||||
| 
 | 
 | ||||||
|     int S(int i) nogil const: |     int S(int i) nogil const: | ||||||
|         if i >= this._s_i: |         if i >= this._s_i: | ||||||
|  | @ -162,9 +165,9 @@ cdef cppclass StateC: | ||||||
| 
 | 
 | ||||||
|     int E(int i) nogil const: |     int E(int i) nogil const: | ||||||
|         if this._e_i <= 0 or this._e_i >= this.length: |         if this._e_i <= 0 or this._e_i >= this.length: | ||||||
|             return 0 |             return -1 | ||||||
|         if i < 0 or i >= this._e_i: |         if i < 0 or i >= this._e_i: | ||||||
|             return 0 |             return -1 | ||||||
|         return this._ents[this._e_i - (i+1)].start |         return this._ents[this._e_i - (i+1)].start | ||||||
| 
 | 
 | ||||||
|     int L(int i, int idx) nogil const: |     int L(int i, int idx) nogil const: | ||||||
|  |  | ||||||
|  | @ -161,8 +161,7 @@ cdef class BiluoPushDown(TransitionSystem): | ||||||
|     cdef Transition lookup_transition(self, object name) except *: |     cdef Transition lookup_transition(self, object name) except *: | ||||||
|         cdef attr_t label |         cdef attr_t label | ||||||
|         if name == '-' or name == None: |         if name == '-' or name == None: | ||||||
|             move_str = 'M' |             return Transition(clas=0, move=MISSING, label=0, score=0) | ||||||
|             label = 0 |  | ||||||
|         elif name == '!O': |         elif name == '!O': | ||||||
|             return Transition(clas=0, move=ISNT, label=0, score=0) |             return Transition(clas=0, move=ISNT, label=0, score=0) | ||||||
|         elif '-' in name: |         elif '-' in name: | ||||||
|  | @ -220,6 +219,31 @@ cdef class BiluoPushDown(TransitionSystem): | ||||||
|             raise Exception(move) |             raise Exception(move) | ||||||
|         return t |         return t | ||||||
| 
 | 
 | ||||||
|  |     #def add_action(self, int action, label_name): | ||||||
|  |     #    cdef attr_t label_id | ||||||
|  |     #    if not isinstance(label_name, (int, long)): | ||||||
|  |     #        label_id = self.strings.add(label_name) | ||||||
|  |     #    else: | ||||||
|  |     #        label_id = label_name | ||||||
|  |     #    if action == OUT and label_id != 0: | ||||||
|  |     #        return | ||||||
|  |     #    if action == MISSING or action == ISNT: | ||||||
|  |     #        return | ||||||
|  |     #    # Check we're not creating a move we already have, so that this is | ||||||
|  |     #    # idempotent | ||||||
|  |     #    for trans in self.c[:self.n_moves]: | ||||||
|  |     #        if trans.move == action and trans.label == label_id: | ||||||
|  |     #            return 0 | ||||||
|  |     #    if self.n_moves >= self._size: | ||||||
|  |     #        self._size *= 2 | ||||||
|  |     #        self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0])) | ||||||
|  |     #    self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id) | ||||||
|  |     #    assert self.c[self.n_moves].label == label_id | ||||||
|  |     #    self.n_moves += 1 | ||||||
|  |     #    return 1 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|     cdef int initialize_state(self, StateC* st) nogil: |     cdef int initialize_state(self, StateC* st) nogil: | ||||||
|         # This is especially necessary when we use limited training data. |         # This is especially necessary when we use limited training data. | ||||||
|         for i in range(st.length): |         for i in range(st.length): | ||||||
|  |  | ||||||
|  | @ -13,6 +13,7 @@ cdef class Parser: | ||||||
|     cdef public object model |     cdef public object model | ||||||
|     cdef readonly TransitionSystem moves |     cdef readonly TransitionSystem moves | ||||||
|     cdef readonly object cfg |     cdef readonly object cfg | ||||||
|  |     cdef public object _multitasks | ||||||
| 
 | 
 | ||||||
|     cdef void _parse_step(self, StateC* state, |     cdef void _parse_step(self, StateC* state, | ||||||
|             const float* feat_weights, |             const float* feat_weights, | ||||||
|  |  | ||||||
|  | @ -7,6 +7,7 @@ from __future__ import unicode_literals, print_function | ||||||
| 
 | 
 | ||||||
| from collections import Counter, OrderedDict | from collections import Counter, OrderedDict | ||||||
| import ujson | import ujson | ||||||
|  | import json | ||||||
| import contextlib | import contextlib | ||||||
| 
 | 
 | ||||||
| from libc.math cimport exp | from libc.math cimport exp | ||||||
|  | @ -37,10 +38,9 @@ from preshed.maps cimport MapStruct | ||||||
| from preshed.maps cimport map_get | from preshed.maps cimport map_get | ||||||
| 
 | 
 | ||||||
| from thinc.api import layerize, chain, noop, clone, with_flatten | from thinc.api import layerize, chain, noop, clone, with_flatten | ||||||
| from thinc.neural import Model, Affine, ReLu, Maxout | from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU | ||||||
| from thinc.neural._classes.batchnorm import BatchNorm as BN | from thinc.misc import LayerNorm | ||||||
| from thinc.neural._classes.selu import SELU | 
 | ||||||
| from thinc.neural._classes.layernorm import LayerNorm |  | ||||||
| from thinc.neural.ops import NumpyOps, CupyOps | from thinc.neural.ops import NumpyOps, CupyOps | ||||||
| from thinc.neural.util import get_array_module | from thinc.neural.util import get_array_module | ||||||
| 
 | 
 | ||||||
|  | @ -48,7 +48,8 @@ from .. import util | ||||||
| from ..util import get_async, get_cuda_stream | from ..util import get_async, get_cuda_stream | ||||||
| from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts | from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts | ||||||
| from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune | from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune | ||||||
| from .._ml import Residual, drop_layer | from .._ml import Residual, drop_layer, flatten | ||||||
|  | from .._ml import link_vectors_to_models | ||||||
| from ..compat import json_dumps | from ..compat import json_dumps | ||||||
| 
 | 
 | ||||||
| from . import _parse_features | from . import _parse_features | ||||||
|  | @ -238,14 +239,15 @@ cdef class Parser: | ||||||
|     Base class of the DependencyParser and EntityRecognizer. |     Base class of the DependencyParser and EntityRecognizer. | ||||||
|     """ |     """ | ||||||
|     @classmethod |     @classmethod | ||||||
|     def Model(cls, nr_class, token_vector_width=128, hidden_width=300, depth=1, **cfg): |     def Model(cls, nr_class, token_vector_width=128, hidden_width=200, depth=1, **cfg): | ||||||
|         depth = util.env_opt('parser_hidden_depth', depth) |         depth = util.env_opt('parser_hidden_depth', depth) | ||||||
|         token_vector_width = util.env_opt('token_vector_width', token_vector_width) |         token_vector_width = util.env_opt('token_vector_width', token_vector_width) | ||||||
|         hidden_width = util.env_opt('hidden_width', hidden_width) |         hidden_width = util.env_opt('hidden_width', hidden_width) | ||||||
|         parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2) |         parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2) | ||||||
|         embed_size = util.env_opt('embed_size', 4000) |         embed_size = util.env_opt('embed_size', 7000) | ||||||
|         tensors = fine_tune(Tok2Vec(token_vector_width, embed_size, |         tok2vec = Tok2Vec(token_vector_width, embed_size, | ||||||
|                                     preprocess=doc2feats())) |                           pretrained_dims=cfg.get('pretrained_dims', 0)) | ||||||
|  |         tok2vec = chain(tok2vec, flatten) | ||||||
|         if parser_maxout_pieces == 1: |         if parser_maxout_pieces == 1: | ||||||
|             lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class, |             lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class, | ||||||
|                         nF=cls.nr_feature, |                         nF=cls.nr_feature, | ||||||
|  | @ -262,8 +264,8 @@ cdef class Parser: | ||||||
|                 upper.is_noop = True |                 upper.is_noop = True | ||||||
|             else: |             else: | ||||||
|                 upper = chain( |                 upper = chain( | ||||||
|                     clone(Maxout(hidden_width), (depth-1)), |                     clone(Maxout(hidden_width), depth-1), | ||||||
|                     zero_init(Affine(nr_class, drop_factor=0.0)) |                     zero_init(Affine(nr_class, hidden_width, drop_factor=0.0)) | ||||||
|                 ) |                 ) | ||||||
|                 upper.is_noop = False |                 upper.is_noop = False | ||||||
|         # TODO: This is an unfortunate hack atm! |         # TODO: This is an unfortunate hack atm! | ||||||
|  | @ -277,7 +279,7 @@ cdef class Parser: | ||||||
|             'hidden_width': hidden_width, |             'hidden_width': hidden_width, | ||||||
|             'maxout_pieces': parser_maxout_pieces |             'maxout_pieces': parser_maxout_pieces | ||||||
|         } |         } | ||||||
|         return (tensors, lower, upper), cfg |         return (tok2vec, lower, upper), cfg | ||||||
| 
 | 
 | ||||||
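Several of the sizes in the Model classmethod above (parser_hidden_depth, token_vector_width, hidden_width, parser_maxout_pieces, embed_size) are read through util.env_opt, which lets environment variables override the keyword defaults. A rough standalone sketch of that lookup pattern; this is a simplified stand-in, not spaCy's actual util.env_opt:

import os

def env_opt(name, default=None):
    # fall back to the default, but let an environment variable of the same
    # name win, cast to the default's type so numeric settings stay numeric
    if name not in os.environ:
        return default
    value = os.environ[name]
    return type(default)(value) if default is not None else value

hidden_width = env_opt('hidden_width', 200)
# e.g. launching training with hidden_width=300 in the environment yields 300 here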
|     def __init__(self, Vocab vocab, moves=True, model=True, **cfg): |     def __init__(self, Vocab vocab, moves=True, model=True, **cfg): | ||||||
|         """ |         """ | ||||||
|  | @ -307,12 +309,16 @@ cdef class Parser: | ||||||
|             cfg['beam_width'] = util.env_opt('beam_width', 1) |             cfg['beam_width'] = util.env_opt('beam_width', 1) | ||||||
|         if 'beam_density' not in cfg: |         if 'beam_density' not in cfg: | ||||||
|             cfg['beam_density'] = util.env_opt('beam_density', 0.0) |             cfg['beam_density'] = util.env_opt('beam_density', 0.0) | ||||||
|  |         if 'pretrained_dims' not in cfg: | ||||||
|  |             cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1] | ||||||
|  |         cfg.setdefault('cnn_maxout_pieces', 3) | ||||||
|         self.cfg = cfg |         self.cfg = cfg | ||||||
|         if 'actions' in self.cfg: |         if 'actions' in self.cfg: | ||||||
|             for action, labels in self.cfg.get('actions', {}).items(): |             for action, labels in self.cfg.get('actions', {}).items(): | ||||||
|                 for label in labels: |                 for label in labels: | ||||||
|                     self.moves.add_action(action, label) |                     self.moves.add_action(action, label) | ||||||
|         self.model = model |         self.model = model | ||||||
|  |         self._multitasks = [] | ||||||
| 
 | 
 | ||||||
|     def __reduce__(self): |     def __reduce__(self): | ||||||
|         return (Parser, (self.vocab, self.moves, self.model), None, None) |         return (Parser, (self.vocab, self.moves, self.model), None, None) | ||||||
|  | @ -332,11 +338,11 @@ cdef class Parser: | ||||||
|             beam_density = self.cfg.get('beam_density', 0.0) |             beam_density = self.cfg.get('beam_density', 0.0) | ||||||
|         cdef Beam beam |         cdef Beam beam | ||||||
|         if beam_width == 1: |         if beam_width == 1: | ||||||
|             states = self.parse_batch([doc], [doc.tensor]) |             states = self.parse_batch([doc]) | ||||||
|             self.set_annotations([doc], states) |             self.set_annotations([doc], states) | ||||||
|             return doc |             return doc | ||||||
|         else: |         else: | ||||||
|             beam = self.beam_parse([doc], [doc.tensor], |             beam = self.beam_parse([doc], | ||||||
|                         beam_width=beam_width, beam_density=beam_density)[0] |                         beam_width=beam_width, beam_density=beam_density)[0] | ||||||
|             output = self.moves.get_beam_annot(beam) |             output = self.moves.get_beam_annot(beam) | ||||||
|             state = <StateClass>beam.at(0) |             state = <StateClass>beam.at(0) | ||||||
|  | @ -365,11 +371,11 @@ cdef class Parser: | ||||||
|         cdef Beam beam |         cdef Beam beam | ||||||
|         for docs in cytoolz.partition_all(batch_size, docs): |         for docs in cytoolz.partition_all(batch_size, docs): | ||||||
|             docs = list(docs) |             docs = list(docs) | ||||||
|             tokvecs = [doc.tensor for doc in docs] |  | ||||||
|             if beam_width == 1: |             if beam_width == 1: | ||||||
|                 parse_states = self.parse_batch(docs, tokvecs) |                 parse_states = self.parse_batch(docs) | ||||||
|  |                 beams = [] | ||||||
|             else: |             else: | ||||||
|                 beams = self.beam_parse(docs, tokvecs, |                 beams = self.beam_parse(docs, | ||||||
|                             beam_width=beam_width, beam_density=beam_density) |                             beam_width=beam_width, beam_density=beam_density) | ||||||
|                 parse_states = [] |                 parse_states = [] | ||||||
|                 for beam in beams: |                 for beam in beams: | ||||||
|  | @ -377,7 +383,7 @@ cdef class Parser: | ||||||
|             self.set_annotations(docs, parse_states) |             self.set_annotations(docs, parse_states) | ||||||
|             yield from docs |             yield from docs | ||||||
| 
 | 
 | ||||||
|     def parse_batch(self, docs, tokvecses): |     def parse_batch(self, docs): | ||||||
|         cdef: |         cdef: | ||||||
|             precompute_hiddens state2vec |             precompute_hiddens state2vec | ||||||
|             StateClass state |             StateClass state | ||||||
|  | @ -388,21 +394,15 @@ cdef class Parser: | ||||||
|             int nr_class, nr_feat, nr_piece, nr_dim, nr_state |             int nr_class, nr_feat, nr_piece, nr_dim, nr_state | ||||||
|         if isinstance(docs, Doc): |         if isinstance(docs, Doc): | ||||||
|             docs = [docs] |             docs = [docs] | ||||||
|         if isinstance(tokvecses, np.ndarray): |  | ||||||
|             tokvecses = [tokvecses] |  | ||||||
| 
 | 
 | ||||||
|         tokvecs = self.model[0].ops.flatten(tokvecses) |         cuda_stream = get_cuda_stream() | ||||||
|         if USE_FINE_TUNE: |         (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, | ||||||
|             tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) |                                                                             0.0) | ||||||
| 
 | 
 | ||||||
|         nr_state = len(docs) |         nr_state = len(docs) | ||||||
|         nr_class = self.moves.n_moves |         nr_class = self.moves.n_moves | ||||||
|         nr_dim = tokvecs.shape[1] |         nr_dim = tokvecs.shape[1] | ||||||
|         nr_feat = self.nr_feature |         nr_feat = self.nr_feature | ||||||
| 
 |  | ||||||
|         cuda_stream = get_cuda_stream() |  | ||||||
|         state2vec, vec2scores = self.get_batch_model(nr_state, tokvecs, |  | ||||||
|                                                      cuda_stream, 0.0) |  | ||||||
|         nr_piece = state2vec.nP |         nr_piece = state2vec.nP | ||||||
| 
 | 
 | ||||||
|         states = self.moves.init_batch(docs) |         states = self.moves.init_batch(docs) | ||||||
|  | @ -418,21 +418,23 @@ cdef class Parser: | ||||||
|         c_token_ids = <int*>token_ids.data |         c_token_ids = <int*>token_ids.data | ||||||
|         c_is_valid = <int*>is_valid.data |         c_is_valid = <int*>is_valid.data | ||||||
|         cdef int has_hidden = not getattr(vec2scores, 'is_noop', False) |         cdef int has_hidden = not getattr(vec2scores, 'is_noop', False) | ||||||
|  |         cdef int nr_step | ||||||
|         while not next_step.empty(): |         while not next_step.empty(): | ||||||
|  |             nr_step = next_step.size() | ||||||
|             if not has_hidden: |             if not has_hidden: | ||||||
|                 for i in cython.parallel.prange( |                 for i in cython.parallel.prange(nr_step, num_threads=6, | ||||||
|                         next_step.size(), num_threads=6, nogil=True): |                                                 nogil=True): | ||||||
|                     self._parse_step(next_step[i], |                     self._parse_step(next_step[i], | ||||||
|                         feat_weights, nr_class, nr_feat, nr_piece) |                         feat_weights, nr_class, nr_feat, nr_piece) | ||||||
|             else: |             else: | ||||||
|                 for i in range(next_step.size()): |                 for i in range(nr_step): | ||||||
|                     st = next_step[i] |                     st = next_step[i] | ||||||
|                     st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat) |                     st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat) | ||||||
|                     self.moves.set_valid(&c_is_valid[i*nr_class], st) |                     self.moves.set_valid(&c_is_valid[i*nr_class], st) | ||||||
|                 vectors = state2vec(token_ids[:next_step.size()]) |                 vectors = state2vec(token_ids[:next_step.size()]) | ||||||
|                 scores = vec2scores(vectors) |                 scores = vec2scores(vectors) | ||||||
|                 c_scores = <float*>scores.data |                 c_scores = <float*>scores.data | ||||||
|                 for i in range(next_step.size()): |                 for i in range(nr_step): | ||||||
|                     st = next_step[i] |                     st = next_step[i] | ||||||
|                     guess = arg_max_if_valid( |                     guess = arg_max_if_valid( | ||||||
|                         &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class) |                         &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class) | ||||||
|  | @ -445,18 +447,15 @@ cdef class Parser: | ||||||
|                     next_step.push_back(st) |                     next_step.push_back(st) | ||||||
|         return states |         return states | ||||||
| 
 | 
 | ||||||
|     def beam_parse(self, docs, tokvecses, int beam_width=3, float beam_density=0.001): |     def beam_parse(self, docs, int beam_width=3, float beam_density=0.001): | ||||||
|         cdef Beam beam |         cdef Beam beam | ||||||
|         cdef np.ndarray scores |         cdef np.ndarray scores | ||||||
|         cdef Doc doc |         cdef Doc doc | ||||||
|         cdef int nr_class = self.moves.n_moves |         cdef int nr_class = self.moves.n_moves | ||||||
|         cdef StateClass stcls, output |         cdef StateClass stcls, output | ||||||
|         tokvecs = self.model[0].ops.flatten(tokvecses) |  | ||||||
|         if USE_FINE_TUNE: |  | ||||||
|             tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) |  | ||||||
|         cuda_stream = get_cuda_stream() |         cuda_stream = get_cuda_stream() | ||||||
|         state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs, |         (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, | ||||||
|                                                      cuda_stream, 0.0) |                                                                             0.0) | ||||||
|         beams = [] |         beams = [] | ||||||
|         cdef int offset = 0 |         cdef int offset = 0 | ||||||
|         cdef int j = 0 |         cdef int j = 0 | ||||||
|  | @ -516,29 +515,24 @@ cdef class Parser: | ||||||
|         free(scores) |         free(scores) | ||||||
|         free(token_ids) |         free(token_ids) | ||||||
| 
 | 
 | ||||||
|     def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): |     def update(self, docs, golds, drop=0., sgd=None, losses=None): | ||||||
|         if not any(self.moves.has_gold(gold) for gold in golds): |         if not any(self.moves.has_gold(gold) for gold in golds): | ||||||
|             return None |             return None | ||||||
|         if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5: |         if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5: | ||||||
|             return self.update_beam(docs_tokvecs, golds, |             return self.update_beam(docs, golds, | ||||||
|                     self.cfg['beam_width'], self.cfg['beam_density'], |                     self.cfg['beam_width'], self.cfg['beam_density'], | ||||||
|                     drop=drop, sgd=sgd, losses=losses) |                     drop=drop, sgd=sgd, losses=losses) | ||||||
|         if losses is not None and self.name not in losses: |         if losses is not None and self.name not in losses: | ||||||
|             losses[self.name] = 0. |             losses[self.name] = 0. | ||||||
|         docs, tokvec_lists = docs_tokvecs |  | ||||||
|         tokvecs = self.model[0].ops.flatten(tokvec_lists) |  | ||||||
|         if isinstance(docs, Doc) and isinstance(golds, GoldParse): |         if isinstance(docs, Doc) and isinstance(golds, GoldParse): | ||||||
|             docs = [docs] |             docs = [docs] | ||||||
|             golds = [golds] |             golds = [golds] | ||||||
|         if USE_FINE_TUNE: |  | ||||||
|             my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) |  | ||||||
|             tokvecs = self.model[0].ops.flatten(my_tokvecs) |  | ||||||
| 
 | 
 | ||||||
|         cuda_stream = get_cuda_stream() |         cuda_stream = get_cuda_stream() | ||||||
| 
 | 
 | ||||||
|         states, golds, max_steps = self._init_gold_batch(docs, golds) |         states, golds, max_steps = self._init_gold_batch(docs, golds) | ||||||
|         state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, |         (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, | ||||||
|                                                       0.0) |                                                                             drop) | ||||||
|         todo = [(s, g) for (s, g) in zip(states, golds) |         todo = [(s, g) for (s, g) in zip(states, golds) | ||||||
|                 if not s.is_final() and g is not None] |                 if not s.is_final() and g is not None] | ||||||
|         if not todo: |         if not todo: | ||||||
|  | @ -582,13 +576,9 @@ cdef class Parser: | ||||||
|             if n_steps >= max_steps: |             if n_steps >= max_steps: | ||||||
|                 break |                 break | ||||||
|         self._make_updates(d_tokvecs, |         self._make_updates(d_tokvecs, | ||||||
|             backprops, sgd, cuda_stream) |             bp_tokvecs, backprops, sgd, cuda_stream) | ||||||
|         d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) |  | ||||||
|         if USE_FINE_TUNE: |  | ||||||
|             d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd) |  | ||||||
|         return d_tokvecs |  | ||||||
| 
 | 
 | ||||||
|     def update_beam(self, docs_tokvecs, golds, width=None, density=None, |     def update_beam(self, docs, golds, width=None, density=None, | ||||||
|             drop=0., sgd=None, losses=None): |             drop=0., sgd=None, losses=None): | ||||||
|         if not any(self.moves.has_gold(gold) for gold in golds): |         if not any(self.moves.has_gold(gold) for gold in golds): | ||||||
|             return None |             return None | ||||||
|  | @ -600,26 +590,20 @@ cdef class Parser: | ||||||
|             density = self.cfg.get('beam_density', 0.0) |             density = self.cfg.get('beam_density', 0.0) | ||||||
|         if losses is not None and self.name not in losses: |         if losses is not None and self.name not in losses: | ||||||
|             losses[self.name] = 0. |             losses[self.name] = 0. | ||||||
|         docs, tokvecs = docs_tokvecs |  | ||||||
|         lengths = [len(d) for d in docs] |         lengths = [len(d) for d in docs] | ||||||
|         assert min(lengths) >= 1 |         assert min(lengths) >= 1 | ||||||
|         tokvecs = self.model[0].ops.flatten(tokvecs) |  | ||||||
|         if USE_FINE_TUNE: |  | ||||||
|             my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) |  | ||||||
|             tokvecs += self.model[0].ops.flatten(my_tokvecs) |  | ||||||
| 
 |  | ||||||
|         states = self.moves.init_batch(docs) |         states = self.moves.init_batch(docs) | ||||||
|         for gold in golds: |         for gold in golds: | ||||||
|             self.moves.preprocess_gold(gold) |             self.moves.preprocess_gold(gold) | ||||||
| 
 | 
 | ||||||
|         cuda_stream = get_cuda_stream() |         cuda_stream = get_cuda_stream() | ||||||
|         state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0) |         (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, drop) | ||||||
| 
 | 
 | ||||||
|         states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500, |         states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500, | ||||||
|                                         states, tokvecs, golds, |                                         states, golds, | ||||||
|                                         state2vec, vec2scores, |                                         state2vec, vec2scores, | ||||||
|                                         width, density, |                                         width, density, | ||||||
|                                         sgd=sgd, drop=drop, losses=losses) |                                         drop=drop, losses=losses) | ||||||
|         backprop_lower = [] |         backprop_lower = [] | ||||||
|         cdef float batch_size = len(docs) |         cdef float batch_size = len(docs) | ||||||
|         for i, d_scores in enumerate(states_d_scores): |         for i, d_scores in enumerate(states_d_scores): | ||||||
|  | @ -637,11 +621,7 @@ cdef class Parser: | ||||||
|             else: |             else: | ||||||
|                 backprop_lower.append((ids, d_vector, bp_vectors)) |                 backprop_lower.append((ids, d_vector, bp_vectors)) | ||||||
|         d_tokvecs = self.model[0].ops.allocate(tokvecs.shape) |         d_tokvecs = self.model[0].ops.allocate(tokvecs.shape) | ||||||
|         self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream) |         self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd, cuda_stream) | ||||||
|         d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths) |  | ||||||
|         if USE_FINE_TUNE: |  | ||||||
|             d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd) |  | ||||||
|         return d_tokvecs |  | ||||||
| 
 | 
 | ||||||
|     def _init_gold_batch(self, whole_docs, whole_golds): |     def _init_gold_batch(self, whole_docs, whole_golds): | ||||||
|         """Make a square batch, of length equal to the shortest doc. A long |         """Make a square batch, of length equal to the shortest doc. A long | ||||||
|  | @ -679,7 +659,7 @@ cdef class Parser: | ||||||
|             max_moves = max(max_moves, len(oracle_actions)) |             max_moves = max(max_moves, len(oracle_actions)) | ||||||
|         return states, golds, max_moves |         return states, golds, max_moves | ||||||
| 
 | 
 | ||||||
|     def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None): |     def _make_updates(self, d_tokvecs, bp_tokvecs, backprops, sgd, cuda_stream=None): | ||||||
|         # Tells CUDA to block, so our async copies complete. |         # Tells CUDA to block, so our async copies complete. | ||||||
|         if cuda_stream is not None: |         if cuda_stream is not None: | ||||||
|             cuda_stream.synchronize() |             cuda_stream.synchronize() | ||||||
|  | @ -690,6 +670,7 @@ cdef class Parser: | ||||||
|             d_state_features *= mask.reshape(ids.shape + (1,)) |             d_state_features *= mask.reshape(ids.shape + (1,)) | ||||||
|             self.model[0].ops.scatter_add(d_tokvecs, ids * mask, |             self.model[0].ops.scatter_add(d_tokvecs, ids * mask, | ||||||
|                 d_state_features) |                 d_state_features) | ||||||
|  |         bp_tokvecs(d_tokvecs, sgd=sgd) | ||||||
| 
 | 
 | ||||||
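For readers new to the scatter_add call above: each backprop step yields gradients only for the handful of tokens a state used as features, and those rows have to be summed back into the full d_tokvecs array before bp_tokvecs is called (tokens used by several states accumulate several contributions). A numpy sketch of that accumulation, with invented toy shapes and numpy.add.at standing in for ops.scatter_add:

import numpy

d_tokvecs = numpy.zeros((4, 3))            # gradient buffer: 4 tokens, width 3
ids = numpy.array([[0, 2], [2, 3]])        # token ids used as features by 2 states
d_feats = numpy.ones((2, 2, 3))            # gradient per (state, feature, dim)

# token 2 was used by both states, so its row collects two contributions
numpy.add.at(d_tokvecs, ids.ravel(), d_feats.reshape(-1, 3))
print(d_tokvecs[:, 0])                     # [1. 0. 2. 1.]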
|     @property |     @property | ||||||
|     def move_names(self): |     def move_names(self): | ||||||
|  | @ -699,11 +680,12 @@ cdef class Parser: | ||||||
|             names.append(name) |             names.append(name) | ||||||
|         return names |         return names | ||||||
| 
 | 
 | ||||||
|     def get_batch_model(self, batch_size, tokvecs, stream, dropout): |     def get_batch_model(self, docs, stream, dropout): | ||||||
|         _, lower, upper = self.model |         tok2vec, lower, upper = self.model | ||||||
|         state2vec = precompute_hiddens(batch_size, tokvecs, |         tokvecs, bp_tokvecs = tok2vec.begin_update(docs, drop=dropout) | ||||||
|                         lower, stream, drop=dropout) |         state2vec = precompute_hiddens(len(docs), tokvecs, | ||||||
|         return state2vec, upper |                                        lower, stream, drop=0.0) | ||||||
|  |         return (tokvecs, bp_tokvecs), state2vec, upper | ||||||
| 
 | 
 | ||||||
|     nr_feature = 8 |     nr_feature = 8 | ||||||
| 
 | 
 | ||||||
|  | @ -766,7 +748,7 @@ cdef class Parser: | ||||||
|                 # order, or the model goes out of synch |                 # order, or the model goes out of synch | ||||||
|                 self.cfg.setdefault('extra_labels', []).append(label) |                 self.cfg.setdefault('extra_labels', []).append(label) | ||||||
| 
 | 
 | ||||||
|     def begin_training(self, gold_tuples, **cfg): |     def begin_training(self, gold_tuples, pipeline=None, **cfg): | ||||||
|         if 'model' in cfg: |         if 'model' in cfg: | ||||||
|             self.model = cfg['model'] |             self.model = cfg['model'] | ||||||
|         gold_tuples = nonproj.preprocess_training_data(gold_tuples) |         gold_tuples = nonproj.preprocess_training_data(gold_tuples) | ||||||
|  | @ -775,9 +757,22 @@ cdef class Parser: | ||||||
|             for label in labels: |             for label in labels: | ||||||
|                 self.moves.add_action(action, label) |                 self.moves.add_action(action, label) | ||||||
|         if self.model is True: |         if self.model is True: | ||||||
|  |             cfg['pretrained_dims'] = self.vocab.vectors_length | ||||||
|             self.model, cfg = self.Model(self.moves.n_moves, **cfg) |             self.model, cfg = self.Model(self.moves.n_moves, **cfg) | ||||||
|  |             self.init_multitask_objectives(gold_tuples, pipeline, **cfg) | ||||||
|  |             link_vectors_to_models(self.vocab) | ||||||
|             self.cfg.update(cfg) |             self.cfg.update(cfg) | ||||||
| 
 | 
 | ||||||
|  |     def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): | ||||||
|  |         '''Set up models for secondary objectives, to benefit from multi-task | ||||||
|  |         learning. This method is intended to be overridden by subclasses. | ||||||
|  | 
 | ||||||
|  |         For instance, the dependency parser can benefit from sharing | ||||||
|  |         an input representation with a label prediction model. These auxiliary | ||||||
|  |         models are discarded after training. | ||||||
|  |         ''' | ||||||
|  |         pass | ||||||
|  | 
 | ||||||
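As the docstring says, this base implementation is a no-op hook; subclasses in this file override it (see the NeuralDependencyParser and NeuralEntityRecognizer hunks above, where the target list is currently empty). A sketch of what an override could look like if, say, a 'tag' auxiliary objective were enabled; the class name is illustrative, and it assumes the Parser and NeuralLabeller classes from this changeset are in scope:

class MultitaskParser(Parser):
    def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
        # share the tok2vec layer with an auxiliary tagger-style labeller
        for target in ['tag']:
            labeller = NeuralLabeller(self.vocab, target=target)
            tok2vec = self.model[0]
            labeller.begin_training(gold_tuples, pipeline=pipeline,
                                    tok2vec=tok2vec)
            pipeline.append(labeller)
            self._multitasks.append(labeller)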
|     def preprocess_gold(self, docs_golds): |     def preprocess_gold(self, docs_golds): | ||||||
|         for doc, gold in docs_golds: |         for doc, gold in docs_golds: | ||||||
|             yield doc, gold |             yield doc, gold | ||||||
|  | @ -813,6 +808,7 @@ cdef class Parser: | ||||||
|         if 'model' not in exclude: |         if 'model' not in exclude: | ||||||
|             path = util.ensure_path(path) |             path = util.ensure_path(path) | ||||||
|             if self.model is True: |             if self.model is True: | ||||||
|  |                 self.cfg['pretrained_dims'] = self.vocab.vectors_length | ||||||
|                 self.model, cfg = self.Model(**self.cfg) |                 self.model, cfg = self.Model(**self.cfg) | ||||||
|             else: |             else: | ||||||
|                 cfg = {} |                 cfg = {} | ||||||
|  | @ -835,7 +831,7 @@ cdef class Parser: | ||||||
|             ('upper_model', lambda: self.model[2].to_bytes()), |             ('upper_model', lambda: self.model[2].to_bytes()), | ||||||
|             ('vocab', lambda: self.vocab.to_bytes()), |             ('vocab', lambda: self.vocab.to_bytes()), | ||||||
|             ('moves', lambda: self.moves.to_bytes(strings=False)), |             ('moves', lambda: self.moves.to_bytes(strings=False)), | ||||||
|             ('cfg', lambda: ujson.dumps(self.cfg)) |             ('cfg', lambda: json.dumps(self.cfg, indent=2, sort_keys=True)) | ||||||
|         )) |         )) | ||||||
|         if 'model' in exclude: |         if 'model' in exclude: | ||||||
|             exclude['tok2vec_model'] = True |             exclude['tok2vec_model'] = True | ||||||
|  | @ -848,7 +844,7 @@ cdef class Parser: | ||||||
|         deserializers = OrderedDict(( |         deserializers = OrderedDict(( | ||||||
|             ('vocab', lambda b: self.vocab.from_bytes(b)), |             ('vocab', lambda b: self.vocab.from_bytes(b)), | ||||||
|             ('moves', lambda b: self.moves.from_bytes(b, strings=False)), |             ('moves', lambda b: self.moves.from_bytes(b, strings=False)), | ||||||
|             ('cfg', lambda b: self.cfg.update(ujson.loads(b))), |             ('cfg', lambda b: self.cfg.update(json.loads(b))), | ||||||
|             ('tok2vec_model', lambda b: None), |             ('tok2vec_model', lambda b: None), | ||||||
|             ('lower_model', lambda b: None), |             ('lower_model', lambda b: None), | ||||||
|             ('upper_model', lambda b: None) |             ('upper_model', lambda b: None) | ||||||
|  | @ -856,9 +852,11 @@ cdef class Parser: | ||||||
|         msg = util.from_bytes(bytes_data, deserializers, exclude) |         msg = util.from_bytes(bytes_data, deserializers, exclude) | ||||||
|         if 'model' not in exclude: |         if 'model' not in exclude: | ||||||
|             if self.model is True: |             if self.model is True: | ||||||
|                 self.model, cfg = self.Model(self.moves.n_moves) |                 self.model, cfg = self.Model(**self.cfg) | ||||||
|  |                 cfg['pretrained_dims'] = self.vocab.vectors_length | ||||||
|             else: |             else: | ||||||
|                 cfg = {} |                 cfg = {} | ||||||
|  |             cfg['pretrained_dims'] = self.vocab.vectors_length | ||||||
|             if 'tok2vec_model' in msg: |             if 'tok2vec_model' in msg: | ||||||
|                 self.model[0].from_bytes(msg['tok2vec_model']) |                 self.model[0].from_bytes(msg['tok2vec_model']) | ||||||
|             if 'lower_model' in msg: |             if 'lower_model' in msg: | ||||||
|  |  | ||||||
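The `init_multitask_objectives` hook added above is a no-op on the base `Parser` and is meant to be overridden by subclasses that want auxiliary training objectives. A minimal sketch of the override pattern, using stand-in names (`BaseParser`, `build_aux_model`) rather than spaCy's real classes:

    # Illustrative only: BaseParser and build_aux_model are stand-ins, not spaCy APIs.
    class BaseParser(object):
        def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
            # Base behaviour: no secondary objectives.
            pass

    def build_aux_model(**cfg):
        # Placeholder for an auxiliary label-prediction model that would share
        # the parser's input representation during training.
        return object()

    class MultiTaskParser(BaseParser):
        def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
            # Build the auxiliary objective(s); they are discarded after training.
            self._multitasks = [build_aux_model(**cfg)]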
|  | @ -148,7 +148,7 @@ cdef class TransitionSystem: | ||||||
| 
 | 
 | ||||||
|     def add_action(self, int action, label_name): |     def add_action(self, int action, label_name): | ||||||
|         cdef attr_t label_id |         cdef attr_t label_id | ||||||
|         if not isinstance(label_name, int): |         if not isinstance(label_name, (int, long)): | ||||||
|             label_id = self.strings.add(label_name) |             label_id = self.strings.add(label_name) | ||||||
|         else: |         else: | ||||||
|             label_id = label_name |             label_id = label_name | ||||||
|  |  | ||||||
|  | @ -12,7 +12,7 @@ from .. import util | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id', | _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id', | ||||||
|               'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx'] |               'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'th','xx'] | ||||||
| _models = {'en': ['en_core_web_sm'], | _models = {'en': ['en_core_web_sm'], | ||||||
|            'de': ['de_core_news_md'], |            'de': ['de_core_news_md'], | ||||||
|            'fr': ['fr_depvec_web_lg'], |            'fr': ['fr_depvec_web_lg'], | ||||||
|  | @ -108,6 +108,11 @@ def he_tokenizer(): | ||||||
| def nb_tokenizer(): | def nb_tokenizer(): | ||||||
|     return util.get_lang_class('nb').Defaults.create_tokenizer() |     return util.get_lang_class('nb').Defaults.create_tokenizer() | ||||||
| 
 | 
 | ||||||
|  | @pytest.fixture | ||||||
|  | def th_tokenizer(): | ||||||
|  |     pythainlp = pytest.importorskip("pythainlp") | ||||||
|  |     return util.get_lang_class('th').Defaults.create_tokenizer() | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.fixture | @pytest.fixture | ||||||
| def stringstore(): | def stringstore(): | ||||||
|  |  | ||||||
|  | @ -67,12 +67,6 @@ def test_tokenizer_splits_uneven_wrap_interact(de_tokenizer, text): | ||||||
|     assert len(tokens) == 4 |     assert len(tokens) == 4 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('text', ["blau-rot"]) |  | ||||||
| def test_tokenizer_splits_hyphens(de_tokenizer, text): |  | ||||||
|     tokens = de_tokenizer(text) |  | ||||||
|     assert len(tokens) == 3 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| @pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"]) | @pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"]) | ||||||
| def test_tokenizer_splits_numeric_range(de_tokenizer, text): | def test_tokenizer_splits_numeric_range(de_tokenizer, text): | ||||||
|     tokens = de_tokenizer(text) |     tokens = de_tokenizer(text) | ||||||
|  | @ -100,17 +94,21 @@ def test_tokenizer_splits_ellipsis_infix(de_tokenizer, text): | ||||||
|     assert len(tokens) == 3 |     assert len(tokens) == 3 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @pytest.mark.parametrize('text', ['Islam-Konferenz', 'Ost-West-Konflikt']) | ||||||
|  | def test_tokenizer_keeps_hyphens(de_tokenizer, text): | ||||||
|  |     tokens = de_tokenizer(text) | ||||||
|  |     assert len(tokens) == 1 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def test_tokenizer_splits_double_hyphen_infix(de_tokenizer): | def test_tokenizer_splits_double_hyphen_infix(de_tokenizer): | ||||||
|     tokens = de_tokenizer("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.") |     tokens = de_tokenizer("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.") | ||||||
|     assert len(tokens) == 12 |     assert len(tokens) == 10 | ||||||
|     assert tokens[0].text == "Viele" |     assert tokens[0].text == "Viele" | ||||||
|     assert tokens[1].text == "Regeln" |     assert tokens[1].text == "Regeln" | ||||||
|     assert tokens[2].text == "--" |     assert tokens[2].text == "--" | ||||||
|     assert tokens[3].text == "wie" |     assert tokens[3].text == "wie" | ||||||
|     assert tokens[4].text == "die" |     assert tokens[4].text == "die" | ||||||
|     assert tokens[5].text == "Bindestrich" |     assert tokens[5].text == "Bindestrich-Regeln" | ||||||
|     assert tokens[6].text == "-" |     assert tokens[6].text == "--" | ||||||
|     assert tokens[7].text == "Regeln" |     assert tokens[7].text == "sind" | ||||||
|     assert tokens[8].text == "--" |     assert tokens[8].text == "kompliziert" | ||||||
|     assert tokens[9].text == "sind" |  | ||||||
|     assert tokens[10].text == "kompliziert" |  | ||||||
|  |  | ||||||
|  | @ -25,15 +25,15 @@ Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen. | ||||||
|     assert len(tokens) == 109 |     assert len(tokens) == 109 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('text,length', [ | @pytest.mark.parametrize('text', [ | ||||||
|     ("Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", 1), |     "Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", | ||||||
|     ("Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", 1), |     "Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", | ||||||
|     ("Kraftfahrzeug-Haftpflichtversicherung", 3), |     "Kraftfahrzeug-Haftpflichtversicherung", | ||||||
|     ("Vakuum-Mittelfrequenz-Induktionsofen", 5) |     "Vakuum-Mittelfrequenz-Induktionsofen" | ||||||
|     ]) |     ]) | ||||||
| def test_tokenizer_handles_long_words(de_tokenizer, text, length): | def test_tokenizer_handles_long_words(de_tokenizer, text): | ||||||
|     tokens = de_tokenizer(text) |     tokens = de_tokenizer(text) | ||||||
|     assert len(tokens) == length |     assert len(tokens) == 1 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('text,length', [ | @pytest.mark.parametrize('text,length', [ | ||||||
|  |  | ||||||
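The rewritten German tests above encode a tokenizer change: hyphenated compounds such as "Bindestrich-Regeln" or "Kraftfahrzeug-Haftpflichtversicherung" now stay single tokens. A quick check of that behaviour, mirroring the conftest fixture pattern (only the German language data is needed, no statistical model):

    from spacy import util

    de_tokenizer = util.get_lang_class('de').Defaults.create_tokenizer()
    doc = de_tokenizer("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.")
    print([t.text for t in doc])
    # Per the updated test, 'Bindestrich-Regeln' is one token and the sentence
    # now yields 10 tokens instead of 12.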
							
								
								
									
spacy/tests/lang/th/__init__.py (new file, 0 lines)
spacy/tests/lang/th/test_tokenizer.py (new file, 13 lines)
							|  | @ -0,0 +1,13 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | import pytest | ||||||
|  | 
 | ||||||
|  | TOKENIZER_TESTS = [ | ||||||
|  |         ("คุณรักผมไหม", ['คุณ', 'รัก', 'ผม', 'ไหม']) | ||||||
|  | ] | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS) | ||||||
|  | def test_thai_tokenizer(th_tokenizer, text, expected_tokens): | ||||||
|  | 	tokens = [token.text for token in th_tokenizer(text)] | ||||||
|  | 	assert tokens == expected_tokens | ||||||
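The new Thai tests depend on the third-party `pythainlp` package, which is why the conftest fixture above guards tokenizer creation with `pytest.importorskip`. Outside the test suite, the same tokenizer could be created like this (a sketch that assumes pythainlp is installed):

    from spacy import util

    # Creating the tokenizer fails with an ImportError if pythainlp is missing.
    th_tokenizer = util.get_lang_class('th').Defaults.create_tokenizer()
    tokens = [t.text for t in th_tokenizer(u"คุณรักผมไหม")]
    assert tokens == [u'คุณ', u'รัก', u'ผม', u'ไหม']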
|  | @ -26,7 +26,7 @@ def arc_eager(vocab): | ||||||
| 
 | 
 | ||||||
| @pytest.fixture | @pytest.fixture | ||||||
| def tok2vec(): | def tok2vec(): | ||||||
|     return Tok2Vec(8, 100, preprocess=doc2feats()) |     return Tok2Vec(8, 100) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.fixture | @pytest.fixture | ||||||
|  | @ -61,33 +61,22 @@ def test_predict_doc(parser, tok2vec, model, doc): | ||||||
|     parser(doc) |     parser(doc) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_update_doc(parser, tok2vec, model, doc, gold): | def test_update_doc(parser, model, doc, gold): | ||||||
|     parser.model = model |     parser.model = model | ||||||
|     tokvecs, bp_tokvecs = tok2vec.begin_update([doc]) |  | ||||||
|     d_tokvecs = parser.update(([doc], tokvecs), [gold]) |  | ||||||
|     assert d_tokvecs[0].shape == tokvecs[0].shape |  | ||||||
|     def optimize(weights, gradient, key=None): |     def optimize(weights, gradient, key=None): | ||||||
|         weights -= 0.001 * gradient |         weights -= 0.001 * gradient | ||||||
|     bp_tokvecs(d_tokvecs, sgd=optimize) |     parser.update([doc], [gold], sgd=optimize) | ||||||
|     assert d_tokvecs[0].sum() == 0. |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_predict_doc_beam(parser, tok2vec, model, doc): | def test_predict_doc_beam(parser, model, doc): | ||||||
|     doc.tensor = tok2vec([doc])[0] |  | ||||||
|     parser.model = model |     parser.model = model | ||||||
|     parser(doc, beam_width=32, beam_density=0.001) |     parser(doc, beam_width=32, beam_density=0.001) | ||||||
|     for word in doc: |  | ||||||
|         print(word.text, word.head, word.dep_) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_update_doc_beam(parser, tok2vec, model, doc, gold): | def test_update_doc_beam(parser, model, doc, gold): | ||||||
|     parser.model = model |     parser.model = model | ||||||
|     tokvecs, bp_tokvecs = tok2vec.begin_update([doc]) |  | ||||||
|     d_tokvecs = parser.update_beam(([doc], tokvecs), [gold]) |  | ||||||
|     assert d_tokvecs[0].shape == tokvecs[0].shape |  | ||||||
|     def optimize(weights, gradient, key=None): |     def optimize(weights, gradient, key=None): | ||||||
|         weights -= 0.001 * gradient |         weights -= 0.001 * gradient | ||||||
|     bp_tokvecs(d_tokvecs, sgd=optimize) |     parser.update_beam([doc], [gold], sgd=optimize) | ||||||
|     assert d_tokvecs[0].sum() == 0. |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
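The test changes above reflect the new training API: the parser now runs its own tok2vec step inside `update`, so callers pass plain `Doc` and `GoldParse` objects plus an `sgd` callable instead of precomputed token vectors. The callback shape is unchanged; a small sketch (the `parser`, `doc` and `gold` objects are the fixtures above, not defined here):

    def make_sgd(learn_rate=0.001):
        # Same signature as the optimize() callbacks in the tests above.
        def sgd(weights, gradient, key=None):
            weights -= learn_rate * gradient
        return sgd

    # Usage with the fixtures above (sketch):
    #     parser.update([doc], [gold], sgd=make_sgd())
    #     parser.update_beam([doc], [gold], sgd=make_sgd())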
							
								
								
									
spacy/tests/regression/test_issue1305.py (new file, 8 lines)
							|  | @ -0,0 +1,8 @@ | ||||||
|  | import pytest | ||||||
|  | 
 | ||||||
|  | @pytest.mark.models('en') | ||||||
|  | def test_issue1305(EN): | ||||||
|  |     '''Test lemmatization of English VBZ''' | ||||||
|  |     assert EN.vocab.morphology.lemmatizer('works', 'verb') == set(['work']) | ||||||
|  |     doc = EN(u'This app works well') | ||||||
|  |     assert doc[2].lemma_ == 'work' | ||||||
							
								
								
									
spacy/tests/regression/test_issue1380.py (new file, 14 lines)
							|  | @ -0,0 +1,14 @@ | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | import pytest | ||||||
|  | 
 | ||||||
|  | from ...language import Language | ||||||
|  | 
 | ||||||
|  | def test_issue1380_empty_string(): | ||||||
|  |     nlp = Language() | ||||||
|  |     doc = nlp('') | ||||||
|  |     assert len(doc) == 0 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.models('en') | ||||||
|  | def test_issue1380_en(EN): | ||||||
|  |     doc = EN('') | ||||||
|  |     assert len(doc) == 0 | ||||||
|  | @ -9,11 +9,14 @@ import pytest | ||||||
| @pytest.mark.models('en') | @pytest.mark.models('en') | ||||||
| def test_issue429(EN): | def test_issue429(EN): | ||||||
|     def merge_phrases(matcher, doc, i, matches): |     def merge_phrases(matcher, doc, i, matches): | ||||||
|       if i != len(matches) - 1: |         if i != len(matches) - 1: | ||||||
|         return None |             return None | ||||||
|       spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches] |         spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches] | ||||||
|       for ent_id, label, span in spans: |         for ent_id, label, span in spans: | ||||||
|         span.merge('NNP' if label else span.root.tag_, span.text, EN.vocab.strings[label]) |             span.merge( | ||||||
|  |                 tag=('NNP' if label else span.root.tag_), | ||||||
|  |                 lemma=span.text, | ||||||
|  |                 label='PERSON') | ||||||
| 
 | 
 | ||||||
|     doc = EN('a') |     doc = EN('a') | ||||||
|     matcher = Matcher(EN.vocab) |     matcher = Matcher(EN.vocab) | ||||||
|  |  | ||||||
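The regression test above now calls `Span.merge` with keyword arguments (`tag`, `lemma`, `label`) instead of positional ones. A hedged sketch of the same call shape, assuming an English model such as `en_core_web_sm` is installed (mirroring the `@pytest.mark.models('en')` guard):

    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'I spoke to Sarah Connor yesterday')
    span = doc[3:5]   # "Sarah Connor"
    # Keyword arguments, as in the updated test.
    span.merge(tag=span.root.tag_, lemma=span.text, label='PERSON')
    assert doc[3].text == u'Sarah Connor'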
|  | @ -11,7 +11,7 @@ import pytest | ||||||
| def taggers(en_vocab): | def taggers(en_vocab): | ||||||
|     tagger1 = Tagger(en_vocab) |     tagger1 = Tagger(en_vocab) | ||||||
|     tagger2 = Tagger(en_vocab) |     tagger2 = Tagger(en_vocab) | ||||||
|     tagger1.model = tagger1.Model(8, 8) |     tagger1.model = tagger1.Model(8) | ||||||
|     tagger2.model = tagger1.model |     tagger2.model = tagger1.model | ||||||
|     return (tagger1, tagger2) |     return (tagger1, tagger2) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -6,6 +6,16 @@ from ...strings import StringStore | ||||||
| import pytest | import pytest | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def test_string_hash(stringstore): | ||||||
|  |     '''Test that string hashing is stable across platforms''' | ||||||
|  |     ss = stringstore | ||||||
|  |     assert ss.add('apple') == 8566208034543834098 | ||||||
|  |     heart = '\U0001f499' | ||||||
|  |     print(heart) | ||||||
|  |     h = ss.add(heart) | ||||||
|  |     assert h == 11841826740069053588 | ||||||
|  |   | ||||||
|  | 
 | ||||||
| def test_stringstore_from_api_docs(stringstore): | def test_stringstore_from_api_docs(stringstore): | ||||||
|     apple_hash = stringstore.add('apple') |     apple_hash = stringstore.add('apple') | ||||||
|     assert apple_hash == 8566208034543834098 |     assert apple_hash == 8566208034543834098 | ||||||
|  |  | ||||||
|  | @ -34,7 +34,6 @@ def test_matcher_from_api_docs(en_vocab): | ||||||
|     assert len(patterns[0]) |     assert len(patterns[0]) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.xfail |  | ||||||
| def test_matcher_from_usage_docs(en_vocab): | def test_matcher_from_usage_docs(en_vocab): | ||||||
|     text = "Wow 😀 This is really cool! 😂 😂" |     text = "Wow 😀 This is really cool! 😂 😂" | ||||||
|     doc = get_doc(en_vocab, words=text.split(' ')) |     doc = get_doc(en_vocab, words=text.split(' ')) | ||||||
|  | @ -46,7 +45,8 @@ def test_matcher_from_usage_docs(en_vocab): | ||||||
|         if doc.vocab.strings[match_id] == 'HAPPY': |         if doc.vocab.strings[match_id] == 'HAPPY': | ||||||
|             doc.sentiment += 0.1 |             doc.sentiment += 0.1 | ||||||
|         span = doc[start : end] |         span = doc[start : end] | ||||||
|         token = span.merge(norm='happy emoji') |         token = span.merge() | ||||||
|  |         token.vocab[token.text].norm_ = 'happy emoji' | ||||||
| 
 | 
 | ||||||
|     matcher = Matcher(en_vocab) |     matcher = Matcher(en_vocab) | ||||||
|     matcher.add('HAPPY', label_sentiment, *pos_patterns) |     matcher.add('HAPPY', label_sentiment, *pos_patterns) | ||||||
|  | @ -98,11 +98,11 @@ def test_matcher_match_multi(matcher): | ||||||
|                             (doc.vocab.strings['Java'], 5, 6)] |                             (doc.vocab.strings['Java'], 5, 6)] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.xfail |  | ||||||
| def test_matcher_phrase_matcher(en_vocab): | def test_matcher_phrase_matcher(en_vocab): | ||||||
|     words = ["Google", "Now"] |     words = ["Google", "Now"] | ||||||
|     doc = get_doc(en_vocab, words) |     doc = get_doc(en_vocab, words) | ||||||
|     matcher = PhraseMatcher(en_vocab, [doc]) |     matcher = PhraseMatcher(en_vocab) | ||||||
|  |     matcher.add('COMPANY', None, doc) | ||||||
|     words = ["I", "like", "Google", "Now", "best"] |     words = ["I", "like", "Google", "Now", "best"] | ||||||
|     doc = get_doc(en_vocab, words) |     doc = get_doc(en_vocab, words) | ||||||
|     assert len(matcher(doc)) == 1 |     assert len(matcher(doc)) == 1 | ||||||
|  |  | ||||||
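The matcher tests above switch to the new `PhraseMatcher` API: the matcher is constructed empty and patterns are registered via `add(key, callback, *docs)`, the same signature as `Matcher.add`. A sketch outside the test helpers, assuming the v2-style `Doc(vocab, words=...)` constructor:

    from spacy.lang.en import English
    from spacy.matcher import PhraseMatcher
    from spacy.tokens import Doc

    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add('COMPANY', None, Doc(nlp.vocab, words=[u'Google', u'Now']))

    doc = Doc(nlp.vocab, words=[u'I', u'like', u'Google', u'Now', u'best'])
    matches = matcher(doc)   # one (match_id, start, end) triple over tokens 2-4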
|  | @ -9,7 +9,8 @@ from .util import get_doc | ||||||
| 
 | 
 | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| import pytest | import pytest | ||||||
| from thinc.neural import Maxout, Softmax | from thinc.neural._classes.maxout import Maxout | ||||||
|  | from thinc.neural._classes.softmax import Softmax | ||||||
| from thinc.api import chain | from thinc.api import chain | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -1,6 +1,7 @@ | ||||||
| # coding: utf-8 | # coding: utf-8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
|  | import sys | ||||||
| import pytest | import pytest | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -37,9 +38,10 @@ def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length): | ||||||
|     tokens = tokenizer(text) |     tokens = tokenizer(text) | ||||||
|     assert len(tokens) == length |     assert len(tokens) == length | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
| @pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8), | @pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8), | ||||||
|                                          ('i💙you', 3), ('🤘🤘yay!', 4)]) |                                          ('i💙you', 3), ('🤘🤘yay!', 4)]) | ||||||
| def test_tokenizer_handles_emoji(tokenizer, text, length): | def test_tokenizer_handles_emoji(tokenizer, text, length): | ||||||
|     tokens = tokenizer(text) |     # These break on narrow unicode builds, e.g. Windows | ||||||
|     assert len(tokens) == length |     if sys.maxunicode >= 1114111: | ||||||
|  |         tokens = tokenizer(text) | ||||||
|  |         assert len(tokens) == length | ||||||
|  |  | ||||||
|  | @ -54,7 +54,7 @@ cdef class Doc: | ||||||
| 
 | 
 | ||||||
|     cdef public object noun_chunks_iterator |     cdef public object noun_chunks_iterator | ||||||
| 
 | 
 | ||||||
|     cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1 |     cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1 | ||||||
| 
 | 
 | ||||||
|     cpdef np.ndarray to_array(self, object features) |     cpdef np.ndarray to_array(self, object features) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -660,7 +660,7 @@ cdef class Doc: | ||||||
|         """ |         """ | ||||||
|         with path.open('rb') as file_: |         with path.open('rb') as file_: | ||||||
|             bytes_data = file_.read() |             bytes_data = file_.read() | ||||||
|         self.from_bytes(bytes_data, **exclude) |         return self.from_bytes(bytes_data, **exclude) | ||||||
| 
 | 
 | ||||||
|     def to_bytes(self, **exclude): |     def to_bytes(self, **exclude): | ||||||
|         """Serialize, i.e. export the document contents to a binary string. |         """Serialize, i.e. export the document contents to a binary string. | ||||||
|  |  | ||||||
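The one-line change above makes `Doc.from_disk` pass through the return value of `from_bytes`, so deserialization calls can be chained. A small round-trip sketch using the byte-level API shown above:

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    vocab = Vocab()
    doc = Doc(vocab, words=[u'hello', u'world'])
    # from_bytes() returns the Doc, and from_disk() now forwards that value.
    doc2 = Doc(vocab).from_bytes(doc.to_bytes())
    assert [t.text for t in doc2] == [u'hello', u'world']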
|  | @ -3,7 +3,7 @@ from __future__ import unicode_literals, print_function | ||||||
| 
 | 
 | ||||||
| import os | import os | ||||||
| import ujson | import ujson | ||||||
| import pip | import pkg_resources | ||||||
| import importlib | import importlib | ||||||
| import regex as re | import regex as re | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
|  | @ -14,6 +14,7 @@ import numpy | ||||||
| import io | import io | ||||||
| import dill | import dill | ||||||
| from collections import OrderedDict | from collections import OrderedDict | ||||||
|  | from thinc.neural._classes.model import Model | ||||||
| 
 | 
 | ||||||
| import msgpack | import msgpack | ||||||
| import msgpack_numpy | import msgpack_numpy | ||||||
|  | @ -180,9 +181,10 @@ def is_package(name): | ||||||
|     name (unicode): Name of package. |     name (unicode): Name of package. | ||||||
|     RETURNS (bool): True if installed package, False if not. |     RETURNS (bool): True if installed package, False if not. | ||||||
|     """ |     """ | ||||||
|     packages = pip.get_installed_distributions() |     name = name.lower()  # compare package name against lowercase name | ||||||
|  |     packages = pkg_resources.working_set.by_key.keys() | ||||||
|     for package in packages: |     for package in packages: | ||||||
|         if package.project_name.replace('-', '_') == name: |         if package.lower().replace('-', '_') == name: | ||||||
|             return True |             return True | ||||||
|     return False |     return False | ||||||
| 
 | 
 | ||||||
|  | @ -193,6 +195,7 @@ def get_package_path(name): | ||||||
|     name (unicode): Package name. |     name (unicode): Package name. | ||||||
|     RETURNS (Path): Path to installed package. |     RETURNS (Path): Path to installed package. | ||||||
|     """ |     """ | ||||||
|  |     name = name.lower()  # use lowercase version to be safe | ||||||
|     # Here we're importing the module just to find it. This is worryingly |     # Here we're importing the module just to find it. This is worryingly | ||||||
|     # indirect, but it's otherwise very difficult to find the package. |     # indirect, but it's otherwise very difficult to find the package. | ||||||
|     pkg = importlib.import_module(name) |     pkg = importlib.import_module(name) | ||||||
|  | @ -557,3 +560,17 @@ def minify_html(html): | ||||||
|     RETURNS (unicode): "Minified" HTML. |     RETURNS (unicode): "Minified" HTML. | ||||||
|     """ |     """ | ||||||
|     return html.strip().replace('    ', '').replace('\n', '') |     return html.strip().replace('    ', '').replace('\n', '') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def use_gpu(gpu_id): | ||||||
|  |     try: | ||||||
|  |         import cupy.cuda.device | ||||||
|  |     except ImportError: | ||||||
|  |         return None | ||||||
|  |     from thinc.neural.ops import CupyOps | ||||||
|  |     device = cupy.cuda.device.Device(gpu_id) | ||||||
|  |     device.use() | ||||||
|  |     Model.ops = CupyOps() | ||||||
|  |     Model.Ops = CupyOps | ||||||
|  |     return device | ||||||
|  | 
 | ||||||
|  |  | ||||||
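The new `util.use_gpu` helper switches thinc's global `Model.ops` to `CupyOps` and returns the CuPy device, or `None` when CuPy is not importable, so callers can fall back to the CPU. A usage sketch:

    from spacy import util

    device = util.use_gpu(0)
    if device is None:
        print("CuPy not available, staying on CPU")
    else:
        print("Thinc ops switched to CupyOps on", device)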
|  | @ -6,6 +6,8 @@ import msgpack | ||||||
| import msgpack_numpy | import msgpack_numpy | ||||||
| msgpack_numpy.patch() | msgpack_numpy.patch() | ||||||
| cimport numpy as np | cimport numpy as np | ||||||
|  | from thinc.neural.util import get_array_module | ||||||
|  | from thinc.neural._classes.model import Model | ||||||
| 
 | 
 | ||||||
| from .typedefs cimport attr_t | from .typedefs cimport attr_t | ||||||
| from .strings cimport StringStore | from .strings cimport StringStore | ||||||
|  | @ -14,15 +16,29 @@ from .compat import basestring_ | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef class Vectors: | cdef class Vectors: | ||||||
|     '''Store, save and load word vectors.''' |     '''Store, save and load word vectors. | ||||||
|  | 
 | ||||||
|  |     Vectors data is kept in the vectors.data attribute, which should be an | ||||||
|  |     instance of numpy.ndarray (for CPU vectors) | ||||||
|  |     or cupy.ndarray (for GPU vectors). | ||||||
|  | 
 | ||||||
|  |     vectors.key2row is a dictionary mapping word hashes to rows | ||||||
|  |     in the vectors.data table. The array `vectors.keys` keeps | ||||||
|  |     the keys in order, such that keys[vectors.key2row[key]] == key. | ||||||
|  |     ''' | ||||||
|     cdef public object data |     cdef public object data | ||||||
|     cdef readonly StringStore strings |     cdef readonly StringStore strings | ||||||
|     cdef public object key2row |     cdef public object key2row | ||||||
|     cdef public object keys |     cdef public object keys | ||||||
|     cdef public int i |     cdef public int i | ||||||
| 
 | 
 | ||||||
|     def __init__(self, strings, data_or_width): |     def __init__(self, strings, data_or_width=0): | ||||||
|         self.strings = StringStore() |         if isinstance(strings, StringStore): | ||||||
|  |             self.strings = strings | ||||||
|  |         else: | ||||||
|  |             self.strings = StringStore() | ||||||
|  |             for string in strings: | ||||||
|  |                 self.strings.add(string) | ||||||
|         if isinstance(data_or_width, int): |         if isinstance(data_or_width, int): | ||||||
|             self.data = data = numpy.zeros((len(strings), data_or_width), |             self.data = data = numpy.zeros((len(strings), data_or_width), | ||||||
|                                            dtype='f') |                                            dtype='f') | ||||||
|  | @ -37,6 +53,11 @@ cdef class Vectors: | ||||||
|         return (Vectors, (self.strings, self.data)) |         return (Vectors, (self.strings, self.data)) | ||||||
| 
 | 
 | ||||||
|     def __getitem__(self, key): |     def __getitem__(self, key): | ||||||
|  |         '''Get a vector by key. If key is a string, it is hashed | ||||||
|  |         to an integer ID using the vectors.strings table. | ||||||
|  | 
 | ||||||
|  |         If the integer key is not found in the table, a KeyError is raised. | ||||||
|  |         ''' | ||||||
|         if isinstance(key, basestring): |         if isinstance(key, basestring): | ||||||
|             key = self.strings[key] |             key = self.strings[key] | ||||||
|         i = self.key2row[key] |         i = self.key2row[key] | ||||||
|  | @ -46,23 +67,30 @@ cdef class Vectors: | ||||||
|             return self.data[i] |             return self.data[i] | ||||||
| 
 | 
 | ||||||
|     def __setitem__(self, key, vector): |     def __setitem__(self, key, vector): | ||||||
|  |         '''Set a vector for the given key. If key is a string, it is hashed | ||||||
|  |         to an integer ID using the vectors.strings table. | ||||||
|  |         ''' | ||||||
|         if isinstance(key, basestring): |         if isinstance(key, basestring): | ||||||
|             key = self.strings.add(key) |             key = self.strings.add(key) | ||||||
|         i = self.key2row[key] |         i = self.key2row[key] | ||||||
|         self.data[i] = vector |         self.data[i] = vector | ||||||
| 
 | 
 | ||||||
|     def __iter__(self): |     def __iter__(self): | ||||||
|  |         '''Yield vectors from the table.''' | ||||||
|         yield from self.data |         yield from self.data | ||||||
| 
 | 
 | ||||||
|     def __len__(self): |     def __len__(self): | ||||||
|  |         '''Return the number of vectors that have been assigned.''' | ||||||
|         return self.i |         return self.i | ||||||
| 
 | 
 | ||||||
|     def __contains__(self, key): |     def __contains__(self, key): | ||||||
|  |         '''Check whether a key has a vector entry in the table.''' | ||||||
|         if isinstance(key, basestring_): |         if isinstance(key, basestring_): | ||||||
|             key = self.strings[key] |             key = self.strings[key] | ||||||
|         return key in self.key2row |         return key in self.key2row | ||||||
| 
 | 
 | ||||||
|     def add(self, key, vector=None): |     def add(self, key, vector=None): | ||||||
|  |         '''Add a key to the table, optionally setting a vector value as well.''' | ||||||
|         if isinstance(key, basestring_): |         if isinstance(key, basestring_): | ||||||
|             key = self.strings.add(key) |             key = self.strings.add(key) | ||||||
|         if key not in self.key2row: |         if key not in self.key2row: | ||||||
|  | @ -80,7 +108,9 @@ cdef class Vectors: | ||||||
|         return i |         return i | ||||||
| 
 | 
 | ||||||
|     def items(self): |     def items(self): | ||||||
|         for i, string in enumerate(self.strings): |         '''Iterate over (string key, vector) pairs, in order.''' | ||||||
|  |         for i, key in enumerate(self.keys): | ||||||
|  |             string = self.strings[key] | ||||||
|             yield string, self.data[i] |             yield string, self.data[i] | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|  | @ -118,9 +148,14 @@ cdef class Vectors: | ||||||
|             self.data |             self.data | ||||||
| 
 | 
 | ||||||
|     def to_disk(self, path, **exclude): |     def to_disk(self, path, **exclude): | ||||||
|  |         xp = get_array_module(self.data) | ||||||
|  |         if xp is numpy: | ||||||
|  |             save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False) | ||||||
|  |         else: | ||||||
|  |             save_array = lambda arr, file_: xp.save(file_, arr) | ||||||
|         serializers = OrderedDict(( |         serializers = OrderedDict(( | ||||||
|             ('vectors', lambda p: numpy.save(p.open('wb'), self.data, allow_pickle=False)), |             ('vectors', lambda p: save_array(self.data, p.open('wb'))), | ||||||
|             ('keys', lambda p: numpy.save(p.open('wb'), self.keys, allow_pickle=False)), |             ('keys', lambda p: xp.save(p.open('wb'), self.keys)) | ||||||
|         )) |         )) | ||||||
|         return util.to_disk(path, serializers, exclude) |         return util.to_disk(path, serializers, exclude) | ||||||
| 
 | 
 | ||||||
|  | @ -133,8 +168,9 @@ cdef class Vectors: | ||||||
|                     self.key2row[key] = i |                     self.key2row[key] = i | ||||||
| 
 | 
 | ||||||
|         def load_vectors(path): |         def load_vectors(path): | ||||||
|  |             xp = Model.ops.xp | ||||||
|             if path.exists(): |             if path.exists(): | ||||||
|                 self.data = numpy.load(path) |                 self.data = xp.load(path) | ||||||
| 
 | 
 | ||||||
|         serializers = OrderedDict(( |         serializers = OrderedDict(( | ||||||
|             ('keys', load_keys), |             ('keys', load_keys), | ||||||
|  |  | ||||||
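The expanded docstrings above describe the `Vectors` layout: the raw table lives in `vectors.data`, and `key2row` maps string hashes to rows. A sketch of the add/lookup cycle, under the assumption that `add()` writes the optional vector into its row as the new docstring states:

    import numpy
    from spacy.vectors import Vectors

    # Two keys, three dimensions; keys are registered as rows via add().
    vectors = Vectors([u'apple', u'orange'], 3)
    vectors.add(u'apple', vector=numpy.asarray([1., 2., 3.], dtype='f'))
    vectors.add(u'orange', vector=numpy.asarray([-1., 0., 1.], dtype='f'))

    assert u'apple' in vectors        # the hash now has an entry in key2row
    print(vectors[u'apple'])          # row looked up via key2row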
|  | @ -27,6 +27,7 @@ from .vectors import Vectors | ||||||
| from . import util | from . import util | ||||||
| from . import attrs | from . import attrs | ||||||
| from . import symbols | from . import symbols | ||||||
|  | from ._ml import link_vectors_to_models | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef class Vocab: | cdef class Vocab: | ||||||
|  | @ -65,7 +66,7 @@ cdef class Vocab: | ||||||
|                 self.strings.add(name) |                 self.strings.add(name) | ||||||
|         self.lex_attr_getters = lex_attr_getters |         self.lex_attr_getters = lex_attr_getters | ||||||
|         self.morphology = Morphology(self.strings, tag_map, lemmatizer) |         self.morphology = Morphology(self.strings, tag_map, lemmatizer) | ||||||
|         self.vectors = Vectors(self.strings, 300) |         self.vectors = Vectors(self.strings) | ||||||
| 
 | 
 | ||||||
|     property lang: |     property lang: | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|  | @ -261,7 +262,7 @@ cdef class Vocab: | ||||||
|         Words can be looked up by string or int ID. |         Words can be looked up by string or int ID. | ||||||
| 
 | 
 | ||||||
|         RETURNS: |         RETURNS: | ||||||
|             A word vector. Size and shape determed by the |             A word vector. Size and shape determined by the | ||||||
|             vocab.vectors instance. Usually, a numpy ndarray |             vocab.vectors instance. Usually, a numpy ndarray | ||||||
|             of shape (300,) and dtype float32. |             of shape (300,) and dtype float32. | ||||||
| 
 | 
 | ||||||
|  | @ -323,6 +324,7 @@ cdef class Vocab: | ||||||
|             self.lexemes_from_bytes(file_.read()) |             self.lexemes_from_bytes(file_.read()) | ||||||
|         if self.vectors is not None: |         if self.vectors is not None: | ||||||
|             self.vectors.from_disk(path, exclude='strings.json') |             self.vectors.from_disk(path, exclude='strings.json') | ||||||
|  |         link_vectors_to_models(self) | ||||||
|         return self |         return self | ||||||
| 
 | 
 | ||||||
|     def to_bytes(self, **exclude): |     def to_bytes(self, **exclude): | ||||||
|  | @ -436,6 +438,7 @@ def unpickle_vocab(sstore, morphology, data_dir, | ||||||
|     vocab.lex_attr_getters = lex_attr_getters |     vocab.lex_attr_getters = lex_attr_getters | ||||||
|     vocab.lexemes_from_bytes(lexemes_data) |     vocab.lexemes_from_bytes(lexemes_data) | ||||||
|     vocab.length = length |     vocab.length = length | ||||||
|  |     link_vectors_to_models(vocab) | ||||||
|     return vocab |     return vocab | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -17,6 +17,7 @@ fi | ||||||
| 
 | 
 | ||||||
| if [ "${VIA}" == "compile" ]; then | if [ "${VIA}" == "compile" ]; then | ||||||
|   pip install -r requirements.txt |   pip install -r requirements.txt | ||||||
|  |   python setup.py build_ext --inplace | ||||||
|   pip install -e . |   pip install -e . | ||||||
| fi | fi | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -8,4 +8,5 @@ include _includes/_mixins | ||||||
|         | does not exist! |         | does not exist! | ||||||
| 
 | 
 | ||||||
|     h2.c-landing__title.u-heading-3.u-padding-small |     h2.c-landing__title.u-heading-3.u-padding-small | ||||||
|         a(href="javascript:history.go(-1)") Click here to go back. |         +button(false, true, "secondary-light")(href="javascript:history.go(-1)") | ||||||
|  |             |  Click here to go back | ||||||
|  |  | ||||||
|  | @ -3,24 +3,22 @@ | ||||||
|         "landing": true, |         "landing": true, | ||||||
|         "logos": [ |         "logos": [ | ||||||
|             { |             { | ||||||
|                 "quora": [ "https://www.quora.com", 150 ], |                 "airbnb": [ "https://www.airbnb.com", 150, 45], | ||||||
|                 "chartbeat": [ "https://chartbeat.com", 200 ], |                 "quora": [ "https://www.quora.com", 120, 34 ], | ||||||
|                 "duedil": [ "https://www.duedil.com", 150 ], |                 "retriever": [ "https://www.retriever.no", 150, 33 ], | ||||||
|                 "stitchfix": [ "https://www.stitchfix.com", 190 ] |                 "stitchfix": [ "https://www.stitchfix.com", 150, 18 ] | ||||||
|             }, |             }, | ||||||
|             { |             { | ||||||
|                 "wayblazer": [ "http://wayblazer.com", 200 ], |                 "chartbeat": [ "https://chartbeat.com", 180, 25 ], | ||||||
|                 "indico": [ "https://indico.io", 150 ], |                 "allenai": [ "https://allenai.org", 220, 37 ] | ||||||
|                 "chattermill": [ "https://chattermill.io", 175 ], |             } | ||||||
|                 "turi": [ "https://turi.com", 150 ], |         ], | ||||||
|                 "kip": [ "http://kipthis.com", 70 ] |         "features": [ | ||||||
|             }, |  | ||||||
|             { |             { | ||||||
|                 "socrata": [ "https://www.socrata.com", 150 ], |                 "thoughtworks": ["https://www.thoughtworks.com/radar/tools", 150, 28], | ||||||
|                 "cytora": [ "http://www.cytora.com", 125 ], |                 "wapo": ["https://www.washingtonpost.com/news/wonk/wp/2016/05/18/googles-new-artificial-intelligence-cant-understand-these-sentences-can-you/", 100, 77], | ||||||
|                 "signaln": [ "http://signaln.com", 150 ], |                 "venturebeat": ["https://venturebeat.com/2017/01/27/4-ai-startups-that-analyze-customer-reviews/", 150, 19], | ||||||
|                 "wonderflow": [ "http://www.wonderflow.co", 200 ], |                 "microsoft": ["https://www.microsoft.com/developerblog/2016/09/13/training-a-classifier-for-relation-extraction-from-medical-literature/", 130, 28] | ||||||
|                 "synapsify": [ "http://www.gosynapsify.com", 150 ] |  | ||||||
|             } |             } | ||||||
|         ] |         ] | ||||||
|     }, |     }, | ||||||
|  | @ -34,7 +32,24 @@ | ||||||
|         "landing": true |         "landing": true | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     "announcement" : { |     "styleguide": { | ||||||
|         "title": "Important Announcement" |         "title": "Styleguide", | ||||||
|  |         "sidebar": { | ||||||
|  |             "Styleguide": { "": "styleguide" }, | ||||||
|  |             "Resources": { | ||||||
|  |                 "Website Source": "https://github.com/explosion/spacy/tree/master/website", | ||||||
|  |                 "Contributing Guide": "https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md" | ||||||
|  |             } | ||||||
|  |         }, | ||||||
|  |         "menu": { | ||||||
|  |             "Introduction": "intro", | ||||||
|  |             "Logo": "logo", | ||||||
|  |             "Colors": "colors", | ||||||
|  |             "Typography": "typography", | ||||||
|  |             "Elements": "elements", | ||||||
|  |             "Components": "components", | ||||||
|  |             "Embeds": "embeds", | ||||||
|  |             "Markup Reference": "markup" | ||||||
|  |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | @ -11,12 +11,9 @@ | ||||||
|         "COMPANY": "Explosion AI", |         "COMPANY": "Explosion AI", | ||||||
|         "COMPANY_URL": "https://explosion.ai", |         "COMPANY_URL": "https://explosion.ai", | ||||||
|         "DEMOS_URL": "https://demos.explosion.ai", |         "DEMOS_URL": "https://demos.explosion.ai", | ||||||
|  |         "MODELS_REPO": "explosion/spacy-models", | ||||||
| 
 | 
 | ||||||
|         "SPACY_VERSION": "1.8", |         "SPACY_VERSION": "2.0", | ||||||
|         "LATEST_NEWS": { |  | ||||||
|             "url": "https://github.com/explosion/spaCy/releases/tag/v2.0.0-alpha", |  | ||||||
|             "title": "Test spaCy v2.0.0 alpha!" |  | ||||||
|         }, |  | ||||||
| 
 | 
 | ||||||
|         "SOCIAL": { |         "SOCIAL": { | ||||||
|             "twitter": "spacy_io", |             "twitter": "spacy_io", | ||||||
|  | @ -27,25 +24,23 @@ | ||||||
|         }, |         }, | ||||||
| 
 | 
 | ||||||
|         "NAVIGATION": { |         "NAVIGATION": { | ||||||
|             "Home": "/", |             "Usage": "/usage", | ||||||
|             "Usage": "/docs/usage", |             "Models": "/models", | ||||||
|             "Reference": "/docs/api", |             "API": "/api" | ||||||
|             "Demos": "/docs/usage/showcase", |  | ||||||
|             "Blog": "https://explosion.ai/blog" |  | ||||||
|         }, |         }, | ||||||
| 
 | 
 | ||||||
|         "FOOTER": { |         "FOOTER": { | ||||||
|             "spaCy": { |             "spaCy": { | ||||||
|                 "Usage": "/docs/usage", |                 "Usage": "/usage", | ||||||
|                 "API Reference": "/docs/api", |                 "Models": "/models", | ||||||
|                 "Tutorials": "/docs/usage/tutorials", |                 "API Reference": "/api", | ||||||
|                 "Showcase": "/docs/usage/showcase" |                 "Resources": "/usage/resources" | ||||||
|             }, |             }, | ||||||
|             "Support": { |             "Support": { | ||||||
|                 "Issue Tracker": "https://github.com/explosion/spaCy/issues", |                 "Issue Tracker": "https://github.com/explosion/spaCy/issues", | ||||||
|                 "StackOverflow": "http://stackoverflow.com/questions/tagged/spacy", |                 "StackOverflow": "http://stackoverflow.com/questions/tagged/spacy", | ||||||
|                 "Reddit usergroup": "https://www.reddit.com/r/spacynlp/", |                 "Reddit Usergroup": "https://www.reddit.com/r/spacynlp/", | ||||||
|                 "Gitter chat": "https://gitter.im/explosion/spaCy" |                 "Gitter Chat": "https://gitter.im/explosion/spaCy" | ||||||
|             }, |             }, | ||||||
|             "Connect": { |             "Connect": { | ||||||
|                 "Twitter": "https://twitter.com/spacy_io", |                 "Twitter": "https://twitter.com/spacy_io", | ||||||
|  | @ -74,21 +69,11 @@ | ||||||
|                 {"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" }, |                 {"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" }, | ||||||
|                 {"id": "gpu", "title": "GPU", "help": "Run spaCy on GPU to make it faster. Requires an NVDIA graphics card with CUDA 2+. See section below for more info."}] |                 {"id": "gpu", "title": "GPU", "help": "Run spaCy on GPU to make it faster. Requires an NVDIA graphics card with CUDA 2+. See section below for more info."}] | ||||||
|             }, |             }, | ||||||
|             { "id": "model", "title": "Models", "multiple": true, "options": [ |             { "id": "model", "title": "Models", "multiple": true } | ||||||
|                 { "id": "en", "title": "English", "meta": "50MB" }, |  | ||||||
|                 { "id": "de", "title": "German", "meta": "645MB" }, |  | ||||||
|                 { "id": "fr", "title": "French", "meta": "1.33GB" }, |  | ||||||
|                 { "id": "es", "title": "Spanish", "meta": "377MB"}] |  | ||||||
|             } |  | ||||||
|         ], |         ], | ||||||
| 
 | 
 | ||||||
|         "QUICKSTART_MODELS": [ |         "QUICKSTART_MODELS": [ | ||||||
|             { "id": "lang", "title": "Language", "options": [ |             { "id": "lang", "title": "Language"}, | ||||||
|                 { "id": "en", "title": "English", "checked": true }, |  | ||||||
|                 { "id": "de", "title": "German" }, |  | ||||||
|                 { "id": "fr", "title": "French" }, |  | ||||||
|                 { "id": "es", "title": "Spanish" }] |  | ||||||
|             }, |  | ||||||
|             { "id": "load", "title": "Loading style", "options": [ |             { "id": "load", "title": "Loading style", "options": [ | ||||||
|                 { "id": "spacy", "title": "Use spacy.load()", "checked": true, "help": "Use spaCy's built-in loader to load the model by name." }, |                 { "id": "spacy", "title": "Use spacy.load()", "checked": true, "help": "Use spaCy's built-in loader to load the model by name." }, | ||||||
|                 {  "id": "module", "title": "Import as module", "help": "Import the model explicitly as a Python module." }] |                 {  "id": "module", "title": "Import as module", "help": "Import the model explicitly as a Python module." }] | ||||||
|  | @ -98,50 +83,15 @@ | ||||||
|             } |             } | ||||||
|         ], |         ], | ||||||
| 
 | 
 | ||||||
|         "MODELS": { |  | ||||||
|             "en": [ |  | ||||||
|                 { "id": "en_core_web_sm", "lang": "English", "feats": [1, 1, 1, 1], "size": "50 MB", "license": "CC BY-SA", "def": true }, |  | ||||||
|                 { "id": "en_core_web_md", "lang": "English", "feats": [1, 1, 1, 1], "size": "1 GB", "license": "CC BY-SA" }, |  | ||||||
|                 { "id": "en_depent_web_md", "lang": "English", "feats": [1, 1, 1, 0], "size": "328 MB", "license": "CC BY-SA" }, |  | ||||||
|                 { "id": "en_vectors_glove_md", "lang": "English", "feats": [1, 0, 0, 1], "size": "727 MB", "license": "CC BY-SA" } |  | ||||||
|             ], |  | ||||||
|             "de": [ |  | ||||||
|                 { "id": "de_core_news_md", "lang": "German", "feats": [1, 1, 1, 1], "size": "645 MB", "license": "CC BY-SA" } |  | ||||||
|             ], |  | ||||||
|             "fr": [ |  | ||||||
|                 { "id": "fr_depvec_web_lg", "lang": "French", "feats": [1, 1, 0, 1], "size": "1.33 GB", "license": "CC BY-NC" } |  | ||||||
|             ], |  | ||||||
|             "es": [ |  | ||||||
|                 { "id": "es_core_web_md", "lang": "Spanish", "feats": [1, 1, 1, 1], "size": "377 MB", "license": "CC BY-SA"} |  | ||||||
|             ] |  | ||||||
|         }, |  | ||||||
| 
 |  | ||||||
|         "EXAMPLE_SENTENCES": { |  | ||||||
|             "en": "This is a sentence.", |  | ||||||
|             "de": "Dies ist ein Satz.", |  | ||||||
|             "fr": "C'est une phrase.", |  | ||||||
|             "es": "Esto es una frase." |  | ||||||
|         }, |  | ||||||
| 
 |  | ||||||
|         "ALPHA": true, |         "ALPHA": true, | ||||||
|         "V_CSS": "1.6", |         "V_CSS": "2.0", | ||||||
|         "V_JS": "1.2", |         "V_JS": "2.0", | ||||||
|         "DEFAULT_SYNTAX": "python", |         "DEFAULT_SYNTAX": "python", | ||||||
|         "ANALYTICS": "UA-58931649-1", |         "ANALYTICS": "UA-58931649-1", | ||||||
|         "MAILCHIMP": { |         "MAILCHIMP": { | ||||||
|             "user": "spacy.us12", |             "user": "spacy.us12", | ||||||
|             "id": "83b0498b1e7fa3c91ce68c3f1", |             "id": "83b0498b1e7fa3c91ce68c3f1", | ||||||
|             "list": "89ad33e698" |             "list": "89ad33e698" | ||||||
|         }, |  | ||||||
|         "BADGES": { |  | ||||||
|             "pipy": { |  | ||||||
|                 "badge": "https://img.shields.io/pypi/v/spacy.svg?style=flat-square", |  | ||||||
|                 "link": "https://pypi.python.org/pypi/spacy" |  | ||||||
|             }, |  | ||||||
|             "conda": { |  | ||||||
|                 "badge": "https://anaconda.org/conda-forge/spacy/badges/version.svg", |  | ||||||
|                 "link": "https://anaconda.org/conda-forge/spacy" |  | ||||||
|             } |  | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | @ -1,8 +1,6 @@ | ||||||
| //- 💫 INCLUDES > FOOTER | //- 💫 INCLUDES > FOOTER | ||||||
| 
 | 
 | ||||||
| include _mixins | footer.o-footer.u-text | ||||||
| 
 |  | ||||||
| footer.o-footer.u-text.u-border-dotted |  | ||||||
|     +grid.o-content |     +grid.o-content | ||||||
|         each group, label in FOOTER |         each group, label in FOOTER | ||||||
|             +grid-col("quarter") |             +grid-col("quarter") | ||||||
|  | @ -13,18 +11,18 @@ footer.o-footer.u-text.u-border-dotted | ||||||
|                         li |                         li | ||||||
|                             +a(url)=item |                             +a(url)=item | ||||||
| 
 | 
 | ||||||
|         if SECTION != "docs" |         if SECTION == "index" | ||||||
|             +grid-col("quarter") |             +grid-col("quarter") | ||||||
|                 include _newsletter |                 include _newsletter | ||||||
| 
 | 
 | ||||||
|     if SECTION == "docs" |     if SECTION != "index" | ||||||
|         .o-content.o-block.u-border-dotted |         .o-content.o-block.u-border-dotted | ||||||
|             include _newsletter |             include _newsletter | ||||||
| 
 | 
 | ||||||
|     .o-inline-list.u-text-center.u-text-tiny.u-color-subtle |     .o-inline-list.u-text-center.u-text-tiny.u-color-subtle | ||||||
|         span © 2016-#{new Date().getFullYear()} #[+a(COMPANY_URL, true)=COMPANY] |         span © 2016-#{new Date().getFullYear()} #[+a(COMPANY_URL, true)=COMPANY] | ||||||
| 
 | 
 | ||||||
|         +a(COMPANY_URL, true) |         +a(COMPANY_URL, true)(aria-label="Explosion AI") | ||||||
|             +svg("graphics", "explosion", 45).o-icon.u-color-theme.u-grayscale |             +icon("explosion", 45).o-icon.u-color-theme.u-grayscale | ||||||
| 
 | 
 | ||||||
|         +a(COMPANY_URL + "/legal", true) Legal / Imprint |         +a(COMPANY_URL + "/legal", true) Legal / Imprint | ||||||
|  |  | ||||||
|  | @ -1,35 +1,71 @@ | ||||||
| //- 💫 INCLUDES > FUNCTIONS | //- 💫 INCLUDES > FUNCTIONS | ||||||
| 
 | 
 | ||||||
| //- More descriptive variables for current.path and current.source | //- Descriptive variables, available in the global scope | ||||||
| 
 | 
 | ||||||
| - CURRENT = current.source | - CURRENT = current.source | ||||||
| - SECTION = current.path[0] | - SECTION = current.path[0] | ||||||
| - SUBSECTION = current.path[1] | - LANGUAGES = public.models._data.LANGUAGES | ||||||
|  | - MODELS = public.models._data.MODELS | ||||||
|  | - CURRENT_MODELS = MODELS[current.source] || [] | ||||||
|  | 
 | ||||||
|  | - MODEL_COUNT = Object.keys(MODELS).map(m => Object.keys(MODELS[m]).length).reduce((a, b) => a + b) | ||||||
|  | - MODEL_LANG_COUNT = Object.keys(MODELS).length | ||||||
|  | - LANG_COUNT = Object.keys(LANGUAGES).length | ||||||
|  | 
 | ||||||
|  | - MODEL_META = public.models._data.MODEL_META | ||||||
|  | - MODEL_LICENSES = public.models._data.MODEL_LICENSES | ||||||
|  | - MODEL_ACCURACY = public.models._data.MODEL_ACCURACY | ||||||
|  | - EXAMPLE_SENTENCES = public.models._data.EXAMPLE_SENTENCES | ||||||
|  | 
 | ||||||
|  | - IS_PAGE = (SECTION != "index") && !landing | ||||||
|  | - IS_MODELS = (SECTION == "models" && LANGUAGES[current.source]) | ||||||
|  | - HAS_MODELS = IS_MODELS && CURRENT_MODELS.length | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Add prefixes to items of an array (for modifier CSS classes) | //- Add prefixes to items of an array (for modifier CSS classes) | ||||||
|  |     array   - [array] list of class names or options, e.g. ["foot"] | ||||||
|  |     prefix  - [string] prefix to add to each class, e.g. "c-table__row" | ||||||
|  |     RETURNS - [array] list of modified class names | ||||||
| 
 | 
 | ||||||
| -   function prefixArgs(array, prefix) { | -   function prefixArgs(array, prefix) { | ||||||
| -       return array.map(function(arg) { | -       return array.map(arg => prefix + '--' + arg).join(' '); | ||||||
| -           return prefix + '--' + arg; | -   } | ||||||
| -       }).join(' '); | 
 | ||||||
|  | 
 | ||||||
|  | //- Convert API paths (semi-temporary fix for renamed sections) | ||||||
|  |     path    - [string] link path supplied to +api mixin | ||||||
|  |     RETURNS - [string] new link path to correct location | ||||||
|  | 
 | ||||||
|  | -   function convertAPIPath(path) { | ||||||
|  | -       if (path.startsWith('spacy#') || path.startsWith('displacy#') || path.startsWith('util#')) { | ||||||
|  | -           var comps = path.split('#'); | ||||||
|  | -           return "top-level#" + comps[0] + '.' + comps[1]; | ||||||
|  | -       } | ||||||
|  | -       else if (path.startsWith('cli#')) { | ||||||
|  | -           return "top-level#" + path.split('#')[1]; | ||||||
|  | -       } | ||||||
|  | -       return path; | ||||||
|  | -   } | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | //- Get model components from ID. Components can then be looked up in LANGUAGES | ||||||
|  |     and MODEL_META respectively, to get their human-readable form. | ||||||
|  |     id      - [string] model ID, e.g. "en_core_web_sm" | ||||||
|  |     RETURNS - [object] object keyed by components lang, type, genre and size | ||||||
|  | 
 | ||||||
|  | -   function getModelComponents(id) { | ||||||
|  | -       var comps = id.split('_'); | ||||||
|  | -       return {'lang': comps[0], 'type': comps[1], 'genre': comps[2], 'size': comps[3]} | ||||||
| -   } | -   } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Generate GitHub links | //- Generate GitHub links | ||||||
|  |     repo     - [string] name of repo owned by explosion | ||||||
|  |     filepath - [string] logical path to file relative to repository root | ||||||
|  |     branch   - [string] optional branch, defaults to "master" | ||||||
|  |     RETURNS  - [string] the correct link to the file on GitHub | ||||||
| 
 | 
 | ||||||
| -   function gh(repo, filepath, branch) { | -   function gh(repo, filepath, branch) { | ||||||
| -       var branch = ALPHA ? 'develop' : branch | -       var branch = ALPHA ? 'develop' : branch | ||||||
| -       return 'https://github.com/' + SOCIAL.github + '/' + repo + (filepath ? '/blob/' + (branch || 'master') + '/' + filepath : '' ); | -       return 'https://github.com/' + SOCIAL.github + '/' + (repo || '') + (filepath ? '/blob/' + (branch || 'master') + '/' + filepath : '' ); | ||||||
| -   } |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| //- Get social images |  | ||||||
| 
 |  | ||||||
| -   function getSocialImg() { |  | ||||||
| -       var base = SITE_URL + '/assets/img/social/preview_' |  | ||||||
| -       var image = ALPHA ? 'alpha' : 'default' |  | ||||||
| -       if (preview) image = preview |  | ||||||
| -       else if (SECTION == 'docs' && !ALPHA) image = 'docs' |  | ||||||
| -       return base + image + '.jpg' |  | ||||||
| -   } | -   } | ||||||
|  |  | ||||||
|  | @ -1,5 +1,13 @@ | ||||||
| //- 💫 MIXINS > BASE | //- 💫 MIXINS > BASE | ||||||
| 
 | 
 | ||||||
|  | //- Section | ||||||
|  |     id - [string] anchor assigned to section (used for breadcrumb navigation) | ||||||
|  | 
 | ||||||
|  | mixin section(id) | ||||||
|  |     section.o-section(id="section-" + id data-section=id) | ||||||
|  |         block | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| //- Aside wrapper | //- Aside wrapper | ||||||
|     label - [string] aside label |     label - [string] aside label | ||||||
| 
 | 
 | ||||||
|  | @ -11,34 +19,26 @@ mixin aside-wrapper(label) | ||||||
| 
 | 
 | ||||||
|             block |             block | ||||||
| 
 | 
 | ||||||
| //- Date |  | ||||||
|     input - [string] date in the format YYYY-MM-DD |  | ||||||
| 
 | 
 | ||||||
| mixin date(input) | //- SVG from map (uses embedded SVG sprite) | ||||||
|     - var date = new Date(input) |  | ||||||
|     - var months = [ 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December' ] |  | ||||||
| 
 |  | ||||||
|     time(datetime=JSON.parse(JSON.stringify(date)))&attributes(attributes)=months[date.getMonth()] + ' ' + date.getDate() + ', ' + date.getFullYear() |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| //- SVG from map |  | ||||||
|     file   - [string] SVG file name in /assets/img/ |  | ||||||
|     name   - [string] SVG symbol id |     name   - [string] SVG symbol id | ||||||
|     width  - [integer] width in px |     width  - [integer] width in px | ||||||
|     height - [integer] height in px (default: same as width) |     height - [integer] height in px (default: same as width) | ||||||
| 
 | 
 | ||||||
| mixin svg(file, name, width, height) | mixin svg(name, width, height) | ||||||
|     svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes) |     svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes) | ||||||
|         use(xlink:href="/assets/img/#{file}.svg##{name}") |         use(xlink:href="#svg_#{name}") | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Icon | //- Icon | ||||||
|     name - [string] icon name, should be SVG symbol ID |     name   - [string] icon name (will be used as symbol id: #svg_{name}) | ||||||
|     size - [integer] icon width and height (default: 20) |     width  - [integer] icon width (default: 20) | ||||||
|  |     height - [integer] icon height (defaults to width) | ||||||
| 
 | 
 | ||||||
| mixin icon(name, size) | mixin icon(name, width, height) | ||||||
|     - var size = size || 20 |     - var width = width || 20 | ||||||
|     +svg("icons", name, size).o-icon(style="min-width: #{size}px")&attributes(attributes) |     - var height = height || width | ||||||
|  |     +svg(name, width, height).o-icon(style="min-width: #{width}px")&attributes(attributes) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Pro/Con/Neutral icon | //- Pro/Con/Neutral icon | ||||||
|  | @ -46,8 +46,8 @@ mixin icon(name, size) | ||||||
|     size - [integer] icon size (optional) |     size - [integer] icon size (optional) | ||||||
| 
 | 
 | ||||||
| mixin procon(icon, size) | mixin procon(icon, size) | ||||||
|     - colors = { pro: "green", con: "red", neutral: "yellow" } |     - colors = { pro: "green", con: "red", neutral: "subtle" } | ||||||
|     +icon(icon, size)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes) |     +icon("circle", size || 16)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Headlines Helper Mixin | //- Headlines Helper Mixin | ||||||
|  | @ -80,8 +80,7 @@ mixin headline(level) | ||||||
| 
 | 
 | ||||||
| mixin permalink(id) | mixin permalink(id) | ||||||
|     if id |     if id | ||||||
|         a.u-permalink(id=id href="##{id}") |         a.u-permalink(href="##{id}") | ||||||
|             +icon("anchor").u-permalink__icon |  | ||||||
|             block |             block | ||||||
| 
 | 
 | ||||||
|     else |     else | ||||||
|  | @ -109,7 +108,7 @@ mixin quickstart(groups, headline, description, hide_results) | ||||||
|                     .c-quickstart__fields |                     .c-quickstart__fields | ||||||
|                         for option in group.options |                         for option in group.options | ||||||
|                             input.c-quickstart__input(class="c-quickstart__input--" + (group.input_style ? group.input_style : group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id="qs-#{option.id}" value=option.id checked=option.checked) |                             input.c-quickstart__input(class="c-quickstart__input--" + (group.input_style ? group.input_style : group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id="qs-#{option.id}" value=option.id checked=option.checked) | ||||||
|                             label.c-quickstart__label(for="qs-#{option.id}")!=option.title |                             label.c-quickstart__label.u-text-tiny(for="qs-#{option.id}")!=option.title | ||||||
|                                 if option.meta |                                 if option.meta | ||||||
|                                     |  #[span.c-quickstart__label__meta (#{option.meta})] |                                     |  #[span.c-quickstart__label__meta (#{option.meta})] | ||||||
|                                 if option.help |                                 if option.help | ||||||
|  | @ -122,12 +121,10 @@ mixin quickstart(groups, headline, description, hide_results) | ||||||
|                 code.c-code-block__content.c-quickstart__code(data-qs-results="") |                 code.c-code-block__content.c-quickstart__code(data-qs-results="") | ||||||
|                     block |                     block | ||||||
| 
 | 
 | ||||||
|     .c-quickstart__info.u-text-tiny.o-block.u-text-right |  | ||||||
|         |  Like this widget? Check out #[+a("https://github.com/ines/quickstart").u-link quickstart.js]! |  | ||||||
| 
 |  | ||||||
| 
 | 
 | ||||||
| //- Quickstart code item | //- Quickstart code item | ||||||
|     data [object] - Rendering conditions (keyed by option group ID, value: option) |     data  - [object] Rendering conditions (keyed by option group ID, value: option) | ||||||
|  |     style - [string] modifier ID for line style | ||||||
| 
 | 
 | ||||||
| mixin qs(data, style) | mixin qs(data, style) | ||||||
|     - args = {} |     - args = {} | ||||||
|  | @ -148,6 +145,13 @@ mixin terminal(label) | ||||||
|         +code.x-terminal__code |         +code.x-terminal__code | ||||||
|             block |             block | ||||||
| 
 | 
 | ||||||
|  | //- Chart.js | ||||||
|  |     id - [string] chart ID, will be assigned as #chart_{id} | ||||||
|  | 
 | ||||||
|  | mixin chart(id) | ||||||
|  |     figure.o-block&attributes(attributes) | ||||||
|  |         canvas(id="chart_#{id}" width="800" height="400" style="max-width: 100%") | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| //- Gitter chat button and widget | //- Gitter chat button and widget | ||||||
|     button - [string] text shown on button |     button - [string] text shown on button | ||||||
|  | @ -156,26 +160,24 @@ mixin terminal(label) | ||||||
| mixin gitter(button, label) | mixin gitter(button, label) | ||||||
|     aside.js-gitter.c-chat.is-collapsed(data-title=(label || button)) |     aside.js-gitter.c-chat.is-collapsed(data-title=(label || button)) | ||||||
| 
 | 
 | ||||||
|     button.js-gitter-button.c-chat__button.u-text-small |     button.js-gitter-button.c-chat__button.u-text-tag | ||||||
|         +icon("chat").o-icon--inline |         +icon("chat", 16).o-icon--inline | ||||||
|         !=button |         !=button | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Badge | //- Badge | ||||||
|     name - [string] "pipy" or "conda" |     image - [string] path to badge image | ||||||
|  |     url   - [string] badge link | ||||||
| 
 | 
 | ||||||
| mixin badge(name) | mixin badge(image, url) | ||||||
|     - site = BADGES[name] |     +a(url).u-padding-small.u-hide-link&attributes(attributes) | ||||||
| 
 |         img.o-badge(src=image alt=url height="20") | ||||||
|     if site |  | ||||||
|         +a(site.link).u-padding-small |  | ||||||
|             img(src=site.badge alt="{name} version" height="20") |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Logo | //- spaCy logo | ||||||
| 
 | 
 | ||||||
| mixin logo() | mixin logo() | ||||||
|     +svg("graphics", "spacy", 675, 215).o-logo&attributes(attributes) |     +svg("spacy", 675, 215).o-logo&attributes(attributes) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Landing | //- Landing | ||||||
|  | @ -186,18 +188,56 @@ mixin landing-header() | ||||||
|             .c-landing__content |             .c-landing__content | ||||||
|                 block |                 block | ||||||
| 
 | 
 | ||||||
|  | mixin landing-banner(headline, label) | ||||||
|  |     .c-landing__banner.u-padding.o-block.u-color-light | ||||||
|  |         +grid.c-landing__banner__content.o-no-block | ||||||
|  |             +grid-col("third") | ||||||
|  |                 h3.u-heading.u-heading-1 | ||||||
|  |                     if label | ||||||
|  |                         div | ||||||
|  |                             span.u-text-label.u-text-label--light=label | ||||||
|  |                     !=headline | ||||||
| 
 | 
 | ||||||
| mixin landing-badge(url, graphic, alt, size) |             +grid-col("two-thirds").c-landing__banner__text | ||||||
|     +a(url)(aria-label=alt title=alt).c-landing__badge |                 block | ||||||
|         +svg("graphics", graphic, size || 225) | 
 | ||||||
|  | 
 | ||||||
|  | mixin landing-logos(title, logos) | ||||||
|  |     .o-content.u-text-center&attributes(attributes) | ||||||
|  |         h3.u-heading.u-text-label.u-color-dark=title | ||||||
|  | 
 | ||||||
|  |         each row, i in logos | ||||||
|  |             - var is_last = i == logos.length - 1 | ||||||
|  |             +grid("center").o-inline-list.o-no-block(class=is_last ? "o-no-block" : null) | ||||||
|  |                 each details, name in row | ||||||
|  |                     +a(details[0]).u-padding-medium | ||||||
|  |                         +icon(name, details[1], details[2]) | ||||||
|  | 
 | ||||||
|  |                 if is_last | ||||||
|  |                     block | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Under construction (temporary) | //- Under construction (temporary) | ||||||
|     Marks sections that still need to be completed for the v2.0 release. |     Marks sections that still need to be completed for the v2.0 release. | ||||||
| 
 | 
 | ||||||
| mixin under-construction() | mixin under-construction() | ||||||
|     +infobox("🚧 Under construction") |     +infobox("Under construction", "🚧") | ||||||
|         |  This section is still being written and will be updated for the v2.0 |         |  This section is still being written and will be updated for the v2.0 | ||||||
|         |  release. Is there anything that you think should definitely be mentioned or |         |  release. Is there anything that you think should definitely be mentioned or | ||||||
|         |  explained here? Any examples you'd like to see? #[strong Let us know] |         |  explained here? Any examples you'd like to see? #[strong Let us know] | ||||||
|         |  on the #[+a(gh("spacy") + "/issues/1105") v2.0 alpha thread] on GitHub! |         |  on the #[+a(gh("spacy") + "/issues/1105") v2.0 alpha thread] on GitHub! | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | //- Alpha infobox (temporary) | ||||||
|  |     Added in the templates to notify user that they're visiting the alpha site. | ||||||
|  | 
 | ||||||
|  | mixin alpha-info() | ||||||
|  |     +infobox("You are viewing the spaCy v2.0.0 alpha docs", "⚠️") | ||||||
|  |         strong This page is part of the alpha documentation for spaCy v2.0. | ||||||
|  |         |  It does not reflect the state of the latest stable release. | ||||||
|  |         |  Because v2.0 is still under development, the implementation | ||||||
|  |         |  may differ from the intended state described here. See the | ||||||
|  |         |  #[+a(gh("spaCy") + "/releases/tag/v2.0.0-alpha") release notes] | ||||||
|  |         |  for details on how to install and test the new version. To | ||||||
|  |         |  read the official docs for spaCy v1.x, | ||||||
|  |         |  #[+a("https://spacy.io/docs") go here]. | ||||||
|  |  | ||||||
|  | @ -8,11 +8,15 @@ include _mixins-base | ||||||
|     level - [integer] headline level, corresponds to h1, h2, h3 etc. |     level - [integer] headline level, corresponds to h1, h2, h3 etc. | ||||||
|     id    - [string] unique identifier, creates permalink (optional) |     id    - [string] unique identifier, creates permalink (optional) | ||||||
| 
 | 
 | ||||||
| mixin h(level, id) | mixin h(level, id, source) | ||||||
|     +headline(level).u-heading&attributes(attributes) |     +headline(level).u-heading(id=id)&attributes(attributes) | ||||||
|         +permalink(id) |         +permalink(id) | ||||||
|             block |             block | ||||||
| 
 | 
 | ||||||
|  |         if source | ||||||
|  |             +button(gh("spacy", source), false, "secondary", "small").u-nowrap.u-float-right | ||||||
|  |                 span Source #[+icon("code", 14).o-icon--inline] | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| //- External links | //- External links | ||||||
|     url     - [string] link href |     url     - [string] link href | ||||||
|  | @ -38,21 +42,23 @@ mixin src(url) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- API link (with added tag and automatically generated path) | //- API link (with added tag and automatically generated path) | ||||||
|     path - [string] path to API docs page relative to /docs/api/ |     path - [string] path to API docs page relative to /api/ | ||||||
| 
 | 
 | ||||||
| mixin api(path) | mixin api(path) | ||||||
|     +a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block.u-nowrap |     - path = convertAPIPath(path) | ||||||
|  |     +a("/api/" + path, true)(target="_self").u-no-border.u-inline-block.u-nowrap | ||||||
|         block |         block | ||||||
| 
 | 
 | ||||||
|         |  #[+icon("book", 18).o-icon--inline.u-color-theme] |         |  #[+icon("book", 16).o-icon--inline.u-color-theme] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Help icon with tooltip | //- Help icon with tooltip | ||||||
|     tooltip - [string] Tooltip text |     tooltip   - [string] Tooltip text | ||||||
|  |     icon_size - [integer] Optional size of help icon in px. | ||||||
| 
 | 
 | ||||||
| mixin help(tooltip) | mixin help(tooltip, icon_size) | ||||||
|     span(data-tooltip=tooltip)&attributes(attributes) |     span(data-tooltip=tooltip)&attributes(attributes) | ||||||
|         +icon("help", 16).i-icon--inline |         +icon("help", icon_size || 16).o-icon--inline | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Aside for text | //- Aside for text | ||||||
|  | @ -68,24 +74,43 @@ mixin aside(label) | ||||||
|     label    - [string] aside title (optional or false for no label) |     label    - [string] aside title (optional or false for no label) | ||||||
|     language - [string] language for syntax highlighting (default: "python") |     language - [string] language for syntax highlighting (default: "python") | ||||||
|                supports basic relevant languages available for PrismJS |                supports basic relevant languages available for PrismJS | ||||||
|  |     prompt   - [string] prompt displayed before first line, e.g. "$" | ||||||
| 
 | 
 | ||||||
| mixin aside-code(label, language) | mixin aside-code(label, language, prompt) | ||||||
|     +aside-wrapper(label) |     +aside-wrapper(label) | ||||||
|         +code(false, language).o-no-block |         +code(false, language, prompt).o-no-block | ||||||
|             block |             block | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Infobox | //- Infobox | ||||||
|     label - [string] infobox title (optional or false for no title) |     label - [string] infobox title (optional or false for no title) | ||||||
|  |     emoji - [string] optional emoji displayed before the title, passed as a | ||||||
|  |             separate argument so it can be wrapped in its own element for spacing | ||||||
| 
 | 
 | ||||||
| mixin infobox(label) | mixin infobox(label, emoji) | ||||||
|     aside.o-box.o-block.u-text-small |     aside.o-box.o-block.u-text-small | ||||||
|         if label |         if label | ||||||
|             h3.u-text-label.u-color-theme=label |             h3.u-heading.u-text-label.u-color-theme | ||||||
|  |                 if emoji | ||||||
|  |                     span.o-emoji=emoji | ||||||
|  |                 |  #{label} | ||||||
| 
 | 
 | ||||||
|         block |         block | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | //- Logos displayed in the top corner of some infoboxes | ||||||
|  |     logos - [array] List of icon ID, width, height and link. | ||||||
|  | 
 | ||||||
|  | mixin infobox-logos(...logos) | ||||||
|  |     .o-box__logos.u-text-right.u-float-right | ||||||
|  |         for logo in logos | ||||||
|  |             if logo[3] | ||||||
|  |                 |  #[+a(logo[3]).u-inline-block.u-hide-link.u-padding-small #[+icon(logo[0], logo[1], logo[2]).u-color-dark]] | ||||||
|  |             else | ||||||
|  |                 |  #[+icon(logo[0], logo[1], logo[2]).u-color-dark] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| //- Link button | //- Link button | ||||||
|     url      - [string] link href |     url      - [string] link href | ||||||
|     trusted  - [boolean] if not set / false, rel="noopener nofollow" is added |     trusted  - [boolean] if not set / false, rel="noopener nofollow" is added | ||||||
|  | @ -94,7 +119,7 @@ mixin infobox(label) | ||||||
|                see assets/css/_components/_buttons.sass |                see assets/css/_components/_buttons.sass | ||||||
| 
 | 
 | ||||||
| mixin button(url, trusted, ...style) | mixin button(url, trusted, ...style) | ||||||
|     - external = url.includes("http") |     - external = url && url.includes("http") | ||||||
|     a.c-button.u-text-label(href=url class=prefixArgs(style, "c-button") role="button" target=external ? "_blank" : null rel=external && !trusted ? "noopener nofollow" : null)&attributes(attributes) |     a.c-button.u-text-label(href=url class=prefixArgs(style, "c-button") role="button" target=external ? "_blank" : null rel=external && !trusted ? "noopener nofollow" : null)&attributes(attributes) | ||||||
|         block |         block | ||||||
| 
 | 
 | ||||||
|  | @ -103,31 +128,33 @@ mixin button(url, trusted, ...style) | ||||||
|     label    - [string] aside title (optional or false for no label) |     label    - [string] aside title (optional or false for no label) | ||||||
|     language - [string] language for syntax highlighting (default: "python") |     language - [string] language for syntax highlighting (default: "python") | ||||||
|                supports basic relevant languages available for PrismJS |                supports basic relevant languages available for PrismJS | ||||||
|     prompt    - [string] prompt or icon to display next to code block, (mostly used for old/new) |     prompt   - [string] prompt displayed before first line, e.g. "$" | ||||||
|     height   - [integer] optional height to clip code block to |     height   - [integer] optional height to clip code block to | ||||||
|  |     icon     - [string] icon displayed next to code block (e.g. "accept" for new code) | ||||||
|  |     wrap     - [boolean] wrap text and disable horizontal scrolling | ||||||
| 
 | 
 | ||||||
| mixin code(label, language, prompt, height) | mixin code(label, language, prompt, height, icon, wrap) | ||||||
|     pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes) |     pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes) | ||||||
|         if label |         if label | ||||||
|             h4.u-text-label.u-text-label--dark=label |             h4.u-text-label.u-text-label--dark=label | ||||||
|         - var icon = (prompt == 'accept' || prompt == 'reject') |         - var icon = icon || (prompt == 'accept' || prompt == 'reject') | ||||||
|         if icon |         if icon | ||||||
|             - var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'} |             - var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'} | ||||||
|             .c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null) |             .c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null) | ||||||
|                 +icon(icon, 18) |                 +icon(icon, 18) | ||||||
| 
 | 
 | ||||||
|         code.c-code-block__content(data-prompt=icon ? null : prompt) |         code.c-code-block__content(class=wrap ? "u-wrap" : null data-prompt=icon ? null : prompt) | ||||||
|             block |             block | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Code blocks to display old/new versions | //- Code blocks to display old/new versions | ||||||
| 
 | 
 | ||||||
| mixin code-old() | mixin code-old() | ||||||
|     +code(false, false, "reject").o-block-small |     +code(false, false, false, false, "reject").o-block-small | ||||||
|         block |         block | ||||||
| 
 | 
 | ||||||
| mixin code-new() | mixin code-new() | ||||||
|     +code(false, false, "accept").o-block-small |     +code(false, false, false, false, "accept").o-block-small | ||||||
|         block |         block | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -138,12 +165,33 @@ mixin code-new() | ||||||
| 
 | 
 | ||||||
| mixin codepen(slug, height, default_tab) | mixin codepen(slug, height, default_tab) | ||||||
|     figure.o-block(style="min-height: #{height}px")&attributes(attributes) |     figure.o-block(style="min-height: #{height}px")&attributes(attributes) | ||||||
|         .codepen(data-height=height data-theme-id="26467" data-slug-hash=slug data-default-tab=(default_tab || "result") data-embed-version="2" data-user=SOCIAL.codepen) |         .codepen(data-height=height data-theme-id="31335" data-slug-hash=slug data-default-tab=(default_tab || "result") data-embed-version="2" data-user=SOCIAL.codepen) | ||||||
|             +a("https://codepen.io/" + SOCIAL.codepen + "/" + slug) View on CodePen |             +a("https://codepen.io/" + SOCIAL.codepen + "/" + slug) View on CodePen | ||||||
| 
 | 
 | ||||||
|         script(async src="https://assets.codepen.io/assets/embed/ei.js") |         script(async src="https://assets.codepen.io/assets/embed/ei.js") | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | //- GitHub embed | ||||||
|  |     repo     - [string] repository owned by explosion organization | ||||||
|  |     file     - [string] logical path to file, relative to repository root | ||||||
|  |     alt_file - [string] alternative file path used in footer and link button | ||||||
|  |     height   - [integer] height of code preview in px | ||||||
|  | 
 | ||||||
|  | mixin github(repo, file, alt_file, height) | ||||||
|  |     - var branch = ALPHA ? "develop" : "master" | ||||||
|  |     - var height = height || 250 | ||||||
|  | 
 | ||||||
|  |     figure.o-block | ||||||
|  |         pre.c-code-block.o-block-small(class="lang-#{(language || DEFAULT_SYNTAX)}" style="height: #{height}px; min-height: #{height}px") | ||||||
|  |             code.c-code-block__content(data-gh-embed="#{repo}/#{branch}/#{file}") | ||||||
|  | 
 | ||||||
|  |         footer.o-grid.u-text | ||||||
|  |             .o-block-small.u-flex-full #[+icon("github")] #[code=repo + '/' + (alt_file || file)] | ||||||
|  |             div | ||||||
|  |                 +button(gh(repo, alt_file || file), false, "primary", "small") View on GitHub | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| //- Images / figures | //- Images / figures | ||||||
|     url     - [string] url or path to image |     url     - [string] url or path to image | ||||||
|     width   - [integer] image width in px, for better rendering (default: 500) |     width   - [integer] image width in px, for better rendering (default: 500) | ||||||
|  | @ -168,10 +216,26 @@ mixin image-caption() | ||||||
|         block |         block | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Label | //- Graphic or illustration with button | ||||||
|  |     original - [string] Path to original image | ||||||
|  | 
 | ||||||
|  | mixin graphic(original) | ||||||
|  |     +image | ||||||
|  |         block | ||||||
|  |         if original | ||||||
|  |             .u-text-right | ||||||
|  |                 +button(original, false, "secondary", "small") View large graphic | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | //- Labels | ||||||
| 
 | 
 | ||||||
| mixin label() | mixin label() | ||||||
|     .u-text-label.u-color-subtle&attributes(attributes) |     .u-text-label.u-color-dark&attributes(attributes) | ||||||
|  |         block | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | mixin label-inline() | ||||||
|  |     strong.u-text-label.u-color-dark&attributes(attributes) | ||||||
|         block |         block | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -188,8 +252,10 @@ mixin tag() | ||||||
| mixin tag-model(...capabs) | mixin tag-model(...capabs) | ||||||
|     - var intro = "To use this functionality, spaCy needs a model to be installed" |     - var intro = "To use this functionality, spaCy needs a model to be installed" | ||||||
|     - var ext = capabs.length ? " that supports the following capabilities: " + capabs.join(', ') : "" |     - var ext = capabs.length ? " that supports the following capabilities: " + capabs.join(', ') : "" | ||||||
|     +tag Requires model | 
 | ||||||
|     +help(intro + ext + ".").u-color-theme |     span.u-nowrap | ||||||
|  |         +tag Needs model | ||||||
|  |         +help(intro + ext + ".").u-color-theme | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- "New" tag to label features new in a specific version | //- "New" tag to label features new in a specific version | ||||||
|  | @ -219,15 +285,9 @@ mixin list(type, start) | ||||||
| 
 | 
 | ||||||
| //- List item (only used within +list) | //- List item (only used within +list) | ||||||
| 
 | 
 | ||||||
| mixin item(procon) | mixin item() | ||||||
|     if procon |     li.c-list__item&attributes(attributes) | ||||||
|         li&attributes(attributes) |         block | ||||||
|             +procon(procon).c-list__icon |  | ||||||
|             block |  | ||||||
| 
 |  | ||||||
|     else |  | ||||||
|         li.c-list__item&attributes(attributes) |  | ||||||
|             block |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Table | //- Table | ||||||
|  | @ -237,9 +297,9 @@ mixin table(head) | ||||||
|     table.c-table.o-block&attributes(attributes) |     table.c-table.o-block&attributes(attributes) | ||||||
| 
 | 
 | ||||||
|         if head |         if head | ||||||
|             +row |             +row("head") | ||||||
|                 each column in head |                 each column in head | ||||||
|                     th.c-table__head-cell.u-text-label=column |                     +head-cell=column | ||||||
| 
 | 
 | ||||||
|         block |         block | ||||||
| 
 | 
 | ||||||
|  | @ -251,10 +311,11 @@ mixin row(...style) | ||||||
|         block |         block | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Footer table row (only used within +table) |  | ||||||
| 
 | 
 | ||||||
| mixin footrow() | //- Header table cell (only used within +row) | ||||||
|     tr.c-table__row.c-table__row--foot&attributes(attributes) | 
 | ||||||
|  | mixin head-cell() | ||||||
|  |     th.c-table__head-cell.u-text-label&attributes(attributes) | ||||||
|         block |         block | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -284,71 +345,58 @@ mixin grid-col(width) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Card (only used within +grid) | //- Card (only used within +grid) | ||||||
|     title     - [string] card title |     title  - [string] card title | ||||||
|     details   - [object] url, image, author, description, tags etc. |     url    - [string] link for card | ||||||
|                 (see /docs/usage/_data.json) |     author - [string] optional author, displayed as byline at the bottom | ||||||
|  |     icon   - [string] optional ID of icon displayed with card | ||||||
|  |     width  - [string] optional width of grid column, defaults to "half" | ||||||
| 
 | 
 | ||||||
| mixin card(title, details) | mixin card(title, url, author, icon, width) | ||||||
|     +grid-col("half").o-card.u-text&attributes(attributes) |     +grid-col(width || "half").o-box.o-grid.o-grid--space.u-text&attributes(attributes) | ||||||
|         if details.image |         +a(url) | ||||||
|             +a(details.url).o-block-small |             h4.u-heading.u-text-label | ||||||
|                 img(src=details.image alt=title width="300" role="presentation") |                 if icon | ||||||
| 
 |                     +icon(icon, 25).u-float-right | ||||||
|         if title |                 if title | ||||||
|             +a(details.url) |                     span.u-color-dark=title | ||||||
|                 +h(3)=title |             .o-block-small.u-text-small | ||||||
| 
 |                 block | ||||||
|                     if details.author |         if author | ||||||
|                         .u-text-small.u-color-subtle by #{details.author} |             .u-color-subtle.u-text-tiny by #{author} | ||||||
| 
 |  | ||||||
|         if details.description || details.tags |  | ||||||
|             ul |  | ||||||
|                 if details.description |  | ||||||
|                     li=details.description |  | ||||||
| 
 |  | ||||||
|                 if details.tags |  | ||||||
|                     li |  | ||||||
|                         each tag in details.tags |  | ||||||
|                             span.u-text-tag #{tag} |  | ||||||
|                             |   |  | ||||||
| 
 |  | ||||||
|         block |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Simpler card list item (only used within +list) | //- Table of contents, to be used with +item mixins for links | ||||||
|     title     - [string] card title |     col - [string] width of column (see +grid-col) | ||||||
|     details   - [object] url, image, author, description, tags etc. |  | ||||||
|                 (see /docs/usage/_data.json) |  | ||||||
| 
 | 
 | ||||||
| mixin card-item(title, details) | mixin table-of-contents(col) | ||||||
|     +item&attributes(attributes) |     +grid-col(col || "half") | ||||||
|         +a(details.url)=title |         +infobox | ||||||
| 
 |             +label.o-block-small Table of contents | ||||||
|         if details.description |             +list("numbers").u-text-small.o-no-block | ||||||
|             br |                 block | ||||||
|             span=details.description |  | ||||||
| 
 |  | ||||||
|         if details.author |  | ||||||
|             br |  | ||||||
|             span.u-text-small.u-color-subtle by #{details.author} |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Table row for models table | //- Bibliography | ||||||
|  |     id - [string] ID of bibliography component, for anchor links. Can be used if | ||||||
|  |          there's more than one bibliography on one page. | ||||||
| 
 | 
 | ||||||
| mixin model-row(name, lang, procon, size, license, default_model, divider) | mixin bibliography(id) | ||||||
|     - var licenses = { "CC BY-SA": "https://creativecommons.org/licenses/by-sa/3.0/", "CC BY-NC": "https://creativecommons.org/licenses/by-nc/3.0/" } |     section(id=id || "bibliography") | ||||||
|  |         +infobox | ||||||
|  |             +label.o-block-small Bibliography | ||||||
|  |             +list("numbers").u-text-small.o-no-block | ||||||
|  |                 block | ||||||
| 
 | 
 | ||||||
|     +row(divider ? "divider": null) | 
 | ||||||
|         +cell #[code=name] | //- Footnote | ||||||
|             if default_model |     id      - [string / integer] ID of footnote. | ||||||
|                 |  #[span.u-color-theme(title="default model") #[+icon("star", 16)]] |     bib_id  - [string] ID of bibliography component, defaults to "bibliography". | ||||||
|         +cell=lang |     tooltip - [string] optional text displayed as tooltip | ||||||
|         each icon in procon | 
 | ||||||
|             +cell.u-text-center #[+procon(icon ? "pro" : "con")] | mixin fn(id, bib_id, tooltip) | ||||||
|         +cell.u-text-right=size |     sup.u-padding-small(id="bib" + id data-tooltip=tooltip) | ||||||
|         +cell |         span.u-text-tag | ||||||
|             if license in licenses |             +a("#" + (bib_id || "bibliography")).u-hide-link #{id} | ||||||
|                 +a(licenses[license])=license |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Table rows for annotation specs | //- Table rows for annotation specs | ||||||
|  | @ -383,14 +431,3 @@ mixin annotation-row(annots, style) | ||||||
|             else |             else | ||||||
|                 +cell=cell |                 +cell=cell | ||||||
|         block |         block | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| //- Table of contents, to be used with +item mixins for links |  | ||||||
|     col - [string] width of column (see +grid-col) |  | ||||||
| 
 |  | ||||||
| mixin table-of-contents(col) |  | ||||||
|     +grid-col(col || "half") |  | ||||||
|         +infobox |  | ||||||
|             +label.o-block-small Table of contents |  | ||||||
|             +list("numbers").u-text-small.o-no-block |  | ||||||
|                 block |  | ||||||
|  |  | ||||||
|  | @ -1,19 +1,15 @@ | ||||||
| //- 💫 INCLUDES > TOP NAVIGATION | //- 💫 INCLUDES > TOP NAVIGATION | ||||||
| 
 | 
 | ||||||
| include _mixins |  | ||||||
| 
 |  | ||||||
| nav.c-nav.u-text.js-nav(class=landing ? "c-nav--theme" : null) | nav.c-nav.u-text.js-nav(class=landing ? "c-nav--theme" : null) | ||||||
|     a(href='/') #[+logo] |     a(href="/" aria-label=SITENAME) #[+logo] | ||||||
| 
 |  | ||||||
|     if SUBSECTION != "index" |  | ||||||
|         .u-text-label.u-padding-small.u-hidden-xs=SUBSECTION |  | ||||||
| 
 | 
 | ||||||
|     ul.c-nav__menu |     ul.c-nav__menu | ||||||
|         - var NAV = ALPHA ? { "Usage": "/docs/usage", "Reference": "/docs/api" } : NAVIGATION |         - var current_url = '/' + current.path[0] | ||||||
| 
 |         each url, item in NAVIGATION | ||||||
|         each url, item in NAV |             li.c-nav__menu__item(class=(current_url == url) ? "is-active" : null) | ||||||
|             li.c-nav__menu__item(class=(url == "/") ? "u-hidden-xs" : null) |  | ||||||
|                 +a(url)=item |                 +a(url)=item | ||||||
| 
 | 
 | ||||||
|         li.c-nav__menu__item |         li.c-nav__menu__item.u-hidden-xs | ||||||
|             +a(gh("spaCy"))(aria-label="GitHub").u-hidden-xs #[+icon("github", 20)] |             +a(gh("spaCy"))(aria-label="GitHub") #[+icon("github", 20)] | ||||||
|  | 
 | ||||||
|  |     progress.c-progress.js-progress(value="0" max="1") | ||||||
|  |  | ||||||
|  | @ -1,6 +1,6 @@ | ||||||
| //- 💫 INCLUDES > NEWSLETTER | //- 💫 INCLUDES > NEWSLETTER | ||||||
| 
 | 
 | ||||||
| ul.o-block | ul.o-block-small | ||||||
|     li.u-text-label.u-color-subtle Stay in the loop! |     li.u-text-label.u-color-subtle Stay in the loop! | ||||||
|     li Receive updates about new releases, tutorials and more. |     li Receive updates about new releases, tutorials and more. | ||||||
| 
 | 
 | ||||||
|  | @ -10,7 +10,6 @@ form.o-grid#mc-embedded-subscribe-form(action="//#{MAILCHIMP.user}.list-manage.c | ||||||
|     div(style="position: absolute; left: -5000px;" aria-hidden="true") |     div(style="position: absolute; left: -5000px;" aria-hidden="true") | ||||||
|         input(type="text" name="b_#{MAILCHIMP.id}_#{MAILCHIMP.list}" tabindex="-1" value="") |         input(type="text" name="b_#{MAILCHIMP.id}_#{MAILCHIMP.list}" tabindex="-1" value="") | ||||||
| 
 | 
 | ||||||
|     .o-grid-col.u-border.u-padding-small |     .o-grid-col.o-grid.o-grid--nowrap.o-field.u-padding-small | ||||||
|         input#mce-EMAIL.u-text(type="email" name="EMAIL" placeholder="Your email") |         input#mce-EMAIL.o-field__input.u-text(type="email" name="EMAIL" placeholder="Your email" aria-label="Your email") | ||||||
| 
 |         button#mc-embedded-subscribe.o-field__button.u-text-label.u-color-theme.u-nowrap(type="submit" name="subscribe") Sign up | ||||||
|         button#mc-embedded-subscribe.u-text-label.u-color-theme(type="submit" name="subscribe") Sign up |  | ||||||
|  |  | ||||||
|  | @ -1,47 +1,56 @@ | ||||||
| //- 💫 INCLUDES > DOCS PAGE TEMPLATE | //- 💫 INCLUDES > DOCS PAGE TEMPLATE | ||||||
| 
 | 
 | ||||||
| - sidebar_content = (SUBSECTION != "index") ? public.docs[SUBSECTION]._data.sidebar : public.docs._data.sidebar || FOOTER | - sidebar_content = (public[SECTION] ? public[SECTION]._data.sidebar : public._data[SECTION] ? public._data[SECTION].sidebar : false) || FOOTER | ||||||
| 
 | 
 | ||||||
| include _sidebar | include _sidebar | ||||||
| 
 | 
 | ||||||
| main.o-main.o-main--sidebar.o-main--aside | main.o-main.o-main--sidebar.o-main--aside | ||||||
|     article.o-content |     article.o-content | ||||||
|         +grid.o-no-block |         +grid.o-no-block | ||||||
|             +grid-col(source ? "two-thirds" : "full") |             +h(1).u-heading--title=title.replace("'", "’") | ||||||
|                 +h(1)=title |                 if tag | ||||||
|                     if tag |                     +tag=tag | ||||||
|                         +tag=tag |                 if tag_new | ||||||
|  |                     +tag-new(tag_new) | ||||||
|  | 
 | ||||||
|  |                 if teaser | ||||||
|  |                     .u-heading__teaser.u-text-small.u-color-dark=teaser | ||||||
|  |                 else if IS_MODELS | ||||||
|  |                     .u-heading__teaser.u-text-small.u-color-dark | ||||||
|  |                         |  Available statistical models for | ||||||
|  |                         |  #[code=current.source] (#{LANGUAGES[current.source]}). | ||||||
| 
 | 
 | ||||||
|             if source |             if source | ||||||
|                 +grid-col("third").u-text-right |                 .o-block.u-text-right | ||||||
|                     .o-inline-list |                     +button(gh("spacy", source), false, "secondary", "small").u-nowrap | ||||||
|                         +button(gh("spacy", source), false, "secondary").u-text-tag Source #[+icon("code", 14)] |                         |  Source #[+icon("code", 14)] | ||||||
| 
 | 
 | ||||||
|  |         //-if ALPHA | ||||||
|  |         //-    +alpha-info | ||||||
| 
 | 
 | ||||||
|         if ALPHA |         if IS_MODELS | ||||||
|             +infobox("⚠️ You are viewing the spaCy v2.0.0 alpha docs") |             include _page_models | ||||||
|                 strong This page is part of the alpha documentation for spaCy v2.0. |         else | ||||||
|                 |  It does not reflect the state of the latest stable release. |             !=yield | ||||||
|                 |  Because v2.0 is still under development, the implementation |  | ||||||
|                 |  may differ from the intended state described here. See the |  | ||||||
|                 |  #[+a(gh("spaCy") + "/releases/tag/v2.0.0-alpha") release notes] |  | ||||||
|                 |  for details on how to install and test the new version. To |  | ||||||
|                 |  read the official docs for spaCy v1.x, |  | ||||||
|                 |  #[+a("https://spacy.io/docs") go here]. |  | ||||||
| 
 |  | ||||||
|         !=yield |  | ||||||
| 
 | 
 | ||||||
|     +grid.o-content.u-text |     +grid.o-content.u-text | ||||||
|         +grid-col("half") |         +grid-col("half") | ||||||
|             if next && public.docs[SUBSECTION]._data[next] |             if !IS_MODELS | ||||||
|                 - data = public.docs[SUBSECTION]._data[next] |  | ||||||
| 
 |  | ||||||
|                 .o-inline-list |                 .o-inline-list | ||||||
|                     span #[strong.u-text-label Read next:] #[+a(next).u-link=data.title] |                     +button(gh("spacy", "website/" + current.path.join('/') + ".jade"), false, "secondary", "small") | ||||||
|  |                         |  #[span.o-icon Suggest edits] #[+icon("code", 14)] | ||||||
| 
 | 
 | ||||||
|         +grid-col("half").u-text-right |         +grid-col("half").u-text-right | ||||||
|             .o-inline-list |             if next && public[SECTION]._data[next] | ||||||
|                 +button(gh("spacy", "website/" + current.path.join('/') + ".jade"), false, "secondary").u-text-tag Suggest edits #[+icon("code", 14)] |                 - data = public[SECTION]._data[next] | ||||||
|  | 
 | ||||||
|  |                 +grid("vcenter") | ||||||
|  |                     +a(next).u-text-small.u-flex-full | ||||||
|  |                         h4.u-text-label.u-color-dark Read next | ||||||
|  |                         |  #{data.title} | ||||||
|  | 
 | ||||||
|  |                     +a(next).c-icon-button.c-icon-button--right(aria-hidden="true") | ||||||
|  |                         +icon("arrow-right", 24) | ||||||
| 
 | 
 | ||||||
|     +gitter("spaCy chat") |     +gitter("spaCy chat") | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
							
								
								
									
77 website/_includes/_page_models.jade Normal file
|  | @ -0,0 +1,77 @@ | ||||||
|  | //- 💫 INCLUDES > MODELS PAGE TEMPLATE | ||||||
|  | 
 | ||||||
|  | for id in CURRENT_MODELS | ||||||
|  |     +section(id) | ||||||
|  |         +grid("vcenter").o-no-block(id=id) | ||||||
|  |             +grid-col("two-thirds") | ||||||
|  |                 +h(2) | ||||||
|  |                     +a("#" + id).u-permalink=id | ||||||
|  | 
 | ||||||
|  |             +grid-col("third").u-text-right | ||||||
|  |                 .u-color-subtle.u-text-tiny | ||||||
|  |                     +button(gh("spacy-models") + "/releases", true, "secondary", "small")(data-tpl=id data-tpl-key="download") | ||||||
|  |                         |  Release details | ||||||
|  |                     .u-padding-small Latest: #[code(data-tpl=id data-tpl-key="version") n/a] | ||||||
|  | 
 | ||||||
|  |         +aside-code("Installation", "bash", "$"). | ||||||
|  |             spacy download #{id} | ||||||
|  | 
 | ||||||
|  |         - var comps = getModelComponents(id) | ||||||
|  | 
 | ||||||
|  |         p(data-tpl=id data-tpl-key="description") | ||||||
|  | 
 | ||||||
|  |         div(data-tpl=id data-tpl-key="error" style="display: none") | ||||||
|  |             +infobox | ||||||
|  |                 |  Unable to load model details from GitHub. To find out more | ||||||
|  |                 |  about this model, see the overview of the | ||||||
|  |                 |  #[+a(gh("spacy-models") + "/releases") latest model releases]. | ||||||
|  | 
 | ||||||
|  |         +table(data-tpl=id data-tpl-key="table") | ||||||
|  |             +row | ||||||
|  |                 +cell #[+label Language] | ||||||
|  |                 +cell #[+tag=comps.lang] #{LANGUAGES[comps.lang]} | ||||||
|  |             for comp, label in {"Type": comps.type, "Genre": comps.genre} | ||||||
|  |                 +row | ||||||
|  |                     +cell #[+label=label] | ||||||
|  |                     +cell #[+tag=comp] #{MODEL_META[comp]} | ||||||
|  |             +row | ||||||
|  |                 +cell #[+label Size] | ||||||
|  |                 +cell #[+tag=comps.size] #[span(data-tpl=id data-tpl-key="size") #[em n/a]] | ||||||
|  | 
 | ||||||
|  |             each label in ["Pipeline", "Sources", "Author", "License"] | ||||||
|  |                 - var field = label.toLowerCase() | ||||||
|  |                 +row | ||||||
|  |                     +cell.u-nowrap | ||||||
|  |                         +label=label | ||||||
|  |                             if MODEL_META[field] | ||||||
|  |                                 |  #[+help(MODEL_META[field]).u-color-subtle] | ||||||
|  |                     +cell | ||||||
|  |                         span(data-tpl=id data-tpl-key=field) #[em n/a] | ||||||
|  | 
 | ||||||
|  |             +row(data-tpl=id data-tpl-key="compat-wrapper" style="display: none") | ||||||
|  |                 +cell | ||||||
|  |                     +label Compat #[+help("Latest compatible model version for your spaCy installation").u-color-subtle] | ||||||
|  |                 +cell | ||||||
|  |                     .o-field.u-float-left | ||||||
|  |                         select.o-field__select.u-text-small(data-tpl=id data-tpl-key="compat") | ||||||
|  |                     .o-empty(data-tpl=id data-tpl-key="compat-versions")   | ||||||
|  | 
 | ||||||
|  |         section(data-tpl=id data-tpl-key="accuracy-wrapper" style="display: none") | ||||||
|  |             +grid.o-no-block | ||||||
|  |                 +grid-col("third") | ||||||
|  |                     +h(4) Accuracy | ||||||
|  |                     +table.o-block-small | ||||||
|  |                         for label, field in MODEL_ACCURACY | ||||||
|  |                             +row(style="display: none") | ||||||
|  |                                 +cell.u-nowrap | ||||||
|  |                                     +label=label | ||||||
|  |                                         if MODEL_META[field] | ||||||
|  |                                             |  #[+help(MODEL_META[field]).u-color-subtle] | ||||||
|  |                                 +cell.u-text-right(data-tpl=id data-tpl-key=field) | ||||||
|  |                                     |  n/a | ||||||
|  | 
 | ||||||
|  |                 +grid-col("two-thirds") | ||||||
|  |                     +h(4) Comparison | ||||||
|  |                     +chart(id).u-padding-small | ||||||
|  | 
 | ||||||
|  |         p.u-text-small.u-color-dark(data-tpl=id data-tpl-key="notes") | ||||||
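The data-tpl / data-tpl-key attributes in the template above are placeholders that the client-side script (the ModelLoader instantiated in the scripts include below) presumably fills in once model metadata is available. A minimal, browser-only sketch of that pattern, as an illustration rather than the site's actual implementation:

    // Browser-only sketch: fill a placeholder rendered by the template above.
    function fillTemplate(modelId, key, value) {
        var el = document.querySelector(
            '[data-tpl="' + modelId + '"][data-tpl-key="' + key + '"]');
        if (el) el.textContent = value;
    }

    // Example values only; real data would come from the release metadata.
    fillTemplate('en_core_web_sm', 'version', '2.0.0');
    fillTemplate('en_core_web_sm', 'license', 'CC BY-SA');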
|  | @ -1,27 +1,46 @@ | ||||||
| //- 💫 INCLUDES > SCRIPTS | //- 💫 INCLUDES > SCRIPTS | ||||||
| 
 | 
 | ||||||
| script(src="/assets/js/main.js?v#{V_JS}") | if quickstart | ||||||
| script(src="/assets/js/prism.js") |         script(src="/assets/js/quickstart.min.js") | ||||||
| 
 | 
 | ||||||
| if SECTION == "docs" | if IS_PAGE | ||||||
|     if quickstart |     script(src="/assets/js/in-view.min.js") | ||||||
|         script(src="/assets/js/quickstart.js") |  | ||||||
|         script var qs = new Quickstart("#qs") |  | ||||||
| 
 | 
 | ||||||
|     script. | if HAS_MODELS | ||||||
|         ((window.gitter = {}).chat = {}).options = { |     script(src="/assets/js/chart.min.js") | ||||||
|             useStyles: false, |  | ||||||
|             activationElement: '.js-gitter-button', |  | ||||||
|             targetElement: '.js-gitter', |  | ||||||
|             room: '!{SOCIAL.gitter}' |  | ||||||
|         }; |  | ||||||
| 
 |  | ||||||
|     script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer) |  | ||||||
| 
 | 
 | ||||||
| if environment == "deploy" | if environment == "deploy" | ||||||
|     script |     script(async src="https://www.google-analytics.com/analytics.js") | ||||||
|  | 
 | ||||||
|  | script(src="/assets/js/prism.min.js") | ||||||
|  | script(src="/assets/js/main.js?v#{V_JS}") | ||||||
|  | 
 | ||||||
|  | script | ||||||
|  |     | new ProgressBar('.js-progress'); | ||||||
|  | 
 | ||||||
|  |     if changelog | ||||||
|  |         | new Changelog('!{SOCIAL.github}', 'spacy'); | ||||||
|  | 
 | ||||||
|  |     if quickstart | ||||||
|  |         | new Quickstart("#qs"); | ||||||
|  | 
 | ||||||
|  |     if IS_PAGE | ||||||
|  |         | new SectionHighlighter('data-section', 'data-nav'); | ||||||
|  |         | new GitHubEmbed('!{SOCIAL.github}', 'data-gh-embed'); | ||||||
|  |         | ((window.gitter = {}).chat = {}).options = { | ||||||
|  |         |     useStyles: false, | ||||||
|  |         |     activationElement: '.js-gitter-button', | ||||||
|  |         |     targetElement: '.js-gitter', | ||||||
|  |         |     room: '!{SOCIAL.gitter}' | ||||||
|  |         | }; | ||||||
|  | 
 | ||||||
|  |     if HAS_MODELS | ||||||
|  |         | new ModelLoader('!{MODELS_REPO}', !{JSON.stringify(CURRENT_MODELS)}, !{JSON.stringify(MODEL_LICENSES)}, !{JSON.stringify(MODEL_ACCURACY)}); | ||||||
|  | 
 | ||||||
|  |     if environment == "deploy" | ||||||
|         | window.ga=window.ga||function(){ |         | window.ga=window.ga||function(){ | ||||||
|         | (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date; |         | (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date; | ||||||
|         | ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview'); |         | ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview'); | ||||||
| 
 | 
 | ||||||
|     script(async src="https://www.google-analytics.com/analytics.js") | if IS_PAGE | ||||||
|  |     script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer) | ||||||
|  |  | ||||||
|  | @ -1,13 +1,23 @@ | ||||||
| //- 💫 INCLUDES > SIDEBAR | //- 💫 INCLUDES > SIDEBAR | ||||||
| 
 | 
 | ||||||
| include _mixins |  | ||||||
| 
 |  | ||||||
| menu.c-sidebar.js-sidebar.u-text | menu.c-sidebar.js-sidebar.u-text | ||||||
|     if sidebar_content |     if sidebar_content | ||||||
|         each items, menu in sidebar_content |         each items, sectiontitle in sidebar_content | ||||||
|             ul.c-sidebar__section.o-block |             ul.c-sidebar__section.o-block-small | ||||||
|                 li.u-text-label.u-color-subtle=menu |                 li.u-text-label.u-color-dark=sectiontitle | ||||||
| 
 | 
 | ||||||
|                 each url, item in items |                 each url, item in items | ||||||
|                     li(class=(CURRENT == url || (CURRENT == "index" && url == "./")) ? "is-active" : null) |                     - var is_current = CURRENT == url || (CURRENT == "index" && url == "./") | ||||||
|                         +a(url)=item |                     li.c-sidebar__item | ||||||
|  |                         +a(url)(class=is_current ? "is-active" : null)=item | ||||||
|  | 
 | ||||||
|  |                         if is_current | ||||||
|  |                             if IS_MODELS && CURRENT_MODELS.length | ||||||
|  |                                 - menu = Object.assign({}, ...CURRENT_MODELS.map(id => ({ [id]: id }))) | ||||||
|  |                             if menu | ||||||
|  |                                 ul.c-sidebar__crumb.u-hidden-sm | ||||||
|  |                                     - var counter = 0 | ||||||
|  |                                     for id, title in menu | ||||||
|  |                                         - counter++ | ||||||
|  |                                         li.c-sidebar__crumb__item(data-nav=id class=(counter == 1) ? "is-active" : null) | ||||||
|  |                                             +a("#section-" + id)=title | ||||||
|  |  | ||||||
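The crumb menu above is built from CURRENT_MODELS by spreading mapped objects into Object.assign; a small sketch with made-up model IDs:

    // Sketch of the menu-building one-liner in the sidebar above.
    var CURRENT_MODELS = ['en_core_web_sm', 'en_core_web_md'];  // example IDs
    var menu = Object.assign({}, ...CURRENT_MODELS.map(id => ({[id]: id})));
    console.log(menu);
    // { en_core_web_sm: 'en_core_web_sm', en_core_web_md: 'en_core_web_md' }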
							
								
								
									
157 website/_includes/_svg.jade Normal file
File diff suppressed because one or more lines are too long
|  | @ -2,11 +2,16 @@ | ||||||
| 
 | 
 | ||||||
| include _includes/_mixins | include _includes/_mixins | ||||||
| 
 | 
 | ||||||
|  | - title = IS_MODELS ? LANGUAGES[current.source] || title : title | ||||||
|  | - social_title = (SECTION == "index") ? SITENAME + " - " + SLOGAN : title + " - " + SITENAME | ||||||
|  | - social_img = SITE_URL + "/assets/img/social/preview_" + (preview || ALPHA ? "alpha" : "default") + ".jpg" | ||||||
|  | 
 | ||||||
| doctype html | doctype html | ||||||
| html(lang="en") | html(lang="en") | ||||||
|     title |     title | ||||||
|         if SECTION == "docs" && SUBSECTION && SUBSECTION != "index" |         if SECTION == "api" || SECTION == "usage" || SECTION == "models" | ||||||
|             | #{title} | #{SITENAME} #{SUBSECTION == "api" ? "API" : "Usage"} Documentation |             - var title_section = (SECTION == "api") ? "API" : SECTION.charAt(0).toUpperCase() + SECTION.slice(1) | ||||||
|  |             | #{title} | #{SITENAME} #{title_section} Documentation | ||||||
| 
 | 
 | ||||||
|         else if SECTION != "index" |         else if SECTION != "index" | ||||||
|             | #{title} | #{SITENAME} |             | #{title} | #{SITENAME} | ||||||
|  | @ -22,32 +27,30 @@ html(lang="en") | ||||||
|     meta(property="og:type" content="website") |     meta(property="og:type" content="website") | ||||||
|     meta(property="og:site_name" content=sitename) |     meta(property="og:site_name" content=sitename) | ||||||
|     meta(property="og:url" content="#{SITE_URL}/#{current.path.join('/')}") |     meta(property="og:url" content="#{SITE_URL}/#{current.path.join('/')}") | ||||||
|     meta(property="og:title" content="#{title} - spaCy") |     meta(property="og:title" content=social_title) | ||||||
|     meta(property="og:description" content=description) |     meta(property="og:description" content=description) | ||||||
|     meta(property="og:image" content=getSocialImg()) |     meta(property="og:image" content=social_img) | ||||||
| 
 | 
 | ||||||
|     meta(name="twitter:card" content="summary_large_image") |     meta(name="twitter:card" content="summary_large_image") | ||||||
|     meta(name="twitter:site" content="@" + SOCIAL.twitter) |     meta(name="twitter:site" content="@" + SOCIAL.twitter) | ||||||
|     meta(name="twitter:title" content="#{title} - spaCy") |     meta(name="twitter:title" content=social_title) | ||||||
|     meta(name="twitter:description" content=description) |     meta(name="twitter:description" content=description) | ||||||
|     meta(name="twitter:image" content=getSocialImg()) |     meta(name="twitter:image" content=social_img) | ||||||
| 
 | 
 | ||||||
|     link(rel="shortcut icon" href="/assets/img/favicon.ico") |     link(rel="shortcut icon" href="/assets/img/favicon.ico") | ||||||
|     link(rel="icon" type="image/x-icon" href="/assets/img/favicon.ico") |     link(rel="icon" type="image/x-icon" href="/assets/img/favicon.ico") | ||||||
| 
 | 
 | ||||||
|     if ALPHA && SECTION == "docs" |     if SECTION == "api" | ||||||
|         link(href="/assets/css/style_green.css?v#{V_CSS}" rel="stylesheet") |         link(href="/assets/css/style_green.css?v#{V_CSS}" rel="stylesheet") | ||||||
| 
 | 
 | ||||||
|     else if SUBSECTION == "usage" |  | ||||||
|         link(href="/assets/css/style_red.css?v#{V_CSS}" rel="stylesheet") |  | ||||||
| 
 |  | ||||||
|     else |     else | ||||||
|         link(href="/assets/css/style.css?v#{V_CSS}" rel="stylesheet") |         link(href="/assets/css/style.css?v#{V_CSS}" rel="stylesheet") | ||||||
| 
 | 
 | ||||||
|     body |     body | ||||||
|  |         include _includes/_svg | ||||||
|         include _includes/_navigation |         include _includes/_navigation | ||||||
| 
 | 
 | ||||||
|         if SECTION == "docs" |         if !landing | ||||||
|             include _includes/_page-docs |             include _includes/_page-docs | ||||||
| 
 | 
 | ||||||
|         else |         else | ||||||
|  |  | ||||||
							
								
								
									
43 website/api/_annotation/_biluo.jade Normal file
|  | @ -0,0 +1,43 @@ | ||||||
|  | //- 💫 DOCS > API > ANNOTATION > BILUO | ||||||
|  | 
 | ||||||
|  | +table([ "Tag", "Description" ]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code #[span.u-color-theme B] EGIN] | ||||||
|  |         +cell The first token of a multi-token entity. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code #[span.u-color-theme I] N] | ||||||
|  |         +cell An inner token of a multi-token entity. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code #[span.u-color-theme L] AST] | ||||||
|  |         +cell The final token of a multi-token entity. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code #[span.u-color-theme U] NIT] | ||||||
|  |         +cell A single-token entity. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code #[span.u-color-theme O] UT] | ||||||
|  |         +cell A non-entity token. | ||||||
|  | 
 | ||||||
|  | +aside("Why BILUO, not IOB?") | ||||||
|  |     |  There are several coding schemes for encoding entity annotations as | ||||||
|  |     |  token tags.  These coding schemes are equally expressive, but not | ||||||
|  |     |  necessarily equally learnable. | ||||||
|  |     |  #[+a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth] | ||||||
|  |     |  showed that the minimal #[strong Begin], #[strong In], #[strong Out] | ||||||
|  |     |  scheme was more difficult to learn than the #[strong BILUO] scheme that | ||||||
|  |     |  we use, which explicitly marks boundary tokens. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  spaCy translates the character offsets into this scheme, in order to | ||||||
|  |     |  decide the cost of each action given the current state of the entity | ||||||
|  |     |  recogniser. The costs are then used to calculate the gradient of the | ||||||
|  |     |  loss, to train the model. The exact algorithm is a pastiche of | ||||||
|  |     |  well-known methods, and is not currently described in any single | ||||||
|  |     |  publication. The model is a greedy transition-based parser guided by a | ||||||
|  |     |  linear model whose weights are learned using the averaged perceptron | ||||||
|  |     |  loss, via the #[+a("http://www.aclweb.org/anthology/C12-1059") dynamic oracle] | ||||||
|  |     |  imitation learning strategy. The transition system is equivalent to the | ||||||
|  |     |  BILUO tagging scheme. | ||||||
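To make the scheme concrete, here is a minimal Python sketch of converting character offsets to BILUO tags. It assumes spaCy v2.x, where a biluo_tags_from_offsets helper is exposed in spacy.gold; the example text and entity offsets are invented for illustration.

    # Minimal sketch (assumes spaCy v2.x and the spacy.gold.biluo_tags_from_offsets helper).
    import spacy
    from spacy.gold import biluo_tags_from_offsets

    nlp = spacy.blank('en')
    doc = nlp(u"I like New York and Berlin.")
    # Invented character-offset annotations: (start_char, end_char, label)
    entities = [(7, 15, 'GPE'), (20, 26, 'GPE')]
    tags = biluo_tags_from_offsets(doc, entities)
    print(tags)
    # Expected: ['O', 'O', 'B-GPE', 'L-GPE', 'O', 'U-GPE', 'O']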
							
								
								
									
115  website/api/_architecture/_cython.jade  Normal file
							|  | @ -0,0 +1,115 @@ | ||||||
|  | //- 💫 DOCS > API > ARCHITECTURE > CYTHON | ||||||
|  | 
 | ||||||
|  | +aside("What's Cython?") | ||||||
|  |     |  #[+a("http://cython.org/") Cython] is a language for writing | ||||||
|  |     |  C extensions for Python. Most Python code is also valid Cython, but | ||||||
|  |     |  you can add type declarations to get efficient memory-managed code | ||||||
|  |     |  just like C or C++. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  spaCy's core data structures are implemented as | ||||||
|  |     |  #[+a("http://cython.org/") Cython] #[code cdef] classes. Memory is | ||||||
|  |     |  managed through the #[+a(gh("cymem")) #[code cymem]] | ||||||
|  |     |  #[code cymem.Pool] class, which allows you | ||||||
|  |     |  to allocate memory which will be freed when the #[code Pool] object | ||||||
|  |     |  is garbage collected. This means you usually don't have to worry | ||||||
|  |     |  about freeing memory. You just have to decide which Python object | ||||||
|  |     |  owns the memory, and make it own the #[code Pool]. When that object | ||||||
|  |     |  goes out of scope, the memory will be freed. You do have to take | ||||||
|  |     |  care that no pointers outlive the object that owns them — but this | ||||||
|  |     |  is generally quite easy. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  All Cython modules should have the #[code # cython: infer_types=True] | ||||||
|  |     |  compiler directive at the top of the file. This makes the code much | ||||||
|  |     |  cleaner, as it avoids the need for many type declarations. If | ||||||
|  |     |  possible, you should prefer to declare your functions #[code nogil], | ||||||
|  |     |  even if you don't especially care about multi-threading. The reason | ||||||
|  |     |  is that #[code nogil] functions help the Cython compiler reason about | ||||||
|  |     |  your code quite a lot — you're telling the compiler that no Python | ||||||
|  |     |  dynamics are possible. This lets many errors be raised, and ensures | ||||||
|  |     |  your function will run at C speed. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Cython gives you many choices of sequences: you could have a Python | ||||||
|  |     |  list, a numpy array, a memory view, a C++ vector, or a pointer. | ||||||
|  |     |  Pointers are preferred, because they are fastest, have the most | ||||||
|  |     |  explicit semantics, and let the compiler check your code more | ||||||
|  |     |  strictly. C++ vectors are also great — but you should only use them | ||||||
|  |     |  internally in functions. It's less friendly to accept a vector as an | ||||||
|  |     |  argument, because that asks the user to do much more work. Here's | ||||||
|  |     |  how to get a pointer from a numpy array, memory view or vector: | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     cdef void get_pointers(np.ndarray[int, mode='c'] numpy_array, vector[int] cpp_vector, int[::1] memory_view) nogil: | ||||||
|  |         pointer1 = <int*>numpy_array.data | ||||||
|  |         pointer2 = cpp_vector.data() | ||||||
|  |         pointer3 = &memory_view[0] | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Both C arrays and C++ vectors reassure the compiler that no Python | ||||||
|  |     |  operations are possible on your variable. This is a big advantage: | ||||||
|  |     |  it lets the Cython compiler raise many more errors for you. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  When getting a pointer from a numpy array or memoryview, take care | ||||||
|  |     |  that the data is actually stored in C-contiguous order — otherwise | ||||||
|  |     |  you'll get a pointer to nonsense. The type-declarations in the code | ||||||
|  |     |  above should generate runtime errors if buffers with incorrect | ||||||
|  |     |  memory layouts are passed in. To iterate over the array, the | ||||||
|  |     |  following style is preferred: | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     cdef int c_total(const int* int_array, int length) nogil: | ||||||
|  |         total = 0 | ||||||
|  |         for item in int_array[:length]: | ||||||
|  |             total += item | ||||||
|  |         return total | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  If this is confusing, consider that the compiler couldn't deal with | ||||||
|  |     |  #[code for item in int_array:] — there's no length attached to a raw | ||||||
|  |     |  pointer, so how could we figure out where to stop? The length is | ||||||
|  |     |  provided in the slice notation as a solution to this. Note that we | ||||||
|  |     |  don't have to declare the type of #[code item] in the code above — | ||||||
|  |     |  the compiler can easily infer it. This gives us tidy code that looks | ||||||
|  |     |  quite like Python, but is exactly as fast as C — because we've made | ||||||
|  |     |  sure the compilation to C is trivial. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Your functions cannot be declared #[code nogil] if they need to | ||||||
|  |     |  create Python objects or call Python functions. This is perfectly | ||||||
|  |     |  okay — you shouldn't torture your code just to get #[code nogil] | ||||||
|  |     |  functions. However, if your function isn't #[code nogil], you should | ||||||
|  |     |  compile your module with #[code cython -a --cplus my_module.pyx] and | ||||||
|  |     |  open the resulting #[code my_module.html] file in a browser. This | ||||||
|  |     |  will let you see how Cython is compiling your code. Calls into the | ||||||
|  |     |  Python run-time will be in bright yellow. This lets you easily see | ||||||
|  |     |  whether Cython is able to correctly type your code, or whether there | ||||||
|  |     |  are unexpected problems. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Working in Cython is very rewarding once you're over the initial | ||||||
|  |     |  learning curve. As with C and C++, the first way you write something | ||||||
|  |     |  in Cython will often be the performance-optimal approach. In | ||||||
|  |     |  contrast, Python optimisation generally requires a lot of | ||||||
|  |     |  experimentation. Is it faster to have an #[code if item in my_dict] | ||||||
|  |     |  check, or to use #[code .get()]? What about | ||||||
|  |     |  #[code try]/#[code except]? Does this numpy operation create a copy? | ||||||
|  |     |  There's no way to guess the answers to these questions, and you'll | ||||||
|  |     |  usually be dissatisfied with your results — so there's no way to | ||||||
|  |     |  know when to stop this process. In the worst case, you'll make a | ||||||
|  |     |  mess that invites the next reader to try their luck too. This is | ||||||
|  |     |  like one of those | ||||||
|  |     |  #[+a("http://www.wemjournal.org/article/S1080-6032%2809%2970088-2/abstract") volcanic gas-traps], | ||||||
|  |     |  where the rescuers keep passing out from low oxygen, causing | ||||||
|  |     |  another rescuer to follow — only to succumb themselves. In short, | ||||||
|  |     |  just say no to optimizing your Python. If it's not fast enough the | ||||||
|  |     |  first time, just switch to Cython. | ||||||
|  | 
 | ||||||
|  | +infobox("Resources") | ||||||
|  |     +list.o-no-block | ||||||
|  |         +item #[+a("http://docs.cython.org/en/latest/") Official Cython documentation] (cython.org) | ||||||
|  |         +item #[+a("https://explosion.ai/blog/writing-c-in-cython", true) Writing C in Cython] (explosion.ai) | ||||||
|  |         +item #[+a("https://explosion.ai/blog/multithreading-with-cython") Multi-threading spaCy’s parser and named entity recogniser] (explosion.ai) | ||||||
							
								
								
									
141  website/api/_architecture/_nn-model.jade  Normal file
							|  | @ -0,0 +1,141 @@ | ||||||
|  | //- 💫 DOCS > API > ARCHITECTURE > NN MODEL ARCHITECTURE | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  The parsing model is a blend of recent results. The two recent | ||||||
|  |     |  inspirations have been the work of Eliyahu Kiperwasser and Yoav Goldberg at | ||||||
|  |     |  Bar Ilan#[+fn(1)], and the SyntaxNet team from Google. The foundation of | ||||||
|  |     |  the parser is still based on the work of Joakim Nivre#[+fn(2)], who | ||||||
|  |     |  introduced the transition-based framework#[+fn(3)], the arc-eager | ||||||
|  |     |  transition system, and the imitation learning objective. The model is | ||||||
|  |     |  implemented using #[+a(gh("thinc")) Thinc], spaCy's machine learning | ||||||
|  |     |  library. We first predict context-sensitive vectors for each word in the | ||||||
|  |     |  input: | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     (embed_lower | embed_prefix | embed_suffix | embed_shape) | ||||||
|  |         >> Maxout(token_width) | ||||||
|  |         >> convolution ** 4 | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  This convolutional layer is shared between the tagger, parser and NER, | ||||||
|  |     |  and will also be shared by the future neural lemmatizer. Because the | ||||||
|  |     |  parser shares these layers with the tagger, the parser does not require | ||||||
|  |     |  tag features. I got this trick from David Weiss's "Stack Combination" | ||||||
|  |     |  paper#[+fn(4)]. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  To boost the representation, the tagger actually predicts a "super tag" | ||||||
|  |     |  with POS, morphology and dependency label#[+fn(5)]. The tagger predicts | ||||||
|  |     |  these supertags by adding a softmax layer onto the convolutional layer – | ||||||
|  |     |  so, we're teaching the convolutional layer to give us a representation | ||||||
|  |     |  that's one affine transform from this informative lexical information. | ||||||
|  |     |  This is obviously good for the parser (which backprops to the | ||||||
|  |     |  convolutions too). The parser model makes a state vector by concatenating | ||||||
|  |     |  the vector representations for its context tokens.  The current context | ||||||
|  |     |  tokens: | ||||||
|  | 
 | ||||||
|  | +table | ||||||
|  |     +row | ||||||
|  |         +cell #[code S0], #[code S1], #[code S2] | ||||||
|  |         +cell Top three words on the stack. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code B0], #[code B1] | ||||||
|  |         +cell First two words of the buffer. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell.u-nowrap | ||||||
|  |             |  #[code S0L1], #[code S1L1], #[code S2L1], #[code B0L1], | ||||||
|  |             |  #[code B1L1]#[br] | ||||||
|  |             |  #[code S0L2], #[code S1L2], #[code S2L2], #[code B0L2], | ||||||
|  |             |  #[code B1L2] | ||||||
|  |         +cell | ||||||
|  |             |  Leftmost and second leftmost children of #[code S0], #[code S1], | ||||||
|  |             |  #[code S2], #[code B0] and #[code B1]. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell.u-nowrap | ||||||
|  |             |  #[code S0R1], #[code S1R1], #[code S2R1], #[code B0R1], | ||||||
|  |             |  #[code B1R1]#[br] | ||||||
|  |             |  #[code S0R2], #[code S1R2], #[code S2R2], #[code B0R2], | ||||||
|  |             |  #[code B1R2] | ||||||
|  |         +cell | ||||||
|  |             |  Rightmost and second rightmost children of #[code S0], #[code S1], | ||||||
|  |             |  #[code S2], #[code B0] and #[code B1]. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  This makes the state vector quite long: #[code 13*T], where #[code T] is | ||||||
|  |     |  the token vector width (128 is working well). Fortunately, there's a way | ||||||
|  |     |  to structure the computation to save some expense (and make it more | ||||||
|  |     |  GPU-friendly). | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  The parser typically visits #[code 2*N] states for a sentence of length | ||||||
|  |     |  #[code N] (although it may visit more, if it back-tracks with a | ||||||
|  |     |  non-monotonic transition#[+fn(6)]). A naive implementation would require | ||||||
|  |     |  #[code 2*N (B, 13*T) @ (13*T, H)] matrix multiplications for a batch of | ||||||
|  |     |  size #[code B]. We can instead perform one #[code (B*N, T) @ (T, 13*H)] | ||||||
|  |     |  multiplication, to pre-compute the hidden weights for each positional | ||||||
|  |     |  feature with respect to the words in the batch. (Note that our token | ||||||
|  |     |  vectors come from the CNN — so we can't play this trick over the | ||||||
|  |     |  vocabulary. That's how Stanford's NN parser#[+fn(7)] works — and why its | ||||||
|  |     |  model is so big.) | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  This pre-computation strategy allows a nice compromise between | ||||||
|  |     |  GPU-friendliness and implementation simplicity. The CNN and the wide | ||||||
|  |     |  lower layer are computed on the GPU, and then the precomputed hidden | ||||||
|  |     |  weights are moved to the CPU, before we start the transition-based | ||||||
|  |     |  parsing process. This makes a lot of things much easier. We don't have to | ||||||
|  |     |  worry about variable-length batch sizes, and we don't have to implement | ||||||
|  |     |  the dynamic oracle in CUDA to train. | ||||||
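To make the pre-computation trick concrete, here is a rough numpy sketch. The names and sizes are illustrative assumptions rather than spaCy's actual code: one (B*N, T) @ (T, 13*H) product is done up front, and each parser state then only sums 13 precomputed H-sized blocks instead of multiplying its concatenated 13*T feature vector by a (13*T, H) weight matrix.

    # Illustrative numpy sketch of the pre-computation trick (not spaCy's actual code).
    import numpy as np

    B, N, T, H, F = 4, 20, 128, 64, 13        # batch, sentence length, token width, hidden width, feature slots
    tokens = np.random.randn(B * N, T)        # context-sensitive token vectors from the CNN
    W = np.random.randn(T, F * H)             # hidden weights: one H-sized block per feature slot

    # One big matrix multiplication up front, instead of one per parser state.
    precomputed = tokens.dot(W)               # shape (B*N, F*H)

    def state_hidden(feature_token_ids):
        # Hidden-layer input for one state: sum the precomputed block for each
        # filled feature slot (-1 marks an empty slot, e.g. an absent child).
        total = np.zeros(H)
        for slot, tok in enumerate(feature_token_ids):
            if tok >= 0:
                total += precomputed[tok, slot * H:(slot + 1) * H]
        return total

    state = [5, 6, -1, 7, 8] + [-1] * 8       # e.g. S0, S1 filled, S2 empty, B0, B1 filled
    print(state_hidden(state).shape)          # (64,)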
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Currently the parser's loss function is multilabel log loss#[+fn(6)], as | ||||||
|  |     |  the dynamic oracle allows multiple states to be 0 cost. This is defined | ||||||
|  |     |  as follows, where #[code Z] is the sum of the exponentiated scores over | ||||||
|  |     |  all classes and #[code gZ] is the sum of the exponentiated scores over | ||||||
|  |     |  the gold (zero-cost) classes: | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     (exp(score) / Z) - (exp(score) / gZ) | ||||||
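As a worked example of that expression, here is a small numpy sketch with invented scores. It assumes the usual implementation detail that the second term is applied only to the gold (zero-cost) classes:

    # Invented numbers: gradient of the multilabel log loss with respect to the scores.
    import numpy as np

    scores = np.array([2.0, 1.0, 0.5, -1.0])   # one score per possible transition
    is_gold = np.array([1.0, 1.0, 0.0, 0.0])   # classes the dynamic oracle marks as zero-cost

    exp_scores = np.exp(scores)
    Z = exp_scores.sum()                        # sum of exp(score) over all classes
    gZ = (exp_scores * is_gold).sum()           # sum of exp(score) over the gold classes

    d_scores = exp_scores / Z - is_gold * exp_scores / gZ
    print(d_scores)        # negative for gold classes, positive for the rest
    print(d_scores.sum())  # ~0.0: the gradient sums to zero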
|  | 
 | ||||||
|  | +bibliography | ||||||
|  |     +item | ||||||
|  |         |  #[+a("https://www.semanticscholar.org/paper/Simple-and-Accurate-Dependency-Parsing-Using-Bidir-Kiperwasser-Goldberg/3cf31ecb2724b5088783d7c96a5fc0d5604cbf41") Simple and Accurate Dependency Parsing Using Bidirectional LSTM Feature Representations] | ||||||
|  |         br | ||||||
|  |         |  Eliyahu Kiperwasser, Yoav Goldberg. (2016) | ||||||
|  | 
 | ||||||
|  |     +item | ||||||
|  |         |  #[+a("https://www.semanticscholar.org/paper/A-Dynamic-Oracle-for-Arc-Eager-Dependency-Parsing-Goldberg-Nivre/22697256ec19ecc3e14fcfc63624a44cf9c22df4") A Dynamic Oracle for Arc-Eager Dependency Parsing] | ||||||
|  |         br | ||||||
|  |         |  Yoav Goldberg, Joakim Nivre (2012) | ||||||
|  | 
 | ||||||
|  |     +item | ||||||
|  |         |  #[+a("https://explosion.ai/blog/parsing-english-in-python") Parsing English in 500 Lines of Python] | ||||||
|  |         br | ||||||
|  |         |  Matthew Honnibal (2013) | ||||||
|  | 
 | ||||||
|  |     +item | ||||||
|  |         |  #[+a("https://www.semanticscholar.org/paper/Stack-propagation-Improved-Representation-Learning-Zhang-Weiss/0c133f79b23e8c680891d2e49a66f0e3d37f1466") Stack-propagation: Improved Representation Learning for Syntax] | ||||||
|  |         br | ||||||
|  |         |  Yuan Zhang, David Weiss (2016) | ||||||
|  | 
 | ||||||
|  |     +item | ||||||
|  |         |  #[+a("https://www.semanticscholar.org/paper/Deep-multi-task-learning-with-low-level-tasks-supe-S%C3%B8gaard-Goldberg/03ad06583c9721855ccd82c3d969a01360218d86") Deep multi-task learning with low level tasks supervised at lower layers] | ||||||
|  |         br | ||||||
|  |         |  Anders Søgaard, Yoav Goldberg (2016) | ||||||
|  | 
 | ||||||
|  |     +item | ||||||
|  |         |  #[+a("https://www.semanticscholar.org/paper/An-Improved-Non-monotonic-Transition-System-for-De-Honnibal-Johnson/4094cee47ade13b77b5ab4d2e6cb9dd2b8a2917c") An Improved Non-monotonic Transition System for Dependency Parsing] | ||||||
|  |         br | ||||||
|  |         |  Matthew Honnibal, Mark Johnson (2015) | ||||||
|  | 
 | ||||||
|  |     +item | ||||||
|  |         |  #[+a("http://cs.stanford.edu/people/danqi/papers/emnlp2014.pdf") A Fast and Accurate Dependency Parser using Neural Networks] | ||||||
|  |         br | ||||||
|  |         |  Danqi Chen, Christopher D. Manning (2014) | ||||||
|  | 
 | ||||||
|  |     +item | ||||||
|  |         |  #[+a("https://www.semanticscholar.org/paper/Parsing-the-Wall-Street-Journal-using-a-Lexical-Fu-Riezler-King/0ad07862a91cd59b7eb5de38267e47725a62b8b2") Parsing the Wall Street Journal using a Lexical-Functional Grammar and Discriminative Estimation Techniques] | ||||||
|  |         br | ||||||
|  |         |  Stefan Riezler et al. (2002) | ||||||
|  | @ -1,29 +1,32 @@ | ||||||
| { | { | ||||||
|     "sidebar": { |     "sidebar": { | ||||||
|         "Introduction": { |         "Overview": { | ||||||
|             "Facts & Figures": "./", |             "Architecture": "./", | ||||||
|             "Languages": "language-models", |             "Annotation Specs": "annotation", | ||||||
|             "Annotation Specs": "annotation" |             "Functions": "top-level" | ||||||
|         }, |         }, | ||||||
|         "Top-level": { |         "Containers": { | ||||||
|             "spacy": "spacy", |  | ||||||
|             "displacy": "displacy", |  | ||||||
|             "Utility Functions": "util", |  | ||||||
|             "Command line": "cli" |  | ||||||
|         }, |  | ||||||
|         "Classes": { |  | ||||||
|             "Doc": "doc", |             "Doc": "doc", | ||||||
|             "Token": "token", |             "Token": "token", | ||||||
|             "Span": "span", |             "Span": "span", | ||||||
|  |             "Lexeme": "lexeme" | ||||||
|  |         }, | ||||||
|  | 
 | ||||||
|  |         "Pipeline": { | ||||||
|             "Language": "language", |             "Language": "language", | ||||||
|             "Tokenizer": "tokenizer", |             "Pipe": "pipe", | ||||||
|             "Tensorizer": "tensorizer", |             "Tensorizer": "tensorizer", | ||||||
|             "Tagger": "tagger", |             "Tagger": "tagger", | ||||||
|             "DependencyParser": "dependencyparser", |             "DependencyParser": "dependencyparser", | ||||||
|             "EntityRecognizer": "entityrecognizer", |             "EntityRecognizer": "entityrecognizer", | ||||||
|             "TextCategorizer": "textcategorizer", |             "TextCategorizer": "textcategorizer", | ||||||
|  |             "Tokenizer": "tokenizer", | ||||||
|  |             "Lemmatizer": "lemmatizer", | ||||||
|             "Matcher": "matcher", |             "Matcher": "matcher", | ||||||
|             "Lexeme": "lexeme", |             "PhraseMatcher": "phrasematcher" | ||||||
|  |         }, | ||||||
|  | 
 | ||||||
|  |         "Other": { | ||||||
|             "Vocab": "vocab", |             "Vocab": "vocab", | ||||||
|             "StringStore": "stringstore", |             "StringStore": "stringstore", | ||||||
|             "Vectors": "vectors", |             "Vectors": "vectors", | ||||||
|  | @ -34,52 +37,37 @@ | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     "index": { |     "index": { | ||||||
|         "title": "Facts & Figures", |         "title": "Architecture", | ||||||
|         "next": "language-models" |         "next": "annotation", | ||||||
|  |         "menu": { | ||||||
|  |             "Basics": "basics", | ||||||
|  |             "Neural Network Model": "nn-model", | ||||||
|  |             "Cython Conventions": "cython" | ||||||
|  |         } | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     "language-models": { |     "top-level": { | ||||||
|         "title": "Languages", |         "title": "Top-level Functions", | ||||||
|         "next": "philosophy" |         "menu": { | ||||||
|     }, |             "spacy": "spacy", | ||||||
| 
 |             "displacy": "displacy", | ||||||
|     "philosophy": { |             "Utility Functions": "util", | ||||||
|         "title": "Philosophy" |             "Compatibility": "compat", | ||||||
|     }, |             "Command Line": "cli" | ||||||
| 
 |         } | ||||||
|     "spacy": { |  | ||||||
|         "title": "spaCy top-level functions", |  | ||||||
|         "source": "spacy/__init__.py", |  | ||||||
|         "next": "displacy" |  | ||||||
|     }, |  | ||||||
| 
 |  | ||||||
|     "displacy": { |  | ||||||
|         "title": "displaCy", |  | ||||||
|         "tag": "module", |  | ||||||
|         "source": "spacy/displacy", |  | ||||||
|         "next": "util" |  | ||||||
|     }, |  | ||||||
| 
 |  | ||||||
|     "util": { |  | ||||||
|         "title": "Utility Functions", |  | ||||||
|         "source": "spacy/util.py", |  | ||||||
|         "next": "cli" |  | ||||||
|     }, |  | ||||||
| 
 |  | ||||||
|     "cli": { |  | ||||||
|         "title": "Command Line Interface", |  | ||||||
|         "source": "spacy/cli" |  | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     "language": { |     "language": { | ||||||
|         "title": "Language", |         "title": "Language", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|  |         "teaser": "A text-processing pipeline.", | ||||||
|         "source": "spacy/language.py" |         "source": "spacy/language.py" | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     "doc": { |     "doc": { | ||||||
|         "title": "Doc", |         "title": "Doc", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|  |         "teaser": "A container for accessing linguistic annotations.", | ||||||
|         "source": "spacy/tokens/doc.pyx" |         "source": "spacy/tokens/doc.pyx" | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|  | @ -103,6 +91,7 @@ | ||||||
| 
 | 
 | ||||||
|     "vocab": { |     "vocab": { | ||||||
|         "title": "Vocab", |         "title": "Vocab", | ||||||
|  |         "teaser": "A storage class for vocabulary and other data shared across a language.", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|         "source": "spacy/vocab.pyx" |         "source": "spacy/vocab.pyx" | ||||||
|     }, |     }, | ||||||
|  | @ -115,10 +104,27 @@ | ||||||
| 
 | 
 | ||||||
|     "matcher": { |     "matcher": { | ||||||
|         "title": "Matcher", |         "title": "Matcher", | ||||||
|  |         "teaser": "Match sequences of tokens, based on pattern rules.", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|         "source": "spacy/matcher.pyx" |         "source": "spacy/matcher.pyx" | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|  |     "phrasematcher": { | ||||||
|  |         "title": "PhraseMatcher", | ||||||
|  |         "teaser": "Match sequences of tokens, based on documents.", | ||||||
|  |         "tag": "class", | ||||||
|  |         "tag_new": 2, | ||||||
|  |         "source": "spacy/matcher.pyx" | ||||||
|  |     }, | ||||||
|  | 
 | ||||||
|  |     "pipe": { | ||||||
|  |         "title": "Pipe", | ||||||
|  |         "teaser": "Abstract base class defining the API for pipeline components.", | ||||||
|  |         "tag": "class", | ||||||
|  |         "tag_new": 2, | ||||||
|  |         "source": "spacy/pipeline.pyx" | ||||||
|  |     }, | ||||||
|  | 
 | ||||||
|     "dependenyparser": { |     "dependenyparser": { | ||||||
|         "title": "DependencyParser", |         "title": "DependencyParser", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|  | @ -127,18 +133,22 @@ | ||||||
| 
 | 
 | ||||||
|     "entityrecognizer": { |     "entityrecognizer": { | ||||||
|         "title": "EntityRecognizer", |         "title": "EntityRecognizer", | ||||||
|  |         "teaser": "Annotate named entities on documents.", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|         "source": "spacy/pipeline.pyx" |         "source": "spacy/pipeline.pyx" | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     "textcategorizer": { |     "textcategorizer": { | ||||||
|         "title": "TextCategorizer", |         "title": "TextCategorizer", | ||||||
|  |         "teaser": "Add text categorization models to spaCy pipelines.", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|  |         "tag_new": 2, | ||||||
|         "source": "spacy/pipeline.pyx" |         "source": "spacy/pipeline.pyx" | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     "dependencyparser": { |     "dependencyparser": { | ||||||
|         "title": "DependencyParser", |         "title": "DependencyParser", | ||||||
|  |         "teaser": "Annotate syntactic dependencies on documents.", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|         "source": "spacy/pipeline.pyx" |         "source": "spacy/pipeline.pyx" | ||||||
|     }, |     }, | ||||||
|  | @ -149,15 +159,23 @@ | ||||||
|         "source": "spacy/tokenizer.pyx" |         "source": "spacy/tokenizer.pyx" | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|  |     "lemmatizer": { | ||||||
|  |         "title": "Lemmatizer", | ||||||
|  |         "tag": "class" | ||||||
|  |     }, | ||||||
|  | 
 | ||||||
|     "tagger": { |     "tagger": { | ||||||
|         "title": "Tagger", |         "title": "Tagger", | ||||||
|  |         "teaser": "Annotate part-of-speech tags on documents.", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|         "source": "spacy/pipeline.pyx" |         "source": "spacy/pipeline.pyx" | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     "tensorizer": { |     "tensorizer": { | ||||||
|         "title": "Tensorizer", |         "title": "Tensorizer", | ||||||
|  |         "teaser": "Add a tensor with position-sensitive meaning representations to a document.", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|  |         "tag_new": 2, | ||||||
|         "source": "spacy/pipeline.pyx" |         "source": "spacy/pipeline.pyx" | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|  | @ -169,23 +187,38 @@ | ||||||
| 
 | 
 | ||||||
|     "goldcorpus": { |     "goldcorpus": { | ||||||
|         "title": "GoldCorpus", |         "title": "GoldCorpus", | ||||||
|  |         "teaser": "An annotated corpus, using the JSON file format.", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|  |         "tag_new": 2, | ||||||
|         "source": "spacy/gold.pyx" |         "source": "spacy/gold.pyx" | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     "binder": { |     "binder": { | ||||||
|         "title": "Binder", |         "title": "Binder", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|  |         "tag_new": 2, | ||||||
|         "source": "spacy/tokens/binder.pyx" |         "source": "spacy/tokens/binder.pyx" | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     "vectors": { |     "vectors": { | ||||||
|         "title": "Vectors", |         "title": "Vectors", | ||||||
|  |         "teaser": "Store, save and load word vectors.", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|  |         "tag_new": 2, | ||||||
|         "source": "spacy/vectors.pyx" |         "source": "spacy/vectors.pyx" | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     "annotation": { |     "annotation": { | ||||||
|         "title": "Annotation Specifications" |         "title": "Annotation Specifications", | ||||||
|  |         "teaser": "Schemes used for labels, tags and training data.", | ||||||
|  |         "menu": { | ||||||
|  |             "Tokenization": "tokenization", | ||||||
|  |             "Sentence Boundaries": "sbd", | ||||||
|  |             "POS Tagging": "pos-tagging", | ||||||
|  |             "Lemmatization": "lemmatization", | ||||||
|  |             "Dependencies": "dependency-parsing", | ||||||
|  |             "Named Entities": "named-entities", | ||||||
|  |             "Training Data": "training" | ||||||
|  |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  | @ -1,26 +1,17 @@ | ||||||
| //- 💫 DOCS > USAGE > COMMAND LINE INTERFACE | //- 💫 DOCS > API > TOP-LEVEL > COMMAND LINE INTERFACE | ||||||
| 
 |  | ||||||
| include ../../_includes/_mixins |  | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  As of v1.7.0, spaCy comes with new command line helpers to download and |     |  As of v1.7.0, spaCy comes with new command line helpers to download and | ||||||
|     |  link models and show useful debugging information. For a list of available |     |  link models and show useful debugging information. For a list of available | ||||||
|     |  commands, type #[code spacy --help]. |     |  commands, type #[code spacy --help]. | ||||||
| 
 | 
 | ||||||
| +infobox("⚠️ Deprecation note") | +h(3, "download") Download | ||||||
|     |  As of spaCy 2.0, the #[code model] command to initialise a model data |  | ||||||
|     |  directory is deprecated. The command was only necessary because previous |  | ||||||
|     |  versions of spaCy expected a model directory to already be set up. This |  | ||||||
|     |  has since been changed, so you can use the #[+api("cli#train") #[code train]] |  | ||||||
|     |  command straight away. |  | ||||||
| 
 |  | ||||||
| +h(2, "download") Download |  | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Download #[+a("/docs/usage/models") models] for spaCy. The downloader finds the |     |  Download #[+a("/usage/models") models] for spaCy. The downloader finds the | ||||||
|     |  best-matching compatible version, uses pip to download the model as a |     |  best-matching compatible version, uses pip to download the model as a | ||||||
|     |  package and automatically creates a |     |  package and automatically creates a | ||||||
|     |  #[+a("/docs/usage/models#usage") shortcut link] to load the model by name. |     |  #[+a("/usage/models#usage") shortcut link] to load the model by name. | ||||||
|     |  Direct downloads don't perform any compatibility checks and require the |     |  Direct downloads don't perform any compatibility checks and require the | ||||||
|     |  model name to be specified with its version (e.g., #[code en_core_web_sm-1.2.0]). |     |  model name to be specified with its version (e.g., #[code en_core_web_sm-1.2.0]). | ||||||
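As a small aside, the CLI commands can also be invoked from Python. A hedged sketch, assuming spaCy v1.7+/v2.x where the command is exposed as spacy.cli.download; check your installed version before relying on it:

    # Hedged sketch: calling the download command from Python (assumes spacy.cli.download exists).
    from spacy.cli import download

    # Equivalent to running "spacy download en" on the command line;
    # pass direct=True together with a versioned name for a direct download.
    download('en')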
| 
 | 
 | ||||||
|  | @ -49,15 +40,15 @@ p | ||||||
|     |  detailed messages in case things go wrong. It's #[strong not recommended] |     |  detailed messages in case things go wrong. It's #[strong not recommended] | ||||||
|     |  to use this command as part of an automated process. If you know which |     |  to use this command as part of an automated process. If you know which | ||||||
|     |  model your project needs, you should consider a |     |  model your project needs, you should consider a | ||||||
|     |  #[+a("/docs/usage/models#download-pip") direct download via pip], or |     |  #[+a("/usage/models#download-pip") direct download via pip], or | ||||||
|     |  uploading the model to a local PyPi installation and fetching it straight |     |  uploading the model to a local PyPi installation and fetching it straight | ||||||
|     |  from there. This will also allow you to add it as a versioned package |     |  from there. This will also allow you to add it as a versioned package | ||||||
|     |  dependency to your project. |     |  dependency to your project. | ||||||
| 
 | 
 | ||||||
| +h(2, "link") Link | +h(3, "link") Link | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Create a #[+a("/docs/usage/models#usage") shortcut link] for a model, |     |  Create a #[+a("/usage/models#usage") shortcut link] for a model, | ||||||
|     |  either a Python package or a local directory. This will let you load |     |  either a Python package or a local directory. This will let you load | ||||||
|     |  models from any location using a custom name via |     |  models from any location using a custom name via | ||||||
|     |  #[+api("spacy#load") #[code spacy.load()]]. |     |  #[+api("spacy#load") #[code spacy.load()]]. | ||||||
|  | @ -95,7 +86,7 @@ p | ||||||
|         +cell flag |         +cell flag | ||||||
|         +cell Show help message and available arguments. |         +cell Show help message and available arguments. | ||||||
| 
 | 
 | ||||||
| +h(2, "info") Info | +h(3, "info") Info | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Print information about your spaCy installation, models and local setup, |     |  Print information about your spaCy installation, models and local setup, | ||||||
|  | @ -122,15 +113,15 @@ p | ||||||
|         +cell flag |         +cell flag | ||||||
|         +cell Show help message and available arguments. |         +cell Show help message and available arguments. | ||||||
| 
 | 
 | ||||||
| +h(2, "convert") Convert | +h(3, "convert") Convert | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Convert files into spaCy's #[+a("/docs/api/annotation#json-input") JSON format] |     |  Convert files into spaCy's #[+a("/api/annotation#json-input") JSON format] | ||||||
|     |  for use with the #[code train] command and other experiment management |     |  for use with the #[code train] command and other experiment management | ||||||
|     |  functions. The right converter is chosen based on the file extension of |     |  functions. The right converter is chosen based on the file extension of | ||||||
|     |  the input file. Currently only supports #[code .conllu]. |     |  the input file. Currently only supports #[code .conllu]. | ||||||
| 
 | 
 | ||||||
| +code(false, "bash", "$"). | +code(false, "bash", "$", false, false, true). | ||||||
|     spacy convert [input_file] [output_dir] [--n-sents] [--morphology] |     spacy convert [input_file] [output_dir] [--n-sents] [--morphology] | ||||||
| 
 | 
 | ||||||
| +table(["Argument", "Type", "Description"]) | +table(["Argument", "Type", "Description"]) | ||||||
|  | @ -159,14 +150,18 @@ p | ||||||
|         +cell flag |         +cell flag | ||||||
|         +cell Show help message and available arguments. |         +cell Show help message and available arguments. | ||||||
| 
 | 
 | ||||||
| +h(2, "train") Train | +h(3, "train") Train | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Train a model. Expects data in spaCy's |     |  Train a model. Expects data in spaCy's | ||||||
|     |  #[+a("/docs/api/annotation#json-input") JSON format]. |     |  #[+a("/api/annotation#json-input") JSON format]. On each epoch, a model | ||||||
|  |     |  will be saved out to the directory. Accuracy scores and model details | ||||||
|  |     |  will be added to a #[+a("/usage/training#models-generating") #[code meta.json]] | ||||||
|  |     |  to allow packaging the model using the | ||||||
|  |     |  #[+api("cli#package") #[code package]] command. | ||||||
| 
 | 
 | ||||||
| +code(false, "bash", "$"). | +code(false, "bash", "$", false, false, true). | ||||||
|     spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities] |     spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--meta-path] [--vectors] [--no-tagger] [--no-parser] [--no-entities] [--gold-preproc] | ||||||
| 
 | 
 | ||||||
| +table(["Argument", "Type", "Description"]) | +table(["Argument", "Type", "Description"]) | ||||||
|     +row |     +row | ||||||
|  | @ -204,6 +199,27 @@ p | ||||||
|         +cell option |         +cell option | ||||||
|         +cell Use GPU. |         +cell Use GPU. | ||||||
| 
 | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code --vectors], #[code -v] | ||||||
|  |         +cell option | ||||||
|  |         +cell Model to load vectors from. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code --meta-path], #[code -m] | ||||||
|  |         +cell option | ||||||
|  |         +cell | ||||||
|  |             |  #[+tag-new(2)] Optional path to model | ||||||
|  |             |  #[+a("/usage/training#models-generating") #[code meta.json]]. | ||||||
|  |             |  All relevant properties like #[code lang], #[code pipeline] and | ||||||
|  |             |  #[code spacy_version] will be overwritten. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code --version], #[code -V] | ||||||
|  |         +cell option | ||||||
|  |         +cell | ||||||
|  |             |  Model version. Will be written out to the model's | ||||||
|  |             |  #[code meta.json] after training. | ||||||
|  | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code --no-tagger], #[code -T] |         +cell #[code --no-tagger], #[code -T] | ||||||
|         +cell flag |         +cell flag | ||||||
|  | @ -219,12 +235,18 @@ p | ||||||
|         +cell flag |         +cell flag | ||||||
|         +cell Don't train NER. |         +cell Don't train NER. | ||||||
| 
 | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code --gold-preproc], #[code -G] | ||||||
|  |         +cell flag | ||||||
|  |         +cell Use gold preprocessing. | ||||||
|  | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code --help], #[code -h] |         +cell #[code --help], #[code -h] | ||||||
|         +cell flag |         +cell flag | ||||||
|         +cell Show help message and available arguments. |         +cell Show help message and available arguments. | ||||||
| 
 | 
 | ||||||
| +h(3, "train-hyperparams") Environment variables for hyperparameters | +h(4, "train-hyperparams") Environment variables for hyperparameters | ||||||
|  |     +tag-new(2) | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  spaCy lets you set hyperparameters for training via environment variables. |     |  spaCy lets you set hyperparameters for training via environment variables. | ||||||
|  | @ -236,98 +258,149 @@ p | ||||||
| +code(false, "bash"). | +code(false, "bash"). | ||||||
|     parser_hidden_depth=2 parser_maxout_pieces=1 train-parser |     parser_hidden_depth=2 parser_maxout_pieces=1 train-parser | ||||||
| 
 | 
 | ||||||
| +under-construction |  | ||||||
| 
 |  | ||||||
| +table(["Name", "Description", "Default"]) | +table(["Name", "Description", "Default"]) | ||||||
|     +row |     +row | ||||||
|         +cell #[code dropout_from] |         +cell #[code dropout_from] | ||||||
|         +cell |         +cell Initial dropout rate. | ||||||
|         +cell #[code 0.2] |         +cell #[code 0.2] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code dropout_to] |         +cell #[code dropout_to] | ||||||
|         +cell |         +cell Final dropout rate. | ||||||
|         +cell #[code 0.2] |         +cell #[code 0.2] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code dropout_decay] |         +cell #[code dropout_decay] | ||||||
|         +cell |         +cell Rate of dropout change. | ||||||
|         +cell #[code 0.0] |         +cell #[code 0.0] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code batch_from] |         +cell #[code batch_from] | ||||||
|         +cell |         +cell Initial batch size. | ||||||
|         +cell #[code 1] |         +cell #[code 1] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code batch_to] |         +cell #[code batch_to] | ||||||
|         +cell |         +cell Final batch size. | ||||||
|         +cell #[code 64] |         +cell #[code 64] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code batch_compound] |         +cell #[code batch_compound] | ||||||
|         +cell |         +cell Rate of batch size acceleration. | ||||||
|         +cell #[code 1.001] |         +cell #[code 1.001] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code token_vector_width] |         +cell #[code token_vector_width] | ||||||
|         +cell |         +cell Width of embedding tables and convolutional layers. | ||||||
|         +cell #[code 128] |         +cell #[code 128] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code embed_size] |         +cell #[code embed_size] | ||||||
|         +cell |         +cell Number of rows in embedding tables. | ||||||
|         +cell #[code 7500] |         +cell #[code 7500] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code parser_maxout_pieces] |         +cell #[code parser_maxout_pieces] | ||||||
|         +cell |         +cell Number of pieces in the parser's and NER's first maxout layer. | ||||||
|         +cell #[code 2] |         +cell #[code 2] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code parser_hidden_depth] |         +cell #[code parser_hidden_depth] | ||||||
|         +cell |         +cell Number of hidden layers in the parser and NER. | ||||||
|         +cell #[code 1] |         +cell #[code 1] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code hidden_width] |         +cell #[code hidden_width] | ||||||
|         +cell |         +cell Size of the parser's and NER's hidden layers. | ||||||
|         +cell #[code 128] |         +cell #[code 128] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code learn_rate] |         +cell #[code learn_rate] | ||||||
|         +cell |         +cell Learning rate. | ||||||
|         +cell #[code 0.001] |         +cell #[code 0.001] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code optimizer_B1] |         +cell #[code optimizer_B1] | ||||||
|         +cell |         +cell Momentum for the Adam solver. | ||||||
|         +cell #[code 0.9] |         +cell #[code 0.9] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code optimizer_B2] |         +cell #[code optimizer_B2] | ||||||
|         +cell |         +cell Second-moment decay rate (beta2) for the Adam solver. | ||||||
|         +cell #[code 0.999] |         +cell #[code 0.999] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code optimizer_eps] |         +cell #[code optimizer_eps] | ||||||
|         +cell |         +cell Epsilon value for the Adam solver. | ||||||
|         +cell #[code 1e-08] |         +cell #[code 1e-08] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code L2_penalty] |         +cell #[code L2_penalty] | ||||||
|         +cell |         +cell L2 regularisation penalty. | ||||||
|         +cell #[code 1e-06] |         +cell #[code 1e-06] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code grad_norm_clip] |         +cell #[code grad_norm_clip] | ||||||
|         +cell |         +cell Gradient L2 norm constraint. | ||||||
|         +cell #[code 1.0] |         +cell #[code 1.0] | ||||||
| 
 | 
 | ||||||
| +h(2, "package") Package | +h(3, "evaluate") Evaluate | ||||||
|  |     +tag-new(2) | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Generate a #[+a("/docs/usage/saving-loading#generating") model Python package] |     |  Evaluate a model's accuracy and speed on JSON-formatted annotated data. | ||||||
|  |     |  Will print the results and optionally export | ||||||
|  |     |  #[+a("/usage/visualizers") displaCy visualizations] of a sample set of | ||||||
|  |     |  parses to #[code .html] files. Visualizations for the dependency parse | ||||||
|  |     |  and NER will be exported as separate files if the respective component | ||||||
|  |     |  is present in the model's pipeline. | ||||||
|  | 
 | ||||||
|  | +code(false, "bash", "$", false, false, true). | ||||||
|  |     spacy evaluate [model] [data_path] [--displacy-path] [--displacy-limit] [--gpu-id] [--gold-preproc] | ||||||
|  | 
 | ||||||
|  | +table(["Argument", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code model] | ||||||
|  |         +cell positional | ||||||
|  |         +cell | ||||||
|  |             |  Model to evaluate. Can be a package or shortcut link name, or a | ||||||
|  |             |  path to a model data directory. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code data_path] | ||||||
|  |         +cell positional | ||||||
|  |         +cell Location of JSON-formatted evaluation data. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code --displacy-path], #[code -dp] | ||||||
|  |         +cell option | ||||||
|  |         +cell | ||||||
|  |             |  Directory to output rendered parses as HTML. If not set, no | ||||||
|  |             |  visualizations will be generated. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code --displacy-limit], #[code -dl] | ||||||
|  |         +cell option | ||||||
|  |         +cell | ||||||
|  |             |  Number of parses to generate per file. Defaults to #[code 25]. | ||||||
|  |             |  Keep in mind that a significantly higher number might cause the | ||||||
|  |             |  #[code .html] files to render slowly. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code --gpu-id], #[code -g] | ||||||
|  |         +cell option | ||||||
|  |         +cell GPU to use, if any. Defaults to #[code -1] for CPU. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code --gold-preproc], #[code -G] | ||||||
|  |         +cell flag | ||||||
|  |         +cell Use gold preprocessing. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | +h(3, "package") Package | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Generate a #[+a("/usage/training#models-generating") model Python package] | ||||||
|     |  from an existing model data directory. All data files are copied over. |     |  from an existing model data directory. All data files are copied over. | ||||||
|     |  If the path to a meta.json is supplied, or a meta.json is found in the |     |  If the path to a meta.json is supplied, or a meta.json is found in the | ||||||
|     |  input directory, this file is used. Otherwise, the data can be entered |     |  input directory, this file is used. Otherwise, the data can be entered | ||||||
|  | @ -336,8 +409,8 @@ p | ||||||
|     |  sure you're always using the latest versions. This means you need to be |     |  sure you're always using the latest versions. This means you need to be | ||||||
|     |  connected to the internet to use this command. |     |  connected to the internet to use this command. | ||||||
| 
 | 
 | ||||||
| +code(false, "bash", "$"). | +code(false, "bash", "$", false, false, true). | ||||||
|     spacy package [input_dir] [output_dir] [--meta] [--force] |     spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force] | ||||||
| 
 | 
 | ||||||
| +table(["Argument", "Type", "Description"]) | +table(["Argument", "Type", "Description"]) | ||||||
|     +row |     +row | ||||||
|  | @ -353,14 +426,14 @@ p | ||||||
|     +row |     +row | ||||||
|         +cell #[code --meta-path], #[code -m] |         +cell #[code --meta-path], #[code -m] | ||||||
|         +cell option |         +cell option | ||||||
|         +cell Path to meta.json file (optional). |         +cell #[+tag-new(2)] Path to meta.json file (optional). | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code --create-meta], #[code -c] |         +cell #[code --create-meta], #[code -c] | ||||||
|         +cell flag |         +cell flag | ||||||
|         +cell |         +cell | ||||||
|             |  Create a meta.json file on the command line, even if one already |             |  #[+tag-new(2)] Create a meta.json file on the command line, even | ||||||
|             |  exists in the directory. |             |  if one already exists in the directory. | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code --force], #[code -f] |         +cell #[code --force], #[code -f] | ||||||
							
								
								
									
91  website/api/_top-level/_compat.jade  Normal file
							|  | @ -0,0 +1,91 @@ | ||||||
|  | //- 💫 DOCS > API > TOP-LEVEL > COMPATIBILITY | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  All Python code is written in an | ||||||
|  |     |  #[strong intersection of Python 2 and Python 3]. This is easy in Cython, | ||||||
|  |     |  but somewhat ugly in Python. Logic that deals with Python or platform | ||||||
|  |     |  compatibility only lives in #[code spacy.compat]. To distinguish them from | ||||||
|  |     |  the builtin functions, replacement functions are suffixed with an | ||||||
|  |     |  underscore, e.g. #[code unicode_]. For specific checks, spaCy uses the | ||||||
|  |     |  #[code six] and #[code ftfy] packages. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     from spacy.compat import unicode_, json_dumps | ||||||
|  | 
 | ||||||
|  |     compatible_unicode = unicode_('hello world') | ||||||
|  |     compatible_json = json_dumps({'key': 'value'}) | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Python 2", "Python 3"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code compat.bytes_] | ||||||
|  |         +cell #[code str] | ||||||
|  |         +cell #[code bytes] | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code compat.unicode_] | ||||||
|  |         +cell #[code unicode] | ||||||
|  |         +cell #[code str] | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code compat.basestring_] | ||||||
|  |         +cell #[code basestring] | ||||||
|  |         +cell #[code str] | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code compat.input_] | ||||||
|  |         +cell #[code raw_input] | ||||||
|  |         +cell #[code input] | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code compat.json_dumps] | ||||||
|  |         +cell #[code ujson.dumps] with #[code .decode('utf8')] | ||||||
|  |         +cell #[code ujson.dumps] | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code compat.path2str] | ||||||
|  |         +cell #[code str(path)] with #[code .decode('utf8')] | ||||||
|  |         +cell #[code str(path)] | ||||||
|  | 
 | ||||||
|  | +h(3, "is_config") compat.is_config | ||||||
|  |     +tag function | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Check if a specific configuration of Python version and operating system | ||||||
|  |     |  matches the user's setup. Mostly used to display targeted error messages. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     from spacy.compat import is_config | ||||||
|  | 
 | ||||||
|  |     if is_config(python2=True, windows=True): | ||||||
|  |         print("You are using Python 2 on Windows.") | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code python2] | ||||||
|  |         +cell bool | ||||||
|  |         +cell spaCy is executed with Python 2.x. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code python3] | ||||||
|  |         +cell bool | ||||||
|  |         +cell spaCy is executed with Python 3.x. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code windows] | ||||||
|  |         +cell bool | ||||||
|  |         +cell spaCy is executed on Windows. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code linux] | ||||||
|  |         +cell bool | ||||||
|  |         +cell spaCy is executed on Linux. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code osx] | ||||||
|  |         +cell bool | ||||||
|  |         +cell spaCy is executed on OS X or macOS. | ||||||
|  | 
 | ||||||
|  |     +row("foot") | ||||||
|  |         +cell returns | ||||||
|  |         +cell bool | ||||||
|  |         +cell Whether the specified configuration matches the user's platform. | ||||||
|  | @ -1,14 +1,12 @@ | ||||||
| //- 💫 DOCS > API > DISPLACY | //- 💫 DOCS > API > TOP-LEVEL > DISPLACY | ||||||
| 
 |  | ||||||
| include ../../_includes/_mixins |  | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  As of v2.0, spaCy comes with a built-in visualization suite. For more |     |  As of v2.0, spaCy comes with a built-in visualization suite. For more | ||||||
|     |  info and examples, see the usage guide on |     |  info and examples, see the usage guide on | ||||||
|     |  #[+a("/docs/usage/visualizers") visualizing spaCy]. |     |  #[+a("/usage/visualizers") visualizing spaCy]. | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| +h(2, "serve") displacy.serve | +h(3, "displacy.serve") displacy.serve | ||||||
|     +tag method |     +tag method | ||||||
|     +tag-new(2) |     +tag-new(2) | ||||||
| 
 | 
 | ||||||
|  | @ -60,7 +58,7 @@ p | ||||||
|         +cell bool |         +cell bool | ||||||
|         +cell |         +cell | ||||||
|             |  Don't parse #[code Doc] and instead, expect a dict or list of |             |  Don't parse #[code Doc] and instead, expect a dict or list of | ||||||
|             |  dicts. #[+a("/docs/usage/visualizers#manual-usage") See here] |             |  dicts. #[+a("/usage/visualizers#manual-usage") See here] | ||||||
|             |  for formats and examples. |             |  for formats and examples. | ||||||
|         +cell #[code False] |         +cell #[code False] | ||||||
| 
 | 
 | ||||||
|  | @ -70,7 +68,7 @@ p | ||||||
|         +cell Port to serve visualization. |         +cell Port to serve visualization. | ||||||
|         +cell #[code 5000] |         +cell #[code 5000] | ||||||
| 
 | 
 | ||||||
| +h(2, "render") displacy.render | +h(3, "displacy.render") displacy.render | ||||||
|     +tag method |     +tag method | ||||||
|     +tag-new(2) |     +tag-new(2) | ||||||
| 
 | 
 | ||||||
|  | @ -127,24 +125,24 @@ p Render a dependency parse tree or named entity visualization. | ||||||
|         +cell bool |         +cell bool | ||||||
|         +cell |         +cell | ||||||
|             |  Don't parse #[code Doc] and instead, expect a dict or list of |             |  Don't parse #[code Doc] and instead, expect a dict or list of | ||||||
|             |  dicts. #[+a("/docs/usage/visualizers#manual-usage") See here] |             |  dicts. #[+a("/usage/visualizers#manual-usage") See here] | ||||||
|             |  for formats and examples. |             |  for formats and examples. | ||||||
|         +cell #[code False] |         +cell #[code False] | ||||||
| 
 | 
 | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell unicode |         +cell unicode | ||||||
|         +cell Rendered HTML markup. |         +cell Rendered HTML markup. | ||||||
|         +cell |         +cell | ||||||
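As a usage sketch of the returned markup (the model name and output path are placeholders, not part of this commit):

    import io

    import spacy
    from spacy import displacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'Apple is looking at buying a U.K. startup for $1 billion.')
    html = displacy.render(doc, style='ent', page=True)  # unicode HTML string
    with io.open('entities.html', 'w', encoding='utf8') as f:
        f.write(html)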
| 
 | 
 | ||||||
| +h(2, "options") Visualizer options | +h(3, "displacy_options") Visualizer options | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  The #[code options] argument lets you specify additional settings for |     |  The #[code options] argument lets you specify additional settings for | ||||||
|     |  each visualizer. If a setting is not present in the options, the default |     |  each visualizer. If a setting is not present in the options, the default | ||||||
|     |  value will be used. |     |  value will be used. | ||||||
| 
 | 
 | ||||||
| +h(3, "options-dep") Dependency Visualizer options | +h(4, "options-dep") Dependency Visualizer options | ||||||
| 
 | 
 | ||||||
| +aside-code("Example"). | +aside-code("Example"). | ||||||
|     options = {'compact': True, 'color': 'blue'} |     options = {'compact': True, 'color': 'blue'} | ||||||
|  | @ -219,7 +217,7 @@ p | ||||||
|         +cell Distance between words in px. |         +cell Distance between words in px. | ||||||
|         +cell #[code 175] / #[code 85] (compact) |         +cell #[code 175] / #[code 85] (compact) | ||||||
| 
 | 
 | ||||||
| +h(3, "options-ent") Named Entity Visualizer options | +h(4, "displacy_options-ent") Named Entity Visualizer options | ||||||
| 
 | 
 | ||||||
| +aside-code("Example"). | +aside-code("Example"). | ||||||
|     options = {'ents': ['PERSON', 'ORG', 'PRODUCT'], |     options = {'ents': ['PERSON', 'ORG', 'PRODUCT'], | ||||||
|  | @ -244,6 +242,6 @@ p | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  By default, displaCy comes with colours for all |     |  By default, displaCy comes with colours for all | ||||||
|     |  #[+a("/docs/api/annotation#named-entities") entity types supported by spaCy]. |     |  #[+a("/api/annotation#named-entities") entity types supported by spaCy]. | ||||||
|     |  If you're using custom entity types, you can use the #[code colors] |     |  If you're using custom entity types, you can use the #[code colors] | ||||||
|     |  setting to add your own colours for them. |     |  setting to add your own colours for them. | ||||||
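A hedged sketch of the colors setting with a made-up entity type, using the manual dict format mentioned above so it runs without a loaded model:

    from spacy import displacy

    ex = {'text': 'But Google is starting from behind.',
          'ents': [{'start': 4, 'end': 10, 'label': 'MY_ORG'}],
          'title': None}
    colors = {'MY_ORG': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)'}
    options = {'ents': ['MY_ORG'], 'colors': colors}
    html = displacy.render(ex, style='ent', manual=True, options=options)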
|  | @ -1,15 +1,13 @@ | ||||||
| //- 💫 DOCS > API > SPACY | //- 💫 DOCS > API > TOP-LEVEL > SPACY | ||||||
| 
 | 
 | ||||||
| include ../../_includes/_mixins | +h(3, "spacy.load") spacy.load | ||||||
| 
 |  | ||||||
| +h(2, "load") spacy.load |  | ||||||
|     +tag function |     +tag function | ||||||
|     +tag-model |     +tag-model | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Load a model via its #[+a("/docs/usage/models#usage") shortcut link], |     |  Load a model via its #[+a("/usage/models#usage") shortcut link], | ||||||
|     |  the name of an installed |     |  the name of an installed | ||||||
|     |  #[+a("/docs/usage/saving-loading#generating") model package], a unicode |     |  #[+a("/usage/training#models-generating") model package], a unicode | ||||||
|     |  path or a #[code Path]-like object. spaCy will try resolving the load |     |  path or a #[code Path]-like object. spaCy will try resolving the load | ||||||
|     |  argument in this order. If a model is loaded from a shortcut link or |     |  argument in this order. If a model is loaded from a shortcut link or | ||||||
|     |  package name, spaCy will assume it's a Python package and import it and |     |  package name, spaCy will assume it's a Python package and import it and | ||||||
|  | @ -38,25 +36,57 @@ p | ||||||
|         +cell list |         +cell list | ||||||
|         +cell |         +cell | ||||||
|             |  Names of pipeline components to |             |  Names of pipeline components to | ||||||
|             |  #[+a("/docs/usage/language-processing-pipeline#disabling") disable]. |             |  #[+a("/usage/processing-pipelines#disabling") disable]. | ||||||
| 
 | 
 | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell #[code Language] |         +cell #[code Language] | ||||||
|         +cell A #[code Language] object with the loaded model. |         +cell A #[code Language] object with the loaded model. | ||||||
| 
 | 
 | ||||||
| +infobox("⚠️ Deprecation note") | +infobox("Deprecation note", "⚠️") | ||||||
|     .o-block |     .o-block | ||||||
|         |  As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy |         |  As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy | ||||||
|         |  will also raise an error if no model could be loaded and never just |         |  will also raise an error if no model could be loaded and never just | ||||||
|         |  return an empty #[code Language] object. If you need a blank language, |         |  return an empty #[code Language] object. If you need a blank language, | ||||||
|         |  you need to import it explicitly (#[code from spacy.lang.en import English]) |         |  you can use the new function #[+api("spacy#blank") #[code spacy.blank()]] | ||||||
|         |  or use #[+api("util#get_lang_class") #[code util.get_lang_class]]. |         |  or import the class explicitly, e.g. | ||||||
|  |         |  #[code from spacy.lang.en import English]. | ||||||
| 
 | 
 | ||||||
|     +code-new nlp = spacy.load('/model') |     +code-new nlp = spacy.load('/model') | ||||||
|     +code-old nlp = spacy.load('en', path='/model') |     +code-old nlp = spacy.load('en', path='/model') | ||||||
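A short sketch of the resolution order described above; the package name and paths are placeholders for whatever is installed locally:

    import spacy

    # 1) shortcut link or installed package name, imported as a Python package
    nlp = spacy.load('en_core_web_sm')

    # 2) path to a model data directory
    nlp_from_dir = spacy.load('/model')

    # pipeline components can be disabled on load
    nlp_no_parse = spacy.load('en_core_web_sm', disable=['parser'])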
| 
 | 
 | ||||||
| +h(2, "info") spacy.info | +h(3, "spacy.blank") spacy.blank | ||||||
|  |     +tag function | ||||||
|  |     +tag-new(2) | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Create a blank model of a given language class. This function is the | ||||||
|  |     |  twin of #[code spacy.load()]. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     nlp_en = spacy.blank('en') | ||||||
|  |     nlp_de = spacy.blank('de') | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code name] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell ISO code of the language class to load. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code disable] | ||||||
|  |         +cell list | ||||||
|  |         +cell | ||||||
|  |             |  Names of pipeline components to | ||||||
|  |             |  #[+a("/usage/processing-pipelines#disabling") disable]. | ||||||
|  | 
 | ||||||
|  |     +row("foot") | ||||||
|  |         +cell returns | ||||||
|  |         +cell #[code Language] | ||||||
|  |         +cell An empty #[code Language] object of the appropriate subclass. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | +h(4, "spacy.info") spacy.info | ||||||
|     +tag function |     +tag function | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|  | @ -83,13 +113,13 @@ p | ||||||
|         +cell Print information as Markdown. |         +cell Print information as Markdown. | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| +h(2, "explain") spacy.explain | +h(3, "spacy.explain") spacy.explain | ||||||
|     +tag function |     +tag function | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Get a description for a given POS tag, dependency label or entity type. |     |  Get a description for a given POS tag, dependency label or entity type. | ||||||
|     |  For a list of available terms, see |     |  For a list of available terms, see | ||||||
|     |  #[+src(gh("spacy", "spacy/glossary.py")) glossary.py]. |     |  #[+src(gh("spacy", "spacy/glossary.py")) #[code glossary.py]]. | ||||||
| 
 | 
 | ||||||
| +aside-code("Example"). | +aside-code("Example"). | ||||||
|     spacy.explain('NORP') |     spacy.explain('NORP') | ||||||
|  | @ -107,18 +137,18 @@ p | ||||||
|         +cell unicode |         +cell unicode | ||||||
|         +cell Term to explain. |         +cell Term to explain. | ||||||
| 
 | 
 | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell unicode |         +cell unicode | ||||||
|         +cell The explanation, or #[code None] if not found in the glossary. |         +cell The explanation, or #[code None] if not found in the glossary. | ||||||
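For example, explain() can be combined with a processed Doc (assumes an installed English model; unknown terms simply return None):

    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers.')
    for token in doc:
        print(token.text, token.tag_, spacy.explain(token.tag_))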
| 
 | 
 | ||||||
| +h(2, "set_factory") spacy.set_factory | +h(3, "spacy.set_factory") spacy.set_factory | ||||||
|     +tag function |     +tag function | ||||||
|     +tag-new(2) |     +tag-new(2) | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Set a factory that returns a custom |     |  Set a factory that returns a custom | ||||||
|     |  #[+a("/docs/usage/language-processing-pipeline") processing pipeline] |     |  #[+a("/usage/processing-pipelines") processing pipeline] | ||||||
|     |  component. Factories are useful for creating stateful components, especially ones which depend on shared data. |     |  component. Factories are useful for creating stateful components, especially ones which depend on shared data. | ||||||
| 
 | 
 | ||||||
| +aside-code("Example"). | +aside-code("Example"). | ||||||
|  | @ -1,10 +1,8 @@ | ||||||
| //- 💫 DOCS > API > UTIL | //- 💫 DOCS > API > TOP-LEVEL > UTIL | ||||||
| 
 |  | ||||||
| include ../../_includes/_mixins |  | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  spaCy comes with a small collection of utility functions located in |     |  spaCy comes with a small collection of utility functions located in | ||||||
|     |  #[+src(gh("spaCy", "spacy/util.py")) spacy/util.py]. |     |  #[+src(gh("spaCy", "spacy/util.py")) #[code spacy/util.py]]. | ||||||
|     |  Because utility functions are mostly intended for |     |  Because utility functions are mostly intended for | ||||||
|     |  #[strong internal use within spaCy], their behaviour may change with |     |  #[strong internal use within spaCy], their behaviour may change with | ||||||
|     |  future releases. The functions documented on this page should be safe |     |  future releases. The functions documented on this page should be safe | ||||||
|  | @ -12,7 +10,7 @@ p | ||||||
|     |  recommend having additional tests in place if your application depends on |     |  recommend having additional tests in place if your application depends on | ||||||
|     |  any of spaCy's utilities. |     |  any of spaCy's utilities. | ||||||
| 
 | 
 | ||||||
| +h(2, "get_data_path") util.get_data_path | +h(3, "util.get_data_path") util.get_data_path | ||||||
|     +tag function |     +tag function | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|  | @ -25,12 +23,12 @@ p | ||||||
|         +cell bool |         +cell bool | ||||||
|         +cell Only return path if it exists, otherwise return #[code None]. |         +cell Only return path if it exists, otherwise return #[code None]. | ||||||
| 
 | 
 | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell #[code Path] / #[code None] |         +cell #[code Path] / #[code None] | ||||||
|         +cell Data path or #[code None]. |         +cell Data path or #[code None]. | ||||||
| 
 | 
 | ||||||
| +h(2, "set_data_path") util.set_data_path | +h(3, "util.set_data_path") util.set_data_path | ||||||
|     +tag function |     +tag function | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|  | @ -47,12 +45,12 @@ p | ||||||
|         +cell unicode or #[code Path] |         +cell unicode or #[code Path] | ||||||
|         +cell Path to new data directory. |         +cell Path to new data directory. | ||||||
| 
 | 
 | ||||||
| +h(2, "get_lang_class") util.get_lang_class | +h(3, "util.get_lang_class") util.get_lang_class | ||||||
|     +tag function |     +tag function | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Import and load a #[code Language] class. Allows lazy-loading |     |  Import and load a #[code Language] class. Allows lazy-loading | ||||||
|     |  #[+a("/docs/usage/adding-languages") language data] and importing |     |  #[+a("/usage/adding-languages") language data] and importing | ||||||
|     |  languages using the two-letter language code. |     |  languages using the two-letter language code. | ||||||
| 
 | 
 | ||||||
| +aside-code("Example"). | +aside-code("Example"). | ||||||
|  | @ -67,12 +65,12 @@ p | ||||||
|         +cell unicode |         +cell unicode | ||||||
|         +cell Two-letter language code, e.g. #[code 'en']. |         +cell Two-letter language code, e.g. #[code 'en']. | ||||||
| 
 | 
 | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell #[code Language] |         +cell #[code Language] | ||||||
|         +cell Language class. |         +cell Language class. | ||||||
| 
 | 
 | ||||||
| +h(2, "load_model") util.load_model | +h(3, "util.load_model") util.load_model | ||||||
|     +tag function |     +tag function | ||||||
|     +tag-new(2) |     +tag-new(2) | ||||||
| 
 | 
 | ||||||
|  | @ -101,12 +99,12 @@ p | ||||||
|         +cell - |         +cell - | ||||||
|         +cell Specific overrides, like pipeline components to disable. |         +cell Specific overrides, like pipeline components to disable. | ||||||
| 
 | 
 | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell #[code Language] |         +cell #[code Language] | ||||||
|         +cell #[code Language] class with the loaded model. |         +cell #[code Language] class with the loaded model. | ||||||
| 
 | 
 | ||||||
| +h(2, "load_model_from_path") util.load_model_from_path | +h(3, "util.load_model_from_path") util.load_model_from_path | ||||||
|     +tag function |     +tag function | ||||||
|     +tag-new(2) |     +tag-new(2) | ||||||
| 
 | 
 | ||||||
|  | @ -139,18 +137,18 @@ p | ||||||
|         +cell - |         +cell - | ||||||
|         +cell Specific overrides, like pipeline components to disable. |         +cell Specific overrides, like pipeline components to disable. | ||||||
| 
 | 
 | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell #[code Language] |         +cell #[code Language] | ||||||
|         +cell #[code Language] class with the loaded model. |         +cell #[code Language] class with the loaded model. | ||||||
| 
 | 
 | ||||||
| +h(2, "load_model_from_init_py") util.load_model_from_init_py | +h(3, "util.load_model_from_init_py") util.load_model_from_init_py | ||||||
|     +tag function |     +tag function | ||||||
|     +tag-new(2) |     +tag-new(2) | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  A helper function to use in the #[code load()] method of a model package's |     |  A helper function to use in the #[code load()] method of a model package's | ||||||
|     |  #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py]. |     |  #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) #[code __init__.py]]. | ||||||
| 
 | 
 | ||||||
| +aside-code("Example"). | +aside-code("Example"). | ||||||
|     from spacy.util import load_model_from_init_py |     from spacy.util import load_model_from_init_py | ||||||
|  | @ -169,12 +167,12 @@ p | ||||||
|         +cell - |         +cell - | ||||||
|         +cell Specific overrides, like pipeline components to disable. |         +cell Specific overrides, like pipeline components to disable. | ||||||
| 
 | 
 | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell #[code Language] |         +cell #[code Language] | ||||||
|         +cell #[code Language] class with the loaded model. |         +cell #[code Language] class with the loaded model. | ||||||
| 
 | 
 | ||||||
| +h(2, "get_model_meta") util.get_model_meta | +h(3, "util.get_model_meta") util.get_model_meta | ||||||
|     +tag function |     +tag function | ||||||
|     +tag-new(2) |     +tag-new(2) | ||||||
| 
 | 
 | ||||||
|  | @ -190,17 +188,17 @@ p | ||||||
|         +cell unicode or #[code Path] |         +cell unicode or #[code Path] | ||||||
|         +cell Path to model directory. |         +cell Path to model directory. | ||||||
| 
 | 
 | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell dict |         +cell dict | ||||||
|         +cell The model's meta data. |         +cell The model's meta data. | ||||||
| 
 | 
 | ||||||
| +h(2, "is_package") util.is_package | +h(3, "util.is_package") util.is_package | ||||||
|     +tag function |     +tag function | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Check if string maps to a package installed via pip. Mainly used to |     |  Check if string maps to a package installed via pip. Mainly used to | ||||||
|     |  validate #[+a("/docs/usage/models") model packages]. |     |  validate #[+a("/usage/models") model packages]. | ||||||
| 
 | 
 | ||||||
| +aside-code("Example"). | +aside-code("Example"). | ||||||
|     util.is_package('en_core_web_sm') # True |     util.is_package('en_core_web_sm') # True | ||||||
|  | @ -212,18 +210,18 @@ p | ||||||
|         +cell unicode |         +cell unicode | ||||||
|         +cell Name of package. |         +cell Name of package. | ||||||
| 
 | 
 | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell #[code bool] |         +cell #[code bool] | ||||||
|         +cell #[code True] if installed package, #[code False] if not. |         +cell #[code True] if installed package, #[code False] if not. | ||||||
| 
 | 
 | ||||||
| +h(2, "get_package_path") util.get_package_path | +h(3, "util.get_package_path") util.get_package_path | ||||||
|     +tag function |     +tag function | ||||||
|     +tag-new(2) |     +tag-new(2) | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Get path to an installed package. Mainly used to resolve the location of |     |  Get path to an installed package. Mainly used to resolve the location of | ||||||
|     |  #[+a("/docs/usage/models") model packages]. Currently imports the package |     |  #[+a("/usage/models") model packages]. Currently imports the package | ||||||
|     |  to find its path. |     |  to find its path. | ||||||
| 
 | 
 | ||||||
| +aside-code("Example"). | +aside-code("Example"). | ||||||
|  | @ -236,12 +234,12 @@ p | ||||||
|         +cell unicode |         +cell unicode | ||||||
|         +cell Name of installed package. |         +cell Name of installed package. | ||||||
| 
 | 
 | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell #[code Path] |         +cell #[code Path] | ||||||
|         +cell Path to model package directory. |         +cell Path to model package directory. | ||||||
| 
 | 
 | ||||||
| +h(2, "is_in_jupyter") util.is_in_jupyter | +h(3, "util.is_in_jupyter") util.is_in_jupyter | ||||||
|     +tag function |     +tag function | ||||||
|     +tag-new(2) |     +tag-new(2) | ||||||
| 
 | 
 | ||||||
|  | @ -257,17 +255,17 @@ p | ||||||
|         return display(HTML(html)) |         return display(HTML(html)) | ||||||
| 
 | 
 | ||||||
| +table(["Name", "Type", "Description"]) | +table(["Name", "Type", "Description"]) | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell bool |         +cell bool | ||||||
|         +cell #[code True] if in Jupyter, #[code False] if not. |         +cell #[code True] if in Jupyter, #[code False] if not. | ||||||
| 
 | 
 | ||||||
| +h(2, "update_exc") util.update_exc | +h(3, "util.update_exc") util.update_exc | ||||||
|     +tag function |     +tag function | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Update, validate and overwrite |     |  Update, validate and overwrite | ||||||
|     |  #[+a("/docs/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions]. |     |  #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions]. | ||||||
|     |  Used to combine global exceptions with custom, language-specific |     |  Used to combine global exceptions with custom, language-specific | ||||||
|     |  exceptions. Will raise an error if key doesn't match #[code ORTH] values. |     |  exceptions. Will raise an error if key doesn't match #[code ORTH] values. | ||||||
| 
 | 
 | ||||||
|  | @ -288,20 +286,20 @@ p | ||||||
|         +cell dicts |         +cell dicts | ||||||
|         +cell Exception dictionaries to add to the base exceptions, in order. |         +cell Exception dictionaries to add to the base exceptions, in order. | ||||||
| 
 | 
 | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell dict |         +cell dict | ||||||
|         +cell Combined tokenizer exceptions. |         +cell Combined tokenizer exceptions. | ||||||
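A hedged sketch of combining exception dicts, assuming the tokenizer-exception format from the adding-languages guide, where each key must equal the concatenated ORTH values of its token dicts:

    from spacy.symbols import ORTH, LEMMA
    from spacy.util import update_exc

    BASE_EXCEPTIONS = {"a.m.": [{ORTH: "a.m."}]}
    custom = {"gonna": [{ORTH: "gon", LEMMA: "go"}, {ORTH: "na", LEMMA: "to"}]}

    # base exceptions first, then any number of override dicts, applied in order
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, custom)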
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| +h(2, "prints") util.prints | +h(3, "util.prints") util.prints | ||||||
|     +tag function |     +tag function | ||||||
|     +tag-new(2) |     +tag-new(2) | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Print a formatted, text-wrapped message with optional title. If a text |     |  Print a formatted, text-wrapped message with optional title. If a text | ||||||
|     |  argument is a #[code Path], it's converted to a string. Should only |     |  argument is a #[code Path], it's converted to a string. Should only | ||||||
|     |  be used for interactive components like the #[+api("cli") cli]. |     |  be used for interactive components like the command-line interface. | ||||||
| 
 | 
 | ||||||
| +aside-code("Example"). | +aside-code("Example"). | ||||||
|     data_path = Path('/some/path') |     data_path = Path('/some/path') | ||||||
							
								
								
									
website/api/annotation.jade (new file, 131 lines)
							|  | @ -0,0 +1,131 @@ | ||||||
|  | //- 💫 DOCS > API > ANNOTATION SPECS | ||||||
|  | 
 | ||||||
|  | include ../_includes/_mixins | ||||||
|  | 
 | ||||||
|  | p This document describes the target annotations spaCy is trained to predict. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | +section("tokenization") | ||||||
|  |     +h(2, "tokenization") Tokenization | ||||||
|  | 
 | ||||||
|  |     p | ||||||
|  |         |  Tokenization standards are based on the | ||||||
|  |         |  #[+a("https://catalog.ldc.upenn.edu/LDC2013T19") OntoNotes 5] corpus. | ||||||
|  |         |  The tokenizer differs from most by including tokens for significant | ||||||
|  |         |  whitespace. Any sequence of whitespace characters beyond a single space | ||||||
|  |         |  (#[code ' ']) is included as a token. | ||||||
|  | 
 | ||||||
|  |     +aside-code("Example"). | ||||||
|  |         from spacy.lang.en import English | ||||||
|  |         nlp = English() | ||||||
|  |         tokens = nlp('Some\nspaces  and\ttab characters') | ||||||
|  |         tokens_text = [t.text for t in tokens] | ||||||
|  |         assert tokens_text == ['Some', '\n', 'spaces', ' ', 'and', | ||||||
|  |                             '\t', 'tab', 'characters'] | ||||||
|  | 
 | ||||||
|  |     p | ||||||
|  |         |  The whitespace tokens are useful for much the same reason punctuation is | ||||||
|  |         |  – they're often important delimiters in the text. By preserving them in the | ||||||
|  |         |  token output, we are able to maintain a simple alignment between the | ||||||
|  |         |  tokens and the original string, and we ensure that no information is | ||||||
|  |         |  lost during processing. | ||||||
|  | 
 | ||||||
|  | +section("sbd") | ||||||
|  |     +h(2, "sentence-boundary") Sentence boundary detection | ||||||
|  | 
 | ||||||
|  |     p | ||||||
|  |         |  Sentence boundaries are calculated from the syntactic parse tree, so | ||||||
|  |         |  features such as punctuation and capitalisation play an important but | ||||||
|  |         |  non-decisive role in determining the sentence boundaries. Usually this | ||||||
|  |         |  means that the sentence boundaries will at least coincide with clause | ||||||
|  |         |  boundaries, even given poorly punctuated text. | ||||||
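For instance, with a model whose parser is available, the computed boundaries are exposed as Doc.sents (the model name is a placeholder):

    import spacy

    nlp = spacy.load('en_core_web_sm')
    # the period after the first clause is left out on purpose: boundaries come
    # from the parse, not from punctuation alone
    doc = nlp(u'This is a sentence this is another one.')
    for sent in doc.sents:
        print(sent.text)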
|  | 
 | ||||||
|  | +section("pos-tagging") | ||||||
|  |     +h(2, "pos-tagging") Part-of-speech Tagging | ||||||
|  | 
 | ||||||
|  |     +aside("Tip: Understanding tags") | ||||||
|  |         |  You can also use #[code spacy.explain()] to get the description for the | ||||||
|  |         |  string representation of a tag. For example, | ||||||
|  |         |  #[code spacy.explain("RB")] will return "adverb". | ||||||
|  | 
 | ||||||
|  |     include _annotation/_pos-tags | ||||||
|  | 
 | ||||||
|  | +section("lemmatization") | ||||||
|  |     +h(2, "lemmatization") Lemmatization | ||||||
|  | 
 | ||||||
|  |     p A "lemma" is the uninflected form of a word. In English, this means: | ||||||
|  | 
 | ||||||
|  |     +list | ||||||
|  |         +item #[strong Adjectives]: The form like "happy", not "happier" or "happiest" | ||||||
|  |         +item #[strong Adverbs]: The form like "badly", not "worse" or "worst" | ||||||
|  |         +item #[strong Nouns]: The form like "dog", not "dogs"; like "child", not "children" | ||||||
|  |         +item #[strong Verbs]: The form like "write", not "writes", "writing", "wrote" or "written" | ||||||
|  | 
 | ||||||
|  |     p | ||||||
|  |         |  The lemmatization data is taken from | ||||||
|  |         |  #[+a("https://wordnet.princeton.edu") WordNet]. However, we also add a | ||||||
|  |         |  special case for pronouns: all pronouns are lemmatized to the special | ||||||
|  |         |  token #[code -PRON-]. | ||||||
|  | 
 | ||||||
|  |     +infobox("About spaCy's custom pronoun lemma") | ||||||
|  |         |  Unlike verbs and common nouns, there's no clear base form of a personal | ||||||
|  |         |  pronoun. Should the lemma of "me" be "I", or should we normalize person | ||||||
|  |         |  as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a | ||||||
|  |         |  novel symbol, #[code -PRON-], which is used as the lemma for | ||||||
|  |         |  all personal pronouns. | ||||||
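An illustrative check of the pronoun lemma (assumes an installed English v2.x model; the printed result is what the text above implies, not a guaranteed output):

    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'I saw her yesterday.')
    print([(t.text, t.lemma_) for t in doc if t.pos_ == 'PRON'])
    # expected along the lines of: [('I', '-PRON-'), ('her', '-PRON-')]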
|  | 
 | ||||||
|  | +section("dependency-parsing") | ||||||
|  |     +h(2, "dependency-parsing") Syntactic Dependency Parsing | ||||||
|  | 
 | ||||||
|  |     +aside("Tip: Understanding labels") | ||||||
|  |         |  You can also use #[code spacy.explain()] to get the description for the | ||||||
|  |         |  string representation of a label. For example, | ||||||
|  |         |  #[code spacy.explain("prt")] will return "particle". | ||||||
|  | 
 | ||||||
|  |     include _annotation/_dep-labels | ||||||
|  | 
 | ||||||
|  | +section("named-entities") | ||||||
|  |     +h(2, "named-entities") Named Entity Recognition | ||||||
|  | 
 | ||||||
|  |     +aside("Tip: Understanding entity types") | ||||||
|  |         |  You can also use #[code spacy.explain()] to get the description for the | ||||||
|  |         |  string representation of an entity label. For example, | ||||||
|  |         |  #[code spacy.explain("LANGUAGE")] will return "any named language". | ||||||
|  | 
 | ||||||
|  |     include _annotation/_named-entities | ||||||
|  | 
 | ||||||
|  |     +h(3, "biluo") BILUO Scheme | ||||||
|  | 
 | ||||||
|  |     include _annotation/_biluo | ||||||
|  | 
 | ||||||
|  | +section("training") | ||||||
|  |     +h(2, "json-input") JSON input format for training | ||||||
|  | 
 | ||||||
|  |     +under-construction | ||||||
|  | 
 | ||||||
|  |     p spaCy takes training data in the following format: | ||||||
|  | 
 | ||||||
|  |     +code("Example structure"). | ||||||
|  |         doc: { | ||||||
|  |             id: string, | ||||||
|  |             paragraphs: [{ | ||||||
|  |                 raw: string, | ||||||
|  |                 sents: [int], | ||||||
|  |                 tokens: [{ | ||||||
|  |                     start: int, | ||||||
|  |                     tag: string, | ||||||
|  |                     head: int, | ||||||
|  |                     dep: string | ||||||
|  |                 }], | ||||||
|  |                 ner: [{ | ||||||
|  |                     start: int, | ||||||
|  |                     end: int, | ||||||
|  |                     label: string | ||||||
|  |                 }], | ||||||
|  |                 brackets: [{ | ||||||
|  |                     start: int, | ||||||
|  |                     end: int, | ||||||
|  |                     label: string | ||||||
|  |                 }] | ||||||
|  |             }] | ||||||
|  |         } | ||||||
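A minimal concrete instance of the structure above, written as a Python dict purely for illustration; every value is invented and the head/offset conventions are assumptions, not something this document specifies:

    training_doc = {
        "id": "doc-0001",
        "paragraphs": [{
            "raw": "Apple is looking at buying a startup.",
            "sents": [0],
            "tokens": [
                {"start": 0, "tag": "NNP", "head": 2, "dep": "nsubj"},
                {"start": 6, "tag": "VBZ", "head": 1, "dep": "aux"},
            ],
            "ner": [
                {"start": 0, "end": 5, "label": "ORG"},
            ],
            "brackets": [],
        }]
    }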
|  | @ -1,6 +1,6 @@ | ||||||
| //- 💫 DOCS > API > BINDER | //- 💫 DOCS > API > BINDER | ||||||
| 
 | 
 | ||||||
| include ../../_includes/_mixins | include ../_includes/_mixins | ||||||
| 
 | 
 | ||||||
| p A container class for serializing collections of #[code Doc] objects. | p A container class for serializing collections of #[code Doc] objects. | ||||||
| 
 | 
 | ||||||
							
								
								
									
website/api/dependencyparser.jade (new file, 5 lines)
							|  | @ -0,0 +1,5 @@ | ||||||
|  | //- 💫 DOCS > API > DEPENDENCYPARSER | ||||||
|  | 
 | ||||||
|  | include ../_includes/_mixins | ||||||
|  | 
 | ||||||
|  | !=partial("pipe", { subclass: "DependencyParser", short: "parser", pipeline_id: "parser" }) | ||||||
Some files were not shown because too many files have changed in this diff.