WIP on vectors fixes

2026-01-06 08:49:21 +03:00 · 2017-10-31 11:22:56 +01:00 · 2017-10-31 11:22:56 +01:00 · 9c11ee4a1c
commit 9c11ee4a1c
parent 368fdb389a 5af6c8b746
34 changed files with 682 additions and 343 deletions
--- a/examples/training/vocab-data.jsonl
+++ b/examples/training/vocab-data.jsonl
@ -0,0 +1,21 @@
+{"lang": "en", "settings": {"oov_prob": -20.502029418945312}}
+{"orth": ".", "id": 1, "lower": ".", "norm": ".", "shape": ".", "prefix": ".", "suffix": ".", "length": 1, "cluster": "8", "prob": -3.0678977966308594, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": ",", "id": 2, "lower": ",", "norm": ",", "shape": ",", "prefix": ",", "suffix": ",", "length": 1, "cluster": "4", "prob": -3.4549596309661865, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "the", "id": 3, "lower": "the", "norm": "the", "shape": "xxx", "prefix": "t", "suffix": "the", "length": 3, "cluster": "11", "prob": -3.528766632080078, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "I", "id": 4, "lower": "i", "norm": "I", "shape": "X", "prefix": "I", "suffix": "I", "length": 1, "cluster": "346", "prob": -3.791565179824829, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": false, "is_title": true, "is_upper": true, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "to", "id": 5, "lower": "to", "norm": "to", "shape": "xx", "prefix": "t", "suffix": "to", "length": 2, "cluster": "12", "prob": -3.8560216426849365, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "a", "id": 6, "lower": "a", "norm": "a", "shape": "x", "prefix": "a", "suffix": "a", "length": 1, "cluster": "19", "prob": -3.92978835105896, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "and", "id": 7, "lower": "and", "norm": "and", "shape": "xxx", "prefix": "a", "suffix": "and", "length": 3, "cluster": "20", "prob": -4.113108158111572, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "of", "id": 8, "lower": "of", "norm": "of", "shape": "xx", "prefix": "o", "suffix": "of", "length": 2, "cluster": "28", "prob": -4.27587366104126, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "you", "id": 9, "lower": "you", "norm": "you", "shape": "xxx", "prefix": "y", "suffix": "you", "length": 3, "cluster": "602", "prob": -4.373791217803955, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "it", "id": 10, "lower": "it", "norm": "it", "shape": "xx", "prefix": "i", "suffix": "it", "length": 2, "cluster": "474", "prob": -4.388050079345703, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "is", "id": 11, "lower": "is", "norm": "is", "shape": "xx", "prefix": "i", "suffix": "is", "length": 2, "cluster": "762", "prob": -4.457748889923096, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "that", "id": 12, "lower": "that", "norm": "that", "shape": "xxxx", "prefix": "t", "suffix": "hat", "length": 4, "cluster": "84", "prob": -4.464504718780518, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "\n\n", "id": 0, "lower": "\n\n", "norm": "\n\n", "shape": "\n\n", "prefix": "\n", "suffix": "\n\n", "length": 2, "cluster": "0", "prob": -4.606560707092285, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": true, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "in", "id": 13, "lower": "in", "norm": "in", "shape": "xx", "prefix": "i", "suffix": "in", "length": 2, "cluster": "60", "prob": -4.619071960449219, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "'s", "id": 14, "lower": "'s", "norm": "'s", "shape": "'x", "prefix": "'", "suffix": "'s", "length": 2, "cluster": "52", "prob": -4.830559253692627, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "n't", "id": 15, "lower": "n't", "norm": "n't", "shape": "x'x", "prefix": "n", "suffix": "n't", "length": 3, "cluster": "74", "prob": -4.859938621520996, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "for", "id": 16, "lower": "for", "norm": "for", "shape": "xxx", "prefix": "f", "suffix": "for", "length": 3, "cluster": "508", "prob": -4.8801093101501465, "is_alpha": true, "is_ascii": true, "is_digit": false, "is_lower": true, "is_punct": false, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": "\"", "id": 17, "lower": "\"", "norm": "\"", "shape": "\"", "prefix": "\"", "suffix": "\"", "length": 1, "cluster": "0", "prob": -5.02677583694458, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": true, "is_left_punct": true, "is_right_punct": true}
+{"orth": "?", "id": 18, "lower": "?", "norm": "?", "shape": "?", "prefix": "?", "suffix": "?", "length": 1, "cluster": "0", "prob": -5.05924654006958, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": true, "is_space": false, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
+{"orth": " ", "id": 0, "lower": " ", "norm": " ", "shape": " ", "prefix": " ", "suffix": " ", "length": 1, "cluster": "0", "prob": -5.129165172576904, "is_alpha": false, "is_ascii": true, "is_digit": false, "is_lower": false, "is_punct": false, "is_space": true, "is_title": false, "is_upper": false, "like_url": false, "like_num": false, "like_email": false, "is_stop": false, "is_oov": false, "is_quote": false, "is_left_punct": false, "is_right_punct": false}
--- a/spacy/main.py
+++ b/spacy/main.py
@ -7,7 +7,7 @@ if __name__ == '__main__':
    import plac
    import sys
    from spacy.cli import download, link, info, package, train, convert, model
-    from spacy.cli import profile, evaluate, validate
+    from spacy.cli import vocab, profile, evaluate, validate
    from spacy.util import prints

    commands = {
@ -19,6 +19,7 @@ if __name__ == '__main__':
        'convert': convert,
        'package': package,
        'model': model,
+        'vocab': vocab,
        'profile': profile,
        'validate': validate
    }
--- a/spacy/cli/init.py
+++ b/spacy/cli/init.py
@ -7,4 +7,5 @@ from .train import train
 from .evaluate import evaluate
 from .convert import convert
 from .model import model
+from .vocab import make_vocab as vocab
 from .validate import validate
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@ -17,14 +17,14 @@ numpy.random.seed(0)


@plac.annotations(
-    model=("Model name or path", "positional", None, str),
-    data_path=("Location of JSON-formatted evaluation data", "positional",
+    model=("model name or path", "positional", None, str),
+    data_path=("location of JSON-formatted evaluation data", "positional",
               None, str),
-    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
-    gpu_id=("Use GPU", "option", "g", int),
-    displacy_path=("Directory to output rendered parses as HTML", "option",
+    gold_preproc=("use gold preprocessing", "flag", "G", bool),
+    gpu_id=("use GPU", "option", "g", int),
+    displacy_path=("directory to output rendered parses as HTML", "option",
                   "dp", str),
-    displacy_limit=("Limit of parses to render as HTML", "option", "dl", int))
+    displacy_limit=("limit of parses to render as HTML", "option", "dl", int))
 def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
             displacy_path=None, displacy_limit=25):
    """
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@ -16,10 +16,11 @@ from .. import about
    input_dir=("directory with model data", "positional", None, str),
    output_dir=("output parent directory", "positional", None, str),
    meta_path=("path to meta.json", "option", "m", str),
-    create_meta=("create meta.json, even if one exists in directory", "flag",
-                 "c", bool),
-    force=("force overwriting of existing folder in output directory", "flag",
-           "f", bool))
+    create_meta=("create meta.json, even if one exists in directory – if "
+                 "existing meta is found, entries are shown as defaults in "
+                 "the command line prompt", "flag", "c", bool),
+    force=("force overwriting of existing model directory in output directory",
+           "flag", "f", bool))
 def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False,
            force=False):
    """
@ -41,13 +42,13 @@ def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False,
    template_manifest = get_template('MANIFEST.in')
    template_init = get_template('xx_model_name/__init__.py')
    meta_path = meta_path or input_path / 'meta.json'
-    if not create_meta and meta_path.is_file():
-        prints(meta_path, title="Reading meta.json from file")
+    if meta_path.is_file():
        meta = util.read_json(meta_path)
-    else:
-        meta = generate_meta(input_dir)
+        if not create_meta:  # only print this if user doesn't want to overwrite
+            prints(meta_path, title="Loaded meta.json from file")
+        else:
+            meta = generate_meta(input_dir, meta)
    meta = validate_meta(meta, ['lang', 'name', 'version'])
-
    model_name = meta['lang'] + '_' + meta['name']
    model_name_v = model_name + '-' + meta['version']
    main_path = output_path / model_name_v
@ -82,18 +83,19 @@ def create_file(file_path, contents):
    file_path.open('w', encoding='utf-8').write(contents)


-def generate_meta(model_path):
-    meta = {}
-    settings = [('lang', 'Model language', 'en'),
-                ('name', 'Model name', 'model'),
-                ('version', 'Model version', '0.0.0'),
+def generate_meta(model_path, existing_meta):
+    meta = existing_meta or {}
+    settings = [('lang', 'Model language', meta.get('lang', 'en')),
+                ('name', 'Model name', meta.get('name', 'model')),
+                ('version', 'Model version', meta.get('version', '0.0.0')),
                ('spacy_version', 'Required spaCy version',
                 '>=%s,<3.0.0' % about.__version__),
-                ('description', 'Model description', False),
-                ('author', 'Author', False),
-                ('email', 'Author email', False),
-                ('url', 'Author website', False),
-                ('license', 'License', 'CC BY-NC 3.0')]
+                ('description', 'Model description',
+                  meta.get('description', False)),
+                ('author', 'Author', meta.get('author', False)),
+                ('email', 'Author email', meta.get('email', False)),
+                ('url', 'Author website', meta.get('url', False)),
+                ('license', 'License', meta.get('license', 'CC BY-SA 3.0'))]
    nlp = util.load_model_from_path(Path(model_path))
    meta['pipeline'] = nlp.pipe_names
    meta['vectors'] = {'width': nlp.vocab.vectors_length,
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -32,6 +32,7 @@ numpy.random.seed(0)
    n_sents=("number of sentences", "option", "ns", int),
    use_gpu=("Use GPU", "option", "g", int),
    vectors=("Model to load vectors from", "option", "v"),
+    vectors_limit=("Truncate to N vectors (requires -v)", "option", None, int),
    no_tagger=("Don't train tagger", "flag", "T", bool),
    no_parser=("Don't train parser", "flag", "P", bool),
    no_entities=("Don't train NER", "flag", "N", bool),
@ -40,9 +41,9 @@ numpy.random.seed(0)
    meta_path=("Optional path to meta.json. All relevant properties will be "
               "overwritten.", "option", "m", Path))
 def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
-          use_gpu=-1, vectors=None, no_tagger=False, no_parser=False,
-          no_entities=False, gold_preproc=False, version="0.0.0",
-          meta_path=None):
+          use_gpu=-1, vectors=None, vectors_limit=None, no_tagger=False,
+          no_parser=False, no_entities=False, gold_preproc=False,
+          version="0.0.0", meta_path=None):
    """
    Train a model. Expects data in spaCy's JSON format.
    """
@ -95,10 +96,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
    if vectors:
        util.load_model(vectors, vocab=nlp.vocab)
        if vectors_limit is not None:
-            remap = nlp.vocab.prune_vectors(vectors_limit)
-            print('remap', len(remap))
-            for key, (value, sim) in remap.items():
-                print(repr(key), repr(value), sim)
+            nlp.vocab.prune_vectors(vectors_limit)
    for name in pipeline:
        nlp.add_pipe(nlp.create_pipe(name), name=name)
    optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
--- a/spacy/cli/vocab.py
+++ b/spacy/cli/vocab.py
@ -0,0 +1,54 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import plac
+import json
+import spacy
+import numpy
+from pathlib import Path
+
+from ..util import prints, ensure_path
+
+
+@plac.annotations(
+    lang=("model language", "positional", None, str),
+    output_dir=("model output directory", "positional", None, Path),
+    lexemes_loc=("location of JSONL-formatted lexical data", "positional",
+                 None, Path),
+    vectors_loc=("optional: location of vectors data, as numpy .npz",
+                 "positional", None, str))
+def make_vocab(cmd, lang, output_dir, lexemes_loc, vectors_loc=None):
+    """Compile a vocabulary from a lexicon jsonl file and word vectors.
+
+    cmd: Not used in this function.
+    lang (unicode): Language code handed to `spacy.blank`.
+    output_dir (Path): Directory the result is saved to. It is created if it
+        does not exist yet (its parent has to exist).
+    lexemes_loc (Path): JSONL file with one record per line. A record that
+        has a 'settings' key updates `nlp.vocab.cfg`; every other record
+        describes one lexeme, looked up by its 'orth' value, and its 'id'
+        has to equal the rank of the lexeme after the attributes are set.
+    vectors_loc (unicode / None): Optional file that is read with
+        `numpy.load`. The loaded data is indexed by lexeme rank.
+        NOTE(review): the help text says .npz, but the loaded object is used
+        like a plain 2d array (`.shape[1]`, row indexing) -- confirm the
+        expected file format.
+    RETURNS: The object created by `spacy.blank(lang)`, after it was saved.
+    """
+    if not lexemes_loc.exists():
+        prints(lexemes_loc, title="Can't find lexical data", exits=1)
+    vectors_loc = ensure_path(vectors_loc)
+    nlp = spacy.blank(lang)
+    # Zero the rank of every entry the blank vocab already contains: the
+    # vectors step below skips entries whose rank is 0.
+    for word in nlp.vocab:
+        word.rank = 0
+    lex_added = 0
+    vec_added = 0
+    # The lexicon is JSON, so decode it as UTF-8 instead of relying on the
+    # platform's default encoding.
+    with lexemes_loc.open('r', encoding='utf8') as file_:
+        for line in file_:
+            if not line.strip():
+                continue
+            attrs = json.loads(line)
+            if 'settings' in attrs:
+                nlp.vocab.cfg.update(attrs['settings'])
+                continue
+            lex = nlp.vocab[attrs['orth']]
+            lex.set_attrs(**attrs)
+            assert lex.rank == attrs['id']
+            # Count lexeme records only; the settings record is not an entry.
+            lex_added += 1
+    if vectors_loc is not None:
+        # Read the data inside a `with` block so the file handle is closed
+        # again instead of being left to the garbage collector.
+        with open(vectors_loc, 'rb') as file_:
+            vector_data = numpy.load(file_)
+        nlp.vocab.clear_vectors(width=vector_data.shape[1])
+        for word in nlp.vocab:
+            if word.rank:
+                nlp.vocab.vectors.add(word.orth_, row=word.rank,
+                                      vector=vector_data[word.rank])
+                vec_added += 1
+    if not output_dir.exists():
+        output_dir.mkdir()
+    nlp.to_disk(output_dir)
+    prints("{} entries, {} vectors".format(lex_added, vec_added), output_dir,
+           title="Successfully compiled vocab and vectors, and saved model")
+    return nlp
--- a/spacy/glossary.py
+++ b/spacy/glossary.py
@ -300,5 +300,15 @@ GLOSSARY = {
    'MONEY':        'Monetary values, including unit',
    'QUANTITY':     'Measurements, as of weight or distance',
    'ORDINAL':      '"first", "second", etc.',
-    'CARDINAL':     'Numerals that do not fall under another type'
+    'CARDINAL':     'Numerals that do not fall under another type',
+
+
+    # Named Entity Recognition
+    # Wikipedia
+    # http://www.sciencedirect.com/science/article/pii/S0004370212000276
+    # https://pdfs.semanticscholar.org/5744/578cc243d92287f47448870bb426c66cc941.pdf
+
+    'PER':          'Named person or family.',
+    'MISC':         ('Miscellaneous entities, e.g. events, nationalities, '
+                     'products or works of art'),
 }
--- a/spacy/language.py
+++ b/spacy/language.py
@ -154,6 +154,8 @@ class Language(object):
        self._meta.setdefault('email', '')
        self._meta.setdefault('url', '')
        self._meta.setdefault('license', '')
+        self._meta['vectors'] = {'width': self.vocab.vectors_length,
+                                 'entries': len(self.vocab.vectors)}
        self._meta['pipeline'] = self.pipe_names
        return self._meta

--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@ -13,6 +13,8 @@ from .typedefs cimport attr_t, flags_t
 from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV
+from .attrs cimport PROB
+from .attrs import intify_attrs
 from . import about


@ -68,6 +70,19 @@ cdef class Lexeme:
    def __hash__(self):
        return self.c.orth

+    def set_attrs(self, **attrs):
+        """Set several attributes of the lexeme in one call.
+
+        **attrs: The attributes to set. The keyword arguments are passed
+            through `intify_attrs` before they are applied.
+        """
+        cdef attr_id_t attr
+        for attr, value in intify_attrs(attrs).items():
+            if attr == PROB:
+                # Written to the struct field directly, as given.
+                self.c.prob = value
+                continue
+            if attr == CLUSTER:
+                # Converted with int() first, so a numeric string works too.
+                self.c.cluster = int(value)
+                continue
+            if not isinstance(value, (int, long)):
+                # Anything that is not an integer goes through the string
+                # store, and the value returned by `add` is stored instead.
+                value = self.vocab.strings.add(value)
+            Lexeme.set_struct_attr(self.c, attr, value)
+
    def set_flag(self, attr_id_t flag_id, bint value):
        """Change the value of a boolean flag.

--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@ -209,7 +209,7 @@ def test_doc_api_right_edge(en_tokenizer):
 def test_doc_api_has_vector():
    vocab = Vocab()
    vocab.clear_vectors(2)
-    vocab.vectors.add('kitten', numpy.asarray([0., 2.], dtype='f'))
+    vocab.vectors.add('kitten', vector=numpy.asarray([0., 2.], dtype='f'))
    doc = Doc(vocab, words=['kitten'])
    assert doc.has_vector

--- a/spacy/tests/doc/test_token_api.py
+++ b/spacy/tests/doc/test_token_api.py
@ -73,8 +73,8 @@ def test_doc_token_api_is_properties(en_vocab):
 def test_doc_token_api_vectors():
    vocab = Vocab()
    vocab.clear_vectors(2)
-    vocab.vectors.add('apples', numpy.asarray([0., 2.], dtype='f'))
-    vocab.vectors.add('oranges', numpy.asarray([0., 1.], dtype='f'))
+    vocab.vectors.add('apples', vector=numpy.asarray([0., 2.], dtype='f'))
+    vocab.vectors.add('oranges', vector=numpy.asarray([0., 1.], dtype='f'))
    doc = Doc(vocab, words=['apples', 'oranges', 'oov'])
    assert doc.has_vector

--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@ -21,8 +21,10 @@ cdef class Vectors:
    Vectors data is kept in the vectors.data attribute, which should be an
    instance of numpy.ndarray (for CPU vectors) or cupy.ndarray
    (for GPU vectors). `vectors.key2row` is a dictionary mapping word hashes to
-    rows in the vectors.data table. The array `vectors.keys` keeps the keys in
-    order, such that `keys[vectors.key2row[key]] == key`.
+    rows in the vectors.data table.
+    
+    Multiple keys can be mapped to the same vector, so len(keys) may be greater
+    (but not smaller) than data.shape[0].
    """
    cdef public object data
    cdef readonly StringStore strings
@ -101,7 +103,7 @@ cdef class Vectors:

        RETURNS (int): The number of vectors in the data.
        """
-        return self.i
+        return self._i_vec

    def __contains__(self, key):
        """Check whether a key has a vector entry in the table.
@ -113,11 +115,13 @@ cdef class Vectors:
            key = self.strings[key]
        return key in self.key2row

-    def add(self, key, vector=None):
-        """Add a key to the table, optionally setting a vector value as well.
+    def add(self, key, *, vector=None, row=None):
+        """Add a key to the table. Keys can be mapped to an existing vector
+        by setting `row`, or a new vector can be added.

        key (unicode / int): The key to add.
-        vector (numpy.ndarray): An optional vector to add.
+        vector (numpy.ndarray / None): A vector to add for the key.
+        row (int / None): The row-number of a vector to map the key to.
        """
        if isinstance(key, basestring_):
            key = self.strings.add(key)
@ -131,8 +135,8 @@ cdef class Vectors:

        self.key2row[key] = row
        if vector is not None:
-            self.data[i] = vector
-        return i
+            self.data[row] = vector
+        return row

    def items(self):
        """Iterate over `(string key, vector)` pairs, in order.
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@ -32,6 +32,7 @@ cdef class Vocab:
    cdef readonly int length
    cdef public object data_dir
    cdef public object lex_attr_getters
+    cdef public object cfg

    cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -5,6 +5,7 @@ import numpy
 import dill

 from collections import OrderedDict
+from thinc.neural.util import get_array_module
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport Lexeme
 from .strings cimport hash_string
@ -27,7 +28,7 @@ cdef class Vocab:
    C-data that is shared between `Doc` objects.
    """
    def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
-                 strings=tuple(), **deprecated_kwargs):
+                 strings=tuple(), oov_prob=-20., **deprecated_kwargs):
        """Create the vocabulary.

        lex_attr_getters (dict): A dictionary mapping attribute IDs to
@ -43,6 +44,7 @@ cdef class Vocab:
        tag_map = tag_map if tag_map is not None else {}
        if lemmatizer in (None, True, False):
            lemmatizer = Lemmatizer({}, {}, {})
+        self.cfg = {'oov_prob': oov_prob}
        self.mem = Pool()
        self._by_hash = PreshMap()
        self._by_orth = PreshMap()
@ -239,7 +241,7 @@ cdef class Vocab:
    def vectors_length(self):
        return self.vectors.data.shape[1]

-    def clear_vectors(self, new_dim=None):
+    def clear_vectors(self, width=None):
        """Drop the current vector table. Because all vectors must be the same
        width, you have to call this to change the size of the vectors.
        """
@ -283,16 +285,14 @@ cdef class Vocab:
        keep = xp.ascontiguousarray(keep.T)
        neighbours = xp.zeros((toss.shape[0],), dtype='i')
        scores = xp.zeros((toss.shape[0],), dtype='f')
-        for i in range(0, toss.shape[0]//2, batch_size):
+        for i in range(0, toss.shape[0], batch_size):
            batch = toss[i : i+batch_size]
            batch /= xp.linalg.norm(batch, axis=1, keepdims=True)+1e-8
            sims = xp.dot(batch, keep)
            matches = sims.argmax(axis=1)
            neighbours[i:i+batch_size] = matches
            scores[i:i+batch_size] = sims.max(axis=1)
-        i2k = {i: key for key, i in self.vectors.key2row.items()}
-        remap = {}
-        for lex in list(self):
+        for lex in self:
            # If we're losing the vector for this word, map it to the nearest
            # vector we're keeping.
            if lex.rank >= nr_row:
--- a/website/_includes/_functions.jade
+++ b/website/_includes/_functions.jade
@ -41,9 +41,6 @@
 -           var comps = path.split('#');
 -           return "top-level#" + comps[0] + '.' + comps[1];
 -       }
-       else if (path.startsWith('cli#')) {
-           return "top-level#" + path.split('#')[1];
-       }
 -       return path;
 -   }

--- a/website/_includes/_mixins-base.jade
+++ b/website/_includes/_mixins-base.jade
@ -1,244 +0,0 @@
-//- 💫 MIXINS > BASE
-
-//- Section
-    id - [string] anchor assigned to section (used for breadcrumb navigation)
-
-mixin section(id)
-    section.o-section(id="section-" + id data-section=id)
-        block
-
-
-//- Aside wrapper
-    label - [string] aside label
-
-mixin aside-wrapper(label)
-    aside.c-aside
-        .c-aside__content(role="complementary")&attributes(attributes)
-            if label
-                h4.u-text-label.u-text-label--dark=label
-
-            block
-
-
-//- SVG from map (uses embedded SVG sprite)
-    name   - [string] SVG symbol id
-    width  - [integer] width in px
-    height - [integer] height in px (default: same as width)
-
-mixin svg(name, width, height)
-    svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes)
-        use(xlink:href="#svg_#{name}")
-
-
-//- Icon
-    name   - [string] icon name (will be used as symbol id: #svg_{name})
-    width  - [integer] icon width (default: 20)
-    height - [integer] icon height (defaults to width)
-
-mixin icon(name, width, height)
-    - var width = width || 20
-    - var height = height || width
-    +svg(name, width, height).o-icon(style="min-width: #{width}px")&attributes(attributes)
-
-
-//- Pro/Con/Neutral icon
-    icon - [string] "pro", "con" or "neutral" (default: "neutral")
-    size - [integer] icon size (optional)
-
-mixin procon(icon, label, show_label, size)
-    - var colors = { yes: "green", no: "red", neutral: "subtle" }
-    span.u-nowrap
-        +icon(icon, size || 20)(class="u-color-#{colors[icon] || 'subtle'}").o-icon--inline&attributes(attributes)
-        span.u-text-small(class=show_label ? null : "u-hidden")=(label || icon)
-
-//- Headlines Helper Mixin
-    level - [integer] 1, 2, 3, 4, or 5
-
-mixin headline(level)
-    if level == 1
-        h1.u-heading-1&attributes(attributes)
-            block
-
-    else if level == 2
-        h2.u-heading-2&attributes(attributes)
-            block
-
-    else if level == 3
-        h3.u-heading-3&attributes(attributes)
-            block
-
-    else if level == 4
-        h4.u-heading-4&attributes(attributes)
-            block
-
-    else if level == 5
-        h5.u-heading-5&attributes(attributes)
-            block
-
-
-//- Permalink rendering
-    id - [string] permalink ID used for link anchor
-
-mixin permalink(id)
-    if id
-        a.u-permalink(href="##{id}")
-            block
-
-    else
-        block
-
-
-//- Quickstart widget
-    quickstart.js with manual markup, inspired by PyTorch's "Getting started"
-    groups - [object] option groups, uses global variable QUICKSTART
-    headline - [string] optional text to be rendered as widget headline
-
-mixin quickstart(groups, headline, description, hide_results)
-    .c-quickstart.o-block-small#qs
-        .c-quickstart__content
-            if headline
-                +h(2)=headline
-            if description
-                p=description
-            for group in groups
-                .c-quickstart__group.u-text-small(data-qs-group=group.id)
-                    if group.title
-                        .c-quickstart__legend=group.title
-                            if group.help
-                                |  #[+help(group.help)]
-                    .c-quickstart__fields
-                        for option in group.options
-                            input.c-quickstart__input(class="c-quickstart__input--" + (group.input_style ? group.input_style : group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id="qs-#{option.id}" value=option.id checked=option.checked)
-                            label.c-quickstart__label.u-text-tiny(for="qs-#{option.id}")!=option.title
-                                if option.meta
-                                    |  #[span.c-quickstart__label__meta (#{option.meta})]
-                                if option.help
-                                    |  #[+help(option.help)]
-
-        if hide_results
-            block
-        else
-            pre.c-code-block
-                code.c-code-block__content.c-quickstart__code(data-qs-results="")
-                    block
-
-
-//- Quickstart code item
-    data  - [object] Rendering conditions (keyed by option group ID, value: option)
-    style - [string] modifier ID for line style
-
-mixin qs(data, style)
-    - args = {}
-    for value, setting in data
-        - args['data-qs-' + setting] = value
-    span.c-quickstart__line(class="c-quickstart__line--#{style || 'bash'}")&attributes(args)
-        block
-
-
-//- Terminal-style code window
-    label - [string] title displayed in top bar of terminal window
-
-mixin terminal(label)
-    .x-terminal
-        .x-terminal__icons: span
-        .u-padding-small.u-text-label.u-text-center=label
-
-        +code.x-terminal__code
-            block
-
-//- Chart.js
-    id - [string] chart ID, will be assigned as #chart_{id}
-
-mixin chart(id, height)
-    figure.o-block&attributes(attributes)
-        canvas(id="chart_#{id}" width="800" height=(height || "400") style="max-width: 100%")
-
-
-//- Gitter chat button and widget
-    button - [string] text shown on button
-    label  - [string] title of chat window (default: same as button)
-
-mixin gitter(button, label)
-    aside.js-gitter.c-chat.is-collapsed(data-title=(label || button))
-
-    button.js-gitter-button.c-chat__button.u-text-tag
-        +icon("chat", 16).o-icon--inline
-        !=button
-
-
-//- Badge
-    image - [string] path to badge image
-    url   - [string] badge link
-
-mixin badge(image, url)
-    +a(url).u-padding-small.u-hide-link&attributes(attributes)
-        img.o-badge(src=image alt=url height="20")
-
-
-//- spaCy logo
-
-mixin logo()
-    +svg("spacy", 675, 215).o-logo&attributes(attributes)
-
-
-//- Landing
-
-mixin landing-header()
-    header.c-landing
-        .c-landing__wrapper
-            .c-landing__content
-                block
-
-mixin landing-banner(headline, label)
-    .c-landing__banner.u-padding.o-block.u-color-light
-        +grid.c-landing__banner__content.o-no-block
-            +grid-col("third")
-                h3.u-heading.u-heading-1
-                    if label
-                        div
-                            span.u-text-label.u-text-label--light=label
-                    !=headline
-
-            +grid-col("two-thirds").c-landing__banner__text
-                block
-
-
-mixin landing-logos(title, logos)
-    .o-content.u-text-center&attributes(attributes)
-        h3.u-heading.u-text-label.u-color-dark=title
-
-        each row, i in logos
-            - var is_last = i == logos.length - 1
-            +grid("center").o-inline-list.o-no-block(class=is_last ? "o-no-block" : null)
-                each details, name in row
-                    +a(details[0]).u-padding-medium
-                        +icon(name, details[1], details[2])
-
-                if is_last
-                    block
-
-
-//- Under construction (temporary)
-    Marks sections that still need to be completed for the v2.0 release.
-
-mixin under-construction()
-    +infobox("Under construction", "🚧")
-        |  This section is still being written and will be updated for the v2.0
-        |  release. Is there anything that you think should definitely mentioned or
-        |  explained here? Any examples you'd like to see? #[strong Let us know]
-        |  on the #[+a(gh("spacy") + "/issues/1105") v2.0 alpha thread] on GitHub!
-
-
-//- Alpha infobox (temporary)
-    Added in the templates to notify user that they're visiting the alpha site.
-
-mixin alpha-info()
-    +infobox("You are viewing the spaCy v2.0.0 alpha docs", "⚠️")
-        strong This page is part of the alpha documentation for spaCy v2.0.
-        |  It does not reflect the state of the latest stable release.
-        |  Because v2.0 is still under development, the implementation
-        |  may differ from the intended state described here. See the
-        |  #[+a(gh("spaCy") + "/releases/tag/v2.0.0-alpha") release notes]
-        |  for details on how to install and test the new version. To
-        |  read the official docs for spaCy v1.x,
-        |  #[+a("https://spacy.io/docs") go here].
--- a/website/_includes/_mixins.jade
+++ b/website/_includes/_mixins.jade
@ -1,7 +1,39 @@
 //- 💫 INCLUDES > MIXINS

 include _functions
-include _mixins-base
+
+
+//- Section
+    id - [string] anchor assigned to section (used for breadcrumb navigation)
+
+mixin section(id)
+    section.o-section(id="section-" + id data-section=id)
+        block
+
+
+//- Headlines Helper Mixin
+    level - [integer] 1, 2, 3, 4, or 5
+
+mixin headline(level)
+    if level == 1
+        h1.u-heading-1&attributes(attributes)
+            block
+
+    else if level == 2
+        h2.u-heading-2&attributes(attributes)
+            block
+
+    else if level == 3
+        h3.u-heading-3&attributes(attributes)
+            block
+
+    else if level == 4
+        h4.u-heading-4&attributes(attributes)
+            block
+
+    else if level == 5
+        h5.u-heading-5&attributes(attributes)
+            block


 //- Headlines
@ -18,6 +50,18 @@ mixin h(level, id, source)
                span Source #[+icon("code", 14).o-icon--inline]


+//- Permalink rendering
+    id - [string] permalink ID used for link anchor
+
+mixin permalink(id)
+    if id
+        a.u-permalink(href="##{id}")
+            block
+
+    else
+        block
+
+
 //- External links
    url     - [string] link href
    trusted - [boolean] if not set / false, rel="noopener nofollow" is added
@ -63,6 +107,18 @@ mixin help(tooltip, icon_size)
        +icon("help_o", icon_size || 16).o-icon--inline


+//- Aside wrapper
+    label - [string] aside label
+
+mixin aside-wrapper(label)
+    aside.c-aside
+        .c-aside__content(role="complementary")&attributes(attributes)
+            if label
+                h4.u-text-label.u-text-label--dark=label
+
+            block
+
+
 //- Aside for text
    label - [string] aside title (optional)

@ -112,6 +168,37 @@ mixin infobox-logos(...logos)
                |  #[+icon(logo[0], logo[1], logo[2]).u-color-dark]


+//- SVG from map (uses embedded SVG sprite)
+    name   - [string] SVG symbol id
+    width  - [integer] width in px
+    height - [integer] height in px (default: same as width)
+
+mixin svg(name, width, height)
+    svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes)
+        use(xlink:href="#svg_#{name}")
+
+
+//- Icon
+    name   - [string] icon name (will be used as symbol id: #svg_{name})
+    width  - [integer] icon width (default: 20)
+    height - [integer] icon height (defaults to width)
+
+mixin icon(name, width, height)
+    - var width = width || 20
+    - var height = height || width
+    +svg(name, width, height).o-icon(style="min-width: #{width}px")&attributes(attributes)
+
+
+//- Pro/Con/Neutral icon
+    icon - [string] "yes", "no" or "neutral" (other values use the "subtle" color)
+    size - [integer] icon size (optional)
+
+mixin procon(icon, label, show_label, size)
+    - var colors = { yes: "green", no: "red", neutral: "subtle" }
+    span.u-nowrap
+        +icon(icon, size || 20)(class="u-color-#{colors[icon] || 'subtle'}").o-icon--inline&attributes(attributes)
+        span.u-text-small(class=show_label ? null : "u-hidden")=(label || icon)
+

 //- Link button
    url      - [string] link href
@ -238,6 +325,14 @@ mixin graphic(original)
                +button(original, false, "secondary", "small") View large graphic


+//- Chart.js
+    id - [string] chart ID, will be assigned as #chart_{id}
+
+mixin chart(id, height)
+    figure.o-block&attributes(attributes)
+        canvas(id="chart_#{id}" width="800" height=(height || "400") style="max-width: 100%")
+
+
 //- Labels

 mixin label()
@ -353,8 +448,8 @@ mixin grid(...style)
    width - [string] "quarter", "third", "half", "two-thirds", "three-quarters"
    see $grid in assets/css/_variables.sass

-mixin grid-col(width)
-    .o-grid__col(class="o-grid__col--#{width}")&attributes(attributes)
+mixin grid-col(...style)
+    .o-grid__col(class=prefixArgs(style, "o-grid__col"))&attributes(attributes)
        block


@ -445,3 +540,137 @@ mixin annotation-row(annots, style)
            else
                +cell=cell
        block
+
+
+//- spaCy logo
+
+mixin logo()
+    +svg("spacy", 675, 215).o-logo&attributes(attributes)
+
+
+//- Gitter chat button and widget
+    button - [string] text shown on button
+    label  - [string] title of chat window (default: same as button)
+
+mixin gitter(button, label)
+    aside.js-gitter.c-chat.is-collapsed(data-title=(label || button))
+
+    button.js-gitter-button.c-chat__button.u-text-tag
+        +icon("chat", 16).o-icon--inline
+        !=button
+
+
+//- Badge
+    image - [string] path to badge image
+    url   - [string] badge link
+
+mixin badge(image, url)
+    +a(url).u-padding-small.u-hide-link&attributes(attributes)
+        img.o-badge(src=image alt=url height="20")
+
+
+//- Quickstart widget
+    quickstart.js with manual markup, inspired by PyTorch's "Getting started"
+    groups - [object] option groups, uses global variable QUICKSTART
+    headline - [string] optional text to be rendered as widget headline
+
+mixin quickstart(groups, headline, description, hide_results)
+    .c-quickstart.o-block-small#qs
+        .c-quickstart__content
+            if headline
+                +h(2)=headline
+            if description
+                p=description
+            for group in groups
+                .c-quickstart__group.u-text-small(data-qs-group=group.id)
+                    if group.title
+                        .c-quickstart__legend=group.title
+                            if group.help
+                                |  #[+help(group.help)]
+                    .c-quickstart__fields
+                        for option in group.options
+                            input.c-quickstart__input(class="c-quickstart__input--" + (group.input_style ? group.input_style : group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id="qs-#{option.id}" value=option.id checked=option.checked)
+                            label.c-quickstart__label.u-text-tiny(for="qs-#{option.id}")!=option.title
+                                if option.meta
+                                    |  #[span.c-quickstart__label__meta (#{option.meta})]
+                                if option.help
+                                    |  #[+help(option.help)]
+
+        if hide_results
+            block
+        else
+            pre.c-code-block
+                code.c-code-block__content.c-quickstart__code(data-qs-results="")
+                    block
+
+
+//- Quickstart code item
+    data  - [object] Rendering conditions (keyed by option group ID, value: option)
+    style - [string] modifier ID for line style
+
+mixin qs(data, style)
+    - args = {}
+    for value, setting in data
+        - args['data-qs-' + setting] = value
+    span.c-quickstart__line(class="c-quickstart__line--#{style || 'bash'}")&attributes(args)
+        block
+
+
+//- Terminal-style code window
+    label - [string] title displayed in top bar of terminal window
+
+mixin terminal(label)
+    .x-terminal
+        .x-terminal__icons: span
+        .u-padding-small.u-text-label.u-text-center=label
+
+        +code.x-terminal__code
+            block
+
+
+//- Landing
+
+mixin landing-header()
+    header.c-landing
+        .c-landing__wrapper
+            .c-landing__content
+                block
+
+mixin landing-banner(headline, label)
+    .c-landing__banner.u-padding.o-block.u-color-light
+        +grid.c-landing__banner__content.o-no-block
+            +grid-col("third")
+                h3.u-heading.u-heading-1
+                    if label
+                        div
+                            span.u-text-label.u-text-label--light=label
+                    !=headline
+
+            +grid-col("two-thirds").c-landing__banner__text
+                block
+
+
+mixin landing-logos(title, logos)
+    .o-content.u-text-center&attributes(attributes)
+        h3.u-heading.u-text-label.u-color-dark=title
+
+        each row, i in logos
+            - var is_last = i == logos.length - 1
+            +grid("center").o-inline-list.o-no-block(class=is_last ? "o-no-block" : null)
+                each details, name in row
+                    +a(details[0]).u-padding-medium
+                        +icon(name, details[1], details[2])
+
+                if is_last
+                    block
+
+
+//- Under construction (temporary)
+    Marks sections that still need to be completed for the v2.0 release.
+
+mixin under-construction()
+    +infobox("Under construction", "🚧")
+        |  This section is still being written and will be updated for the v2.0
+        |  release. Is there anything that you think should definitely be mentioned or
+        |  explained here? Any examples you'd like to see? #[strong Let us know]
+        |  on the #[+a(gh("spacy") + "/issues/1105") v2.0 alpha thread] on GitHub!
--- a/website/_includes/_page-docs.jade
+++ b/website/_includes/_page-docs.jade
@ -25,9 +25,6 @@ main.o-main.o-main--sidebar.o-main--aside
                    +button(gh("spacy", source), false, "secondary", "small").u-nowrap
                        |  Source #[+icon("code", 14)]

-        //-if ALPHA
-        //-    +alpha-info
-
        if IS_MODELS
            include _page_models
        else
--- a/website/_includes/_svg.jade
+++ b/website/_includes/_svg.jade
@ -62,6 +62,9 @@ svg(style="position: absolute; visibility: hidden; width: 0; height: 0;" width="
        symbol#svg_explosion(viewBox="0 0 500 500")
            path(fill="currentColor" d="M111.7 74.9L91.2 93.1l9.1 10.2 17.8-15.8 7.4 8.4-17.8 15.8 10.1 11.4 20.6-18.2 7.7 8.7-30.4 26.9-41.9-47.3 30.3-26.9 7.6 8.6zM190.8 59.6L219 84.3l-14.4 4.5-20.4-18.2-6.4 26.6-14.4 4.5 8.9-36.4-26.9-24.1 14.3-4.5L179 54.2l5.7-25.2 14.3-4.5-8.2 35.1zM250.1 21.2l27.1 3.4c6.1.8 10.8 3.1 14 7.2 3.2 4.1 4.5 9.2 3.7 15.5-.8 6.3-3.2 11-7.4 14.1-4.1 3.1-9.2 4.3-15.3 3.5L258 63.2l-2.8 22.3-13-1.6 7.9-62.7zm11.5 13l-2.2 17.5 12.6 1.6c5.1.6 9.1-2 9.8-7.6.7-5.6-2.5-9.2-7.6-9.9l-12.6-1.6zM329.1 95.4l23.8 13.8-5.8 10L312 98.8l31.8-54.6 11.3 6.6-26 44.6zM440.5 145c-1.3 8.4-5.9 15.4-13.9 21.1s-16.2 7.7-24.6 6.1c-8.4-1.6-15.3-6.3-20.8-14.1-5.5-7.9-7.6-16-6.4-24.4 1.3-8.5 6-15.5 14-21.1 8-5.6 16.2-7.7 24.5-6 8.4 1.6 15.4 6.3 20.9 14.2 5.5 7.6 7.6 15.7 6.3 24.2zM412 119c-5.1-.8-10.3.6-15.6 4.4-5.2 3.7-8.4 8.1-9.4 13.2-1 5.2.2 10.1 3.5 14.8 3.4 4.8 7.5 7.5 12.7 8.2 5.2.8 10.4-.7 15.6-4.4 5.3-3.7 8.4-8.1 9.4-13.2 1.1-5.1-.1-9.9-3.4-14.7-3.4-4.8-7.6-7.6-12.8-8.3zM471.5 237.9c-2.8 4.8-7.1 7.6-13 8.7l-2.6-13.1c5.3-.9 8.1-5 7.2-11-.9-5.8-4.3-8.8-8.9-8.2-2.3.3-3.7 1.4-4.5 3.3-.7 1.9-1.4 5.2-1.7 10.1-.8 7.5-2.2 13.1-4.3 16.9-2.1 3.9-5.7 6.2-10.9 7-6.3.9-11.3-.5-15.2-4.4-3.9-3.8-6.3-9-7.3-15.7-1.1-7.4-.2-13.7 2.6-18.8 2.8-5.1 7.4-8.2 13.7-9.2l2.6 13c-5.6 1.1-8.7 6.6-7.7 13.4 1 6.6 3.9 9.5 8.6 8.8 4.4-.7 5.7-4.5 6.7-14.1.3-3.5.7-6.2 1.1-8.4.4-2.2 1.2-4.4 2.2-6.8 2.1-4.7 6-7.2 11.8-8.1 5.4-.8 10.3.4 14.5 3.7 4.2 3.3 6.9 8.5 8 15.6.9 6.9-.1 12.6-2.9 17.3zM408.6 293.5l2.4-12.9 62 11.7-2.4 12.9-62-11.7zM419.6 396.9c-8.3 2-16.5.3-24.8-5-8.2-5.3-13.2-12.1-14.9-20.5-1.6-8.4.1-16.6 5.3-24.6 5.2-8.1 11.9-13.1 20.2-15.1 8.4-1.9 16.6-.3 24.9 5 8.2 5.3 13.2 12.1 14.8 20.5 1.7 8.4 0 16.6-5.2 24.7-5.2 8-12 13-20.3 15zm13.4-36.3c-1.2-5.1-4.5-9.3-9.9-12.8s-10.6-4.7-15.8-3.7-9.3 4-12.4 8.9-4.1 9.8-2.8 14.8c1.2 5.1 4.5 9.3 9.9 12.8 5.5 3.5 10.7 4.8 15.8 3.7 5.1-.9 9.2-3.8 12.3-8.7s4.1-9.9 2.9-15zM303.6 416.5l9.6-5.4 43.3 20.4-19.2-34 11.4-6.4 31 55-9.6 5.4-43.4-20.5 19.2 
34.1-11.3 6.4-31-55zM238.2 468.8c-49 0-96.9-17.4-134.8-49-38.3-32-64-76.7-72.5-125.9-2-11.9-3.1-24-3.1-35.9 0-36.5 9.6-72.6 27.9-104.4 2.1-3.6 6.7-4.9 10.3-2.8 3.6 2.1 4.9 6.7 2.8 10.3-16.9 29.5-25.9 63.1-25.9 96.9 0 11.1 1 22.3 2.9 33.4 7.9 45.7 31.8 87.2 67.3 116.9 35.2 29.3 79.6 45.5 125.1 45.5 11.1 0 22.3-1 33.4-2.9 4.1-.7 8 2 8.7 6.1.7 4.1-2 8-6.1 8.7-11.9 2-24 3.1-36 3.1z")

+        symbol#svg_prodigy(viewBox="0 0 538.5 157.6")
+            path(fill="currentColor" d="M70.6 48.6c7 7.3 10.5 17.1 10.5 29.2S77.7 99.7 70.6 107c-6.9 7.3-15.9 11.1-27 11.1-9.4 0-16.8-2.7-21.7-8.2v44.8H0V39h20.7v8.1c4.8-6.4 12.4-9.6 22.9-9.6 11.1 0 20.1 3.7 27 11.1zM21.9 76v3.6c0 12.1 7.3 19.8 18.3 19.8 11.2 0 18.7-7.9 18.7-21.6s-7.5-21.6-18.7-21.6c-11 0-18.3 7.7-18.3 19.8zM133.8 59.4c-12.6 0-20.5 7-20.5 17.8v39.3h-22V39h21.1v8.8c4-6.4 11.2-9.6 21.3-9.6v21.2zM209.5 107.1c-7.6 7.3-17.5 11.1-29.5 11.1s-21.9-3.8-29.7-11.1c-7.6-7.5-11.5-17.2-11.5-29.2 0-12.1 3.9-21.9 11.5-29.2 7.8-7.3 17.7-11.1 29.7-11.1s21.9 3.8 29.5 11.1c7.8 7.3 11.7 17.1 11.7 29.2 0 11.9-3.9 21.7-11.7 29.2zM180 56.2c-5.7 0-10.3 1.9-13.8 5.8-3.5 3.8-5.2 9-5.2 15.7 0 6.7 1.8 12 5.2 15.7 3.4 3.8 8.1 5.7 13.8 5.7s10.3-1.9 13.8-5.7 5.2-9 5.2-15.7c0-6.8-1.8-12-5.2-15.7-3.5-3.8-8.1-5.8-13.8-5.8zM313 116.5h-20.5v-7.9c-4.4 5.5-12.7 9.6-23.1 9.6-10.9 0-19.9-3.8-27-11.1C235.5 99.7 232 90 232 77.8s3.5-21.9 10.3-29.2c7-7.3 16-11.1 27-11.1 9.7 0 17.1 2.7 21.9 8.2V0H313v116.5zm-58.8-38.7c0 13.6 7.5 21.4 18.7 21.4 10.9 0 18.3-7.3 18.3-19.8V76c0-12.2-7.3-19.8-18.3-19.8-11.2 0-18.7 8-18.7 21.6zM354.1 13.6c0 3.6-1.3 6.8-3.9 9.3-5 4.9-13.6 4.9-18.6 0-8.4-7.5-1.6-23.1 9.3-22.5 7.4 0 13.2 5.9 13.2 13.2zm-2.2 102.9H330V39h21.9v77.5zM425.1 47.1V39h20.5v80.4c0 11.2-3.6 20.1-10.6 26.8-7 6.7-16.6 10-28.5 10-23.4 0-36.9-11.4-39.9-29.8l21.7-.8c1 7.6 7.6 12 17.4 12 11.2 0 18.1-5.8 18.1-16.6v-11.1c-5.1 5.5-12.5 8.2-21.9 8.2-10.9 0-19.9-3.8-27-11.1-6.9-7.3-10.3-17.1-10.3-29.2s3.5-21.9 10.3-29.2c7-7.3 16-11.1 27-11.1 10.7 0 18.4 3.1 23.2 9.6zm-38.3 30.7c0 13.6 7.5 21.6 18.7 21.6 11 0 18.3-7.6 18.3-19.8V76c0-12.2-7.3-19.8-18.3-19.8-11.2 0-18.7 8-18.7 21.6zM488.8 154.8H465l19.8-45.1L454.5 39h24.1l17.8 46.2L514.2 39h24.3l-49.7 115.8z")
+

        //- Machine learning & NLP libraries

--- a/website/api/_annotation/_named-entities.jade
+++ b/website/api/_annotation/_named-entities.jade
@ -1,6 +1,11 @@
 //- 💫 DOCS > API > ANNOTATION > NAMED ENTITIES

-+table([ "Type", "Description" ])
+p
+    |  Models trained on the
+    |  #[+a("https://catalog.ldc.upenn.edu/ldc2013t19") OntoNotes 5] corpus
+    |  support the following entity types:
+
++table(["Type", "Description"])
    +row
        +cell #[code PERSON]
        +cell People, including fictional.
@ -45,9 +50,6 @@
        +cell #[code LANGUAGE]
        +cell Any named language.

-p The following values are also annotated in a style similar to names:
-
-+table([ "Type", "Description" ])
    +row
        +cell #[code DATE]
        +cell Absolute or relative dates or periods.
@ -75,3 +77,33 @@ p The following values are also annotated in a style similar to names:
    +row
        +cell #[code CARDINAL]
        +cell Numerals that do not fall under another type.
+
++h(4, "ner-wikipedia-scheme") Wikipedia scheme
+
+p
+    |  Models trained on the Wikipedia corpus
+    |  (#[+a("http://www.sciencedirect.com/science/article/pii/S0004370212000276") Nothman et al., 2013])
+    |  use a less fine-grained NER annotation scheme and recognise the
+    |  following entities:
+
++table(["Type", "Description"])
+    +row
+        +cell #[code PER]
+        +cell Named person or family.
+
+    +row
+        +cell #[code LOC]
+        +cell
+            |  Name of politically or geographically defined location (cities,
+            |  provinces, countries, international regions, bodies of water,
+            |  mountains).
+
+    +row
+        +cell #[code ORG]
+        +cell Named corporate, governmental, or other organizational entity.
+
+    +row
+        +cell #[code MISC]
+        +cell
+            |  Miscellaneous entities, e.g. events, nationalities, products or
+            |  works of art.
--- a/website/api/_annotation/_training.jade
+++ b/website/api/_annotation/_training.jade
@ -1,5 +1,7 @@
 //- 💫 DOCS > API > ANNOTATION > TRAINING

++h(3, "json-input") JSON input format for training
+
 p
    |  spaCy takes training data in JSON format. The built-in
    |  #[+api("cli#convert") #[code convert]] command helps you convert the
@ -46,3 +48,57 @@ p
    |  Treebank:

 +github("spacy", "examples/training/training-data.json", false, false, "json")
+
++h(3, "vocab-jsonl") Lexical data for vocabulary
+    +tag-new(2)
+
+p
+    |  To populate a model's vocabulary, you can use the
+    |  #[+api("cli#vocab") #[code spacy vocab]] command and load in a
+    |  #[+a("https://jsonlines.readthedocs.io/en/latest/") newline-delimited JSON]
+    |  (JSONL) file containing one lexical entry per line. The first line
+    |  defines the language and vocabulary settings. All other lines are
+    |  expected to be JSON objects describing an individual lexeme. The lexical
+    |  attributes will then be set as attributes on spaCy's
+    |  #[+api("lexeme#attributes") #[code Lexeme]] object. The #[code vocab]
+    |  command outputs a ready-to-use spaCy model with a #[code Vocab]
+    |  containing the lexical data.
+
++code("First line").
+    {"lang": "en", "settings": {"oov_prob": -20.502029418945312}}
+
++code("Entry structure").
+    {
+        "orth": string,
+        "id": int,
+        "lower": string,
+        "norm": string,
+        "shape": string,
+        "prefix": string,
+        "suffix": string,
+        "length": int,
+        "cluster": string,
+        "prob": float,
+        "is_alpha": bool,
+        "is_ascii": bool,
+        "is_digit": bool,
+        "is_lower": bool,
+        "is_punct": bool,
+        "is_space": bool,
+        "is_title": bool,
+        "is_upper": bool,
+        "like_url": bool,
+        "like_num": bool,
+        "like_email": bool,
+        "is_stop": bool,
+        "is_oov": bool,
+        "is_quote": bool,
+        "is_left_punct": bool,
+        "is_right_punct": bool
+    }
+
+p
+    |  Here's an example of the 20 most frequent lexemes in the English
+    |  training data:
+
++github("spacy", "examples/training/vocab-data.jsonl", false, false, "json")
--- a/website/api/_data.json
+++ b/website/api/_data.json
@ -3,8 +3,10 @@
        "Overview": {
            "Architecture": "./",
            "Annotation Specs": "annotation",
+            "Command Line": "cli",
            "Functions": "top-level"
        },
+
        "Containers": {
            "Doc": "doc",
            "Token": "token",
@ -45,14 +47,19 @@
        }
    },

+    "cli": {
+        "title": "Command Line Interface",
+        "teaser": "Download, train and package models, and debug spaCy.",
+        "source": "spacy/cli"
+    },
+
    "top-level": {
        "title": "Top-level Functions",
        "menu": {
            "spacy": "spacy",
            "displacy": "displacy",
            "Utility Functions": "util",
-            "Compatibility": "compat",
-            "Command Line": "cli"
+            "Compatibility": "compat"
        }
    },

@ -213,7 +220,7 @@
            "Lemmatization": "lemmatization",
            "Dependencies": "dependency-parsing",
            "Named Entities": "named-entities",
-            "Training Data": "training"
+            "Models & Training": "training"
        }
    }
 }
--- a/website/api/_top-level/_spacy.jade
+++ b/website/api/_top-level/_spacy.jade
@ -85,7 +85,9 @@ p
    +row
        +cell #[code name]
        +cell unicode
-        +cell ISO code of the language class to load.
+        +cell
+            |  #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code]
+            |  of the language class to load.

    +row
        +cell #[code disable]
--- a/website/api/annotation.jade
+++ b/website/api/annotation.jade
@ -99,6 +99,6 @@ p This document describes the target annotations spaCy is trained to predict.
    include _annotation/_biluo

 +section("training")
-    +h(2, "json-input") JSON input format for training
+    +h(2, "training") Models and training data

    include _annotation/_training
--- a/website/api/_top-level/_cli.jade
+++ b/website/api/_top-level/_cli.jade
@ -1,4 +1,6 @@
-//- 💫 DOCS > API > TOP-LEVEL > COMMAND LINE INTERFACE
+//- 💫 DOCS > API > COMMAND LINE INTERFACE
+
+include ../_includes/_mixins

 p
    |  As of v1.7.0, spaCy comes with new command line helpers to download and
@ -34,6 +36,13 @@ p
        +cell flag
        +cell Show help message and available arguments.

+    +row("foot")
+        +cell creates
+        +cell directory, symlink
+        +cell
+            |  The installed model package in your #[code site-packages]
+            |  directory and a shortcut link as a symlink in #[code spacy/data].
+
 +aside("Downloading best practices")
    |  The #[code download] command is mostly intended as a convenient,
    |  interactive wrapper – it performs compatibility checks and prints
@ -86,6 +95,13 @@ p
        +cell flag
        +cell Show help message and available arguments.

+    +row("foot")
+        +cell creates
+        +cell symlink
+        +cell
+            |  A shortcut link of the given name as a symlink in
+            |  #[code spacy/data].
+
 +h(3, "info") Info

 p
@ -113,6 +129,11 @@ p
        +cell flag
        +cell Show help message and available arguments.

+    +row("foot")
+        +cell prints
+        +cell #[code stdout]
+        +cell Information about your spaCy installation.
+
 +h(3, "validate") Validate
    +tag-new(2)

@ -129,6 +150,12 @@ p
 +code(false, "bash", "$").
    spacy validate

++table(["Argument", "Type", "Description"])
+    +row("foot")
+        +cell prints
+        +cell #[code stdout]
+        +cell Details about the compatibility of your installed models.
+
 +h(3, "convert") Convert

 p
@ -172,6 +199,11 @@ p
        +cell flag
        +cell Show help message and available arguments.

+    +row("foot")
+        +cell creates
+        +cell JSON
+        +cell Data in spaCy's #[+a("/api/annotation#json-input") JSON format].
+
 p The following converters are available:

 +table(["ID", "Description"])
@ -286,6 +318,11 @@ p
        +cell flag
        +cell Show help message and available arguments.

+    +row("foot")
+        +cell creates
+        +cell model, pickle
+        +cell A spaCy model on each epoch, and a final #[code .pickle] file.
+
 +h(4, "train-hyperparams") Environment variables for hyperparameters
    +tag-new(2)

@ -395,6 +432,50 @@ p
        +cell Gradient L2 norm constraint.
        +cell #[code 1.0]

++h(3, "vocab") Vocab
+    +tag-new(2)
+
+p
+    |  Compile a vocabulary from a
+    |  #[+a("/api/annotation#vocab-jsonl") lexicon JSONL] file and optional
+    |  word vectors. Will save out a valid spaCy model that you can load via
+    |  #[+api("spacy#load") #[code spacy.load]] or package using the
+    |  #[+api("cli#package") #[code package]] command.
+
++code(false, "bash", "$").
+    spacy vocab [lang] [output_dir] [lexemes_loc] [vectors_loc]
+
++table(["Argument", "Type", "Description"])
+    +row
+        +cell #[code lang]
+        +cell positional
+        +cell
+            |  Model language
+            |  #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code],
+            |  e.g. #[code en].
+
+    +row
+        +cell #[code output_dir]
+        +cell positional
+        +cell Model output directory. Will be created if it doesn't exist.
+
+    +row
+        +cell #[code lexemes_loc]
+        +cell positional
+        +cell
+            |  Location of lexical data in spaCy's
+            |  #[+a("/api/annotation#vocab-jsonl") JSONL format].
+
+    +row
+        +cell #[code vectors_loc]
+        +cell positional
+        +cell Optional location of vectors data as numpy #[code .npz] file.
+
+    +row("foot")
+        +cell creates
+        +cell model
+        +cell A spaCy model containing the vocab and vectors.
+
 +h(3, "evaluate") Evaluate
    +tag-new(2)

@ -447,22 +528,36 @@ p
        +cell flag
        +cell Use gold preprocessing.

+    +row("foot")
+        +cell prints / creates
+        +cell #[code stdout], HTML
+        +cell Training results and optional displaCy visualizations.
+

 +h(3, "package") Package

 p
    |  Generate a #[+a("/usage/training#models-generating") model Python package]
    |  from an existing model data directory. All data files are copied over.
-    |  If the path to a meta.json is supplied, or a meta.json is found in the
-    |  input directory, this file is used. Otherwise, the data can be entered
-    |  directly from the command line. The required file templates are downloaded
-    |  from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub] to make
+    |  If the path to a #[code meta.json] is supplied, or a #[code meta.json] is
+    |  found in the input directory, this file is used. Otherwise, the data can
+    |  be entered directly from the command line. The required file templates
+    |  are downloaded from
+    |  #[+src(gh("spacy-dev-resources", "templates/model")) GitHub] to make
    |  sure you're always using the latest versions. This means you need to be
-    |  connected to the internet to use this command.
+    |  connected to the internet to use this command. After packaging, you
+    |  can run #[code python setup.py sdist] from the newly created directory
+    |  to turn your model into an installable archive file.

 +code(false, "bash", "$", false, false, true).
    spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force]

++aside-code("Example", "bash").
+    spacy package /input /output
+    cd /output/en_model-0.0.0
+    python setup.py sdist
+    pip install dist/en_model-0.0.0.tar.gz
+
 +table(["Argument", "Type", "Description"])
    +row
        +cell #[code input_dir]
@ -477,15 +572,16 @@ p
    +row
        +cell #[code --meta-path], #[code -m]
        +cell option
-        +cell #[+tag-new(2)] Path to meta.json file (optional).
+        +cell #[+tag-new(2)] Path to #[code meta.json] file (optional).

    +row
        +cell #[code --create-meta], #[code -c]
        +cell flag
        +cell
-            |  #[+tag-new(2)] Create a meta.json file on the command line, even
-            |  if one already exists in the directory.
-
+            |  #[+tag-new(2)] Create a #[code meta.json] file on the command
+            |  line, even if one already exists in the directory. If an
+            |  existing file is found, its entries will be shown as the defaults
+            |  in the command line prompt.
    +row
        +cell #[code --force], #[code -f]
        +cell flag
@ -495,3 +591,8 @@ p
        +cell #[code --help], #[code -h]
        +cell flag
        +cell Show help message and available arguments.
+
+    +row("foot")
+        +cell creates
+        +cell directory
+        +cell A Python package containing the spaCy model.
--- a/website/api/top-level.jade
+++ b/website/api/top-level.jade
@ -18,7 +18,3 @@ include ../_includes/_mixins
 +section("compat")
    +h(2, "compat", "spacy/compaty.py") Compatibility functions
    include _top-level/_compat
-
-+section("cli", "spacy/cli")
-    +h(2, "cli") Command line
-    include _top-level/_cli
--- a/website/api/vocab.jade
+++ b/website/api/vocab.jade
@ -162,7 +162,7 @@ p
        +cell int
        +cell The integer ID by which the flag value can be checked.

-+h(2, "add_flag") Vocab.clear_vectors
++h(2, "clear_vectors") Vocab.clear_vectors
    +tag method
    +tag-new(2)

@ -181,7 +181,50 @@ p
            |  Number of dimensions of the new vectors. If #[code None], size
            |  is not changed.

-+h(2, "add_flag") Vocab.get_vector
++h(2, "prune_vectors") Vocab.prune_vectors
+    +tag method
+    +tag-new(2)
+
+p
+    |  Reduce the current vector table to #[code nr_row] unique entries. Words
+    |  mapped to the discarded vectors will be remapped to the closest vector
+    |  among those remaining. For example, suppose the original table had
+    |  vectors for the words:
+    |  #[code.u-break ['sat', 'cat', 'feline', 'reclined']]. If we prune the
+    |  vector table to two rows, we would discard the vectors for "feline"
+    |  and "reclined". These words would then be remapped to the closest
+    |  remaining vector – so "feline" would have the same vector as "cat",
+    |  and "reclined" would have the same vector as "sat". The similarities are
+    |  judged by cosine. The original vectors may be large, so the cosines are
+    |  calculated in minibatches, to reduce memory usage.
+
++aside-code("Example").
+    nlp.vocab.prune_vectors(10000)
+    assert len(nlp.vocab.vectors) &lt;= 10000
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code nr_row]
+        +cell int
+        +cell The number of rows to keep in the vector table.
+
+    +row
+        +cell #[code batch_size]
+        +cell int
+        +cell
+            |  Batch of vectors for calculating the similarities. Larger batch
+            |  sizes might be faster, while temporarily requiring more memory.
+
+    +row("foot")
+        +cell returns
+        +cell dict
+        +cell
+            |  A dictionary keyed by removed words mapped to
+            |  #[code (string, score)] tuples, where #[code string] is the entry
+            |  the removed word was mapped to, and #[code score] the similarity
+            |  score between the two words.
+
++h(2, "get_vector") Vocab.get_vector
    +tag method
    +tag-new(2)

@ -206,7 +249,7 @@ p
            |  A word vector. Size and shape are determined by the
            |  #[code Vocab.vectors] instance.

-+h(2, "add_flag") Vocab.set_vector
++h(2, "set_vector") Vocab.set_vector
    +tag method
    +tag-new(2)

@ -228,7 +271,7 @@ p
        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
        +cell The vector to set.

-+h(2, "add_flag") Vocab.has_vector
++h(2, "has_vector") Vocab.has_vector
    +tag method
    +tag-new(2)

--- a/website/assets/css/_base/_grid.sass
+++ b/website/assets/css/_base/_grid.sass
@ -48,6 +48,9 @@
        flex: 0 0 100%
        flex-flow: column wrap

+    &.o-grid__col--no-gutter
+        margin-top: 0
+
    // Fix overflow issue in old browsers

    & > *
--- a/website/assets/css/_components/_navigation.sass
+++ b/website/assets/css/_components/_navigation.sass
@ -8,7 +8,7 @@
    align-items: center
    display: flex
    justify-content: space-between
-    flex-flow: row wrap
+    flex-flow: row nowrap
    padding: 0 2rem 0 1rem
    z-index: 30
    width: 100%
--- a/website/assets/css/_components/_tables.sass
+++ b/website/assets/css/_components/_tables.sass
@ -51,6 +51,7 @@
        @include scroll-shadow-base($color-front)
        display: inline-block
        overflow-x: auto
+        overflow-y: hidden
        width: auto
        -webkit-overflow-scrolling: touch

--- a/website/usage/_install/_changelog.jade
+++ b/website/usage/_install/_changelog.jade
@ -3,7 +3,7 @@
 +h(2, "changelog") Changelog
    +button(gh("spacy") + "/releases", false, "secondary", "small").u-float-right.u-nowrap View releases

-div(data-tpl="changelog" data-tpl-key="error")
+div(data-tpl="changelog" data-tpl-key="error" style="display: none")
    +infobox
        |  Unable to load changelog from GitHub. Please see the
        |  #[+a(gh("spacy") + "/releases") releases page] instead.
--- a/website/usage/_training/_basics.jade
+++ b/website/usage/_training/_basics.jade
@ -76,6 +76,16 @@ p
        ("Google rebrands its business apps", [(0, 6, "ORG")]),
        ("look what i found on google! 😂", [(21, 27, "PRODUCT")])]

++infobox("Tip: Try the Prodigy annotation tool")
+    +infobox-logos(["prodigy", 100, 29, "https://prodi.gy"])
+    |  If you need to label a lot of data, check out
+    |  #[+a("https://prodi.gy", true) Prodigy], a new, active learning-powered
+    |  annotation tool we've developed. Prodigy is fast and extensible, and
+    |  comes with a modern #[strong web application] that helps you collect
+    |  training data faster. It integrates seamlessly with spaCy, pre-selects
+    |  the #[strong most relevant examples] for annotation, and lets you
+    |  train and evaluate ready-to-use spaCy models.
+
 +h(3, "annotations") Training with annotations

 p
@ -180,9 +190,10 @@ p
        +cell #[code optimizer]
        +cell Callable to update the model's weights.

-+infobox
-    |  For the #[strong full example and more details], see the usage guide on
-    |  #[+a("/usage/training#ner") training the named entity recognizer],
-    |  or the runnable
-    |  #[+src(gh("spaCy", "examples/training/train_ner.py")) training script]
-    |  on GitHub.
+p
+    |  Instead of writing your own training loop, you can also use the
+    |  built-in #[+api("cli#train") #[code train]] command, which expects data
+    |  in spaCy's #[+a("/api/annotation#json-input") JSON format]. On each epoch,
+    |  a model will be saved out to the directory. After training, you can
+    |  use the #[+api("cli#package") #[code package]] command to generate an
+    |  installable Python package from your model.
--- a/website/usage/_training/_tagger-parser.jade
+++ b/website/usage/_training/_tagger-parser.jade
@ -190,7 +190,3 @@ p

    +item
        |  #[strong Test] the model to make sure the parser works as expected.
-
-+h(3, "training-json") JSON format for training
-
-include ../../api/_annotation/_training