Merge branch 'develop' of https://github.com/explosion/spaCy into develop
@@ -1,10 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

-import importlib
-
-from .compat import basestring_
-from .cli.info import info
+from .cli.info import info as cli_info
 from .glossary import explain
 from .deprecated import resolve_load_name
 from . import util

@@ -12,11 +9,8 @@ from . import util

 def load(name, **overrides):
     name = resolve_load_name(name, **overrides)
-    model_path = util.resolve_model_path(name)
-    meta = util.parse_package_meta(model_path)
-    if 'lang' not in meta:
-        raise IOError('No language setting found in model meta.')
-    cls = util.get_lang_class(meta['lang'])
-    overrides['meta'] = meta
-    overrides['path'] = model_path
-    return cls(**overrides)
+    return util.load_model(name)
+
+
+def info(model=None, markdown=False):
+    return cli_info(None, model, markdown)
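With this change, spacy.load() only resolves deprecated arguments and hands off to util.load_model(), which returns an initialised Language object rather than a path. A minimal usage sketch, assuming an 'en' shortcut link or package is installed (the same call the docs below use):

    import spacy

    nlp = spacy.load('en')               # shortcut link, package name or path
    doc = nlp(u'This is a sentence.')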
@@ -19,6 +19,8 @@ import numpy

 def _init_for_precomputed(W, ops):
+    if (W**2).sum() != 0.:
+        return
     reshaped = W.reshape((W.shape[1], W.shape[0] * W.shape[2]))
     ops.xavier_uniform_init(reshaped)
     W[:] = reshaped.reshape(W.shape)

@@ -247,6 +249,7 @@ def doc2feats(cols=None):
     model.cols = cols
     return model


 def print_shape(prefix):
     def forward(X, drop=0.):
         return X, lambda dX, **kwargs: dX
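The new guard makes _init_for_precomputed idempotent: if (W**2).sum() is non-zero, the weights have already been initialised or trained and are left alone. A rough numpy equivalent of the reshape round-trip, with a hypothetical (nF, nO, nI) shape and a uniform draw standing in for ops.xavier_uniform_init:

    import numpy

    W = numpy.zeros((10, 64, 128))       # hypothetical (nF, nO, nI) weights
    if (W ** 2).sum() == 0.:             # only initialise untouched weights
        reshaped = W.reshape((W.shape[1], W.shape[0] * W.shape[2]))
        reshaped[:] = numpy.random.uniform(-0.1, 0.1, reshaped.shape)
        W[:] = reshaped.reshape(W.shape)  # write back in the original shape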
@@ -24,8 +24,9 @@ CONVERTERS = {
     n_sents=("Number of sentences per doc", "option", "n", float),
     morphology=("Enable appending morphology to tags", "flag", "m", bool)
 )
-def convert(_, input_file, output_dir, n_sents, morphology):
-    """Convert files into JSON format for use with train command and other
+def convert(cmd, input_file, output_dir, n_sents, morphology):
+    """
+    Convert files into JSON format for use with train command and other
     experiment management functions.
     """
     input_path = Path(input_file)

@@ -39,4 +40,4 @@ def convert(cmd, input_file, output_dir, n_sents, morphology):
         prints("Can't find converter for %s" % input_path.parts[-1],
                title="Unknown format", exits=1)
     CONVERTERS[file_ext](input_path, output_path,
-                         n_sents=n_sents, morphology=morphology)
+                         n_sents=n_sents, use_morphology=morphology)
@@ -17,8 +17,9 @@ from .. import about
     direct=("force direct download. Needs model name with version and won't "
             "perform compatibility check", "flag", "d", bool)
 )
-def download(model, direct=False):
-    """Download compatible model from default download path using pip. Model
+def download(cmd, model, direct=False):
+    """
+    Download compatible model from default download path using pip. Model
     can be shortcut, model name or, if --direct flag is set, full model name
     with version.
     """

@@ -31,7 +32,7 @@ def download(cmd, model, direct=False):
     version = get_version(model_name, compatibility)
     download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
     try:
-        link(model_name, model, force=True)
+        link(None, model_name, model, force=True)
     except:
         # Dirty, but since spacy.download and the auto-linking is mostly
        # a convenience wrapper, it's best to show a success message and
@@ -14,14 +14,20 @@ from .. import util
     model=("optional: shortcut link of model", "positional", None, str),
     markdown=("generate Markdown for GitHub issues", "flag", "md", str)
 )
-def info(model=None, markdown=False):
+def info(cmd, model=None, markdown=False):
     """Print info about spaCy installation. If a model shortcut link is
     specified as an argument, print model information. Flag --markdown
     prints details in Markdown for easy copy-pasting to GitHub issues.
     """
     if model:
-        model_path = util.resolve_model_path(model)
-        meta = util.parse_package_meta(model_path)
+        if util.is_package(model):
+            model_path = util.get_package_path(model)
+        else:
+            model_path = util.get_data_path() / model
+        meta_path = model_path / 'meta.json'
+        if not meta_path.is_file():
+            prints(meta_path, title="Can't find model meta.json", exits=1)
+        meta = read_json(meta_path)
         if model_path.resolve() != model_path:
             meta['link'] = path2str(model_path)
             meta['source'] = path2str(model_path.resolve())
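The resolve() comparison above is how info distinguishes a shortcut link from real model data: for a symlink in spacy/data, the resolved path differs from the path itself, so both ends are recorded. A small standalone sketch of that check, with a hypothetical install location:

    from pathlib import Path

    meta = {}
    model_path = Path('/usr/lib/python3.6/site-packages/spacy/data/en')  # hypothetical link
    if model_path.resolve() != model_path:           # path is a symlink
        meta['link'] = str(model_path)               # the link itself
        meta['source'] = str(model_path.resolve())   # the package it points to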
@@ -14,13 +14,14 @@ from .. import util
     link_name=("name of shortcut link to create", "positional", None, str),
     force=("force overwriting of existing link", "flag", "f", bool)
 )
-def link(origin, link_name, force=False):
-    """Create a symlink for models within the spacy/data directory. Accepts
+def link(cmd, origin, link_name, force=False):
+    """
+    Create a symlink for models within the spacy/data directory. Accepts
     either the name of a pip package, or the local path to the model data
     directory. Linking models allows loading them via spacy.load(link_name).
     """
     if util.is_package(origin):
-        model_path = util.get_model_package_path(origin)
+        model_path = util.get_package_path(model)
     else:
         model_path = Path(origin)
     if not model_path.exists():
@@ -18,8 +18,9 @@ from .. import about
     meta=("path to meta.json", "option", "m", str),
     force=("force overwriting of existing folder in output directory", "flag", "f", bool)
 )
-def package(input_dir, output_dir, meta, force):
-    """Generate Python package for model data, including meta and required
+def package(cmd, input_dir, output_dir, meta=None, force=False):
+    """
+    Generate Python package for model data, including meta and required
     installation files. A new directory will be created in the specified
     output directory, and model data will be copied over.
     """

@@ -42,7 +43,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False):
         meta = util.read_json(meta_path)
     else:
         meta = generate_meta()
-    validate_meta(meta, ['lang', 'name', 'version'])
+    meta = validate_meta(meta, ['lang', 'name', 'version'])

     model_name = meta['lang'] + '_' + meta['name']
     model_name_v = model_name + '-' + meta['version']

@@ -85,20 +86,32 @@ def generate_meta():
         ('email', 'Author email', False),
         ('url', 'Author website', False),
         ('license', 'License', 'CC BY-NC 3.0')]

     prints("Enter the package settings for your model.", title="Generating meta.json")
     meta = {}
     for setting, desc, default in settings:
         response = util.get_raw_input(desc, default)
         meta[setting] = default if response == '' and default else response
+    meta['pipeline'] = generate_pipeline()
     return meta


+def generate_pipeline():
+    prints("If set to 'True', the default pipeline is used. If set to 'False', "
+           "the pipeline will be disabled. Components should be specified as a "
+           "comma-separated list of component names, e.g. vectorizer, tagger, "
+           "parser, ner. For more information, see the docs on processing pipelines.",
+           title="Enter your model's pipeline components")
+    pipeline = util.get_raw_input("Pipeline components", True)
+    replace = {'True': True, 'False': False}
+    return replace[pipeline] if pipeline in replace else pipeline.split(', ')
+
+
 def validate_meta(meta, keys):
     for key in keys:
         if key not in meta or meta[key] == '':
             prints("This setting is required to build your package.",
                    title='No "%s" setting found in meta.json' % key, exits=1)
+    return meta


 def get_template(filepath):
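generate_pipeline() accepts either a boolean-like answer or a comma-separated component list; anything that isn't the literal string 'True' or 'False' is split on ', '. The mapping, illustrated on the three kinds of input:

    replace = {'True': True, 'False': False}
    for response in ('True', 'False', 'vectorizer, tagger, parser'):
        value = replace[response] if response in replace else response.split(', ')
        print(value)
    # True
    # False
    # ['vectorizer', 'tagger', 'parser']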
@@ -32,9 +32,11 @@ from .. import displacy
     no_parser=("Don't train parser", "flag", "P", bool),
     no_entities=("Don't train NER", "flag", "N", bool)
 )
-def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
+def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
           use_gpu=False, no_tagger=False, no_parser=False, no_entities=False):
-    """Train a model. Expects data in spaCy's JSON format."""
+    """
+    Train a model. Expects data in spaCy's JSON format.
+    """
     n_sents = n_sents or None
     output_path = util.ensure_path(output_dir)
     train_path = util.ensure_path(train_data)

@@ -84,11 +86,11 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                 pbar.update(len(docs))

             with nlp.use_params(optimizer.averages):
-                scorer = nlp.evaluate(corpus.dev_docs(nlp, gold_preproc=False))
                 with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
                     dill.dump(nlp, file_, -1)
-
+            with (output_path / ('model%d.pickle' % i)).open('rb') as file_:
+                nlp_loaded = dill.load(file_)
+            scorer = nlp_loaded.evaluate(corpus.dev_docs(nlp_loaded, gold_preproc=False))
             print_progress(i, losses, scorer.scores)
     finally:
         print("Saving model...")
@@ -1,8 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS
-from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
+from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS


 _currency = r"\$|¢|£|€|¥|฿|৳"

@@ -10,16 +10,16 @@ _quotes = QUOTES.replace("'", '')
 _list_punct = LIST_PUNCT + '। ॥'.strip().split()


-_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES)
+_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)

-_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES +
+_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
             [r'(?<=[0-9])\+',
              r'(?<=°[FfCcKk])\.',
              r'(?<=[0-9])(?:{})'.format(_currency),
              r'(?<=[0-9])(?:{})'.format(UNITS),
              r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)])

-_infixes = (LIST_ELLIPSES +
+_infixes = (LIST_ELLIPSES + LIST_ICONS +
            [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
             r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
@@ -20,7 +20,6 @@ _upper = [_latin_upper]
 _lower = [_latin_lower]
 _uncased = [_bengali, _hebrew]

-
 ALPHA = merge_char_classes(_upper + _lower + _uncased)
 ALPHA_LOWER = merge_char_classes(_lower + _uncased)
 ALPHA_UPPER = merge_char_classes(_upper + _uncased)

@@ -33,13 +32,14 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
 _punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
 _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «'
 _hyphens = '- – — -- ---'
+_other_symbols = r'[\p{So}]'

 UNITS = merge_chars(_units)
 CURRENCY = merge_chars(_currency)
 QUOTES = merge_chars(_quotes)
 PUNCT = merge_chars(_punct)
 HYPHENS = merge_chars(_hyphens)
+ICONS = _other_symbols

 LIST_UNITS = split_chars(_units)
 LIST_CURRENCY = split_chars(_currency)

@@ -47,3 +47,4 @@ LIST_QUOTES = split_chars(_quotes)
 LIST_PUNCT = split_chars(_punct)
 LIST_HYPHENS = split_chars(_hyphens)
 LIST_ELLIPSES = [r'\.\.+', '…']
+LIST_ICONS = [_other_symbols]
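The new class reuses the regex Unicode category \p{So} (Symbol, other), which covers emoji and pictographs. ICONS keeps it as a single ready-made pattern, while LIST_ICONS wraps it in a list so it can be concatenated with the other LIST_* groups. The merge/split helpers presumably behave along these lines (a sketch, not the actual implementations):

    _hyphens = '- – — -- ---'
    LIST_HYPHENS = _hyphens.split(' ')      # split_chars: one pattern per symbol
    HYPHENS = _hyphens.replace(' ', '|')    # merge_chars: one alternation pattern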
@@ -35,4 +35,4 @@ class English(Language):
     Defaults = EnglishDefaults


-__all__ = ['English', 'EnglishDefaults']
+__all__ = ['English']
@@ -2,15 +2,16 @@
 from __future__ import unicode_literals

 from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
-from .char_classes import CURRENCY, UNITS
+from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
+from .char_classes import QUOTES, CURRENCY, UNITS


 _prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
-             LIST_CURRENCY)
+             LIST_CURRENCY + LIST_ICONS)


-_suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
+_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
+             ["'s", "'S", "’s", "’S"] +
             [r'(?<=[0-9])\+',
              r'(?<=°[FfCcKk])\.',
              r'(?<=[0-9])(?:{})'.format(CURRENCY),

@@ -19,7 +20,7 @@ _suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QU
              r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)])


-_infixes = (LIST_ELLIPSES +
+_infixes = (LIST_ELLIPSES + LIST_ICONS +
            [r'(?<=[0-9])[+\-\*^](?=[0-9-])',
             r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
spacy/lang/xx/__init__.py (new file, 26 lines)

@@ -0,0 +1,26 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...language import Language
+from ...attrs import LANG
+from ...util import update_exc
+
+
+class MultiLanguageDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'xx'
+
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+
+
+class MultiLanguage(Language):
+    """Language class to be used for models that support multiple languages.
+    This module allows models to specify their language ID as 'xx'.
+    """
+    lang = 'xx'
+    Defaults = MultiLanguageDefaults
+
+
+__all__ = ['MultiLanguage']
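The new multi-language class can be used directly or resolved lazily from its language ID, mirroring the usage examples added to the docs further down:

    from spacy.lang.xx import MultiLanguage
    nlp = MultiLanguage()

    # or via the language ID, for lazy-loading:
    from spacy.util import get_lang_class
    nlp = get_lang_class('xx')()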
@@ -337,6 +337,9 @@ cdef class NeuralDependencyParser(NeuralParser):
     name = 'parser'
     TransitionSystem = ArcEager

+    def __reduce__(self):
+        return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None)
+

 cdef class NeuralEntityRecognizer(NeuralParser):
     name = 'entity'

@@ -344,6 +347,10 @@ cdef class NeuralEntityRecognizer(NeuralParser):

     nr_feature = 6

+    def __reduce__(self):
+        return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)
+
+
 cdef class BeamDependencyParser(BeamParser):
     TransitionSystem = ArcEager
@@ -335,17 +335,18 @@ cdef cppclass StateC:
             this._break = this._b_i

     void clone(const StateC* src) nogil:
+        this.length = src.length
         memcpy(this._sent, src._sent, this.length * sizeof(TokenC))
         memcpy(this._stack, src._stack, this.length * sizeof(int))
         memcpy(this._buffer, src._buffer, this.length * sizeof(int))
         memcpy(this._ents, src._ents, this.length * sizeof(Entity))
         memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0]))
         this.length = src.length
         this._b_i = src._b_i
         this._s_i = src._s_i
         this._e_i = src._e_i
         this._break = src._break
         this.offset = src.offset
         this._empty_token = src._empty_token

     void fast_forward() nogil:
         # space token attachment policy:
@@ -9,6 +9,7 @@ import ctypes
 from libc.stdint cimport uint32_t
 from libc.string cimport memcpy
 from cymem.cymem cimport Pool
+from collections import OrderedDict

 from .stateclass cimport StateClass
 from ._state cimport StateC, is_space_token

@@ -312,12 +313,13 @@ cdef class ArcEager(TransitionSystem):
     @classmethod
     def get_actions(cls, **kwargs):
         actions = kwargs.get('actions',
-            {
-                SHIFT: [''],
-                REDUCE: [''],
-                RIGHT: [],
-                LEFT: [],
-                BREAK: ['ROOT']})
+            OrderedDict((
+                (SHIFT, ['']),
+                (REDUCE, ['']),
+                (RIGHT, []),
+                (LEFT, []),
+                (BREAK, ['ROOT'])
+            )))
         seen_actions = set()
         for label in kwargs.get('left_labels', []):
             if label.upper() != 'ROOT':
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals

 from thinc.typedefs cimport weight_t
+from collections import OrderedDict

 from .stateclass cimport StateClass
 from ._state cimport StateC

@@ -51,17 +52,29 @@ cdef bint _entity_is_sunk(StateClass st, Transition* golds) nogil:


 cdef class BiluoPushDown(TransitionSystem):
+    def __init__(self, *args, **kwargs):
+        TransitionSystem.__init__(self, *args, **kwargs)
+
+    def __reduce__(self):
+        labels_by_action = OrderedDict()
+        cdef Transition t
+        for trans in self.c[:self.n_moves]:
+            label_str = self.strings[trans.label]
+            labels_by_action.setdefault(trans.move, []).append(label_str)
+        return (BiluoPushDown, (self.strings, labels_by_action),
+                None, None)
+
     @classmethod
     def get_actions(cls, **kwargs):
         actions = kwargs.get('actions',
-            {
-                MISSING: [''],
-                BEGIN: [],
-                IN: [],
-                LAST: [],
-                UNIT: [],
-                OUT: ['']
-            })
+            OrderedDict((
+                (MISSING, ['']),
+                (BEGIN, []),
+                (IN, []),
+                (LAST, []),
+                (UNIT, []),
+                (OUT, [''])
+            )))
         seen_entities = set()
         for entity_type in kwargs.get('entity_types', []):
             if entity_type in seen_entities:

@@ -90,7 +103,7 @@ cdef class BiluoPushDown(TransitionSystem):
     def move_name(self, int move, int label):
         if move == OUT:
             return 'O'
-        elif move == 'MISSING':
+        elif move == MISSING:
             return 'M'
         else:
             return MOVE_NAMES[move] + '-' + self.strings[label]
@@ -527,6 +527,14 @@ cdef class Parser:
         xp.add.at(d_tokvecs,
             ids, d_state_features * active_feats)

+    @property
+    def move_names(self):
+        names = []
+        for i in range(self.moves.n_moves):
+            name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
+            names.append(name)
+        return names
+
     def get_batch_model(self, batch_size, tokvecs, stream, dropout):
         lower, upper = self.model
         state2vec = precompute_hiddens(batch_size, tokvecs,
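The new move_names property lists the transition inventory in the order the moves were added, each formatted by move_name() as the move letter plus an optional label (see BiluoPushDown.move_name above). A hypothetical inspection:

    names = nlp.parser.move_names    # hypothetical trained parser instance
    # e.g. ['S', 'D', 'L-nsubj', 'R-dobj', 'B-ROOT', ...]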
@@ -5,7 +5,7 @@ from __future__ import unicode_literals
 from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport weight_t
-from collections import defaultdict
+from collections import defaultdict, OrderedDict

 from ..structs cimport TokenC
 from .stateclass cimport StateClass

@@ -26,7 +26,7 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:


 cdef class TransitionSystem:
-    def __init__(self, StringStore string_table, dict labels_by_action):
+    def __init__(self, StringStore string_table, labels_by_action):
         self.mem = Pool()
         self.strings = string_table
         self.n_moves = 0

@@ -34,14 +34,14 @@ cdef class TransitionSystem:

         self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition))

-        for action, label_strs in sorted(labels_by_action.items()):
+        for action, label_strs in labels_by_action.items():
             for label_str in label_strs:
                 self.add_action(int(action), label_str)
         self.root_label = self.strings['ROOT']
         self.init_beam_state = _init_state

     def __reduce__(self):
-        labels_by_action = {}
+        labels_by_action = OrderedDict()
         cdef Transition t
         for trans in self.c[:self.n_moves]:
             label_str = self.strings[trans.label]

@@ -77,6 +77,11 @@ cdef class TransitionSystem:
                 history.append(i)
                 action.do(state.c, action.label)
                 break
+        else:
+            print(gold.words)
+            print(gold.ner)
+            print(history)
+            raise ValueError("Could not find gold move")
         return history

     cdef int initialize_state(self, StateC* state) nogil:
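Swapping dict for OrderedDict here matters for serialisation: __reduce__ rebuilds the transition system from labels_by_action, and each add_action call assigns the next integer move ID, so iteration order determines the IDs a restored model uses. On Python 2.7 a plain dict iterates in arbitrary order; OrderedDict makes the round-trip deterministic without the old sorted() workaround. A sketch with hypothetical move constants:

    from collections import OrderedDict

    SHIFT, REDUCE, RIGHT, LEFT, BREAK = range(5)   # hypothetical constants
    actions = OrderedDict((
        (SHIFT, ['']), (REDUCE, ['']), (RIGHT, []), (LEFT, []), (BREAK, ['ROOT']),
    ))
    for action, label_strs in actions.items():     # stable insertion order
        for label_str in label_strs:
            print(action, label_str)               # IDs assigned in this order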
@@ -1,7 +1,4 @@
 # coding: utf-8
-"""Test that tokenizer exceptions and emoticons are handled correctly."""
-
-
 from __future__ import unicode_literals

 import pytest

@@ -39,3 +36,12 @@ def test_tokenizer_handles_emoticons(tokenizer):
 def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
     tokens = tokenizer(text)
     assert len(tokens) == length
+
+
+@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
+                                         ('i💙you', 3), ('🤘🤘yay!', 4)])
+def test_tokenizer_handles_emoji(tokenizer, text, length):
+    exceptions = ["hu"]
+    tokens = tokenizer(text)
+    if tokens[0].lang_ not in exceptions:
+        assert len(tokens) == length
spacy/util.py (133 lines changed)

@@ -78,27 +78,86 @@ def ensure_path(path):
     return path


-def resolve_model_path(name):
-    """Resolve a model name or string to a model path.
+def load_model(name):
+    """Load a model from a shortcut link, package or data path.

     name (unicode): Package name, shortcut link or model path.
-    RETURNS (Path): Path to model data directory.
+    RETURNS (Language): `Language` class with the loaded model.
     """
     data_path = get_data_path()
     if not data_path or not data_path.exists():
         raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
     if isinstance(name, basestring_):
-        if (data_path / name).exists():  # in data dir or shortcut link
-            return (data_path / name)
-        if is_package(name):  # installed as a package
-            return get_model_package_path(name)
-        if Path(name).exists():  # path to model
-            return Path(name)
-    elif hasattr(name, 'exists'):  # Path or Path-like object
-        return name
+        if (data_path / name).exists():  # in data dir or shortcut
+            return load_model_from_path(data_path / name)
+        if is_package(name):  # installed as package
+            return load_model_from_pkg(name)
+        if Path(name).exists():  # path to model data directory
+            return load_data_from_path(Path(name))
+    elif hasattr(name, 'exists'):  # Path or Path-like to model data
+        return load_data_from_path(name)
     raise IOError("Can't find model '%s'" % name)


+def load_model_from_init_py(init_file):
+    """Helper function to use in the `load()` method of a model package's
+    __init__.py.
+
+    init_file (unicode): Path to model's __init__.py, i.e. `__file__`.
+    RETURNS (Language): `Language` class with loaded model.
+    """
+    model_path = Path(init_file).parent
+    return load_data_from_path(model_path, package=True)
+
+
+def load_model_from_path(model_path):
+    """Import and load a model package from its file path.
+
+    path (unicode or Path): Path to package directory.
+    RETURNS (Language): `Language` class with loaded model.
+    """
+    model_path = ensure_path(model_path)
+    spec = importlib.util.spec_from_file_location('model', model_path)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module.load()
+
+
+def load_model_from_pkg(name):
+    """Import and load a model package.
+
+    name (unicode): Name of model package installed via pip.
+    RETURNS (Language): `Language` class with loaded model.
+    """
+    module = importlib.import_module(name)
+    return module.load()
+
+
+def load_data_from_path(model_path, package=False):
+    """Initialise a `Language` class with a loaded model from a model data path.
+
+    model_path (unicode or Path): Path to model data directory.
+    package (bool): Does the path point to the parent package directory?
+    RETURNS (Language): `Language` class with loaded model.
+    """
+    model_path = ensure_path(model_path)
+    meta_path = model_path / 'meta.json'
+    if not meta_path.is_file():
+        raise IOError("Could not read meta.json from %s" % meta_path)
+    meta = read_json(meta_path)
+    for setting in ['lang', 'name', 'version']:
+        if setting not in meta:
+            raise IOError('No %s setting found in model meta.json' % setting)
+    if package:
+        model_data_path = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version'])
+        model_path = model_path / model_data_path
+    if not model_path.exists():
+        raise ValueError("Can't find model directory: %s" % path2str(model_path))
+    cls = get_lang_class(meta['lang'])
+    nlp = cls(pipeline=meta.get('pipeline', True))
+    return nlp.from_disk(model_path)
+
+
 def is_package(name):
     """Check if string maps to a package installed via pip.
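load_model_from_init_py() is meant to be called from a model package's own __init__.py, exactly as the updated util docs further down show:

    # __init__.py of a model package (from the example in the docs below)
    from spacy.util import load_model_from_init_py

    def load():
        return load_model_from_init_py(__file__)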
@@ -112,36 +171,16 @@ def is_package(name):
     return False


-def get_model_package_path(package_name):
-    """Get path to a model package installed via pip.
+def get_package_path(name):
+    """Get the path to an installed package.

-    package_name (unicode): Name of installed package.
-    RETURNS (Path): Path to model data directory.
+    name (unicode): Package name.
+    RETURNS (Path): Path to installed package.
     """
     # Here we're importing the module just to find it. This is worryingly
     # indirect, but it's otherwise very difficult to find the package.
     # Python's installation and import rules are very complicated.
     pkg = importlib.import_module(package_name)
-    package_path = Path(pkg.__file__).parent.parent
-    meta = parse_package_meta(package_path / package_name)
-    model_name = '%s-%s' % (package_name, meta['version'])
-    return package_path / package_name / model_name
-
-
-def parse_package_meta(package_path, require=True):
-    """Check if a meta.json exists in a package and return its contents.
-
-    package_path (Path): Path to model package directory.
-    require (bool): If True, raise error if no meta.json is found.
-    RETURNS (dict or None): Model meta.json data or None.
-    """
-    location = package_path / 'meta.json'
-    if location.is_file():
-        return read_json(location)
-    elif require:
-        raise IOError("Could not read meta.json from %s" % location)
-    else:
-        return None
+    return Path(pkg.__file__).parent


 def is_in_jupyter():
@@ -177,10 +216,13 @@ def get_async(stream, numpy_array):

 def itershuffle(iterable, bufsize=1000):
     """Shuffle an iterator. This works by holding `bufsize` items back
-    and yielding them sometime later. Obviously, this is not unbiased --
+    and yielding them sometime later. Obviously, this is not unbiased –
     but should be good enough for batching. Larger bufsize means less bias.

     From https://gist.github.com/andres-erbsen/1307752
+
+    iterable (iterable): Iterator to shuffle.
+    bufsize (int): Items to hold back.
+    YIELDS (iterable): The shuffled iterator.
     """
     iterable = iter(iterable)
     buf = []
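A buffered shuffle along the lines the docstring describes — hold up to bufsize items back, release them in random order — might look like this (a sketch, not the actual implementation):

    import random

    def itershuffle(iterable, bufsize=1000):
        iterator = iter(iterable)
        buf = []
        for item in iterator:
            buf.append(item)
            if len(buf) >= bufsize:
                random.shuffle(buf)
                for _ in range(bufsize // 2):   # release half, hold half back
                    yield buf.pop()
        random.shuffle(buf)
        while buf:                              # drain the buffer at the end
            yield buf.pop()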
@@ -315,17 +357,16 @@ def normalize_slice(length, start, stop, step=None):


 def compounding(start, stop, compound):
-    '''Yield an infinite series of compounding values. Each time the
+    """Yield an infinite series of compounding values. Each time the
     generator is called, a value is produced by multiplying the previous
     value by the compound rate.

-    EXAMPLE
-
+    EXAMPLE:
     >>> sizes = compounding(1., 10., 1.5)
     >>> assert next(sizes) == 1.
     >>> assert next(sizes) == 1 * 1.5
     >>> assert next(sizes) == 1.5 * 1.5
-    '''
+    """
     def clip(value):
         return max(value, stop) if (start>stop) else min(value, stop)
     curr = float(start)
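The hunk cuts off after clip(); given the docstring's doctest, the generator loop presumably continues by yielding the clipped value and multiplying by the compound rate:

    def compounding(start, stop, compound):
        def clip(value):
            return max(value, stop) if (start > stop) else min(value, stop)
        curr = float(start)
        while True:
            yield clip(curr)
            curr *= compound

    sizes = compounding(1., 10., 1.5)
    assert next(sizes) == 1.
    assert next(sizes) == 1 * 1.5
    assert next(sizes) == 1.5 * 1.5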
@@ -335,7 +376,7 @@ def compounding(start, stop, compound):


 def decaying(start, stop, decay):
-    '''Yield an infinite series of linearly decaying values.'''
+    """Yield an infinite series of linearly decaying values."""
     def clip(value):
         return max(value, stop) if (start>stop) else min(value, stop)
     nr_upd = 1.
@@ -344,12 +385,6 @@ def decaying(start, stop, decay):
         nr_upd += 1


-def check_renamed_kwargs(renamed, kwargs):
-    for old, new in renamed.items():
-        if old in kwargs:
-            raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
-
-
 def read_json(location):
     """Open and load JSON from file.
@@ -53,8 +53,6 @@ cdef class Vocab:
         vice versa.
         RETURNS (Vocab): The newly constructed vocab object.
         """
-        util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
-
         lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
         tag_map = tag_map if tag_map is not None else {}
         if lemmatizer in (None, True, False):
@@ -1,9 +1,9 @@
 <svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-1 -1 907 737" width="906" height="746">
     <style>
-        .svg__architecture__text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
-        .svg__architecture__text-medium { fill: #1a1e23; font: 17px "Source Sans Pro" }
-        .svg__architecture__text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro" }
-        .svg__architecture__text-code { fill: #1a1e23; font: 600 12px "Source Code Pro" }
+        .svg__architecture__text-large { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+        .svg__architecture__text-medium { fill: #1a1e23; font: 17px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+        .svg__architecture__text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+        .svg__architecture__text-code { fill: #1a1e23; font: 600 12px "Source Code Pro", Monaco, "Courier New", monospace }
     </style>
     <ellipse cx="404" cy="203" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8"/>
     <text class="svg__architecture__text-large" transform="translate(362.5 206.5)" width="81" height="40">Language</text>
@@ -1,8 +1,8 @@
 <svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="931" height="456" viewBox="-1 -1 932 480" preserveAspectRatio="xMinYMin meet">
     <style>
-        .svg__langdata__text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
-        .svg__langdata__text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
-        .svg__langdata__text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
+        .svg__langdata__text-large { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+        .svg__langdata__text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+        .svg__langdata__text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
     </style>
     <path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M610 404h-69.8" stroke-dasharray="1 6" stroke-linecap="round"/>
     <path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M534.2 404l8-4-2 4 2 4z"/>
@@ -1,8 +1,8 @@
 <svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 923 200" width="923" height="200">
     <style>
-        .svg__pipeline__text { fill: #1a1e23; font: 20px "Source Sans Pro" }
-        .svg__pipeline__text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro" }
-        .svg__pipeline__text-code { fill: #1a1e23; font: 600 16px "Source Code Pro" }
+        .svg__pipeline__text { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+        .svg__pipeline__text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+        .svg__pipeline__text-code { fill: #1a1e23; font: 600 16px "Source Code Pro", Monaco, "Courier New", monospace }
     </style>
     <rect width="601" height="127" x="159" y="21" fill="none" stroke="#09a3d5" stroke-width="3" rx="19.1" stroke-dasharray="3 6" ry="19.1"/>
     <path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M801 55h120v60H801z"/>
website/assets/img/docs/tokenization.svg (new file, 123 lines)

@@ -0,0 +1,123 @@
[New SVG graphic: a step-by-step diagram of tokenizing “Let’s go to N.Y.!” — the string is first split on whitespace, then each substring is repeatedly checked against prefix, suffix and tokenizer-exception rules (labels: PREFIX, SUFFIX, EXCEPTION, DONE) until every token is complete. Font styles fall back from "Source Sans Pro"/"Source Code Pro" to generic stacks, matching the other updated SVGs.]
@@ -1,9 +1,9 @@
 <svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-10 -10 582 365" width="572" height="355">
     <style>
-        .svg__vocab__text { fill: #1a1e23; font: 18px "Source Sans Pro" }
-        .svg__vocab__text-large { fill: #fff; font: bold 18px "Source Sans Pro"; text-transform: uppercase }
-        .svg__vocab__text-box { fill: #fff; font: bold 12px "Source Code Pro" }
-        .svg__vocab__text-code { fill: #1a1e23; font: bold 12px "Source Code Pro" }
+        .svg__vocab__text { fill: #1a1e23; font: 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
+        .svg__vocab__text-large { fill: #fff; font: bold 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif; text-transform: uppercase }
+        .svg__vocab__text-box { fill: #fff; font: bold 12px "Source Code Pro", Monaco, "Courier New", monospace }
+        .svg__vocab__text-code { fill: #1a1e23; font: bold 12px "Source Code Pro", Monaco, "Courier New", monospace }
     </style>
     <rect width="570" height="88" x="1" y="135" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="13.2" ry="13.2"/>
     <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 164h100v40H444z"/>
@@ -158,7 +158,8 @@

     "binder": {
         "title": "Binder",
-        "tag": "class"
+        "tag": "class",
+        "source": "spacy/tokens/binder.pyx"
     },

     "annotation": {
@@ -2,7 +2,10 @@

 include ../../_includes/_mixins

-p spaCy currently supports the following languages and capabilities:
+p
+    | spaCy currently provides models for the following languages and
+    | capabilities:

 +aside-code("Download language models", "bash").
     python -m spacy download en

@@ -22,12 +25,16 @@ p spaCy currently supports the following languages and capabilities:
     +row
         +cell French #[code fr]
-        each icon in [ "pro", "pro", "con", "pro", "con", "pro", "pro", "con" ]
+        each icon in [ "pro", "con", "con", "pro", "con", "pro", "pro", "con" ]
             +cell.u-text-center #[+procon(icon)]

-    +row
-        +cell Spanish #[code es]
-        each icon in [ "pro", "pro", "con", "pro", "pro", "pro", "pro", "con" ]
-            +cell.u-text-center #[+procon(icon)]
++h(2, "available") Available models
+
+include ../usage/_models-list
+
+p
+    +button("/docs/usage/models", true, "primary") See available models

 +h(2, "alpha-support") Alpha tokenization support

@@ -52,9 +59,35 @@ p
     | #[+a("https://github.com/mocobeta/janome") Janome].

 +table([ "Language", "Code", "Source" ])
-    each language, code in { es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" }
+    each language, code in { it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" }
         +row
             +cell #{language}
             +cell #[code=code]
             +cell
                 +src(gh("spaCy", "spacy/lang/" + code)) lang/#{code}
+
++h(2, "multi-language") Multi-language support
+    +tag-new(2)
+
+p
+    | As of v2.0, spaCy supports models trained on more than one language. This
+    | is especially useful for named entity recognition. The language ID used
+    | for multi-language or language-neutral models is #[code xx]. The
+    | language class, a generic subclass containing only the base language data,
+    | can be found in #[+src(gh("spaCy", "spacy/lang/xx")) lang/xx].
+
+p
+    | To load your model with the neutral, multi-language class, simply set
+    | #[code "language": "xx"] in your
+    | #[+a("/docs/usage/saving-loading#models-generating") model package]'s
+    | meta.json. You can also import the class directly, or call
+    | #[+api("util#get_lang_class") #[code util.get_lang_class()]] for
+    | lazy-loading.
+
++code("Standard import").
+    from spacy.lang.xx import MultiLanguage
+    nlp = MultiLanguage()
+
++code("With lazy-loading").
+    from spacy.util import get_lang_class
+    nlp = get_lang_class('xx')
@@ -11,8 +11,13 @@ p
     | the name of an installed
     | #[+a("/docs/usage/saving-loading#generating") model package], a unicode
     | path or a #[code Path]-like object. spaCy will try resolving the load
-    | argument in this order. The #[code Language] class to initialise will be
-    | determined based on the model's settings.
+    | argument in this order. If a model is loaded from a shortcut link or
+    | package name, spaCy will assume it's a Python package and import it and
+    | call the model's own #[code load()] method. If a model is loaded from a
+    | path, spaCy will assume it's a data directory, read the language and
+    | pipeline settings off the meta.json and initialise the #[code Language]
+    | class. The data will be loaded in via
+    | #[+api("language#from_disk") #[code Language.from_disk()]].

 +aside-code("Example").
     nlp = spacy.load('en') # shortcut link

@@ -20,7 +25,7 @@ p
     nlp = spacy.load('/path/to/en') # unicode path
     nlp = spacy.load(Path('/path/to/en')) # pathlib Path

-    nlp = spacy.load('en', disable['parser', 'tagger'])
+    nlp = spacy.load('en', disable=['parser', 'tagger'])

 +table(["Name", "Type", "Description"])
     +row
@@ -1,12 +1,10 @@
-//- 💫 DOCS > API > ANNOTATION SPECS
+//- 💫 DOCS > API > UTIL

 include ../../_includes/_mixins

 p
     | spaCy comes with a small collection of utility functions located in
     | #[+src(gh("spaCy", "spacy/util.py")) spacy/util.py].

 +infobox("Important note")
     | Because utility functions are mostly intended for
     | #[strong internal use within spaCy], their behaviour may change with
     | future releases. The functions documented on this page should be safe

@@ -74,15 +72,23 @@ p
         +cell #[code Language]
         +cell Language class.

-+h(2, "resolve_model_path") util.resolve_model_path
++h(2, "load_model") util.load_model
     +tag function
+    +tag-new(2)

-p Resolve a model name or string to a model path.
+p
+    | Load a model from a shortcut link, package or data path. If called with a
+    | shortcut link or package name, spaCy will assume the model is a Python
+    | package and import and call its #[code load()] method. If called with a
+    | path, spaCy will assume it's a data directory, read the language and
+    | pipeline settings from the meta.json and initialise a #[code Language]
+    | class. The model data will then be loaded in via
+    | #[+api("language#from_disk") #[code Language.from_disk()]].

 +aside-code("Example").
-    model_path = util.resolve_model_path('en')
-    model_path = util.resolve_model_path('/path/to/en')
+    nlp = util.load_model('en')
+    nlp = util.load_model('en_core_web_sm')
+    nlp = util.load_model('/path/to/data')

 +table(["Name", "Type", "Description"])
     +row

@@ -92,8 +98,33 @@ p Resolve a model name or string to a model path.

     +footrow
         +cell returns
-        +cell #[code Path]
-        +cell Path to model data directory.
+        +cell #[code Language]
+        +cell #[code Language] class with the loaded model.
+
++h(2, "load_model_from_init_py") util.load_model_from_init_py
+    +tag function
+    +tag-new(2)
+
+p
+    | A helper function to use in the #[code load()] method of a model package's
+    | #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py].
+
++aside-code("Example").
+    from spacy.util import load_model_from_init_py
+
+    def load():
+        return load_model_from_init_py(__file__)
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code init_file]
+        +cell unicode
+        +cell Path to model's __init__.py, i.e. #[code __file__].
+
+    +footrow
+        +cell returns
+        +cell #[code Language]
+        +cell #[code Language] class with the loaded model.

 +h(2, "is_package") util.is_package
     +tag function

@@ -117,16 +148,18 @@ p
         +cell #[code bool]
         +cell #[code True] if installed package, #[code False] if not.

-+h(2, "get_model_package_path") util.get_model_package_path
++h(2, "get_package_path") util.get_package_path
     +tag function
+    +tag-new(2)

 p
-    | Get path to a #[+a("/docs/usage/models") model package] installed via pip.
-    | Currently imports the package to find it and parse its meta data.
+    | Get path to an installed package. Mainly used to resolve the location of
+    | #[+a("/docs/usage/models") model packages]. Currently imports the package
+    | to find its path.

 +aside-code("Example").
-    util.get_model_package_path('en_core_web_sm')
-    # /usr/lib/python3.6/site-packages/en_core_web_sm/en_core_web_sm-1.2.0
+    util.get_package_path('en_core_web_sm')
+    # /usr/lib/python3.6/site-packages/en_core_web_sm

 +table(["Name", "Type", "Description"])
     +row

@@ -137,37 +170,8 @@ p
     +footrow
         +cell returns
         +cell #[code Path]
         +cell Path to model data directory.

-+h(2, "parse_package_meta") util.parse_package_meta
-    +tag function
-
-p
-    | Check if a #[code meta.json] exists in a model package and return its
-    | contents.
-
-+aside-code("Example").
-    if util.is_package('en_core_web_sm'):
-        path = util.get_model_package_path('en_core_web_sm')
-        meta = util.parse_package_meta(path, require=True)
-        # {'name': 'core_web_sm', 'lang': 'en', ...}
-
-+table(["Name", "Type", "Description"])
-    +row
-        +cell #[code package_path]
-        +cell #[code Path]
-        +cell Path to model package directory.
-
-    +row
-        +cell #[code require]
-        +cell #[code bool]
-        +cell If #[code True], raise error if no #[code meta.json] is found.
-
-    +footrow
-        +cell returns
-        +cell dict / #[code None]
-        +cell Model meta data or #[code None].

 +h(2, "is_in_jupyter") util.is_in_jupyter
     +tag function
     +tag-new(2)
@ -5,7 +5,7 @@ p
|
|||
| #[strong how similar they are]. Predicting similarity is useful for
|
||||
| building recommendation systems or flagging duplicates. For example, you
|
||||
| can suggest a user content that's similar to what they're currently
|
||||
| looking at, or label a support ticket as a duplicate, if it's very
|
||||
| looking at, or label a support ticket as a duplicate if it's very
|
||||
| similar to an already existing one.
|
||||
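p
    | For example, a minimal sketch of comparing two texts, assuming a model
    | with word vectors is loaded as #[code nlp]:

+code.
    doc1 = nlp(u'I like fast food')
    doc2 = nlp(u'I like pizza')
    print(doc1.similarity(doc2))  # similarity score, higher means more similar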
|
||||
p
|
||||
|
|
|
@ -16,3 +16,47 @@ p
|
|||
+row
|
||||
for cell in ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "for", "$", "1", "billion"]
|
||||
+cell=cell
|
||||
|
||||
p
|
||||
| First, the raw text is split on whitespace characters, similar to
|
||||
| #[code text.split(' ')]. Then, the tokenizer processes the text from
|
||||
| left to right. On each substring, it performs two checks:
|
||||
|
||||
+list("numbers")
|
||||
+item
|
||||
| #[strong Does the substring match a tokenizer exception rule?] For
|
||||
| example, "don't" does not contain whitespace, but should be split
|
||||
| into two tokens, "do" and "n't", while "U.K." should always
|
||||
| remain one token.
|
||||
+item
|
||||
| #[strong Can a prefix, suffix or infix be split off?] For example,
|
||||
| punctuation like commas, periods, hyphens or quotes.
|
||||
|
||||
p
|
||||
| If there's a match, the rule is applied and the tokenizer continues its
|
||||
| loop, starting with the newly split substrings. This way, spaCy can split
|
||||
| #[strong complex, nested tokens] like combinations of abbreviations and
|
||||
| multiple punctuation marks.
|
||||
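p
    | A minimal sketch of the result, assuming an English model is loaded as
    | #[code nlp]:

+code.
    doc = nlp(u"Let's go to N.Y.!")
    print([token.text for token in doc])
    # ['Let', "'s", 'go', 'to', 'N.Y.', '!']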
|
||||
+aside
|
||||
| #[strong Tokenizer exception:] Special-case rule to split a string into
|
||||
| several tokens or prevent a token from being split when punctuation rules
|
||||
| are applied.#[br]
|
||||
| #[strong Prefix:] Character(s) at the beginning, e.g.
|
||||
| #[code $], #[code (], #[code “], #[code ¿].#[br]
|
||||
| #[strong Suffix:] Character(s) at the end, e.g.
|
||||
| #[code km], #[code )], #[code ”], #[code !].#[br]
|
||||
| #[strong Infix:] Character(s) in between, e.g.
|
||||
| #[code -], #[code --], #[code /], #[code …].#[br]
|
||||
|
||||
+image
|
||||
include ../../../assets/img/docs/tokenization.svg
|
||||
.u-text-right
|
||||
+button("/assets/img/docs/tokenization.svg", false, "secondary").u-text-tag View large graphic
|
||||
|
||||
p
|
||||
| While punctuation rules are usually pretty general, tokenizer exceptions
|
||||
| strongly depend on the specifics of the individual language. This is
|
||||
| why each #[+a("/docs/api/language-models") available language] has its
|
||||
| own subclass, like #[code English] or #[code German], which loads in lists
|
||||
| of hard-coded data and exception rules.
|
||||
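p
    | For example, a minimal sketch of creating the bare language classes
    | directly, using the v2-style import paths from #[code spacy.lang]:

+code.
    from spacy.lang.en import English
    from spacy.lang.de import German

    nlp_en = English()  # includes English exception rules
    nlp_de = German()   # includes German exception rules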
|
|
|
@ -89,4 +89,6 @@ p
|
|||
|
||||
p
|
||||
| Even though both #[code Doc] objects contain the same words, the internal
|
||||
| integer IDs are very different.
|
||||
| integer IDs are very different. The same applies to all other strings,
|
||||
| like the annotation scheme. To avoid mismatched IDs, spaCy will always
|
||||
| export the vocab if you save a #[code Doc] or #[code nlp] object.
|
||||
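p
    | A minimal sketch of the string-to-ID mapping, assuming a loaded
    | #[code nlp] object:

+code.
    doc = nlp(u'I like apples')
    apple_hash = nlp.vocab.strings[u'apples']  # string to integer ID
    assert nlp.vocab.strings[apple_hash] == u'apples'  # and back to the string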
|
|
|
@ -19,19 +19,17 @@ p
|
|||
|
||||
p
|
||||
| When you load a model, spaCy first consults the model's
|
||||
| #[+a("/docs/usage/saving-loading#models-generating") meta.json] for its
|
||||
| #[code setup] details. This typically includes the ID of a language class,
|
||||
| #[+a("/docs/usage/saving-loading#models-generating") meta.json]. The
|
||||
| meta typically includes the model details, the ID of a language class,
|
||||
| and an optional list of pipeline components. spaCy then does the
|
||||
| following:
|
||||
|
||||
+aside-code("meta.json (excerpt)", "json").
|
||||
{
|
||||
"name": "example_model",
|
||||
"lang": "en"
|
||||
"description": "Example model for spaCy",
|
||||
"setup": {
|
||||
"lang": "en",
|
||||
"pipeline": ["token_vectors", "tagger"]
|
||||
}
|
||||
"pipeline": ["token_vectors", "tagger"]
|
||||
}
|
||||
|
||||
+list("numbers")
|
||||
|
@ -146,7 +144,7 @@ p
|
|||
+table(["Argument", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[coce Vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell
|
||||
| Shared data between components, including strings, morphology,
|
||||
| vectors etc.
|
||||
|
@ -287,17 +285,15 @@ p
|
|||
|
||||
p
|
||||
| In the model package's meta.json, specify the language class and pipeline
|
||||
| IDs in #[code setup]:
|
||||
| IDs:
|
||||
|
||||
+code("meta.json (excerpt)", "json").
|
||||
{
|
||||
"name": "my_sentiment_model",
|
||||
"name": "sentiment_model",
|
||||
"lang": "en",
|
||||
"version": "1.0.0",
|
||||
"spacy_version": ">=2.0.0,<3.0.0",
|
||||
"setup": {
|
||||
"lang": "en",
|
||||
"pipeline": ["vectorizer", "sentiment"]
|
||||
}
|
||||
"pipeline": ["vectorizer", "sentiment"]
|
||||
}
|
||||
|
||||
p
|
||||
|
@ -307,7 +303,7 @@ p
|
|||
| by your custom #[code "sentiment"] factory.
|
||||
|
||||
+code.
|
||||
nlp = spacy.load('my_sentiment_model')
|
||||
nlp = spacy.load('en_sentiment_model')
|
||||
doc = nlp(u'I love pizza')
|
||||
assert doc.sentiment
|
||||
|
||||
|
|
|
@ -129,15 +129,18 @@ p
|
|||
+code.
|
||||
import spacy
|
||||
from spacy.tokens.doc import Doc
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
nlp = spacy.load('en')
|
||||
moby_dick = open('moby_dick.txt', 'r').read()  # nlp expects a string, not a file object
|
||||
doc = nlp(moby_dick)
|
||||
doc.to_disk('/moby_dick.bin')
|
||||
|
||||
new_doc = Doc().from_disk('/moby_dick.bin')
|
||||
new_doc = Doc(Vocab()).from_disk('/moby_dick.bin')
|
||||
|
||||
+infobox
|
||||
| #[strong API:] #[+api("language") #[code Language]],
|
||||
| #[+api("doc") #[code Doc]]
|
||||
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
|
||||
|
||||
+h(2, "rule-matcher") Match text with token rules
|
||||
|
@ -148,9 +151,14 @@ p
|
|||
|
||||
nlp = spacy.load('en')
|
||||
matcher = Matcher(nlp.vocab)
|
||||
# match "Google I/O" or "Google i/o"
|
||||
pattern = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
|
||||
matcher.add('GoogleIO', None, pattern)
|
||||
|
||||
def set_sentiment(matcher, doc, i, matches):
|
||||
doc.sentiment += 0.1
|
||||
|
||||
pattern1 = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
|
||||
pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']]
|
||||
matcher.add('GoogleIO', None, pattern1) # match "Google I/O" or "Google i/o"
|
||||
matcher.add('HAPPY', set_sentiment, pattern2) # match one or more happy emoji
|
||||
doc = nlp(LOTS_OF_TEXT)
matches = matcher(doc)  # apply the matcher to the processed Doc
|
||||
|
||||
+infobox
|
||||
|
|
|
@ -11,7 +11,7 @@ p
|
|||
| You can also associate patterns with entity IDs, to allow some basic
|
||||
| entity linking or disambiguation.
|
||||
|
||||
+aside("What about \"real\" regular expressions?")
|
||||
//-+aside("What about \"real\" regular expressions?")
|
||||
|
||||
+h(2, "adding-patterns") Adding patterns
|
||||
|
||||
|
@ -119,7 +119,7 @@ p
|
|||
+code.
|
||||
# Add a new custom flag to the vocab, which is always False by default.
|
||||
# BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span.
|
||||
BAD_HTML_FLAG = doc.vocab.add_flag(lambda text: False)
|
||||
BAD_HTML_FLAG = nlp.vocab.add_flag(lambda text: False)
|
||||
|
||||
def merge_and_flag(matcher, doc, i, matches):
|
||||
match_id, start, end = matches[i]
|
||||
|
@ -221,7 +221,7 @@ p
|
|||
+cell match 0 or 1 times
|
||||
+cell optional, max one
|
||||
|
||||
+h(3, "quantifiers-example1") Quantifiers example: Using linguistic annotations
|
||||
+h(2, "example1") Example: Using linguistic annotations
|
||||
|
||||
p
|
||||
| Let's say you're analysing user comments and you want to find out what
|
||||
|
@ -283,7 +283,7 @@ p
|
|||
# set manual=True to make displaCy render straight from a dictionary
|
||||
displacy.serve(matched_sents, style='ent', manual=True)
|
||||
|
||||
+h(3, "quantifiers-example2") Quantifiers example: Phone numbers
|
||||
+h(2, "example2") Example: Phone numbers
|
||||
|
||||
p
|
||||
| Phone numbers can have many different formats and matching them is often
|
||||
|
@ -320,3 +320,114 @@ p
|
|||
| It'll produce more predictable results, is much easier to modify and
|
||||
| extend, and doesn't require any training data – only a set of
|
||||
| test cases.
|
||||
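p
    | For example, a minimal sketch of a token pattern for US-style numbers
    | like "(123) 456-7890", reusing the #[code matcher] from above – the
    | exact shapes and quantifiers will depend on the formats in your data:

+code.
    pattern = [{'ORTH': '('}, {'SHAPE': 'ddd'}, {'ORTH': ')'}, {'SHAPE': 'ddd'},
               {'ORTH': '-', 'OP': '?'}, {'SHAPE': 'dddd'}]
    matcher.add('PHONE_NUMBER', None, pattern)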
|
||||
+h(2, "example3") Example: Hashtags and emoji on social media
|
||||
|
||||
p
|
||||
| Social media posts, especially tweets, can be difficult to work with.
|
||||
| They're very short and often contain various emoji and hashtags. By only
|
||||
| looking at the plain text, you'll lose a lot of valuable semantic
|
||||
| information.
|
||||
|
||||
p
|
||||
| Let's say you've extracted a large sample of social media posts on a
|
||||
| specific topic, for example posts mentioning a brand name or product.
|
||||
| As the first step of your data exploration, you want to filter out posts
|
||||
| containing certain emoji and use them to assign a general sentiment
|
||||
| score, based on whether the expressed emotion is positive or negative,
|
||||
| e.g. #[span.o-icon.o-icon--inline 😀] or #[span.o-icon.o-icon--inline 😞].
|
||||
| You also want to find, merge and label hashtags like
|
||||
| #[code #MondayMotivation], to be able to ignore or analyse them later.
|
||||
|
||||
+aside("Note on sentiment analysis")
|
||||
| Ultimately, sentiment analysis is not always #[em that] easy. In
|
||||
| addition to the emoji, you'll also want to take specific words into
|
||||
| account and check the #[code subtree] for intensifiers like "very", to
|
||||
| increase the sentiment score. At some point, you might also want to train
|
||||
| a sentiment model. However, the approach described in this example is
|
||||
| very useful for #[strong bootstrapping rules to collect training data].
|
||||
| It's also an incredibly fast way to gather first insights into your data
|
||||
| – with about 1 million tweets, you'd be looking at a processing time of
|
||||
| #[strong under 1 minute].
|
||||
|
||||
p
|
||||
| By default, spaCy's tokenizer will split emoji into separate tokens. This
|
||||
| means that you can create a pattern for one or more emoji tokens. In this
|
||||
| case, a sequence of identical emoji should be treated as one instance.
|
||||
| Valid hashtags usually consist of a #[code #], plus a sequence of
|
||||
| ASCII characters with no whitespace, making them easy to match as well.
|
||||
|
||||
+code.
|
||||
from spacy.lang.en import English
|
||||
from spacy.matcher import Matcher
|
||||
|
||||
nlp = English() # we only want the tokenizer, so no need to load a model
|
||||
matcher = Matcher(nlp.vocab)
|
||||
|
||||
pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍'] # positive emoji
|
||||
neg_emoji = [u'😞', u'😠', u'😩', u'😢', u'😭', u'😒'] # negative emoji
|
||||
|
||||
# add patterns to match one or more emoji tokens
|
||||
pos_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in pos_emoji]
|
||||
neg_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in neg_emoji]
|
||||
|
||||
matcher.add('HAPPY', label_sentiment, *pos_patterns) # add positive pattern
|
||||
matcher.add('SAD', label_sentiment, *neg_patterns) # add negative pattern
|
||||
|
||||
# add pattern to merge valid hashtag, i.e. '#' plus any ASCII token
|
||||
matcher.add('HASHTAG', merge_hashtag, [{'ORTH': '#'}, {'IS_ASCII': True}])
|
||||
|
||||
p
|
||||
| Because the #[code on_match] callback receives the ID of each match, you
|
||||
| can use the same function to handle the sentiment assignment for both
|
||||
| the positive and negative pattern. To keep it simple, we'll either add
|
||||
| or subtract #[code 0.1] points – this way, the score will also reflect
|
||||
| combinations of emoji, even positive #[em and] negative ones.
|
||||
|
||||
p
|
||||
| With a library like
|
||||
| #[+a("https://github.com/bcongdon/python-emojipedia") Emojipedia],
|
||||
| we can also retrieve a short description for each emoji – for example,
|
||||
| #[span.o-icon.o-icon--inline 😍]'s official title is "Smiling Face With
|
||||
| Heart-Eyes". Assigning it to the merged token's norm will make it
|
||||
| available as #[code token.norm_].
|
||||
|
||||
+code.
|
||||
from emojipedia import Emojipedia # installation: pip install emojipedia
|
||||
|
||||
def label_sentiment(matcher, doc, i, matches):
|
||||
match_id, start, end = matches[i]
|
||||
if doc.vocab.strings[match_id] == 'HAPPY':  # look up the rule's string name
|
||||
doc.sentiment += 0.1 # add 0.1 for positive sentiment
|
||||
elif doc.vocab.strings[match_id] == 'SAD':
|
||||
doc.sentiment -= 0.1 # subtract 0.1 for negative sentiment
|
||||
span = doc[start:end]
|
||||
emoji = Emojipedia.search(span[0].text) # get data for emoji
|
||||
span.merge(norm=emoji.title) # merge span and set NORM to emoji title
|
||||
|
||||
p
|
||||
| To label the hashtags, we first need to add a new custom flag.
|
||||
| #[code IS_HASHTAG] will be the flag's ID, which you can use to assign it
|
||||
| to the hashtag's span, and check its value via a token's
|
||||
| #[+api("token#check_flag") #[code code check_flag()]] method. On each
|
||||
| match, we merge the hashtag and assign the flag.
|
||||
|
||||
+code.
|
||||
# Add a new custom flag to the vocab, which is always False by default
|
||||
IS_HASHTAG = nlp.vocab.add_flag(lambda text: False)
|
||||
|
||||
def merge_hashtag(matcher, doc, i, matches):
|
||||
match_id, start, end = matches[i]
|
||||
span = doc[start:end]
|
||||
span.merge() # merge hashtag
|
||||
span.set_flag(IS_HASHTAG, True) # set IS_HASHTAG to True
|
||||
|
||||
p
|
||||
| To process a stream of social media posts, we can use
|
||||
| #[+api("language#pipe") #[code Language.pipe()]], which will return a
|
||||
| stream of #[code Doc] objects that we can pass to
|
||||
| #[+api("matcher#pipe") #[code Matcher.pipe()]].
|
||||
|
||||
+code.
|
||||
docs = nlp.pipe(LOTS_OF_TWEETS)
|
||||
matches = matcher.pipe(docs)
|
||||
|
|
|
@ -74,16 +74,14 @@ p
|
|||
+aside-code("meta.json", "json").
|
||||
{
|
||||
"name": "example_model",
|
||||
"lang": "en",
|
||||
"version": "1.0.0",
|
||||
"spacy_version": ">=2.0.0,<3.0.0",
|
||||
"description": "Example model for spaCy",
|
||||
"author": "You",
|
||||
"email": "you@example.com",
|
||||
"license": "CC BY-SA 3.0",
|
||||
"setup": {
|
||||
"lang": "en",
|
||||
"pipeline": ["token_vectors", "tagger"]
|
||||
}
|
||||
"pipeline": ["token_vectors", "tagger"]
|
||||
}
|
||||
|
||||
+code(false, "bash").
|
||||
|
@ -110,9 +108,9 @@ p
|
|||
+h(3, "models-custom") Customising the model setup
|
||||
|
||||
p
|
||||
| The meta.json includes a #[code setup] key that lets you customise how
|
||||
| the model should be initialised and loaded. You can define the language
|
||||
| data to be loaded and the
|
||||
| The meta.json includes the model details, like name, requirements and
|
||||
| license, and lets you customise how the model should be initialised and
|
||||
| loaded. You can define the language data to be loaded and the
|
||||
| #[+a("/docs/usage/language-processing-pipeline") processing pipeline] to
|
||||
| execute.
|
||||
|
||||
|
@ -183,9 +181,9 @@ p
|
|||
p
|
||||
| To load a model from a data directory, you can use
|
||||
| #[+api("spacy#load") #[code spacy.load()]] with the local path. This will
|
||||
| look for a meta.json in the directory and use the #[code setup] details
|
||||
| to initialise a #[code Language] class with a processing pipeline and
|
||||
| load in the model data.
|
||||
| look for a meta.json in the directory and use the #[code lang] and
|
||||
| #[code pipeline] settings to initialise a #[code Language] class with a
|
||||
| processing pipeline and load in the model data.
|
||||
|
||||
+code.
|
||||
nlp = spacy.load('/path/to/model')
|
||||
|
|
|
@ -65,7 +65,7 @@ p
|
|||
| spaCy provides a variety of linguistic annotations to give you insights
|
||||
| into a text's grammatical structure. This includes the word types,
|
||||
| i.e. the parts of speech, and how the words are related to each other.
|
||||
| For example, if you're analysing text, it makes a #[em huge] difference
|
||||
| For example, if you're analysing text, it makes a huge difference
|
||||
| whether a noun is the subject of a sentence, or the object – or whether
|
||||
| "google" is used as a verb, or refers to the website or company in a
|
||||
| specific context.
|
||||
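p
    | For example, a minimal sketch of inspecting these annotations, assuming
    | an English model is loaded as #[code nlp]:

+code.
    doc = nlp(u'Apple is looking at buying a U.K. startup')
    for token in doc:
        # print each token's text, part-of-speech tag and dependency label
        print(token.text, token.pos_, token.dep_)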
|
@ -94,9 +94,10 @@ p
|
|||
include _spacy-101/_tokenization
|
||||
|
||||
+infobox
|
||||
| To learn more about how spaCy's tokenizer and its rules work in detail,
|
||||
| how to #[strong customise] it and how to #[strong add your own tokenizer]
|
||||
| to a processing pipeline, see the usage guide on
|
||||
| To learn more about how spaCy's tokenization rules work in detail,
|
||||
| how to #[strong customise and replace] the default tokenizer and how to
|
||||
| #[strong add language-specific data], see the usage guides on
|
||||
| #[+a("/docs/usage/adding-languages") adding languages] and
|
||||
| #[+a("/docs/usage/customizing-tokenizer") customising the tokenizer].
|
||||
|
||||
+h(3, "annotations-pos-deps") Part-of-speech tags and dependencies
|
||||
|
@ -118,9 +119,11 @@ include _spacy-101/_named-entities
|
|||
|
||||
+infobox
|
||||
| To learn more about entity recognition in spaCy, how to
|
||||
| #[strong add your own entities] to a document and how to train and update
|
||||
| the entity predictions of a model, see the usage guide on
|
||||
| #[+a("/docs/usage/entity-recognition") named entity recognition].
|
||||
| #[strong add your own entities] to a document and how to
|
||||
| #[strong train and update] the entity predictions of a model, see the
|
||||
| usage guides on
|
||||
| #[+a("/docs/usage/entity-recognition") named entity recognition] and
|
||||
| #[+a("/docs/usage/training-ner") training the named entity recognizer].
|
||||
|
||||
+h(2, "vectors-similarity") Word vectors and similarity
|
||||
+tag-model("vectors")
|
||||
|
|
|
@ -20,19 +20,18 @@ p
|
|||
nlp = Language(pipeline=['my_factory', mycomponent])
|
||||
|
||||
p
|
||||
| It's now much easier to customise the pipeline with your own components.
|
||||
| Components are functions that receive a #[code Doc] object, modify and
|
||||
| return it. If your component is stateful, you'll want to create a new one
|
||||
| for each pipeline. You can do that by defining and registering a factory
|
||||
| which receives the shared #[code Vocab] object and returns a component.
|
||||
|
||||
p
|
||||
| spaCy's default components – the vectorizer, tagger, parser and entity
|
||||
| recognizer, can be added to your pipeline by using their string IDs.
|
||||
| This way, you won't have to worry about finding and implementing them –
|
||||
| to use the default tagger, simply add #[code "tagger"] to the pipeline,
|
||||
| It's now much easier to #[strong customise the pipeline] with your own
|
||||
| components, functions that receive a #[code Doc] object, modify and
|
||||
| return it. If your component is stateful, you can define and register a
|
||||
| factory which receives the shared #[code Vocab] object and returns a
|
||||
| component. spaCy's default components can be added to your pipeline by
|
||||
| using their string IDs. This way, you won't have to worry about finding
|
||||
| and implementing them – simply add #[code "tagger"] to the pipeline,
|
||||
| and spaCy will know what to do.
|
||||
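p
    | A minimal sketch of such a component – the function name is
    | illustrative:

+code.
    from spacy.language import Language

    def my_component(doc):
        # a component receives a Doc, can modify it and must return it
        print('Processing:', doc.text)
        return doc

    nlp = Language(pipeline=[my_component])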
|
||||
+image
|
||||
include ../../assets/img/docs/pipeline.svg
|
||||
|
||||
+infobox
|
||||
| #[strong API:] #[+api("language") #[code Language]]
|
||||
| #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text]
|
||||
|
@ -96,11 +95,10 @@ p
|
|||
| #[code Language] class, or load a model that initialises one. This allows
|
||||
| languages to contain more custom data, e.g. lemmatizer lookup tables, or
|
||||
| complex regular expressions. The language data has also been tidied up
|
||||
| and simplified. It's now also possible to overwrite the functions that
|
||||
| compute lexical attributes like #[code like_num], and supply
|
||||
| language-specific syntax iterators, e.g. to determine noun chunks. spaCy
|
||||
| now also supports simple lookup-based lemmatization. The data is stored
|
||||
| in a dictionary mapping a string to its lemma.
|
||||
| and simplified. spaCy now also supports simple lookup-based lemmatization.
|
||||
|
||||
+image
|
||||
include ../../assets/img/docs/language_data.svg
|
||||
|
||||
+infobox
|
||||
| #[strong API:] #[+api("language") #[code Language]]
|
||||
|
@ -111,13 +109,10 @@ p
|
|||
|
||||
+aside-code("Example").
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.attrs import LOWER, IS_PUNCT
|
||||
matcher = Matcher(nlp.vocab)
|
||||
matcher.add('HelloWorld', None,
|
||||
[{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
|
||||
[{LOWER: 'hello'}, {LOWER: 'world'}])
|
||||
matcher.add('HEARTS', None, [{'ORTH': '❤️', 'OP': '+'}])
|
||||
assert len(matcher) == 1
|
||||
assert 'HelloWorld' in matcher
|
||||
assert 'HEARTS' in matcher
|
||||
|
||||
p
|
||||
| Patterns can now be added to the matcher by calling
|
||||
|
@ -157,28 +152,8 @@ p
|
|||
+cell #[+api("language#to_disk") #[code Language.to_disk]]
|
||||
|
||||
+row
|
||||
+cell #[code Tokenizer.load]
|
||||
+cell
|
||||
| #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]]
|
||||
| #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code Tagger.load]
|
||||
+cell
|
||||
| #[+api("tagger#from_disk") #[code Tagger.from_disk]]
|
||||
| #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code DependencyParser.load]
|
||||
+cell
|
||||
| #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
|
||||
| #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code EntityRecognizer.load]
|
||||
+cell
|
||||
| #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
|
||||
| #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
|
||||
+cell #[code Language.create_make_doc]
|
||||
+cell #[+api("language#attributes") #[code Language.tokenizer]]
|
||||
|
||||
+row
|
||||
+cell
|
||||
|
@ -212,6 +187,28 @@ p
|
|||
| #[+api("stringstore#to_disk") #[code StringStore.to_disk]]
|
||||
| #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code Tokenizer.load]
|
||||
+cell -
|
||||
|
||||
+row
|
||||
+cell #[code Tagger.load]
|
||||
+cell
|
||||
| #[+api("tagger#from_disk") #[code Tagger.from_disk]]
|
||||
| #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code DependencyParser.load]
|
||||
+cell
|
||||
| #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
|
||||
| #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code EntityRecognizer.load]
|
||||
+cell
|
||||
| #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
|
||||
| #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code Matcher.load]
|
||||
+cell -
|
||||
|
@ -232,7 +229,7 @@ p
|
|||
|
||||
+row
|
||||
+cell #[code Doc.read_bytes]
|
||||
+cell
|
||||
+cell #[+api("binder") #[code Binder]]
|
||||
|
||||
+row
|
||||
+cell #[code Token.is_ancestor_of]
|
||||
|
|