mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 00:46:28 +03:00
Merge docstrings
This commit is contained in:
commit
5db89053aa
|
@ -14,3 +14,4 @@ regex==2017.4.5
|
|||
ftfy>=4.4.2,<5.0.0
|
||||
pytest>=3.0.6,<4.0.0
|
||||
pip>=9.0.0,<10.0.0
|
||||
mock>=2.0.0,<3.0.0
|
||||
|
|
|
@ -20,7 +20,17 @@ def download(model, direct=False):
|
|||
compatibility = get_compatibility()
|
||||
version = get_version(model_name, compatibility)
|
||||
download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
|
||||
link(model_name, model, force=True)
|
||||
try:
|
||||
link(model_name, model, force=True)
|
||||
except:
|
||||
# Dirty, but since spacy.download and the auto-linking is mostly
|
||||
# a convenience wrapper, it's best to show a success message and
|
||||
# loading instructions, even if linking fails.
|
||||
prints("Creating a shortcut link for 'en' didn't work (maybe you "
|
||||
"don't have admin permissions?), but you can still load "
|
||||
"the model via its full package name:",
|
||||
"nlp = spacy.load('%s')" % model_name,
|
||||
title="Download successful")
|
||||
|
||||
|
||||
def get_json(url, desc):
|
||||
|
|
|
@ -11,15 +11,14 @@ from .. import util
|
|||
|
||||
def info(model=None, markdown=False):
|
||||
if model:
|
||||
data_path = util.get_data_path()
|
||||
data = util.parse_package_meta(data_path / model, require=True)
|
||||
model_path = Path(__file__).parent / data_path / model
|
||||
model_path = util.resolve_model_path(model)
|
||||
meta = util.parse_package_meta(model_path)
|
||||
if model_path.resolve() != model_path:
|
||||
data['link'] = path2str(model_path)
|
||||
data['source'] = path2str(model_path.resolve())
|
||||
meta['link'] = path2str(model_path)
|
||||
meta['source'] = path2str(model_path.resolve())
|
||||
else:
|
||||
data['source'] = path2str(model_path)
|
||||
print_info(data, 'model %s' % model, markdown)
|
||||
meta['source'] = path2str(model_path)
|
||||
print_info(meta, 'model %s' % model, markdown)
|
||||
else:
|
||||
data = {'spaCy version': about.__version__,
|
||||
'Location': path2str(Path(__file__).parent.parent),
|
||||
|
|
|
@ -306,25 +306,17 @@ cdef class GoldParse:
|
|||
|
||||
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
|
||||
deps=None, entities=None, make_projective=False):
|
||||
"""
|
||||
Create a GoldParse.
|
||||
"""Create a GoldParse.
|
||||
|
||||
Arguments:
|
||||
doc (Doc):
|
||||
The document the annotations refer to.
|
||||
words:
|
||||
A sequence of unicode word strings.
|
||||
tags:
|
||||
A sequence of strings, representing tag annotations.
|
||||
heads:
|
||||
A sequence of integers, representing syntactic head offsets.
|
||||
deps:
|
||||
A sequence of strings, representing the syntactic relation types.
|
||||
entities:
|
||||
A sequence of named entity annotations, either as BILUO tag strings,
|
||||
or as (start_char, end_char, label) tuples, representing the entity
|
||||
positions.
|
||||
Returns (GoldParse): The newly constructed object.
|
||||
doc (Doc): The document the annotations refer to.
|
||||
words (iterable): A sequence of unicode word strings.
|
||||
tags (iterable): A sequence of strings, representing tag annotations.
|
||||
heads (iterable): A sequence of integers, representing syntactic head offsets.
|
||||
deps (iterable): A sequence of strings, representing the syntactic relation types.
|
||||
entities (iterable): A sequence of named entity annotations, either as
|
||||
BILUO tag strings, or as `(start_char, end_char, label)` tuples,
|
||||
representing the entity positions.
|
||||
RETURNS (GoldParse): The newly constructed object.
|
||||
"""
|
||||
if words is None:
|
||||
words = [token.text for token in doc]
|
||||
|
@ -389,55 +381,45 @@ cdef class GoldParse:
|
|||
self.heads = proj_heads
|
||||
|
||||
def __len__(self):
|
||||
"""
|
||||
Get the number of gold-standard tokens.
|
||||
"""Get the number of gold-standard tokens.
|
||||
|
||||
Returns (int): The number of gold-standard tokens.
|
||||
RETURNS (int): The number of gold-standard tokens.
|
||||
"""
|
||||
return self.length
|
||||
|
||||
@property
|
||||
def is_projective(self):
|
||||
"""
|
||||
Whether the provided syntactic annotations form a projective dependency
|
||||
tree.
|
||||
"""Whether the provided syntactic annotations form a projective
|
||||
dependency tree.
|
||||
"""
|
||||
return not nonproj.is_nonproj_tree(self.heads)
|
||||
|
||||
|
||||
def biluo_tags_from_offsets(doc, entities):
|
||||
"""
|
||||
Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
|
||||
scheme (biluo).
|
||||
"""Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
|
||||
scheme (BILUO).
|
||||
|
||||
Arguments:
|
||||
doc (Doc):
|
||||
The document that the entity offsets refer to. The output tags will
|
||||
refer to the token boundaries within the document.
|
||||
doc (Doc): The document that the entity offsets refer to. The output tags
|
||||
will refer to the token boundaries within the document.
|
||||
entities (iterable): A sequence of `(start, end, label)` triples. `start` and
|
||||
`end` should be character-offset integers denoting the slice into the
|
||||
original string.
|
||||
|
||||
entities (sequence):
|
||||
A sequence of (start, end, label) triples. start and end should be
|
||||
character-offset integers denoting the slice into the original string.
|
||||
RETURNS (list): A list of unicode strings, describing the tags. Each tag
|
||||
string will be of the form either "", "O" or "{action}-{label}", where
|
||||
action is one of "B", "I", "L", "U". The string "-" is used where the
|
||||
entity offsets don't align with the tokenization in the `Doc` object. The
|
||||
training algorithm will view these as missing values. "O" denotes a
|
||||
non-entity token. "B" denotes the beginning of a multi-token entity,
|
||||
"I" the inside of an entity of three or more tokens, and "L" the end
|
||||
of an entity of two or more tokens. "U" denotes a single-token entity.
|
||||
|
||||
Returns:
|
||||
tags (list):
|
||||
A list of unicode strings, describing the tags. Each tag string will
|
||||
be of the form either "", "O" or "{action}-{label}", where action is one
|
||||
of "B", "I", "L", "U". The string "-" is used where the entity
|
||||
offsets don't align with the tokenization in the Doc object. The
|
||||
training algorithm will view these as missing values. "O" denotes
|
||||
a non-entity token. "B" denotes the beginning of a multi-token entity,
|
||||
"I" the inside of an entity of three or more tokens, and "L" the end
|
||||
of an entity of two or more tokens. "U" denotes a single-token entity.
|
||||
|
||||
Example:
|
||||
text = 'I like London.'
|
||||
entities = [(len('I like '), len('I like London'), 'LOC')]
|
||||
doc = nlp.tokenizer(text)
|
||||
|
||||
tags = biluo_tags_from_offsets(doc, entities)
|
||||
|
||||
assert tags == ['O', 'O', 'U-LOC', 'O']
|
||||
EXAMPLE:
|
||||
>>> text = 'I like London.'
|
||||
>>> entities = [(len('I like '), len('I like London'), 'LOC')]
|
||||
>>> doc = nlp.tokenizer(text)
|
||||
>>> tags = biluo_tags_from_offsets(doc, entities)
|
||||
>>> assert tags == ['O', 'O', 'U-LOC', 'O']
|
||||
"""
|
||||
starts = {token.idx: token.i for token in doc}
|
||||
ends = {token.idx+len(token): token.i for token in doc}
|
||||
|
|
|
@ -13,21 +13,23 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class BengaliDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'bn'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tag_map = TAG_MAP
|
||||
stop_words = STOP_WORDS
|
||||
lemma_rules = LEMMA_RULES
|
||||
|
||||
prefixes = tuple(TOKENIZER_PREFIXES)
|
||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||
infixes = tuple(TOKENIZER_INFIXES)
|
||||
|
||||
|
||||
class Bengali(Language):
|
||||
lang = 'bn'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'bn'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tag_map = TAG_MAP
|
||||
stop_words = STOP_WORDS
|
||||
lemma_rules = LEMMA_RULES
|
||||
|
||||
prefixes = tuple(TOKENIZER_PREFIXES)
|
||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||
infixes = tuple(TOKENIZER_INFIXES)
|
||||
Defaults = BengaliDefaults
|
||||
|
||||
|
||||
__all__ = ['Bengali']
|
||||
|
|
|
@ -10,15 +10,17 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class DanishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'da'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
|
||||
class Danish(Language):
|
||||
lang = 'da'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'da'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
Defaults = DanishDefaults
|
||||
|
||||
|
||||
__all__ = ['Danish']
|
||||
|
|
|
@ -14,21 +14,23 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class GermanDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'de'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tag_map = dict(TAG_MAP)
|
||||
stop_words = set(STOP_WORDS)
|
||||
syntax_iterators = dict(SYNTAX_ITERATORS)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
|
||||
|
||||
class German(Language):
|
||||
lang = 'de'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'de'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tag_map = dict(TAG_MAP)
|
||||
stop_words = set(STOP_WORDS)
|
||||
syntax_iterators = dict(SYNTAX_ITERATORS)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
Defaults = GermanDefaults
|
||||
|
||||
|
||||
__all__ = ['German']
|
||||
|
|
|
@ -32,7 +32,6 @@ class EnglishDefaults(Language.Defaults):
|
|||
|
||||
class English(Language):
|
||||
lang = 'en'
|
||||
|
||||
Defaults = EnglishDefaults
|
||||
|
||||
|
||||
|
|
|
@ -28,7 +28,7 @@ class SpanishDefaults(Language.Defaults):
|
|||
|
||||
class Spanish(Language):
|
||||
lang = 'es'
|
||||
|
||||
Defaults = SpanishDefaults
|
||||
|
||||
|
||||
__all__ = ['Spanish']
|
||||
|
|
|
@ -10,15 +10,17 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class FinnishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'fi'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
|
||||
class Finnish(Language):
|
||||
lang = 'fi'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'fi'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
Defaults = FinnishDefaults
|
||||
|
||||
|
||||
__all__ = ['Finnish']
|
||||
|
|
|
@ -13,22 +13,24 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class FrenchDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'fr'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
infixes = tuple(TOKENIZER_INFIXES)
|
||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||
token_match = TOKEN_MATCH
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
|
||||
|
||||
class French(Language):
|
||||
lang = 'fr'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'fr'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
infixes = tuple(TOKENIZER_INFIXES)
|
||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||
token_match = TOKEN_MATCH
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
Defaults = FrenchDefaults
|
||||
|
||||
|
||||
__all__ = ['French']
|
||||
|
|
|
@ -9,15 +9,17 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class HebrewDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'he'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
|
||||
class Hebrew(Language):
|
||||
lang = 'he'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'he'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
Defaults = HebrewDefaults
|
||||
|
||||
|
||||
__all__ = ['Hebrew']
|
||||
|
|
|
@ -13,23 +13,25 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class HungarianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'hu'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
prefixes = tuple(TOKENIZER_PREFIXES)
|
||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||
infixes = tuple(TOKENIZER_INFIXES)
|
||||
token_match = TOKEN_MATCH
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
|
||||
|
||||
class Hungarian(Language):
|
||||
lang = 'hu'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'hu'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
prefixes = tuple(TOKENIZER_PREFIXES)
|
||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||
infixes = tuple(TOKENIZER_INFIXES)
|
||||
token_match = TOKEN_MATCH
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
Defaults = HungarianDefaults
|
||||
|
||||
|
||||
__all__ = ['Hungarian']
|
||||
|
|
|
@ -11,19 +11,21 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class ItalianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'it'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
|
||||
|
||||
class Italian(Language):
|
||||
lang = 'it'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'it'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
Defaults = ItalianDefaults
|
||||
|
||||
|
||||
__all__ = ['Italian']
|
||||
|
|
|
@ -11,15 +11,17 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class NorwegianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'nb'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
|
||||
class Norwegian(Language):
|
||||
lang = 'nb'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'nb'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
Defaults = NorwegianDefaults
|
||||
|
||||
|
||||
__all__ = ['Norwegian']
|
||||
|
|
|
@ -9,16 +9,17 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class DutchDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'nl'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
|
||||
class Dutch(Language):
|
||||
lang = 'nl'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'nl'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
Defaults = DutchDefaults
|
||||
|
||||
|
||||
__all__ = ['Dutch']
|
||||
|
|
|
@ -9,15 +9,17 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class PolishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'pl'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
|
||||
class Polish(Language):
|
||||
lang = 'pl'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'pl'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
Defaults = PolishDefaults
|
||||
|
||||
|
||||
__all__ = ['Polish']
|
||||
|
|
|
@ -13,20 +13,22 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class PortugueseDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'pt'
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
|
||||
|
||||
class Portuguese(Language):
|
||||
lang = 'pt'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'pt'
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
Defaults = PortugueseDefaults
|
||||
|
||||
|
||||
__all__ = ['Portuguese']
|
||||
|
|
|
@ -13,19 +13,21 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class SwedishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'sv'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
|
||||
|
||||
class Swedish(Language):
|
||||
lang = 'sv'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'sv'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
Defaults = SwedishDefaults
|
||||
|
||||
|
||||
__all__ = ['Swedish']
|
||||
|
|
|
@ -116,14 +116,30 @@ class BaseDefaults(object):
|
|||
|
||||
|
||||
class Language(object):
|
||||
"""
|
||||
A text-processing pipeline. Usually you'll load this once per process, and
|
||||
pass the instance around your program.
|
||||
"""A text-processing pipeline. Usually you'll load this once per process,
|
||||
and pass the instance around your application.
|
||||
|
||||
Defaults (class): Settings, data and factory methods for creating the `nlp`
|
||||
object and processing pipeline.
|
||||
lang (unicode): Two-letter language ID, i.e. ISO code.
|
||||
"""
|
||||
Defaults = BaseDefaults
|
||||
lang = None
|
||||
|
||||
def __init__(self, vocab=True, make_doc=True, pipeline=None, meta={}):
|
||||
"""Initialise a Language object.
|
||||
|
||||
vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
|
||||
`Language.Defaults.create_vocab`.
|
||||
make_doc (callable): A function that takes text and returns a `Doc`
|
||||
object. Usually a `Tokenizer`.
|
||||
pipeline (list): A list of annotation processes or IDs of annotation,
|
||||
processes, e.g. a `Tagger` object, or `'tagger'`. IDs are looked
|
||||
up in `Language.Defaults.factories`.
|
||||
meta (dict): Custom meta data for the Language class. Is written to by
|
||||
models to add model meta data.
|
||||
RETURNS (Language): The newly constructed object.
|
||||
"""
|
||||
self.meta = dict(meta)
|
||||
|
||||
if vocab is True:
|
||||
|
@ -147,22 +163,17 @@ class Language(object):
|
|||
self.pipeline = []
|
||||
|
||||
def __call__(self, text, **disabled):
|
||||
"""
|
||||
Apply the pipeline to some text. The text can span multiple sentences,
|
||||
and can contain arbtrary whitespace. Alignment into the original string
|
||||
"""'Apply the pipeline to some text. The text can span multiple sentences,
|
||||
and can contain arbtrary whitespace. Alignment into the original string
|
||||
is preserved.
|
||||
|
||||
Args:
|
||||
text (unicode): The text to be processed.
|
||||
text (unicode): The text to be processed.
|
||||
**disabled: Elements of the pipeline that should not be run.
|
||||
RETURNS (Doc): A container for accessing the annotations.
|
||||
|
||||
Returns:
|
||||
doc (Doc): A container for accessing the annotations.
|
||||
|
||||
Example:
|
||||
>>> from spacy.en import English
|
||||
>>> nlp = English()
|
||||
EXAMPLE:
|
||||
>>> tokens = nlp('An example sentence. Another example sentence.')
|
||||
>>> tokens[0].orth_, tokens[0].head.tag_
|
||||
>>> tokens[0].text, tokens[0].head.tag_
|
||||
('An', 'NN')
|
||||
"""
|
||||
doc = self.make_doc(text)
|
||||
|
@ -174,6 +185,21 @@ class Language(object):
|
|||
return doc
|
||||
|
||||
def update(self, docs, golds, drop=0., sgd=None):
|
||||
"""Update the models in the pipeline.
|
||||
|
||||
docs (iterable): A batch of `Doc` objects.
|
||||
golds (iterable): A batch of `GoldParse` objects.
|
||||
drop (float): The droput rate.
|
||||
sgd (callable): An optimizer.
|
||||
RETURNS (dict): Results from the update.
|
||||
|
||||
EXAMPLE:
|
||||
>>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
|
||||
>>> for epoch in trainer.epochs(gold):
|
||||
>>> for docs, golds in epoch:
|
||||
>>> state = nlp.update(docs, golds, sgd=optimizer)
|
||||
"""
|
||||
|
||||
grads = {}
|
||||
def get_grads(W, dW, key=None):
|
||||
grads[key] = (W, dW)
|
||||
|
@ -204,7 +230,20 @@ class Language(object):
|
|||
for doc, gold in docs_golds:
|
||||
yield doc, gold
|
||||
|
||||
def begin_training(self, get_gold_tuples, **cfg):
|
||||
def begin_training(self, gold_tuples, **cfg):
|
||||
"""Allocate models, pre-process training data and acquire a trainer and
|
||||
optimizer. Used as a contextmanager.
|
||||
|
||||
gold_tuples (iterable): Gold-standard training data.
|
||||
**cfg: Config parameters.
|
||||
YIELDS (tuple): A trainer and an optimizer.
|
||||
|
||||
EXAMPLE:
|
||||
>>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
|
||||
>>> for epoch in trainer.epochs(gold):
|
||||
>>> for docs, golds in epoch:
|
||||
>>> state = nlp.update(docs, golds, sgd=optimizer)
|
||||
"""
|
||||
# Populate vocab
|
||||
for _, annots_brackets in get_gold_tuples():
|
||||
for annots, _ in annots_brackets:
|
||||
|
@ -233,6 +272,17 @@ class Language(object):
|
|||
|
||||
@contextmanager
|
||||
def use_params(self, params, **cfg):
|
||||
"""Replace weights of models in the pipeline with those provided in the
|
||||
params dictionary. Can be used as a contextmanager, in which case,
|
||||
models go back to their original weights after the block.
|
||||
|
||||
params (dict): A dictionary of parameters keyed by model ID.
|
||||
**cfg: Config parameters.
|
||||
|
||||
EXAMPLE:
|
||||
>>> with nlp.use_params(optimizer.averages):
|
||||
>>> nlp.to_disk('/tmp/checkpoint')
|
||||
"""
|
||||
contexts = [pipe.use_params(params) for pipe
|
||||
in self.pipeline if hasattr(pipe, 'use_params')]
|
||||
# TODO: Having trouble with contextlib
|
||||
|
@ -250,16 +300,20 @@ class Language(object):
|
|||
pass
|
||||
|
||||
def pipe(self, texts, n_threads=2, batch_size=1000, **disabled):
|
||||
"""
|
||||
Process texts as a stream, and yield Doc objects in order.
|
||||
"""Process texts as a stream, and yield `Doc` objects in order. Supports
|
||||
GIL-free multi-threading.
|
||||
|
||||
Supports GIL-free multi-threading.
|
||||
texts (iterator): A sequence of texts to process.
|
||||
n_threads (int): The number of worker threads to use. If -1, OpenMP will
|
||||
decide how many to use at run time. Default is 2.
|
||||
batch_size (int): The number of texts to buffer.
|
||||
**disabled: Pipeline components to exclude.
|
||||
YIELDS (Doc): Documents in the order of the original text.
|
||||
|
||||
Arguments:
|
||||
texts (iterator)
|
||||
tag (bool)
|
||||
parse (bool)
|
||||
entity (bool)
|
||||
EXAMPLE:
|
||||
>>> texts = [u'One document.', u'...', u'Lots of documents']
|
||||
>>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
|
||||
>>> assert doc.is_parsed
|
||||
"""
|
||||
#docs = (self.make_doc(text) for text in texts)
|
||||
docs = texts
|
||||
|
@ -267,7 +321,6 @@ class Language(object):
|
|||
name = getattr(proc, 'name', None)
|
||||
if name in disabled and not disabled[name]:
|
||||
continue
|
||||
|
||||
if hasattr(proc, 'pipe'):
|
||||
docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
|
||||
else:
|
||||
|
@ -278,11 +331,12 @@ class Language(object):
|
|||
def to_disk(self, path, **exclude):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
Args:
|
||||
path: A path to a directory, which will be created if it doesn't
|
||||
exist. Paths may be either strings or pathlib.Path-like
|
||||
objects.
|
||||
**exclude: Prevent named attributes from being saved.
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
||||
**exclude: Named attributes to prevent from being saved.
|
||||
|
||||
EXAMPLE:
|
||||
>>> nlp.to_disk('/path/to/models')
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
if not path.exists():
|
||||
|
@ -301,12 +355,17 @@ class Language(object):
|
|||
dill.dump(props, file_)
|
||||
|
||||
def from_disk(self, path, **exclude):
|
||||
"""Load the current state from a directory.
|
||||
"""Loads state from a directory. Modifies the object in place and
|
||||
returns it.
|
||||
|
||||
Args:
|
||||
path: A path to a directory. Paths may be either strings or
|
||||
pathlib.Path-like objects.
|
||||
**exclude: Prevent named attributes from being saved.
|
||||
path (unicode or Path): A path to a directory. Paths may be either
|
||||
strings or `Path`-like objects.
|
||||
**exclude: Named attributes to prevent from being loaded.
|
||||
RETURNS (Language): The modified `Language` object.
|
||||
|
||||
EXAMPLE:
|
||||
>>> from spacy.language import Language
|
||||
>>> nlp = Language().from_disk('/path/to/models')
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
for name in path.iterdir():
|
||||
|
@ -320,10 +379,8 @@ class Language(object):
|
|||
def to_bytes(self, **exclude):
|
||||
"""Serialize the current state to a binary string.
|
||||
|
||||
Args:
|
||||
path: A path to a directory. Paths may be either strings or
|
||||
pathlib.Path-like objects.
|
||||
**exclude: Prevent named attributes from being serialized.
|
||||
**exclude: Named attributes to prevent from being serialized.
|
||||
RETURNS (bytes): The serialized form of the `Language` object.
|
||||
"""
|
||||
props = dict(self.__dict__)
|
||||
for key in exclude:
|
||||
|
@ -334,13 +391,12 @@ class Language(object):
|
|||
def from_bytes(self, bytes_data, **exclude):
|
||||
"""Load state from a binary string.
|
||||
|
||||
Args:
|
||||
bytes_data (bytes): The data to load from.
|
||||
**exclude: Prevent named attributes from being loaded.
|
||||
bytes_data (bytes): The data to load from.
|
||||
**exclude: Named attributes to prevent from being loaded.
|
||||
RETURNS (Language): The `Language` object.
|
||||
"""
|
||||
props = dill.loads(bytes_data)
|
||||
for key, value in props.items():
|
||||
if key not in exclude:
|
||||
setattr(self, key, value)
|
||||
return self
|
||||
|
||||
|
|
|
@ -30,19 +30,16 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
|||
|
||||
|
||||
cdef class Lexeme:
|
||||
"""
|
||||
An entry in the vocabulary. A Lexeme has no string context --- it's a
|
||||
"""An entry in the vocabulary. A `Lexeme` has no string context – it's a
|
||||
word-type, as opposed to a word token. It therefore has no part-of-speech
|
||||
tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
|
||||
tag).
|
||||
"""
|
||||
def __init__(self, Vocab vocab, int orth):
|
||||
"""
|
||||
Create a Lexeme object.
|
||||
"""Create a Lexeme object.
|
||||
|
||||
Arguments:
|
||||
vocab (Vocab): The parent vocabulary
|
||||
orth (int): The orth id of the lexeme.
|
||||
vocab (Vocab): The parent vocabulary
|
||||
orth (int): The orth id of the lexeme.
|
||||
Returns (Lexeme): The newly constructd object.
|
||||
"""
|
||||
self.vocab = vocab
|
||||
|
@ -82,35 +79,28 @@ cdef class Lexeme:
|
|||
return self.c.orth
|
||||
|
||||
def set_flag(self, attr_id_t flag_id, bint value):
|
||||
"""
|
||||
Change the value of a boolean flag.
|
||||
"""Change the value of a boolean flag.
|
||||
|
||||
Arguments:
|
||||
flag_id (int): The attribute ID of the flag to set.
|
||||
value (bool): The new value of the flag.
|
||||
flag_id (int): The attribute ID of the flag to set.
|
||||
value (bool): The new value of the flag.
|
||||
"""
|
||||
Lexeme.c_set_flag(self.c, flag_id, value)
|
||||
|
||||
def check_flag(self, attr_id_t flag_id):
|
||||
"""
|
||||
Check the value of a boolean flag.
|
||||
"""Check the value of a boolean flag.
|
||||
|
||||
Arguments:
|
||||
flag_id (int): The attribute ID of the flag to query.
|
||||
Returns (bool): The value of the flag.
|
||||
flag_id (int): The attribute ID of the flag to query.
|
||||
RETURNS (bool): The value of the flag.
|
||||
"""
|
||||
return True if Lexeme.c_check_flag(self.c, flag_id) else False
|
||||
|
||||
def similarity(self, other):
|
||||
"""
|
||||
Compute a semantic similarity estimate. Defaults to cosine over vectors.
|
||||
"""Compute a semantic similarity estimate. Defaults to cosine over
|
||||
vectors.
|
||||
|
||||
Arguments:
|
||||
other:
|
||||
The object to compare with. By default, accepts Doc, Span,
|
||||
Token and Lexeme objects.
|
||||
Returns:
|
||||
score (float): A scalar similarity score. Higher is more similar.
|
||||
other (object): The object to compare with. By default, accepts `Doc`,
|
||||
`Span`, `Token` and `Lexeme` objects.
|
||||
RETURNS (float): A scalar similarity score. Higher is more similar.
|
||||
"""
|
||||
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||
return 0.0
|
||||
|
@ -140,6 +130,11 @@ cdef class Lexeme:
|
|||
self.orth = self.c.orth
|
||||
|
||||
property has_vector:
|
||||
"""A boolean value indicating whether a word vector is associated with
|
||||
the object.
|
||||
|
||||
RETURNS (bool): Whether a word vector is associated with the object.
|
||||
"""
|
||||
def __get__(self):
|
||||
cdef int i
|
||||
for i in range(self.vocab.vectors_length):
|
||||
|
@ -149,6 +144,10 @@ cdef class Lexeme:
|
|||
return False
|
||||
|
||||
property vector_norm:
|
||||
"""The L2 norm of the lexeme's vector representation.
|
||||
|
||||
RETURNS (float): The L2 norm of the vector representation.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.l2_norm
|
||||
|
||||
|
@ -156,6 +155,11 @@ cdef class Lexeme:
|
|||
self.c.l2_norm = value
|
||||
|
||||
property vector:
|
||||
"""A real-valued meaning representation.
|
||||
|
||||
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
|
||||
representing the lexeme's semantics.
|
||||
"""
|
||||
def __get__(self):
|
||||
cdef int length = self.vocab.vectors_length
|
||||
if length == 0:
|
||||
|
@ -196,6 +200,14 @@ cdef class Lexeme:
|
|||
def __get__(self):
|
||||
return self.vocab.strings[self.c.orth]
|
||||
|
||||
property text:
|
||||
"""A unicode representation of the token text.
|
||||
|
||||
RETURNS (unicode): The original verbatim text of the token.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.orth_
|
||||
|
||||
property lower:
|
||||
def __get__(self): return self.c.lower
|
||||
def __set__(self, int x): self.c.lower = x
|
||||
|
|
|
@ -87,7 +87,7 @@ ctypedef TokenPatternC* TokenPatternC_ptr
|
|||
ctypedef pair[int, TokenPatternC_ptr] StateC
|
||||
|
||||
|
||||
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, attr_t label,
|
||||
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
|
||||
object token_specs) except NULL:
|
||||
pattern = <TokenPatternC*>mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC))
|
||||
cdef int i
|
||||
|
@ -99,15 +99,21 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, attr_t label,
|
|||
pattern[i].attrs[j].attr = attr
|
||||
pattern[i].attrs[j].value = value
|
||||
i = len(token_specs)
|
||||
pattern[i].attrs = <AttrValueC*>mem.alloc(3, sizeof(AttrValueC))
|
||||
pattern[i].attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC))
|
||||
pattern[i].attrs[0].attr = ID
|
||||
pattern[i].attrs[0].value = entity_id
|
||||
pattern[i].attrs[1].attr = ENT_TYPE
|
||||
pattern[i].attrs[1].value = label
|
||||
pattern[i].nr_attr = 0
|
||||
return pattern
|
||||
|
||||
|
||||
cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0:
|
||||
while pattern.nr_attr != 0:
|
||||
pattern += 1
|
||||
id_attr = pattern[0].attrs[0]
|
||||
assert id_attr.attr == ID
|
||||
return id_attr.value
|
||||
|
||||
|
||||
cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
|
||||
for attr in pattern.attrs[:pattern.nr_attr]:
|
||||
if get_token_attr(token, attr.attr) != attr.value:
|
||||
|
@ -159,14 +165,14 @@ def _convert_strings(token_specs, string_store):
|
|||
|
||||
|
||||
def merge_phrase(matcher, doc, i, matches):
|
||||
'''Callback to merge a phrase on match'''
|
||||
"""Callback to merge a phrase on match."""
|
||||
ent_id, label, start, end = matches[i]
|
||||
span = doc[start : end]
|
||||
span.merge(ent_type=label, ent_id=ent_id)
|
||||
|
||||
|
||||
cdef class Matcher:
|
||||
'''Match sequences of tokens, based on pattern rules.'''
|
||||
"""Match sequences of tokens, based on pattern rules."""
|
||||
cdef Pool mem
|
||||
cdef vector[TokenPatternC*] patterns
|
||||
cdef readonly Vocab vocab
|
||||
|
@ -175,37 +181,12 @@ cdef class Matcher:
|
|||
cdef public object _callbacks
|
||||
cdef public object _acceptors
|
||||
|
||||
@classmethod
|
||||
def load(cls, path, vocab):
|
||||
"""
|
||||
Load the matcher and patterns from a file path.
|
||||
def __init__(self, vocab):
|
||||
"""Create the Matcher.
|
||||
|
||||
Arguments:
|
||||
path (Path):
|
||||
Path to a JSON-formatted patterns file.
|
||||
vocab (Vocab):
|
||||
The vocabulary that the documents to match over will refer to.
|
||||
Returns:
|
||||
Matcher: The newly constructed object.
|
||||
"""
|
||||
if (path / 'gazetteer.json').exists():
|
||||
with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
|
||||
patterns = ujson.load(file_)
|
||||
else:
|
||||
patterns = {}
|
||||
return cls(vocab, patterns)
|
||||
|
||||
def __init__(self, vocab, patterns={}):
|
||||
"""
|
||||
Create the Matcher.
|
||||
|
||||
Arguments:
|
||||
vocab (Vocab):
|
||||
The vocabulary object, which must be shared with the documents
|
||||
the matcher will operate on.
|
||||
patterns (dict): Patterns to add to the matcher.
|
||||
Returns:
|
||||
The newly constructed object.
|
||||
vocab (Vocab): The vocabulary object, which must be shared with the
|
||||
documents the matcher will operate on.
|
||||
RETURNS (Matcher): The newly constructed object.
|
||||
"""
|
||||
self._patterns = {}
|
||||
self._entities = {}
|
||||
|
@ -213,144 +194,111 @@ cdef class Matcher:
|
|||
self._callbacks = {}
|
||||
self.vocab = vocab
|
||||
self.mem = Pool()
|
||||
for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
|
||||
self.add_entity(entity_key, attrs)
|
||||
for spec in specs:
|
||||
self.add_pattern(entity_key, spec, label=etype)
|
||||
|
||||
def __reduce__(self):
|
||||
return (self.__class__, (self.vocab, self._patterns), None, None)
|
||||
|
||||
property n_patterns:
|
||||
def __get__(self): return self.patterns.size()
|
||||
def __len__(self):
|
||||
"""Get the number of rules added to the matcher. Note that this only
|
||||
returns the number of rules (identical with the number of IDs), not the
|
||||
number of individual patterns.
|
||||
|
||||
def add_entity(self, entity_key, attrs=None, if_exists='raise',
|
||||
acceptor=None, on_match=None):
|
||||
RETURNS (int): The number of rules.
|
||||
"""
|
||||
Add an entity to the matcher.
|
||||
return len(self._patterns)
|
||||
|
||||
Arguments:
|
||||
entity_key (unicode or int):
|
||||
An ID for the entity.
|
||||
attrs:
|
||||
Attributes to associate with the Matcher.
|
||||
if_exists ('raise', 'ignore' or 'update'):
|
||||
Controls what happens if the entity ID already exists. Defaults to 'raise'.
|
||||
acceptor:
|
||||
Callback function to filter matches of the entity.
|
||||
on_match:
|
||||
Callback function to act on matches of the entity.
|
||||
Returns:
|
||||
None
|
||||
def __contains__(self, key):
|
||||
"""Check whether the matcher contains rules for a match ID.
|
||||
|
||||
key (unicode): The match ID.
|
||||
RETURNS (bool): Whether the matcher contains rules for this match ID.
|
||||
"""
|
||||
if if_exists not in ('raise', 'ignore', 'update'):
|
||||
raise ValueError(
|
||||
"Unexpected value for if_exists: %s.\n"
|
||||
"Expected one of: ['raise', 'ignore', 'update']" % if_exists)
|
||||
if attrs is None:
|
||||
attrs = {}
|
||||
entity_key = self.normalize_entity_key(entity_key)
|
||||
if self.has_entity(entity_key):
|
||||
if if_exists == 'raise':
|
||||
raise KeyError(
|
||||
"Tried to add entity %s. Entity exists, and if_exists='raise'.\n"
|
||||
"Set if_exists='ignore' or if_exists='update', or check with "
|
||||
"matcher.has_entity()")
|
||||
elif if_exists == 'ignore':
|
||||
return
|
||||
self._entities[entity_key] = dict(attrs)
|
||||
self._patterns.setdefault(entity_key, [])
|
||||
self._acceptors[entity_key] = acceptor
|
||||
self._callbacks[entity_key] = on_match
|
||||
return len(self._patterns)
|
||||
|
||||
def add_pattern(self, entity_key, token_specs, label=""):
|
||||
def add(self, key, on_match, *patterns):
|
||||
"""Add a match-rule to the matcher.
|
||||
A match-rule consists of: an ID key, an on_match callback, and one or
|
||||
more patterns. If the key exists, the patterns are appended to the
|
||||
previous ones, and the previous on_match callback is replaced. The
|
||||
`on_match` callback will receive the arguments `(matcher, doc, i,
|
||||
matches)`. You can also set `on_match` to `None` to not perform any
|
||||
actions. A pattern consists of one or more `token_specs`, where a
|
||||
`token_spec` is a dictionary mapping attribute IDs to values. Token
|
||||
descriptors can also include quantifiers. There are currently important
|
||||
known problems with the quantifiers – see the docs.
|
||||
"""
|
||||
Add a pattern to the matcher.
|
||||
for pattern in patterns:
|
||||
if len(pattern) == 0:
|
||||
msg = ("Cannot add pattern for zero tokens to matcher.\n"
|
||||
"key: {key}\n")
|
||||
raise ValueError(msg.format(key=key))
|
||||
key = self._normalize_key(key)
|
||||
self._patterns.setdefault(key, [])
|
||||
self._callbacks[key] = on_match
|
||||
|
||||
Arguments:
|
||||
entity_key (unicode or int):
|
||||
An ID for the entity.
|
||||
token_specs:
|
||||
Description of the pattern to be matched.
|
||||
label:
|
||||
Label to assign to the matched pattern. Defaults to "".
|
||||
Returns:
|
||||
None
|
||||
for pattern in patterns:
|
||||
specs = _convert_strings(pattern, self.vocab.strings)
|
||||
self.patterns.push_back(init_pattern(self.mem, key, specs))
|
||||
self._patterns[key].append(specs)
|
||||
|
||||
def remove(self, key):
|
||||
"""Remove a rule from the matcher. A KeyError is raised if the key does
|
||||
not exist.
|
||||
|
||||
key (unicode): The ID of the match rule.
|
||||
"""
|
||||
token_specs = list(token_specs)
|
||||
if len(token_specs) == 0:
|
||||
msg = ("Cannot add pattern for zero tokens to matcher.\n"
|
||||
"entity_key: {entity_key}\n"
|
||||
"label: {label}")
|
||||
raise ValueError(msg.format(entity_key=entity_key, label=label))
|
||||
entity_key = self.normalize_entity_key(entity_key)
|
||||
if not self.has_entity(entity_key):
|
||||
self.add_entity(entity_key)
|
||||
if isinstance(label, basestring):
|
||||
label = self.vocab.strings[label]
|
||||
elif label is None:
|
||||
label = 0
|
||||
spec = _convert_strings(token_specs, self.vocab.strings)
|
||||
key = self._normalize_key(key)
|
||||
self._patterns.pop(key)
|
||||
self._callbacks.pop(key)
|
||||
cdef int i = 0
|
||||
while i < self.patterns.size():
|
||||
pattern_key = get_pattern_key(self.patterns.at(i))
|
||||
if pattern_key == key:
|
||||
self.patterns.erase(self.patterns.begin()+i)
|
||||
else:
|
||||
i += 1
|
||||
|
||||
self.patterns.push_back(init_pattern(self.mem, entity_key, label, spec))
|
||||
self._patterns[entity_key].append((label, token_specs))
|
||||
def has_key(self, key):
|
||||
"""Check whether the matcher has a rule with a given key.
|
||||
|
||||
def add(self, entity_key, label, attrs, specs, acceptor=None, on_match=None):
|
||||
self.add_entity(entity_key, attrs=attrs, if_exists='update',
|
||||
acceptor=acceptor, on_match=on_match)
|
||||
for spec in specs:
|
||||
self.add_pattern(entity_key, spec, label=label)
|
||||
|
||||
def normalize_entity_key(self, entity_key):
|
||||
if isinstance(entity_key, basestring):
|
||||
return self.vocab.strings[entity_key]
|
||||
else:
|
||||
return entity_key
|
||||
|
||||
def has_entity(self, entity_key):
|
||||
key (string or int): The key to check.
|
||||
RETURNS (bool): Whether the matcher has the rule.
|
||||
"""
|
||||
Check whether the matcher has an entity.
|
||||
key = self._normalize_key(key)
|
||||
return key in self._patterns
|
||||
|
||||
Arguments:
|
||||
entity_key (string or int): The entity key to check.
|
||||
Returns:
|
||||
bool: Whether the matcher has the entity.
|
||||
"""
|
||||
entity_key = self.normalize_entity_key(entity_key)
|
||||
return entity_key in self._entities
|
||||
def get(self, key, default=None):
|
||||
"""Retrieve the pattern stored for a key.
|
||||
|
||||
def get_entity(self, entity_key):
|
||||
key (unicode or int): The key to retrieve.
|
||||
RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
|
||||
"""
|
||||
Retrieve the attributes stored for an entity.
|
||||
key = self._normalize_key(key)
|
||||
if key not in self._patterns:
|
||||
return default
|
||||
return (self._callbacks[key], self._patterns[key])
|
||||
|
||||
Arguments:
|
||||
entity_key (unicode or int): The entity to retrieve.
|
||||
Returns:
|
||||
The entity attributes if present, otherwise None.
|
||||
"""
|
||||
entity_key = self.normalize_entity_key(entity_key)
|
||||
if entity_key in self._entities:
|
||||
return self._entities[entity_key]
|
||||
else:
|
||||
return None
|
||||
def pipe(self, docs, batch_size=1000, n_threads=2):
|
||||
"""Match a stream of documents, yielding them in turn.
|
||||
|
||||
def __call__(self, Doc doc, acceptor=None):
|
||||
docs (iterable): A stream of documents.
|
||||
batch_size (int): The number of documents to accumulate into a working set.
|
||||
n_threads (int): The number of threads with which to work on the buffer
|
||||
in parallel, if the `Matcher` implementation supports multi-threading.
|
||||
YIELDS (Doc): Documents, in order.
|
||||
"""
|
||||
Find all token sequences matching the supplied patterns on the Doc.
|
||||
for doc in docs:
|
||||
self(doc)
|
||||
yield doc
|
||||
|
||||
Arguments:
|
||||
doc (Doc):
|
||||
The document to match over.
|
||||
Returns:
|
||||
list
|
||||
A list of (entity_key, label_id, start, end) tuples,
|
||||
describing the matches. A match tuple describes a span doc[start:end].
|
||||
The label_id and entity_key are both integers.
|
||||
def __call__(self, Doc doc):
|
||||
"""Find all token sequences matching the supplied patterns on the `Doc`.
|
||||
|
||||
doc (Doc): The document to match over.
|
||||
RETURNS (list): A list of `(key, label_id, start, end)` tuples,
|
||||
describing the matches. A match tuple describes a span
|
||||
`doc[start:end]`. The `label_id` and `key` are both integers.
|
||||
"""
|
||||
if acceptor is not None:
|
||||
raise ValueError(
|
||||
"acceptor keyword argument to Matcher deprecated. Specify acceptor "
|
||||
"functions when you add patterns instead.")
|
||||
cdef vector[StateC] partials
|
||||
cdef int n_partials = 0
|
||||
cdef int q = 0
|
||||
|
@ -388,13 +336,7 @@ cdef class Matcher:
|
|||
end = token_i+1
|
||||
ent_id = state.second[1].attrs[0].value
|
||||
label = state.second[1].attrs[1].value
|
||||
acceptor = self._acceptors.get(ent_id)
|
||||
if acceptor is None:
|
||||
matches.append((ent_id, label, start, end))
|
||||
else:
|
||||
match = acceptor(doc, ent_id, label, start, end)
|
||||
if match:
|
||||
matches.append(match)
|
||||
matches.append((ent_id, start, end))
|
||||
partials.resize(q)
|
||||
# Check whether we open any new patterns on this token
|
||||
for pattern in self.patterns:
|
||||
|
@ -419,13 +361,7 @@ cdef class Matcher:
|
|||
end = token_i+1
|
||||
ent_id = pattern[1].attrs[0].value
|
||||
label = pattern[1].attrs[1].value
|
||||
acceptor = self._acceptors.get(ent_id)
|
||||
if acceptor is None:
|
||||
matches.append((ent_id, label, start, end))
|
||||
else:
|
||||
match = acceptor(doc, ent_id, label, start, end)
|
||||
if match:
|
||||
matches.append(match)
|
||||
matches.append((ent_id, start, end))
|
||||
# Look for open patterns that are actually satisfied
|
||||
for state in partials:
|
||||
while state.second.quantifier in (ZERO, ZERO_PLUS):
|
||||
|
@ -435,36 +371,19 @@ cdef class Matcher:
|
|||
end = len(doc)
|
||||
ent_id = state.second.attrs[0].value
|
||||
label = state.second.attrs[0].value
|
||||
acceptor = self._acceptors.get(ent_id)
|
||||
if acceptor is None:
|
||||
matches.append((ent_id, label, start, end))
|
||||
else:
|
||||
match = acceptor(doc, ent_id, label, start, end)
|
||||
if match:
|
||||
matches.append(match)
|
||||
matches.append((ent_id, start, end))
|
||||
for i, (ent_id, label, start, end) in enumerate(matches):
|
||||
on_match = self._callbacks.get(ent_id)
|
||||
if on_match is not None:
|
||||
on_match(self, doc, i, matches)
|
||||
# TODO: only return (match_id, start, end)
|
||||
return matches
|
||||
|
||||
def pipe(self, docs, batch_size=1000, n_threads=2):
|
||||
"""
|
||||
Match a stream of documents, yielding them in turn.
|
||||
|
||||
Arguments:
|
||||
docs: A stream of documents.
|
||||
batch_size (int):
|
||||
The number of documents to accumulate into a working set.
|
||||
n_threads (int):
|
||||
The number of threads with which to work on the buffer in parallel,
|
||||
if the Matcher implementation supports multi-threading.
|
||||
Yields:
|
||||
Doc Documents, in order.
|
||||
"""
|
||||
for doc in docs:
|
||||
self(doc)
|
||||
yield doc
|
||||
def _normalize_key(self, key):
|
||||
if isinstance(key, basestring):
|
||||
return self.vocab.strings[key]
|
||||
else:
|
||||
return key
|
||||
|
||||
|
||||
def get_bilou(length):
|
||||
|
|
|
@ -38,33 +38,71 @@ from .parts_of_speech import X
|
|||
|
||||
|
||||
class TokenVectorEncoder(object):
|
||||
'''Assign position-sensitive vectors to tokens, using a CNN or RNN.'''
|
||||
"""Assign position-sensitive vectors to tokens, using a CNN or RNN."""
|
||||
name = 'tok2vec'
|
||||
|
||||
@classmethod
|
||||
def Model(cls, width=128, embed_size=5000, **cfg):
|
||||
"""Create a new statistical model for the class.
|
||||
|
||||
width (int): Output size of the model.
|
||||
embed_size (int): Number of vectors in the embedding table.
|
||||
**cfg: Config parameters.
|
||||
RETURNS (Model): A `thinc.neural.Model` or similar instance.
|
||||
"""
|
||||
width = util.env_opt('token_vector_width', width)
|
||||
embed_size = util.env_opt('embed_size', embed_size)
|
||||
return Tok2Vec(width, embed_size, preprocess=None)
|
||||
|
||||
def __init__(self, vocab, model=True, **cfg):
|
||||
"""Construct a new statistical model. Weights are not allocated on
|
||||
initialisation.
|
||||
|
||||
vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab`
|
||||
instance with the `Doc` objects it will process.
|
||||
model (Model): A `Model` instance or `True` allocate one later.
|
||||
**cfg: Config parameters.
|
||||
|
||||
EXAMPLE:
|
||||
>>> from spacy.pipeline import TokenVectorEncoder
|
||||
>>> tok2vec = TokenVectorEncoder(nlp.vocab)
|
||||
>>> tok2vec.model = tok2vec.Model(128, 5000)
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.doc2feats = doc2feats()
|
||||
self.model = model
|
||||
|
||||
def __call__(self, docs):
|
||||
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
|
||||
model. Vectors are set to the `Doc.tensor` attribute.
|
||||
|
||||
docs (Doc or iterable): One or more documents to add vectors to.
|
||||
RETURNS (dict or None): Intermediate computations.
|
||||
"""
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
tokvecses = self.predict(docs)
|
||||
self.set_annotations(docs, tokvecses)
|
||||
|
||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
||||
"""Process `Doc` objects as a stream.
|
||||
|
||||
stream (iterator): A sequence of `Doc` objects to process.
|
||||
batch_size (int): Number of `Doc` objects to group.
|
||||
n_threads (int): Number of threads.
|
||||
YIELDS (iterator): A sequence of `Doc` objects, in order of input.
|
||||
"""
|
||||
for docs in cytoolz.partition_all(batch_size, stream):
|
||||
tokvecses = self.predict(docs)
|
||||
self.set_annotations(docs, tokvecses)
|
||||
yield from docs
|
||||
|
||||
def predict(self, docs):
|
||||
"""Return a single tensor for a batch of documents.
|
||||
|
||||
docs (iterable): A sequence of `Doc` objects.
|
||||
RETURNS (object): Vector representations for each token in the documents.
|
||||
"""
|
||||
feats = self.doc2feats(docs)
|
||||
tokvecs = self.model(feats)
|
||||
return tokvecs
|
||||
|
@ -73,7 +111,26 @@ class TokenVectorEncoder(object):
|
|||
for doc, tokvecs in zip(docs, tokvecses):
|
||||
doc.tensor = tokvecs
|
||||
|
||||
def begin_update(self, docs, drop=0.):
|
||||
def set_annotations(self, docs, tokvecs):
|
||||
"""Set the tensor attribute for a batch of documents.
|
||||
|
||||
docs (iterable): A sequence of `Doc` objects.
|
||||
tokvecs (object): Vector representation for each token in the documents.
|
||||
"""
|
||||
start = 0
|
||||
for doc in docs:
|
||||
doc.tensor = tokvecs[start : start + len(doc)]
|
||||
start += len(doc)
|
||||
|
||||
def update(self, docs, golds, state=None, drop=0., sgd=None):
|
||||
"""Update the model.
|
||||
|
||||
docs (iterable): A batch of `Doc` objects.
|
||||
golds (iterable): A batch of `GoldParse` objects.
|
||||
drop (float): The droput rate.
|
||||
sgd (callable): An optimizer.
|
||||
RETURNS (dict): Results from the update.
|
||||
"""
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
feats = self.doc2feats(docs)
|
||||
|
@ -81,14 +138,26 @@ class TokenVectorEncoder(object):
|
|||
return tokvecs, bp_tokvecs
|
||||
|
||||
def get_loss(self, docs, golds, scores):
|
||||
# TODO: implement
|
||||
raise NotImplementedError
|
||||
|
||||
def begin_training(self, gold_tuples, pipeline=None):
|
||||
"""Allocate models, pre-process training data and acquire a trainer and
|
||||
optimizer.
|
||||
|
||||
gold_tuples (iterable): Gold-standard training data.
|
||||
pipeline (list): The pipeline the model is part of.
|
||||
"""
|
||||
self.doc2feats = doc2feats()
|
||||
if self.model is True:
|
||||
self.model = self.Model()
|
||||
|
||||
def use_params(self, params):
|
||||
"""Replace weights of models in the pipeline with those provided in the
|
||||
params dictionary.
|
||||
|
||||
params (dict): A dictionary of parameters keyed by model ID.
|
||||
"""
|
||||
with self.model.use_params(params):
|
||||
yield
|
||||
|
||||
|
@ -189,9 +258,7 @@ class NeuralTagger(object):
|
|||
|
||||
|
||||
cdef class EntityRecognizer(LinearParser):
|
||||
"""
|
||||
Annotate named entities on Doc objects.
|
||||
"""
|
||||
"""Annotate named entities on Doc objects."""
|
||||
TransitionSystem = BiluoPushDown
|
||||
|
||||
feature_templates = get_feature_templates('ner')
|
||||
|
@ -203,9 +270,7 @@ cdef class EntityRecognizer(LinearParser):
|
|||
|
||||
|
||||
cdef class BeamEntityRecognizer(BeamParser):
|
||||
"""
|
||||
Annotate named entities on Doc objects.
|
||||
"""
|
||||
"""Annotate named entities on Doc objects."""
|
||||
TransitionSystem = BiluoPushDown
|
||||
|
||||
feature_templates = get_feature_templates('ner')
|
||||
|
|
|
@ -11,8 +11,6 @@ from preshed.maps cimport map_iter, key_t
|
|||
from .typedefs cimport hash_t
|
||||
from libc.stdint cimport uint32_t
|
||||
|
||||
import ujson
|
||||
|
||||
|
||||
cpdef hash_t hash_string(unicode string) except 0:
|
||||
chars = string.encode('utf8')
|
||||
|
@ -72,15 +70,12 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex
|
|||
|
||||
|
||||
cdef class StringStore:
|
||||
"""
|
||||
Map strings to and from integer IDs.
|
||||
"""
|
||||
"""Map strings to and from integer IDs."""
|
||||
def __init__(self, strings=None, freeze=False):
|
||||
"""
|
||||
Create the StringStore.
|
||||
"""Create the StringStore.
|
||||
|
||||
Arguments:
|
||||
strings: A sequence of unicode strings to add to the store.
|
||||
strings (iterable): A sequence of unicode strings to add to the store.
|
||||
RETURNS (StringStore): The newly constructed object.
|
||||
"""
|
||||
self.mem = Pool()
|
||||
self._map = PreshMap()
|
||||
|
@ -106,23 +101,17 @@ cdef class StringStore:
|
|||
return (StringStore, (list(self),))
|
||||
|
||||
def __len__(self):
|
||||
"""
|
||||
The number of strings in the store.
|
||||
"""The number of strings in the store.
|
||||
|
||||
Returns:
|
||||
int The number of strings in the store.
|
||||
RETURNS (int): The number of strings in the store.
|
||||
"""
|
||||
return self.size-1
|
||||
|
||||
def __getitem__(self, object string_or_id):
|
||||
"""
|
||||
Retrieve a string from a given integer ID, or vice versa.
|
||||
"""Retrieve a string from a given integer ID, or vice versa.
|
||||
|
||||
Arguments:
|
||||
string_or_id (bytes or unicode or int):
|
||||
The value to encode.
|
||||
Returns:
|
||||
unicode or int: The value to retrieved.
|
||||
string_or_id (bytes or unicode or int): The value to encode.
|
||||
Returns (unicode or int): The value to be retrieved.
|
||||
"""
|
||||
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
|
||||
return 0
|
||||
|
@ -163,13 +152,10 @@ cdef class StringStore:
|
|||
return utf8str - self.c
|
||||
|
||||
def __contains__(self, unicode string not None):
|
||||
"""
|
||||
Check whether a string is in the store.
|
||||
"""Check whether a string is in the store.
|
||||
|
||||
Arguments:
|
||||
string (unicode): The string to check.
|
||||
Returns bool:
|
||||
Whether the store contains the string.
|
||||
string (unicode): The string to check.
|
||||
RETURNS (bool): Whether the store contains the string.
|
||||
"""
|
||||
if len(string) == 0:
|
||||
return True
|
||||
|
@ -177,10 +163,9 @@ cdef class StringStore:
|
|||
return self._map.get(key) is not NULL
|
||||
|
||||
def __iter__(self):
|
||||
"""
|
||||
Iterate over the strings in the store, in order.
|
||||
"""Iterate over the strings in the store, in order.
|
||||
|
||||
Yields: unicode A string in the store.
|
||||
YIELDS (unicode): A string in the store.
|
||||
"""
|
||||
cdef int i
|
||||
for i in range(self.size):
|
||||
|
@ -195,6 +180,41 @@ cdef class StringStore:
|
|||
strings.append(py_string)
|
||||
return (StringStore, (strings,), None, None, None)
|
||||
|
||||
def to_disk(self, path):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def from_disk(self, path):
|
||||
"""Loads state from a directory. Modifies the object in place and
|
||||
returns it.
|
||||
|
||||
path (unicode or Path): A path to a directory. Paths may be either
|
||||
strings or `Path`-like objects.
|
||||
RETURNS (StringStore): The modified `StringStore` object.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
"""Serialize the current state to a binary string.
|
||||
|
||||
**exclude: Named attributes to prevent from being serialized.
|
||||
RETURNS (bytes): The serialized form of the `StringStore` object.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
"""Load state from a binary string.
|
||||
|
||||
bytes_data (bytes): The data to load from.
|
||||
**exclude: Named attributes to prevent from being loaded.
|
||||
RETURNS (StringStore): The `StringStore` object.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def set_frozen(self, bint is_frozen):
|
||||
# TODO
|
||||
self.is_frozen = is_frozen
|
||||
|
@ -235,40 +255,6 @@ cdef class StringStore:
|
|||
self.size += 1
|
||||
return &self.c[self.size-1]
|
||||
|
||||
def dump(self, file_):
|
||||
"""
|
||||
Save the strings to a JSON file.
|
||||
|
||||
Arguments:
|
||||
file_ (buffer): The file to save the strings.
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
string_data = ujson.dumps(list(self))
|
||||
if not isinstance(string_data, unicode):
|
||||
string_data = string_data.decode('utf8')
|
||||
# TODO: OOV?
|
||||
file_.write(string_data)
|
||||
|
||||
def load(self, file_):
|
||||
"""
|
||||
Load the strings from a JSON file.
|
||||
|
||||
Arguments:
|
||||
file_ (buffer): The file from which to load the strings.
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
strings = ujson.load(file_)
|
||||
if strings == ['']:
|
||||
return None
|
||||
cdef unicode string
|
||||
for string in strings:
|
||||
# explicit None/len check instead of simple truth testing
|
||||
# (bug in Cython <= 0.23.4)
|
||||
if string is not None and len(string):
|
||||
self.intern_unicode(string)
|
||||
|
||||
def _realloc(self):
|
||||
# We want to map straight to pointers, but they'll be invalidated if
|
||||
# we resize our array. So, first we remap to indices, then we resize,
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import ujson
|
||||
from collections import defaultdict
|
||||
|
||||
from cymem.cymem cimport Pool
|
||||
|
@ -15,7 +14,6 @@ from .tokens.doc cimport Doc
|
|||
from .attrs cimport TAG
|
||||
from .gold cimport GoldParse
|
||||
from .attrs cimport *
|
||||
from . import util
|
||||
|
||||
|
||||
cpdef enum:
|
||||
|
@ -108,55 +106,15 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
|
|||
|
||||
|
||||
cdef class Tagger:
|
||||
"""
|
||||
Annotate part-of-speech tags on Doc objects.
|
||||
"""
|
||||
@classmethod
|
||||
def load(cls, path, vocab, require=False):
|
||||
"""
|
||||
Load the statistical model from the supplied path.
|
||||
|
||||
Arguments:
|
||||
path (Path):
|
||||
The path to load from.
|
||||
vocab (Vocab):
|
||||
The vocabulary. Must be shared by the documents to be processed.
|
||||
require (bool):
|
||||
Whether to raise an error if the files are not found.
|
||||
Returns (Tagger):
|
||||
The newly created object.
|
||||
"""
|
||||
# TODO: Change this to expect config.json when we don't have to
|
||||
# support old data.
|
||||
path = util.ensure_path(path)
|
||||
if (path / 'templates.json').exists():
|
||||
with (path / 'templates.json').open('r', encoding='utf8') as file_:
|
||||
templates = ujson.load(file_)
|
||||
elif require:
|
||||
raise IOError(
|
||||
"Required file %s/templates.json not found when loading Tagger" % str(path))
|
||||
else:
|
||||
templates = cls.feature_templates
|
||||
self = cls(vocab, model=None, feature_templates=templates)
|
||||
|
||||
if (path / 'model').exists():
|
||||
self.model.load(str(path / 'model'))
|
||||
elif require:
|
||||
raise IOError(
|
||||
"Required file %s/model not found when loading Tagger" % str(path))
|
||||
return self
|
||||
"""Annotate part-of-speech tags on Doc objects."""
|
||||
|
||||
def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
|
||||
"""
|
||||
Create a Tagger.
|
||||
"""Create a Tagger.
|
||||
|
||||
Arguments:
|
||||
vocab (Vocab):
|
||||
The vocabulary object. Must be shared with documents to be processed.
|
||||
model (thinc.linear.AveragedPerceptron):
|
||||
The statistical model.
|
||||
Returns (Tagger):
|
||||
The newly constructed object.
|
||||
vocab (Vocab): The vocabulary object. Must be shared with documents to
|
||||
be processed.
|
||||
model (thinc.linear.AveragedPerceptron): The statistical model.
|
||||
RETURNS (Tagger): The newly constructed object.
|
||||
"""
|
||||
if model is None:
|
||||
model = TaggerModel(cfg.get('features', self.feature_templates),
|
||||
|
@ -186,13 +144,9 @@ cdef class Tagger:
|
|||
tokens._py_tokens = [None] * tokens.length
|
||||
|
||||
def __call__(self, Doc tokens):
|
||||
"""
|
||||
Apply the tagger, setting the POS tags onto the Doc object.
|
||||
"""Apply the tagger, setting the POS tags onto the Doc object.
|
||||
|
||||
Arguments:
|
||||
doc (Doc): The tokens to be tagged.
|
||||
Returns:
|
||||
None
|
||||
doc (Doc): The tokens to be tagged.
|
||||
"""
|
||||
if tokens.length == 0:
|
||||
return 0
|
||||
|
@ -215,34 +169,25 @@ cdef class Tagger:
|
|||
tokens._py_tokens = [None] * tokens.length
|
||||
|
||||
def pipe(self, stream, batch_size=1000, n_threads=2):
|
||||
"""
|
||||
Tag a stream of documents.
|
||||
"""Tag a stream of documents.
|
||||
|
||||
Arguments:
|
||||
stream: The sequence of documents to tag.
|
||||
batch_size (int):
|
||||
The number of documents to accumulate into a working set.
|
||||
n_threads (int):
|
||||
The number of threads with which to work on the buffer in parallel,
|
||||
if the Matcher implementation supports multi-threading.
|
||||
Yields:
|
||||
Doc Documents, in order.
|
||||
stream: The sequence of documents to tag.
|
||||
batch_size (int): The number of documents to accumulate into a working set.
|
||||
n_threads (int): The number of threads with which to work on the buffer
|
||||
in parallel, if the Matcher implementation supports multi-threading.
|
||||
YIELDS (Doc): Documents, in order.
|
||||
"""
|
||||
for doc in stream:
|
||||
self(doc)
|
||||
yield doc
|
||||
|
||||
def update(self, Doc tokens, GoldParse gold, itn=0):
|
||||
"""
|
||||
Update the statistical model, with tags supplied for the given document.
|
||||
"""Update the statistical model, with tags supplied for the given document.
|
||||
|
||||
Arguments:
|
||||
doc (Doc):
|
||||
The document to update on.
|
||||
gold (GoldParse):
|
||||
Manager for the gold-standard tags.
|
||||
Returns (int):
|
||||
Number of tags correct.
|
||||
doc (Doc): The document to update on.
|
||||
gold (GoldParse): Manager for the gold-standard tags.
|
||||
RETURNS (int): Number of tags predicted correctly.
|
||||
"""
|
||||
gold_tag_strs = gold.tags
|
||||
assert len(tokens) == len(gold_tag_strs)
|
||||
|
|
|
@ -99,8 +99,8 @@ def test_doc_token_api_ancestors(en_tokenizer):
|
|||
assert [t.text for t in doc[1].ancestors] == ["saw"]
|
||||
assert [t.text for t in doc[2].ancestors] == []
|
||||
|
||||
assert doc[2].is_ancestor_of(doc[7])
|
||||
assert not doc[6].is_ancestor_of(doc[2])
|
||||
assert doc[2].is_ancestor(doc[7])
|
||||
assert not doc[6].is_ancestor(doc[2])
|
||||
|
||||
|
||||
def test_doc_token_api_head_setter(en_tokenizer):
|
||||
|
|
|
@ -2,8 +2,6 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import ujson
|
||||
|
||||
from cython.operator cimport dereference as deref
|
||||
from cython.operator cimport preincrement as preinc
|
||||
from cymem.cymem cimport Pool
|
||||
|
@ -12,75 +10,31 @@ from preshed.maps cimport PreshMap
|
|||
from .strings cimport hash_string
|
||||
cimport cython
|
||||
|
||||
from . import util
|
||||
from .tokens.doc cimport Doc
|
||||
|
||||
|
||||
cdef class Tokenizer:
|
||||
"""Segment text, and create Doc objects with the discovered segment
|
||||
boundaries.
|
||||
"""
|
||||
Segment text, and create Doc objects with the discovered segment boundaries.
|
||||
"""
|
||||
@classmethod
|
||||
def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None,
|
||||
infix_finditer=None, token_match=None):
|
||||
"""
|
||||
Load a Tokenizer, reading unsupplied components from the path.
|
||||
|
||||
Arguments:
|
||||
path (Path):
|
||||
The path to load from.
|
||||
vocab (Vocab):
|
||||
A storage container for lexical types.
|
||||
rules (dict):
|
||||
Exceptions and special-cases for the tokenizer.
|
||||
token_match:
|
||||
A boolean function matching strings that becomes tokens.
|
||||
prefix_search:
|
||||
Signature of re.compile(string).search
|
||||
suffix_search:
|
||||
Signature of re.compile(string).search
|
||||
infix_finditer:
|
||||
Signature of re.compile(string).finditer
|
||||
Returns Tokenizer
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
if rules is None:
|
||||
with (path / 'tokenizer' / 'specials.json').open('r', encoding='utf8') as file_:
|
||||
rules = ujson.load(file_)
|
||||
if prefix_search in (None, True):
|
||||
with (path / 'tokenizer' / 'prefix.txt').open() as file_:
|
||||
entries = file_.read().split('\n')
|
||||
prefix_search = util.compile_prefix_regex(entries).search
|
||||
if suffix_search in (None, True):
|
||||
with (path / 'tokenizer' / 'suffix.txt').open() as file_:
|
||||
entries = file_.read().split('\n')
|
||||
suffix_search = util.compile_suffix_regex(entries).search
|
||||
if infix_finditer in (None, True):
|
||||
with (path / 'tokenizer' / 'infix.txt').open() as file_:
|
||||
entries = file_.read().split('\n')
|
||||
infix_finditer = util.compile_infix_regex(entries).finditer
|
||||
return cls(vocab, rules, prefix_search, suffix_search, infix_finditer, token_match)
|
||||
|
||||
def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None):
|
||||
"""
|
||||
Create a Tokenizer, to create Doc objects given unicode text.
|
||||
"""Create a `Tokenizer`, to create `Doc` objects given unicode text.
|
||||
|
||||
Arguments:
|
||||
vocab (Vocab):
|
||||
A storage container for lexical types.
|
||||
rules (dict):
|
||||
Exceptions and special-cases for the tokenizer.
|
||||
prefix_search:
|
||||
A function matching the signature of re.compile(string).search
|
||||
to match prefixes.
|
||||
suffix_search:
|
||||
A function matching the signature of re.compile(string).search
|
||||
to match suffixes.
|
||||
infix_finditer:
|
||||
A function matching the signature of re.compile(string).finditer
|
||||
to find infixes.
|
||||
token_match:
|
||||
A boolean function matching strings that becomes tokens.
|
||||
vocab (Vocab): A storage container for lexical types.
|
||||
rules (dict): Exceptions and special-cases for the tokenizer.
|
||||
prefix_search (callable): A function matching the signature of
|
||||
`re.compile(string).search` to match prefixes.
|
||||
suffix_search (callable): A function matching the signature of
|
||||
`re.compile(string).search` to match suffixes.
|
||||
`infix_finditer` (callable): A function matching the signature of
|
||||
`re.compile(string).finditer` to find infixes.
|
||||
token_match (callable): A boolean function matching strings to be
|
||||
recognised as tokens.
|
||||
RETURNS (Tokenizer): The newly constructed object.
|
||||
|
||||
EXAMPLE:
|
||||
>>> tokenizer = Tokenizer(nlp.vocab)
|
||||
>>> tokenizer = English().Defaults.create_tokenizer(nlp)
|
||||
"""
|
||||
self.mem = Pool()
|
||||
self._cache = PreshMap()
|
||||
|
@ -112,13 +66,10 @@ cdef class Tokenizer:
|
|||
|
||||
@cython.boundscheck(False)
|
||||
def __call__(self, unicode string):
|
||||
"""
|
||||
Tokenize a string.
|
||||
"""Tokenize a string.
|
||||
|
||||
Arguments:
|
||||
string (unicode): The string to tokenize.
|
||||
Returns:
|
||||
Doc A container for linguistic annotations.
|
||||
string (unicode): The string to tokenize.
|
||||
RETURNS (Doc): A container for linguistic annotations.
|
||||
"""
|
||||
if len(string) >= (2 ** 30):
|
||||
raise ValueError(
|
||||
|
@ -166,18 +117,13 @@ cdef class Tokenizer:
|
|||
return tokens
|
||||
|
||||
def pipe(self, texts, batch_size=1000, n_threads=2):
|
||||
"""
|
||||
Tokenize a stream of texts.
|
||||
"""Tokenize a stream of texts.
|
||||
|
||||
Arguments:
|
||||
texts: A sequence of unicode texts.
|
||||
batch_size (int):
|
||||
The number of texts to accumulate in an internal buffer.
|
||||
n_threads (int):
|
||||
The number of threads to use, if the implementation supports
|
||||
multi-threading. The default tokenizer is single-threaded.
|
||||
Yields:
|
||||
Doc A sequence of Doc objects, in order.
|
||||
texts: A sequence of unicode texts.
|
||||
batch_size (int): The number of texts to accumulate in an internal buffer.
|
||||
n_threads (int): The number of threads to use, if the implementation
|
||||
supports multi-threading. The default tokenizer is single-threaded.
|
||||
YIELDS (Doc): A sequence of Doc objects, in order.
|
||||
"""
|
||||
for text in texts:
|
||||
yield self(text)
|
||||
|
@ -321,27 +267,23 @@ cdef class Tokenizer:
|
|||
self._cache.set(key, cached)
|
||||
|
||||
def find_infix(self, unicode string):
|
||||
"""
|
||||
Find internal split points of the string, such as hyphens.
|
||||
"""Find internal split points of the string, such as hyphens.
|
||||
|
||||
string (unicode): The string to segment.
|
||||
|
||||
Returns List[re.MatchObject]
|
||||
A list of objects that have .start() and .end() methods, denoting the
|
||||
placement of internal segment separators, e.g. hyphens.
|
||||
RETURNS (list): A list of `re.MatchObject` objects that have `.start()`
|
||||
and `.end()` methods, denoting the placement of internal segment
|
||||
separators, e.g. hyphens.
|
||||
"""
|
||||
if self.infix_finditer is None:
|
||||
return 0
|
||||
return list(self.infix_finditer(string))
|
||||
|
||||
def find_prefix(self, unicode string):
|
||||
"""
|
||||
Find the length of a prefix that should be segmented from the string,
|
||||
"""Find the length of a prefix that should be segmented from the string,
|
||||
or None if no prefix rules match.
|
||||
|
||||
Arguments:
|
||||
string (unicode): The string to segment.
|
||||
Returns (int or None): The length of the prefix if present, otherwise None.
|
||||
string (unicode): The string to segment.
|
||||
RETURNS (int): The length of the prefix if present, otherwise `None`.
|
||||
"""
|
||||
if self.prefix_search is None:
|
||||
return 0
|
||||
|
@ -349,13 +291,11 @@ cdef class Tokenizer:
|
|||
return (match.end() - match.start()) if match is not None else 0
|
||||
|
||||
def find_suffix(self, unicode string):
|
||||
"""
|
||||
Find the length of a suffix that should be segmented from the string,
|
||||
"""Find the length of a suffix that should be segmented from the string,
|
||||
or None if no suffix rules match.
|
||||
|
||||
Arguments:
|
||||
string (unicode): The string to segment.
|
||||
Returns (int or None): The length of the suffix if present, otherwise None.
|
||||
string (unicode): The string to segment.
|
||||
Returns (int): The length of the suffix if present, otherwise `None`.
|
||||
"""
|
||||
if self.suffix_search is None:
|
||||
return 0
|
||||
|
@ -363,23 +303,17 @@ cdef class Tokenizer:
|
|||
return (match.end() - match.start()) if match is not None else 0
|
||||
|
||||
def _load_special_tokenization(self, special_cases):
|
||||
"""
|
||||
Add special-case tokenization rules.
|
||||
"""
|
||||
"""Add special-case tokenization rules."""
|
||||
for chunk, substrings in sorted(special_cases.items()):
|
||||
self.add_special_case(chunk, substrings)
|
||||
|
||||
def add_special_case(self, unicode string, substrings):
|
||||
"""
|
||||
Add a special-case tokenization rule.
|
||||
"""Add a special-case tokenization rule.
|
||||
|
||||
Arguments:
|
||||
string (unicode): The string to specially tokenize.
|
||||
token_attrs:
|
||||
A sequence of dicts, where each dict describes a token and its
|
||||
attributes. The ORTH fields of the attributes must exactly match
|
||||
the string when they are concatenated.
|
||||
Returns None
|
||||
string (unicode): The string to specially tokenize.
|
||||
token_attrs (iterable): A sequence of dicts, where each dict describes
|
||||
a token and its attributes. The `ORTH` fields of the attributes must
|
||||
exactly match the string when they are concatenated.
|
||||
"""
|
||||
substrings = list(substrings)
|
||||
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
||||
|
@ -390,3 +324,38 @@ cdef class Tokenizer:
|
|||
self._specials.set(key, cached)
|
||||
self._cache.set(key, cached)
|
||||
self._rules[string] = substrings
|
||||
|
||||
def to_disk(self, path):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def from_disk(self, path):
|
||||
"""Loads state from a directory. Modifies the object in place and
|
||||
returns it.
|
||||
|
||||
path (unicode or Path): A path to a directory. Paths may be either
|
||||
strings or `Path`-like objects.
|
||||
RETURNS (Tokenizer): The modified `Tokenizer` object.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
"""Serialize the current state to a binary string.
|
||||
|
||||
**exclude: Named attributes to prevent from being serialized.
|
||||
RETURNS (bytes): The serialized form of the `Tokenizer` object.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
"""Load state from a binary string.
|
||||
|
||||
bytes_data (bytes): The data to load from.
|
||||
**exclude: Named attributes to prevent from being loaded.
|
||||
RETURNS (Tokenizer): The `Tokenizer` object.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
|
|
@ -63,40 +63,30 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
|
|||
|
||||
|
||||
cdef class Doc:
|
||||
"""
|
||||
A sequence of `Token` objects. Access sentences and named entities,
|
||||
export annotations to numpy arrays, losslessly serialize to compressed
|
||||
binary strings.
|
||||
"""A sequence of Token objects. Access sentences and named entities, export
|
||||
annotations to numpy arrays, losslessly serialize to compressed binary strings.
|
||||
The `Doc` object holds an array of `TokenC` structs. The Python-level
|
||||
`Token` and `Span` objects are views of this array, i.e. they don't own
|
||||
the data themselves.
|
||||
|
||||
Aside: Internals
|
||||
The `Doc` object holds an array of `TokenC` structs.
|
||||
The Python-level `Token` and `Span` objects are views of this
|
||||
array, i.e. they don't own the data themselves.
|
||||
|
||||
Code: Construction 1
|
||||
doc = nlp.tokenizer(u'Some text')
|
||||
|
||||
Code: Construction 2
|
||||
doc = Doc(nlp.vocab, orths_and_spaces=[(u'Some', True), (u'text', True)])
|
||||
EXAMPLE: Construction 1
|
||||
>>> doc = nlp(u'Some text')
|
||||
|
||||
Construction 2
|
||||
>>> from spacy.tokens import Doc
|
||||
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
|
||||
"""
|
||||
def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
|
||||
"""
|
||||
Create a Doc object.
|
||||
"""Create a Doc object.
|
||||
|
||||
Arguments:
|
||||
vocab:
|
||||
A Vocabulary object, which must match any models you want to
|
||||
use (e.g. tokenizer, parser, entity recognizer).
|
||||
|
||||
words:
|
||||
A list of unicode strings to add to the document as words. If None,
|
||||
defaults to empty list.
|
||||
|
||||
spaces:
|
||||
A list of boolean values, of the same length as words. True
|
||||
means that the word is followed by a space, False means it is not.
|
||||
If None, defaults to [True]*len(words)
|
||||
vocab (Vocab): A vocabulary object, which must match any models you want
|
||||
to use (e.g. tokenizer, parser, entity recognizer).
|
||||
words (list or None): A list of unicode strings to add to the document
|
||||
as words. If `None`, defaults to empty list.
|
||||
spaces (list or None): A list of boolean values, of the same length as
|
||||
words. True means that the word is followed by a space, False means
|
||||
it is not. If `None`, defaults to `[True]*len(words)`
|
||||
RETURNS (Doc): The newly constructed object.
|
||||
"""
|
||||
self.vocab = vocab
|
||||
size = 20
|
||||
|
@ -158,20 +148,26 @@ cdef class Doc:
|
|||
self.is_parsed = True
|
||||
|
||||
def __getitem__(self, object i):
|
||||
"""
|
||||
doc[i]
|
||||
Get the Token object at position i, where i is an integer.
|
||||
"""Get a `Token` or `Span` object.
|
||||
|
||||
i (int or tuple) The index of the token, or the slice of the document to get.
|
||||
RETURNS (Token or Span): The token at `doc[i]]`, or the span at
|
||||
`doc[start : end]`.
|
||||
|
||||
EXAMPLE:
|
||||
>>> doc[i]
|
||||
Get the `Token` object at position `i`, where `i` is an integer.
|
||||
Negative indexing is supported, and follows the usual Python
|
||||
semantics, i.e. doc[-2] is doc[len(doc) - 2].
|
||||
doc[start : end]]
|
||||
Get a `Span` object, starting at position `start`
|
||||
and ending at position `end`, where `start` and
|
||||
`end` are token indices. For instance,
|
||||
`doc[2:5]` produces a span consisting of
|
||||
tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`)
|
||||
are not supported, as `Span` objects must be contiguous (cannot have gaps).
|
||||
You can use negative indices and open-ended ranges, which have their
|
||||
normal Python semantics.
|
||||
semantics, i.e. `doc[-2]` is `doc[len(doc) - 2]`.
|
||||
|
||||
>>> doc[start : end]]
|
||||
Get a `Span` object, starting at position `start` and ending at
|
||||
position `end`, where `start` and `end` are token indices. For
|
||||
instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4.
|
||||
Stepped slices (e.g. `doc[start : end : step]`) are not supported,
|
||||
as `Span` objects must be contiguous (cannot have gaps). You can use
|
||||
negative indices and open-ended ranges, which have their normal
|
||||
Python semantics.
|
||||
"""
|
||||
if isinstance(i, slice):
|
||||
start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
|
||||
|
@ -186,14 +182,14 @@ cdef class Doc:
|
|||
return Token.cinit(self.vocab, &self.c[i], i, self)
|
||||
|
||||
def __iter__(self):
|
||||
"""
|
||||
for token in doc
|
||||
Iterate over `Token` objects, from which the annotations can
|
||||
be easily accessed. This is the main way of accessing Token
|
||||
objects, which are the main way annotations are accessed from
|
||||
Python. If faster-than-Python speeds are required, you can
|
||||
instead access the annotations as a numpy array, or access the
|
||||
underlying C data directly from Cython.
|
||||
"""Iterate over `Token` objects, from which the annotations can be
|
||||
easily accessed. This is the main way of accessing `Token` objects,
|
||||
which are the main way annotations are accessed from Python. If faster-
|
||||
than-Python speeds are required, you can instead access the annotations
|
||||
as a numpy array, or access the underlying C data directly from Cython.
|
||||
|
||||
EXAMPLE:
|
||||
>>> for token in doc
|
||||
"""
|
||||
cdef int i
|
||||
for i in range(self.length):
|
||||
|
@ -203,9 +199,12 @@ cdef class Doc:
|
|||
yield Token.cinit(self.vocab, &self.c[i], i, self)
|
||||
|
||||
def __len__(self):
|
||||
"""
|
||||
len(doc)
|
||||
The number of tokens in the document.
|
||||
"""The number of tokens in the document.
|
||||
|
||||
RETURNS (int): The number of tokens in the document.
|
||||
|
||||
EXAMPLE:
|
||||
>>> len(doc)
|
||||
"""
|
||||
return self.length
|
||||
|
||||
|
@ -228,16 +227,12 @@ cdef class Doc:
|
|||
return self
|
||||
|
||||
def similarity(self, other):
|
||||
"""
|
||||
Make a semantic similarity estimate. The default estimate is cosine
|
||||
"""Make a semantic similarity estimate. The default estimate is cosine
|
||||
similarity using an average of word vectors.
|
||||
|
||||
Arguments:
|
||||
other (object): The object to compare with. By default, accepts Doc,
|
||||
Span, Token and Lexeme objects.
|
||||
|
||||
Return:
|
||||
score (float): A scalar similarity score. Higher is more similar.
|
||||
other (object): The object to compare with. By default, accepts `Doc`,
|
||||
`Span`, `Token` and `Lexeme` objects.
|
||||
RETURNS (float): A scalar similarity score. Higher is more similar.
|
||||
"""
|
||||
if 'similarity' in self.user_hooks:
|
||||
return self.user_hooks['similarity'](self, other)
|
||||
|
@ -246,8 +241,10 @@ cdef class Doc:
|
|||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||
|
||||
property has_vector:
|
||||
"""
|
||||
A boolean value indicating whether a word vector is associated with the object.
|
||||
"""A boolean value indicating whether a word vector is associated with
|
||||
the object.
|
||||
|
||||
RETURNS (bool): Whether a word vector is associated with the object.
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'has_vector' in self.user_hooks:
|
||||
|
@ -256,10 +253,11 @@ cdef class Doc:
|
|||
return any(token.has_vector for token in self)
|
||||
|
||||
property vector:
|
||||
"""
|
||||
A real-valued meaning representation. Defaults to an average of the token vectors.
|
||||
"""A real-valued meaning representation. Defaults to an average of the
|
||||
token vectors.
|
||||
|
||||
Type: numpy.ndarray[ndim=1, dtype='float32']
|
||||
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
|
||||
representing the document's semantics.
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'vector' in self.user_hooks:
|
||||
|
@ -275,6 +273,10 @@ cdef class Doc:
|
|||
self._vector = value
|
||||
|
||||
property vector_norm:
|
||||
"""The L2 norm of the document's vector representation.
|
||||
|
||||
RETURNS (float): The L2 norm of the vector representation.
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'vector_norm' in self.user_hooks:
|
||||
return self.user_hooks['vector_norm'](self)
|
||||
|
@ -295,34 +297,37 @@ cdef class Doc:
|
|||
return self.text
|
||||
|
||||
property text:
|
||||
"""
|
||||
A unicode representation of the document text.
|
||||
"""A unicode representation of the document text.
|
||||
|
||||
RETURNS (unicode): The original verbatim text of the document.
|
||||
"""
|
||||
def __get__(self):
|
||||
return u''.join(t.text_with_ws for t in self)
|
||||
|
||||
property text_with_ws:
|
||||
"""
|
||||
An alias of Doc.text, provided for duck-type compatibility with Span and Token.
|
||||
"""An alias of `Doc.text`, provided for duck-type compatibility with
|
||||
`Span` and `Token`.
|
||||
|
||||
RETURNS (unicode): The original verbatim text of the document.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.text
|
||||
|
||||
property ents:
|
||||
"""
|
||||
Yields named-entity `Span` objects, if the entity recognizer
|
||||
has been applied to the document. Iterate over the span to get
|
||||
individual Token objects, or access the label:
|
||||
"""Iterate over the entities in the document. Yields named-entity `Span`
|
||||
objects, if the entity recognizer has been applied to the document.
|
||||
|
||||
Example:
|
||||
from spacy.en import English
|
||||
nlp = English()
|
||||
tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
|
||||
ents = list(tokens.ents)
|
||||
assert ents[0].label == 346
|
||||
assert ents[0].label_ == 'PERSON'
|
||||
assert ents[0].orth_ == 'Best'
|
||||
assert ents[0].text == 'Mr. Best'
|
||||
YIELDS (Span): Entities in the document.
|
||||
|
||||
EXAMPLE: Iterate over the span to get individual Token objects, or access
|
||||
the label:
|
||||
|
||||
>>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
|
||||
>>> ents = list(tokens.ents)
|
||||
>>> assert ents[0].label == 346
|
||||
>>> assert ents[0].label_ == 'PERSON'
|
||||
>>> assert ents[0].orth_ == 'Best'
|
||||
>>> assert ents[0].text == 'Mr. Best'
|
||||
"""
|
||||
def __get__(self):
|
||||
cdef int i
|
||||
|
@ -387,12 +392,13 @@ cdef class Doc:
|
|||
self.c[start].ent_iob = 3
|
||||
|
||||
property noun_chunks:
|
||||
"""
|
||||
Yields base noun-phrase #[code Span] objects, if the document
|
||||
has been syntactically parsed. A base noun phrase, or
|
||||
'NP chunk', is a noun phrase that does not permit other NPs to
|
||||
be nested within it – so no NP-level coordination, no prepositional
|
||||
phrases, and no relative clauses.
|
||||
"""Iterate over the base noun phrases in the document. Yields base
|
||||
noun-phrase #[code Span] objects, if the document has been syntactically
|
||||
parsed. A base noun phrase, or "NP chunk", is a noun phrase that does
|
||||
not permit other NPs to be nested within it – so no NP-level
|
||||
coordination, no prepositional phrases, and no relative clauses.
|
||||
|
||||
YIELDS (Span): Noun chunks in the document.
|
||||
"""
|
||||
def __get__(self):
|
||||
if not self.is_parsed:
|
||||
|
@ -411,17 +417,15 @@ cdef class Doc:
|
|||
yield span
|
||||
|
||||
property sents:
|
||||
"""
|
||||
Yields sentence `Span` objects. Sentence spans have no label.
|
||||
To improve accuracy on informal texts, spaCy calculates sentence
|
||||
boundaries from the syntactic dependency parse. If the parser is disabled,
|
||||
`sents` iterator will be unavailable.
|
||||
"""Iterate over the sentences in the document. Yields sentence `Span`
|
||||
objects. Sentence spans have no label. To improve accuracy on informal
|
||||
texts, spaCy calculates sentence boundaries from the syntactic
|
||||
dependency parse. If the parser is disabled, the `sents` iterator will
|
||||
be unavailable.
|
||||
|
||||
Example:
|
||||
from spacy.en import English
|
||||
nlp = English()
|
||||
doc = nlp("This is a sentence. Here's another...")
|
||||
assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
|
||||
EXAMPLE:
|
||||
>>> doc = nlp("This is a sentence. Here's another...")
|
||||
>>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'sents' in self.user_hooks:
|
||||
|
@ -467,24 +471,20 @@ cdef class Doc:
|
|||
|
||||
@cython.boundscheck(False)
|
||||
cpdef np.ndarray to_array(self, object py_attr_ids):
|
||||
"""
|
||||
Given a list of M attribute IDs, export the tokens to a numpy
|
||||
`ndarray` of shape (N, M), where `N` is the length
|
||||
of the document. The values will be 32-bit integers.
|
||||
"""Given a list of M attribute IDs, export the tokens to a numpy
|
||||
`ndarray` of shape `(N, M)`, where `N` is the length of the document.
|
||||
The values will be 32-bit integers.
|
||||
|
||||
Example:
|
||||
from spacy import attrs
|
||||
doc = nlp(text)
|
||||
# All strings mapped to integers, for easy export to numpy
|
||||
np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA])
|
||||
attr_ids (list[int]): A list of attribute ID ints.
|
||||
RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
|
||||
per word, and one column per attribute indicated in the input
|
||||
`attr_ids`.
|
||||
|
||||
Arguments:
|
||||
attr_ids (list[int]): A list of attribute ID ints.
|
||||
|
||||
Returns:
|
||||
feat_array (numpy.ndarray[long, ndim=2]):
|
||||
A feature matrix, with one row per word, and one column per attribute
|
||||
indicated in the input attr_ids.
|
||||
EXAMPLE:
|
||||
>>> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
|
||||
>>> doc = nlp(text)
|
||||
>>> # All strings mapped to integers, for easy export to numpy
|
||||
>>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
|
||||
"""
|
||||
cdef int i, j
|
||||
cdef attr_id_t feature
|
||||
|
@ -499,27 +499,20 @@ cdef class Doc:
|
|||
return output
|
||||
|
||||
def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
|
||||
"""
|
||||
Produce a dict of {attribute (int): count (ints)} frequencies, keyed
|
||||
by the values of the given attribute ID.
|
||||
"""Count the frequencies of a given attribute. Produces a dict of
|
||||
`{attribute (int): count (ints)}` frequencies, keyed by the values of
|
||||
the given attribute ID.
|
||||
|
||||
Example:
|
||||
from spacy.en import English
|
||||
from spacy import attrs
|
||||
nlp = English()
|
||||
tokens = nlp(u'apple apple orange banana')
|
||||
tokens.count_by(attrs.ORTH)
|
||||
# {12800L: 1, 11880L: 2, 7561L: 1}
|
||||
tokens.to_array([attrs.ORTH])
|
||||
# array([[11880],
|
||||
# [11880],
|
||||
# [ 7561],
|
||||
# [12800]])
|
||||
attr_id (int): The attribute ID to key the counts.
|
||||
RETURNS (dict): A dictionary mapping attributes to integer counts.
|
||||
|
||||
Arguments:
|
||||
attr_id
|
||||
int
|
||||
The attribute ID to key the counts.
|
||||
EXAMPLE:
|
||||
>>> from spacy import attrs
|
||||
>>> doc = nlp(u'apple apple orange banana')
|
||||
>>> tokens.count_by(attrs.ORTH)
|
||||
{12800L: 1, 11880L: 2, 7561L: 1}
|
||||
>>> tokens.to_array([attrs.ORTH])
|
||||
array([[11880], [11880], [7561], [12800]])
|
||||
"""
|
||||
cdef int i
|
||||
cdef attr_t attr
|
||||
|
@ -567,8 +560,12 @@ cdef class Doc:
|
|||
self.c[i] = parsed[i]
|
||||
|
||||
def from_array(self, attrs, int[:, :] array):
|
||||
"""
|
||||
Write to a `Doc` object, from an `(M, N)` array of attributes.
|
||||
"""Load attributes from a numpy array. Write to a `Doc` object, from an
|
||||
`(M, N)` array of attributes.
|
||||
|
||||
attrs (ints): A list of attribute ID ints.
|
||||
array (numpy.ndarray[ndim=2, dtype='int32']) The attribute values to load.
|
||||
RETURNS (Doc): Itself.
|
||||
"""
|
||||
cdef int i, col
|
||||
cdef attr_id_t attr_id
|
||||
|
@ -597,8 +594,10 @@ cdef class Doc:
|
|||
return self
|
||||
|
||||
def to_bytes(self):
|
||||
"""
|
||||
Serialize, producing a byte string.
|
||||
"""Serialize, i.e. export the document contents to a binary string.
|
||||
|
||||
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
|
||||
all annotations.
|
||||
"""
|
||||
return dill.dumps(
|
||||
(self.text,
|
||||
|
@ -611,8 +610,10 @@ cdef class Doc:
|
|||
protocol=-1)
|
||||
|
||||
def from_bytes(self, data):
|
||||
"""
|
||||
Deserialize, loading from bytes.
|
||||
"""Deserialize, i.e. import the document contents from a binary string.
|
||||
|
||||
data (bytes): The string to load from.
|
||||
RETURNS (Doc): Itself.
|
||||
"""
|
||||
if self.length != 0:
|
||||
raise ValueError("Cannot load into non-empty Doc")
|
||||
|
@ -640,21 +641,16 @@ cdef class Doc:
|
|||
return self
|
||||
|
||||
def merge(self, int start_idx, int end_idx, *args, **attributes):
|
||||
"""
|
||||
Retokenize the document, such that the span at doc.text[start_idx : end_idx]
|
||||
is merged into a single token. If start_idx and end_idx do not mark start
|
||||
and end token boundaries, the document remains unchanged.
|
||||
"""Retokenize the document, such that the span at `doc.text[start_idx : end_idx]`
|
||||
is merged into a single token. If `start_idx` and `end_idx `do not mark
|
||||
start and end token boundaries, the document remains unchanged.
|
||||
|
||||
Arguments:
|
||||
start_idx (int): The character index of the start of the slice to merge.
|
||||
end_idx (int): The character index after the end of the slice to merge.
|
||||
**attributes:
|
||||
Attributes to assign to the merged token. By default, attributes
|
||||
are inherited from the syntactic root token of the span.
|
||||
Returns:
|
||||
token (Token):
|
||||
The newly merged token, or None if the start and end indices did
|
||||
not fall at token boundaries.
|
||||
start_idx (int): The character index of the start of the slice to merge.
|
||||
end_idx (int): The character index after the end of the slice to merge.
|
||||
**attributes: Attributes to assign to the merged token. By default,
|
||||
attributes are inherited from the syntactic root token of the span.
|
||||
RETURNS (Token): The newly merged token, or `None` if the start and end
|
||||
indices did not fall at token boundaries.
|
||||
"""
|
||||
cdef unicode tag, lemma, ent_type
|
||||
if len(args) == 3:
|
||||
|
@ -758,7 +754,29 @@ cdef class Doc:
|
|||
return self[start]
|
||||
|
||||
def print_tree(self, light=False, flat=False):
|
||||
"""Returns the parse trees in the JSON (Dict) format."""
|
||||
"""Returns the parse trees in JSON (dict) format.
|
||||
|
||||
light (bool): Don't include lemmas or entities.
|
||||
flat (bool): Don't include arcs or modifiers.
|
||||
RETURNS (dict): Parse tree as dict.
|
||||
|
||||
EXAMPLE:
|
||||
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
|
||||
>>> trees = doc.print_tree()
|
||||
>>> trees[1]
|
||||
{'modifiers': [
|
||||
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
|
||||
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
|
||||
{'modifiers': [
|
||||
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
|
||||
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
|
||||
'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
|
||||
'POS_fine': 'NN', 'lemma': 'pizza'},
|
||||
{'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
|
||||
'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
|
||||
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
|
||||
'POS_fine': 'VBD', 'lemma': 'eat'}
|
||||
"""
|
||||
return parse_tree(self, light=light, flat=flat)
|
||||
|
||||
|
||||
|
|
|
@ -6,18 +6,14 @@ from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE
|
|||
|
||||
|
||||
def merge_ents(doc):
|
||||
"""
|
||||
Helper: merge adjacent entities into single tokens; modifies the doc.
|
||||
"""
|
||||
"""Helper: merge adjacent entities into single tokens; modifies the doc."""
|
||||
for ent in doc.ents:
|
||||
ent.merge(ent.root.tag_, ent.text, ent.label_)
|
||||
return doc
|
||||
|
||||
|
||||
def format_POS(token, light, flat):
|
||||
"""
|
||||
Helper: form the POS output for a token.
|
||||
"""
|
||||
"""Helper: form the POS output for a token."""
|
||||
subtree = dict([
|
||||
("word", token.text),
|
||||
("lemma", token.lemma_), # trigger
|
||||
|
@ -37,9 +33,8 @@ def format_POS(token, light, flat):
|
|||
|
||||
|
||||
def POS_tree(root, light=False, flat=False):
|
||||
"""
|
||||
Helper: generate a POS tree for a root token. The doc must have
|
||||
merge_ents(doc) ran on it.
|
||||
"""Helper: generate a POS tree for a root token. The doc must have
|
||||
`merge_ents(doc)` ran on it.
|
||||
"""
|
||||
subtree = format_POS(root, light=light, flat=flat)
|
||||
for c in root.children:
|
||||
|
@ -48,21 +43,28 @@ def POS_tree(root, light=False, flat=False):
|
|||
|
||||
|
||||
def parse_tree(doc, light=False, flat=False):
|
||||
"""
|
||||
Makes a copy of the doc, then construct a syntactic parse tree, similar to
|
||||
"""Makes a copy of the doc, then construct a syntactic parse tree, similar to
|
||||
the one used in displaCy. Generates the POS tree for all sentences in a doc.
|
||||
|
||||
Args:
|
||||
doc: The doc for parsing.
|
||||
doc (Doc): The doc for parsing.
|
||||
RETURNS (dict): The parse tree.
|
||||
|
||||
Returns:
|
||||
[parse_trees (Dict)]:
|
||||
|
||||
>>> from spacy.en import English
|
||||
>>> nlp = English()
|
||||
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
|
||||
>>> trees = doc.print_tree()
|
||||
[{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}]
|
||||
EXAMPLE:
|
||||
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
|
||||
>>> trees = doc.print_tree()
|
||||
>>> trees[1]
|
||||
{'modifiers': [
|
||||
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
|
||||
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
|
||||
{'modifiers': [
|
||||
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
|
||||
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
|
||||
'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
|
||||
'POS_fine': 'NN', 'lemma': 'pizza'},
|
||||
{'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
|
||||
'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
|
||||
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
|
||||
'POS_fine': 'VBD', 'lemma': 'eat'}
|
||||
"""
|
||||
doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
|
||||
doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE],
|
||||
|
|
|
@ -20,22 +20,17 @@ from .. import about
|
|||
|
||||
|
||||
cdef class Span:
|
||||
"""
|
||||
A slice from a Doc object.
|
||||
"""
|
||||
"""A slice from a Doc object."""
|
||||
def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
|
||||
vector_norm=None):
|
||||
"""
|
||||
Create a Span object from the slice doc[start : end]
|
||||
"""Create a `Span` object from the slice `doc[start : end]`.
|
||||
|
||||
Arguments:
|
||||
doc (Doc): The parent document.
|
||||
start (int): The index of the first token of the span.
|
||||
end (int): The index of the first token after the span.
|
||||
label (int): A label to attach to the Span, e.g. for named entities.
|
||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
|
||||
Returns:
|
||||
Span The newly constructed object.
|
||||
doc (Doc): The parent document.
|
||||
start (int): The index of the first token of the span.
|
||||
end (int): The index of the first token after the span.
|
||||
label (int): A label to attach to the Span, e.g. for named entities.
|
||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
|
||||
RETURNS (Span): The newly constructed object.
|
||||
"""
|
||||
if not (0 <= start <= end <= len(doc)):
|
||||
raise IndexError
|
||||
|
@ -70,8 +65,11 @@ cdef class Span:
|
|||
def __hash__(self):
|
||||
return hash((self.doc, self.label, self.start_char, self.end_char))
|
||||
|
||||
|
||||
def __len__(self):
|
||||
"""Get the number of tokens in the span.
|
||||
|
||||
RETURNS (int): The number of tokens in the span.
|
||||
"""
|
||||
self._recalculate_indices()
|
||||
if self.end < self.start:
|
||||
return 0
|
||||
|
@ -83,6 +81,16 @@ cdef class Span:
|
|||
return self.text.encode('utf-8')
|
||||
|
||||
def __getitem__(self, object i):
|
||||
"""Get a `Token` or a `Span` object
|
||||
|
||||
i (int or tuple): The index of the token within the span, or slice of
|
||||
the span to get.
|
||||
RETURNS (Token or Span): The token at `span[i]`.
|
||||
|
||||
EXAMPLE:
|
||||
>>> span[0]
|
||||
>>> span[1:3]
|
||||
"""
|
||||
self._recalculate_indices()
|
||||
if isinstance(i, slice):
|
||||
start, end = normalize_slice(len(self), i.start, i.stop, i.step)
|
||||
|
@ -94,35 +102,31 @@ cdef class Span:
|
|||
return self.doc[self.start + i]
|
||||
|
||||
def __iter__(self):
|
||||
"""Iterate over `Token` objects.
|
||||
|
||||
YIELDS (Token): A `Token` object.
|
||||
"""
|
||||
self._recalculate_indices()
|
||||
for i in range(self.start, self.end):
|
||||
yield self.doc[i]
|
||||
|
||||
def merge(self, *args, **attributes):
|
||||
"""
|
||||
Retokenize the document, such that the span is merged into a single token.
|
||||
"""Retokenize the document, such that the span is merged into a single
|
||||
token.
|
||||
|
||||
Arguments:
|
||||
**attributes:
|
||||
Attributes to assign to the merged token. By default, attributes
|
||||
are inherited from the syntactic root token of the span.
|
||||
Returns:
|
||||
token (Token):
|
||||
The newly merged token.
|
||||
**attributes: Attributes to assign to the merged token. By default,
|
||||
attributes are inherited from the syntactic root token of the span.
|
||||
RETURNS (Token): The newly merged token.
|
||||
"""
|
||||
return self.doc.merge(self.start_char, self.end_char, *args, **attributes)
|
||||
|
||||
def similarity(self, other):
|
||||
"""
|
||||
Make a semantic similarity estimate. The default estimate is cosine
|
||||
"""Make a semantic similarity estimate. The default estimate is cosine
|
||||
similarity using an average of word vectors.
|
||||
|
||||
Arguments:
|
||||
other (object): The object to compare with. By default, accepts Doc,
|
||||
Span, Token and Lexeme objects.
|
||||
|
||||
Return:
|
||||
score (float): A scalar similarity score. Higher is more similar.
|
||||
other (object): The object to compare with. By default, accepts `Doc`,
|
||||
`Span`, `Token` and `Lexeme` objects.
|
||||
RETURNS (float): A scalar similarity score. Higher is more similar.
|
||||
"""
|
||||
if 'similarity' in self.doc.user_span_hooks:
|
||||
self.doc.user_span_hooks['similarity'](self, other)
|
||||
|
@ -145,11 +149,9 @@ cdef class Span:
|
|||
self.end = end + 1
|
||||
|
||||
property sent:
|
||||
"""
|
||||
The sentence span that this span is a part of.
|
||||
"""The sentence span that this span is a part of.
|
||||
|
||||
Returns:
|
||||
Span The sentence this is part of.
|
||||
RETURNS (Span): The sentence span that the span is a part of.
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'sent' in self.doc.user_span_hooks:
|
||||
|
@ -166,12 +168,23 @@ cdef class Span:
|
|||
return self.doc[root.l_edge : root.r_edge + 1]
|
||||
|
||||
property has_vector:
|
||||
"""A boolean value indicating whether a word vector is associated with
|
||||
the object.
|
||||
|
||||
RETURNS (bool): Whether a word vector is associated with the object.
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'has_vector' in self.doc.user_span_hooks:
|
||||
return self.doc.user_span_hooks['has_vector'](self)
|
||||
return any(token.has_vector for token in self)
|
||||
|
||||
property vector:
|
||||
"""A real-valued meaning representation. Defaults to an average of the
|
||||
token vectors.
|
||||
|
||||
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
|
||||
representing the span's semantics.
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'vector' in self.doc.user_span_hooks:
|
||||
return self.doc.user_span_hooks['vector'](self)
|
||||
|
@ -180,6 +193,10 @@ cdef class Span:
|
|||
return self._vector
|
||||
|
||||
property vector_norm:
|
||||
"""The L2 norm of the document's vector representation.
|
||||
|
||||
RETURNS (float): The L2 norm of the vector representation.
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'vector_norm' in self.doc.user_span_hooks:
|
||||
return self.doc.user_span_hooks['vector'](self)
|
||||
|
@ -193,6 +210,7 @@ cdef class Span:
|
|||
return self._vector_norm
|
||||
|
||||
property sentiment:
|
||||
# TODO: docstring
|
||||
def __get__(self):
|
||||
if 'sentiment' in self.doc.user_span_hooks:
|
||||
return self.doc.user_span_hooks['sentiment'](self)
|
||||
|
@ -200,6 +218,10 @@ cdef class Span:
|
|||
return sum([token.sentiment for token in self]) / len(self)
|
||||
|
||||
property text:
|
||||
"""A unicode representation of the span text.
|
||||
|
||||
RETURNS (unicode): The original verbatim text of the span.
|
||||
"""
|
||||
def __get__(self):
|
||||
text = self.text_with_ws
|
||||
if self[-1].whitespace_:
|
||||
|
@ -207,16 +229,21 @@ cdef class Span:
|
|||
return text
|
||||
|
||||
property text_with_ws:
|
||||
"""The text content of the span with a trailing whitespace character if
|
||||
the last token has one.
|
||||
|
||||
RETURNS (unicode): The text content of the span (with trailing whitespace).
|
||||
"""
|
||||
def __get__(self):
|
||||
return u''.join([t.text_with_ws for t in self])
|
||||
|
||||
property noun_chunks:
|
||||
"""
|
||||
Yields base noun-phrase #[code Span] objects, if the document
|
||||
has been syntactically parsed. A base noun phrase, or
|
||||
'NP chunk', is a noun phrase that does not permit other NPs to
|
||||
be nested within it – so no NP-level coordination, no prepositional
|
||||
phrases, and no relative clauses. For example:
|
||||
"""Yields base noun-phrase `Span` objects, if the document has been
|
||||
syntactically parsed. A base noun phrase, or "NP chunk", is a noun
|
||||
phrase that does not permit other NPs to be nested within it – so no
|
||||
NP-level coordination, no prepositional phrases, and no relative clauses.
|
||||
|
||||
YIELDS (Span): Base noun-phrase `Span` objects
|
||||
"""
|
||||
def __get__(self):
|
||||
if not self.doc.is_parsed:
|
||||
|
@ -235,49 +262,47 @@ cdef class Span:
|
|||
yield span
|
||||
|
||||
property root:
|
||||
"""
|
||||
The token within the span that's highest in the parse tree. If there's a
|
||||
tie, the earlist is prefered.
|
||||
"""The token within the span that's highest in the parse tree.
|
||||
If there's a tie, the earliest is prefered.
|
||||
|
||||
Returns:
|
||||
Token: The root token.
|
||||
RETURNS (Token): The root token.
|
||||
|
||||
i.e. has the shortest path to the root of the sentence (or is the root
|
||||
itself). If multiple words are equally high in the tree, the first word
|
||||
is taken. For example:
|
||||
EXAMPLE: The root token has the shortest path to the root of the sentence
|
||||
(or is the root itself). If multiple words are equally high in the
|
||||
tree, the first word is taken. For example:
|
||||
|
||||
>>> toks = nlp(u'I like New York in Autumn.')
|
||||
>>> toks = nlp(u'I like New York in Autumn.')
|
||||
|
||||
Let's name the indices --- easier than writing "toks[4]" etc.
|
||||
Let's name the indices – easier than writing `toks[4]` etc.
|
||||
|
||||
>>> i, like, new, york, in_, autumn, dot = range(len(toks))
|
||||
>>> i, like, new, york, in_, autumn, dot = range(len(toks))
|
||||
|
||||
The head of 'new' is 'York', and the head of 'York' is 'like'
|
||||
The head of 'new' is 'York', and the head of "York" is "like"
|
||||
|
||||
>>> toks[new].head.orth_
|
||||
'York'
|
||||
>>> toks[york].head.orth_
|
||||
'like'
|
||||
>>> toks[new].head.text
|
||||
'York'
|
||||
>>> toks[york].head.text
|
||||
'like'
|
||||
|
||||
Create a span for "New York". Its root is "York".
|
||||
Create a span for "New York". Its root is "York".
|
||||
|
||||
>>> new_york = toks[new:york+1]
|
||||
>>> new_york.root.orth_
|
||||
'York'
|
||||
>>> new_york = toks[new:york+1]
|
||||
>>> new_york.root.text
|
||||
'York'
|
||||
|
||||
Here's a more complicated case, raise by Issue #214
|
||||
Here's a more complicated case, raised by issue #214:
|
||||
|
||||
>>> toks = nlp(u'to, north and south carolina')
|
||||
>>> to, north, and_, south, carolina = toks
|
||||
>>> south.head.text, carolina.head.text
|
||||
('north', 'to')
|
||||
>>> toks = nlp(u'to, north and south carolina')
|
||||
>>> to, north, and_, south, carolina = toks
|
||||
>>> south.head.text, carolina.head.text
|
||||
('north', 'to')
|
||||
|
||||
Here 'south' is a child of 'north', which is a child of 'carolina'.
|
||||
Carolina is the root of the span:
|
||||
Here "south" is a child of "north", which is a child of "carolina".
|
||||
Carolina is the root of the span:
|
||||
|
||||
>>> south_carolina = toks[-2:]
|
||||
>>> south_carolina.root.text
|
||||
'carolina'
|
||||
>>> south_carolina = toks[-2:]
|
||||
>>> south_carolina.root.text
|
||||
'carolina'
|
||||
"""
|
||||
def __get__(self):
|
||||
self._recalculate_indices()
|
||||
|
@ -314,10 +339,10 @@ cdef class Span:
|
|||
return self.doc[root]
|
||||
|
||||
property lefts:
|
||||
"""
|
||||
Tokens that are to the left of the span, whose head is within the Span.
|
||||
""" Tokens that are to the left of the span, whose head is within the
|
||||
`Span`.
|
||||
|
||||
Yields: Token A left-child of a token of the span.
|
||||
YIELDS (Token):A left-child of a token of the span.
|
||||
"""
|
||||
def __get__(self):
|
||||
for token in reversed(self): # Reverse, so we get the tokens in order
|
||||
|
@ -326,10 +351,10 @@ cdef class Span:
|
|||
yield left
|
||||
|
||||
property rights:
|
||||
"""
|
||||
Tokens that are to the right of the Span, whose head is within the Span.
|
||||
"""Tokens that are to the right of the Span, whose head is within the
|
||||
`Span`.
|
||||
|
||||
Yields: Token A right-child of a token of the span.
|
||||
YIELDS (Token): A right-child of a token of the span.
|
||||
"""
|
||||
def __get__(self):
|
||||
for token in self:
|
||||
|
@ -338,10 +363,9 @@ cdef class Span:
|
|||
yield right
|
||||
|
||||
property subtree:
|
||||
"""
|
||||
Tokens that descend from tokens in the span, but fall outside it.
|
||||
"""Tokens that descend from tokens in the span, but fall outside it.
|
||||
|
||||
Yields: Token A descendant of a token within the span.
|
||||
YIELDS (Token): A descendant of a token within the span.
|
||||
"""
|
||||
def __get__(self):
|
||||
for word in self.lefts:
|
||||
|
@ -351,8 +375,9 @@ cdef class Span:
|
|||
yield from word.subtree
|
||||
|
||||
property ent_id:
|
||||
"""
|
||||
An (integer) entity ID. Usually assigned by patterns in the Matcher.
|
||||
"""An (integer) entity ID. Usually assigned by patterns in the `Matcher`.
|
||||
|
||||
RETURNS (int): The entity ID.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.root.ent_id
|
||||
|
@ -362,9 +387,11 @@ cdef class Span:
|
|||
raise NotImplementedError(
|
||||
"Can't yet set ent_id from Span. Vote for this feature on the issue "
|
||||
"tracker: http://github.com/explosion/spaCy/issues")
|
||||
|
||||
property ent_id_:
|
||||
"""
|
||||
A (string) entity ID. Usually assigned by patterns in the Matcher.
|
||||
"""A (string) entity ID. Usually assigned by patterns in the `Matcher`.
|
||||
|
||||
RETURNS (unicode): The entity ID.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.root.ent_id_
|
||||
|
@ -376,26 +403,38 @@ cdef class Span:
|
|||
"tracker: http://github.com/explosion/spaCy/issues")
|
||||
|
||||
property orth_:
|
||||
# TODO: docstring
|
||||
def __get__(self):
|
||||
return ''.join([t.string for t in self]).strip()
|
||||
|
||||
property lemma_:
|
||||
"""The span's lemma.
|
||||
|
||||
RETURNS (unicode): The span's lemma.
|
||||
"""
|
||||
def __get__(self):
|
||||
return ' '.join([t.lemma_ for t in self]).strip()
|
||||
|
||||
property upper_:
|
||||
# TODO: docstring
|
||||
def __get__(self):
|
||||
return ''.join([t.string.upper() for t in self]).strip()
|
||||
|
||||
property lower_:
|
||||
# TODO: docstring
|
||||
def __get__(self):
|
||||
return ''.join([t.string.lower() for t in self]).strip()
|
||||
|
||||
property string:
|
||||
# TODO: docstring
|
||||
def __get__(self):
|
||||
return ''.join([t.string for t in self])
|
||||
|
||||
property label_:
|
||||
"""The span's label.
|
||||
|
||||
RETURNS (unicode): The span's label.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.doc.vocab.strings[self.label]
|
||||
|
||||
|
|
|
@ -23,10 +23,14 @@ from .. import about
|
|||
|
||||
|
||||
cdef class Token:
|
||||
"""
|
||||
An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
|
||||
"""
|
||||
"""An individual token – i.e. a word, punctuation symbol, whitespace, etc."""
|
||||
def __cinit__(self, Vocab vocab, Doc doc, int offset):
|
||||
"""Construct a `Token` object.
|
||||
|
||||
vocab (Vocab): A storage container for lexical types.
|
||||
doc (Doc): The parent document.
|
||||
offset (int): The index of the token within the document.
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.doc = doc
|
||||
self.c = &self.doc.c[offset]
|
||||
|
@ -36,8 +40,9 @@ cdef class Token:
|
|||
return hash((self.doc, self.i))
|
||||
|
||||
def __len__(self):
|
||||
"""
|
||||
Number of unicode characters in token.text.
|
||||
"""The number of unicode characters in the token, i.e. `token.text`.
|
||||
|
||||
RETURNS (int): The number of unicode characters in the token.
|
||||
"""
|
||||
return self.c.lex.length
|
||||
|
||||
|
@ -75,37 +80,35 @@ cdef class Token:
|
|||
raise ValueError(op)
|
||||
|
||||
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
|
||||
"""
|
||||
Check the value of a boolean flag.
|
||||
"""Check the value of a boolean flag.
|
||||
|
||||
Arguments:
|
||||
flag_id (int): The ID of the flag attribute.
|
||||
Returns:
|
||||
is_set (bool): Whether the flag is set.
|
||||
flag_id (int): The ID of the flag attribute.
|
||||
RETURNS (bool): Whether the flag is set.
|
||||
|
||||
EXAMPLE:
|
||||
>>> from spacy.attrs import IS_TITLE
|
||||
>>> doc = nlp(u'Give it back! He pleaded.')
|
||||
>>> token = doc[0]
|
||||
>>> token.check_flag(IS_TITLE)
|
||||
True
|
||||
"""
|
||||
return Lexeme.c_check_flag(self.c.lex, flag_id)
|
||||
|
||||
def nbor(self, int i=1):
|
||||
"""
|
||||
Get a neighboring token.
|
||||
"""Get a neighboring token.
|
||||
|
||||
Arguments:
|
||||
i (int): The relative position of the token to get. Defaults to 1.
|
||||
Returns:
|
||||
neighbor (Token): The token at position self.doc[self.i+i]
|
||||
i (int): The relative position of the token to get. Defaults to 1.
|
||||
RETURNS (Token): The token at position `self.doc[self.i+i]`.
|
||||
"""
|
||||
return self.doc[self.i+i]
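A small usage sketch for nbor, assuming nlp is a loaded spaCy pipeline with the default English tokenizer:

    doc = nlp(u'Give it back')
    assert doc[0].nbor().text == u'it'        # the offset defaults to +1
    assert doc[1].nbor(-1).text == u'Give'    # negative offsets look backwards
    assert doc[1].nbor(1).i == 2              # i.e. self.doc[self.i + i]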
|
||||
|
||||
def similarity(self, other):
|
||||
"""
|
||||
Compute a semantic similarity estimate. Defaults to cosine over vectors.
|
||||
"""Make a semantic similarity estimate. The default estimate is cosine
|
||||
similarity using an average of word vectors.
|
||||
|
||||
Arguments:
|
||||
other:
|
||||
The object to compare with. By default, accepts Doc, Span,
|
||||
Token and Lexeme objects.
|
||||
Returns:
|
||||
score (float): A scalar similarity score. Higher is more similar.
|
||||
other (object): The object to compare with. By default, accepts `Doc`,
|
||||
`Span`, `Token` and `Lexeme` objects.
|
||||
RETURNS (float): A scalar similarity score. Higher is more similar.
|
||||
"""
|
||||
if 'similarity' in self.doc.user_token_hooks:
|
||||
return self.doc.user_token_hooks['similarity'](self)
|
||||
|
@ -114,10 +117,14 @@ cdef class Token:
|
|||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
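A minimal sketch of token similarity, assuming a model with word vectors is installed and loaded as nlp; the exact scores depend on the vectors shipped with the model:

    doc = nlp(u'dog cat banana')
    dog, cat, banana = doc[0], doc[1], doc[2]
    print(dog.similarity(cat))      # typically fairly high for related words
    print(dog.similarity(banana))   # typically lower
    assert dog.similarity(cat) == cat.similarity(dog)  # cosine similarity is symmetric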
|
||||
|
||||
property lex_id:
|
||||
"""ID of the token's lexical type.
|
||||
|
||||
RETURNS (int): ID of the token's lexical type."""
|
||||
def __get__(self):
|
||||
return self.c.lex.id
|
||||
|
||||
property rank:
|
||||
# TODO: add docstring
|
||||
def __get__(self):
|
||||
return self.c.lex.id
|
||||
|
||||
|
@ -126,10 +133,19 @@ cdef class Token:
|
|||
return self.text_with_ws
|
||||
|
||||
property text:
|
||||
"""A unicode representation of the token text.
|
||||
|
||||
RETURNS (unicode): The original verbatim text of the token.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.orth_
|
||||
|
||||
property text_with_ws:
|
||||
"""The text content of the token with a trailing whitespace character if
|
||||
it has one.
|
||||
|
||||
RETURNS (unicode): The text content of the span (with trailing whitespace).
|
||||
"""
|
||||
def __get__(self):
|
||||
cdef unicode orth = self.vocab.strings[self.c.lex.orth]
|
||||
if self.c.spacy:
|
||||
|
@ -184,6 +200,10 @@ cdef class Token:
|
|||
return self.c.lex.suffix
|
||||
|
||||
property lemma:
|
||||
"""Base form of the word, with no inflectional suffixes.
|
||||
|
||||
RETURNS (int): Token lemma.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.lemma
|
||||
def __set__(self, int lemma):
|
||||
|
@ -206,8 +226,10 @@ cdef class Token:
|
|||
self.c.dep = label
|
||||
|
||||
property has_vector:
|
||||
"""
|
||||
A boolean value indicating whether a word vector is associated with the object.
|
||||
"""A boolean value indicating whether a word vector is associated with
|
||||
the object.
|
||||
|
||||
RETURNS (bool): Whether a word vector is associated with the object.
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'has_vector' in self.doc.user_token_hooks:
|
||||
|
@ -220,10 +242,10 @@ cdef class Token:
|
|||
return False
|
||||
|
||||
property vector:
|
||||
"""
|
||||
A real-valued meaning representation.
|
||||
"""A real-valued meaning representation.
|
||||
|
||||
Type: numpy.ndarray[ndim=1, dtype='float32']
|
||||
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
|
||||
representing the token's semantics.
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'vector' in self.doc.user_token_hooks:
|
||||
|
@ -239,15 +261,11 @@ cdef class Token:
|
|||
vector_view = <float[:length,]>self.c.lex.vector
|
||||
return numpy.asarray(vector_view)
|
||||
|
||||
property repvec:
|
||||
def __get__(self):
|
||||
raise AttributeError("repvec was renamed to vector in v0.100")
|
||||
|
||||
property has_repvec:
|
||||
def __get__(self):
|
||||
raise AttributeError("has_repvec was renamed to has_vector in v0.100")
|
||||
|
||||
property vector_norm:
|
||||
"""The L2 norm of the token's vector representation.
|
||||
|
||||
RETURNS (float): The L2 norm of the vector representation.
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'vector_norm' in self.doc.user_token_hooks:
|
||||
return self.doc.user_token_hooks['vector_norm'](self)
|
||||
|
@ -324,28 +342,26 @@ cdef class Token:
|
|||
yield from word.subtree
|
||||
|
||||
property left_edge:
|
||||
"""
|
||||
The leftmost token of this token's syntactic descendents.
|
||||
"""The leftmost token of this token's syntactic descendents.
|
||||
|
||||
Returns: Token The first token such that self.is_ancestor(token)
|
||||
RETURNS (Token): The first token such that `self.is_ancestor(token)`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.doc[self.c.l_edge]
|
||||
|
||||
property right_edge:
|
||||
"""
|
||||
The rightmost token of this token's syntactic descendents.
|
||||
"""The rightmost token of this token's syntactic descendents.
|
||||
|
||||
Returns: Token The last token such that self.is_ancestor(token)
|
||||
RETURNS (Token): The last token such that `self.is_ancestor(token)`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.doc[self.c.r_edge]
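An illustrative sketch for left_edge and right_edge, assuming a parser-equipped model is loaded as nlp; the exact edges depend on the predicted parse:

    doc = nlp(u'The quick brown fox jumped over the lazy dog.')
    fox = doc[3]
    print(fox.left_edge.text, fox.right_edge.text)   # e.g. u'The', u'fox'
    # the edges give you the full phrase around a token as a Span
    phrase = doc[fox.left_edge.i : fox.right_edge.i + 1]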
|
||||
|
||||
property ancestors:
|
||||
"""
|
||||
A sequence of this token's syntactic ancestors.
|
||||
"""A sequence of this token's syntactic ancestors.
|
||||
|
||||
Yields: Token A sequence of ancestor tokens such that ancestor.is_ancestor(self)
|
||||
YIELDS (Token): A sequence of ancestor tokens such that
|
||||
`ancestor.is_ancestor(self)`.
|
||||
"""
|
||||
def __get__(self):
|
||||
cdef const TokenC* head_ptr = self.c
|
||||
|
@ -357,33 +373,25 @@ cdef class Token:
|
|||
yield self.doc[head_ptr - (self.c - self.i)]
|
||||
i += 1
|
||||
|
||||
def is_ancestor_of(self, descendant):
|
||||
# TODO: Remove after backward compatibility check.
|
||||
return self.is_ancestor(descendant)
|
||||
|
||||
def is_ancestor(self, descendant):
|
||||
"""
|
||||
Check whether this token is a parent, grandparent, etc. of another
|
||||
"""Check whether this token is a parent, grandparent, etc. of another
|
||||
in the dependency tree.
|
||||
|
||||
Arguments:
|
||||
descendant (Token): Another token.
|
||||
Returns:
|
||||
is_ancestor (bool): Whether this token is the ancestor of the descendant.
|
||||
descendant (Token): Another token.
|
||||
RETURNS (bool): Whether this token is the ancestor of the descendant.
|
||||
"""
|
||||
if self.doc is not descendant.doc:
|
||||
return False
|
||||
return any(ancestor.i == self.i for ancestor in descendant.ancestors)
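A short sketch of ancestors and is_ancestor, assuming a parsed doc produced by a loaded pipeline nlp; the printed values depend on the parse:

    doc = nlp(u'Give it back! He pleaded.')
    give, it = doc[0], doc[1]
    print([t.text for t in it.ancestors])   # heads above 'it', e.g. [u'Give']
    print(give.is_ancestor(it))             # True if 'Give' heads 'it' in this parse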
|
||||
|
||||
property head:
|
||||
"""
|
||||
The syntactic parent, or "governor", of this token.
|
||||
"""The syntactic parent, or "governor", of this token.
|
||||
|
||||
Returns: Token
|
||||
RETURNS (Token): The token head.
|
||||
"""
|
||||
def __get__(self):
|
||||
"""
|
||||
The token predicted by the parser to be the head of the current token.
|
||||
"""The token predicted by the parser to be the head of the current
|
||||
token.
|
||||
"""
|
||||
return self.doc[self.i + self.c.head]
|
||||
def __set__(self, Token new_head):
|
||||
|
@ -399,7 +407,7 @@ cdef class Token:
|
|||
cdef int rel_newhead_i = new_head.i - self.i
|
||||
|
||||
# is the new head a descendant of the old head
|
||||
cdef bint is_desc = old_head.is_ancestor_of(new_head)
|
||||
cdef bint is_desc = old_head.is_ancestor(new_head)
|
||||
|
||||
cdef int new_edge
|
||||
cdef Token anc, child
|
||||
|
@ -477,10 +485,9 @@ cdef class Token:
|
|||
self.c.head = rel_newhead_i
|
||||
|
||||
property conjuncts:
|
||||
"""
|
||||
A sequence of coordinated tokens, including the token itself.
|
||||
"""A sequence of coordinated tokens, including the token itself.
|
||||
|
||||
Yields: Token A coordinated token
|
||||
YIELDS (Token): A coordinated token.
|
||||
"""
|
||||
def __get__(self):
|
||||
"""Get a list of conjoined words."""
|
||||
|
@ -495,25 +502,46 @@ cdef class Token:
|
|||
yield from word.conjuncts
|
||||
|
||||
property ent_type:
|
||||
"""Named entity type.
|
||||
|
||||
RETURNS (int): Named entity type.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.ent_type
|
||||
|
||||
property ent_iob:
|
||||
"""IOB code of named entity tag. `1="I", 2="O", 3="B"`. 0 means no tag
|
||||
is assigned.
|
||||
|
||||
RETURNS (int): IOB code of named entity tag.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.ent_iob
|
||||
|
||||
property ent_type_:
|
||||
"""Named entity type.
|
||||
|
||||
RETURNS (unicode): Named entity type.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.ent_type]
|
||||
|
||||
property ent_iob_:
|
||||
"""IOB code of named entity tag. "B" means the token begins an entity,
|
||||
"I" means it is inside an entity, "O" means it is outside an entity, and
|
||||
"" means no entity tag is set.
|
||||
|
||||
RETURNS (unicode): IOB code of named entity tag.
|
||||
"""
|
||||
def __get__(self):
|
||||
iob_strings = ('', 'I', 'O', 'B')
|
||||
return iob_strings[self.c.ent_iob]
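A small sketch showing the entity attributes side by side, assuming a model with a named entity recognizer is loaded as nlp; the exact labels depend on the model:

    doc = nlp(u'San Francisco considers banning sidewalk delivery robots')
    for token in doc[:3]:
        # e.g. (u'San', u'B', u'GPE'), (u'Francisco', u'I', u'GPE'), (u'considers', u'O', u'')
        print(token.text, token.ent_iob_, token.ent_type_)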
|
||||
|
||||
property ent_id:
|
||||
"""
|
||||
An (integer) entity ID. Usually assigned by patterns in the Matcher.
|
||||
"""ID of the entity the token is an instance of, if any. Usually
|
||||
assigned by patterns in the Matcher.
|
||||
|
||||
RETURNS (int): ID of the entity.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.ent_id
|
||||
|
@ -522,8 +550,10 @@ cdef class Token:
|
|||
self.c.ent_id = key
|
||||
|
||||
property ent_id_:
|
||||
"""
|
||||
A (string) entity ID. Usually assigned by patterns in the Matcher.
|
||||
"""ID of the entity the token is an instance of, if any. Usually
|
||||
assigned by patterns in the Matcher.
|
||||
|
||||
RETURNS (unicode): ID of the entity.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.ent_id]
|
||||
|
@ -564,6 +594,10 @@ cdef class Token:
|
|||
return self.vocab.strings[self.c.lex.lang]
|
||||
|
||||
property lemma_:
|
||||
"""Base form of the word, with no inflectional suffixes.
|
||||
|
||||
RETURNS (unicode): Token lemma.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lemma]
|
||||
def __set__(self, unicode lemma_):
|
||||
|
|
|
@ -145,7 +145,8 @@ def parse_package_meta(package_path, require=True):
|
|||
|
||||
|
||||
def is_in_jupyter():
|
||||
"""Check if user is in a Jupyter notebook. Mainly used for displaCy.
|
||||
"""Check if user is running spaCy from a Jupyter notebook by detecting the
|
||||
IPython kernel. Mainly used for the displaCy visualizer.
|
||||
|
||||
RETURNS (bool): True if in Jupyter, False if not.
|
||||
"""
|
||||
|
|
484
spacy/vocab.pyx
484
spacy/vocab.pyx
|
@ -36,79 +36,22 @@ EMPTY_LEXEME.vector = EMPTY_VEC
|
|||
|
||||
|
||||
cdef class Vocab:
|
||||
"""A look-up table that allows you to access `Lexeme` objects. The `Vocab`
|
||||
instance also provides access to the `StringStore`, and owns underlying
|
||||
C-data that is shared between `Doc` objects.
|
||||
"""
|
||||
A map container for a language's LexemeC structs.
|
||||
"""
|
||||
@classmethod
|
||||
def load(cls, path, lex_attr_getters=None, lemmatizer=True,
|
||||
tag_map=True, oov_prob=True, **deprecated_kwargs):
|
||||
"""
|
||||
Deprecated --- replace in spaCy 2
|
||||
Load the vocabulary from a path.
|
||||
|
||||
Arguments:
|
||||
path (Path):
|
||||
The path to load from.
|
||||
lex_attr_getters (dict):
|
||||
A dictionary mapping attribute IDs to functions to compute them.
|
||||
Defaults to None.
|
||||
lemmatizer (object):
|
||||
A lemmatizer. Defaults to None.
|
||||
tag_map (dict):
|
||||
A dictionary mapping fine-grained tags to coarse-grained parts-of-speech,
|
||||
and optionally morphological attributes.
|
||||
oov_prob (float):
|
||||
The default probability for out-of-vocabulary words.
|
||||
Returns:
|
||||
Vocab: The newly constructed vocab object.
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
|
||||
if 'vectors' in deprecated_kwargs:
|
||||
raise AttributeError(
|
||||
"vectors argument to Vocab.load() deprecated. "
|
||||
"Install vectors after loading.")
|
||||
if tag_map is True and (path / 'vocab' / 'tag_map.json').exists():
|
||||
with (path / 'vocab' / 'tag_map.json').open('r', encoding='utf8') as file_:
|
||||
tag_map = ujson.load(file_)
|
||||
elif tag_map is True:
|
||||
tag_map = None
|
||||
if lex_attr_getters is not None \
|
||||
and oov_prob is True \
|
||||
and (path / 'vocab' / 'oov_prob').exists():
|
||||
with (path / 'vocab' / 'oov_prob').open('r', encoding='utf8') as file_:
|
||||
oov_prob = float(file_.read())
|
||||
lex_attr_getters[PROB] = lambda text: oov_prob
|
||||
if lemmatizer is True:
|
||||
lemmatizer = Lemmatizer.load(path)
|
||||
|
||||
with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
|
||||
strings_list = ujson.load(file_)
|
||||
cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
|
||||
lemmatizer=lemmatizer,
|
||||
strings=strings_list)
|
||||
self.load_lexemes(path / 'vocab' / 'lexemes.bin')
|
||||
return self
|
||||
|
||||
|
||||
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
|
||||
strings=tuple(), **deprecated_kwargs):
|
||||
"""
|
||||
Create the vocabulary.
|
||||
"""Create the vocabulary.
|
||||
|
||||
lex_attr_getters (dict):
|
||||
A dictionary mapping attribute IDs to functions to compute them.
|
||||
Defaults to None.
|
||||
lemmatizer (object):
|
||||
A lemmatizer. Defaults to None.
|
||||
tag_map (dict):
|
||||
A dictionary mapping fine-grained tags to coarse-grained parts-of-speech,
|
||||
and optionally morphological attributes.
|
||||
oov_prob (float):
|
||||
The default probability for out-of-vocabulary words.
|
||||
|
||||
Returns:
|
||||
Vocab: The newly constructed vocab object.
|
||||
lex_attr_getters (dict): A dictionary mapping attribute IDs to functions
|
||||
to compute them. Defaults to `None`.
|
||||
tag_map (dict): A dictionary mapping fine-grained tags to coarse-grained
|
||||
parts-of-speech, and optionally morphological attributes.
|
||||
lemmatizer (object): A lemmatizer. Defaults to `None`.
|
||||
strings (StringStore): StringStore that maps strings to integers, and
|
||||
vice versa.
|
||||
RETURNS (Vocab): The newly constructed vocab object.
|
||||
"""
|
||||
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
|
||||
|
||||
|
@ -148,33 +91,32 @@ cdef class Vocab:
|
|||
return langfunc('_') if langfunc else ''
|
||||
|
||||
def __len__(self):
|
||||
"""
|
||||
The current number of lexemes stored.
|
||||
"""The current number of lexemes stored.
|
||||
|
||||
RETURNS (int): The current number of lexemes stored.
|
||||
"""
|
||||
return self.length
|
||||
|
||||
def add_flag(self, flag_getter, int flag_id=-1):
|
||||
"""
|
||||
Set a new boolean flag to words in the vocabulary.
|
||||
|
||||
The flag_setter function will be called over the words currently in the
|
||||
def add_flag(self, flag_getter, int flag_id=-1):
|
||||
"""Set a new boolean flag to words in the vocabulary.
|
||||
|
||||
The flag_getter function will be called over the words currently in the
|
||||
vocab, and then applied to new words as they occur. You'll then be able
|
||||
to access the flag value on each token, using token.check_flag(flag_id).
|
||||
See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`,
|
||||
`Token.check_flag`.
|
||||
|
||||
See also:
|
||||
Lexeme.set_flag, Lexeme.check_flag, Token.set_flag, Token.check_flag.
|
||||
flag_getter (callable): A function `f(unicode) -> bool`, to get the flag
|
||||
value.
|
||||
flag_id (int): An integer between 1 and 63 (inclusive), specifying
|
||||
the bit at which the flag will be stored. If -1, the lowest
|
||||
available bit will be chosen.
|
||||
RETURNS (int): The integer ID by which the flag value can be checked.
|
||||
|
||||
Arguments:
|
||||
flag_getter:
|
||||
A function f(unicode) -> bool, to get the flag value.
|
||||
|
||||
flag_id (int):
|
||||
An integer between 1 and 63 (inclusive), specifying the bit at which the
|
||||
flag will be stored. If -1, the lowest available bit will be
|
||||
chosen.
|
||||
|
||||
Returns:
|
||||
flag_id (int): The integer ID by which the flag value can be checked.
|
||||
EXAMPLE:
|
||||
>>> MY_PRODUCT = nlp.vocab.add_flag(lambda text: text in ['spaCy', 'dislaCy'])
|
||||
>>> doc = nlp(u'I like spaCy')
|
||||
>>> assert doc[2].check_flag(MY_PRODUCT) == True
|
||||
"""
|
||||
if flag_id == -1:
|
||||
for bit in range(1, 64):
|
||||
|
@ -196,9 +138,8 @@ cdef class Vocab:
|
|||
return flag_id
|
||||
|
||||
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
|
||||
"""
|
||||
Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
|
||||
if necessary, using memory acquired from the given pool. If the pool
|
||||
"""Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme`
|
||||
if necessary, using memory acquired from the given pool. If the pool
|
||||
is the lexicon's own memory, the lexeme is saved in the lexicon.
|
||||
"""
|
||||
if string == u'':
|
||||
|
@ -216,9 +157,8 @@ cdef class Vocab:
|
|||
return self._new_lexeme(mem, string)
|
||||
|
||||
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
|
||||
"""
|
||||
Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
|
||||
if necessary, using memory acquired from the given pool. If the pool
|
||||
"""Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme`
|
||||
if necessary, using memory acquired from the given pool. If the pool
|
||||
is the lexicon's own memory, the lexeme is saved in the lexicon.
|
||||
"""
|
||||
if orth == 0:
|
||||
|
@ -263,24 +203,19 @@ cdef class Vocab:
|
|||
self.length += 1
|
||||
|
||||
def __contains__(self, unicode string):
|
||||
"""
|
||||
Check whether the string has an entry in the vocabulary.
|
||||
"""Check whether the string has an entry in the vocabulary.
|
||||
|
||||
Arguments:
|
||||
string (unicode): The ID string.
|
||||
|
||||
Returns:
|
||||
bool Whether the string has an entry in the vocabulary.
|
||||
string (unicode): The ID string.
|
||||
RETURNS (bool): Whether the string has an entry in the vocabulary.
|
||||
"""
|
||||
key = hash_string(string)
|
||||
lex = self._by_hash.get(key)
|
||||
return lex is not NULL
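A minimal usage sketch, assuming nlp is a loaded pipeline:

    doc = nlp(u'I like apples')        # tokenizing text adds entries to the vocab
    assert u'apples' in nlp.vocab
    print(u'xyzzyplugh' in nlp.vocab)  # the check itself never creates an entry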
|
||||
|
||||
def __iter__(self):
|
||||
"""
|
||||
Iterate over the lexemes in the vocabulary.
|
||||
"""Iterate over the lexemes in the vocabulary.
|
||||
|
||||
Yields: Lexeme An entry in the vocabulary.
|
||||
YIELDS (Lexeme): An entry in the vocabulary.
|
||||
"""
|
||||
cdef attr_t orth
|
||||
cdef size_t addr
|
||||
|
@ -288,19 +223,19 @@ cdef class Vocab:
|
|||
yield Lexeme(self, orth)
|
||||
|
||||
def __getitem__(self, id_or_string):
|
||||
"""
|
||||
Retrieve a lexeme, given an int ID or a unicode string. If a previously
|
||||
unseen unicode string is given, a new lexeme is created and stored.
|
||||
"""Retrieve a lexeme, given an int ID or a unicode string. If a
|
||||
previously unseen unicode string is given, a new lexeme is created and
|
||||
stored.
|
||||
|
||||
Arguments:
|
||||
id_or_string (int or unicode):
|
||||
The integer ID of a word, or its unicode string.
|
||||
id_or_string (int or unicode): The integer ID of a word, or its unicode
|
||||
string. If `int >= Lexicon.size`, `IndexError` is raised. If
|
||||
`id_or_string` is neither an int nor a unicode string, `ValueError`
|
||||
is raised.
|
||||
RETURNS (Lexeme): The lexeme indicated by the given ID.
|
||||
|
||||
If an int >= Lexicon.size, IndexError is raised. If id_or_string
|
||||
is neither an int nor a unicode string, ValueError is raised.
|
||||
|
||||
Returns:
|
||||
lexeme (Lexeme): The lexeme indicated by the given ID.
|
||||
EXAMPLE:
|
||||
>>> apple = nlp.vocab.strings['apple']
|
||||
>>> assert nlp.vocab[apple] == nlp.vocab[u'apple']
|
||||
"""
|
||||
cdef attr_t orth
|
||||
if type(id_or_string) == unicode:
|
||||
|
@ -324,15 +259,29 @@ cdef class Vocab:
|
|||
return tokens
|
||||
|
||||
def to_disk(self, path):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
if not path.exists():
|
||||
path.mkdir()
|
||||
strings_loc = path / 'strings.json'
|
||||
with strings_loc.open('w', encoding='utf8') as file_:
|
||||
self.strings.dump(file_)
|
||||
self.dump(path / 'lexemes.bin')
|
||||
|
||||
# TODO: pickle
|
||||
# self.dump(path / 'lexemes.bin')
|
||||
|
||||
def from_disk(self, path):
|
||||
"""Loads state from a directory. Modifies the object in place and
|
||||
returns it.
|
||||
|
||||
path (unicode or Path): A path to a directory. Paths may be either
|
||||
strings or `Path`-like objects.
|
||||
RETURNS (Vocab): The modified `Vocab` object.
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
|
||||
strings_list = ujson.load(file_)
|
||||
|
@ -340,6 +289,23 @@ cdef class Vocab:
|
|||
self.strings[string]
|
||||
self.load_lexemes(path / 'lexemes.bin')
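A usage sketch for the disk helpers, assuming nlp is a loaded pipeline and /tmp/vocab is writable; the on-disk layout (vocab/ subdirectory vs. flat files) is still in flux in this diff, so treat the paths as illustrative:

    from spacy.vocab import Vocab
    nlp.vocab.to_disk('/tmp/vocab')            # writes strings.json and lexemes.bin
    vocab = Vocab().from_disk('/tmp/vocab')    # documented to return the modified Vocab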
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
"""Serialize the current state to a binary string.
|
||||
|
||||
**exclude: Named attributes to prevent from being serialized.
|
||||
RETURNS (bytes): The serialized form of the `Vocab` object.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
"""Load state from a binary string.
|
||||
|
||||
bytes_data (bytes): The data to load from.
|
||||
**exclude: Named attributes to prevent from being loaded.
|
||||
RETURNS (Vocab): The `Vocab` object.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def lexemes_to_bytes(self, **exclude):
|
||||
cdef hash_t key
|
||||
cdef size_t addr
|
||||
|
@ -365,9 +331,7 @@ cdef class Vocab:
|
|||
return byte_string
|
||||
|
||||
def lexemes_from_bytes(self, bytes bytes_data):
|
||||
"""
|
||||
Load the binary vocabulary data from the given string.
|
||||
"""
|
||||
"""Load the binary vocabulary data from the given string."""
|
||||
cdef LexemeC* lexeme
|
||||
cdef hash_t key
|
||||
cdef unicode py_str
|
||||
|
@ -391,16 +355,12 @@ cdef class Vocab:
|
|||
self.length += 1
|
||||
|
||||
# Deprecated --- delete these once stable
|
||||
|
||||
def dump_vectors(self, out_loc):
|
||||
"""
|
||||
Save the word vectors to a binary file.
|
||||
|
||||
Arguments:
|
||||
loc (Path): The path to save to.
|
||||
Returns:
|
||||
None
|
||||
#"""
|
||||
def dump_vectors(self, out_loc):
|
||||
"""Save the word vectors to a binary file.
|
||||
|
||||
loc (Path): The path to save to.
|
||||
"""
|
||||
cdef int32_t vec_len = self.vectors_length
|
||||
cdef int32_t word_len
|
||||
cdef bytes word_str
|
||||
|
@ -424,17 +384,14 @@ cdef class Vocab:
|
|||
|
||||
|
||||
def load_vectors(self, file_):
|
||||
"""
|
||||
Load vectors from a text-based file.
|
||||
"""Load vectors from a text-based file.
|
||||
|
||||
Arguments:
|
||||
file_ (buffer): The file to read from. Entries should be separated by newlines,
|
||||
and each entry should be whitespace delimited. The first value of the entry
|
||||
should be the word string, and subsequent entries should be the values of the
|
||||
vector.
|
||||
file_ (buffer): The file to read from. Entries should be separated by
|
||||
newlines, and each entry should be whitespace delimited. The first value of the entry
|
||||
should be the word string, and subsequent entries should be the values of the
|
||||
vector.
|
||||
|
||||
Returns:
|
||||
vec_len (int): The length of the vectors loaded.
|
||||
RETURNS (int): The length of the vectors loaded.
|
||||
"""
|
||||
cdef LexemeC* lexeme
|
||||
cdef attr_t orth
|
||||
|
@ -464,14 +421,11 @@ cdef class Vocab:
|
|||
return vec_len
|
||||
|
||||
def load_vectors_from_bin_loc(self, loc):
|
||||
"""
|
||||
Load vectors from the location of a binary file.
|
||||
"""Load vectors from the location of a binary file.
|
||||
|
||||
Arguments:
|
||||
loc (unicode): The path of the binary file to load from.
|
||||
loc (unicode): The path of the binary file to load from.
|
||||
|
||||
Returns:
|
||||
vec_len (int): The length of the vectors loaded.
|
||||
RETURNS (int): The length of the vectors loaded.
|
||||
"""
|
||||
cdef CFile file_ = CFile(loc, b'rb')
|
||||
cdef int32_t word_len
|
||||
|
@ -526,12 +480,10 @@ cdef class Vocab:
|
|||
|
||||
|
||||
def resize_vectors(self, int new_size):
|
||||
"""
|
||||
Set vectors_length to a new size, and allocate more memory for the Lexeme
|
||||
vectors if necessary. The memory will be zeroed.
|
||||
"""Set vectors_length to a new size, and allocate more memory for the
|
||||
`Lexeme` vectors if necessary. The memory will be zeroed.
|
||||
|
||||
Arguments:
|
||||
new_size (int): The new size of the vectors.
|
||||
new_size (int): The new size of the vectors.
|
||||
"""
|
||||
cdef hash_t key
|
||||
cdef size_t addr
|
||||
|
@ -633,237 +585,3 @@ class VectorReadError(Exception):
|
|||
"Vector size: %d\n"
|
||||
"Max size: %d\n"
|
||||
"Min size: 1\n" % (loc, size, MAX_VEC_SIZE))
|
||||
|
||||
|
||||
#
|
||||
#Deprecated --- delete these once stable
|
||||
#
|
||||
# def dump_vectors(self, out_loc):
|
||||
# """
|
||||
# Save the word vectors to a binary file.
|
||||
#
|
||||
# Arguments:
|
||||
# loc (Path): The path to save to.
|
||||
# Returns:
|
||||
# None
|
||||
# #"""
|
||||
# cdef int32_t vec_len = self.vectors_length
|
||||
# cdef int32_t word_len
|
||||
# cdef bytes word_str
|
||||
# cdef char* chars
|
||||
#
|
||||
# cdef Lexeme lexeme
|
||||
# cdef CFile out_file = CFile(out_loc, 'wb')
|
||||
# for lexeme in self:
|
||||
# word_str = lexeme.orth_.encode('utf8')
|
||||
# vec = lexeme.c.vector
|
||||
# word_len = len(word_str)
|
||||
#
|
||||
# out_file.write_from(&word_len, 1, sizeof(word_len))
|
||||
# out_file.write_from(&vec_len, 1, sizeof(vec_len))
|
||||
#
|
||||
# chars = <char*>word_str
|
||||
# out_file.write_from(chars, word_len, sizeof(char))
|
||||
# out_file.write_from(vec, vec_len, sizeof(float))
|
||||
# out_file.close()
|
||||
#
|
||||
#
|
||||
#
|
||||
# def load_vectors(self, file_):
|
||||
# """
|
||||
# Load vectors from a text-based file.
|
||||
#
|
||||
# Arguments:
|
||||
# file_ (buffer): The file to read from. Entries should be separated by newlines,
|
||||
# and each entry should be whitespace delimited. The first value of the entry
|
||||
# should be the word string, and subsequent entries should be the values of the
|
||||
# vector.
|
||||
#
|
||||
# Returns:
|
||||
# vec_len (int): The length of the vectors loaded.
|
||||
# """
|
||||
# cdef LexemeC* lexeme
|
||||
# cdef attr_t orth
|
||||
# cdef int32_t vec_len = -1
|
||||
# cdef double norm = 0.0
|
||||
#
|
||||
# whitespace_pattern = re.compile(r'\s', re.UNICODE)
|
||||
#
|
||||
# for line_num, line in enumerate(file_):
|
||||
# pieces = line.split()
|
||||
# word_str = " " if whitespace_pattern.match(line) else pieces.pop(0)
|
||||
# if vec_len == -1:
|
||||
# vec_len = len(pieces)
|
||||
# elif vec_len != len(pieces):
|
||||
# raise VectorReadError.mismatched_sizes(file_, line_num,
|
||||
# vec_len, len(pieces))
|
||||
# orth = self.strings[word_str]
|
||||
# lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth)
|
||||
# lexeme.vector = <float*>self.mem.alloc(vec_len, sizeof(float))
|
||||
# for i, val_str in enumerate(pieces):
|
||||
# lexeme.vector[i] = float(val_str)
|
||||
# norm = 0.0
|
||||
# for i in range(vec_len):
|
||||
# norm += lexeme.vector[i] * lexeme.vector[i]
|
||||
# lexeme.l2_norm = sqrt(norm)
|
||||
# self.vectors_length = vec_len
|
||||
# return vec_len
|
||||
#
|
||||
# def load_vectors_from_bin_loc(self, loc):
|
||||
# """
|
||||
# Load vectors from the location of a binary file.
|
||||
#
|
||||
# Arguments:
|
||||
# loc (unicode): The path of the binary file to load from.
|
||||
#
|
||||
# Returns:
|
||||
# vec_len (int): The length of the vectors loaded.
|
||||
# """
|
||||
# cdef CFile file_ = CFile(loc, b'rb')
|
||||
# cdef int32_t word_len
|
||||
# cdef int32_t vec_len = 0
|
||||
# cdef int32_t prev_vec_len = 0
|
||||
# cdef float* vec
|
||||
# cdef Address mem
|
||||
# cdef attr_t string_id
|
||||
# cdef bytes py_word
|
||||
# cdef vector[float*] vectors
|
||||
# cdef int line_num = 0
|
||||
# cdef Pool tmp_mem = Pool()
|
||||
# while True:
|
||||
# try:
|
||||
# file_.read_into(&word_len, sizeof(word_len), 1)
|
||||
# except IOError:
|
||||
# break
|
||||
# file_.read_into(&vec_len, sizeof(vec_len), 1)
|
||||
# if prev_vec_len != 0 and vec_len != prev_vec_len:
|
||||
# raise VectorReadError.mismatched_sizes(loc, line_num,
|
||||
# vec_len, prev_vec_len)
|
||||
# if 0 >= vec_len >= MAX_VEC_SIZE:
|
||||
# raise VectorReadError.bad_size(loc, vec_len)
|
||||
#
|
||||
# chars = <char*>file_.alloc_read(tmp_mem, word_len, sizeof(char))
|
||||
# vec = <float*>file_.alloc_read(self.mem, vec_len, sizeof(float))
|
||||
#
|
||||
# string_id = self.strings[chars[:word_len]]
|
||||
# # Insert words into vocab to add vector.
|
||||
# self.get_by_orth(self.mem, string_id)
|
||||
# while string_id >= vectors.size():
|
||||
# vectors.push_back(EMPTY_VEC)
|
||||
# assert vec != NULL
|
||||
# vectors[string_id] = vec
|
||||
# line_num += 1
|
||||
# cdef LexemeC* lex
|
||||
# cdef size_t lex_addr
|
||||
# cdef double norm = 0.0
|
||||
# cdef int i
|
||||
# for orth, lex_addr in self._by_orth.items():
|
||||
# lex = <LexemeC*>lex_addr
|
||||
# if lex.lower < vectors.size():
|
||||
# lex.vector = vectors[lex.lower]
|
||||
# norm = 0.0
|
||||
# for i in range(vec_len):
|
||||
# norm += lex.vector[i] * lex.vector[i]
|
||||
# lex.l2_norm = sqrt(norm)
|
||||
# else:
|
||||
# lex.vector = EMPTY_VEC
|
||||
# self.vectors_length = vec_len
|
||||
# return vec_len
|
||||
#
|
||||
#
|
||||
#def write_binary_vectors(in_loc, out_loc):
|
||||
# cdef CFile out_file = CFile(out_loc, 'wb')
|
||||
# cdef Address mem
|
||||
# cdef int32_t word_len
|
||||
# cdef int32_t vec_len
|
||||
# cdef char* chars
|
||||
# with bz2.BZ2File(in_loc, 'r') as file_:
|
||||
# for line in file_:
|
||||
# pieces = line.split()
|
||||
# word = pieces.pop(0)
|
||||
# mem = Address(len(pieces), sizeof(float))
|
||||
# vec = <float*>mem.ptr
|
||||
# for i, val_str in enumerate(pieces):
|
||||
# vec[i] = float(val_str)
|
||||
#
|
||||
# word_len = len(word)
|
||||
# vec_len = len(pieces)
|
||||
#
|
||||
# out_file.write_from(&word_len, 1, sizeof(word_len))
|
||||
# out_file.write_from(&vec_len, 1, sizeof(vec_len))
|
||||
#
|
||||
# chars = <char*>word
|
||||
# out_file.write_from(chars, len(word), sizeof(char))
|
||||
# out_file.write_from(vec, vec_len, sizeof(float))
|
||||
#
|
||||
#
|
||||
# def resize_vectors(self, int new_size):
|
||||
# """
|
||||
# Set vectors_length to a new size, and allocate more memory for the Lexeme
|
||||
# vectors if necessary. The memory will be zeroed.
|
||||
#
|
||||
# Arguments:
|
||||
# new_size (int): The new size of the vectors.
|
||||
# """
|
||||
# cdef hash_t key
|
||||
# cdef size_t addr
|
||||
# if new_size > self.vectors_length:
|
||||
# for key, addr in self._by_hash.items():
|
||||
# lex = <LexemeC*>addr
|
||||
# lex.vector = <float*>self.mem.realloc(lex.vector,
|
||||
# new_size * sizeof(lex.vector[0]))
|
||||
# self.vectors_length = new_size
|
||||
#
|
||||
#
|
||||
|
||||
#
|
||||
# def dump(self, loc=None):
|
||||
# """
|
||||
# Save the lexemes binary data to the given location, or
|
||||
# return a byte-string with the data if loc is None.
|
||||
#
|
||||
# Arguments:
|
||||
# loc (Path or None): The path to save to, or None.
|
||||
# """
|
||||
# if loc is None:
|
||||
# return self.to_bytes()
|
||||
# else:
|
||||
# return self.to_disk(loc)
|
||||
#
|
||||
# def load_lexemes(self, loc):
|
||||
# """
|
||||
# Load the binary vocabulary data from the given location.
|
||||
#
|
||||
# Arguments:
|
||||
# loc (Path): The path to load from.
|
||||
#
|
||||
# Returns:
|
||||
# None
|
||||
# """
|
||||
# fp = CFile(loc, 'rb',
|
||||
# on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc))
|
||||
# cdef LexemeC* lexeme = NULL
|
||||
# cdef SerializedLexemeC lex_data
|
||||
# cdef hash_t key
|
||||
# cdef unicode py_str
|
||||
# cdef attr_t orth = 0
|
||||
# assert sizeof(orth) == sizeof(lexeme.orth)
|
||||
# i = 0
|
||||
# while True:
|
||||
# try:
|
||||
# fp.read_into(&orth, 1, sizeof(orth))
|
||||
# except IOError:
|
||||
# break
|
||||
# lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
|
||||
# # Copy data from the file into the lexeme
|
||||
# fp.read_into(&lex_data.data, 1, sizeof(lex_data.data))
|
||||
# Lexeme.c_from_bytes(lexeme, lex_data)
|
||||
#
|
||||
# lexeme.vector = EMPTY_VEC
|
||||
# py_str = self.strings[lexeme.orth]
|
||||
# key = hash_string(py_str)
|
||||
# self._by_hash.set(key, lexeme)
|
||||
# self._by_orth.set(lexeme.orth, lexeme)
|
||||
# self.length += 1
|
||||
# i += 1
|
||||
# fp.close()
|
||||
|
|
|
@ -80,6 +80,7 @@
|
|||
}
|
||||
],
|
||||
|
||||
"ALPHA": true,
|
||||
"V_CSS": "1.6",
|
||||
"V_JS": "1.2",
|
||||
"DEFAULT_SYNTAX": "python",
|
||||
|
|
|
@ -34,17 +34,17 @@ mixin src(url)
|
|||
+a(url)
|
||||
block
|
||||
|
||||
| #[+icon("code", 16).o-icon--inline.u-color-subtle]
|
||||
| #[+icon("code", 16).o-icon--inline.u-color-theme]
|
||||
|
||||
|
||||
//- API link (with added tag and automatically generated path)
|
||||
path - [string] path to API docs page relative to /docs/api/
|
||||
|
||||
mixin api(path)
|
||||
+a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block
|
||||
+a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block.u-nowrap
|
||||
block
|
||||
|
||||
| #[+icon("book", 18).o-icon--inline.u-color-subtle]
|
||||
| #[+icon("book", 18).o-icon--inline.u-color-theme]
|
||||
|
||||
|
||||
//- Help icon with tooltip
|
||||
|
@ -104,15 +104,31 @@ mixin button(url, trusted, ...style)
|
|||
language - [string] language for syntax highlighting (default: "python")
|
||||
supports basic relevant languages available for PrismJS
|
||||
|
||||
mixin code(label, language)
|
||||
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}")&attributes(attributes)
|
||||
mixin code(label, language, icon)
|
||||
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : "")&attributes(attributes)
|
||||
if label
|
||||
h4.u-text-label.u-text-label--dark=label
|
||||
|
||||
if icon
|
||||
- var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
|
||||
.c-code-block__icon(class=classes[icon] || "")
|
||||
+icon(icon, 18)
|
||||
|
||||
code.c-code-block__content
|
||||
block
|
||||
|
||||
|
||||
//- Code blocks to display old/new versions
|
||||
|
||||
mixin code-old()
|
||||
+code(false, false, "reject").o-block-small
|
||||
block
|
||||
|
||||
mixin code-new()
|
||||
+code(false, false, "accept").o-block-small
|
||||
block
|
||||
|
||||
|
||||
//- CodePen embed
|
||||
slug - [string] ID of CodePen demo (taken from URL)
|
||||
height - [integer] height of demo embed iframe
|
||||
|
@ -164,6 +180,16 @@ mixin tag()
|
|||
block
|
||||
|
||||
|
||||
//- "Requires model" tag with tooltip and list of capabilities
|
||||
...capabs - [string] Required model capabilities, e.g. "vectors".
|
||||
|
||||
mixin tag-model(...capabs)
|
||||
- var intro = "To use this functionality, spaCy needs a model to be installed"
|
||||
- var ext = capabs.length ? " that supports the following capabilities: " + capabs.join(', ') : ""
|
||||
+tag Requires model
|
||||
+help(intro + ext + ".").u-color-theme
|
||||
|
||||
|
||||
//- List
|
||||
type - [string] "numbers", "letters", "roman" (bulleted list if none set)
|
||||
start - [integer] start number
|
||||
|
|
|
@ -9,6 +9,9 @@ nav.c-nav.u-text.js-nav(class=landing ? "c-nav--theme" : null)
|
|||
.u-text-label.u-padding-small.u-hidden-xs=SUBSECTION
|
||||
|
||||
ul.c-nav__menu
|
||||
if ALPHA
|
||||
- var NAVIGATION = { "Usage": "/docs/usage", "Reference": "/docs/api" }
|
||||
|
||||
each url, item in NAVIGATION
|
||||
li.c-nav__menu__item(class=(url == "/") ? "u-hidden-xs" : null)
|
||||
+a(url)=item
|
||||
|
|
|
@ -10,6 +10,14 @@ main.o-main.o-main--sidebar.o-main--aside
|
|||
if tag
|
||||
+tag=tag
|
||||
|
||||
if ALPHA
|
||||
+infobox("⚠️ You are viewing the spaCy v2.0 alpha docs")
|
||||
| This page is part of the alpha documentation for spaCy v2.0
|
||||
| and does not reflect the state of the latest stable release.
|
||||
| #[+a("#") See here] for more information on how to install
|
||||
| and test the new version. To read the official docs for
|
||||
| v1.x, #[+a("https://spacy.io/docs") go here].
|
||||
|
||||
!=yield
|
||||
|
||||
+grid.o-content.u-text
|
||||
|
|
|
@ -35,7 +35,10 @@ html(lang="en")
|
|||
link(rel="shortcut icon" href="/assets/img/favicon.ico")
|
||||
link(rel="icon" type="image/x-icon" href="/assets/img/favicon.ico")
|
||||
|
||||
if SUBSECTION == "usage"
|
||||
if ALPHA && SECTION == "docs"
|
||||
link(href="/assets/css/style_green.css?v#{V_CSS}" rel="stylesheet")
|
||||
|
||||
else if SUBSECTION == "usage"
|
||||
link(href="/assets/css/style_red.css?v#{V_CSS}" rel="stylesheet")
|
||||
|
||||
else
|
||||
|
|
|
@ -13,6 +13,17 @@
|
|||
white-space: pre
|
||||
direction: ltr
|
||||
|
||||
&.c-code-block--has-icon
|
||||
padding: 0
|
||||
display: flex
|
||||
|
||||
.c-code-block__icon
|
||||
padding: 0 0 0 1rem
|
||||
display: flex
|
||||
justify-content: center
|
||||
align-items: center
|
||||
border-left: 6px solid
|
||||
|
||||
|
||||
//- Code block content
|
||||
|
||||
|
@ -26,8 +37,8 @@
|
|||
|
||||
*:not(.c-code-block) > code
|
||||
font: normal 600 0.8em/#{1} $font-code
|
||||
background: rgba($color-front, 0.05)
|
||||
box-shadow: 1px 1px 0 rgba($color-front, 0.1)
|
||||
background: darken($color-theme-light, 5)
|
||||
box-shadow: 1px 1px 0 rgba($color-front, 0.05)
|
||||
text-shadow: 1px 1px 0 rgba($color-back, 0.5)
|
||||
color: $color-front
|
||||
padding: 0.1em 0.5em
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
background: rgba($color-subtle-light, 0.35)
|
||||
|
||||
&.c-table__row--foot
|
||||
background: rgba($color-theme, 0.025)
|
||||
background: $color-theme-light
|
||||
border-top: 2px solid $color-theme
|
||||
|
||||
.c-table__cell:first-child
|
||||
|
|
|
@ -11,9 +11,8 @@
|
|||
background: $color-front
|
||||
border-radius: 2px
|
||||
color: $color-back
|
||||
font-family: inherit
|
||||
font-size: 1.3rem
|
||||
line-height: 1.25
|
||||
font: normal 1.3rem/#{1.25} $font-primary
|
||||
text-transform: none
|
||||
opacity: 0
|
||||
padding: 0.5em 0.75em
|
||||
transform: translateX(-50%) translateY(-2px)
|
||||
|
|
|
@ -26,8 +26,7 @@ $font-code: 'Source Code Pro', Consolas, 'Andale Mono', Menlo, Monaco, Courier,
|
|||
|
||||
// Colors
|
||||
|
||||
$colors: ( blue: #09a3d5, red: #d9515d )
|
||||
$colors-light: (blue: #cceaf4, red: #f9d7da)
|
||||
$colors: ( blue: #09a3d5, red: #d9515d, green: #08c35e )
|
||||
|
||||
$color-back: #fff !default
|
||||
$color-front: #1a1e23 !default
|
||||
|
@ -35,7 +34,7 @@ $color-dark: lighten($color-front, 20) !default
|
|||
|
||||
$color-theme: map-get($colors, $theme)
|
||||
$color-theme-dark: darken(map-get($colors, $theme), 5)
|
||||
$color-theme-light: map-get($colors-light, $theme)
|
||||
$color-theme-light: rgba($color-theme, 0.05)
|
||||
|
||||
$color-subtle: #ddd !default
|
||||
$color-subtle-light: #f6f6f6 !default
|
||||
|
|
4	website/assets/css/style_green.sass (new file)
|
@ -0,0 +1,4 @@
|
|||
//- 💫 STYLESHEET (GREEN)
|
||||
|
||||
$theme: green
|
||||
@import style
|
|
@ -30,5 +30,11 @@
|
|||
<symbol id="help" viewBox="0 0 24 24">
|
||||
<path d="M12 6c2.203 0 3.984 1.781 3.984 3.984 0 2.484-3 2.766-3 5.016h-1.969c0-3.234 3-3 3-5.016 0-1.078-0.938-1.969-2.016-1.969s-2.016 0.891-2.016 1.969h-1.969c0-2.203 1.781-3.984 3.984-3.984zM12 20.016c4.406 0 8.016-3.609 8.016-8.016s-3.609-8.016-8.016-8.016-8.016 3.609-8.016 8.016 3.609 8.016 8.016 8.016zM12 2.016c5.531 0 9.984 4.453 9.984 9.984s-4.453 9.984-9.984 9.984-9.984-4.453-9.984-9.984 4.453-9.984 9.984-9.984zM11.016 18v-2.016h1.969v2.016h-1.969z"/>
|
||||
</symbol>
|
||||
<symbol id="reject" viewBox="0 0 24 24">
|
||||
<path d="M18.984 6.422l-5.578 5.578 5.578 5.578-1.406 1.406-5.578-5.578-5.578 5.578-1.406-1.406 5.578-5.578-5.578-5.578 1.406-1.406 5.578 5.578 5.578-5.578z"/>
|
||||
</symbol>
|
||||
<symbol id="accept" viewBox="0 0 24 24">
|
||||
<path d="M9 16.172l10.594-10.594 1.406 1.406-12 12-5.578-5.578 1.406-1.406z"/>
|
||||
</symbol>
|
||||
</defs>
|
||||
</svg>
|
||||
|
|
(SVG image: 5.4 KiB before, 5.8 KiB after)
BIN	website/assets/img/pattern_green.jpg (new file, 216 KiB; binary file not shown)
|
@ -2,8 +2,13 @@
|
|||
"sidebar": {
|
||||
"Introduction": {
|
||||
"Facts & Figures": "./",
|
||||
"Languages": "language-models",
|
||||
"Philosophy": "philosophy"
|
||||
"Languages": "language-models"
|
||||
},
|
||||
"Top-level": {
|
||||
"spacy": "spacy",
|
||||
"displacy": "displacy",
|
||||
"Utility Functions": "util",
|
||||
"Command line": "cli"
|
||||
},
|
||||
"Classes": {
|
||||
"Doc": "doc",
|
||||
|
@ -21,9 +26,6 @@
|
|||
"GoldParse": "goldparse"
|
||||
},
|
||||
"Other": {
|
||||
"Command line": "cli",
|
||||
"displaCy": "displacy",
|
||||
"Utility Functions": "util",
|
||||
"Annotation Specs": "annotation",
|
||||
"Feature Scheme": "features"
|
||||
}
|
||||
|
@ -43,6 +45,26 @@
|
|||
"title": "Philosophy"
|
||||
},
|
||||
|
||||
"spacy": {
|
||||
"title": "spaCy top-level functions",
|
||||
"next": "displacy"
|
||||
},
|
||||
|
||||
"displacy": {
|
||||
"title": "displaCy",
|
||||
"tag": "module",
|
||||
"next": "util"
|
||||
},
|
||||
|
||||
"util": {
|
||||
"title": "Utility Functions",
|
||||
"next": "cli"
|
||||
},
|
||||
|
||||
"cli": {
|
||||
"title": "Command Line Interface"
|
||||
},
|
||||
|
||||
"language": {
|
||||
"title": "Language",
|
||||
"tag": "class"
|
||||
|
@ -113,20 +135,6 @@
|
|||
"tag": "class"
|
||||
},
|
||||
|
||||
"cli": {
|
||||
"title": "Command Line Interface",
|
||||
"next": "displacy"
|
||||
},
|
||||
|
||||
"displacy": {
|
||||
"title": "displaCy",
|
||||
"tag": "module"
|
||||
},
|
||||
|
||||
"util": {
|
||||
"title": "Utility Functions"
|
||||
},
|
||||
|
||||
"annotation": {
|
||||
"title": "Annotation Specifications"
|
||||
},
|
||||
|
|
|
@ -71,6 +71,44 @@ include _annotation/_dep-labels
|
|||
|
||||
include _annotation/_named-entities
|
||||
|
||||
+h(3, "biluo") BILUO Scheme
|
||||
|
||||
p
|
||||
| spaCy translates character offsets into the BILUO scheme, in order to
|
||||
| decide the cost of each action given the current state of the entity
|
||||
| recognizer. The costs are then used to calculate the gradient of the
|
||||
| loss, to train the model.
|
||||
|
||||
+aside("Why BILUO, not IOB?")
|
||||
| There are several coding schemes for encoding entity annotations as
|
||||
| token tags. These coding schemes are equally expressive, but not
|
||||
| necessarily equally learnable.
|
||||
| #[+a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth]
|
||||
| showed that the minimal #[strong Begin], #[strong In], #[strong Out]
|
||||
| scheme was more difficult to learn than the #[strong BILUO] scheme that
|
||||
| we use, which explicitly marks boundary tokens.
|
||||
|
||||
+table([ "Tag", "Description" ])
|
||||
+row
|
||||
+cell #[code #[span.u-color-theme B] EGIN]
|
||||
+cell The first token of a multi-token entity.
|
||||
|
||||
+row
|
||||
+cell #[code #[span.u-color-theme I] N]
|
||||
+cell An inner token of a multi-token entity.
|
||||
|
||||
+row
|
||||
+cell #[code #[span.u-color-theme L] AST]
|
||||
+cell The final token of a multi-token entity.
|
||||
|
||||
+row
|
||||
+cell #[code #[span.u-color-theme U] NIT]
|
||||
+cell A single-token entity.
|
||||
|
||||
+row
|
||||
+cell #[code #[span.u-color-theme O] UT]
|
||||
+cell A non-entity token.
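To make the scheme concrete, here is how one sentence's entities would be encoded as per-token BILUO tags (a plain-Python illustration, independent of any particular spaCy API):

    # "I like London and Berlin."  ->  one tag per token
    tokens = ['I', 'like', 'London', 'and', 'Berlin', '.']
    biluo  = ['O', 'O',    'U-GPE',  'O',   'U-GPE',  'O']
    # a multi-token entity such as "New York City" would instead be tagged
    # ['B-GPE', 'I-GPE', 'L-GPE']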
|
||||
|
||||
+h(2, "json-input") JSON input format for training
|
||||
|
||||
p
|
||||
|
|
|
@ -10,11 +10,11 @@ p
|
|||
+aside("Why python -m?")
|
||||
| The problem with a global entry point is that it's resolved by looking up
|
||||
| entries in your #[code PATH] environment variable. This can give you
|
||||
| unexpected results, especially when using #[code virtualenv]. For
|
||||
| instance, you may have spaCy installed on your system but not in your
|
||||
| current environment. The command will then execute the wrong
|
||||
| spaCy installation. #[code python -m] prevents fallbacks to system modules
|
||||
| and makes sure the correct version of spaCy is used.
|
||||
| unexpected results, like executing the wrong spaCy installation
|
||||
| (especially when using #[code virtualenv]). #[code python -m] prevents
|
||||
| fallbacks to system modules and makes sure the correct spaCy version is
|
||||
| used. If you hate typing it every time, we recommend creating an
|
||||
| #[code alias] instead.
|
||||
|
||||
+h(2, "download") Download
|
||||
|
||||
|
@ -45,13 +45,24 @@ p
|
|||
+cell flag
|
||||
+cell Show help message and available arguments.
|
||||
|
||||
+infobox("Important note")
|
||||
| The #[code download] command is mostly intended as a convenient,
|
||||
| interactive wrapper – it performs compatibility checks and prints
|
||||
| detailed messages in case things go wrong. It's #[strong not recommended]
|
||||
| to use this command as part of an automated process. If you know which
|
||||
| model your project needs, you should consider a
|
||||
| #[+a("/docs/usage/models#download-pip") direct download via pip], or
|
||||
| uploading the model to a local PyPi installation and fetching it straight
|
||||
| from there. This will also allow you to add it as a versioned package
|
||||
| dependency to your project.
|
||||
|
||||
+h(2, "link") Link
|
||||
|
||||
p
|
||||
| Create a #[+a("/docs/usage/models#usage") shortcut link] for a model,
|
||||
| either a Python package or a local directory. This will let you load
|
||||
| models from any location via #[code spacy.load()].
|
||||
| models from any location using a custom name via
|
||||
| #[+api("spacy#load") #[code spacy.load()]].
|
||||
|
||||
+code(false, "bash").
|
||||
python -m spacy link [origin] [link_name] [--force]
|
||||
|
@ -92,7 +103,7 @@ p
|
|||
+row
|
||||
+cell #[code model]
|
||||
+cell positional
|
||||
+cell Shortcut link of model (optional).
|
||||
+cell A model, i.e. shortcut link, package name or path (optional).
|
||||
|
||||
+row
|
||||
+cell #[code --markdown], #[code -md]
|
||||
|
@ -114,7 +125,7 @@ p
|
|||
| the input file. Currently only supports #[code .conllu].
|
||||
|
||||
+code(false, "bash").
|
||||
python -m spacy convert [input_file] [output_dir] [--n_sents] [--morphology]
|
||||
python -m spacy convert [input_file] [output_dir] [--n-sents] [--morphology]
|
||||
|
||||
+table(["Argument", "Type", "Description"])
|
||||
+row
|
||||
|
@ -128,7 +139,7 @@ p
|
|||
+cell Output directory for converted JSON file.
|
||||
|
||||
+row
|
||||
+cell #[code --n_sents], #[code -n]
|
||||
+cell #[code --n-sents], #[code -n]
|
||||
+cell option
|
||||
+cell Number of sentences per document.
|
||||
|
||||
|
@ -191,7 +202,7 @@ p
|
|||
| #[+a("/docs/api/annotation#json-input") JSON format].
|
||||
|
||||
+code(false, "bash").
|
||||
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner]
|
||||
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--parser-L1] [--no-tagger] [--no-parser] [--no-ner]
|
||||
|
||||
+table(["Argument", "Type", "Description"])
|
||||
+row
|
||||
|
@ -215,27 +226,37 @@ p
|
|||
+cell Location of JSON-formatted dev data (optional).
|
||||
|
||||
+row
|
||||
+cell #[code --n_iter], #[code -n]
|
||||
+cell #[code --n-iter], #[code -n]
|
||||
+cell option
|
||||
+cell Number of iterations (default: #[code 15]).
|
||||
|
||||
+row
|
||||
+cell #[code --parser_L1], #[code -L]
|
||||
+cell #[code --nsents]
|
||||
+cell option
|
||||
+cell Number of sentences (default: #[code 0]).
|
||||
|
||||
+row
|
||||
+cell #[code --parser-L1], #[code -L]
|
||||
+cell option
|
||||
+cell L1 regularization penalty for parser (default: #[code 0.0]).
|
||||
|
||||
+row
|
||||
+cell #[code --no_tagger], #[code -T]
|
||||
+cell #[code --use-gpu], #[code -g]
|
||||
+cell flag
|
||||
+cell Use GPU.
|
||||
|
||||
+row
|
||||
+cell #[code --no-tagger], #[code -T]
|
||||
+cell flag
|
||||
+cell Don't train tagger.
|
||||
|
||||
+row
|
||||
+cell #[code --no_parser], #[code -P]
|
||||
+cell #[code --no-parser], #[code -P]
|
||||
+cell flag
|
||||
+cell Don't train parser.
|
||||
|
||||
+row
|
||||
+cell #[code --no_ner], #[code -N]
|
||||
+cell #[code --no-ner], #[code -N]
|
||||
+cell flag
|
||||
+cell Don't train NER.
|
||||
|
||||
|
|
|
@ -4,32 +4,6 @@ include ../../_includes/_mixins
|
|||
|
||||
p Annotate syntactic dependencies on #[code Doc] objects.
|
||||
|
||||
+h(2, "load") DependencyParser.load
|
||||
+tag classmethod
|
||||
|
||||
p Load the statistical model from the supplied path.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell #[code Path]
|
||||
+cell The path to load from.
|
||||
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell The vocabulary. Must be shared by the documents to be processed.
|
||||
|
||||
+row
|
||||
+cell #[code require]
|
||||
+cell bool
|
||||
+cell Whether to raise an error if the files are not found.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code DependencyParser]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "init") DependencyParser.__init__
|
||||
+tag method
|
||||
|
||||
|
@ -47,7 +21,7 @@ p Create a #[code DependencyParser].
|
|||
+cell The statistical model.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code DependencyParser]
|
||||
+cell The newly constructed object.
|
||||
|
||||
|
@ -65,7 +39,7 @@ p
|
|||
+cell The document to be processed.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
|
||||
|
@ -93,7 +67,7 @@ p Process a stream of documents.
|
|||
| parallel.
|
||||
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell yields
|
||||
+cell #[code Doc]
|
||||
+cell Documents, in order.
|
||||
|
||||
|
@ -114,7 +88,7 @@ p Update the statistical model.
|
|||
+cell The gold-standard annotations, to calculate the loss.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell The loss on this example.
|
||||
|
||||
|
@ -130,6 +104,6 @@ p Set up a stepwise state, to introspect and control the transition sequence.
|
|||
+cell The document to step through.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code StepwiseState]
|
||||
+cell A state object, to step through the annotation process.
|
||||
|
|
|
@ -8,7 +8,7 @@ p
|
|||
| #[+a("/docs/usage/visualizers") visualizing spaCy].
|
||||
|
||||
|
||||
+h(2, "serve") serve
|
||||
+h(2, "serve") displacy.serve
|
||||
+tag method
|
||||
|
||||
p
|
||||
|
@ -60,7 +60,7 @@ p
|
|||
+cell Port to serve visualization.
|
||||
+cell #[code 5000]
|
||||
|
||||
+h(2, "render") render
|
||||
+h(2, "render") displacy.render
|
||||
+tag method
|
||||
|
||||
p Render a dependency parse tree or named entity visualization.
|
||||
|
@ -112,7 +112,7 @@ p Render a dependency parse tree or named entity visualization.
|
|||
+cell #[code {}]
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell unicode
|
||||
+cell Rendered HTML markup.
|
||||
+cell
|
||||
|
@ -218,7 +218,7 @@ p
|
|||
+cell #[code colors]
|
||||
+cell dict
|
||||
+cell
|
||||
| Color overrides. Entity types in lowercase should be mapped to
|
||||
| Color overrides. Entity types in uppercase should be mapped to
|
||||
| color names or values.
|
||||
+cell #[code {}]
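A short sketch of the colors option described above, assuming nlp is a loaded model with an entity recognizer; the hex value is arbitrary:

    from spacy import displacy
    doc = nlp(u'Google was founded in September 1998.')
    colors = {'ORG': '#aa9cfc'}   # entity types in uppercase
    html = displacy.render(doc, style='ent', options={'colors': colors})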
|
||||
|
||||
|
|
|
@ -4,9 +4,508 @@ include ../../_includes/_mixins
|
|||
|
||||
p A container for accessing linguistic annotations.
|
||||
|
||||
p
|
||||
| A #[code Doc] is a sequence of #[+api("token") #[code Token]] objects.
|
||||
| Access sentences and named entities, export annotations to numpy arrays,
|
||||
| losslessly serialize to compressed binary strings. The #[code Doc] object
|
||||
| holds an array of #[code TokenC] structs. The Python-level #[code Token]
|
||||
| and #[+api("span") #[code Span]] objects are views of this array, i.e.
|
||||
| they don't own the data themselves.
|
||||
|
||||
+aside-code("Example").
|
||||
# Construction 1
|
||||
doc = nlp(u'Some text')
|
||||
|
||||
# Construction 2
|
||||
from spacy.tokens import Doc
|
||||
doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
|
||||
spaces=[True, False, False])
|
||||
|
||||
+h(2, "init") Doc.__init__
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Construct a #[code Doc] object. The most common way to get a #[code Doc]
|
||||
| object is via the #[code nlp] object.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell A storage container for lexical types.
|
||||
|
||||
+row
|
||||
+cell #[code words]
|
||||
+cell -
|
||||
+cell A list of strings to add to the container.
|
||||
|
||||
+row
|
||||
+cell #[code spaces]
|
||||
+cell -
|
||||
+cell
|
||||
| A list of boolean values indicating whether each word has a
|
||||
| subsequent space. Must have the same length as #[code words], if
|
||||
| specified. Defaults to a sequence of #[code True].
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Doc]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "getitem") Doc.__getitem__
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Get a #[+api("token") #[code Token]] object at position #[code i], where
|
||||
| #[code i] is an integer. Negative indexing is supported, and follows the
|
||||
| usual Python semantics, i.e. #[code doc[-2]] is #[code doc[len(doc) - 2]].
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
assert doc[0].text == 'Give'
|
||||
assert doc[-1].text == '.'
|
||||
span = doc[1:3]
|
||||
assert span.text == 'it back'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code i]
|
||||
+cell int
|
||||
+cell The index of the token.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Token]
|
||||
+cell The token at #[code doc[i]].
|
||||
|
||||
p
|
||||
| Get a #[+api("span") #[code Span]] object, starting at position
|
||||
| #[code start] (token index) and ending at position #[code end] (token
|
||||
| index).
|
||||
|
||||
p
|
||||
| For instance, #[code doc[2:5]] produces a span consisting of tokens 2, 3
|
||||
| and 4. Stepped slices (e.g. #[code doc[start : end : step]]) are not
|
||||
| supported, as #[code Span] objects must be contiguous (cannot have gaps).
|
||||
| You can use negative indices and open-ended ranges, which have their
|
||||
| normal Python semantics.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code start_end]
|
||||
+cell tuple
|
||||
+cell The slice of the document to get.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Span]
|
||||
+cell The span at #[code doc[start : end]].
|
||||
|
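p
| As a rough sketch of the slice semantics described above (assuming an
| #[code nlp] object with the English model is available), negative indices
| and open-ended ranges behave like standard Python slices:

+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
assert doc[2:].text == 'back! He pleaded.'   # open-ended range
assert doc[:-2].text == 'Give it back! He'   # negative end index
# stepped slices, e.g. doc[0:4:2], are not supported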
||||
+h(2, "iter") Doc.__iter__
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Iterate over #[code Token] objects, from which the annotations can be
|
||||
| easily accessed.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back')
|
||||
assert [t.text for t in doc] == [u'Give', u'it', u'back']
|
||||
|
||||
p
|
||||
| This is the main way of accessing #[+api("token") #[code Token]] objects,
|
||||
| which are the main way annotations are accessed from Python. If
|
||||
| faster-than-Python speeds are required, you can instead access the
|
||||
| annotations as a numpy array, or access the underlying C data directly
|
||||
| from Cython.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yields
|
||||
+cell #[code Token]
|
||||
+cell A #[code Token] object.
|
||||
|
||||
+h(2, "len") Doc.__len__
|
||||
+tag method
|
||||
|
||||
p Get the number of tokens in the document.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
assert len(doc) == 7
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell The number of tokens in the document.
|
||||
|
||||
+h(2, "similarity") Doc.similarity
|
||||
+tag method
|
||||
+tag-model("vectors")
|
||||
|
||||
p
|
||||
| Make a semantic similarity estimate. The default estimate is cosine
|
||||
| similarity using an average of word vectors.
|
||||
|
||||
+aside-code("Example").
|
||||
apples = nlp(u'I like apples')
|
||||
oranges = nlp(u'I like oranges')
|
||||
apples_oranges = apples.similarity(oranges)
|
||||
oranges_apples = oranges.similarity(apples)
|
||||
assert apples_oranges == oranges_apples
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code other]
|
||||
+cell -
|
||||
+cell
|
||||
| The object to compare with. By default, accepts #[code Doc],
|
||||
| #[code Span], #[code Token] and #[code Lexeme] objects.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell float
|
||||
+cell A scalar similarity score. Higher is more similar.
|
||||
|
||||
+h(2, "count_by") Doc.count_by
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Count the frequencies of a given attribute. Produces a dict of
|
||||
| #[code {attr (int): count (ints)}] frequencies, keyed by the values
|
||||
| of the given attribute ID.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.attrs import ORTH
|
||||
doc = nlp(u'apple apple orange banana')
|
||||
assert doc.count_by(ORTH) == {7024L: 1, 119552L: 1, 2087L: 2}
|
||||
doc.to_array([ORTH])
|
||||
# array([[11880], [11880], [7561], [12800]])
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code attr_id]
|
||||
+cell int
|
||||
+cell The attribute ID.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell dict
|
||||
+cell A dictionary mapping attributes to integer counts.
|
||||
|
||||
+h(2, "to_array") Doc.to_array
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Export the document annotations to a numpy array of shape #[code N*M]
|
||||
| where #[code N] is the length of the document and #[code M] is the number
|
||||
| of attribute IDs to export. The values will be 32-bit integers.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
|
||||
doc = nlp(text)
|
||||
# All strings mapped to integers, for easy export to numpy
|
||||
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code attr_ids]
|
||||
+cell ints
|
||||
+cell A list of attribute ID ints.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
|
||||
+cell
|
||||
| The exported attributes as a 2D numpy array, with one row per
|
||||
| token and one column per attribute.
|
||||
|
||||
+h(2, "from_array") Doc.from_array
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Load attributes from a numpy array. Write to a #[code Doc] object, from
|
||||
| an #[code (M, N)] array of attributes.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
|
||||
from spacy.tokens import Doc
|
||||
doc = nlp(text)
|
||||
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
|
||||
doc2 = Doc(doc.vocab)
|
||||
doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)
|
||||
assert doc.text == doc2.text
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code attrs]
|
||||
+cell ints
|
||||
+cell A list of attribute ID ints.
|
||||
|
||||
+row
|
||||
+cell #[code array]
|
||||
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
|
||||
+cell The attribute values to load.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Doc]
|
||||
+cell Itself.
|
||||
|
||||
+h(2, "to_bytes") Doc.to_bytes
|
||||
+tag method
|
||||
|
||||
p Serialize, i.e. export the document contents to a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
doc_bytes = doc.to_bytes()
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bytes
|
||||
+cell
|
||||
| A losslessly serialized copy of the #[code Doc], including all
|
||||
| annotations.
|
||||
|
||||
+h(2, "from_bytes") Doc.from_bytes
|
||||
+tag method
|
||||
|
||||
p Deserialize, i.e. import the document contents from a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens import Doc
|
||||
text = u'Give it back! He pleaded.'
|
||||
doc = nlp(text)
|
||||
bytes = doc.to_bytes()
|
||||
doc2 = Doc(doc.vocab).from_bytes(bytes)
|
||||
assert doc.text == doc2.text
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code data]
|
||||
+cell bytes
|
||||
+cell The string to load from.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Doc]
|
||||
+cell The #[code Doc] object.
|
||||
|
||||
+h(2, "merge") Doc.merge
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Retokenize the document, such that the span at
|
||||
| #[code doc.text[start_idx : end_idx]] is merged into a single token. If
|
||||
| #[code start_idx] and #[code end_idx] do not mark start and end token
|
||||
| boundaries, the document remains unchanged.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Los Angeles start.')
|
||||
doc.merge(0, len('Los Angeles'), 'NNP', 'Los Angeles', 'GPE')
|
||||
assert [t.text for t in doc] == [u'Los Angeles', u'start', u'.']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code start_idx]
|
||||
+cell int
|
||||
+cell The character index of the start of the slice to merge.
|
||||
|
||||
+row
|
||||
+cell #[code end_idx]
|
||||
+cell int
|
||||
+cell The character index after the end of the slice to merge.
|
||||
|
||||
+row
|
||||
+cell #[code **attributes]
|
||||
+cell -
|
||||
+cell
|
||||
| Attributes to assign to the merged token. By default,
|
||||
| attributes are inherited from the syntactic root token of
|
||||
| the span.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Token]
|
||||
+cell
|
||||
| The newly merged token, or #[code None] if the start and end
|
||||
| indices did not fall at token boundaries.
|
||||
|
||||
+h(2, "print_tree") Doc.print_tree
|
||||
+tag method
|
||||
+tag-model("parse")
|
||||
|
||||
p
|
||||
| Returns the parse trees in JSON (dict) format. Especially useful for
|
||||
| web applications.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp('Alice ate the pizza.')
|
||||
trees = doc.print_tree()
|
||||
# {'modifiers': [
|
||||
# {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
|
||||
# {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'},
|
||||
# {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}
|
||||
# ], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code light]
|
||||
+cell bool
|
||||
+cell Don't include lemmas or entities.
|
||||
|
||||
+row
|
||||
+cell #[code flat]
|
||||
+cell bool
|
||||
+cell Don't include arcs or modifiers.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell dict
|
||||
+cell Parse tree as dict.
|
||||
|
||||
+h(2, "ents") Doc.ents
|
||||
+tag property
|
||||
+tag-model("NER")
|
||||
|
||||
p
|
||||
| Iterate over the entities in the document. Yields named-entity
|
||||
| #[code Span] objects, if the entity recognizer has been applied to the
|
||||
| document.
|
||||
|
||||
+aside-code("Example").
|
||||
tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
|
||||
ents = list(tokens.ents)
|
||||
assert ents[0].label == 346
|
||||
assert ents[0].label_ == 'PERSON'
|
||||
assert ents[0].text == 'Mr. Best'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yields
|
||||
+cell #[code Span]
|
||||
+cell Entities in the document.
|
||||
|
||||
+h(2, "noun_chunks") Doc.noun_chunks
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p
|
||||
| Iterate over the base noun phrases in the document. Yields base
|
||||
| noun-phrase #[code Span] objects, if the document has been syntactically
|
||||
| parsed. A base noun phrase, or "NP chunk", is a noun phrase that does not
|
||||
| permit other NPs to be nested within it – so no NP-level coordination, no
|
||||
| prepositional phrases, and no relative clauses.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'A phrase with another phrase occurs.')
|
||||
chunks = list(doc.noun_chunks)
|
||||
assert chunks[0].text == "A phrase"
|
||||
assert chunks[1].text == "another phrase"
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yields
|
||||
+cell #[code Span]
|
||||
+cell Noun chunks in the document.
|
||||
|
||||
+h(2, "sents") Doc.sents
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p
|
||||
| Iterate over the sentences in the document. Sentence spans have no label.
|
||||
| To improve accuracy on informal texts, spaCy calculates sentence boundaries
|
||||
| from the syntactic dependency parse. If the parser is disabled,
|
||||
| the #[code sents] iterator will be unavailable.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u"This is a sentence. Here's another...")
|
||||
sents = list(doc.sents)
|
||||
assert len(sents) == 2
|
||||
assert [s.root.text for s in sents] == ["is", "'s"]
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yields
|
||||
+cell #[code Span]
|
||||
+cell Sentences in the document.
|
||||
|
||||
+h(2, "has_vector") Doc.has_vector
|
||||
+tag property
|
||||
+tag-model("vectors")
|
||||
|
||||
p
|
||||
| A boolean value indicating whether a word vector is associated with the
|
||||
| object.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like apples')
|
||||
assert doc.has_vector
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether the document has a vector data attached.
|
||||
|
||||
+h(2, "vector") Doc.vector
|
||||
+tag property
|
||||
+tag-model("vectors")
|
||||
|
||||
p
|
||||
| A real-valued meaning representation. Defaults to an average of the
|
||||
| token vectors.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like apples')
|
||||
assert doc.vector.dtype == 'float32'
|
||||
assert doc.vector.shape == (300,)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A 1D numpy array representing the document's semantics.
|
||||
|
||||
+h(2, "vector_norm") Doc.vector_norm
|
||||
+tag property
|
||||
+tag-model("vectors")
|
||||
|
||||
p
|
||||
| The L2 norm of the document's vector representation.
|
||||
|
||||
+aside-code("Example").
|
||||
doc1 = nlp(u'I like apples')
|
||||
doc2 = nlp(u'I like oranges')
|
||||
doc1.vector_norm # 4.54232424414368
|
||||
doc2.vector_norm # 3.304373298575751
|
||||
assert doc1.vector_norm != doc2.vector_norm
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell float
|
||||
+cell The L2 norm of the vector representation.
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code text]
|
||||
+cell unicode
|
||||
+cell A unicode representation of the document text.
|
||||
|
||||
+row
|
||||
+cell #[code text_with_ws]
|
||||
+cell unicode
|
||||
+cell
|
||||
| An alias of #[code Doc.text], provided for duck-type compatibility
|
||||
| with #[code Span] and #[code Token].
|
||||
|
||||
+row
|
||||
+cell #[code mem]
|
||||
+cell #[code Pool]
|
||||
|
@ -17,6 +516,11 @@ p A container for accessing linguistic annotations.
|
|||
+cell #[code Vocab]
|
||||
+cell The store of lexical types.
|
||||
|
||||
+row
|
||||
+cell #[code tensor]
|
||||
+cell object
|
||||
+cell Container for dense vector representations.
|
||||
|
||||
+row
|
||||
+cell #[code user_data]
|
||||
+cell -
|
||||
|
@ -59,358 +563,3 @@ p A container for accessing linguistic annotations.
|
|||
+cell
|
||||
| A dictionary that allows customisation of properties of
|
||||
| #[code Span] children.
|
||||
|
||||
+h(2, "init") Doc.__init__
|
||||
+tag method
|
||||
|
||||
p Construct a #[code Doc] object.
|
||||
|
||||
+aside("Note")
|
||||
| The most common way to get a #[code Doc] object is via the #[code nlp]
|
||||
| object. This method is usually only used for deserialization or preset
|
||||
| tokenization.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell A storage container for lexical types.
|
||||
|
||||
+row
|
||||
+cell #[code words]
|
||||
+cell -
|
||||
+cell A list of strings to add to the container.
|
||||
|
||||
+row
|
||||
+cell #[code spaces]
|
||||
+cell -
|
||||
+cell
|
||||
| A list of boolean values indicating whether each word has a
|
||||
| subsequent space. Must have the same length as #[code words], if
|
||||
| specified. Defaults to a sequence of #[code True].
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Doc]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "getitem") Doc.__getitem__
|
||||
+tag method
|
||||
|
||||
p Get a #[code Token] object.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
assert doc[0].text == 'Give'
|
||||
assert doc[-1].text == '.'
|
||||
span = doc[1:3]
|
||||
assert span.text == 'it back'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code i]
|
||||
+cell int
|
||||
+cell The index of the token.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Token]
|
||||
+cell The token at #[code doc[i]].
|
||||
|
||||
p Get a #[code Span] object.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code start_end]
|
||||
+cell tuple
|
||||
+cell The slice of the document to get.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Span]
|
||||
+cell The span at #[code doc[start : end]].
|
||||
|
||||
+h(2, "iter") Doc.__iter__
|
||||
+tag method
|
||||
|
||||
p Iterate over #[code Token] objects.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Token]
|
||||
+cell A #[code Token] object.
|
||||
|
||||
+h(2, "len") Doc.__len__
|
||||
+tag method
|
||||
|
||||
p Get the number of tokens in the document.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell int
|
||||
+cell The number of tokens in the document.
|
||||
|
||||
+h(2, "similarity") Doc.similarity
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Make a semantic similarity estimate. The default estimate is cosine
|
||||
| similarity using an average of word vectors.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code other]
|
||||
+cell -
|
||||
+cell
|
||||
| The object to compare with. By default, accepts #[code Doc],
|
||||
| #[code Span], #[code Token] and #[code Lexeme] objects.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell float
|
||||
+cell A scalar similarity score. Higher is more similar.
|
||||
|
||||
+h(2, "to_array") Doc.to_array
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Export the document annotations to a numpy array of shape #[code N*M]
|
||||
| where #[code N] is the length of the document and #[code M] is the number
|
||||
| of attribute IDs to export. The values will be 32-bit integers.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy import attrs
|
||||
doc = nlp(text)
|
||||
# All strings mapped to integers, for easy export to numpy
|
||||
np_array = doc.to_array([attrs.LOWER, attrs.POS,
|
||||
attrs.ENT_TYPE, attrs.IS_ALPHA])
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code attr_ids]
|
||||
+cell ints
|
||||
+cell A list of attribute ID ints.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
|
||||
+cell
|
||||
| The exported attributes as a 2D numpy array, with one row per
|
||||
| token and one column per attribute.
|
||||
|
||||
+h(2, "count_by") Doc.count_by
|
||||
+tag method
|
||||
|
||||
p Count the frequencies of a given attribute.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code attr_id]
|
||||
+cell int
|
||||
+cell The attribute ID
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell dict
|
||||
+cell A dictionary mapping attributes to integer counts.
|
||||
|
||||
+h(2, "from_array") Doc.from_array
|
||||
+tag method
|
||||
|
||||
p Load attributes from a numpy array.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code attr_ids]
|
||||
+cell ints
|
||||
+cell A list of attribute ID ints.
|
||||
|
||||
+row
|
||||
+cell #[code values]
|
||||
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
|
||||
+cell The attribute values to load.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
|
||||
+h(2, "to_bytes") Doc.to_bytes
|
||||
+tag method
|
||||
|
||||
p Export the document contents to a binary string.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell bytes
|
||||
+cell
|
||||
| A losslessly serialized copy of the #[code Doc] including all
|
||||
| annotations.
|
||||
|
||||
+h(2, "from_bytes") Doc.from_bytes
|
||||
+tag method
|
||||
|
||||
p Import the document contents from a binary string.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code byte_string]
|
||||
+cell bytes
|
||||
+cell The string to load from.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Doc]
|
||||
+cell The #[code self] variable.
|
||||
|
||||
+h(2, "merge") Doc.merge
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Retokenize the document, such that the span at
|
||||
| #[code doc.text[start_idx : end_idx]] is merged into a single token. If
|
||||
| #[code start_idx] and #[code end_idx] do not mark start and end token
|
||||
| boundaries, the document remains unchanged.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code start_idx]
|
||||
+cell int
|
||||
+cell The character index of the start of the slice to merge.
|
||||
|
||||
+row
|
||||
+cell #[code end_idx]
|
||||
+cell int
|
||||
+cell The character index after the end of the slice to merge.
|
||||
|
||||
+row
|
||||
+cell #[code **attributes]
|
||||
+cell -
|
||||
+cell
|
||||
| Attributes to assign to the merged token. By default,
|
||||
| attributes are inherited from the syntactic root token of
|
||||
| the span.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Token]
|
||||
+cell
|
||||
| The newly merged token, or None if the start and end
|
||||
| indices did not fall at token boundaries.
|
||||
|
||||
+h(2, "read_bytes") Doc.read_bytes
|
||||
+tag staticmethod
|
||||
|
||||
p A static method, used to read serialized #[code Doc] objects from a file.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens.doc import Doc
|
||||
loc = 'test_serialize.bin'
|
||||
with open(loc, 'wb') as file_:
|
||||
file_.write(nlp(u'This is a document.').to_bytes())
|
||||
file_.write(nlp(u'This is another.').to_bytes())
|
||||
docs = []
|
||||
with open(loc, 'rb') as file_:
|
||||
for byte_string in Doc.read_bytes(file_):
|
||||
docs.append(Doc(nlp.vocab).from_bytes(byte_string))
|
||||
assert len(docs) == 2
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell file
|
||||
+cell buffer
|
||||
+cell A binary buffer to read the serialized annotations from.
|
||||
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell bytes
|
||||
+cell Binary strings from which documents can be loaded.
|
||||
|
||||
+h(2, "text") Doc.text
|
||||
+tag property
|
||||
|
||||
p A unicode representation of the document text.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell unicode
|
||||
+cell The original verbatim text of the document.
|
||||
|
||||
+h(2, "text_with_ws") Doc.text_with_ws
|
||||
+tag property
|
||||
|
||||
p
|
||||
| An alias of #[code Doc.text], provided for duck-type compatibility with
|
||||
| #[code Span] and #[code Token].
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell unicode
|
||||
+cell The original verbatim text of the document.
|
||||
|
||||
+h(2, "sents") Doc.sents
|
||||
+tag property
|
||||
|
||||
p Iterate over the sentences in the document.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Span]
|
||||
+cell Sentences in the document.
|
||||
|
||||
+h(2, "ents") Doc.ents
|
||||
+tag property
|
||||
|
||||
p Iterate over the entities in the document.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Span]
|
||||
+cell Entities in the document.
|
||||
|
||||
+h(2, "noun_chunks") Doc.noun_chunks
|
||||
+tag property
|
||||
|
||||
p
|
||||
| Iterate over the base noun phrases in the document. A base noun phrase,
|
||||
| or "NP chunk", is a noun phrase that does not permit other NPs to be
|
||||
| nested within it.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Span]
|
||||
+cell Noun chunks in the document
|
||||
|
||||
+h(2, "vector") Doc.vector
|
||||
+tag property
|
||||
|
||||
p
|
||||
| A real-valued meaning representation. Defaults to an average of the
|
||||
| token vectors.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A 1D numpy array representing the document's semantics.
|
||||
|
||||
+h(2, "has_vector") Doc.has_vector
|
||||
+tag property
|
||||
|
||||
p
|
||||
| A boolean value indicating whether a word vector is associated with the
|
||||
| object.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell bool
|
||||
+cell Whether the document has a vector data attached.
|
||||
|
|
|
@ -4,32 +4,6 @@ include ../../_includes/_mixins
|
|||
|
||||
p Annotate named entities on #[code Doc] objects.
|
||||
|
||||
+h(2, "load") EntityRecognizer.load
|
||||
+tag classmethod
|
||||
|
||||
p Load the statistical model from the supplied path.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell #[code Path]
|
||||
+cell The path to load from.
|
||||
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell The vocabulary. Must be shared by the documents to be processed.
|
||||
|
||||
+row
|
||||
+cell #[code require]
|
||||
+cell bool
|
||||
+cell Whether to raise an error if the files are not found.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code EntityRecognizer]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "init") EntityRecognizer.__init__
|
||||
+tag method
|
||||
|
||||
|
@ -47,7 +21,7 @@ p Create an #[code EntityRecognizer].
|
|||
+cell The statistical model.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code EntityRecognizer]
|
||||
+cell The newly constructed object.
|
||||
|
||||
|
@ -63,7 +37,7 @@ p Apply the entity recognizer, setting the NER tags onto the #[code Doc] object.
|
|||
+cell The document to be processed.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
|
||||
|
@ -91,7 +65,7 @@ p Process a stream of documents.
|
|||
| parallel.
|
||||
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell yields
|
||||
+cell #[code Doc]
|
||||
+cell Documents, in order.
|
||||
|
||||
|
@ -112,7 +86,7 @@ p Update the statistical model.
|
|||
+cell The gold-standard annotations, to calculate the loss.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell The loss on this example.
|
||||
|
||||
|
@ -128,6 +102,6 @@ p Set up a stepwise state, to introspect and control the transition sequence.
|
|||
+cell The document to step through.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code StepwiseState]
|
||||
+cell A state object, to step through the annotation process.
|
||||
|
|
|
@ -4,6 +4,72 @@ include ../../_includes/_mixins
|
|||
|
||||
p Collection for training annotations.
|
||||
|
||||
+h(2, "init") GoldParse.__init__
|
||||
+tag method
|
||||
|
||||
p Create a GoldParse.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code doc]
|
||||
+cell #[code Doc]
|
||||
+cell The document the annotations refer to.
|
||||
|
||||
+row
|
||||
+cell #[code words]
|
||||
+cell iterable
|
||||
+cell A sequence of unicode word strings.
|
||||
|
||||
+row
|
||||
+cell #[code tags]
|
||||
+cell iterable
|
||||
+cell A sequence of strings, representing tag annotations.
|
||||
|
||||
+row
|
||||
+cell #[code heads]
|
||||
+cell iterable
|
||||
+cell A sequence of integers, representing syntactic head offsets.
|
||||
|
||||
+row
|
||||
+cell #[code deps]
|
||||
+cell iterable
|
||||
+cell A sequence of strings, representing the syntactic relation types.
|
||||
|
||||
+row
|
||||
+cell #[code entities]
|
||||
+cell iterable
|
||||
+cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code GoldParse]
|
||||
+cell The newly constructed object.
|
||||
|
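p
| A minimal sketch of constructing a #[code GoldParse] from character
| offsets (the sentence, offsets and labels below are illustrative, and an
| #[code nlp] object is assumed to be available):

+aside-code("Example").
from spacy.gold import GoldParse
doc = nlp(u'Facebook released React in 2014.')
# (start_char, end_char, label) offsets into the original text
entities = [(0, 8, 'ORG'), (18, 23, 'PRODUCT'), (27, 31, 'DATE')]
gold = GoldParse(doc, entities=entities)
assert len(gold) == len(doc)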
||||
+h(2, "len") GoldParse.__len__
|
||||
+tag method
|
||||
|
||||
p Get the number of gold-standard tokens.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell The number of gold-standard tokens.
|
||||
|
||||
+h(2, "is_projective") GoldParse.is_projective
|
||||
+tag property
|
||||
|
||||
p
|
||||
| Whether the provided syntactic annotations form a projective dependency
|
||||
| tree.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether annotations form projective tree.
|
||||
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
|
@ -37,67 +103,57 @@ p Collection for training annotations.
|
|||
+cell list
|
||||
+cell The alignment from gold tokenization to candidate tokenization.
|
||||
|
||||
+h(2, "init") GoldParse.__init__
|
||||
+tag method
|
||||
|
||||
p Create a GoldParse.
|
||||
+h(2, "util") Utilities
|
||||
|
||||
+h(3, "biluo_tags_from_offsets") gold.biluo_tags_from_offsets
|
||||
+tag function
|
||||
|
||||
p
|
||||
| Encode labelled spans into per-token tags, using the
|
||||
| #[+a("/docs/api/annotation#biluo") BILUO scheme] (Begin/In/Last/Unit/Out).
|
||||
|
||||
p
|
||||
| Returns a list of unicode strings, describing the tags. Each tag string
|
||||
| will be of the form either #[code ""], #[code "O"] or
|
||||
| #[code "{action}-{label}"], where action is one of #[code "B"],
|
||||
| #[code "I"], #[code "L"], #[code "U"]. The string #[code "-"]
|
||||
| is used where the entity offsets don't align with the tokenization in the
|
||||
| #[code Doc] object. The training algorithm will view these as missing
|
||||
| values. #[code O] denotes a non-entity token. #[code B] denotes the
|
||||
| beginning of a multi-token entity, #[code I] the inside of an entity
|
||||
| of three or more tokens, and #[code L] the end of an entity of two or
|
||||
| more tokens. #[code U] denotes a single-token entity.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.gold import biluo_tags_from_offsets
|
||||
text = 'I like London.'
|
||||
entities = [(len('I like '), len('I like London'), 'LOC')]
|
||||
doc = tokenizer(text)
|
||||
tags = biluo_tags_from_offsets(doc, entities)
|
||||
assert tags == ['O', 'O', 'U-LOC', 'O']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code doc]
|
||||
+cell #[code Doc]
|
||||
+cell The document the annotations refer to.
|
||||
|
||||
+row
|
||||
+cell #[code words]
|
||||
+cell -
|
||||
+cell A sequence of unicode word strings.
|
||||
|
||||
+row
|
||||
+cell #[code tags]
|
||||
+cell -
|
||||
+cell A sequence of strings, representing tag annotations.
|
||||
|
||||
+row
|
||||
+cell #[code heads]
|
||||
+cell -
|
||||
+cell A sequence of integers, representing syntactic head offsets.
|
||||
|
||||
+row
|
||||
+cell #[code deps]
|
||||
+cell -
|
||||
+cell A sequence of strings, representing the syntactic relation types.
|
||||
+cell
|
||||
| The document that the entity offsets refer to. The output tags
|
||||
| will refer to the token boundaries within the document.
|
||||
|
||||
+row
|
||||
+cell #[code entities]
|
||||
+cell -
|
||||
+cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions.
|
||||
+cell iterable
|
||||
+cell
|
||||
| A sequence of #[code (start, end, label)] triples. #[code start]
|
||||
| and #[code end] should be character-offset integers denoting the
|
||||
| slice into the original string.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code GoldParse]
|
||||
+cell The newly constructed object.
|
||||
+cell returns
|
||||
+cell list
|
||||
+cell
|
||||
| Unicode strings, describing the
|
||||
| #[+a("/docs/api/annotation#biluo") BILUO] tags.
|
||||
|
||||
+h(2, "len") GoldParse.__len__
|
||||
+tag method
|
||||
|
||||
p Get the number of gold-standard tokens.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell int
|
||||
+cell The number of gold-standard tokens.
|
||||
|
||||
+h(2, "is_projective") GoldParse.is_projective
|
||||
+tag property
|
||||
|
||||
p
|
||||
| Whether the provided syntactic annotations form a projective dependency
|
||||
| tree.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell bool
|
||||
+cell Whether annotations form projective tree.
|
||||
|
|
|
@ -2,79 +2,69 @@
|
|||
|
||||
include ../../_includes/_mixins
|
||||
|
||||
p A text processing pipeline.
|
||||
p
|
||||
| A text-processing pipeline. Usually you'll load this once per process,
|
||||
| and pass the instance around your application.
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
+h(2, "init") Language.__init__
|
||||
+tag method
|
||||
|
||||
p Initialise a #[code Language] object.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.language import Language
|
||||
nlp = Language(pipeline=['token_vectors', 'tags',
|
||||
'dependencies'])
|
||||
|
||||
from spacy.lang.en import English
|
||||
nlp = English()
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell A container for the lexical types.
|
||||
|
||||
+row
|
||||
+cell #[code tokenizer]
|
||||
+cell #[code Tokenizer]
|
||||
+cell Find word boundaries and create #[code Doc] object.
|
||||
|
||||
+row
|
||||
+cell #[code tagger]
|
||||
+cell #[code Tagger]
|
||||
+cell Annotate #[code Doc] objects with POS tags.
|
||||
|
||||
+row
|
||||
+cell #[code parser]
|
||||
+cell #[code DependencyParser]
|
||||
+cell Annotate #[code Doc] objects with syntactic dependencies.
|
||||
|
||||
+row
|
||||
+cell #[code entity]
|
||||
+cell #[code EntityRecognizer]
|
||||
+cell Annotate #[code Doc] objects with named entities.
|
||||
|
||||
+row
|
||||
+cell #[code matcher]
|
||||
+cell #[code Matcher]
|
||||
+cell Rule-based sequence matcher.
|
||||
+cell
|
||||
| A #[code Vocab] object. If #[code True], a vocab is created via
|
||||
| #[code Language.Defaults.create_vocab].
|
||||
|
||||
+row
|
||||
+cell #[code make_doc]
|
||||
+cell #[code lambda text: Doc]
|
||||
+cell Create a #[code Doc] object from unicode text.
|
||||
+cell callable
|
||||
+cell
|
||||
| A function that takes text and returns a #[code Doc] object.
|
||||
| Usually a #[code Tokenizer].
|
||||
|
||||
+row
|
||||
+cell #[code pipeline]
|
||||
+cell -
|
||||
+cell Sequence of annotation functions.
|
||||
+cell list
|
||||
+cell
|
||||
| A list of annotation processes or IDs of annotation processes,
|
||||
| e.g. a #[code Tagger] object, or #[code 'tagger']. IDs are looked
|
||||
| up in #[code Language.Defaults.factories].
|
||||
|
||||
|
||||
+h(2, "init") Language.__init__
|
||||
+tag method
|
||||
|
||||
p Create or load the pipeline.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code **overrides]
|
||||
+cell -
|
||||
+cell Keyword arguments indicating which defaults to override.
|
||||
+cell #[code meta]
|
||||
+cell dict
|
||||
+cell
|
||||
| Custom meta data for the #[code Language] class. Is written to by
|
||||
| models to add model meta data.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Language]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "call") Language.__call__
|
||||
+tag method
|
||||
|
||||
p Apply the pipeline to a single text.
|
||||
p
|
||||
| Apply the pipeline to some text. The text can span multiple sentences,
|
||||
| and can contain arbitrary whitespace. Alignment into the original string
|
||||
| is preserved.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.en import English
|
||||
nlp = English()
|
||||
doc = nlp('An example sentence. Another example sentence.')
|
||||
doc[0].orth_, doc[0].head.tag_
|
||||
# ('An', 'NN')
|
||||
doc = nlp(u'An example sentence. Another sentence.')
|
||||
assert (doc[0].text, doc[0].head.tag_) == ('An', 'NN')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
|
@ -83,24 +73,104 @@ p Apply the pipeline to a single text.
|
|||
+cell The text to be processed.
|
||||
|
||||
+row
|
||||
+cell #[code tag]
|
||||
+cell bool
|
||||
+cell Whether to apply the part-of-speech tagger.
|
||||
|
||||
+row
|
||||
+cell #[code parse]
|
||||
+cell bool
|
||||
+cell Whether to apply the syntactic dependency parser.
|
||||
|
||||
+row
|
||||
+cell #[code entity]
|
||||
+cell bool
|
||||
+cell Whether to apply the named entity recognizer.
|
||||
+cell #[code **disabled]
|
||||
+cell -
|
||||
+cell Elements of the pipeline that should not be run.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Doc]
|
||||
+cell A container for accessing the linguistic annotations.
|
||||
+cell A container for accessing the annotations.
|
||||
|
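p
| A hedged sketch of skipping pipeline components via #[code **disabled]
| (the keyword name below is an assumption – components are switched off by
| name, and the exact names depend on the pipeline):

+aside-code("Example").
# hypothetical: skip the dependency parse for this document only
doc = nlp(u'An example sentence. Another sentence.', parse=False)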
||||
+h(2, "update") Language.update
|
||||
+tag method
|
||||
|
||||
p Update the models in the pipeline.
|
||||
|
||||
+aside-code("Example").
|
||||
with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
|
||||
for epoch in trainer.epochs(gold):
|
||||
for docs, golds in epoch:
|
||||
state = nlp.update(docs, golds, sgd=optimizer)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code docs]
|
||||
+cell iterable
|
||||
+cell A batch of #[code Doc] objects.
|
||||
|
||||
+row
|
||||
+cell #[code golds]
|
||||
+cell iterable
|
||||
+cell A batch of #[code GoldParse] objects.
|
||||
|
||||
+row
|
||||
+cell #[code drop]
|
||||
+cell float
|
||||
+cell The dropout rate.
|
||||
|
||||
+row
|
||||
+cell #[code sgd]
|
||||
+cell callable
|
||||
+cell An optimizer.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell dict
|
||||
+cell Results from the update.
|
||||
|
||||
+h(2, "begin_training") Language.begin_training
|
||||
+tag contextmanager
|
||||
|
||||
p
|
||||
| Allocate models, pre-process training data and acquire a trainer and
|
||||
| optimizer. Used as a contextmanager.
|
||||
|
||||
+aside-code("Example").
|
||||
with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
|
||||
for epoch in trainer.epochs(gold):
|
||||
for docs, golds in epoch:
|
||||
state = nlp.update(docs, golds, sgd=optimizer)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code gold_tuples]
|
||||
+cell iterable
|
||||
+cell Gold-standard training data.
|
||||
|
||||
+row
|
||||
+cell #[code **cfg]
|
||||
+cell -
|
||||
+cell Config parameters.
|
||||
|
||||
+footrow
|
||||
+cell yields
|
||||
+cell tuple
|
||||
+cell A trainer and an optimizer.
|
||||
|
||||
+h(2, "use_params") Language.use_params
|
||||
+tag contextmanager
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Replace weights of models in the pipeline with those provided in the
|
||||
| params dictionary. Can be used as a contextmanager, in which case, models
|
||||
| go back to their original weights after the block.
|
||||
|
||||
+aside-code("Example").
|
||||
with nlp.use_params(optimizer.averages):
|
||||
nlp.to_disk('/tmp/checkpoint')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code params]
|
||||
+cell dict
|
||||
+cell A dictionary of parameters keyed by model ID.
|
||||
|
||||
+row
|
||||
+cell #[code **cfg]
|
||||
+cell -
|
||||
+cell Config parameters.
|
||||
|
||||
+h(2, "pipe") Language.pipe
|
||||
+tag method
|
||||
|
@ -133,22 +203,142 @@ p
|
|||
+cell The number of texts to buffer.
|
||||
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell yields
|
||||
+cell #[code Doc]
|
||||
+cell Containers for accessing the linguistic annotations.
|
||||
+cell Documents in the order of the original text.
|
||||
|
||||
+h(2, "save_to_directory") Language.save_to_directory
|
||||
+h(2, "to_disk") Language.to_disk
|
||||
+tag method
|
||||
|
||||
p Save the #[code Vocab], #[code StringStore] and pipeline to a directory.
|
||||
p Save the current state to a directory.
|
||||
|
||||
+aside-code("Example").
|
||||
nlp.to_disk('/path/to/models')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell string or pathlib path
|
||||
+cell Path to save the model.
|
||||
+cell unicode or #[code Path]
|
||||
+cell
|
||||
| A path to a directory, which will be created if it doesn't exist.
|
||||
| Paths may be either strings or #[code Path]-like objects.
|
||||
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being saved.
|
||||
|
||||
+h(2, "from_disk") Language.from_disk
|
||||
+tag method
|
||||
|
||||
p Loads state from a directory. Modifies the object in place and returns it.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.language import Language
|
||||
nlp = Language().from_disk('/path/to/models')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell unicode or #[code Path]
|
||||
+cell
|
||||
| A path to a directory. Paths may be either strings or
|
||||
| #[code Path]-like objects.
|
||||
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being loaded.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code None]
|
||||
+cell returns
|
||||
+cell #[code Language]
|
||||
+cell The modified #[code Language] object.
|
||||
|
||||
+h(2, "to_bytes") Language.to_bytes
|
||||
+tag method
|
||||
|
||||
p Serialize the current state to a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
nlp_bytes = nlp.to_bytes()
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being serialized.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bytes
|
||||
+cell The serialized form of the #[code Language] object.
|
||||
|
||||
+h(2, "from_bytes") Language.from_bytes
|
||||
+tag method
|
||||
|
||||
p Load state from a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.lang.en import English
|
||||
nlp_bytes = nlp.to_bytes()
|
||||
nlp2 = English()
|
||||
nlp2.from_bytes(nlp_bytes)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code bytes_data]
|
||||
+cell bytes
|
||||
+cell The data to load from.
|
||||
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being loaded.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Language]
|
||||
+cell The #[code Language] object.
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell A container for the lexical types.
|
||||
|
||||
+row
|
||||
+cell #[code make_doc]
|
||||
+cell #[code lambda text: Doc]
|
||||
+cell Create a #[code Doc] object from unicode text.
|
||||
|
||||
+row
|
||||
+cell #[code pipeline]
|
||||
+cell list
|
||||
+cell Sequence of annotation functions.
|
||||
|
||||
+row
|
||||
+cell #[code meta]
|
||||
+cell dict
|
||||
+cell
|
||||
| Custom meta data for the Language class. If a model is loaded,
|
||||
| contains meta data of the model.
|
||||
|
||||
+h(2, "class-attributes") Class attributes
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code Defaults]
|
||||
+cell class
|
||||
+cell
|
||||
| Settings, data and factory methods for creating the
|
||||
| #[code nlp] object and processing pipeline.
|
||||
|
||||
+row
|
||||
+cell #[code lang]
|
||||
+cell unicode
|
||||
+cell
|
||||
| Two-letter language ID, i.e.
|
||||
| #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code].
|
||||
|
|
|
@ -2,7 +2,154 @@
|
|||
|
||||
include ../../_includes/_mixins
|
||||
|
||||
p An entry in the vocabulary.
|
||||
p
|
||||
| An entry in the vocabulary. A #[code Lexeme] has no string context – it's
|
||||
| a word-type, as opposed to a word token. It therefore has no
|
||||
| part-of-speech tag, dependency parse, or lemma (if lemmatization depends
|
||||
| on the part-of-speech tag).
|
||||
|
||||
+h(2, "init") Lexeme.__init__
|
||||
+tag method
|
||||
|
||||
p Create a #[code Lexeme] object.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell The parent vocabulary.
|
||||
|
||||
+row
|
||||
+cell #[code orth]
|
||||
+cell int
|
||||
+cell The orth id of the lexeme.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Lexeme]
|
||||
+cell The newly constructed object.
|
||||
|
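p
| In practice a #[code Lexeme] is usually retrieved from the vocabulary by
| string or ID rather than constructed directly – a minimal sketch,
| assuming an #[code nlp] object is available:

+aside-code("Example").
apple = nlp.vocab[u'apple']
assert apple.text == u'apple'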
||||
+h(2, "set_flag") Lexeme.set_flag
|
||||
+tag method
|
||||
|
||||
p Change the value of a boolean flag.
|
||||
|
||||
+aside-code("Example").
|
||||
COOL_FLAG = nlp.vocab.add_flag(lambda text: False)
|
||||
nlp.vocab[u'spaCy'].set_flag(COOL_FLAG, True)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code flag_id]
|
||||
+cell int
|
||||
+cell The attribute ID of the flag to set.
|
||||
|
||||
+row
|
||||
+cell #[code value]
|
||||
+cell bool
|
||||
+cell The new value of the flag.
|
||||
|
||||
+h(2, "check_flag") Lexeme.check_flag
|
||||
+tag method
|
||||
|
||||
p Check the value of a boolean flag.
|
||||
|
||||
+aside-code("Example").
|
||||
is_my_library = lambda text: text in ['spaCy', 'Thinc']
|
||||
MY_LIBRARY = nlp.vocab.add_flag(is_my_library)
|
||||
assert nlp.vocab[u'spaCy'].check_flag(MY_LIBRARY) == True
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code flag_id]
|
||||
+cell int
|
||||
+cell The attribute ID of the flag to query.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell The value of the flag.
|
||||
|
||||
+h(2, "similarity") Lexeme.similarity
|
||||
+tag method
|
||||
+tag-model("vectors")
|
||||
|
||||
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
|
||||
|
||||
+aside-code("Example").
|
||||
apple = nlp.vocab[u'apple']
|
||||
orange = nlp.vocab[u'orange']
|
||||
apple_orange = apple.similarity(orange)
|
||||
orange_apple = orange.similarity(apple)
|
||||
assert apple_orange == orange_apple
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell other
|
||||
+cell -
|
||||
+cell
|
||||
| The object to compare with. By default, accepts #[code Doc],
|
||||
| #[code Span], #[code Token] and #[code Lexeme] objects.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell float
|
||||
+cell A scalar similarity score. Higher is more similar.
|
||||
|
||||
|
||||
+h(2, "has_vector") Lexeme.has_vector
|
||||
+tag property
|
||||
+tag-model("vectors")
|
||||
|
||||
p
|
||||
| A boolean value indicating whether a word vector is associated with the
|
||||
| lexeme.
|
||||
|
||||
+aside-code("Example").
|
||||
apple = nlp.vocab[u'apple']
|
||||
assert apple.has_vector
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether the lexeme has a vector data attached.
|
||||
|
||||
+h(2, "vector") Lexeme.vector
|
||||
+tag property
|
||||
+tag-model("vectors")
|
||||
|
||||
p A real-valued meaning representation.
|
||||
|
||||
+aside-code("Example").
|
||||
apple = nlp.vocab[u'apple']
|
||||
assert apple.vector.dtype == 'float32'
|
||||
assert apple.vector.shape == (300,)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A 1D numpy array representing the lexeme's semantics.
|
||||
|
||||
+h(2, "vector_norm") Lexeme.vector_norm
|
||||
+tag property
|
||||
+tag-model("vectors")
|
||||
|
||||
p The L2 norm of the lexeme's vector representation.
|
||||
|
||||
+aside-code("Example").
|
||||
apple = nlp.vocab[u'apple']
|
||||
pasta = nlp.vocab[u'pasta']
|
||||
apple.vector_norm # 7.1346845626831055
|
||||
pasta.vector_norm # 7.759851932525635
|
||||
assert apple.vector_norm != pasta.vector_norm
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell float
|
||||
+cell The L2 norm of the vector representation.
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
|
||||
|
@ -12,6 +159,16 @@ p An entry in the vocabulary.
|
|||
+cell #[code Vocab]
|
||||
+cell
|
||||
|
||||
+row
|
||||
+cell #[code text]
|
||||
+cell unicode
|
||||
+cell Verbatim text content.
|
||||
|
||||
+row
|
||||
+cell #[code lex_id]
|
||||
+cell int
|
||||
+cell ID of the lexeme's lexical type.
|
||||
|
||||
+row
|
||||
+cell #[code lower]
|
||||
+cell int
|
||||
|
@ -124,116 +281,9 @@ p An entry in the vocabulary.
|
|||
+row
|
||||
+cell #[code prob]
|
||||
+cell float
|
||||
+cell Smoothed log probability estimate of token's type.
|
||||
+cell Smoothed log probability estimate of lexeme's type.
|
||||
|
||||
+row
|
||||
+cell #[code sentiment]
|
||||
+cell float
|
||||
+cell A scalar value indicating the positivity or negativity of the token.
|
||||
+row
|
||||
+cell #[code lex_id]
|
||||
+cell int
|
||||
+cell ID of the token's lexical type.
|
||||
|
||||
+row
|
||||
+cell #[code text]
|
||||
+cell unicode
|
||||
+cell Verbatim text content.
|
||||
|
||||
+h(2, "init") Lexeme.__init__
|
||||
+tag method
|
||||
|
||||
p Create a #[code Lexeme] object.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell The parent vocabulary.
|
||||
|
||||
+row
|
||||
+cell #[code orth]
|
||||
+cell int
|
||||
+cell The orth id of the lexeme.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Lexeme]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "set_flag") Lexeme.set_flag
|
||||
+tag method
|
||||
|
||||
p Change the value of a boolean flag.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code flag_id]
|
||||
+cell int
|
||||
+cell The attribute ID of the flag to set.
|
||||
|
||||
+row
|
||||
+cell #[code value]
|
||||
+cell bool
|
||||
+cell The new value of the flag.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
|
||||
+h(2, "check_flag") Lexeme.check_flag
|
||||
+tag method
|
||||
|
||||
p Check the value of a boolean flag.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code flag_id]
|
||||
+cell int
|
||||
+cell The attribute ID of the flag to query.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell bool
|
||||
+cell The value of the flag.
|
||||
|
||||
+h(2, "similarity") Lexeme.similarity
|
||||
+tag method
|
||||
|
||||
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code other]
|
||||
+cell -
|
||||
+cell
|
||||
| The object to compare with. By default, accepts #[code Doc],
|
||||
| #[code Span], #[code Token] and #[code Lexeme] objects.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell float
|
||||
+cell A scalar similarity score. Higher is more similar.
|
||||
|
||||
+h(2, "vector") Lexeme.vector
|
||||
+tag property
|
||||
|
||||
p A real-valued meaning representation.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A real-valued meaning representation.
|
||||
|
||||
+h(2, "has_vector") Lexeme.has_vector
|
||||
+tag property
|
||||
|
||||
p A boolean value indicating whether a word vector is associated with the object.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell bool
|
||||
+cell Whether a word vector is associated with the object.
|
||||
+cell A scalar value indicating the positivity or negativity of the lexeme.
|
||||
|
|
|
@ -4,31 +4,26 @@ include ../../_includes/_mixins
|
|||
|
||||
p Match sequences of tokens, based on pattern rules.
|
||||
|
||||
+h(2, "load") Matcher.load
|
||||
+tag classmethod
|
||||
|
||||
p Load the matcher and patterns from a file path.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell #[code Path]
|
||||
+cell Path to a JSON-formatted patterns file.
|
||||
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell The vocabulary that the documents to match over will refer to.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Matcher]
|
||||
+cell The newly constructed object.
|
||||
+infobox("⚠️ Deprecation note")
|
||||
| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
|
||||
| are deprecated and have been replaced with a simpler
|
||||
| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
|
||||
| patterns and a callback for a given match ID. #[code Matcher.get_entity]
|
||||
| is now called #[+api("matcher#get") #[code matcher.get]].
|
||||
| #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
|
||||
| and #[code Matcher.has_entity] (now redundant) have been removed.
|
||||
|
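p
| As a rough migration sketch (the match ID and pattern below are
| illustrative), a pre-2.0 #[code add_entity]/#[code add_pattern] pair
| becomes a single call to #[+api("matcher#add") #[code Matcher.add]]:

+aside-code("Example").
from spacy.matcher import Matcher
from spacy.attrs import ORTH

matcher = Matcher(nlp.vocab)
# previously registered via matcher.add_entity(...) and matcher.add_pattern(...)
matcher.add('GoogleNow', None, [{ORTH: 'Google'}, {ORTH: 'Now'}])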
||||
+h(2, "init") Matcher.__init__
|
||||
+tag method
|
||||
|
||||
p Create the Matcher.
|
||||
p Create the rule-based #[code Matcher].
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.attrs import LOWER
|
||||
|
||||
patterns = {"HelloWorld": [{LOWER: "hello"}, {LOWER: "world"}]}
|
||||
matcher = Matcher(nlp.vocab, patterns=patterns)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
|
@ -41,17 +36,38 @@ p Create the Matcher.
|
|||
+row
|
||||
+cell #[code patterns]
|
||||
+cell dict
|
||||
+cell Patterns to add to the matcher.
|
||||
+cell Patterns to add to the matcher, keyed by ID.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Matcher]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "call") Matcher.__call__
|
||||
+tag method
|
||||
|
||||
p Find all token sequences matching the supplied patterns on the Doc.
|
||||
p Find all token sequences matching the supplied patterns on the #[code Doc].
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.attrs import LOWER
|
||||
|
||||
matcher = Matcher(nlp.vocab)
|
||||
pattern = [{LOWER: "hello"}, {LOWER: "world"}]
|
||||
matcher.add("HelloWorld", on_match=None, pattern)
|
||||
doc = nlp(u'hello world!')
|
||||
matches = matcher(doc)
|
||||
|
||||
+infobox("Important note")
|
||||
| By default, the matcher #[strong does not perform any action] on matches,
|
||||
| like tagging matched phrases with entity types. Instead, actions need to
|
||||
| be specified when #[strong adding patterns or entities], by
|
||||
| passing in a callback function as the #[code on_match] argument on
|
||||
| #[+api("matcher#add") #[code add]]. This allows you to define custom
|
||||
| actions per pattern within the same matcher. For example, you might only
|
||||
| want to merge some entity types, and set custom flags for other matched
|
||||
| patterns. For more details and examples, see the usage workflow on
|
||||
| #[+a("/docs/usage/rule-based-matching") rule-based matching].
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
|
@ -60,23 +76,28 @@ p Find all token sequences matching the supplied patterns on the Doc.
|
|||
+cell The document to match over.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell list
|
||||
+cell
|
||||
| A list of#[code (entity_key, label_id, start, end)] tuples,
|
||||
| describing the matches. A match tuple describes a
|
||||
| #[code span doc[start:end]]. The #[code label_id] and
|
||||
| #[code entity_key] are both integers.
|
||||
| A list of #[code (match_id, start, end)] tuples, describing the
|
||||
| matches. A match tuple describes a span #[code doc[start:end]].
|
||||
| The #[code match_id] is the ID of the added match pattern.
|
||||
|
||||
+h(2, "pipe") Matcher.pipe
|
||||
+tag method
|
||||
|
||||
p Match a stream of documents, yielding them in turn.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.matcher import Matcher
|
||||
matcher = Matcher(nlp.vocab)
|
||||
for doc in matcher.pipe(texts, batch_size=50, n_threads=4):
|
||||
pass
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code docs]
|
||||
+cell -
|
||||
+cell iterable
|
||||
+cell A stream of documents.
|
||||
|
||||
+row
|
||||
|
@ -93,87 +114,132 @@ p Match a stream of documents, yielding them in turn.
|
|||
| multi-threading.
|
||||
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell yields
|
||||
+cell #[code Doc]
|
||||
+cell Documents, in order.
|
||||
|
||||
+h(2, "add_entity") Matcher.add_entity
|
||||
+h(2, "len") Matcher.__len__
|
||||
+tag method
|
||||
|
||||
p Add an entity to the matcher.
|
||||
p
|
||||
| Get the number of rules added to the matcher. Note that this only returns
|
||||
| the number of rules (identical with the number of IDs), not the number
|
||||
| of individual patterns.
|
||||
|
||||
+aside-code("Example").
|
||||
matcher = Matcher(nlp.vocab)
|
||||
assert len(matcher) == 0
|
||||
matcher.add('Rule', None, [{ORTH: 'test'}])
|
||||
assert len(matcher) == 1
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell The number of rules.
|
||||
|
||||
+h(2, "contains") Matcher.__contains__
|
||||
+tag method
|
||||
|
||||
p Check whether the matcher contains rules for a match ID.
|
||||
|
||||
+aside-code("Example").
|
||||
matcher = Matcher(nlp.vocab)
|
||||
assert 'Rule' not in matcher
|
||||
matcher.add('Rule', None, [{ORTH: 'test'}])
|
||||
assert 'Rule' in matcher
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code entity_key]
|
||||
+cell unicode / int
|
||||
+cell An ID for the entity.
|
||||
|
||||
+row
|
||||
+cell #[code attrs]
|
||||
+cell -
|
||||
+cell Attributes to associate with the Matcher.
|
||||
|
||||
+row
|
||||
+cell #[code if_exists]
|
||||
+cell #[code key]
|
||||
+cell unicode
|
||||
+cell
|
||||
| #[code 'raise'], #[code 'ignore'] or #[code 'update']. Controls
|
||||
| what happens if the entity ID already exists. Defaults to
|
||||
| #[code 'raise'].
|
||||
+cell The match ID.
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether the matcher contains rules for this match ID.
|
||||
|
||||
+h(2, "add") Matcher.add
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Add a rule to the matcher, consisting of an ID key, one or more patterns, and
|
||||
| a callback function to act on the matches. The callback function will
|
||||
| receive the arguments #[code matcher], #[code doc], #[code i] and
|
||||
| #[code matches]. If a pattern already exists for the given ID, the
|
||||
| patterns will be extended. An #[code on_match] callback will be
|
||||
| overwritten.
|
||||
|
||||
+aside-code("Example").
|
||||
def on_match(matcher, doc, id, matches):
|
||||
print('Matched!', matches)
|
||||
|
||||
matcher = Matcher(nlp.vocab)
|
||||
matcher.add('HelloWorld', on_match, [{LOWER: "hello"}, {LOWER: "world"}])
|
||||
matcher.add('GoogleMaps', on_match, [{ORTH: "Google"}, {ORTH: "Maps"}])
|
||||
doc = nlp(u'HELLO WORLD on Google Maps.')
|
||||
matches = matcher(doc)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code acceptor]
|
||||
+cell -
|
||||
+cell Callback function to filter matches of the entity.
|
||||
+cell #[code match_id]
|
||||
+cell unicode
|
||||
+cell An ID for the thing you're matching.
|
||||
|
||||
+row
|
||||
+cell #[code on_match]
|
||||
+cell -
|
||||
+cell Callback function to act on matches of the entity.
|
||||
+cell callable or #[code None]
|
||||
+cell
|
||||
| Callback function to act on matches. Takes the arguments
|
||||
| #[code matcher], #[code doc], #[code i] and #[code matches].
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
+row
|
||||
+cell #[code *patterns]
|
||||
+cell list
|
||||
+cell
|
||||
| Match pattern. A pattern consists of a list of dicts, where each
|
||||
| dict describes a token.
|
||||
|
||||
+h(2, "add_pattern") Matcher.add_pattern
|
||||
+h(2, "remove") Matcher.remove
|
||||
+tag method
|
||||
|
||||
p Add a pattern to the matcher.
|
||||
p
|
||||
| Remove a rule from the matcher. A #[code KeyError] is raised if the match
|
||||
| ID does not exist.
|
||||
|
||||
+aside-code("Example").
|
||||
matcher.add('Rule', None, [{ORTH: 'test'}])
|
||||
assert 'Rule' in matcher
|
||||
matcher.remove('Rule')
|
||||
assert 'Rule' not in matcher
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code entity_key]
|
||||
+cell unicode / int
|
||||
+cell An ID for the entity.
|
||||
+cell #[code key]
|
||||
+cell unicode
|
||||
+cell The ID of the match rule.
|
||||
|
||||
+row
|
||||
+cell #[code token_specs]
|
||||
+cell -
|
||||
+cell Description of the pattern to be matched.
|
||||
|
||||
+row
|
||||
+cell #[code label]
|
||||
+cell unicode / int
|
||||
+cell Label to assign to the matched pattern. Defaults to #[code ""].
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
|
||||
+h(2, "has_entity") Matcher.has_entity
|
||||
+h(2, "get") Matcher.get
|
||||
+tag method
|
||||
|
||||
p Check whether the matcher has an entity.
|
||||
p
|
||||
| Retrieve the pattern stored for a key. Returns the rule as an
|
||||
| #[code (on_match, patterns)] tuple containing the callback and available
|
||||
| patterns.
|
||||
|
||||
+aside-code("Example").
|
||||
pattern = [{ORTH: 'test'}]
|
||||
matcher.add('Rule', None, pattern)
|
||||
(on_match, patterns) = matcher.get('Rule')
|
||||
assert patterns == [pattern]
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code entity_key]
|
||||
+cell unicode / int
|
||||
+cell The entity key to check.
|
||||
+cell #[code key]
|
||||
+cell unicode
|
||||
+cell The ID of the match rule.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell bool
|
||||
+cell Whether the matcher has the entity.
|
||||
+cell returns
|
||||
+cell tuple
|
||||
+cell The rule, as an #[code (on_match, patterns)] tuple.
|
||||
|
|
95
website/docs/api/spacy.jade
Normal file
|
@ -0,0 +1,95 @@
|
|||
//- 💫 DOCS > API > SPACY
|
||||
|
||||
include ../../_includes/_mixins
|
||||
|
||||
+h(2, "load") spacy.load
|
||||
+tag function
|
||||
+tag-model
|
||||
|
||||
p
|
||||
| Load a model via its #[+a("/docs/usage/models#usage") shortcut link],
|
||||
| the name of an installed
|
||||
| #[+a("/docs/usage/saving-loading#generating") model package], a unicode
|
||||
| path or a #[code Path]-like object. spaCy will try resolving the load
|
||||
| argument in this order. The #[code Language] class to initialise will be
|
||||
| determined based on the model's settings.
|
||||
|
||||
+aside-code("Example").
|
||||
nlp = spacy.load('en') # shortcut link
|
||||
nlp = spacy.load('en_core_web_sm') # package
|
||||
nlp = spacy.load('/path/to/en') # unicode path
|
||||
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
|
||||
|
||||
+infobox("⚠️ Deprecation note")
|
||||
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
|
||||
| will also raise an error if no model could be loaded and never just
|
||||
| return an empty #[code Language] object. If you need a blank language,
|
||||
| you need to import it explicitly (#[code from spacy.lang.en import English])
|
||||
| or use #[+api("util#get_lang_class") #[code util.get_lang_class]].
|
||||
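p
| As a short sketch of the explicit import mentioned above, a blank
| #[code Language] subclass can be instantiated directly and used for
| tokenization only:

+aside-code("Example (sketch)").
    from spacy.lang.en import English
    nlp = English()
    doc = nlp.make_doc(u'A blank pipeline still tokenizes text.')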
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode or #[code Path]
|
||||
+cell Model to load, i.e. shortcut link, package name or path.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Language]
|
||||
+cell A #[code Language] object with the loaded model.
|
||||
|
||||
+h(2, "info") spacy.info
|
||||
+tag function
|
||||
|
||||
p
|
||||
| The same as the #[+api("cli#info") #[code info] command]. Pretty-print
|
||||
| information about your installation, models and local setup from within
|
||||
| spaCy. To get the model meta data as a dictionary instead, you can
|
||||
| use the #[code meta] attribute on your #[code nlp] object with a
|
||||
| loaded model, e.g. #[code nlp.meta].
|
||||
|
||||
+aside-code("Example").
|
||||
spacy.info()
|
||||
spacy.info('en')
|
||||
spacy.info('de', markdown=True)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code model]
|
||||
+cell unicode
|
||||
+cell A model, i.e. shortcut link, package name or path (optional).
|
||||
|
||||
+row
|
||||
+cell #[code markdown]
|
||||
+cell bool
|
||||
+cell Print information as Markdown.
|
||||
|
||||
|
||||
+h(2, "explain") spacy.explain
|
||||
+tag function
|
||||
|
||||
p
|
||||
| Get a description for a given POS tag, dependency label or entity type.
|
||||
| For a list of available terms, see
|
||||
| #[+src(gh("spacy", "spacy/glossary.py")) glossary.py].
|
||||
|
||||
+aside-code("Example").
|
||||
spacy.explain('NORP')
|
||||
# Nationalities or religious or political groups
|
||||
|
||||
doc = nlp(u'Hello world')
|
||||
for word in doc:
|
||||
print(word.text, word.tag_, spacy.explain(word.tag_))
|
||||
# Hello UH interjection
|
||||
# world NN noun, singular or mass
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code term]
|
||||
+cell unicode
|
||||
+cell Term to explain.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell unicode
|
||||
+cell The explanation, or #[code None] if not found in the glossary.
|
|
@ -2,66 +2,18 @@
|
|||
|
||||
include ../../_includes/_mixins
|
||||
|
||||
p A slice from a #[code Doc] object.
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code doc]
|
||||
+cell #[code Doc]
|
||||
+cell The parent document.
|
||||
|
||||
+row
|
||||
+cell #[code start]
|
||||
+cell int
|
||||
+cell The token offset for the start of the span.
|
||||
|
||||
+row
|
||||
+cell #[code end]
|
||||
+cell int
|
||||
+cell The token offset for the end of the span.
|
||||
|
||||
+row
|
||||
+cell #[code start_char]
|
||||
+cell int
|
||||
+cell The character offset for the start of the span.
|
||||
|
||||
+row
|
||||
+cell #[code end_char]
|
||||
+cell int
|
||||
+cell The character offset for the end of the span.
|
||||
|
||||
+row
|
||||
+cell #[code label]
|
||||
+cell int
|
||||
+cell The span's label.
|
||||
|
||||
+row
|
||||
+cell #[code label_]
|
||||
+cell unicode
|
||||
+cell The span's label.
|
||||
|
||||
+row
|
||||
+cell #[code lemma_]
|
||||
+cell unicode
|
||||
+cell The span's lemma.
|
||||
|
||||
+row
|
||||
+cell #[code ent_id]
|
||||
+cell int
|
||||
+cell The integer ID of the named entity the token is an instance of.
|
||||
|
||||
+row
|
||||
+cell #[code ent_id_]
|
||||
+cell unicode
|
||||
+cell The string ID of the named entity the token is an instance of.
|
||||
p A slice from a #[+api("doc") #[code Doc]] object.
|
||||
|
||||
+h(2, "init") Span.__init__
|
||||
+tag method
|
||||
|
||||
p Create a Span object from the #[code slice doc[start : end]].
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
span = doc[1:4]
|
||||
assert [t.text for t in span] == [u'it', u'back', u'!']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code doc]
|
||||
|
@ -89,7 +41,7 @@ p Create a Span object from the #[code slice doc[start : end]].
|
|||
+cell A meaning representation of the span.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Span]
|
||||
+cell The newly constructed object.
|
||||
|
||||
|
@ -98,6 +50,11 @@ p Create a Span object from the #[code slice doc[start : end]].
|
|||
|
||||
p Get a #[code Token] object.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
span = doc[1:4]
|
||||
assert span[1].text == 'back'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code i]
|
||||
|
@ -105,12 +62,17 @@ p Get a #[code Token] object.
|
|||
+cell The index of the token within the span.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Token]
|
||||
+cell The token at #[code span[i]].
|
||||
|
||||
p Get a #[code Span] object.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
span = doc[1:4]
|
||||
assert span[1:3].text == 'back!'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code start_end]
|
||||
|
@ -118,7 +80,7 @@ p Get a #[code Span] object.
|
|||
+cell The slice of the span to get.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Span]
|
||||
+cell The span at #[code span[start : end]].
|
||||
|
||||
|
@ -127,9 +89,14 @@ p Get a #[code Span] object.
|
|||
|
||||
p Iterate over #[code Token] objects.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
span = doc[1:4]
|
||||
assert [t.text for t in span] == ['it', 'back', '!']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell yields
|
||||
+cell #[code Token]
|
||||
+cell A #[code Token] object.
|
||||
|
||||
|
@ -138,19 +105,33 @@ p Iterate over #[code Token] objects.
|
|||
|
||||
p Get the number of tokens in the span.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
span = doc[1:4]
|
||||
assert len(span) == 3
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell The number of tokens in the span.
|
||||
|
||||
+h(2, "similarity") Span.similarity
|
||||
+tag method
|
||||
+tag-model("vectors")
|
||||
|
||||
p
|
||||
| Make a semantic similarity estimate. The default estimate is cosine
|
||||
| similarity using an average of word vectors.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'green apples and red oranges')
|
||||
green_apples = doc[:2]
|
||||
red_oranges = doc[3:]
|
||||
apples_oranges = green_apples.similarity(red_oranges)
|
||||
oranges_apples = red_oranges.similarity(green_apples)
|
||||
assert apples_oranges == oranges_apples
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code other]
|
||||
|
@ -160,7 +141,7 @@ p
|
|||
| #[code Span], #[code Token] and #[code Lexeme] objects.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell float
|
||||
+cell A scalar similarity score. Higher is more similar.
|
||||
|
||||
|
@ -178,87 +159,205 @@ p Retokenize the document, such that the span is merged into a single token.
|
|||
| are inherited from the syntactic root token of the span.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Token]
|
||||
+cell The newly merged token.
|
||||
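p
| A minimal sketch of the merge operation described above, assuming a
| loaded #[code nlp] pipeline (the token counts shown apply to this
| example text):

+aside-code("Example (sketch)").
    doc = nlp(u'I like New York in Autumn.')
    span = doc[2:4]
    span.merge()
    assert len(doc) == 6
    assert doc[2].text == u'New York'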
|
||||
+h(2, "text") Span.text
|
||||
+tag property
|
||||
|
||||
p A unicode representation of the span text.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell unicode
|
||||
+cell The original verbatim text of the span.
|
||||
|
||||
+h(2, "text_with_ws") Span.text_with_ws
|
||||
+tag property
|
||||
|
||||
p
|
||||
| The text content of the span with a trailing whitespace character if the
|
||||
| last token has one.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell unicode
|
||||
+cell The text content of the span (with trailing whitespace).
|
||||
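p
| A short sketch illustrating the difference between #[code Span.text]
| and #[code Span.text_with_ws], assuming a loaded #[code nlp] pipeline:

+aside-code("Example (sketch)").
    doc = nlp(u'Give it back! He pleaded.')
    assert doc[1:4].text == u'it back!'
    assert doc[1:4].text_with_ws == u'it back! '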
|
||||
+h(2, "sent") Span.sent
|
||||
+tag property
|
||||
|
||||
p The sentence span that this span is a part of.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Span]
|
||||
+cell The sentence this is part of.
|
||||
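p
| A short sketch, assuming a loaded pipeline whose parser (or another
| component) sets sentence boundaries:

+aside-code("Example (sketch)").
    doc = nlp(u'Give it back! He pleaded.')
    span = doc[1:3]
    assert span.sent.text == u'Give it back!'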
|
||||
+h(2, "root") Span.root
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p
|
||||
| The token within the span that's highest in the parse tree. If there's a
|
||||
| tie, the earliest is preferred.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like New York in Autumn.')
|
||||
i, like, new, york, in_, autumn, dot = range(len(doc))
|
||||
assert doc[new].head.text == 'York'
|
||||
assert doc[york].head.text == 'like'
|
||||
new_york = doc[new:york+1]
|
||||
assert new_york.root.text == 'York'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Token]
|
||||
+cell The root token.
|
||||
|
||||
+h(2, "lefts") Span.lefts
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p Tokens that are to the left of the span, whose head is within the span.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like New York in Autumn.')
|
||||
lefts = [t.text for t in doc[3:7].lefts]
|
||||
assert lefts == [u'New']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell yields
|
||||
+cell #[code Token]
|
||||
+cell A left-child of a token of the span.
|
||||
|
||||
+h(2, "rights") Span.rights
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p Tokens that are to the right of the span, whose head is within the span.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like New York in Autumn.')
|
||||
rights = [t.text for t in doc[2:4].rights]
|
||||
assert rights == [u'in']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell yields
|
||||
+cell #[code Token]
|
||||
+cell A right-child of a token of the span.
|
||||
|
||||
+h(2, "subtree") Span.subtree
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p Tokens that descend from tokens in the span, but fall outside it.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
subtree = [t.text for t in doc[:3].subtree]
|
||||
assert subtree == [u'Give', u'it', u'back', u'!']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell yields
|
||||
+cell #[code Token]
|
||||
+cell A descendant of a token within the span.
|
||||
|
||||
+h(2, "has_vector") Span.has_vector
|
||||
+tag property
|
||||
+tag-model("vectors")
|
||||
|
||||
p
|
||||
| A boolean value indicating whether a word vector is associated with the
|
||||
| object.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like apples')
|
||||
assert doc[1:].has_vector
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether the span has a vector data attached.
|
||||
|
||||
+h(2, "vector") Span.vector
|
||||
+tag property
|
||||
+tag-model("vectors")
|
||||
|
||||
p
|
||||
| A real-valued meaning representation. Defaults to an average of the
|
||||
| token vectors.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like apples')
|
||||
assert doc[1:].vector.dtype == 'float32'
|
||||
assert doc[1:].vector.shape == (300,)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A 1D numpy array representing the span's semantics.
|
||||
|
||||
+h(2, "vector_norm") Span.vector_norm
|
||||
+tag property
|
||||
+tag-model("vectors")
|
||||
|
||||
p
|
||||
| The L2 norm of the span's vector representation.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like apples')
|
||||
doc[1:].vector_norm # 4.800883928527915
|
||||
doc[2:].vector_norm # 6.895897646384268
|
||||
assert doc[1:].vector_norm != doc[2:].vector_norm
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell float
|
||||
+cell The L2 norm of the vector representation.
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code doc]
|
||||
+cell #[code Doc]
|
||||
+cell The parent document.
|
||||
|
||||
+row
|
||||
+cell #[code sent]
|
||||
+cell #[code Span]
|
||||
+cell The sentence span that this span is a part of.
|
||||
|
||||
+row
|
||||
+cell #[code start]
|
||||
+cell int
|
||||
+cell The token offset for the start of the span.
|
||||
|
||||
+row
|
||||
+cell #[code end]
|
||||
+cell int
|
||||
+cell The token offset for the end of the span.
|
||||
|
||||
+row
|
||||
+cell #[code start_char]
|
||||
+cell int
|
||||
+cell The character offset for the start of the span.
|
||||
|
||||
+row
|
||||
+cell #[code end_char]
|
||||
+cell int
|
||||
+cell The character offset for the end of the span.
|
||||
|
||||
+row
|
||||
+cell #[code text]
|
||||
+cell unicode
|
||||
+cell A unicode representation of the span text.
|
||||
|
||||
+row
|
||||
+cell #[code text_with_ws]
|
||||
+cell unicode
|
||||
+cell
|
||||
| The text content of the span with a trailing whitespace character
|
||||
| if the last token has one.
|
||||
|
||||
+row
|
||||
+cell #[code label]
|
||||
+cell int
|
||||
+cell The span's label.
|
||||
|
||||
+row
|
||||
+cell #[code label_]
|
||||
+cell unicode
|
||||
+cell The span's label.
|
||||
|
||||
+row
|
||||
+cell #[code lemma_]
|
||||
+cell unicode
|
||||
+cell The span's lemma.
|
||||
|
||||
+row
|
||||
+cell #[code ent_id]
|
||||
+cell int
|
||||
+cell The integer ID of the named entity the token is an instance of.
|
||||
|
||||
+row
|
||||
+cell #[code ent_id_]
|
||||
+cell unicode
|
||||
+cell The string ID of the named entity the token is an instance of.
|
||||
|
|
|
@ -7,16 +7,22 @@ p Map strings to and from integer IDs.
|
|||
+h(2, "init") StringStore.__init__
|
||||
+tag method
|
||||
|
||||
p Create the #[code StringStore].
|
||||
p
|
||||
| Create the #[code StringStore]. Note that a newly initialised store will
|
||||
| always include an empty string #[code ''] at position #[code 0].
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.strings import StringStore
|
||||
stringstore = StringStore([u'apple', u'orange'])
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code strings]
|
||||
+cell -
|
||||
+cell iterable
|
||||
+cell A sequence of unicode strings to add to the store.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code StringStore]
|
||||
+cell The newly constructed object.
|
||||
|
||||
|
@ -25,9 +31,13 @@ p Create the #[code StringStore].
|
|||
|
||||
p Get the number of strings in the store.
|
||||
|
||||
+aside-code("Example").
|
||||
stringstore = StringStore([u'apple', u'orange'])
|
||||
assert len(stringstore) == 2
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell The number of strings in the store.
|
||||
|
||||
|
@ -36,22 +46,32 @@ p Get the number of strings in the store.
|
|||
|
||||
p Retrieve a string from a given integer ID, or vice versa.
|
||||
|
||||
+aside-code("Example").
|
||||
stringstore = StringStore([u'apple', u'orange'])
|
||||
int_id = stringstore[u'apple'] # 1
|
||||
assert stringstore[int_id] == u'apple'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string_or_id]
|
||||
+cell bytes / unicode / int
|
||||
+cell bytes, unicode or int
|
||||
+cell The value to encode.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell unicode / int
|
||||
+cell The value to retrieved.
|
||||
+cell returns
|
||||
+cell unicode or int
|
||||
+cell The value to be retrieved.
|
||||
|
||||
+h(2, "contains") StringStore.__contains__
|
||||
+tag method
|
||||
|
||||
p Check whether a string is in the store.
|
||||
|
||||
+aside-code("Example").
|
||||
stringstore = StringStore([u'apple', u'orange'])
|
||||
assert u'apple' in stringstore
|
||||
assert u'cherry' not in stringstore
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string]
|
||||
|
@ -59,49 +79,108 @@ p Check whether a string is in the store.
|
|||
+cell The string to check.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether the store contains the string.
|
||||
|
||||
+h(2, "iter") StringStore.__iter__
|
||||
+tag method
|
||||
|
||||
p Iterate over the strings in the store, in order.
|
||||
p
|
||||
| Iterate over the strings in the store, in order. Note that a newly
|
||||
| initialised store will always include an empty string #[code ''] at
|
||||
| position #[code 0].
|
||||
|
||||
+aside-code("Example").
|
||||
stringstore = StringStore([u'apple', u'orange'])
|
||||
all_strings = [s for s in stringstore]
|
||||
assert all_strings == [u'', u'apple', u'orange']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell yields
|
||||
+cell unicode
|
||||
+cell A string in the store.
|
||||
|
||||
+h(2, "dump") StringStore.dump
|
||||
+h(2, "to_disk") StringStore.to_disk
|
||||
+tag method
|
||||
|
||||
p Save the strings to a JSON file.
|
||||
p Save the current state to a directory.
|
||||
|
||||
+aside-code("Example").
|
||||
stringstore.to_disk('/path/to/strings')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code file]
|
||||
+cell buffer
|
||||
+cell The file to save the strings.
|
||||
+cell #[code path]
|
||||
+cell unicode or #[code Path]
|
||||
+cell
|
||||
| A path to a directory, which will be created if it doesn't exist.
|
||||
| Paths may be either strings or #[code Path]-like objects.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
|
||||
+h(2, "load") StringStore.load
|
||||
+h(2, "from_disk") Tokenizer.from_disk
|
||||
+tag method
|
||||
|
||||
p Load the strings from a JSON file.
|
||||
p Loads state from a directory. Modifies the object in place and returns it.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.strings import StringStore
|
||||
stringstore = StringStore().from_disk('/path/to/strings')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code file]
|
||||
+cell buffer
|
||||
+cell The file from which to load the strings.
|
||||
+cell #[code path]
|
||||
+cell unicode or #[code Path]
|
||||
+cell
|
||||
| A path to a directory. Paths may be either strings or
|
||||
| #[code Path]-like objects.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code None]
|
||||
+cell returns
|
||||
+cell #[code StringStore]
|
||||
+cell The modified #[code StringStore] object.
|
||||
|
||||
+h(2, "to_bytes") Tokenizer.to_bytes
|
||||
+tag method
|
||||
|
||||
p Serialize the current state to a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
store_bytes = stringstore.to_bytes()
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being serialized.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bytes
|
||||
+cell The serialized form of the #[code StringStore] object.
|
||||
|
||||
+h(2, "from_bytes") Tokenizer.from_bytes
|
||||
+tag method
|
||||
|
||||
p Load state from a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.strings import StringStore
|
||||
store_bytes = stringstore.to_bytes()
|
||||
new_store = StringStore().from_bytes(store_bytes)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code bytes_data]
|
||||
+cell bytes
|
||||
+cell The data to load from.
|
||||
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being loaded.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code StringStore]
|
||||
+cell The #[code StringStore] object.
|
||||
|
|
|
@ -4,32 +4,6 @@ include ../../_includes/_mixins
|
|||
|
||||
p Annotate part-of-speech tags on #[code Doc] objects.
|
||||
|
||||
+h(2, "load") Tagger.load
|
||||
+tag classmethod
|
||||
|
||||
p Load the statistical model from the supplied path.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell #[code Path]
|
||||
+cell The path to load from.
|
||||
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell The vocabulary. Must be shared by the documents to be processed.
|
||||
|
||||
+row
|
||||
+cell #[code require]
|
||||
+cell bool
|
||||
+cell Whether to raise an error if the files are not found.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Tagger]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "init") Tagger.__init__
|
||||
+tag method
|
||||
|
||||
|
@ -47,7 +21,7 @@ p Create a #[code Tagger].
|
|||
+cell The statistical model.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Tagger]
|
||||
+cell The newly constructed object.
|
||||
|
||||
|
@ -63,7 +37,7 @@ p Apply the tagger, setting the POS tags onto the #[code Doc] object.
|
|||
+cell The tokens to be tagged.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
|
||||
|
@ -91,7 +65,7 @@ p Tag a stream of documents.
|
|||
| parallel.
|
||||
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell yields
|
||||
+cell #[code Doc]
|
||||
+cell Documents, in order.
|
||||
|
||||
|
@ -112,6 +86,6 @@ p Update the statistical model, with tags supplied for the given document.
|
|||
+cell Manager for the gold-standard tags.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell Number of tags predicted correctly.
|
||||
|
|
|
@ -4,9 +4,296 @@ include ../../_includes/_mixins
|
|||
|
||||
p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
|
||||
|
||||
+h(2, "init") Token.__init__
|
||||
+tag method
|
||||
|
||||
p Construct a #[code Token] object.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
token = doc[0]
|
||||
assert token.text == u'Give'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell A storage container for lexical types.
|
||||
|
||||
+row
|
||||
+cell #[code doc]
|
||||
+cell #[code Doc]
|
||||
+cell The parent document.
|
||||
|
||||
+row
|
||||
+cell #[code offset]
|
||||
+cell int
|
||||
+cell The index of the token within the document.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Token]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "len") Token.__len__
|
||||
+tag method
|
||||
|
||||
p The number of unicode characters in the token, i.e. #[code token.text].
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
token = doc[0]
|
||||
assert len(token) == 4
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell The number of unicode characters in the token.
|
||||
|
||||
+h(2, "check_flag") Token.check_flag
|
||||
+tag method
|
||||
|
||||
p Check the value of a boolean flag.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.attrs import IS_TITLE
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
token = doc[0]
|
||||
assert token.check_flag(IS_TITLE) == True
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code flag_id]
|
||||
+cell int
|
||||
+cell The attribute ID of the flag to check.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether the flag is set.
|
||||
|
||||
+h(2, "similarity") Token.similarity
|
||||
+tag method
|
||||
+tag-model("vectors")
|
||||
|
||||
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
|
||||
|
||||
+aside-code("Example").
|
||||
apples, _, oranges = nlp(u'apples and oranges')
|
||||
apples_oranges = apples.similarity(oranges)
|
||||
oranges_apples = oranges.similarity(apples)
|
||||
assert apples_oranges == oranges_apples
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell other
|
||||
+cell -
|
||||
+cell
|
||||
| The object to compare with. By default, accepts #[code Doc],
|
||||
| #[code Span], #[code Token] and #[code Lexeme] objects.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell float
|
||||
+cell A scalar similarity score. Higher is more similar.
|
||||
|
||||
+h(2, "nbor") Token.nbor
|
||||
+tag method
|
||||
|
||||
p Get a neighboring token.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
give_nbor = doc[0].nbor()
|
||||
assert give_nbor.text == u'it'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code i]
|
||||
+cell int
|
||||
+cell The relative position of the token to get. Defaults to #[code 1].
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Token]
|
||||
+cell The token at position #[code self.doc[self.i+i]].
|
||||
|
||||
+h(2, "is_ancestor") Token.is_ancestor
|
||||
+tag method
|
||||
+tag-model("parse")
|
||||
|
||||
p
|
||||
| Check whether this token is a parent, grandparent, etc. of another
|
||||
| in the dependency tree.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
give = doc[0]
|
||||
it = doc[1]
|
||||
assert give.is_ancestor(it)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell descendant
|
||||
+cell #[code Token]
|
||||
+cell Another token.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether this token is the ancestor of the descendant.
|
||||
|
||||
+h(2, "ancestors") Token.ancestors
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p A sequence of this token's syntactic ancestors.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
it_ancestors = doc[1].ancestors
|
||||
assert [t.text for t in it_ancestors] == [u'Give']
|
||||
he_ancestors = doc[4].ancestors
|
||||
assert [t.text for t in he_ancestors] == [u'pleaded']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yields
|
||||
+cell #[code Token]
|
||||
+cell
|
||||
| A sequence of ancestor tokens such that
|
||||
| #[code ancestor.is_ancestor(self)].
|
||||
|
||||
+h(2, "conjuncts") Token.conjuncts
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p A sequence of coordinated tokens, including the token itself.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like apples and oranges')
|
||||
apples_conjuncts = doc[2].conjuncts
|
||||
assert [t.text for t in apples_conjuncts] == [u'oranges']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yields
|
||||
+cell #[code Token]
|
||||
+cell A coordinated token.
|
||||
|
||||
+h(2, "children") Token.children
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p A sequence of the token's immediate syntactic children.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
give_children = doc[0].children
|
||||
assert [t.text for t in give_children] == [u'it', u'back', u'!']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yields
|
||||
+cell #[code Token]
|
||||
+cell A child token such that #[code child.head==self].
|
||||
|
||||
+h(2, "subtree") Token.subtree
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p A sequence of all the token's syntactic descendants.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
give_subtree = doc[0].subtree
|
||||
assert [t.text for t in give_subtree] == [u'Give', u'it', u'back', u'!']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yields
|
||||
+cell #[code Token]
|
||||
+cell A descendant token such that #[code self.is_ancestor(descendant)].
|
||||
|
||||
+h(2, "has_vector") Token.has_vector
|
||||
+tag property
|
||||
+tag-model("vectors")
|
||||
|
||||
p
|
||||
| A boolean value indicating whether a word vector is associated with the
|
||||
| token.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like apples')
|
||||
apples = doc[2]
|
||||
assert apples.has_vector
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether the token has a vector data attached.
|
||||
|
||||
+h(2, "vector") Token.vector
|
||||
+tag property
|
||||
+tag-model("vectors")
|
||||
|
||||
p A real-valued meaning representation.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like apples')
|
||||
apples = doc[2]
|
||||
assert apples.vector.dtype == 'float32'
|
||||
assert apples.vector.shape == (300,)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A 1D numpy array representing the token's semantics.
|
||||
|
||||
+h(2, "vector_norm") Span.vector_norm
|
||||
+tag property
|
||||
+tag-model("vectors")
|
||||
|
||||
p The L2 norm of the token's vector representation.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like apples and pasta')
|
||||
apples = doc[2]
|
||||
pasta = doc[4]
|
||||
apples.vector_norm # 6.89589786529541
|
||||
pasta.vector_norm # 7.759851932525635
|
||||
assert apples.vector_norm != pasta.vector_norm
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell float
|
||||
+cell The L2 norm of the vector representation.
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code text]
|
||||
+cell unicode
|
||||
+cell Verbatim text content.
|
||||
+row
|
||||
+cell #[code text_with_ws]
|
||||
+cell unicode
|
||||
+cell Text content, with trailing space character if present.
|
||||
|
||||
+row
|
||||
+cell #[code whitespace]
|
||||
+cell int
|
||||
+cell Trailing space character if present.
|
||||
+row
|
||||
+cell #[code whitespace_]
|
||||
+cell unicode
|
||||
+cell Trailing space character if present.
|
||||
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
|
@ -17,14 +304,31 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
|
|||
+cell #[code Doc]
|
||||
+cell The parent document.
|
||||
|
||||
+row
|
||||
+cell #[code head]
|
||||
+cell #[code Token]
|
||||
+cell The syntactic parent, or "governor", of this token.
|
||||
|
||||
+row
|
||||
+cell #[code left_edge]
|
||||
+cell #[code Token]
|
||||
+cell The leftmost token of this token's syntactic descendants.
|
||||
|
||||
+row
|
||||
+cell #[code right_edge]
|
||||
+cell #[code Token]
|
||||
+cell The rightmost token of this token's syntactic descendants.
|
||||
|
||||
+row
|
||||
+cell #[code i]
|
||||
+cell int
|
||||
+cell The index of the token within the parent document.
|
||||
|
||||
+row
|
||||
+cell #[code ent_type]
|
||||
+cell int
|
||||
+cell Named entity type.
|
||||
|
||||
+row
|
||||
+cell #[code ent_type_]
|
||||
+cell unicode
|
||||
|
@ -42,19 +346,23 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
|
|||
+cell unicode
|
||||
+cell
|
||||
| IOB code of named entity tag. #[code "B"]
|
||||
| means the token begins an entity, #[code "I"] means it inside an
|
||||
| entity, #[code "O"] means it is outside an entity, and
|
||||
| means the token begins an entity, #[code "I"] means it is inside
|
||||
| an entity, #[code "O"] means it is outside an entity, and
|
||||
| #[code ""] means no entity tag is set.
|
||||
|
||||
+row
|
||||
+cell #[code ent_id]
|
||||
+cell int
|
||||
+cell ID of the entity the token is an instance of, if any.
|
||||
+cell
|
||||
| ID of the entity the token is an instance of, if any. Usually
|
||||
| assigned by patterns in the Matcher.
|
||||
|
||||
+row
|
||||
+cell #[code ent_id_]
|
||||
+cell unicode
|
||||
+cell ID of the entity the token is an instance of, if any.
|
||||
+cell
|
||||
| ID of the entity the token is an instance of, if any. Usually
|
||||
| assigned by patterns in the Matcher.
|
||||
|
||||
+row
|
||||
+cell #[code lemma]
|
||||
|
@ -229,232 +537,3 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
|
|||
+cell #[code lex_id]
|
||||
+cell int
|
||||
+cell ID of the token's lexical type.
|
||||
|
||||
+row
|
||||
+cell #[code text]
|
||||
+cell unicode
|
||||
+cell Verbatim text content.
|
||||
+row
|
||||
+cell #[code text_with_ws]
|
||||
+cell unicode
|
||||
+cell Text content, with trailing space character if present.
|
||||
|
||||
+row
|
||||
+cell #[code whitespace]
|
||||
+cell int
|
||||
+cell Trailing space character if present.
|
||||
+row
|
||||
+cell #[code whitespace_]
|
||||
+cell unicode
|
||||
+cell Trailing space character if present.
|
||||
|
||||
|
||||
+h(2, "init") Token.__init__
|
||||
+tag method
|
||||
|
||||
p Construct a #[code Token] object.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell A storage container for lexical types.
|
||||
|
||||
+row
|
||||
+cell #[code doc]
|
||||
+cell #[code Doc]
|
||||
+cell The parent document.
|
||||
|
||||
+row
|
||||
+cell #[code offset]
|
||||
+cell int
|
||||
+cell The index of the token within the document.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Token]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "len") Token.__len__
|
||||
+tag method
|
||||
|
||||
p Get the number of unicode characters in the token.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell int
|
||||
+cell The number of unicode characters in the token.
|
||||
|
||||
|
||||
+h(2, "check_flag") Token.check_flag
|
||||
+tag method
|
||||
|
||||
p Check the value of a boolean flag.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code flag_id]
|
||||
+cell int
|
||||
+cell The attribute ID of the flag to check.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell bool
|
||||
+cell Whether the flag is set.
|
||||
|
||||
+h(2, "nbor") Token.nbor
|
||||
+tag method
|
||||
|
||||
p Get a neighboring token.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code i]
|
||||
+cell int
|
||||
+cell The relative position of the token to get. Defaults to #[code 1].
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Token]
|
||||
+cell The token at position #[code self.doc[self.i+i]]
|
||||
|
||||
+h(2, "similarity") Token.similarity
|
||||
+tag method
|
||||
|
||||
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell other
|
||||
+cell -
|
||||
+cell
|
||||
| The object to compare with. By default, accepts #[code Doc],
|
||||
| #[code Span], #[code Token] and #[code Lexeme] objects.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell float
|
||||
+cell A scalar similarity score. Higher is more similar.
|
||||
|
||||
+h(2, "is_ancestor") Token.is_ancestor
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Check whether this token is a parent, grandparent, etc. of another
|
||||
| in the dependency tree.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell descendant
|
||||
+cell #[code Token]
|
||||
+cell Another token.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell bool
|
||||
+cell Whether this token is the ancestor of the descendant.
|
||||
|
||||
|
||||
+h(2, "vector") Token.vector
|
||||
+tag property
|
||||
|
||||
p A real-valued meaning representation.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A 1D numpy array representing the token's semantics.
|
||||
|
||||
+h(2, "has_vector") Token.has_vector
|
||||
+tag property
|
||||
|
||||
p
|
||||
| A boolean value indicating whether a word vector is associated with the
|
||||
| object.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell bool
|
||||
+cell Whether the token has a vector data attached.
|
||||
|
||||
+h(2, "head") Token.head
|
||||
+tag property
|
||||
|
||||
p The syntactic parent, or "governor", of this token.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Token]
|
||||
+cell The head.
|
||||
|
||||
+h(2, "conjuncts") Token.conjuncts
|
||||
+tag property
|
||||
|
||||
p A sequence of coordinated tokens, including the token itself.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Token]
|
||||
+cell A coordinated token.
|
||||
|
||||
+h(2, "children") Token.children
|
||||
+tag property
|
||||
|
||||
p A sequence of the token's immediate syntactic children.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Token]
|
||||
+cell A child token such that #[code child.head==self].
|
||||
|
||||
+h(2, "subtree") Token.subtree
|
||||
+tag property
|
||||
|
||||
p A sequence of all the token's syntactic descendents.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Token]
|
||||
+cell A descendant token such that #[code self.is_ancestor(descendant)].
|
||||
|
||||
+h(2, "left_edge") Token.left_edge
|
||||
+tag property
|
||||
|
||||
p The leftmost token of this token's syntactic descendants.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Token]
|
||||
+cell The first token such that #[code self.is_ancestor(token)].
|
||||
|
||||
+h(2, "right_edge") Token.right_edge
|
||||
+tag property
|
||||
|
||||
p The rightmost token of this token's syntactic descendents.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Token]
|
||||
+cell The last token such that #[code self.is_ancestor(token)].
|
||||
|
||||
+h(2, "ancestors") Token.ancestors
|
||||
+tag property
|
||||
|
||||
p The rightmost token of this token's syntactic descendants.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Token]
|
||||
+cell
|
||||
| A sequence of ancestor tokens such that
|
||||
| #[code ancestor.is_ancestor(self)].
|
||||
|
|
|
@ -6,6 +6,283 @@ p
|
|||
| Segment text, and create #[code Doc] objects with the discovered segment
|
||||
| boundaries.
|
||||
|
||||
+h(2, "init") Tokenizer.__init__
|
||||
+tag method
|
||||
|
||||
p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.
|
||||
|
||||
+aside-code("Example").
|
||||
# Construction 1
|
||||
from spacy.tokenizer import Tokenizer
|
||||
tokenizer = Tokenizer(nlp.vocab)
|
||||
|
||||
# Construction 2
|
||||
from spacy.lang.en import English
|
||||
tokenizer = English().Defaults.create_tokenizer(nlp)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell A storage container for lexical types.
|
||||
|
||||
+row
|
||||
+cell #[code rules]
|
||||
+cell dict
|
||||
+cell Exceptions and special-cases for the tokenizer.
|
||||
|
||||
+row
|
||||
+cell #[code prefix_search]
|
||||
+cell callable
|
||||
+cell
|
||||
| A function matching the signature of
|
||||
| #[code re.compile(string).search] to match prefixes.
|
||||
|
||||
+row
|
||||
+cell #[code suffix_search]
|
||||
+cell callable
|
||||
+cell
|
||||
| A function matching the signature of
|
||||
| #[code re.compile(string).search] to match suffixes.
|
||||
|
||||
+row
|
||||
+cell #[code infix_finditer]
|
||||
+cell callable
|
||||
+cell
|
||||
| A function matching the signature of
|
||||
| #[code re.compile(string).finditer] to find infixes.
|
||||
|
||||
+row
|
||||
+cell #[code token_match]
|
||||
+cell callable
|
||||
+cell A boolean function matching strings to be recognised as tokens.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Tokenizer]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "call") Tokenizer.__call__
|
||||
+tag method
|
||||
|
||||
p Tokenize a string.
|
||||
|
||||
+aside-code("Example").
|
||||
tokens = tokenizer(u'This is a sentence')
|
||||
assert len(tokens) == 4
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string]
|
||||
+cell unicode
|
||||
+cell The string to tokenize.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Doc]
|
||||
+cell A container for linguistic annotations.
|
||||
|
||||
+h(2, "pipe") Tokenizer.pipe
|
||||
+tag method
|
||||
|
||||
p Tokenize a stream of texts.
|
||||
|
||||
+aside-code("Example").
|
||||
texts = [u'One document.', u'...', u'Lots of documents']
|
||||
for doc in tokenizer.pipe(texts, batch_size=50):
|
||||
pass
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code texts]
|
||||
+cell -
|
||||
+cell A sequence of unicode texts.
|
||||
|
||||
+row
|
||||
+cell #[code batch_size]
|
||||
+cell int
|
||||
+cell The number of texts to accumulate in an internal buffer.
|
||||
|
||||
+row
|
||||
+cell #[code n_threads]
|
||||
+cell int
|
||||
+cell
|
||||
| The number of threads to use, if the implementation supports
|
||||
| multi-threading. The default tokenizer is single-threaded.
|
||||
|
||||
+footrow
|
||||
+cell yields
|
||||
+cell #[code Doc]
|
||||
+cell A sequence of Doc objects, in order.
|
||||
|
||||
+h(2, "find_infix") Tokenizer.find_infix
|
||||
+tag method
|
||||
|
||||
p Find internal split points of the string.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string]
|
||||
+cell unicode
|
||||
+cell The string to split.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell list
|
||||
+cell
|
||||
| A list of #[code re.MatchObject] objects that have #[code .start()]
|
||||
| and #[code .end()] methods, denoting the placement of internal
|
||||
| segment separators, e.g. hyphens.
|
||||
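p
| A small sketch of how the returned match objects can be inspected,
| assuming a loaded #[code nlp] pipeline (the exact matches depend on
| the language's infix rules):

+aside-code("Example (sketch)").
    tokenizer = nlp.tokenizer
    for match in tokenizer.find_infix(u'New York-based'):
        print(match.start(), match.end())  # e.g. the position of the hyphen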
|
||||
+h(2, "find_prefix") Tokenizer.find_prefix
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Find the length of a prefix that should be segmented from the string, or
|
||||
| #[code None] if no prefix rules match.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string]
|
||||
+cell unicode
|
||||
+cell The string to segment.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell int / #[code None]
|
||||
+cell The length of the prefix if present, otherwise #[code None].
|
||||
|
||||
+h(2, "find_suffix") Tokenizer.find_suffix
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Find the length of a suffix that should be segmented from the string, or
|
||||
| #[code None] if no suffix rules match.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string]
|
||||
+cell unicode
|
||||
+cell The string to segment.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell int / #[code None]
|
||||
+cell The length of the suffix if present, otherwise #[code None].
|
||||
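p
| A sketch for the prefix and suffix helpers, assuming a loaded
| #[code nlp] pipeline with the default punctuation rules (the lengths
| given in the comments are illustrative only):

+aside-code("Example (sketch)").
    tokenizer = nlp.tokenizer
    prefix_len = tokenizer.find_prefix(u'"Hello"')  # e.g. 1 for the opening quote
    suffix_len = tokenizer.find_suffix(u'"Hello"')  # e.g. 1 for the closing quote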
|
||||
+h(2, "add_special_case") Tokenizer.add_special_case
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Add a special-case tokenization rule. This mechanism is also used to add
|
||||
| custom tokenizer exceptions to the language data. See the usage workflow
|
||||
| on #[+a("/docs/usage/adding-languages#tokenizer-exceptions") adding languages]
|
||||
| for more details and examples.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.attrs import ORTH, LEMMA
|
||||
case = [{"don't": [{ORTH: "do"}, {ORTH: "n't", LEMMA: "not"}]}]
|
||||
tokenizer.add_special_case(u"don't", case)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string]
|
||||
+cell unicode
|
||||
+cell The string to specially tokenize.
|
||||
|
||||
+row
|
||||
+cell #[code token_attrs]
|
||||
+cell iterable
|
||||
+cell
|
||||
| A sequence of dicts, where each dict describes a token and its
|
||||
| attributes. The #[code ORTH] fields of the attributes must
|
||||
| exactly match the string when they are concatenated.
|
||||
|
||||
+h(2, "to_disk") Tokenizer.to_disk
|
||||
+tag method
|
||||
|
||||
p Save the current state to a directory.
|
||||
|
||||
+aside-code("Example").
|
||||
tokenizer.to_disk('/path/to/tokenizer')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell unicode or #[code Path]
|
||||
+cell
|
||||
| A path to a directory, which will be created if it doesn't exist.
|
||||
| Paths may be either strings or #[code Path]-like objects.
|
||||
|
||||
+h(2, "from_disk") Tokenizer.from_disk
|
||||
+tag method
|
||||
|
||||
p Loads state from a directory. Modifies the object in place and returns it.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokenizer import Tokenizer
|
||||
tokenizer = Tokenizer(nlp.vocab)
|
||||
tokenizer = tokenizer.from_disk('/path/to/tokenizer')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell unicode or #[code Path]
|
||||
+cell
|
||||
| A path to a directory. Paths may be either strings or
|
||||
| #[code Path]-like objects.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Tokenizer]
|
||||
+cell The modified #[code Tokenizer] object.
|
||||
|
||||
+h(2, "to_bytes") Tokenizer.to_bytes
|
||||
+tag method
|
||||
|
||||
p Serialize the current state to a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
tokenizer_bytes = tokenizer.to_bytes()
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being serialized.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bytes
|
||||
+cell The serialized form of the #[code Tokenizer] object.
|
||||
|
||||
+h(2, "from_bytes") Tokenizer.from_bytes
|
||||
+tag method
|
||||
|
||||
p Load state from a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokenizer import Tokenizer
|
||||
tokenizer_bytes = tokenizer.to_bytes()
|
||||
new_tokenizer = Tokenizer(nlp.vocab)
|
||||
new_tokenizer.from_bytes(tokenizer_bytes)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code bytes_data]
|
||||
+cell bytes
|
||||
+cell The data to load from.
|
||||
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being loaded.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Tokenizer]
|
||||
+cell The #[code Tokenizer] object.
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
|
@ -35,215 +312,3 @@ p
|
|||
| A function to find internal segment separators, e.g. hyphens.
|
||||
| Returns a (possibly empty) list of #[code re.MatchObject]
|
||||
| objects.
|
||||
|
||||
+h(2, "load") Tokenizer.load
|
||||
+tag classmethod
|
||||
|
||||
p Load a #[code Tokenizer], reading unsupplied components from the path.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell #[code Path]
|
||||
+cell The path to load from.
|
||||
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell A storage container for lexical types.
|
||||
|
||||
+row
|
||||
+cell #[code rules]
|
||||
+cell dict
|
||||
+cell Exceptions and special-cases for the tokenizer.
|
||||
|
||||
+row
|
||||
+cell #[code prefix_search]
|
||||
+cell callable
|
||||
+cell
|
||||
| A function matching the signature of
|
||||
| #[code re.compile(string).search] to match prefixes.
|
||||
|
||||
+row
|
||||
+cell #[code suffix_search]
|
||||
+cell callable
|
||||
+cell
|
||||
| A function matching the signature of
|
||||
| #[code re.compile(string).search] to match suffixes.
|
||||
|
||||
+row
|
||||
+cell #[code infix_finditer]
|
||||
+cell callable
|
||||
+cell
|
||||
| A function matching the signature of
|
||||
| #[code re.compile(string).finditer] to find infixes.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Tokenizer]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "init") Tokenizer.__init__
|
||||
+tag method
|
||||
|
||||
p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell A storage container for lexical types.
|
||||
|
||||
+row
|
||||
+cell #[code rules]
|
||||
+cell dict
|
||||
+cell Exceptions and special-cases for the tokenizer.
|
||||
|
||||
+row
|
||||
+cell #[code prefix_search]
|
||||
+cell callable
|
||||
+cell
|
||||
| A function matching the signature of
|
||||
| #[code re.compile(string).search] to match prefixes.
|
||||
|
||||
+row
|
||||
+cell #[code suffix_search]
|
||||
+cell callable
|
||||
+cell
|
||||
| A function matching the signature of
|
||||
| #[code re.compile(string).search] to match suffixes.
|
||||
|
||||
+row
|
||||
+cell #[code infix_finditer]
|
||||
+cell callable
|
||||
+cell
|
||||
| A function matching the signature of
|
||||
| #[code re.compile(string).finditer] to find infixes.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Tokenizer]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "call") Tokenizer.__call__
|
||||
+tag method
|
||||
|
||||
p Tokenize a string.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string]
|
||||
+cell unicode
|
||||
+cell The string to tokenize.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Doc]
|
||||
+cell A container for linguistic annotations.
|
||||
|
||||
+h(2, "pipe") Tokenizer.pipe
|
||||
+tag method
|
||||
|
||||
p Tokenize a stream of texts.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code texts]
|
||||
+cell -
|
||||
+cell A sequence of unicode texts.
|
||||
|
||||
+row
|
||||
+cell #[code batch_size]
|
||||
+cell int
|
||||
+cell The number of texts to accumulate in an internal buffer.
|
||||
|
||||
+row
|
||||
+cell #[code n_threads]
|
||||
+cell int
|
||||
+cell
|
||||
| The number of threads to use, if the implementation supports
|
||||
| multi-threading. The default tokenizer is single-threaded.
|
||||
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Doc]
|
||||
+cell A sequence of Doc objects, in order.
|
||||
|
||||
+h(2, "find_infix") Tokenizer.find_infix
|
||||
+tag method
|
||||
|
||||
p Find internal split points of the string.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string]
|
||||
+cell unicode
|
||||
+cell The string to split.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code List[re.MatchObject]]
|
||||
+cell
|
||||
| A list of objects that have #[code .start()] and #[code .end()]
|
||||
| methods, denoting the placement of internal segment separators,
|
||||
| e.g. hyphens.
|
||||
|
||||
+h(2, "find_prefix") Tokenizer.find_prefix
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Find the length of a prefix that should be segmented from the string, or
|
||||
| #[code None] if no prefix rules match.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string]
|
||||
+cell unicode
|
||||
+cell The string to segment.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell int / #[code None]
|
||||
+cell The length of the prefix if present, otherwise #[code None].
|
||||
|
||||
+h(2, "find_suffix") Tokenizer.find_suffix
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Find the length of a suffix that should be segmented from the string, or
|
||||
| #[code None] if no suffix rules match.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string]
|
||||
+cell unicode
|
||||
+cell The string to segment.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell int / #[code None]
|
||||
+cell The length of the suffix if present, otherwise #[code None].
|
||||
|
||||
+h(2, "add_special_case") Tokenizer.add_special_case
|
||||
+tag method
|
||||
|
||||
p Add a special-case tokenization rule.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string]
|
||||
+cell unicode
|
||||
+cell The string to specially tokenize.
|
||||
|
||||
+row
|
||||
+cell #[code token_attrs]
|
||||
+cell -
|
||||
+cell
|
||||
| A sequence of dicts, where each dict describes a token and its
|
||||
| attributes. The #[code ORTH] fields of the attributes must
|
||||
| exactly match the string when they are concatenated.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
|
|
|
@ -14,7 +14,7 @@ p
|
|||
| recommend having additional tests in place if your application depends on
|
||||
| any of spaCy's utilities.
|
||||
|
||||
+h(2, "get_data_path") get_data_path
|
||||
+h(2, "get_data_path") util.get_data_path
|
||||
+tag function
|
||||
|
||||
p
|
||||
|
@ -28,11 +28,11 @@ p
|
|||
+cell Only return path if it exists, otherwise return #[code None].
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Path] / #[code None]
|
||||
+cell Data path or #[code None].
|
||||
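p
    | For example, a quick check of where spaCy will look for model data
    | (a minimal sketch):

+aside-code("Example").
    from spacy import util
    data_path = util.get_data_path()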
|
||||
+h(2, "set_data_path") set_data_path
|
||||
+h(2, "set_data_path") util.set_data_path
|
||||
+tag function
|
||||
|
||||
p
|
||||
|
@ -49,7 +49,7 @@ p
|
|||
+cell unicode or #[code Path]
|
||||
+cell Path to new data directory.
|
||||
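p
    | For example (the path below is only a placeholder):

+aside-code("Example").
    from spacy import util
    util.set_data_path('/custom/data/path')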
|
||||
+h(2, "get_lang_class") get_lang_class
|
||||
+h(2, "get_lang_class") util.get_lang_class
|
||||
+tag function
|
||||
|
||||
p
|
||||
|
@ -70,11 +70,11 @@ p
|
|||
+cell Two-letter language code, e.g. #[code 'en'].
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Language]
|
||||
+cell Language class.
|
||||
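p
    | A minimal sketch (assuming the English language data is available, and
    | that instantiating the class gives you a blank pipeline):

+aside-code("Example").
    from spacy import util
    lang_class = util.get_lang_class('en')
    nlp = lang_class()  # blank Language instance for English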
|
||||
+h(2, "resolve_model_path") resolve_model_path
|
||||
+h(2, "resolve_model_path") util.resolve_model_path
|
||||
+tag function
|
||||
|
||||
p Resolve a model name or string to a model path.
|
||||
|
@ -90,11 +90,11 @@ p Resolve a model name or string to a model path.
|
|||
+cell Package name, shortcut link or model path.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Path]
|
||||
+cell Path to model data directory.
|
||||
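p
    | For example (assuming a model or shortcut link named #[code 'en'] is
    | installed on your system):

+aside-code("Example").
    from spacy import util
    model_path = util.resolve_model_path('en')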
|
||||
+h(2, "is_package") is_package
|
||||
+h(2, "is_package") util.is_package
|
||||
+tag function
|
||||
|
||||
p
|
||||
|
@ -112,11 +112,11 @@ p
|
|||
+cell Name of package.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code bool]
|
||||
+cell #[code True] if installed package, #[code False] if not.
|
||||
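p
    | For example (the package names below are only illustrative):

+aside-code("Example").
    from spacy import util
    util.is_package('en_core_web_sm')   # True if installed via pip
    util.is_package('xyz_no_such_pkg')  # False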
|
||||
+h(2, "get_model_package_path") get_model_package_path
|
||||
+h(2, "get_model_package_path") util.get_model_package_path
|
||||
+tag function
|
||||
|
||||
p
|
||||
|
@ -134,11 +134,11 @@ p
|
|||
+cell Name of installed package.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Path]
|
||||
+cell Path to model data directory.
|
||||
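p
    | For example (assuming the package is installed; the name is
    | illustrative):

+aside-code("Example").
    from spacy import util
    package_path = util.get_model_package_path('en_core_web_sm')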
|
||||
+h(2, "parse_package_meta") parse_package_meta
|
||||
+h(2, "parse_package_meta") util.parse_package_meta
|
||||
+tag function
|
||||
|
||||
p
|
||||
|
@ -163,11 +163,31 @@ p
|
|||
+cell If #[code True], raise error if no #[code meta.json] is found.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell dict / #[code None]
|
||||
+cell Model meta data or #[code None].
|
||||
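p
    | A minimal sketch, combining it with #[code resolve_model_path]
    | (assuming a model named #[code 'en'] is available and its
    | #[code meta.json] contains the usual #[code lang] and #[code version]
    | fields):

+aside-code("Example").
    from spacy import util
    model_path = util.resolve_model_path('en')
    meta = util.parse_package_meta(model_path, require=True)
    print(meta['lang'], meta['version'])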
|
||||
+h(2, "update_exc") update_exc
|
||||
+h(2, "is_in_jupyter") util.is_in_jupyter
|
||||
+tag function
|
||||
|
||||
p
|
||||
| Check if user is running spaCy from a #[+a("https://jupyter.org") Jupyter]
|
||||
| notebook by detecting the IPython kernel. Mainly used for the
|
||||
| #[+api("displacy") #[code displacy]] visualizer.
|
||||
|
||||
+aside-code("Example").
|
||||
html = '<h1>Hello world!</h1>'
|
||||
if util.is_in_jupyter():
|
||||
from IPython.core.display import display, HTML
|
||||
display(HTML(html))
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell #[code True] if in Jupyter, #[code False] if not.
|
||||
|
||||
+h(2, "update_exc") util.update_exc
|
||||
+tag function
|
||||
|
||||
p
|
||||
|
@ -194,12 +214,12 @@ p
|
|||
+cell Exception dictionaries to add to the base exceptions, in order.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell dict
|
||||
+cell Combined tokenizer exceptions.
|
||||
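p
    | A rough sketch of how base and language-specific exceptions might be
    | merged (the dictionaries below are made up for illustration):

+aside-code("Example").
    from spacy.util import update_exc
    from spacy.attrs import ORTH, LEMMA
    BASE_EXCEPTIONS = {"a.m.": [{ORTH: "a.m."}]}
    TOKENIZER_EXCEPTIONS = {"smth": [{ORTH: "smth", LEMMA: "something"}]}
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)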
|
||||
|
||||
+h(2, "prints") prints
|
||||
+h(2, "prints") util.prints
|
||||
+tag function
|
||||
|
||||
p
|
||||
|
|
|
@ -7,59 +7,6 @@ p
|
|||
| #[code Vocab] instance also provides access to the #[code StringStore],
|
||||
| and owns underlying C-data that is shared between #[code Doc] objects.
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code strings]
|
||||
+cell #[code StringStore]
|
||||
+cell A table managing the string-to-int mapping.
|
||||
|
||||
+row
|
||||
+cell #[code vectors_length]
|
||||
+cell int
|
||||
+cell The dimensionality of the word vectors, if present.
|
||||
|
||||
+h(2, "load") Vocab.load
|
||||
+tag classmethod
|
||||
|
||||
p Load the vocabulary from a path.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell #[code Path]
|
||||
+cell The path to load from.
|
||||
|
||||
+row
|
||||
+cell #[code lex_attr_getters]
|
||||
+cell dict
|
||||
+cell
|
||||
| A dictionary mapping attribute IDs to functions to compute them.
|
||||
| Defaults to #[code None].
|
||||
|
||||
+row
|
||||
+cell #[code lemmatizer]
|
||||
+cell -
|
||||
+cell A lemmatizer. Defaults to #[code None].
|
||||
|
||||
+row
|
||||
+cell #[code tag_map]
|
||||
+cell dict
|
||||
+cell
|
||||
| A dictionary mapping fine-grained tags to coarse-grained
|
||||
| parts-of-speech, and optionally morphological attributes.
|
||||
|
||||
+row
|
||||
+cell #[code oov_prob]
|
||||
+cell float
|
||||
+cell The default probability for out-of-vocabulary words.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Vocab]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "init") Vocab.__init__
|
||||
+tag method
|
||||
|
||||
|
@ -73,11 +20,6 @@ p Create the vocabulary.
|
|||
| A dictionary mapping attribute IDs to functions to compute them.
|
||||
| Defaults to #[code None].
|
||||
|
||||
+row
|
||||
+cell #[code lemmatizer]
|
||||
+cell -
|
||||
+cell A lemmatizer. Defaults to #[code None].
|
||||
|
||||
+row
|
||||
+cell #[code tag_map]
|
||||
+cell dict
|
||||
|
@ -86,23 +28,34 @@ p Create the vocabulary.
|
|||
| parts-of-speech, and optionally morphological attributes.
|
||||
|
||||
+row
|
||||
+cell #[code oov_prob]
|
||||
+cell float
|
||||
+cell The default probability for out-of-vocabulary words.
|
||||
+cell #[code lemmatizer]
|
||||
+cell object
|
||||
+cell A lemmatizer. Defaults to #[code None].
|
||||
|
||||
+row
|
||||
+cell #[code strings]
|
||||
+cell #[code StringStore]
|
||||
+cell
|
||||
| A #[code StringStore] that maps strings to integers, and vice
|
||||
| versa.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Vocab]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "len") Vocab.__len__
|
||||
+tag method
|
||||
|
||||
p Get the number of lexemes in the vocabulary.
|
||||
p Get the current number of lexemes in the vocabulary.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'This is a sentence.')
|
||||
assert len(nlp.vocab) > 0
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell The number of lexemes in the vocabulary.
|
||||
|
||||
|
@ -113,6 +66,10 @@ p
|
|||
| Retrieve a lexeme, given an int ID or a unicode string. If a previously
|
||||
| unseen unicode string is given, a new lexeme is created and stored.
|
||||
|
||||
+aside-code("Example").
|
||||
apple = nlp.vocab.strings['apple']
|
||||
assert nlp.vocab[apple] == nlp.vocab[u'apple']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code id_or_string]
|
||||
|
@ -120,25 +77,37 @@ p
|
|||
+cell The integer ID of a word, or its unicode string.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Lexeme]
|
||||
+cell The lexeme indicated by the given ID.
|
||||
|
||||
+h(2, "iter") Span.__iter__
|
||||
+h(2, "iter") Vocab.__iter__
|
||||
+tag method
|
||||
|
||||
p Iterate over the lexemes in the vocabulary.
|
||||
|
||||
+aside-code("Example").
|
||||
stop_words = (lex for lex in nlp.vocab if lex.is_stop)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell yields
|
||||
+cell #[code Lexeme]
|
||||
+cell An entry in the vocabulary.
|
||||
|
||||
+h(2, "contains") Vocab.__contains__
|
||||
+tag method
|
||||
|
||||
p Check whether the string has an entry in the vocabulary.
|
||||
p
|
||||
| Check whether the string has an entry in the vocabulary. To get the ID
|
||||
| for a given string, you need to look it up in
|
||||
| #[+api("vocab#attributes") #[code vocab.strings]].
|
||||
|
||||
+aside-code("Example").
|
||||
apple = nlp.vocab.strings['apple']
|
||||
oov = nlp.vocab.strings['dskfodkfos']
|
||||
assert apple in nlp.vocab
|
||||
assert oov not in nlp.vocab
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
|
@ -147,32 +116,27 @@ p Check whether the string has an entry in the vocabulary.
|
|||
+cell The ID string.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether the string has an entry in the vocabulary.
|
||||
|
||||
+h(2, "resize_vectors") Vocab.resize_vectors
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Set #[code vectors_length] to a new size, and allocate more memory for
|
||||
| the #[code Lexeme] vectors if necessary. The memory will be zeroed.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code new_size]
|
||||
+cell int
|
||||
+cell The new size of the vectors.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
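p
    | A minimal sketch (assuming an #[code nlp] object with a loaded vocab):

+aside-code("Example").
    nlp.vocab.resize_vectors(300)
    assert nlp.vocab.vectors_length == 300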
|
||||
+h(2, "add_flag") Vocab.add_flag
|
||||
+tag method
|
||||
|
||||
p Set a new boolean flag to words in the vocabulary.
|
||||
p
|
||||
| Set a new boolean flag to words in the vocabulary. The #[code flag_getter]
|
||||
| function will be called over the words currently in the vocab, and then
|
||||
| applied to new words as they occur. You'll then be able to access the flag
|
||||
| value on each token, using #[code token.check_flag(flag_id)].
|
||||
|
||||
+aside-code("Example").
|
||||
def is_my_product(text):
|
||||
products = [u'spaCy', u'Thinc', u'displaCy']
|
||||
return text in products
|
||||
|
||||
MY_PRODUCT = nlp.vocab.add_flag(is_my_product)
|
||||
doc = nlp(u'I like spaCy')
|
||||
assert doc[2].check_flag(MY_PRODUCT) == True
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
|
@ -189,90 +153,104 @@ p Set a new boolean flag to words in the vocabulary.
|
|||
| available bit will be chosen.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell The integer ID by which the flag value can be checked.
|
||||
|
||||
+h(2, "dump") Vocab.dump
|
||||
+h(2, "to_disk") Vocab.to_disk
|
||||
+tag method
|
||||
|
||||
p Save the lexemes binary data to the given location.
|
||||
p Save the current state to a directory.
|
||||
|
||||
+aside-code("Example").
|
||||
nlp.vocab.to_disk('/path/to/vocab')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code loc]
|
||||
+cell #[code Path]
|
||||
+cell The path to load from.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
|
||||
+h(2, "load_lexemes") Vocab.load_lexemes
|
||||
+tag method
|
||||
|
||||
p
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code loc]
|
||||
+cell unicode
|
||||
+cell Path to load the lexemes.bin file from.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
|
||||
+h(2, "dump_vectors") Vocab.dump_vectors
|
||||
+tag method
|
||||
|
||||
p Save the word vectors to a binary file.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code loc]
|
||||
+cell #[code Path]
|
||||
+cell The path to save to.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
|
||||
+h(2, "load_vectors") Vocab.load_vectors
|
||||
+tag method
|
||||
|
||||
p Load vectors from a text-based file.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code file_]
|
||||
+cell buffer
|
||||
+cell #[code path]
|
||||
+cell unicode or #[code Path]
|
||||
+cell
|
||||
| The file to read from. Entries should be separated by newlines,
|
||||
| and each entry should be whitespace delimited. The first value
|
||||
| of the entry should be the word string, and subsequent entries
|
||||
| should be the values of the vector.
|
||||
| A path to a directory, which will be created if it doesn't exist.
|
||||
| Paths may be either strings or #[code Path]-like objects.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell int
|
||||
+cell The length of the vectors loaded.
|
||||
|
||||
+h(2, "load_vectors_from_bin_loc") Vocab.load_vectors_from_bin_loc
|
||||
+h(2, "from_disk") Vocab.from_disk
|
||||
+tag method
|
||||
|
||||
p Load vectors from the location of a binary file.
|
||||
p Loads state from a directory. Modifies the object in place and returns it.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.vocab import Vocab
|
||||
vocab = Vocab().from_disk('/path/to/vocab')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code loc]
|
||||
+cell unicode
|
||||
+cell The path of the binary file to load from.
|
||||
+cell #[code path]
|
||||
+cell unicode or #[code Path]
|
||||
+cell
|
||||
| A path to a directory. Paths may be either strings or
|
||||
| #[code Path]-like objects.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell int
|
||||
+cell The length of the vectors loaded.
|
||||
+cell returns
|
||||
+cell #[code Vocab]
|
||||
+cell The modified #[code Vocab] object.
|
||||
|
||||
+h(2, "to_bytes") Vocab.to_bytes
|
||||
+tag method
|
||||
|
||||
p Serialize the current state to a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
vocab_bytes = nlp.vocab.to_bytes()
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being serialized.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bytes
|
||||
+cell The serialized form of the #[code Vocab] object.
|
||||
|
||||
+h(2, "from_bytes") Vocab.from_bytes
|
||||
+tag method
|
||||
|
||||
p Load state from a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.vocab import Vocab
|
||||
vocab_bytes = nlp.vocab.to_bytes()
|
||||
vocab = Vocab()
|
||||
vocab.from_bytes(vocab_bytes)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code bytes_data]
|
||||
+cell bytes
|
||||
+cell The data to load from.
|
||||
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being loaded.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Vocab]
|
||||
+cell The #[code Vocab] object.
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
|
||||
+aside-code("Example").
|
||||
apple_id = nlp.vocab.strings['apple']
|
||||
assert type(apple_id) == int
|
||||
PERSON = nlp.vocab.strings['PERSON']
|
||||
assert type(PERSON) == int
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code strings]
|
||||
+cell #[code StringStore]
|
||||
+cell A table managing the string-to-int mapping.
|
||||
|
|
|
@ -56,20 +56,22 @@ p
|
|||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
|
||||
# create Defaults class in the module scope (necessary for pickling!)
|
||||
class XxxxxDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'xx' # language ISO code
|
||||
|
||||
# optional: replace flags with custom functions, e.g. like_num()
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
|
||||
# merge base exceptions and custom tokenizer exceptions
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
# create actual Language class
|
||||
class Xxxxx(Language):
|
||||
lang = 'xx' # language ISO code
|
||||
|
||||
# override defaults
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'xx' # language ISO code
|
||||
|
||||
# optional: replace flags with custom functions, e.g. like_num()
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
|
||||
# merge base exceptions and custom tokenizer exceptions
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
Defaults = XxxxxDefaults # override defaults
|
||||
|
||||
# set default export – this allows the language class to be lazy-loaded
|
||||
__all__ = ['Xxxxx']
|
||||
|
|
|
@ -141,11 +141,11 @@ p
|
|||
include ../api/_annotation/_named-entities
|
||||
|
||||
+aside("Install")
|
||||
| The #[+api("load") spacy.load()] function configures a pipeline that
|
||||
| The #[+api("load") #[code spacy.load()]] function configures a pipeline that
|
||||
| includes all of the available annotators for the given ID. In the example
|
||||
| above, the #[code 'en'] ID tells spaCy to load the default English
|
||||
| pipeline. If you have installed the data with
|
||||
| #[code python -m spacy.en.download] this will include the entity
|
||||
| #[code python -m spacy download en], this will include the entity
|
||||
| recognition model.
|
||||
|
||||
+h(2, "updating") Training and updating
|
||||
|
|
|
@ -4,58 +4,190 @@ include ../../_includes/_mixins
|
|||
|
||||
p
|
||||
| spaCy features a rule-matching engine that operates over tokens, similar
|
||||
| to regular expressions. The rules can refer to token annotations and
|
||||
| flags, and matches support callbacks to accept, modify and/or act on the
|
||||
| match. The rule matcher also allows you to associate patterns with
|
||||
| entity IDs, to allow some basic entity linking or disambiguation.
|
||||
| to regular expressions. The rules can refer to token annotations (e.g.
|
||||
| the token #[code text] or #[code tag_]), and flags (e.g. #[code IS_PUNCT]).
|
||||
| The rule matcher also lets you pass in a custom callback
|
||||
| to act on matches – for example, to merge entities and apply custom labels.
|
||||
| You can also associate patterns with entity IDs, to allow some basic
|
||||
| entity linking or disambiguation.
|
||||
|
||||
p Here's a minimal example. We first add a pattern that specifies three tokens:
|
||||
+aside("What about \"real\" regular expressions?")
|
||||
|
||||
+list("numbers")
|
||||
+item A token whose lower-case form matches "hello"
|
||||
+item A token whose #[code is_punct] flag is set to #[code True]
|
||||
+item A token whose lower-case form matches "world"
|
||||
+h(2, "adding-patterns") Adding patterns
|
||||
|
||||
p
|
||||
| Once we've added the pattern, we can use the #[code matcher] as a
|
||||
| callable, to receive a list of #[code (ent_id, start, end)] tuples.
|
||||
| Note that #[code LOWER] and #[code IS_PUNCT] are data attributes
|
||||
| of #[code spacy.attrs].
|
||||
| Let's say we want to enable spaCy to find a combination of three tokens:
|
||||
|
||||
+list("numbers")
|
||||
+item
|
||||
| A token whose #[strong lower-case form matches "hello"], e.g. "Hello"
|
||||
| or "HELLO".
|
||||
+item
|
||||
| A token whose #[strong #[code is_punct] flag is set to #[code True]],
|
||||
| i.e. any punctuation.
|
||||
+item
|
||||
| A token whose #[strong lower-case form matches "world"], e.g. "World"
|
||||
| or "WORLD".
|
||||
|
||||
+code.
|
||||
from spacy.matcher import Matcher
|
||||
matcher = Matcher(nlp.vocab)
|
||||
matcher.add_pattern("HelloWorld", [{LOWER: "hello"}, {IS_PUNCT: True}, {LOWER: "world"}])
|
||||
[{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}]
|
||||
|
||||
doc = nlp(u'Hello, world!')
|
||||
p
|
||||
| First, we initialise the #[code Matcher] with a vocab. The matcher must
|
||||
| always share the same vocab with the documents it will operate on. We
|
||||
| can now call #[+api("matcher#add") #[code matcher.add()]] with an ID and
|
||||
| our custom pattern. The second argument lets you pass in an optional
|
||||
| callback function to invoke on a successful match. For now, we set it
|
||||
| to #[code None].
|
||||
|
||||
+code.
|
||||
import spacy
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.attrs import LOWER, IS_PUNCT # don't forget to import the attrs!
|
||||
|
||||
nlp = spacy.load('en')
|
||||
matcher = Matcher(nlp.vocab)
|
||||
# add match ID "HelloWorld" with no callback and one pattern
|
||||
matcher.add('HelloWorld', None,
|
||||
[{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}])
|
||||
|
||||
doc = nlp(u'Hello, world! Hello world!')
|
||||
matches = matcher(doc)
|
||||
|
||||
p
|
||||
| The returned matches include the ID, to let you associate the matches
|
||||
| with the patterns. You can also group multiple patterns together, which
|
||||
| is useful when you have a knowledge base of entities you want to match,
|
||||
| and you want to write multiple patterns for each entity.
|
||||
|
||||
+h(2, "entities-patterns") Entities and patterns
|
||||
| The matcher returns a list of #[code (match_id, start, end)] tuples – in
|
||||
| this case, #[code [('HelloWorld', 0, 2)]], which maps to the span
|
||||
| #[code doc[0:2]] of our original document. Optionally, we could also
|
||||
| choose to add more than one pattern, for example to also match sequences
|
||||
| without punctuation between "hello" and "world":
|
||||
|
||||
+code.
|
||||
matcher.add_entity(
|
||||
"GoogleNow", # Entity ID -- Helps you act on the match.
|
||||
{"ent_type": "PRODUCT", "wiki_en": "Google_Now"}, # Arbitrary attributes (optional)
|
||||
)
|
||||
matcher.add('HelloWorld', None,
|
||||
[{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
|
||||
[{LOWER: 'hello'}, {LOWER: 'world'}])
|
||||
|
||||
matcher.add_pattern(
|
||||
"GoogleNow", # Entity ID -- Created if doesn't exist.
|
||||
[ # The pattern is a list of *Token Specifiers*.
|
||||
{ # This Token Specifier matches tokens whose orth field is "Google"
|
||||
ORTH: "Google"
|
||||
},
|
||||
{ # This Token Specifier matches tokens whose orth field is "Now"
|
||||
ORTH: "Now"
|
||||
}
|
||||
],
|
||||
label=None # Can associate a label to the pattern-match, to handle it better.
|
||||
)
|
||||
p
|
||||
| By default, the matcher will only return the matches and
|
||||
| #[strong not do anything else], like merge entities or assign labels.
|
||||
| This is all up to you and can be defined individually for each pattern,
|
||||
| by passing in a callback function as the #[code on_match] argument on
|
||||
| #[code add()]. This is useful, because it lets you write entirely custom
|
||||
| and #[strong pattern-specific logic]. For example, you might want to
|
||||
| merge #[em some] patterns into one token, while adding entity labels for
|
||||
| other pattern types. You shouldn't have to create different matchers for
|
||||
| each of those processes.
|
||||
|
||||
+h(2, "on_match") Adding #[code on_match] rules
|
||||
|
||||
p
|
||||
| To move on to a more realistic example, let's say you're working with a
|
||||
| large corpus of blog articles, and you want to match all mentions of
|
||||
| "Google I/O" (which spaCy tokenizes as #[code ['Google', 'I', '/', 'O']]).
|
||||
| To be safe, you only match on the uppercase versions, in case someone has
|
||||
| written it as "Google i/o". You also add a second pattern with an added
|
||||
| #[code {IS_DIGIT: True}] token – this will make sure you also match on
|
||||
| "Google I/O 2017". If your pattern matches, spaCy should execute your
|
||||
| custom callback function #[code add_event_ent].
|
||||
|
||||
+code.
|
||||
import spacy
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.attrs import ORTH, UPPER, LOWER, IS_DIGIT
|
||||
|
||||
nlp = spacy.load('en')
|
||||
matcher = Matcher(nlp.vocab)
|
||||
|
||||
matcher.add('GoogleIO', add_event_ent,
|
||||
[{ORTH: 'Google'}, {UPPER: 'I'}, {ORTH: '/'}, {UPPER: 'O'}],
|
||||
[{ORTH: 'Google'}, {UPPER: 'I'}, {ORTH: '/'}, {UPPER: 'O'}, {IS_DIGIT: True}])
|
||||
|
||||
# Get the ID of the 'EVENT' entity type. This is required to set an entity.
|
||||
EVENT = nlp.vocab.strings['EVENT']
|
||||
|
||||
def add_event_ent(matcher, doc, i, matches):
|
||||
# Get the current match and create tuple of entity label, start and end.
|
||||
# Append entity to the doc's entity. (Don't overwrite doc.ents!)
|
||||
match_id, start, end = matches[i]
|
||||
doc.ents += ((EVENT, start, end),)
|
||||
|
||||
p
|
||||
| In addition to mentions of "Google I/O", your data also contains some
|
||||
| annoying pre-processing artefacts, like leftover HTML line breaks
|
||||
| (e.g. #[code <br>] or #[code <BR/>]). While you're at it,
|
||||
| you want to merge those into one token and flag them, to make sure you
|
||||
| can easily ignore them later. So you add a second pattern and pass in a
|
||||
| function #[code merge_and_flag]:
|
||||
|
||||
+code.
|
||||
matcher.add('BAD_HTML', merge_and_flag,
|
||||
[{ORTH: '<'}, {LOWER: 'br'}, {ORTH: '>'}],
|
||||
[{ORTH: '<'}, {LOWER: 'br/'}, {ORTH: '>'}])
|
||||
|
||||
# Add a new custom flag to the vocab, which is always False by default.
|
||||
# BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span.
|
||||
BAD_HTML_FLAG = doc.vocab.add_flag(lambda text: False)
|
||||
|
||||
def merge_and_flag(matcher, doc, i, matches):
|
||||
match_id, start, end = matches[i]
|
||||
span = doc[start : end]
|
||||
span.merge(is_stop=True) # merge (and mark it as a stop word, just in case)
|
||||
span.set_flag(BAD_HTML_FLAG, True) # set BAD_HTML_FLAG
|
||||
|
||||
+aside("Tip: Visualizing matches")
|
||||
| When working with entities, you can use #[+api("displacy") displaCy]
|
||||
| to quickly generate a NER visualization from your updated #[code Doc],
|
||||
| which can be exported as an HTML file:
|
||||
|
||||
+code.o-no-block.
|
||||
from spacy import displacy
|
||||
html = displacy.render(doc, style='ent', page=True,
|
||||
options={'ents': ['EVENT']})
|
||||
|
||||
| For more info and examples, see the usage workflow on
|
||||
| #[+a("/docs/usage/visualizers") visualizing spaCy].
|
||||
|
||||
p
|
||||
| We can now call the matcher on our documents. The patterns will be
|
||||
| matched in the order they occur in the text.
|
||||
|
||||
+code.
|
||||
doc = nlp(LOTS_OF_TEXT)
|
||||
matcher(doc)
|
||||
|
||||
+h(3, "on_match-callback") The callback function
|
||||
|
||||
p
|
||||
| The matcher will first collect all matches over the document. It will
|
||||
| then iterate over the matches, lookup the callback for the entity ID
|
||||
| that was matched, and invoke it. When the callback is invoked, it is
|
||||
| passed four arguments: the matcher itself, the document, the position of
|
||||
| the current match, and the total list of matches. This allows you to
|
||||
| write callbacks that consider the entire set of matched phrases, so that
|
||||
| you can resolve overlaps and other conflicts in whatever way you prefer.
|
||||
|
||||
+table(["Argument", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code matcher]
|
||||
+cell #[code Matcher]
|
||||
+cell The matcher instance.
|
||||
|
||||
+row
|
||||
+cell #[code doc]
|
||||
+cell #[code Doc]
|
||||
+cell The document the matcher was used on.
|
||||
|
||||
+row
|
||||
+cell #[code i]
|
||||
+cell int
|
||||
+cell Index of the current match (#[code matches[i]]).
|
||||
|
||||
+row
|
||||
+cell #[code matches]
|
||||
+cell list
|
||||
+cell
|
||||
| A list of #[code (match_id, start, end)] tuples, describing the
|
||||
| matches. A match tuple describes a span #[code doc[start:end]].
|
||||
| The #[code match_id] is the ID of the added match pattern.
|
||||
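p
    | Putting this together, a minimal callback that only prints the matched
    | span might look like this (a sketch, reusing the pattern from above;
    | the match ID is only illustrative):

+code.
    def print_match(matcher, doc, i, matches):
        # matches[i] is the match that triggered this callback
        match_id, start, end = matches[i]
        print('Matched:', doc[start:end].text)

    matcher.add('HelloWorldVerbose', print_match,
                [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}])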
|
||||
+h(2, "quantifiers") Using quantifiers
|
||||
|
||||
|
@ -82,78 +214,4 @@ p
|
|||
|
||||
p
|
||||
| There are no nested or scoped quantifiers. You can build those
|
||||
| behaviours with acceptors and
|
||||
| #[+api("matcher#add_entity") #[code on_match]] callbacks.
|
||||
|
||||
+h(2, "acceptor-functions") Acceptor functions
|
||||
|
||||
p
|
||||
| The #[code acceptor] keyword of #[code matcher.add_entity()] allows you to
|
||||
| pass a function to reject or modify matches. The function you pass should
|
||||
| take five arguments: #[code doc], #[code ent_id], #[code label], #[code start],
|
||||
| and #[code end]. You can return a falsey value to reject the match, or
|
||||
| return a 4-tuple #[code (ent_id, label, start, end)].
|
||||
|
||||
+code.
|
||||
from spacy.tokens.doc import Doc
|
||||
def trim_title(doc, ent_id, label, start, end):
|
||||
if doc[start].check_flag(IS_TITLE_TERM):
|
||||
return (ent_id, label, start+1, end)
|
||||
else:
|
||||
return (ent_id, label, start, end)
|
||||
titles = set(title.lower() for title in [u'Mr.', 'Dr.', 'Ms.', u'Admiral'])
|
||||
IS_TITLE_TERM = matcher.vocab.add_flag(lambda string: string.lower() in titles)
|
||||
matcher.add_entity('PersonName', acceptor=trim_title)
|
||||
matcher.add_pattern('PersonName', [{LOWER: 'mr.'}, {LOWER: 'cruise'}])
|
||||
matcher.add_pattern('PersonName', [{LOWER: 'dr.'}, {LOWER: 'seuss'}])
|
||||
doc = Doc(matcher.vocab, words=[u'Mr.', u'Cruise', u'likes', 'Dr.', u'Seuss'])
|
||||
for ent_id, label, start, end in matcher(doc):
|
||||
print(doc[start:end].text)
|
||||
# Cruise
|
||||
# Seuss
|
||||
|
||||
p
|
||||
| Passing an #[code acceptor] function allows you to match patterns with
|
||||
| arbitrary logic that can't easily be expressed by a finite-state machine.
|
||||
| You can look at the entirety of the
|
||||
| matched phrase, and its context in the document, and decide to move
|
||||
| the boundaries or reject the match entirely.
|
||||
|
||||
+h(2, "callback-functions") Callback functions
|
||||
|
||||
p
|
||||
| In spaCy <1.0, the #[code Matcher] automatically tagged matched phrases
|
||||
| with entity types. Since spaCy 1.0, the matcher no longer acts on matches
|
||||
| automatically. By default, the match list is returned for the user to action.
|
||||
| However, it's often more convenient to register the required actions as a
|
||||
| callback. You can do this by passing a function to the #[code on_match]
|
||||
| keyword argument of #[code matcher.add_entity].
|
||||
|
||||
+aside-code("Example").
|
||||
def merge_phrases(matcher, doc, i, matches):
|
||||
'''
|
||||
Merge a phrase. We have to be careful here because we'll change the token indices.
|
||||
To avoid problems, merge all the phrases once we're called on the last match.
|
||||
'''
|
||||
if i != len(matches)-1:
|
||||
return None
|
||||
# Get Span objects
|
||||
spans = [(ent_id, label, doc[start : end]) for ent_id, label, start, end in matches]
|
||||
for ent_id, label, span in spans:
|
||||
span.merge(label=label, tag='NNP' if label else span.root.tag_)
|
||||
|
||||
matcher.add_entity('GoogleNow', on_match=merge_phrases)
|
||||
matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
|
||||
doc = Doc(matcher.vocab, words=[u'Google', u'Now', u'is', u'being', u'rebranded'])
|
||||
matcher(doc)
|
||||
print([w.text for w in doc])
|
||||
# [u'Google Now', u'is', u'being', u'rebranded']
|
||||
|
||||
p
|
||||
| The matcher will first collect all matches over the document. It will
|
||||
| then iterate over the matches, look-up the callback for the entity ID
|
||||
| that was matched, and invoke it. When the callback is invoked, it is
|
||||
| passed four arguments: the matcher itself, the document, the position of
|
||||
| the current match, and the total list of matches. This allows you to
|
||||
| write callbacks that consider the entire set of matched phrases, so that
|
||||
| you can resolve overlaps and other conflicts in whatever way you prefer.
|
||||
| behaviours with #[code on_match] callbacks.
|
||||
|
|
|
@ -2,9 +2,218 @@
|
|||
|
||||
include ../../_includes/_mixins
|
||||
|
||||
p
|
||||
| We also re-wrote a large part of the documentation and usage workflows,
|
||||
| and added more examples.
|
||||
|
||||
+h(2, "features") New features
|
||||
|
||||
+h(3, "features-displacy") displaCy visualizer with Jupyter support
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy import displacy
|
||||
doc = nlp(u'This is a sentence about Facebook.')
|
||||
displacy.serve(doc, style='dep') # run the web server
|
||||
html = displacy.render(doc, style='ent') # generate HTML
|
||||
|
||||
p
|
||||
| Our popular dependency and named entity visualizers are now an official
|
||||
| part of the spaCy library! displaCy can run a simple web server, or
|
||||
| generate raw HTML markup or SVG files to be exported. You can pass in one
|
||||
| or more docs, and customise the style. displaCy also auto-detects whether
|
||||
| you're running #[+a("https://jupyter.org") Jupyter] and will render the
|
||||
| visualizations in your notebook.
|
||||
|
||||
+infobox
|
||||
| #[strong API:] #[+api("displacy") #[code displacy]]
|
||||
| #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizing spaCy]
|
||||
|
||||
+h(3, "features-loading") Loading
|
||||
|
||||
+aside-code("Example").
|
||||
nlp = spacy.load('en') # shortcut link
|
||||
nlp = spacy.load('en_core_web_sm') # package
|
||||
nlp = spacy.load('/path/to/en') # unicode path
|
||||
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
|
||||
|
||||
p
|
||||
| The improved #[code spacy.load] makes loading models easier and more
|
||||
| transparent. You can load a model by supplying its
|
||||
| #[+a("/docs/usage/models#usage") shortcut link], the name of an installed
|
||||
| #[+a("/docs/usage/saving-loading#generating") model package], a unicode
|
||||
| path or a #[code Path]-like object. spaCy will try resolving the load
|
||||
| argument in this order. The #[code path] keyword argument is now deprecated.
|
||||
|
||||
p
|
||||
| The #[code Language] class to initialise will be determined based on the
|
||||
| model's settings. If no model is found, spaCy will let you know and won't
|
||||
| just return an empty #[code Language] object anymore. If you want a blank
|
||||
| language, you can always import the class directly, e.g.
|
||||
| #[code from spacy.lang.en import English].
|
||||
|
||||
+infobox
|
||||
| #[strong API:] #[+api("spacy#load") #[code spacy.load]]
|
||||
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
|
||||
|
||||
+h(3, "features-language") Improved language data and processing pipelines
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.language import Language
|
||||
nlp = Language(pipeline=['token_vectors', 'tags',
|
||||
'dependencies'])
|
||||
|
||||
+infobox
|
||||
| #[strong API:] #[+api("language") #[code Language]]
|
||||
| #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages]
|
||||
|
||||
+h(3, "features-lemmatizer") Simple lookup-based lemmatization
|
||||
|
||||
+aside-code("Example").
|
||||
LOOKUP = {
|
||||
"aba": "abar",
|
||||
"ababa": "abar",
|
||||
"ababais": "abar",
|
||||
"ababan": "abar",
|
||||
"ababanes": "ababán"
|
||||
}
|
||||
|
||||
p
|
||||
| spaCy now supports simple lookup-based lemmatization. The data is stored
|
||||
| in a dictionary mapping a string to its lemma. To determine a token's
|
||||
| lemma, spaCy simply looks it up in the table. The lookup lemmatizer can
|
||||
| be imported from #[code spacy.lemmatizerlookup]. It's initialised with
|
||||
| the lookup table, and should be returned by the #[code create_lemmatizer]
|
||||
| classmethod of the language's defaults.
|
||||
|
||||
+infobox
|
||||
| #[strong API:] #[+api("language") #[code Language]]
|
||||
| #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages]
|
||||
|
||||
+h(3, "features-matcher") Revised matcher API
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.attrs import LOWER, IS_PUNCT
|
||||
matcher = Matcher(nlp.vocab)
|
||||
matcher.add('HelloWorld', None,
|
||||
[{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
|
||||
[{LOWER: 'hello'}, {LOWER: 'world'}])
|
||||
assert len(matcher) == 1
|
||||
assert 'HelloWorld' in matcher
|
||||
|
||||
p
|
||||
| Patterns can now be added to the matcher by calling
|
||||
| #[+api("matcher-add") #[code matcher.add()]] with a match ID, an optional
|
||||
| callback function to be invoked on each match, and one or more patterns.
|
||||
| This allows you to write powerful, pattern-specific logic using only one
|
||||
| matcher. For example, you might only want to merge some entity types,
|
||||
| and set custom flags for other matched patterns.
|
||||
|
||||
+infobox
|
||||
| #[strong API:] #[+api("matcher") #[code Matcher]]
|
||||
| #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching]
|
||||
|
||||
+h(3, "features-serializer") Serialization
|
||||
|
||||
+infobox
|
||||
| #[strong API:] #[+api("serializer") #[code Serializer]]
|
||||
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
|
||||
|
||||
+h(3, "features-models") Neural network models for English, German, French and Spanish
|
||||
|
||||
+infobox
|
||||
| #[strong Details:] #[+src(gh("spacy-models")) spacy-models]
|
||||
| #[strong Usage:] #[+a("/docs/usage/models") Models]
|
||||
|
||||
+h(2, "incompat") Backwards incompatibilities
|
||||
|
||||
+table(["Old", "New"])
|
||||
+row
|
||||
+cell #[code Language.save_to_directory]
|
||||
+cell #[+api("language#to_disk") #[code Language.to_disk]]
|
||||
|
||||
+row
|
||||
+cell #[code Tokenizer.load]
|
||||
+cell
|
||||
| #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]]
|
||||
| #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code Tagger.load]
|
||||
+cell
|
||||
| #[+api("tagger#from_disk") #[code Tagger.from_disk]]
|
||||
| #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code DependencyParser.load]
|
||||
+cell
|
||||
| #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
|
||||
| #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code EntityRecognizer.load]
|
||||
+cell
|
||||
| #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
|
||||
| #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell
|
||||
| #[code Vocab.load]
|
||||
| #[code Vocab.load_lexemes]
|
||||
| #[code Vocab.load_vectors]
|
||||
| #[code Vocab.load_vectors_from_bin_loc]
|
||||
+cell
|
||||
| #[+api("vocab#from_disk") #[code Vocab.from_disk]]
|
||||
| #[+api("vocab#from_bytes") #[code Vocab.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell
|
||||
| #[code Vocab.dump]
|
||||
| #[code Vocab.dump_vectors]
|
||||
+cell
|
||||
| #[+api("vocab#to_disk") #[code Vocab.to_disk]]
|
||||
| #[+api("vocab#to_bytes") #[code Vocab.to_bytes]]
|
||||
|
||||
+row
|
||||
+cell
|
||||
| #[code StringStore.load]
|
||||
+cell
|
||||
| #[+api("stringstore#from_disk") #[code StringStore.from_disk]]
|
||||
| #[+api("stringstore#from_bytes") #[code StringStore.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell
|
||||
| #[code StringStore.dump]
|
||||
+cell
|
||||
| #[+api("stringstore#to_disk") #[code StringStore.to_disk]]
|
||||
| #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code Matcher.load]
|
||||
+cell -
|
||||
|
||||
+row
|
||||
+cell
|
||||
| #[code Matcher.add_pattern]
|
||||
| #[code Matcher.add_entity]
|
||||
+cell #[+api("matcher#add") #[code Matcher.add]]
|
||||
|
||||
+row
|
||||
+cell #[code Matcher.get_entity]
|
||||
+cell #[+api("matcher#get") #[code Matcher.get]]
|
||||
|
||||
+row
|
||||
+cell #[code Matcher.has_entity]
|
||||
+cell #[+api("matcher#contains") #[code Matcher.__contains__]]
|
||||
|
||||
+row
|
||||
+cell #[code Doc.read_bytes]
|
||||
+cell
|
||||
|
||||
+row
|
||||
+cell #[code Token.is_ancestor_of]
|
||||
+cell #[+api("token#is_ancestor") #[code Token.is_ancestor]]
|
||||
|
||||
|
||||
|
||||
+h(2, "migrating") Migrating from spaCy 1.x
|
||||
|
|