Merge docstrings

Matthew Honnibal 2017-05-21 13:46:23 -05:00
commit 5db89053aa
68 changed files with 4137 additions and 3113 deletions

View File

@@ -14,3 +14,4 @@ regex==2017.4.5
 ftfy>=4.4.2,<5.0.0
 pytest>=3.0.6,<4.0.0
 pip>=9.0.0,<10.0.0
+mock>=2.0.0,<3.0.0

View File

@@ -20,7 +20,17 @@ def download(model, direct=False):
     compatibility = get_compatibility()
     version = get_version(model_name, compatibility)
     download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
-    link(model_name, model, force=True)
+    try:
+        link(model_name, model, force=True)
+    except:
+        # Dirty, but since spacy.download and the auto-linking is mostly
+        # a convenience wrapper, it's best to show a success message and
+        # loading instructions, even if linking fails.
+        prints("Creating a shortcut link for 'en' didn't work (maybe you "
+               "don't have admin permissions?), but you can still load "
+               "the model via its full package name:",
+               "nlp = spacy.load('%s')" % model_name,
+               title="Download successful")


 def get_json(url, desc):

View File

@@ -11,15 +11,14 @@ from .. import util

 def info(model=None, markdown=False):
     if model:
-        data_path = util.get_data_path()
-        data = util.parse_package_meta(data_path / model, require=True)
-        model_path = Path(__file__).parent / data_path / model
+        model_path = util.resolve_model_path(model)
+        meta = util.parse_package_meta(model_path)
         if model_path.resolve() != model_path:
-            data['link'] = path2str(model_path)
-            data['source'] = path2str(model_path.resolve())
+            meta['link'] = path2str(model_path)
+            meta['source'] = path2str(model_path.resolve())
         else:
-            data['source'] = path2str(model_path)
-        print_info(data, 'model %s' % model, markdown)
+            meta['source'] = path2str(model_path)
+        print_info(meta, 'model %s' % model, markdown)
     else:
         data = {'spaCy version': about.__version__,
                 'Location': path2str(Path(__file__).parent.parent),

View File

@@ -306,25 +306,17 @@ cdef class GoldParse:
     def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
                  deps=None, entities=None, make_projective=False):
-        """
-        Create a GoldParse.
-
-        Arguments:
-            doc (Doc):
-                The document the annotations refer to.
-            words:
-                A sequence of unicode word strings.
-            tags:
-                A sequence of strings, representing tag annotations.
-            heads:
-                A sequence of integers, representing syntactic head offsets.
-            deps:
-                A sequence of strings, representing the syntactic relation types.
-            entities:
-                A sequence of named entity annotations, either as BILUO tag strings,
-                or as (start_char, end_char, label) tuples, representing the entity
-                positions.
-        Returns (GoldParse): The newly constructed object.
+        """Create a GoldParse.
+
+        doc (Doc): The document the annotations refer to.
+        words (iterable): A sequence of unicode word strings.
+        tags (iterable): A sequence of strings, representing tag annotations.
+        heads (iterable): A sequence of integers, representing syntactic head offsets.
+        deps (iterable): A sequence of strings, representing the syntactic relation types.
+        entities (iterable): A sequence of named entity annotations, either as
+            BILUO tag strings, or as `(start_char, end_char, label)` tuples,
+            representing the entity positions.
+        RETURNS (GoldParse): The newly constructed object.
         """
         if words is None:
             words = [token.text for token in doc]
@@ -389,55 +381,45 @@ cdef class GoldParse:
             self.heads = proj_heads

     def __len__(self):
-        """
-        Get the number of gold-standard tokens.
+        """Get the number of gold-standard tokens.

-        Returns (int): The number of gold-standard tokens.
+        RETURNS (int): The number of gold-standard tokens.
         """
         return self.length

     @property
     def is_projective(self):
-        """
-        Whether the provided syntactic annotations form a projective dependency
-        tree.
+        """Whether the provided syntactic annotations form a projective
+        dependency tree.
         """
         return not nonproj.is_nonproj_tree(self.heads)


 def biluo_tags_from_offsets(doc, entities):
-    """
-    Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
-    scheme (biluo).
-
-    Arguments:
-        doc (Doc):
-            The document that the entity offsets refer to. The output tags will
-            refer to the token boundaries within the document.
-
-        entities (sequence):
-            A sequence of (start, end, label) triples. start and end should be
-            character-offset integers denoting the slice into the original string.
-
-    Returns:
-        tags (list):
-            A list of unicode strings, describing the tags. Each tag string will
-            be of the form either "", "O" or "{action}-{label}", where action is one
-            of "B", "I", "L", "U". The string "-" is used where the entity
-            offsets don't align with the tokenization in the Doc object. The
-            training algorithm will view these as missing values. "O" denotes
-            a non-entity token. "B" denotes the beginning of a multi-token entity,
-            "I" the inside of an entity of three or more tokens, and "L" the end
-            of an entity of two or more tokens. "U" denotes a single-token entity.
-
-    Example:
-        text = 'I like London.'
-        entities = [(len('I like '), len('I like London'), 'LOC')]
-        doc = nlp.tokenizer(text)
-        tags = biluo_tags_from_offsets(doc, entities)
-        assert tags == ['O', 'O', 'U-LOC', 'O']
+    """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
+    scheme (BILUO).
+
+    doc (Doc): The document that the entity offsets refer to. The output tags
+        will refer to the token boundaries within the document.
+    entities (iterable): A sequence of `(start, end, label)` triples. `start` and
+        `end` should be character-offset integers denoting the slice into the
+        original string.
+    RETURNS (list): A list of unicode strings, describing the tags. Each tag
+        string will be of the form either "", "O" or "{action}-{label}", where
+        action is one of "B", "I", "L", "U". The string "-" is used where the
+        entity offsets don't align with the tokenization in the `Doc` object. The
+        training algorithm will view these as missing values. "O" denotes a
+        non-entity token. "B" denotes the beginning of a multi-token entity,
+        "I" the inside of an entity of three or more tokens, and "L" the end
+        of an entity of two or more tokens. "U" denotes a single-token entity.
+
+    EXAMPLE:
+        >>> text = 'I like London.'
+        >>> entities = [(len('I like '), len('I like London'), 'LOC')]
+        >>> doc = nlp.tokenizer(text)
+        >>> tags = biluo_tags_from_offsets(doc, entities)
+        >>> assert tags == ['O', 'O', 'U-LOC', 'O']
     """
     starts = {token.idx: token.i for token in doc}
     ends = {token.idx+len(token): token.i for token in doc}
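As a brief aside (not part of the commit), the rewritten GoldParse and biluo_tags_from_offsets docstrings above translate into usage roughly like the following sketch. It assumes an `nlp` pipeline has already been loaded; the example text mirrors the docstring.

from spacy.gold import GoldParse, biluo_tags_from_offsets

doc = nlp(u'I like London.')
entities = [(len('I like '), len('I like London'), 'LOC')]

# Entities may be given as (start_char, end_char, label) offsets or BILUO strings.
gold = GoldParse(doc, entities=entities)
assert len(gold) == len(doc)

# The same offsets can also be converted to per-token BILUO tags.
assert biluo_tags_from_offsets(doc, entities) == ['O', 'O', 'U-LOC', 'O']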

View File

@@ -13,21 +13,23 @@ from ...attrs import LANG
 from ...util import update_exc


+class BengaliDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'bn'
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tag_map = TAG_MAP
+    stop_words = STOP_WORDS
+    lemma_rules = LEMMA_RULES
+    prefixes = tuple(TOKENIZER_PREFIXES)
+    suffixes = tuple(TOKENIZER_SUFFIXES)
+    infixes = tuple(TOKENIZER_INFIXES)
+
+
 class Bengali(Language):
     lang = 'bn'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'bn'
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        tag_map = TAG_MAP
-        stop_words = STOP_WORDS
-        lemma_rules = LEMMA_RULES
-        prefixes = tuple(TOKENIZER_PREFIXES)
-        suffixes = tuple(TOKENIZER_SUFFIXES)
-        infixes = tuple(TOKENIZER_INFIXES)
+    Defaults = BengaliDefaults


 __all__ = ['Bengali']
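The same refactor repeats for the other languages below: the nested `class Defaults(Language.Defaults)` moves to a module-level `*Defaults` class that the language class then points to. A minimal, hypothetical sketch of the resulting pattern (the `Example`/`ExampleDefaults` names and the 'zz' language code are invented; only `Language` and `LANG` are real spaCy names):

from spacy.language import Language
from spacy.attrs import LANG


class ExampleDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'zz'
    stop_words = set(['a', 'an', 'the'])


class Example(Language):
    lang = 'zz'
    Defaults = ExampleDefaults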

View File

@@ -10,15 +10,17 @@ from ...attrs import LANG
 from ...util import update_exc


+class DanishDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'da'
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+
 class Danish(Language):
     lang = 'da'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'da'
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
+    Defaults = DanishDefaults


 __all__ = ['Danish']

View File

@@ -14,21 +14,23 @@ from ...attrs import LANG
 from ...util import update_exc


+class GermanDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'de'
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tag_map = dict(TAG_MAP)
+    stop_words = set(STOP_WORDS)
+    syntax_iterators = dict(SYNTAX_ITERATORS)
+
+    @classmethod
+    def create_lemmatizer(cls, nlp=None):
+        return Lemmatizer(LOOKUP)
+
+
 class German(Language):
     lang = 'de'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'de'
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        tag_map = dict(TAG_MAP)
-        stop_words = set(STOP_WORDS)
-        syntax_iterators = dict(SYNTAX_ITERATORS)
-
-        @classmethod
-        def create_lemmatizer(cls, nlp=None):
-            return Lemmatizer(LOOKUP)
+    Defaults = GermanDefaults


 __all__ = ['German']

View File

@@ -32,7 +32,6 @@ class EnglishDefaults(Language.Defaults):

 class English(Language):
     lang = 'en'
     Defaults = EnglishDefaults

View File

@@ -28,7 +28,7 @@ class SpanishDefaults(Language.Defaults):

 class Spanish(Language):
     lang = 'es'
     Defaults = SpanishDefaults


 __all__ = ['Spanish']

View File

@@ -10,15 +10,17 @@ from ...attrs import LANG
 from ...util import update_exc


+class FinnishDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'fi'
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+
 class Finnish(Language):
     lang = 'fi'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'fi'
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
+    Defaults = FinnishDefaults


 __all__ = ['Finnish']

View File

@@ -13,22 +13,24 @@ from ...attrs import LANG
 from ...util import update_exc


+class FrenchDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'fr'
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+    infixes = tuple(TOKENIZER_INFIXES)
+    suffixes = tuple(TOKENIZER_SUFFIXES)
+    token_match = TOKEN_MATCH
+
+    @classmethod
+    def create_lemmatizer(cls, nlp=None):
+        return Lemmatizer(LOOKUP)
+
+
 class French(Language):
     lang = 'fr'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'fr'
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
-        infixes = tuple(TOKENIZER_INFIXES)
-        suffixes = tuple(TOKENIZER_SUFFIXES)
-        token_match = TOKEN_MATCH
-
-        @classmethod
-        def create_lemmatizer(cls, nlp=None):
-            return Lemmatizer(LOOKUP)
+    Defaults = FrenchDefaults


 __all__ = ['French']

View File

@@ -9,15 +9,17 @@ from ...attrs import LANG
 from ...util import update_exc


+class HebrewDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'he'
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+
 class Hebrew(Language):
     lang = 'he'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'he'
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
+    Defaults = HebrewDefaults


 __all__ = ['Hebrew']

View File

@@ -13,23 +13,25 @@ from ...attrs import LANG
 from ...util import update_exc


+class HungarianDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'hu'
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+    prefixes = tuple(TOKENIZER_PREFIXES)
+    suffixes = tuple(TOKENIZER_SUFFIXES)
+    infixes = tuple(TOKENIZER_INFIXES)
+    token_match = TOKEN_MATCH
+
+    @classmethod
+    def create_lemmatizer(cls, nlp=None):
+        return Lemmatizer(LOOKUP)
+
+
 class Hungarian(Language):
     lang = 'hu'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'hu'
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
-        prefixes = tuple(TOKENIZER_PREFIXES)
-        suffixes = tuple(TOKENIZER_SUFFIXES)
-        infixes = tuple(TOKENIZER_INFIXES)
-        token_match = TOKEN_MATCH
-
-        @classmethod
-        def create_lemmatizer(cls, nlp=None):
-            return Lemmatizer(LOOKUP)
+    Defaults = HungarianDefaults


 __all__ = ['Hungarian']

View File

@@ -11,19 +11,21 @@ from ...attrs import LANG
 from ...util import update_exc


+class ItalianDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'it'
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+    @classmethod
+    def create_lemmatizer(cls, nlp=None):
+        return Lemmatizer(LOOKUP)
+
+
 class Italian(Language):
     lang = 'it'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'it'
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
-
-        @classmethod
-        def create_lemmatizer(cls, nlp=None):
-            return Lemmatizer(LOOKUP)
+    Defaults = ItalianDefaults


 __all__ = ['Italian']

View File

@@ -11,15 +11,17 @@ from ...attrs import LANG
 from ...util import update_exc


+class NorwegianDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'nb'
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+
 class Norwegian(Language):
     lang = 'nb'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'nb'
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
+    Defaults = NorwegianDefaults


 __all__ = ['Norwegian']

View File

@@ -9,16 +9,17 @@ from ...attrs import LANG
 from ...util import update_exc


+class DutchDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'nl'
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+
 class Dutch(Language):
     lang = 'nl'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'nl'
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
+    Defaults = DutchDefaults


 __all__ = ['Dutch']

View File

@@ -9,15 +9,17 @@ from ...attrs import LANG
 from ...util import update_exc


+class PolishDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'pl'
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+
 class Polish(Language):
     lang = 'pl'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'pl'
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
+    Defaults = PolishDefaults


 __all__ = ['Polish']

View File

@@ -13,20 +13,22 @@ from ...attrs import LANG
 from ...util import update_exc


+class PortugueseDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'pt'
+    lex_attr_getters.update(LEX_ATTRS)
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+    @classmethod
+    def create_lemmatizer(cls, nlp=None):
+        return Lemmatizer(LOOKUP)
+
+
 class Portuguese(Language):
     lang = 'pt'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'pt'
-        lex_attr_getters.update(LEX_ATTRS)
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
-
-        @classmethod
-        def create_lemmatizer(cls, nlp=None):
-            return Lemmatizer(LOOKUP)
+    Defaults = PortugueseDefaults


 __all__ = ['Portuguese']

View File

@@ -13,19 +13,21 @@ from ...attrs import LANG
 from ...util import update_exc


+class SwedishDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'sv'
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+    @classmethod
+    def create_lemmatizer(cls, nlp=None):
+        return Lemmatizer(LOOKUP)
+
+
 class Swedish(Language):
     lang = 'sv'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'sv'
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
-
-        @classmethod
-        def create_lemmatizer(cls, nlp=None):
-            return Lemmatizer(LOOKUP)
+    Defaults = SwedishDefaults


 __all__ = ['Swedish']

View File

@@ -116,14 +116,30 @@ class BaseDefaults(object):

 class Language(object):
-    """
-    A text-processing pipeline. Usually you'll load this once per process, and
-    pass the instance around your program.
+    """A text-processing pipeline. Usually you'll load this once per process,
+    and pass the instance around your application.
+
+    Defaults (class): Settings, data and factory methods for creating the `nlp`
+        object and processing pipeline.
+    lang (unicode): Two-letter language ID, i.e. ISO code.
     """
     Defaults = BaseDefaults
     lang = None

     def __init__(self, vocab=True, make_doc=True, pipeline=None, meta={}):
+        """Initialise a Language object.
+
+        vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
+            `Language.Defaults.create_vocab`.
+        make_doc (callable): A function that takes text and returns a `Doc`
+            object. Usually a `Tokenizer`.
+        pipeline (list): A list of annotation processes or IDs of annotation,
+            processes, e.g. a `Tagger` object, or `'tagger'`. IDs are looked
+            up in `Language.Defaults.factories`.
+        meta (dict): Custom meta data for the Language class. Is written to by
+            models to add model meta data.
+        RETURNS (Language): The newly constructed object.
+        """
         self.meta = dict(meta)

         if vocab is True:
@@ -147,22 +163,17 @@ class Language(object):
         self.pipeline = []

     def __call__(self, text, **disabled):
-        """
-        Apply the pipeline to some text. The text can span multiple sentences,
-        and can contain arbtrary whitespace. Alignment into the original string
+        """'Apply the pipeline to some text. The text can span multiple sentences,
+        and can contain arbtrary whitespace. Alignment into the original string
         is preserved.

-        Args:
-            text (unicode): The text to be processed.
-
-        Returns:
-            doc (Doc): A container for accessing the annotations.
-
-        Example:
-            >>> from spacy.en import English
-            >>> nlp = English()
+        text (unicode): The text to be processed.
+        **disabled: Elements of the pipeline that should not be run.
+        RETURNS (Doc): A container for accessing the annotations.
+
+        EXAMPLE:
             >>> tokens = nlp('An example sentence. Another example sentence.')
-            >>> tokens[0].orth_, tokens[0].head.tag_
+            >>> tokens[0].text, tokens[0].head.tag_
             ('An', 'NN')
         """
         doc = self.make_doc(text)
@@ -174,6 +185,21 @@ class Language(object):
         return doc

     def update(self, docs, golds, drop=0., sgd=None):
+        """Update the models in the pipeline.
+
+        docs (iterable): A batch of `Doc` objects.
+        golds (iterable): A batch of `GoldParse` objects.
+        drop (float): The droput rate.
+        sgd (callable): An optimizer.
+        RETURNS (dict): Results from the update.
+
+        EXAMPLE:
+            >>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
+            >>>    for epoch in trainer.epochs(gold):
+            >>>        for docs, golds in epoch:
+            >>>            state = nlp.update(docs, golds, sgd=optimizer)
+        """
         grads = {}
         def get_grads(W, dW, key=None):
             grads[key] = (W, dW)
@@ -204,7 +230,20 @@ class Language(object):
         for doc, gold in docs_golds:
             yield doc, gold

-    def begin_training(self, get_gold_tuples, **cfg):
+    def begin_training(self, gold_tuples, **cfg):
+        """Allocate models, pre-process training data and acquire a trainer and
+        optimizer. Used as a contextmanager.
+
+        gold_tuples (iterable): Gold-standard training data.
+        **cfg: Config parameters.
+        YIELDS (tuple): A trainer and an optimizer.
+
+        EXAMPLE:
+            >>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
+            >>>    for epoch in trainer.epochs(gold):
+            >>>        for docs, golds in epoch:
+            >>>            state = nlp.update(docs, golds, sgd=optimizer)
+        """
         # Populate vocab
         for _, annots_brackets in get_gold_tuples():
             for annots, _ in annots_brackets:
@@ -233,6 +272,17 @@ class Language(object):

     @contextmanager
     def use_params(self, params, **cfg):
+        """Replace weights of models in the pipeline with those provided in the
+        params dictionary. Can be used as a contextmanager, in which case,
+        models go back to their original weights after the block.
+
+        params (dict): A dictionary of parameters keyed by model ID.
+        **cfg: Config parameters.
+
+        EXAMPLE:
+            >>> with nlp.use_params(optimizer.averages):
+            >>>     nlp.to_disk('/tmp/checkpoint')
+        """
         contexts = [pipe.use_params(params) for pipe
                     in self.pipeline if hasattr(pipe, 'use_params')]
         # TODO: Having trouble with contextlib
@@ -250,16 +300,20 @@ class Language(object):
             pass

     def pipe(self, texts, n_threads=2, batch_size=1000, **disabled):
-        """
-        Process texts as a stream, and yield Doc objects in order.
-        Supports GIL-free multi-threading.
-
-        Arguments:
-            texts (iterator)
-            tag (bool)
-            parse (bool)
-            entity (bool)
+        """Process texts as a stream, and yield `Doc` objects in order. Supports
+        GIL-free multi-threading.
+
+        texts (iterator): A sequence of texts to process.
+        n_threads (int): The number of worker threads to use. If -1, OpenMP will
+            decide how many to use at run time. Default is 2.
+        batch_size (int): The number of texts to buffer.
+        **disabled: Pipeline components to exclude.
+        YIELDS (Doc): Documents in the order of the original text.
+
+        EXAMPLE:
+            >>> texts = [u'One document.', u'...', u'Lots of documents']
+            >>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
+            >>>     assert doc.is_parsed
         """
         #docs = (self.make_doc(text) for text in texts)
         docs = texts
@@ -267,7 +321,6 @@ class Language(object):
             name = getattr(proc, 'name', None)
             if name in disabled and not disabled[name]:
                 continue
             if hasattr(proc, 'pipe'):
                 docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
             else:
@@ -278,11 +331,12 @@ class Language(object):
     def to_disk(self, path, **exclude):
         """Save the current state to a directory.

-        Args:
-            path: A path to a directory, which will be created if it doesn't
-                exist. Paths may be either strings or pathlib.Path-like
-                objects.
-            **exclude: Prevent named attributes from being saved.
+        path (unicode or Path): A path to a directory, which will be created if
+            it doesn't exist. Paths may be either strings or `Path`-like objects.
+        **exclude: Named attributes to prevent from being saved.
+
+        EXAMPLE:
+            >>> nlp.to_disk('/path/to/models')
         """
         path = util.ensure_path(path)
         if not path.exists():
@@ -301,12 +355,17 @@ class Language(object):
                 dill.dump(props, file_)

     def from_disk(self, path, **exclude):
-        """Load the current state from a directory.
+        """Loads state from a directory. Modifies the object in place and
+        returns it.

-        Args:
-            path: A path to a directory. Paths may be either strings or
-                pathlib.Path-like objects.
-            **exclude: Prevent named attributes from being saved.
+        path (unicode or Path): A path to a directory. Paths may be either
+            strings or `Path`-like objects.
+        **exclude: Named attributes to prevent from being loaded.
+        RETURNS (Language): The modified `Language` object.
+
+        EXAMPLE:
+            >>> from spacy.language import Language
+            >>> nlp = Language().from_disk('/path/to/models')
         """
         path = util.ensure_path(path)
         for name in path.iterdir():
@@ -320,10 +379,8 @@ class Language(object):
     def to_bytes(self, **exclude):
         """Serialize the current state to a binary string.

-        Args:
-            path: A path to a directory. Paths may be either strings or
-                pathlib.Path-like objects.
-            **exclude: Prevent named attributes from being serialized.
+        **exclude: Named attributes to prevent from being serialized.
+        RETURNS (bytes): The serialized form of the `Language` object.
         """
         props = dict(self.__dict__)
         for key in exclude:
@@ -334,13 +391,12 @@ class Language(object):
     def from_bytes(self, bytes_data, **exclude):
         """Load state from a binary string.

-        Args:
-            bytes_data (bytes): The data to load from.
-            **exclude: Prevent named attributes from being loaded.
+        bytes_data (bytes): The data to load from.
+        **exclude: Named attributes to prevent from being loaded.
+        RETURNS (Language): The `Language` object.
         """
         props = dill.loads(bytes_data)
         for key, value in props.items():
             if key not in exclude:
                 setattr(self, key, value)
         return self

View File

@@ -30,19 +30,16 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))

 cdef class Lexeme:
-    """
-    An entry in the vocabulary. A Lexeme has no string context --- it's a
+    """An entry in the vocabulary. A `Lexeme` has no string context - it's a
     word-type, as opposed to a word token. It therefore has no part-of-speech
     tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
     tag).
     """
     def __init__(self, Vocab vocab, int orth):
-        """
-        Create a Lexeme object.
+        """Create a Lexeme object.

-        Arguments:
-            vocab (Vocab): The parent vocabulary
-            orth (int): The orth id of the lexeme.
+        vocab (Vocab): The parent vocabulary
+        orth (int): The orth id of the lexeme.
         Returns (Lexeme): The newly constructd object.
         """
         self.vocab = vocab
@@ -82,35 +79,28 @@ cdef class Lexeme:
         return self.c.orth

     def set_flag(self, attr_id_t flag_id, bint value):
-        """
-        Change the value of a boolean flag.
+        """Change the value of a boolean flag.

-        Arguments:
-            flag_id (int): The attribute ID of the flag to set.
-            value (bool): The new value of the flag.
+        flag_id (int): The attribute ID of the flag to set.
+        value (bool): The new value of the flag.
         """
         Lexeme.c_set_flag(self.c, flag_id, value)

     def check_flag(self, attr_id_t flag_id):
-        """
-        Check the value of a boolean flag.
+        """Check the value of a boolean flag.

-        Arguments:
-            flag_id (int): The attribute ID of the flag to query.
-        Returns (bool): The value of the flag.
+        flag_id (int): The attribute ID of the flag to query.
+        RETURNS (bool): The value of the flag.
         """
         return True if Lexeme.c_check_flag(self.c, flag_id) else False

     def similarity(self, other):
-        """
-        Compute a semantic similarity estimate. Defaults to cosine over vectors.
+        """Compute a semantic similarity estimate. Defaults to cosine over
+        vectors.

-        Arguments:
-            other:
-                The object to compare with. By default, accepts Doc, Span,
-                Token and Lexeme objects.
-        Returns:
-            score (float): A scalar similarity score. Higher is more similar.
+        other (object): The object to compare with. By default, accepts `Doc`,
+            `Span`, `Token` and `Lexeme` objects.
+        RETURNS (float): A scalar similarity score. Higher is more similar.
         """
         if self.vector_norm == 0 or other.vector_norm == 0:
             return 0.0
@@ -140,6 +130,11 @@ cdef class Lexeme:
         self.orth = self.c.orth

     property has_vector:
+        """A boolean value indicating whether a word vector is associated with
+        the object.
+
+        RETURNS (bool): Whether a word vector is associated with the object.
+        """
         def __get__(self):
             cdef int i
             for i in range(self.vocab.vectors_length):
@@ -149,6 +144,10 @@ cdef class Lexeme:
             return False

     property vector_norm:
+        """The L2 norm of the lexeme's vector representation.
+
+        RETURNS (float): The L2 norm of the vector representation.
+        """
         def __get__(self):
             return self.c.l2_norm
@@ -156,6 +155,11 @@ cdef class Lexeme:
             self.c.l2_norm = value

     property vector:
+        """A real-valued meaning representation.
+
+        RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
+            representing the lexeme's semantics.
+        """
         def __get__(self):
             cdef int length = self.vocab.vectors_length
             if length == 0:
@@ -196,6 +200,14 @@ cdef class Lexeme:
         def __get__(self):
             return self.vocab.strings[self.c.orth]

+    property text:
+        """A unicode representation of the token text.
+
+        RETURNS (unicode): The original verbatim text of the token.
+        """
+        def __get__(self):
+            return self.orth_
+
     property lower:
         def __get__(self): return self.c.lower
         def __set__(self, int x): self.c.lower = x
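A short usage sketch for the Lexeme API documented above (not part of the commit; it assumes a loaded `nlp` object whose vocabulary carries word vectors):

from spacy.attrs import IS_STOP

apple = nlp.vocab[u'apple']              # look up (or create) the Lexeme for "apple"
print(apple.text)                        # the new `text` property: u'apple'
print(apple.check_flag(IS_STOP))         # boolean flag lookup
if apple.has_vector:
    print(apple.vector_norm)             # L2 norm of the word vector
    print(apple.similarity(nlp.vocab[u'orange']))  # cosine similarity over vectors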

View File

@@ -87,7 +87,7 @@ ctypedef TokenPatternC* TokenPatternC_ptr
 ctypedef pair[int, TokenPatternC_ptr] StateC

-cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, attr_t label,
+cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
                                  object token_specs) except NULL:
     pattern = <TokenPatternC*>mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC))
     cdef int i
@@ -99,15 +99,21 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, attr_t label,
             pattern[i].attrs[j].attr = attr
             pattern[i].attrs[j].value = value
     i = len(token_specs)
-    pattern[i].attrs = <AttrValueC*>mem.alloc(3, sizeof(AttrValueC))
+    pattern[i].attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC))
     pattern[i].attrs[0].attr = ID
     pattern[i].attrs[0].value = entity_id
-    pattern[i].attrs[1].attr = ENT_TYPE
-    pattern[i].attrs[1].value = label
     pattern[i].nr_attr = 0
     return pattern


+cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0:
+    while pattern.nr_attr != 0:
+        pattern += 1
+    id_attr = pattern[0].attrs[0]
+    assert id_attr.attr == ID
+    return id_attr.value
+
+
 cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
     for attr in pattern.attrs[:pattern.nr_attr]:
         if get_token_attr(token, attr.attr) != attr.value:
@@ -159,14 +165,14 @@ def _convert_strings(token_specs, string_store):

 def merge_phrase(matcher, doc, i, matches):
-    '''Callback to merge a phrase on match'''
+    """Callback to merge a phrase on match."""
     ent_id, label, start, end = matches[i]
     span = doc[start : end]
     span.merge(ent_type=label, ent_id=ent_id)


 cdef class Matcher:
-    '''Match sequences of tokens, based on pattern rules.'''
+    """Match sequences of tokens, based on pattern rules."""
     cdef Pool mem
     cdef vector[TokenPatternC*] patterns
     cdef readonly Vocab vocab
@@ -175,37 +181,12 @@ cdef class Matcher:
     cdef public object _callbacks
     cdef public object _acceptors

-    @classmethod
-    def load(cls, path, vocab):
-        """
-        Load the matcher and patterns from a file path.
-
-        Arguments:
-            path (Path):
-                Path to a JSON-formatted patterns file.
-            vocab (Vocab):
-                The vocabulary that the documents to match over will refer to.
-        Returns:
-            Matcher: The newly constructed object.
-        """
-        if (path / 'gazetteer.json').exists():
-            with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
-                patterns = ujson.load(file_)
-        else:
-            patterns = {}
-        return cls(vocab, patterns)
-
-    def __init__(self, vocab, patterns={}):
-        """
-        Create the Matcher.
-
-        Arguments:
-            vocab (Vocab):
-                The vocabulary object, which must be shared with the documents
-                the matcher will operate on.
-            patterns (dict): Patterns to add to the matcher.
-        Returns:
-            The newly constructed object.
+    def __init__(self, vocab):
+        """Create the Matcher.
+
+        vocab (Vocab): The vocabulary object, which must be shared with the
+            documents the matcher will operate on.
+        RETURNS (Matcher): The newly constructed object.
         """
         self._patterns = {}
         self._entities = {}
@@ -213,144 +194,111 @@ cdef class Matcher:
         self._callbacks = {}
         self.vocab = vocab
         self.mem = Pool()
-        for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
-            self.add_entity(entity_key, attrs)
-            for spec in specs:
-                self.add_pattern(entity_key, spec, label=etype)

     def __reduce__(self):
         return (self.__class__, (self.vocab, self._patterns), None, None)

-    property n_patterns:
-        def __get__(self): return self.patterns.size()
-
-    def add_entity(self, entity_key, attrs=None, if_exists='raise',
-                   acceptor=None, on_match=None):
-        """
-        Add an entity to the matcher.
-
-        Arguments:
-            entity_key (unicode or int):
-                An ID for the entity.
-            attrs:
-                Attributes to associate with the Matcher.
-            if_exists ('raise', 'ignore' or 'update'):
-                Controls what happens if the entity ID already exists. Defaults to 'raise'.
-            acceptor:
-                Callback function to filter matches of the entity.
-            on_match:
-                Callback function to act on matches of the entity.
-        Returns:
-            None
-        """
-        if if_exists not in ('raise', 'ignore', 'update'):
-            raise ValueError(
-                "Unexpected value for if_exists: %s.\n"
-                "Expected one of: ['raise', 'ignore', 'update']" % if_exists)
-        if attrs is None:
-            attrs = {}
-        entity_key = self.normalize_entity_key(entity_key)
-        if self.has_entity(entity_key):
-            if if_exists == 'raise':
-                raise KeyError(
-                    "Tried to add entity %s. Entity exists, and if_exists='raise'.\n"
-                    "Set if_exists='ignore' or if_exists='update', or check with "
-                    "matcher.has_entity()")
-            elif if_exists == 'ignore':
-                return
-        self._entities[entity_key] = dict(attrs)
-        self._patterns.setdefault(entity_key, [])
-        self._acceptors[entity_key] = acceptor
-        self._callbacks[entity_key] = on_match
-
-    def add_pattern(self, entity_key, token_specs, label=""):
-        """
-        Add a pattern to the matcher.
-
-        Arguments:
-            entity_key (unicode or int):
-                An ID for the entity.
-            token_specs:
-                Description of the pattern to be matched.
-            label:
-                Label to assign to the matched pattern. Defaults to "".
-        Returns:
-            None
-        """
-        token_specs = list(token_specs)
-        if len(token_specs) == 0:
-            msg = ("Cannot add pattern for zero tokens to matcher.\n"
-                   "entity_key: {entity_key}\n"
-                   "label: {label}")
-            raise ValueError(msg.format(entity_key=entity_key, label=label))
-        entity_key = self.normalize_entity_key(entity_key)
-        if not self.has_entity(entity_key):
-            self.add_entity(entity_key)
-        if isinstance(label, basestring):
-            label = self.vocab.strings[label]
-        elif label is None:
-            label = 0
-        spec = _convert_strings(token_specs, self.vocab.strings)
-        self.patterns.push_back(init_pattern(self.mem, entity_key, label, spec))
-        self._patterns[entity_key].append((label, token_specs))
-
-    def add(self, entity_key, label, attrs, specs, acceptor=None, on_match=None):
-        self.add_entity(entity_key, attrs=attrs, if_exists='update',
-                        acceptor=acceptor, on_match=on_match)
-        for spec in specs:
-            self.add_pattern(entity_key, spec, label=label)
-
-    def normalize_entity_key(self, entity_key):
-        if isinstance(entity_key, basestring):
-            return self.vocab.strings[entity_key]
-        else:
-            return entity_key
-
-    def has_entity(self, entity_key):
-        """
-        Check whether the matcher has an entity.
-
-        Arguments:
-            entity_key (string or int): The entity key to check.
-        Returns:
-            bool: Whether the matcher has the entity.
-        """
-        entity_key = self.normalize_entity_key(entity_key)
-        return entity_key in self._entities
-
-    def get_entity(self, entity_key):
-        """
-        Retrieve the attributes stored for an entity.
-
-        Arguments:
-            entity_key (unicode or int): The entity to retrieve.
-        Returns:
-            The entity attributes if present, otherwise None.
-        """
-        entity_key = self.normalize_entity_key(entity_key)
-        if entity_key in self._entities:
-            return self._entities[entity_key]
-        else:
-            return None
-
-    def __call__(self, Doc doc, acceptor=None):
-        """
-        Find all token sequences matching the supplied patterns on the Doc.
-
-        Arguments:
-            doc (Doc):
-                The document to match over.
-        Returns:
-            list
-                A list of (entity_key, label_id, start, end) tuples,
-                describing the matches. A match tuple describes a span doc[start:end].
-                The label_id and entity_key are both integers.
+    def __len__(self):
+        """Get the number of rules added to the matcher. Note that this only
+        returns the number of rules (identical with the number of IDs), not the
+        number of individual patterns.
+
+        RETURNS (int): The number of rules.
+        """
+        return len(self._patterns)
+
+    def __contains__(self, key):
+        """Check whether the matcher contains rules for a match ID.
+
+        key (unicode): The match ID.
+        RETURNS (bool): Whether the matcher contains rules for this match ID.
+        """
+        return len(self._patterns)
+
+    def add(self, key, on_match, *patterns):
+        """Add a match-rule to the matcher.
+
+        A match-rule consists of: an ID key, an on_match callback, and one or
+        more patterns. If the key exists, the patterns are appended to the
+        previous ones, and the previous on_match callback is replaced. The
+        `on_match` callback will receive the arguments `(matcher, doc, i,
+        matches)`. You can also set `on_match` to `None` to not perform any
+        actions. A pattern consists of one or more `token_specs`, where a
+        `token_spec` is a dictionary mapping attribute IDs to values. Token
+        descriptors can also include quantifiers. There are currently important
+        known problems with the quantifiers - see the docs.
+        """
+        for pattern in patterns:
+            if len(pattern) == 0:
+                msg = ("Cannot add pattern for zero tokens to matcher.\n"
+                       "key: {key}\n")
+                raise ValueError(msg.format(key=key))
+        key = self._normalize_key(key)
+        self._patterns.setdefault(key, [])
+        self._callbacks[key] = on_match
+
+        for pattern in patterns:
+            specs = _convert_strings(pattern, self.vocab.strings)
+            self.patterns.push_back(init_pattern(self.mem, key, specs))
+            self._patterns[key].append(specs)
+
+    def remove(self, key):
+        """Remove a rule from the matcher. A KeyError is raised if the key does
+        not exist.
+
+        key (unicode): The ID of the match rule.
+        """
+        key = self._normalize_key(key)
+        self._patterns.pop(key)
+        self._callbacks.pop(key)
+        cdef int i = 0
+        while i < self.patterns.size():
+            pattern_key = get_pattern_key(self.patterns.at(i))
+            if pattern_key == key:
+                self.patterns.erase(self.patterns.begin()+i)
+            else:
+                i += 1
+
+    def has_key(self, key):
+        """Check whether the matcher has a rule with a given key.
+
+        key (string or int): The key to check.
+        RETURNS (bool): Whether the matcher has the rule.
+        """
+        key = self._normalize_key(key)
+        return key in self._patterns
+
+    def get(self, key, default=None):
+        """Retrieve the pattern stored for a key.
+
+        key (unicode or int): The key to retrieve.
+        RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
+        """
+        key = self._normalize_key(key)
+        if key not in self._patterns:
+            return default
+        return (self._callbacks[key], self._patterns[key])
+
+    def pipe(self, docs, batch_size=1000, n_threads=2):
+        """Match a stream of documents, yielding them in turn.
+
+        docs (iterable): A stream of documents.
+        batch_size (int): The number of documents to accumulate into a working set.
+        n_threads (int): The number of threads with which to work on the buffer
+            in parallel, if the `Matcher` implementation supports multi-threading.
+        YIELDS (Doc): Documents, in order.
+        """
+        for doc in docs:
+            self(doc)
+            yield doc
+
+    def __call__(self, Doc doc):
+        """Find all token sequences matching the supplied patterns on the `Doc`.
+
+        doc (Doc): The document to match over.
+        RETURNS (list): A list of `(key, label_id, start, end)` tuples,
+            describing the matches. A match tuple describes a span
+            `doc[start:end]`. The `label_id` and `key` are both integers.
         """
-        if acceptor is not None:
-            raise ValueError(
-                "acceptor keyword argument to Matcher deprecated. Specify acceptor "
-                "functions when you add patterns instead.")
         cdef vector[StateC] partials
         cdef int n_partials = 0
         cdef int q = 0
@@ -388,13 +336,7 @@ cdef class Matcher:
                     end = token_i+1
                     ent_id = state.second[1].attrs[0].value
                     label = state.second[1].attrs[1].value
-                    acceptor = self._acceptors.get(ent_id)
-                    if acceptor is None:
-                        matches.append((ent_id, label, start, end))
-                    else:
-                        match = acceptor(doc, ent_id, label, start, end)
-                        if match:
-                            matches.append(match)
+                    matches.append((ent_id, start, end))
             partials.resize(q)
             # Check whether we open any new patterns on this token
             for pattern in self.patterns:
@@ -419,13 +361,7 @@ cdef class Matcher:
                     end = token_i+1
                     ent_id = pattern[1].attrs[0].value
                     label = pattern[1].attrs[1].value
-                    acceptor = self._acceptors.get(ent_id)
-                    if acceptor is None:
-                        matches.append((ent_id, label, start, end))
-                    else:
-                        match = acceptor(doc, ent_id, label, start, end)
-                        if match:
-                            matches.append(match)
+                    matches.append((ent_id, start, end))
         # Look for open patterns that are actually satisfied
         for state in partials:
             while state.second.quantifier in (ZERO, ZERO_PLUS):
@@ -435,36 +371,19 @@ cdef class Matcher:
                 end = len(doc)
                 ent_id = state.second.attrs[0].value
                 label = state.second.attrs[0].value
-                acceptor = self._acceptors.get(ent_id)
-                if acceptor is None:
-                    matches.append((ent_id, label, start, end))
-                else:
-                    match = acceptor(doc, ent_id, label, start, end)
-                    if match:
-                        matches.append(match)
+                matches.append((ent_id, start, end))
         for i, (ent_id, label, start, end) in enumerate(matches):
             on_match = self._callbacks.get(ent_id)
             if on_match is not None:
                 on_match(self, doc, i, matches)
+        # TODO: only return (match_id, start, end)
         return matches

-    def pipe(self, docs, batch_size=1000, n_threads=2):
-        """
-        Match a stream of documents, yielding them in turn.
-
-        Arguments:
-            docs: A stream of documents.
-            batch_size (int):
-                The number of documents to accumulate into a working set.
-            n_threads (int):
-                The number of threads with which to work on the buffer in parallel,
-                if the Matcher implementation supports multi-threading.
-        Yields:
-            Doc Documents, in order.
-        """
-        for doc in docs:
-            self(doc)
-            yield doc
+    def _normalize_key(self, key):
+        if isinstance(key, basestring):
+            return self.vocab.strings[key]
+        else:
+            return key


 def get_bilou(length):
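For orientation (not part of the commit), the reworked rule API described in the docstrings above is used roughly like the sketch below. The 'HELLO_WORLD' key and the pattern are invented, and the shape of the returned match tuples is the one given in the `__call__` docstring; the implementation at this revision was still in flux.

from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)                        # must share the pipeline's vocab
pattern = [{'LOWER': 'hello'}, {'LOWER': 'world'}]
matcher.add('HELLO_WORLD', None, pattern)           # key, on_match callback, *patterns
assert len(matcher) == 1 and matcher.has_key('HELLO_WORLD')

doc = nlp(u'Hello world!')
matches = matcher(doc)                              # match tuples, see __call__ above
on_match, patterns = matcher.get('HELLO_WORLD')
matcher.remove('HELLO_WORLD')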

View File

@@ -38,33 +38,71 @@ from .parts_of_speech import X

 class TokenVectorEncoder(object):
-    '''Assign position-sensitive vectors to tokens, using a CNN or RNN.'''
+    """Assign position-sensitive vectors to tokens, using a CNN or RNN."""
     name = 'tok2vec'

     @classmethod
     def Model(cls, width=128, embed_size=5000, **cfg):
+        """Create a new statistical model for the class.
+
+        width (int): Output size of the model.
+        embed_size (int): Number of vectors in the embedding table.
+        **cfg: Config parameters.
+        RETURNS (Model): A `thinc.neural.Model` or similar instance.
+        """
         width = util.env_opt('token_vector_width', width)
         embed_size = util.env_opt('embed_size', embed_size)
         return Tok2Vec(width, embed_size, preprocess=None)

     def __init__(self, vocab, model=True, **cfg):
+        """Construct a new statistical model. Weights are not allocated on
+        initialisation.
+
+        vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab`
+            instance with the `Doc` objects it will process.
+        model (Model): A `Model` instance or `True` allocate one later.
+        **cfg: Config parameters.
+
+        EXAMPLE:
+            >>> from spacy.pipeline import TokenVectorEncoder
+            >>> tok2vec = TokenVectorEncoder(nlp.vocab)
+            >>> tok2vec.model = tok2vec.Model(128, 5000)
+        """
         self.vocab = vocab
         self.doc2feats = doc2feats()
         self.model = model

     def __call__(self, docs):
+        """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
+        model. Vectors are set to the `Doc.tensor` attribute.
+
+        docs (Doc or iterable): One or more documents to add vectors to.
+        RETURNS (dict or None): Intermediate computations.
+        """
         if isinstance(docs, Doc):
             docs = [docs]
         tokvecses = self.predict(docs)
         self.set_annotations(docs, tokvecses)

     def pipe(self, stream, batch_size=128, n_threads=-1):
+        """Process `Doc` objects as a stream.
+
+        stream (iterator): A sequence of `Doc` objects to process.
+        batch_size (int): Number of `Doc` objects to group.
+        n_threads (int): Number of threads.
+        YIELDS (iterator): A sequence of `Doc` objects, in order of input.
+        """
         for docs in cytoolz.partition_all(batch_size, stream):
             tokvecses = self.predict(docs)
             self.set_annotations(docs, tokvecses)
             yield from docs

     def predict(self, docs):
+        """Return a single tensor for a batch of documents.
+
+        docs (iterable): A sequence of `Doc` objects.
+        RETURNS (object): Vector representations for each token in the documents.
+        """
         feats = self.doc2feats(docs)
         tokvecs = self.model(feats)
         return tokvecs
@@ -73,7 +111,26 @@ class TokenVectorEncoder(object):
         for doc, tokvecs in zip(docs, tokvecses):
             doc.tensor = tokvecs

-    def begin_update(self, docs, drop=0.):
+    def set_annotations(self, docs, tokvecs):
+        """Set the tensor attribute for a batch of documents.
+
+        docs (iterable): A sequence of `Doc` objects.
+        tokvecs (object): Vector representation for each token in the documents.
+        """
+        start = 0
+        for doc in docs:
+            doc.tensor = tokvecs[start : start + len(doc)]
+            start += len(doc)
+
+    def update(self, docs, golds, state=None, drop=0., sgd=None):
+        """Update the model.
+
+        docs (iterable): A batch of `Doc` objects.
+        golds (iterable): A batch of `GoldParse` objects.
+        drop (float): The droput rate.
+        sgd (callable): An optimizer.
+        RETURNS (dict): Results from the update.
+        """
         if isinstance(docs, Doc):
             docs = [docs]
         feats = self.doc2feats(docs)
@@ -81,14 +138,26 @@ class TokenVectorEncoder(object):
         return tokvecs, bp_tokvecs

     def get_loss(self, docs, golds, scores):
+        # TODO: implement
         raise NotImplementedError

     def begin_training(self, gold_tuples, pipeline=None):
+        """Allocate models, pre-process training data and acquire a trainer and
+        optimizer.
+
+        gold_tuples (iterable): Gold-standard training data.
+        pipeline (list): The pipeline the model is part of.
+        """
         self.doc2feats = doc2feats()
         if self.model is True:
             self.model = self.Model()

     def use_params(self, params):
+        """Replace weights of models in the pipeline with those provided in the
+        params dictionary.
+
+        params (dict): A dictionary of parameters keyed by model ID.
+        """
         with self.model.use_params(params):
             yield
@@ -189,9 +258,7 @@ class NeuralTagger(object):

 cdef class EntityRecognizer(LinearParser):
-    """
-    Annotate named entities on Doc objects.
-    """
+    """Annotate named entities on Doc objects."""
     TransitionSystem = BiluoPushDown

     feature_templates = get_feature_templates('ner')
@@ -203,9 +270,7 @@ cdef class EntityRecognizer(LinearParser):

 cdef class BeamEntityRecognizer(BeamParser):
-    """
-    Annotate named entities on Doc objects.
-    """
+    """Annotate named entities on Doc objects."""
     TransitionSystem = BiluoPushDown

     feature_templates = get_feature_templates('ner')

View File

@@ -11,8 +11,6 @@ from preshed.maps cimport map_iter, key_t
 from .typedefs cimport hash_t
 from libc.stdint cimport uint32_t

-import ujson
-

 cpdef hash_t hash_string(unicode string) except 0:
     chars = string.encode('utf8')
@@ -72,15 +70,12 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex

 cdef class StringStore:
-    """
-    Map strings to and from integer IDs.
-    """
+    """Map strings to and from integer IDs."""
     def __init__(self, strings=None, freeze=False):
-        """
-        Create the StringStore.
+        """Create the StringStore.

-        Arguments:
-            strings: A sequence of unicode strings to add to the store.
+        strings (iterable): A sequence of unicode strings to add to the store.
+        RETURNS (StringStore): The newly constructed object.
         """
         self.mem = Pool()
         self._map = PreshMap()
@@ -106,23 +101,17 @@ cdef class StringStore:
         return (StringStore, (list(self),))

     def __len__(self):
-        """
-        The number of strings in the store.
+        """The number of strings in the store.

-        Returns:
-            int The number of strings in the store.
+        RETURNS (int): The number of strings in the store.
         """
         return self.size-1

     def __getitem__(self, object string_or_id):
-        """
-        Retrieve a string from a given integer ID, or vice versa.
+        """Retrieve a string from a given integer ID, or vice versa.

-        Arguments:
-            string_or_id (bytes or unicode or int):
-                The value to encode.
-        Returns:
-            unicode or int: The value to retrieved.
+        string_or_id (bytes or unicode or int): The value to encode.
+        Returns (unicode or int): The value to be retrieved.
         """
         if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
             return 0
@@ -163,13 +152,10 @@ cdef class StringStore:
         return utf8str - self.c

     def __contains__(self, unicode string not None):
-        """
-        Check whether a string is in the store.
+        """Check whether a string is in the store.

-        Arguments:
-            string (unicode): The string to check.
-        Returns bool:
-            Whether the store contains the string.
+        string (unicode): The string to check.
+        RETURNS (bool): Whether the store contains the string.
         """
         if len(string) == 0:
             return True
@@ -177,10 +163,9 @@ cdef class StringStore:
         return self._map.get(key) is not NULL

     def __iter__(self):
-        """
-        Iterate over the strings in the store, in order.
+        """Iterate over the strings in the store, in order.

-        Yields: unicode A string in the store.
+        YIELDS (unicode): A string in the store.
         """
         cdef int i
         for i in range(self.size):
@@ -195,6 +180,41 @@ cdef class StringStore:
             strings.append(py_string)
         return (StringStore, (strings,), None, None, None)

+    def to_disk(self, path):
+        """Save the current state to a directory.
+
+        path (unicode or Path): A path to a directory, which will be created if
+            it doesn't exist. Paths may be either strings or `Path`-like objects.
+        """
+        raise NotImplementedError()
+
+    def from_disk(self, path):
+        """Loads state from a directory. Modifies the object in place and
+        returns it.
+
+        path (unicode or Path): A path to a directory. Paths may be either
+            strings or `Path`-like objects.
+        RETURNS (StringStore): The modified `StringStore` object.
+        """
+        raise NotImplementedError()
+
+    def to_bytes(self, **exclude):
+        """Serialize the current state to a binary string.
+
+        **exclude: Named attributes to prevent from being serialized.
+        RETURNS (bytes): The serialized form of the `StringStore` object.
+        """
+        raise NotImplementedError()
+
+    def from_bytes(self, bytes_data, **exclude):
+        """Load state from a binary string.
+
+        bytes_data (bytes): The data to load from.
+        **exclude: Named attributes to prevent from being loaded.
+        RETURNS (StringStore): The `StringStore` object.
+        """
+        raise NotImplementedError()
+
     def set_frozen(self, bint is_frozen):
         # TODO
         self.is_frozen = is_frozen
@ -235,40 +255,6 @@ cdef class StringStore:
self.size += 1 self.size += 1
return &self.c[self.size-1] return &self.c[self.size-1]
def dump(self, file_):
"""
Save the strings to a JSON file.
Arguments:
file_ (buffer): The file to save the strings.
Returns:
None
"""
string_data = ujson.dumps(list(self))
if not isinstance(string_data, unicode):
string_data = string_data.decode('utf8')
# TODO: OOV?
file_.write(string_data)
def load(self, file_):
"""
Load the strings from a JSON file.
Arguments:
file_ (buffer): The file from which to load the strings.
Returns:
None
"""
strings = ujson.load(file_)
if strings == ['']:
return None
cdef unicode string
for string in strings:
# explicit None/len check instead of simple truth testing
# (bug in Cython <= 0.23.4)
if string is not None and len(string):
self.intern_unicode(string)
def _realloc(self): def _realloc(self):
# We want to map straight to pointers, but they'll be invalidated if # We want to map straight to pointers, but they'll be invalidated if
# we resize our array. So, first we remap to indices, then we resize, # we resize our array. So, first we remap to indices, then we resize,

View File

@ -1,7 +1,6 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import ujson
from collections import defaultdict from collections import defaultdict
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
@ -15,7 +14,6 @@ from .tokens.doc cimport Doc
from .attrs cimport TAG from .attrs cimport TAG
from .gold cimport GoldParse from .gold cimport GoldParse
from .attrs cimport * from .attrs cimport *
from . import util
cpdef enum: cpdef enum:
@ -108,55 +106,15 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
cdef class Tagger: cdef class Tagger:
""" """Annotate part-of-speech tags on Doc objects."""
Annotate part-of-speech tags on Doc objects.
"""
@classmethod
def load(cls, path, vocab, require=False):
"""
Load the statistical model from the supplied path.
Arguments:
path (Path):
The path to load from.
vocab (Vocab):
The vocabulary. Must be shared by the documents to be processed.
require (bool):
Whether to raise an error if the files are not found.
Returns (Tagger):
The newly created object.
"""
# TODO: Change this to expect config.json when we don't have to
# support old data.
path = util.ensure_path(path)
if (path / 'templates.json').exists():
with (path / 'templates.json').open('r', encoding='utf8') as file_:
templates = ujson.load(file_)
elif require:
raise IOError(
"Required file %s/templates.json not found when loading Tagger" % str(path))
else:
templates = cls.feature_templates
self = cls(vocab, model=None, feature_templates=templates)
if (path / 'model').exists():
self.model.load(str(path / 'model'))
elif require:
raise IOError(
"Required file %s/model not found when loading Tagger" % str(path))
return self
def __init__(self, Vocab vocab, TaggerModel model=None, **cfg): def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
""" """Create a Tagger.
Create a Tagger.
Arguments: vocab (Vocab): The vocabulary object. Must be shared with documents to
vocab (Vocab): be processed.
The vocabulary object. Must be shared with documents to be processed. model (thinc.linear.AveragedPerceptron): The statistical model.
model (thinc.linear.AveragedPerceptron): RETURNS (Tagger): The newly constructed object.
The statistical model.
Returns (Tagger):
The newly constructed object.
""" """
if model is None: if model is None:
model = TaggerModel(cfg.get('features', self.feature_templates), model = TaggerModel(cfg.get('features', self.feature_templates),
@ -186,13 +144,9 @@ cdef class Tagger:
tokens._py_tokens = [None] * tokens.length tokens._py_tokens = [None] * tokens.length
def __call__(self, Doc tokens): def __call__(self, Doc tokens):
""" """Apply the tagger, setting the POS tags onto the Doc object.
Apply the tagger, setting the POS tags onto the Doc object.
Arguments: doc (Doc): The tokens to be tagged.
doc (Doc): The tokens to be tagged.
Returns:
None
""" """
if tokens.length == 0: if tokens.length == 0:
return 0 return 0
@ -215,34 +169,25 @@ cdef class Tagger:
tokens._py_tokens = [None] * tokens.length tokens._py_tokens = [None] * tokens.length
def pipe(self, stream, batch_size=1000, n_threads=2): def pipe(self, stream, batch_size=1000, n_threads=2):
""" """Tag a stream of documents.
Tag a stream of documents.
Arguments: Arguments:
stream: The sequence of documents to tag. stream: The sequence of documents to tag.
batch_size (int): batch_size (int): The number of documents to accumulate into a working set.
The number of documents to accumulate into a working set. n_threads (int): The number of threads with which to work on the buffer
n_threads (int): in parallel, if the tagger implementation supports multi-threading.
The number of threads with which to work on the buffer in parallel, YIELDS (Doc): Documents, in order.
if the Matcher implementation supports multi-threading.
Yields:
Doc Documents, in order.
""" """
for doc in stream: for doc in stream:
self(doc) self(doc)
yield doc yield doc
def update(self, Doc tokens, GoldParse gold, itn=0): def update(self, Doc tokens, GoldParse gold, itn=0):
""" """Update the statistical model, with tags supplied for the given document.
Update the statistical model, with tags supplied for the given document.
Arguments: doc (Doc): The document to update on.
doc (Doc): gold (GoldParse): Manager for the gold-standard tags.
The document to update on. RETURNS (int): Number of tags predicted correctly.
gold (GoldParse):
Manager for the gold-standard tags.
Returns (int):
Number of tags correct.
""" """
gold_tag_strs = gold.tags gold_tag_strs = gold.tags
assert len(tokens) == len(gold_tag_strs) assert len(tokens) == len(gold_tag_strs)
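As a rough illustration of how `update` and `pipe` fit together, here is a hypothetical training sketch. The `tagger` instance, the `train_data` iterable of `(Doc, GoldParse)` pairs and the `docs` sequence are assumptions, not part of this diff; `update`'s return value is treated as the per-document count of correctly predicted tags, as the docstring states.

# Hypothetical training loop; `tagger`, `train_data` and `docs` are assumed to exist.
n_correct = 0
n_tokens = 0
for itn in range(10):                          # several passes over the data
    for doc, gold in train_data:               # (Doc, GoldParse) pairs
        n_correct += tagger.update(doc, gold, itn=itn)
        n_tokens += len(doc)
print('tag accuracy: %.3f' % (n_correct / float(n_tokens)))

# At prediction time, tag a stream of documents, in order:
for doc in tagger.pipe(docs, batch_size=1000, n_threads=2):
    print([t.tag_ for t in doc])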

View File

@ -99,8 +99,8 @@ def test_doc_token_api_ancestors(en_tokenizer):
assert [t.text for t in doc[1].ancestors] == ["saw"] assert [t.text for t in doc[1].ancestors] == ["saw"]
assert [t.text for t in doc[2].ancestors] == [] assert [t.text for t in doc[2].ancestors] == []
assert doc[2].is_ancestor_of(doc[7]) assert doc[2].is_ancestor(doc[7])
assert not doc[6].is_ancestor_of(doc[2]) assert not doc[6].is_ancestor(doc[2])
def test_doc_token_api_head_setter(en_tokenizer): def test_doc_token_api_head_setter(en_tokenizer):

View File

@ -2,8 +2,6 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import ujson
from cython.operator cimport dereference as deref from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as preinc from cython.operator cimport preincrement as preinc
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
@ -12,75 +10,31 @@ from preshed.maps cimport PreshMap
from .strings cimport hash_string from .strings cimport hash_string
cimport cython cimport cython
from . import util
from .tokens.doc cimport Doc from .tokens.doc cimport Doc
cdef class Tokenizer: cdef class Tokenizer:
"""Segment text, and create Doc objects with the discovered segment
boundaries.
""" """
Segment text, and create Doc objects with the discovered segment boundaries.
"""
@classmethod
def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None,
infix_finditer=None, token_match=None):
"""
Load a Tokenizer, reading unsupplied components from the path.
Arguments:
path (Path):
The path to load from.
vocab (Vocab):
A storage container for lexical types.
rules (dict):
Exceptions and special-cases for the tokenizer.
token_match:
A boolean function matching strings that becomes tokens.
prefix_search:
Signature of re.compile(string).search
suffix_search:
Signature of re.compile(string).search
infix_finditer:
Signature of re.compile(string).finditer
Returns Tokenizer
"""
path = util.ensure_path(path)
if rules is None:
with (path / 'tokenizer' / 'specials.json').open('r', encoding='utf8') as file_:
rules = ujson.load(file_)
if prefix_search in (None, True):
with (path / 'tokenizer' / 'prefix.txt').open() as file_:
entries = file_.read().split('\n')
prefix_search = util.compile_prefix_regex(entries).search
if suffix_search in (None, True):
with (path / 'tokenizer' / 'suffix.txt').open() as file_:
entries = file_.read().split('\n')
suffix_search = util.compile_suffix_regex(entries).search
if infix_finditer in (None, True):
with (path / 'tokenizer' / 'infix.txt').open() as file_:
entries = file_.read().split('\n')
infix_finditer = util.compile_infix_regex(entries).finditer
return cls(vocab, rules, prefix_search, suffix_search, infix_finditer, token_match)
def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None): def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None):
""" """Create a `Tokenizer`, to create `Doc` objects given unicode text.
Create a Tokenizer, to create Doc objects given unicode text.
Arguments: vocab (Vocab): A storage container for lexical types.
vocab (Vocab): rules (dict): Exceptions and special-cases for the tokenizer.
A storage container for lexical types. prefix_search (callable): A function matching the signature of
rules (dict): `re.compile(string).search` to match prefixes.
Exceptions and special-cases for the tokenizer. suffix_search (callable): A function matching the signature of
prefix_search: `re.compile(string).search` to match suffixes.
A function matching the signature of re.compile(string).search `infix_finditer` (callable): A function matching the signature of
to match prefixes. `re.compile(string).finditer` to find infixes.
suffix_search: token_match (callable): A boolean function matching strings to be
A function matching the signature of re.compile(string).search recognised as tokens.
to match suffixes. RETURNS (Tokenizer): The newly constructed object.
infix_finditer:
A function matching the signature of re.compile(string).finditer EXAMPLE:
to find infixes. >>> tokenizer = Tokenizer(nlp.vocab)
token_match: >>> tokenizer = English().Defaults.create_tokenizer(nlp)
A boolean function matching strings that becomes tokens.
""" """
self.mem = Pool() self.mem = Pool()
self._cache = PreshMap() self._cache = PreshMap()
@ -112,13 +66,10 @@ cdef class Tokenizer:
@cython.boundscheck(False) @cython.boundscheck(False)
def __call__(self, unicode string): def __call__(self, unicode string):
""" """Tokenize a string.
Tokenize a string.
Arguments: string (unicode): The string to tokenize.
string (unicode): The string to tokenize. RETURNS (Doc): A container for linguistic annotations.
Returns:
Doc A container for linguistic annotations.
""" """
if len(string) >= (2 ** 30): if len(string) >= (2 ** 30):
raise ValueError( raise ValueError(
@ -166,18 +117,13 @@ cdef class Tokenizer:
return tokens return tokens
def pipe(self, texts, batch_size=1000, n_threads=2): def pipe(self, texts, batch_size=1000, n_threads=2):
""" """Tokenize a stream of texts.
Tokenize a stream of texts.
Arguments: texts: A sequence of unicode texts.
texts: A sequence of unicode texts. batch_size (int): The number of texts to accumulate in an internal buffer.
batch_size (int): n_threads (int): The number of threads to use, if the implementation
The number of texts to accumulate in an internal buffer. supports multi-threading. The default tokenizer is single-threaded.
n_threads (int): YIELDS (Doc): A sequence of Doc objects, in order.
The number of threads to use, if the implementation supports
multi-threading. The default tokenizer is single-threaded.
Yields:
Doc A sequence of Doc objects, in order.
""" """
for text in texts: for text in texts:
yield self(text) yield self(text)
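A short usage sketch for `__call__` and `pipe`, assuming a pipeline loaded as `nlp` with its tokenizer exposed as `nlp.tokenizer` (the `spacy.load('en')` call requires an installed English model).

import spacy

nlp = spacy.load('en')                        # assumes an installed English model
doc = nlp.tokenizer(u'Hello, world!')         # __call__: one unicode string -> Doc
print([t.text for t in doc])

texts = [u'First text.', u'Second text.', u'Third text.']
for doc in nlp.tokenizer.pipe(texts, batch_size=50):   # yields Doc objects, in order
    print(len(doc))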
@ -321,27 +267,23 @@ cdef class Tokenizer:
self._cache.set(key, cached) self._cache.set(key, cached)
def find_infix(self, unicode string): def find_infix(self, unicode string):
""" """Find internal split points of the string, such as hyphens.
Find internal split points of the string, such as hyphens.
string (unicode): The string to segment. string (unicode): The string to segment.
RETURNS (list): A list of `re.MatchObject` objects that have `.start()`
Returns List[re.MatchObject] and `.end()` methods, denoting the placement of internal segment
A list of objects that have .start() and .end() methods, denoting the separators, e.g. hyphens.
placement of internal segment separators, e.g. hyphens.
""" """
if self.infix_finditer is None: if self.infix_finditer is None:
return 0 return 0
return list(self.infix_finditer(string)) return list(self.infix_finditer(string))
def find_prefix(self, unicode string): def find_prefix(self, unicode string):
""" """Find the length of a prefix that should be segmented from the string,
Find the length of a prefix that should be segmented from the string,
or None if no prefix rules match. or None if no prefix rules match.
Arguments: string (unicode): The string to segment.
string (unicode): The string to segment. RETURNS (int): The length of the prefix if present, otherwise `None`.
Returns (int or None): The length of the prefix if present, otherwise None.
""" """
if self.prefix_search is None: if self.prefix_search is None:
return 0 return 0
@ -349,13 +291,11 @@ cdef class Tokenizer:
return (match.end() - match.start()) if match is not None else 0 return (match.end() - match.start()) if match is not None else 0
def find_suffix(self, unicode string): def find_suffix(self, unicode string):
""" """Find the length of a suffix that should be segmented from the string,
Find the length of a suffix that should be segmented from the string,
or None if no suffix rules match. or None if no suffix rules match.
Arguments: string (unicode): The string to segment.
string (unicode): The string to segment. RETURNS (int): The length of the suffix if present, otherwise `None`.
Returns (int or None): The length of the suffix if present, otherwise None.
""" """
if self.suffix_search is None: if self.suffix_search is None:
return 0 return 0
@ -363,23 +303,17 @@ cdef class Tokenizer:
return (match.end() - match.start()) if match is not None else 0 return (match.end() - match.start()) if match is not None else 0
def _load_special_tokenization(self, special_cases): def _load_special_tokenization(self, special_cases):
""" """Add special-case tokenization rules."""
Add special-case tokenization rules.
"""
for chunk, substrings in sorted(special_cases.items()): for chunk, substrings in sorted(special_cases.items()):
self.add_special_case(chunk, substrings) self.add_special_case(chunk, substrings)
def add_special_case(self, unicode string, substrings): def add_special_case(self, unicode string, substrings):
""" """Add a special-case tokenization rule.
Add a special-case tokenization rule.
Arguments: string (unicode): The string to specially tokenize.
string (unicode): The string to specially tokenize. token_attrs (iterable): A sequence of dicts, where each dict describes
token_attrs: a token and its attributes. The `ORTH` fields of the attributes must
A sequence of dicts, where each dict describes a token and its exactly match the string when they are concatenated.
attributes. The ORTH fields of the attributes must exactly match
the string when they are concatenated.
Returns None
""" """
substrings = list(substrings) substrings = list(substrings)
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
@ -390,3 +324,38 @@ cdef class Tokenizer:
self._specials.set(key, cached) self._specials.set(key, cached)
self._cache.set(key, cached) self._cache.set(key, cached)
self._rules[string] = substrings self._rules[string] = substrings
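To illustrate `add_special_case`, here is a hedged sketch of a rule for a hypothetical contraction. The `ORTH` values of the attribute dicts concatenate exactly to the string, as the docstring requires; `nlp` is assumed to be a loaded pipeline.

from spacy.attrs import ORTH

# Hypothetical rule: split "gimme" into two tokens whose ORTH values
# concatenate back to the original string.
nlp.tokenizer.add_special_case(u'gimme', [{ORTH: u'gim'}, {ORTH: u'me'}])
doc = nlp.tokenizer(u'gimme that')
assert [t.text for t in doc] == [u'gim', u'me', u'that']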
def to_disk(self, path):
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
"""
raise NotImplementedError()
def from_disk(self, path):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (unicode or Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
RETURNS (Tokenizer): The modified `Tokenizer` object.
"""
raise NotImplementedError()
def to_bytes(self, **exclude):
"""Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `Tokenizer` object.
"""
raise NotImplementedError()
def from_bytes(self, bytes_data, **exclude):
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded.
RETURNS (Tokenizer): The `Tokenizer` object.
"""
raise NotImplementedError()

View File

@ -63,40 +63,30 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
cdef class Doc: cdef class Doc:
""" """A sequence of Token objects. Access sentences and named entities, export
A sequence of `Token` objects. Access sentences and named entities, annotations to numpy arrays, losslessly serialize to compressed binary strings.
export annotations to numpy arrays, losslessly serialize to compressed The `Doc` object holds an array of `TokenC` structs. The Python-level
binary strings. `Token` and `Span` objects are views of this array, i.e. they don't own
the data themselves.
Aside: Internals EXAMPLE: Construction 1
The `Doc` object holds an array of `TokenC` structs. >>> doc = nlp(u'Some text')
The Python-level `Token` and `Span` objects are views of this
array, i.e. they don't own the data themselves.
Code: Construction 1
doc = nlp.tokenizer(u'Some text')
Code: Construction 2
doc = Doc(nlp.vocab, orths_and_spaces=[(u'Some', True), (u'text', True)])
Construction 2
>>> from spacy.tokens import Doc
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
""" """
def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None): def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
""" """Create a Doc object.
Create a Doc object.
Arguments: vocab (Vocab): A vocabulary object, which must match any models you want
vocab: to use (e.g. tokenizer, parser, entity recognizer).
A Vocabulary object, which must match any models you want to words (list or None): A list of unicode strings to add to the document
use (e.g. tokenizer, parser, entity recognizer). as words. If `None`, defaults to empty list.
spaces (list or None): A list of boolean values, of the same length as
words: words. True means that the word is followed by a space, False means
A list of unicode strings to add to the document as words. If None, it is not. If `None`, defaults to `[True]*len(words)`
defaults to empty list. RETURNS (Doc): The newly constructed object.
spaces:
A list of boolean values, of the same length as words. True
means that the word is followed by a space, False means it is not.
If None, defaults to [True]*len(words)
""" """
self.vocab = vocab self.vocab = vocab
size = 20 size = 20
@ -158,20 +148,26 @@ cdef class Doc:
self.is_parsed = True self.is_parsed = True
def __getitem__(self, object i): def __getitem__(self, object i):
""" """Get a `Token` or `Span` object.
doc[i]
Get the Token object at position i, where i is an integer. i (int or tuple) The index of the token, or the slice of the document to get.
RETURNS (Token or Span): The token at `doc[i]`, or the span at
`doc[start : end]`.
EXAMPLE:
>>> doc[i]
Get the `Token` object at position `i`, where `i` is an integer.
Negative indexing is supported, and follows the usual Python Negative indexing is supported, and follows the usual Python
semantics, i.e. doc[-2] is doc[len(doc) - 2]. semantics, i.e. `doc[-2]` is `doc[len(doc) - 2]`.
doc[start : end]]
Get a `Span` object, starting at position `start` >>> doc[start : end]
and ending at position `end`, where `start` and Get a `Span` object, starting at position `start` and ending at
`end` are token indices. For instance, position `end`, where `start` and `end` are token indices. For
`doc[2:5]` produces a span consisting of instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4.
tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`) Stepped slices (e.g. `doc[start : end : step]`) are not supported,
are not supported, as `Span` objects must be contiguous (cannot have gaps). as `Span` objects must be contiguous (cannot have gaps). You can use
You can use negative indices and open-ended ranges, which have their negative indices and open-ended ranges, which have their normal
normal Python semantics. Python semantics.
""" """
if isinstance(i, slice): if isinstance(i, slice):
start, stop = normalize_slice(len(self), i.start, i.stop, i.step) start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
@ -186,14 +182,14 @@ cdef class Doc:
return Token.cinit(self.vocab, &self.c[i], i, self) return Token.cinit(self.vocab, &self.c[i], i, self)
def __iter__(self): def __iter__(self):
""" """Iterate over `Token` objects, from which the annotations can be
for token in doc easily accessed. This is the main way of accessing `Token` objects,
Iterate over `Token` objects, from which the annotations can which are the main way annotations are accessed from Python. If faster-
be easily accessed. This is the main way of accessing Token than-Python speeds are required, you can instead access the annotations
objects, which are the main way annotations are accessed from as a numpy array, or access the underlying C data directly from Cython.
Python. If faster-than-Python speeds are required, you can
instead access the annotations as a numpy array, or access the EXAMPLE:
underlying C data directly from Cython. >>> for token in doc
""" """
cdef int i cdef int i
for i in range(self.length): for i in range(self.length):
@ -203,9 +199,12 @@ cdef class Doc:
yield Token.cinit(self.vocab, &self.c[i], i, self) yield Token.cinit(self.vocab, &self.c[i], i, self)
def __len__(self): def __len__(self):
""" """The number of tokens in the document.
len(doc)
The number of tokens in the document. RETURNS (int): The number of tokens in the document.
EXAMPLE:
>>> len(doc)
""" """
return self.length return self.length
@ -228,16 +227,12 @@ cdef class Doc:
return self return self
def similarity(self, other): def similarity(self, other):
""" """Make a semantic similarity estimate. The default estimate is cosine
Make a semantic similarity estimate. The default estimate is cosine
similarity using an average of word vectors. similarity using an average of word vectors.
Arguments: other (object): The object to compare with. By default, accepts `Doc`,
other (object): The object to compare with. By default, accepts Doc, `Span`, `Token` and `Lexeme` objects.
Span, Token and Lexeme objects. RETURNS (float): A scalar similarity score. Higher is more similar.
Return:
score (float): A scalar similarity score. Higher is more similar.
""" """
if 'similarity' in self.user_hooks: if 'similarity' in self.user_hooks:
return self.user_hooks['similarity'](self, other) return self.user_hooks['similarity'](self, other)
@ -246,8 +241,10 @@ cdef class Doc:
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
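A small sketch of the similarity estimate described above. It assumes a model with word vectors is installed and loaded as `nlp`; without vectors the scores are not meaningful.

import spacy

nlp = spacy.load('en')                   # assumes a model with word vectors
doc1 = nlp(u'I like apples.')
doc2 = nlp(u'I like oranges.')
print(doc1.similarity(doc2))             # scalar score; higher means more similar
print(doc1[2].similarity(doc2[2]))       # also works between Token objects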
property has_vector: property has_vector:
""" """A boolean value indicating whether a word vector is associated with
A boolean value indicating whether a word vector is associated with the object. the object.
RETURNS (bool): Whether a word vector is associated with the object.
""" """
def __get__(self): def __get__(self):
if 'has_vector' in self.user_hooks: if 'has_vector' in self.user_hooks:
@ -256,10 +253,11 @@ cdef class Doc:
return any(token.has_vector for token in self) return any(token.has_vector for token in self)
property vector: property vector:
""" """A real-valued meaning representation. Defaults to an average of the
A real-valued meaning representation. Defaults to an average of the token vectors. token vectors.
Type: numpy.ndarray[ndim=1, dtype='float32'] RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
representing the document's semantics.
""" """
def __get__(self): def __get__(self):
if 'vector' in self.user_hooks: if 'vector' in self.user_hooks:
@ -275,6 +273,10 @@ cdef class Doc:
self._vector = value self._vector = value
property vector_norm: property vector_norm:
"""The L2 norm of the document's vector representation.
RETURNS (float): The L2 norm of the vector representation.
"""
def __get__(self): def __get__(self):
if 'vector_norm' in self.user_hooks: if 'vector_norm' in self.user_hooks:
return self.user_hooks['vector_norm'](self) return self.user_hooks['vector_norm'](self)
@ -295,34 +297,37 @@ cdef class Doc:
return self.text return self.text
property text: property text:
""" """A unicode representation of the document text.
A unicode representation of the document text.
RETURNS (unicode): The original verbatim text of the document.
""" """
def __get__(self): def __get__(self):
return u''.join(t.text_with_ws for t in self) return u''.join(t.text_with_ws for t in self)
property text_with_ws: property text_with_ws:
""" """An alias of `Doc.text`, provided for duck-type compatibility with
An alias of Doc.text, provided for duck-type compatibility with Span and Token. `Span` and `Token`.
RETURNS (unicode): The original verbatim text of the document.
""" """
def __get__(self): def __get__(self):
return self.text return self.text
property ents: property ents:
""" """Iterate over the entities in the document. Yields named-entity `Span`
Yields named-entity `Span` objects, if the entity recognizer objects, if the entity recognizer has been applied to the document.
has been applied to the document. Iterate over the span to get
individual Token objects, or access the label:
Example: YIELDS (Span): Entities in the document.
from spacy.en import English
nlp = English() EXAMPLE: Iterate over the span to get individual Token objects, or access
tokens = nlp(u'Mr. Best flew to New York on Saturday morning.') the label:
ents = list(tokens.ents)
assert ents[0].label == 346 >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
assert ents[0].label_ == 'PERSON' >>> ents = list(tokens.ents)
assert ents[0].orth_ == 'Best' >>> assert ents[0].label == 346
assert ents[0].text == 'Mr. Best' >>> assert ents[0].label_ == 'PERSON'
>>> assert ents[0].orth_ == 'Best'
>>> assert ents[0].text == 'Mr. Best'
""" """
def __get__(self): def __get__(self):
cdef int i cdef int i
@ -387,12 +392,13 @@ cdef class Doc:
self.c[start].ent_iob = 3 self.c[start].ent_iob = 3
property noun_chunks: property noun_chunks:
""" """Iterate over the base noun phrases in the document. Yields base
Yields base noun-phrase #[code Span] objects, if the document noun-phrase `Span` objects, if the document has been syntactically
has been syntactically parsed. A base noun phrase, or parsed. A base noun phrase, or "NP chunk", is a noun phrase that does
'NP chunk', is a noun phrase that does not permit other NPs to not permit other NPs to be nested within it so no NP-level
be nested within it so no NP-level coordination, no prepositional coordination, no prepositional phrases, and no relative clauses.
phrases, and no relative clauses.
YIELDS (Span): Noun chunks in the document.
""" """
def __get__(self): def __get__(self):
if not self.is_parsed: if not self.is_parsed:
@ -411,17 +417,15 @@ cdef class Doc:
yield span yield span
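A brief sketch of iterating the entity and chunk views, assuming a parsing pipeline with an entity recognizer loaded as `nlp`.

doc = nlp(u'Mr. Best flew to New York on Saturday morning.')
print([(ent.text, ent.label_) for ent in doc.ents])   # named-entity spans
print([chunk.text for chunk in doc.noun_chunks])      # base noun phrases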
property sents: property sents:
""" """Iterate over the sentences in the document. Yields sentence `Span`
Yields sentence `Span` objects. Sentence spans have no label. objects. Sentence spans have no label. To improve accuracy on informal
To improve accuracy on informal texts, spaCy calculates sentence texts, spaCy calculates sentence boundaries from the syntactic
boundaries from the syntactic dependency parse. If the parser is disabled, dependency parse. If the parser is disabled, the `sents` iterator will
`sents` iterator will be unavailable. be unavailable.
Example: EXAMPLE:
from spacy.en import English >>> doc = nlp("This is a sentence. Here's another...")
nlp = English() >>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
doc = nlp("This is a sentence. Here's another...")
assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
""" """
def __get__(self): def __get__(self):
if 'sents' in self.user_hooks: if 'sents' in self.user_hooks:
@ -467,24 +471,20 @@ cdef class Doc:
@cython.boundscheck(False) @cython.boundscheck(False)
cpdef np.ndarray to_array(self, object py_attr_ids): cpdef np.ndarray to_array(self, object py_attr_ids):
""" """Given a list of M attribute IDs, export the tokens to a numpy
Given a list of M attribute IDs, export the tokens to a numpy `ndarray` of shape `(N, M)`, where `N` is the length of the document.
`ndarray` of shape (N, M), where `N` is the length The values will be 32-bit integers.
of the document. The values will be 32-bit integers.
Example: attr_ids (list[int]): A list of attribute ID ints.
from spacy import attrs RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
doc = nlp(text) per word, and one column per attribute indicated in the input
# All strings mapped to integers, for easy export to numpy `attr_ids`.
np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA])
Arguments: EXAMPLE:
attr_ids (list[int]): A list of attribute ID ints. >>> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
>>> doc = nlp(text)
Returns: >>> # All strings mapped to integers, for easy export to numpy
feat_array (numpy.ndarray[long, ndim=2]): >>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
A feature matrix, with one row per word, and one column per attribute
indicated in the input attr_ids.
""" """
cdef int i, j cdef int i, j
cdef attr_id_t feature cdef attr_id_t feature
@ -499,27 +499,20 @@ cdef class Doc:
return output return output
def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
""" """Count the frequencies of a given attribute. Produces a dict of
Produce a dict of {attribute (int): count (ints)} frequencies, keyed `{attribute (int): count (ints)}` frequencies, keyed by the values of
by the values of the given attribute ID. the given attribute ID.
Example: attr_id (int): The attribute ID to key the counts.
from spacy.en import English RETURNS (dict): A dictionary mapping attributes to integer counts.
from spacy import attrs
nlp = English()
tokens = nlp(u'apple apple orange banana')
tokens.count_by(attrs.ORTH)
# {12800L: 1, 11880L: 2, 7561L: 1}
tokens.to_array([attrs.ORTH])
# array([[11880],
# [11880],
# [ 7561],
# [12800]])
Arguments: EXAMPLE:
attr_id >>> from spacy import attrs
int >>> doc = nlp(u'apple apple orange banana')
The attribute ID to key the counts. >>> tokens.count_by(attrs.ORTH)
{12800L: 1, 11880L: 2, 7561L: 1}
>>> tokens.to_array([attrs.ORTH])
array([[11880], [11880], [7561], [12800]])
""" """
cdef int i cdef int i
cdef attr_t attr cdef attr_t attr
@ -567,8 +560,12 @@ cdef class Doc:
self.c[i] = parsed[i] self.c[i] = parsed[i]
def from_array(self, attrs, int[:, :] array): def from_array(self, attrs, int[:, :] array):
""" """Load attributes from a numpy array. Write to a `Doc` object, from an
Write to a `Doc` object, from an `(M, N)` array of attributes. `(M, N)` array of attributes.
attrs (ints): A list of attribute ID ints.
array (numpy.ndarray[ndim=2, dtype='int32']) The attribute values to load.
RETURNS (Doc): Itself.
""" """
cdef int i, col cdef int i, col
cdef attr_id_t attr_id cdef attr_id_t attr_id
@ -597,8 +594,10 @@ cdef class Doc:
return self return self
def to_bytes(self): def to_bytes(self):
""" """Serialize, i.e. export the document contents to a binary string.
Serialize, producing a byte string.
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
all annotations.
""" """
return dill.dumps( return dill.dumps(
(self.text, (self.text,
@ -611,8 +610,10 @@ cdef class Doc:
protocol=-1) protocol=-1)
def from_bytes(self, data): def from_bytes(self, data):
""" """Deserialize, i.e. import the document contents from a binary string.
Deserialize, loading from bytes.
data (bytes): The string to load from.
RETURNS (Doc): Itself.
""" """
if self.length != 0: if self.length != 0:
raise ValueError("Cannot load into non-empty Doc") raise ValueError("Cannot load into non-empty Doc")
@ -640,21 +641,16 @@ cdef class Doc:
return self return self
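A hedged sketch of the serialization round trip: the byte string from `to_bytes` is loaded into a fresh, empty `Doc` sharing the same vocabulary, since loading into a non-empty `Doc` raises a `ValueError`; the `doc` variable is assumed to come from one of the examples above.

from spacy.tokens import Doc

data = doc.to_bytes()                    # losslessly serialized copy of `doc`
new_doc = Doc(doc.vocab)                 # an empty Doc over the same vocab
new_doc.from_bytes(data)                 # returns the modified Doc itself
assert new_doc.text == doc.text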
def merge(self, int start_idx, int end_idx, *args, **attributes): def merge(self, int start_idx, int end_idx, *args, **attributes):
""" """Retokenize the document, such that the span at `doc.text[start_idx : end_idx]`
Retokenize the document, such that the span at doc.text[start_idx : end_idx] is merged into a single token. If `start_idx` and `end_idx` do not mark
is merged into a single token. If start_idx and end_idx do not mark start start and end token boundaries, the document remains unchanged.
and end token boundaries, the document remains unchanged.
Arguments: start_idx (int): The character index of the start of the slice to merge.
start_idx (int): The character index of the start of the slice to merge. end_idx (int): The character index after the end of the slice to merge.
end_idx (int): The character index after the end of the slice to merge. **attributes: Attributes to assign to the merged token. By default,
**attributes: attributes are inherited from the syntactic root token of the span.
Attributes to assign to the merged token. By default, attributes RETURNS (Token): The newly merged token, or `None` if the start and end
are inherited from the syntactic root token of the span. indices did not fall at token boundaries.
Returns:
token (Token):
The newly merged token, or None if the start and end indices did
not fall at token boundaries.
""" """
cdef unicode tag, lemma, ent_type cdef unicode tag, lemma, ent_type
if len(args) == 3: if len(args) == 3:
@ -758,7 +754,29 @@ cdef class Doc:
return self[start] return self[start]
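To make the character-offset arguments concrete, here is a sketch that merges "New York" into a single token. `span.start_char` and `span.end_char` supply the offsets, and attributes are inherited from the span's syntactic root, per the docstring; `nlp` and the exact tokenization are assumptions about an installed English model.

doc = nlp(u'I like New York in Autumn.')
span = doc[2:4]                               # the tokens "New" and "York" (assumed tokenization)
doc.merge(span.start_char, span.end_char)     # character offsets of the slice
assert doc[2].text == u'New York'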
def print_tree(self, light=False, flat=False): def print_tree(self, light=False, flat=False):
"""Returns the parse trees in the JSON (Dict) format.""" """Returns the parse trees in JSON (dict) format.
light (bool): Don't include lemmas or entities.
flat (bool): Don't include arcs or modifiers.
RETURNS (dict): Parse tree as dict.
EXAMPLE:
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
>>> trees = doc.print_tree()
>>> trees[1]
{'modifiers': [
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
{'modifiers': [
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
'POS_fine': 'NN', 'lemma': 'pizza'},
{'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
'POS_fine': 'VBD', 'lemma': 'eat'}
"""
return parse_tree(self, light=light, flat=flat) return parse_tree(self, light=light, flat=flat)

View File

@ -6,18 +6,14 @@ from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE
def merge_ents(doc): def merge_ents(doc):
""" """Helper: merge adjacent entities into single tokens; modifies the doc."""
Helper: merge adjacent entities into single tokens; modifies the doc.
"""
for ent in doc.ents: for ent in doc.ents:
ent.merge(ent.root.tag_, ent.text, ent.label_) ent.merge(ent.root.tag_, ent.text, ent.label_)
return doc return doc
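A hedged usage sketch for `merge_ents`, assuming `nlp` is a loaded pipeline with an entity recognizer and that the helper is importable from this module (the path below is an assumption): after the call, each multi-token entity is collapsed into a single token.

from spacy.tokens.printers import merge_ents  # assumed module path

doc = nlp(u'Mr. Best flew to New York on Saturday morning.')
merge_ents(doc)                               # e.g. "New York" becomes one token
print([t.text for t in doc])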
def format_POS(token, light, flat): def format_POS(token, light, flat):
""" """Helper: form the POS output for a token."""
Helper: form the POS output for a token.
"""
subtree = dict([ subtree = dict([
("word", token.text), ("word", token.text),
("lemma", token.lemma_), # trigger ("lemma", token.lemma_), # trigger
@ -37,9 +33,8 @@ def format_POS(token, light, flat):
def POS_tree(root, light=False, flat=False): def POS_tree(root, light=False, flat=False):
""" """Helper: generate a POS tree for a root token. The doc must have
Helper: generate a POS tree for a root token. The doc must have `merge_ents(doc)` ran on it.
merge_ents(doc) ran on it.
""" """
subtree = format_POS(root, light=light, flat=flat) subtree = format_POS(root, light=light, flat=flat)
for c in root.children: for c in root.children:
@ -48,21 +43,28 @@ def POS_tree(root, light=False, flat=False):
def parse_tree(doc, light=False, flat=False): def parse_tree(doc, light=False, flat=False):
""" """Makes a copy of the doc, then construct a syntactic parse tree, similar to
Makes a copy of the doc, then construct a syntactic parse tree, similar to
the one used in displaCy. Generates the POS tree for all sentences in a doc. the one used in displaCy. Generates the POS tree for all sentences in a doc.
Args: doc (Doc): The doc for parsing.
doc: The doc for parsing. RETURNS (dict): The parse tree.
Returns: EXAMPLE:
[parse_trees (Dict)]: >>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
>>> trees = doc.print_tree()
>>> from spacy.en import English >>> trees[1]
>>> nlp = English() {'modifiers': [
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.') {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
>>> trees = doc.print_tree() 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
[{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}] {'modifiers': [
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
'POS_fine': 'NN', 'lemma': 'pizza'},
{'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
'POS_fine': 'VBD', 'lemma': 'eat'}
""" """
doc_clone = Doc(doc.vocab, words=[w.text for w in doc]) doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE], doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE],

View File

@ -20,22 +20,17 @@ from .. import about
cdef class Span: cdef class Span:
""" """A slice from a Doc object."""
A slice from a Doc object.
"""
def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None, def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
vector_norm=None): vector_norm=None):
""" """Create a `Span` object from the slice `doc[start : end]`.
Create a Span object from the slice doc[start : end]
Arguments: doc (Doc): The parent document.
doc (Doc): The parent document. start (int): The index of the first token of the span.
start (int): The index of the first token of the span. end (int): The index of the first token after the span.
end (int): The index of the first token after the span. label (int): A label to attach to the Span, e.g. for named entities.
label (int): A label to attach to the Span, e.g. for named entities. vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. RETURNS (Span): The newly constructed object.
Returns:
Span The newly constructed object.
""" """
if not (0 <= start <= end <= len(doc)): if not (0 <= start <= end <= len(doc)):
raise IndexError raise IndexError
@ -70,8 +65,11 @@ cdef class Span:
def __hash__(self): def __hash__(self):
return hash((self.doc, self.label, self.start_char, self.end_char)) return hash((self.doc, self.label, self.start_char, self.end_char))
def __len__(self): def __len__(self):
"""Get the number of tokens in the span.
RETURNS (int): The number of tokens in the span.
"""
self._recalculate_indices() self._recalculate_indices()
if self.end < self.start: if self.end < self.start:
return 0 return 0
@ -83,6 +81,16 @@ cdef class Span:
return self.text.encode('utf-8') return self.text.encode('utf-8')
def __getitem__(self, object i): def __getitem__(self, object i):
"""Get a `Token` or a `Span` object
i (int or tuple): The index of the token within the span, or slice of
the span to get.
RETURNS (Token or Span): The token at `span[i]`.
EXAMPLE:
>>> span[0]
>>> span[1:3]
"""
self._recalculate_indices() self._recalculate_indices()
if isinstance(i, slice): if isinstance(i, slice):
start, end = normalize_slice(len(self), i.start, i.stop, i.step) start, end = normalize_slice(len(self), i.start, i.stop, i.step)
@ -94,35 +102,31 @@ cdef class Span:
return self.doc[self.start + i] return self.doc[self.start + i]
def __iter__(self): def __iter__(self):
"""Iterate over `Token` objects.
YIELDS (Token): A `Token` object.
"""
self._recalculate_indices() self._recalculate_indices()
for i in range(self.start, self.end): for i in range(self.start, self.end):
yield self.doc[i] yield self.doc[i]
def merge(self, *args, **attributes): def merge(self, *args, **attributes):
""" """Retokenize the document, such that the span is merged into a single
Retokenize the document, such that the span is merged into a single token. token.
Arguments: **attributes: Attributes to assign to the merged token. By default,
**attributes: attributes are inherited from the syntactic root token of the span.
Attributes to assign to the merged token. By default, attributes RETURNS (Token): The newly merged token.
are inherited from the syntactic root token of the span.
Returns:
token (Token):
The newly merged token.
""" """
return self.doc.merge(self.start_char, self.end_char, *args, **attributes) return self.doc.merge(self.start_char, self.end_char, *args, **attributes)
def similarity(self, other): def similarity(self, other):
""" """Make a semantic similarity estimate. The default estimate is cosine
Make a semantic similarity estimate. The default estimate is cosine
similarity using an average of word vectors. similarity using an average of word vectors.
Arguments: other (object): The object to compare with. By default, accepts `Doc`,
other (object): The object to compare with. By default, accepts Doc, `Span`, `Token` and `Lexeme` objects.
Span, Token and Lexeme objects. RETURNS (float): A scalar similarity score. Higher is more similar.
Return:
score (float): A scalar similarity score. Higher is more similar.
""" """
if 'similarity' in self.doc.user_span_hooks: if 'similarity' in self.doc.user_span_hooks:
self.doc.user_span_hooks['similarity'](self, other) self.doc.user_span_hooks['similarity'](self, other)
@ -145,11 +149,9 @@ cdef class Span:
self.end = end + 1 self.end = end + 1
property sent: property sent:
""" """The sentence span that this span is a part of.
The sentence span that this span is a part of.
Returns: RETURNS (Span): The sentence span that the span is a part of.
Span The sentence this is part of.
""" """
def __get__(self): def __get__(self):
if 'sent' in self.doc.user_span_hooks: if 'sent' in self.doc.user_span_hooks:
@ -166,12 +168,23 @@ cdef class Span:
return self.doc[root.l_edge : root.r_edge + 1] return self.doc[root.l_edge : root.r_edge + 1]
property has_vector: property has_vector:
"""A boolean value indicating whether a word vector is associated with
the object.
RETURNS (bool): Whether a word vector is associated with the object.
"""
def __get__(self): def __get__(self):
if 'has_vector' in self.doc.user_span_hooks: if 'has_vector' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['has_vector'](self) return self.doc.user_span_hooks['has_vector'](self)
return any(token.has_vector for token in self) return any(token.has_vector for token in self)
property vector: property vector:
"""A real-valued meaning representation. Defaults to an average of the
token vectors.
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
representing the span's semantics.
"""
def __get__(self): def __get__(self):
if 'vector' in self.doc.user_span_hooks: if 'vector' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['vector'](self) return self.doc.user_span_hooks['vector'](self)
@ -180,6 +193,10 @@ cdef class Span:
return self._vector return self._vector
property vector_norm: property vector_norm:
"""The L2 norm of the document's vector representation.
RETURNS (float): The L2 norm of the vector representation.
"""
def __get__(self): def __get__(self):
if 'vector_norm' in self.doc.user_span_hooks: if 'vector_norm' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['vector'](self) return self.doc.user_span_hooks['vector'](self)
@ -193,6 +210,7 @@ cdef class Span:
return self._vector_norm return self._vector_norm
property sentiment: property sentiment:
# TODO: docstring
def __get__(self): def __get__(self):
if 'sentiment' in self.doc.user_span_hooks: if 'sentiment' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['sentiment'](self) return self.doc.user_span_hooks['sentiment'](self)
@ -200,6 +218,10 @@ cdef class Span:
return sum([token.sentiment for token in self]) / len(self) return sum([token.sentiment for token in self]) / len(self)
property text: property text:
"""A unicode representation of the span text.
RETURNS (unicode): The original verbatim text of the span.
"""
def __get__(self): def __get__(self):
text = self.text_with_ws text = self.text_with_ws
if self[-1].whitespace_: if self[-1].whitespace_:
@ -207,16 +229,21 @@ cdef class Span:
return text return text
property text_with_ws: property text_with_ws:
"""The text content of the span with a trailing whitespace character if
the last token has one.
RETURNS (unicode): The text content of the span (with trailing whitespace).
"""
def __get__(self): def __get__(self):
return u''.join([t.text_with_ws for t in self]) return u''.join([t.text_with_ws for t in self])
property noun_chunks: property noun_chunks:
""" """Yields base noun-phrase `Span` objects, if the document has been
Yields base noun-phrase #[code Span] objects, if the document syntactically parsed. A base noun phrase, or "NP chunk", is a noun
has been syntactically parsed. A base noun phrase, or phrase that does not permit other NPs to be nested within it so no
'NP chunk', is a noun phrase that does not permit other NPs to NP-level coordination, no prepositional phrases, and no relative clauses.
be nested within it so no NP-level coordination, no prepositional
phrases, and no relative clauses. For example: YIELDS (Span): Base noun-phrase `Span` objects
""" """
def __get__(self): def __get__(self):
if not self.doc.is_parsed: if not self.doc.is_parsed:
@ -235,49 +262,47 @@ cdef class Span:
yield span yield span
property root: property root:
""" """The token within the span that's highest in the parse tree.
The token within the span that's highest in the parse tree. If there's a If there's a tie, the earliest is preferred.
tie, the earlist is prefered.
Returns: RETURNS (Token): The root token.
Token: The root token.
i.e. has the shortest path to the root of the sentence (or is the root EXAMPLE: The root token has the shortest path to the root of the sentence
itself). If multiple words are equally high in the tree, the first word (or is the root itself). If multiple words are equally high in the
is taken. For example: tree, the first word is taken. For example:
>>> toks = nlp(u'I like New York in Autumn.') >>> toks = nlp(u'I like New York in Autumn.')
Let's name the indices --- easier than writing "toks[4]" etc. Let's name the indices, which is easier than writing `toks[4]` etc.
>>> i, like, new, york, in_, autumn, dot = range(len(toks)) >>> i, like, new, york, in_, autumn, dot = range(len(toks))
The head of 'new' is 'York', and the head of 'York' is 'like' The head of 'new' is 'York', and the head of "York" is "like"
>>> toks[new].head.orth_ >>> toks[new].head.text
'York' 'York'
>>> toks[york].head.orth_ >>> toks[york].head.text
'like' 'like'
Create a span for "New York". Its root is "York". Create a span for "New York". Its root is "York".
>>> new_york = toks[new:york+1] >>> new_york = toks[new:york+1]
>>> new_york.root.orth_ >>> new_york.root.text
'York' 'York'
Here's a more complicated case, raise by Issue #214 Here's a more complicated case, raised by issue #214:
>>> toks = nlp(u'to, north and south carolina') >>> toks = nlp(u'to, north and south carolina')
>>> to, north, and_, south, carolina = toks >>> to, north, and_, south, carolina = toks
>>> south.head.text, carolina.head.text >>> south.head.text, carolina.head.text
('north', 'to') ('north', 'to')
Here 'south' is a child of 'north', which is a child of 'carolina'. Here "south" is a child of "north", which is a child of "carolina".
Carolina is the root of the span: Carolina is the root of the span:
>>> south_carolina = toks[-2:] >>> south_carolina = toks[-2:]
>>> south_carolina.root.text >>> south_carolina.root.text
'carolina' 'carolina'
""" """
def __get__(self): def __get__(self):
self._recalculate_indices() self._recalculate_indices()
@ -314,10 +339,10 @@ cdef class Span:
return self.doc[root] return self.doc[root]
property lefts: property lefts:
""" """ Tokens that are to the left of the span, whose head is within the
Tokens that are to the left of the span, whose head is within the Span. `Span`.
Yields: Token A left-child of a token of the span. YIELDS (Token): A left-child of a token of the span.
""" """
def __get__(self): def __get__(self):
for token in reversed(self): # Reverse, so we get the tokens in order for token in reversed(self): # Reverse, so we get the tokens in order
@ -326,10 +351,10 @@ cdef class Span:
yield left yield left
property rights: property rights:
""" """Tokens that are to the right of the Span, whose head is within the
Tokens that are to the right of the Span, whose head is within the Span. `Span`.
Yields: Token A right-child of a token of the span. YIELDS (Token): A right-child of a token of the span.
""" """
def __get__(self): def __get__(self):
for token in self: for token in self:
@ -338,10 +363,9 @@ cdef class Span:
yield right yield right
property subtree: property subtree:
""" """Tokens that descend from tokens in the span, but fall outside it.
Tokens that descend from tokens in the span, but fall outside it.
Yields: Token A descendant of a token within the span. YIELDS (Token): A descendant of a token within the span.
""" """
def __get__(self): def __get__(self):
for word in self.lefts: for word in self.lefts:
@ -351,8 +375,9 @@ cdef class Span:
yield from word.subtree yield from word.subtree
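A quick sketch of the span navigation properties, assuming a parsing pipeline loaded as `nlp`. The exact output depends on the parse, so the values are printed rather than asserted.

doc = nlp(u'I like New York in Autumn.')
span = doc[2:4]                               # "New York"
print([t.text for t in span.lefts])           # tokens left of the span, headed inside it
print([t.text for t in span.rights])          # tokens right of the span, headed inside it
print([t.text for t in span.subtree])         # tokens descending from the span's tokens
print(span.root.text)                         # the span's highest token in the parse tree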
property ent_id: property ent_id:
""" """An (integer) entity ID. Usually assigned by patterns in the `Matcher`.
An (integer) entity ID. Usually assigned by patterns in the Matcher.
RETURNS (int): The entity ID.
""" """
def __get__(self): def __get__(self):
return self.root.ent_id return self.root.ent_id
@ -362,9 +387,11 @@ cdef class Span:
raise NotImplementedError( raise NotImplementedError(
"Can't yet set ent_id from Span. Vote for this feature on the issue " "Can't yet set ent_id from Span. Vote for this feature on the issue "
"tracker: http://github.com/explosion/spaCy/issues") "tracker: http://github.com/explosion/spaCy/issues")
property ent_id_: property ent_id_:
""" """A (string) entity ID. Usually assigned by patterns in the `Matcher`.
A (string) entity ID. Usually assigned by patterns in the Matcher.
RETURNS (unicode): The entity ID.
""" """
def __get__(self): def __get__(self):
return self.root.ent_id_ return self.root.ent_id_
@ -376,26 +403,38 @@ cdef class Span:
"tracker: http://github.com/explosion/spaCy/issues") "tracker: http://github.com/explosion/spaCy/issues")
property orth_: property orth_:
# TODO: docstring
def __get__(self): def __get__(self):
return ''.join([t.string for t in self]).strip() return ''.join([t.string for t in self]).strip()
property lemma_: property lemma_:
"""The span's lemma.
RETURNS (unicode): The span's lemma.
"""
def __get__(self): def __get__(self):
return ' '.join([t.lemma_ for t in self]).strip() return ' '.join([t.lemma_ for t in self]).strip()
property upper_: property upper_:
# TODO: docstring
def __get__(self): def __get__(self):
return ''.join([t.string.upper() for t in self]).strip() return ''.join([t.string.upper() for t in self]).strip()
property lower_: property lower_:
# TODO: docstring
def __get__(self): def __get__(self):
return ''.join([t.string.lower() for t in self]).strip() return ''.join([t.string.lower() for t in self]).strip()
property string: property string:
# TODO: docstring
def __get__(self): def __get__(self):
return ''.join([t.string for t in self]) return ''.join([t.string for t in self])
property label_: property label_:
"""The span's label.
RETURNS (unicode): The span's label.
"""
def __get__(self): def __get__(self):
return self.doc.vocab.strings[self.label] return self.doc.vocab.strings[self.label]

View File

@ -23,10 +23,14 @@ from .. import about
cdef class Token: cdef class Token:
""" """An individual token i.e. a word, punctuation symbol, whitespace, etc."""
An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
"""
def __cinit__(self, Vocab vocab, Doc doc, int offset): def __cinit__(self, Vocab vocab, Doc doc, int offset):
"""Construct a `Token` object.
vocab (Vocab): A storage container for lexical types.
doc (Doc): The parent document.
offset (int): The index of the token within the document.
"""
self.vocab = vocab self.vocab = vocab
self.doc = doc self.doc = doc
self.c = &self.doc.c[offset] self.c = &self.doc.c[offset]
@ -36,8 +40,9 @@ cdef class Token:
return hash((self.doc, self.i)) return hash((self.doc, self.i))
def __len__(self): def __len__(self):
""" """The number of unicode characters in the token, i.e. `token.text`.
Number of unicode characters in token.text.
RETURNS (int): The number of unicode characters in the token.
""" """
return self.c.lex.length return self.c.lex.length
@ -75,37 +80,35 @@ cdef class Token:
raise ValueError(op) raise ValueError(op)
cpdef bint check_flag(self, attr_id_t flag_id) except -1: cpdef bint check_flag(self, attr_id_t flag_id) except -1:
""" """Check the value of a boolean flag.
Check the value of a boolean flag.
Arguments: flag_id (int): The ID of the flag attribute.
flag_id (int): The ID of the flag attribute. RETURNS (bool): Whether the flag is set.
Returns:
is_set (bool): Whether the flag is set. EXAMPLE:
>>> from spacy.attrs import IS_TITLE
>>> doc = nlp(u'Give it back! He pleaded.')
>>> token = doc[0]
>>> token.check_flag(IS_TITLE)
True
""" """
return Lexeme.c_check_flag(self.c.lex, flag_id) return Lexeme.c_check_flag(self.c.lex, flag_id)
def nbor(self, int i=1): def nbor(self, int i=1):
""" """Get a neighboring token.
Get a neighboring token.
Arguments: i (int): The relative position of the token to get. Defaults to 1.
i (int): The relative position of the token to get. Defaults to 1. RETURNS (Token): The token at position `self.doc[self.i+i]`.
Returns:
neighbor (Token): The token at position self.doc[self.i+i]
""" """
return self.doc[self.i+i] return self.doc[self.i+i]
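A short sketch of `nbor`, reusing the example sentence from `check_flag` above; `nlp` is assumed to be a loaded pipeline.

doc = nlp(u'Give it back! He pleaded.')
token = doc[1]                                # "it"
assert token.nbor().text == doc[2].text       # i defaults to 1: the next token
assert token.nbor(-1).text == doc[0].text     # negative offsets look backwards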
def similarity(self, other): def similarity(self, other):
""" """Make a semantic similarity estimate. The default estimate is cosine
Compute a semantic similarity estimate. Defaults to cosine over vectors. similarity using an average of word vectors.
Arguments: other (object): The object to compare with. By default, accepts `Doc`,
other: `Span`, `Token` and `Lexeme` objects.
The object to compare with. By default, accepts Doc, Span, RETURNS (float): A scalar similarity score. Higher is more similar.
Token and Lexeme objects.
Returns:
score (float): A scalar similarity score. Higher is more similar.
""" """
if 'similarity' in self.doc.user_token_hooks: if 'similarity' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['similarity'](self) return self.doc.user_token_hooks['similarity'](self)
@ -114,10 +117,14 @@ cdef class Token:
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
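        # Usage sketch, assuming an `nlp` pipeline whose vocabulary includes word vectors:
        #     doc = nlp(u'dog cat')
        #     score = doc[0].similarity(doc[1])   # scalar; higher means more similar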
property lex_id: property lex_id:
"""ID of the token's lexical type.
RETURNS (int): ID of the token's lexical type."""
def __get__(self): def __get__(self):
return self.c.lex.id return self.c.lex.id
property rank: property rank:
# TODO: add docstring
def __get__(self): def __get__(self):
return self.c.lex.id return self.c.lex.id
@ -126,10 +133,19 @@ cdef class Token:
return self.text_with_ws return self.text_with_ws
property text: property text:
"""A unicode representation of the token text.
RETURNS (unicode): The original verbatim text of the token.
"""
def __get__(self): def __get__(self):
return self.orth_ return self.orth_
property text_with_ws: property text_with_ws:
"""The text content of the token with a trailing whitespace character if
it has one.
        RETURNS (unicode): The text content of the token (with trailing whitespace).
"""
def __get__(self): def __get__(self):
cdef unicode orth = self.vocab.strings[self.c.lex.orth] cdef unicode orth = self.vocab.strings[self.c.lex.orth]
if self.c.spacy: if self.c.spacy:
@ -184,6 +200,10 @@ cdef class Token:
return self.c.lex.suffix return self.c.lex.suffix
property lemma: property lemma:
"""Base form of the word, with no inflectional suffixes.
RETURNS (int): Token lemma.
"""
def __get__(self): def __get__(self):
return self.c.lemma return self.c.lemma
def __set__(self, int lemma): def __set__(self, int lemma):
@ -206,8 +226,10 @@ cdef class Token:
self.c.dep = label self.c.dep = label
property has_vector: property has_vector:
""" """A boolean value indicating whether a word vector is associated with
A boolean value indicating whether a word vector is associated with the object. the object.
RETURNS (bool): Whether a word vector is associated with the object.
""" """
def __get__(self): def __get__(self):
if 'has_vector' in self.doc.user_token_hooks: if 'has_vector' in self.doc.user_token_hooks:
@ -220,10 +242,10 @@ cdef class Token:
return False return False
property vector: property vector:
""" """A real-valued meaning representation.
A real-valued meaning representation.
Type: numpy.ndarray[ndim=1, dtype='float32'] RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
representing the token's semantics.
""" """
def __get__(self): def __get__(self):
if 'vector' in self.doc.user_token_hooks: if 'vector' in self.doc.user_token_hooks:
@ -239,15 +261,11 @@ cdef class Token:
vector_view = <float[:length,]>self.c.lex.vector vector_view = <float[:length,]>self.c.lex.vector
return numpy.asarray(vector_view) return numpy.asarray(vector_view)
property repvec:
def __get__(self):
raise AttributeError("repvec was renamed to vector in v0.100")
property has_repvec:
def __get__(self):
raise AttributeError("has_repvec was renamed to has_vector in v0.100")
property vector_norm: property vector_norm:
"""The L2 norm of the token's vector representation.
RETURNS (float): The L2 norm of the vector representation.
"""
def __get__(self): def __get__(self):
if 'vector_norm' in self.doc.user_token_hooks: if 'vector_norm' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['vector_norm'](self) return self.doc.user_token_hooks['vector_norm'](self)
@ -324,28 +342,26 @@ cdef class Token:
yield from word.subtree yield from word.subtree
property left_edge: property left_edge:
""" """The leftmost token of this token's syntactic descendents.
The leftmost token of this token's syntactic descendents.
Returns: Token The first token such that self.is_ancestor(token) RETURNS (Token): The first token such that `self.is_ancestor(token)`.
""" """
def __get__(self): def __get__(self):
return self.doc[self.c.l_edge] return self.doc[self.c.l_edge]
property right_edge: property right_edge:
""" """The rightmost token of this token's syntactic descendents.
The rightmost token of this token's syntactic descendents.
Returns: Token The last token such that self.is_ancestor(token) RETURNS (Token): The last token such that `self.is_ancestor(token)`.
""" """
def __get__(self): def __get__(self):
return self.doc[self.c.r_edge] return self.doc[self.c.r_edge]
property ancestors: property ancestors:
""" """A sequence of this token's syntactic ancestors.
A sequence of this token's syntactic ancestors.
Yields: Token A sequence of ancestor tokens such that ancestor.is_ancestor(self) YIELDS (Token): A sequence of ancestor tokens such that
`ancestor.is_ancestor(self)`.
""" """
def __get__(self): def __get__(self):
cdef const TokenC* head_ptr = self.c cdef const TokenC* head_ptr = self.c
@ -357,33 +373,25 @@ cdef class Token:
yield self.doc[head_ptr - (self.c - self.i)] yield self.doc[head_ptr - (self.c - self.i)]
i += 1 i += 1
def is_ancestor_of(self, descendant):
# TODO: Remove after backward compatibility check.
return self.is_ancestor(descendant)
def is_ancestor(self, descendant): def is_ancestor(self, descendant):
""" """Check whether this token is a parent, grandparent, etc. of another
Check whether this token is a parent, grandparent, etc. of another
in the dependency tree. in the dependency tree.
Arguments: descendant (Token): Another token.
descendant (Token): Another token. RETURNS (bool): Whether this token is the ancestor of the descendant.
Returns:
is_ancestor (bool): Whether this token is the ancestor of the descendant.
""" """
if self.doc is not descendant.doc: if self.doc is not descendant.doc:
return False return False
return any( ancestor.i == self.i for ancestor in descendant.ancestors ) return any( ancestor.i == self.i for ancestor in descendant.ancestors )
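        # Usage sketch, assuming an `nlp` pipeline with a dependency parser (the
        # exact tree depends on the model):
        #     doc = nlp(u'He ate pizza')
        #     doc[1].is_ancestor(doc[2])    # True if 'ate' governs 'pizza'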
property head: property head:
""" """The syntactic parent, or "governor", of this token.
The syntactic parent, or "governor", of this token.
Returns: Token RETURNS (Token): The token head.
""" """
def __get__(self): def __get__(self):
""" """The token predicted by the parser to be the head of the current
The token predicted by the parser to be the head of the current token. token.
""" """
return self.doc[self.i + self.c.head] return self.doc[self.i + self.c.head]
def __set__(self, Token new_head): def __set__(self, Token new_head):
@ -399,7 +407,7 @@ cdef class Token:
cdef int rel_newhead_i = new_head.i - self.i cdef int rel_newhead_i = new_head.i - self.i
# is the new head a descendant of the old head # is the new head a descendant of the old head
cdef bint is_desc = old_head.is_ancestor_of(new_head) cdef bint is_desc = old_head.is_ancestor(new_head)
cdef int new_edge cdef int new_edge
cdef Token anc, child cdef Token anc, child
@ -477,10 +485,9 @@ cdef class Token:
self.c.head = rel_newhead_i self.c.head = rel_newhead_i
property conjuncts: property conjuncts:
""" """A sequence of coordinated tokens, including the token itself.
A sequence of coordinated tokens, including the token itself.
Yields: Token A coordinated token YIELDS (Token): A coordinated token.
""" """
def __get__(self): def __get__(self):
"""Get a list of conjoined words.""" """Get a list of conjoined words."""
@ -495,25 +502,46 @@ cdef class Token:
yield from word.conjuncts yield from word.conjuncts
property ent_type: property ent_type:
"""Named entity type.
RETURNS (int): Named entity type.
"""
def __get__(self): def __get__(self):
return self.c.ent_type return self.c.ent_type
property ent_iob: property ent_iob:
"""IOB code of named entity tag. `1="I", 2="O", 3="B"`. 0 means no tag
is assigned.
RETURNS (int): IOB code of named entity tag.
"""
def __get__(self): def __get__(self):
return self.c.ent_iob return self.c.ent_iob
property ent_type_: property ent_type_:
"""Named entity type.
RETURNS (unicode): Named entity type.
"""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.ent_type] return self.vocab.strings[self.c.ent_type]
property ent_iob_: property ent_iob_:
"""IOB code of named entity tag. "B" means the token begins an entity,
"I" means it is inside an entity, "O" means it is outside an entity, and
"" means no entity tag is set.
RETURNS (unicode): IOB code of named entity tag.
"""
def __get__(self): def __get__(self):
iob_strings = ('', 'I', 'O', 'B') iob_strings = ('', 'I', 'O', 'B')
return iob_strings[self.c.ent_iob] return iob_strings[self.c.ent_iob]
property ent_id: property ent_id:
""" """ID of the entity the token is an instance of, if any. Usually
An (integer) entity ID. Usually assigned by patterns in the Matcher. assigned by patterns in the Matcher.
RETURNS (int): ID of the entity.
""" """
def __get__(self): def __get__(self):
return self.c.ent_id return self.c.ent_id
@ -522,8 +550,10 @@ cdef class Token:
self.c.ent_id = key self.c.ent_id = key
property ent_id_: property ent_id_:
""" """ID of the entity the token is an instance of, if any. Usually
A (string) entity ID. Usually assigned by patterns in the Matcher. assigned by patterns in the Matcher.
RETURNS (unicode): ID of the entity.
""" """
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.ent_id] return self.vocab.strings[self.c.ent_id]
@ -564,6 +594,10 @@ cdef class Token:
return self.vocab.strings[self.c.lex.lang] return self.vocab.strings[self.c.lex.lang]
property lemma_: property lemma_:
"""Base form of the word, with no inflectional suffixes.
RETURNS (unicode): Token lemma.
"""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.lemma] return self.vocab.strings[self.c.lemma]
def __set__(self, unicode lemma_): def __set__(self, unicode lemma_):

View File

@ -145,7 +145,8 @@ def parse_package_meta(package_path, require=True):
def is_in_jupyter(): def is_in_jupyter():
"""Check if user is in a Jupyter notebook. Mainly used for displaCy. """Check if user is running spaCy from a Jupyter notebook by detecting the
IPython kernel. Mainly used for the displaCy visualizer.
RETURNS (bool): True if in Jupyter, False if not. RETURNS (bool): True if in Jupyter, False if not.
""" """

View File

@ -36,79 +36,22 @@ EMPTY_LEXEME.vector = EMPTY_VEC
cdef class Vocab: cdef class Vocab:
"""A look-up table that allows you to access `Lexeme` objects. The `Vocab`
instance also provides access to the `StringStore`, and owns underlying
C-data that is shared between `Doc` objects.
""" """
A map container for a language's LexemeC structs.
"""
@classmethod
def load(cls, path, lex_attr_getters=None, lemmatizer=True,
tag_map=True, oov_prob=True, **deprecated_kwargs):
"""
Deprecated --- replace in spaCy 2
Load the vocabulary from a path.
Arguments:
path (Path):
The path to load from.
lex_attr_getters (dict):
A dictionary mapping attribute IDs to functions to compute them.
Defaults to None.
lemmatizer (object):
A lemmatizer. Defaults to None.
tag_map (dict):
A dictionary mapping fine-grained tags to coarse-grained parts-of-speech,
and optionally morphological attributes.
oov_prob (float):
The default probability for out-of-vocabulary words.
Returns:
Vocab: The newly constructed vocab object.
"""
path = util.ensure_path(path)
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
if 'vectors' in deprecated_kwargs:
raise AttributeError(
"vectors argument to Vocab.load() deprecated. "
"Install vectors after loading.")
if tag_map is True and (path / 'vocab' / 'tag_map.json').exists():
with (path / 'vocab' / 'tag_map.json').open('r', encoding='utf8') as file_:
tag_map = ujson.load(file_)
elif tag_map is True:
tag_map = None
if lex_attr_getters is not None \
and oov_prob is True \
and (path / 'vocab' / 'oov_prob').exists():
with (path / 'vocab' / 'oov_prob').open('r', encoding='utf8') as file_:
oov_prob = float(file_.read())
lex_attr_getters[PROB] = lambda text: oov_prob
if lemmatizer is True:
lemmatizer = Lemmatizer.load(path)
with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
strings_list = ujson.load(file_)
cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
lemmatizer=lemmatizer,
strings=strings_list)
self.load_lexemes(path / 'vocab' / 'lexemes.bin')
return self
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None, def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
strings=tuple(), **deprecated_kwargs): strings=tuple(), **deprecated_kwargs):
""" """Create the vocabulary.
Create the vocabulary.
lex_attr_getters (dict): lex_attr_getters (dict): A dictionary mapping attribute IDs to functions
A dictionary mapping attribute IDs to functions to compute them. to compute them. Defaults to `None`.
Defaults to None. tag_map (dict): A dictionary mapping fine-grained tags to coarse-grained
lemmatizer (object): parts-of-speech, and optionally morphological attributes.
A lemmatizer. Defaults to None. lemmatizer (object): A lemmatizer. Defaults to `None`.
tag_map (dict): strings (StringStore): StringStore that maps strings to integers, and
A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, vice versa.
and optionally morphological attributes. RETURNS (Vocab): The newly constructed vocab object.
oov_prob (float):
The default probability for out-of-vocabulary words.
Returns:
Vocab: The newly constructed vocab object.
""" """
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs) util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
@ -148,33 +91,32 @@ cdef class Vocab:
return langfunc('_') if langfunc else '' return langfunc('_') if langfunc else ''
def __len__(self): def __len__(self):
""" """The current number of lexemes stored.
The current number of lexemes stored.
RETURNS (int): The current number of lexemes stored.
""" """
return self.length return self.length
def add_flag(self, flag_getter, int flag_id=-1): def add_flag(self, flag_getter, int flag_id=-1):
""" """Set a new boolean flag to words in the vocabulary.
Set a new boolean flag to words in the vocabulary.
The flag_setter function will be called over the words currently in the The flag_getter function will be called over the words currently in the
vocab, and then applied to new words as they occur. You'll then be able vocab, and then applied to new words as they occur. You'll then be able
to access the flag value on each token, using token.check_flag(flag_id). to access the flag value on each token, using token.check_flag(flag_id).
See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`,
`Token.check_flag`.
See also: flag_getter (callable): A function `f(unicode) -> bool`, to get the flag
Lexeme.set_flag, Lexeme.check_flag, Token.set_flag, Token.check_flag. value.
flag_id (int): An integer between 1 and 63 (inclusive), specifying
the bit at which the flag will be stored. If -1, the lowest
available bit will be chosen.
RETURNS (int): The integer ID by which the flag value can be checked.
Arguments: EXAMPLE:
flag_getter: >>> MY_PRODUCT = nlp.vocab.add_flag(lambda text: text in ['spaCy', 'dislaCy'])
A function f(unicode) -> bool, to get the flag value. >>> doc = nlp(u'I like spaCy')
>>> assert doc[2].check_flag(MY_PRODUCT) == True
flag_id (int):
An integer between 1 and 63 (inclusive), specifying the bit at which the
flag will be stored. If -1, the lowest available bit will be
chosen.
Returns:
flag_id (int): The integer ID by which the flag value can be checked.
""" """
if flag_id == -1: if flag_id == -1:
for bit in range(1, 64): for bit in range(1, 64):
@ -196,9 +138,8 @@ cdef class Vocab:
return flag_id return flag_id
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL: cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
""" """Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme`
Get a pointer to a LexemeC from the lexicon, creating a new Lexeme if necessary, using memory acquired from the given pool. If the pool
if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon. is the lexicon's own memory, the lexeme is saved in the lexicon.
""" """
if string == u'': if string == u'':
@ -216,9 +157,8 @@ cdef class Vocab:
return self._new_lexeme(mem, string) return self._new_lexeme(mem, string)
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL: cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
""" """Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme`
Get a pointer to a LexemeC from the lexicon, creating a new Lexeme if necessary, using memory acquired from the given pool. If the pool
if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon. is the lexicon's own memory, the lexeme is saved in the lexicon.
""" """
if orth == 0: if orth == 0:
@ -263,24 +203,19 @@ cdef class Vocab:
self.length += 1 self.length += 1
def __contains__(self, unicode string): def __contains__(self, unicode string):
""" """Check whether the string has an entry in the vocabulary.
Check whether the string has an entry in the vocabulary.
Arguments: string (unicode): The ID string.
            string (unicode): The ID string. RETURNS (bool): Whether the string has an entry in the vocabulary.
Returns:
bool Whether the string has an entry in the vocabulary.
""" """
key = hash_string(string) key = hash_string(string)
lex = self._by_hash.get(key) lex = self._by_hash.get(key)
return lex is not NULL return lex is not NULL
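        # Usage sketch, assuming an `nlp` object with a populated vocabulary:
        #     u'apple' in nlp.vocab     # True once 'apple' has a lexeme entry
        #     u'blargh' in nlp.vocab    # False until that string has been seen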
def __iter__(self): def __iter__(self):
""" """Iterate over the lexemes in the vocabulary.
Iterate over the lexemes in the vocabulary.
Yields: Lexeme An entry in the vocabulary. YIELDS (Lexeme): An entry in the vocabulary.
""" """
cdef attr_t orth cdef attr_t orth
cdef size_t addr cdef size_t addr
@ -288,19 +223,19 @@ cdef class Vocab:
yield Lexeme(self, orth) yield Lexeme(self, orth)
def __getitem__(self, id_or_string): def __getitem__(self, id_or_string):
""" """Retrieve a lexeme, given an int ID or a unicode string. If a
Retrieve a lexeme, given an int ID or a unicode string. If a previously previously unseen unicode string is given, a new lexeme is created and
unseen unicode string is given, a new lexeme is created and stored. stored.
Arguments: id_or_string (int or unicode): The integer ID of a word, or its unicode
id_or_string (int or unicode): string. If `int >= Lexicon.size`, `IndexError` is raised. If
The integer ID of a word, or its unicode string. `id_or_string` is neither an int nor a unicode string, `ValueError`
is raised.
RETURNS (Lexeme): The lexeme indicated by the given ID.
If an int >= Lexicon.size, IndexError is raised. If id_or_string EXAMPLE:
is neither an int nor a unicode string, ValueError is raised. >>> apple = nlp.vocab.strings['apple']
>>> assert nlp.vocab[apple] == nlp.vocab[u'apple']
Returns:
lexeme (Lexeme): The lexeme indicated by the given ID.
""" """
cdef attr_t orth cdef attr_t orth
if type(id_or_string) == unicode: if type(id_or_string) == unicode:
@ -324,15 +259,29 @@ cdef class Vocab:
return tokens return tokens
def to_disk(self, path): def to_disk(self, path):
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
"""
path = util.ensure_path(path) path = util.ensure_path(path)
if not path.exists(): if not path.exists():
path.mkdir() path.mkdir()
strings_loc = path / 'strings.json' strings_loc = path / 'strings.json'
with strings_loc.open('w', encoding='utf8') as file_: with strings_loc.open('w', encoding='utf8') as file_:
self.strings.dump(file_) self.strings.dump(file_)
self.dump(path / 'lexemes.bin')
# TODO: pickle
# self.dump(path / 'lexemes.bin')
def from_disk(self, path): def from_disk(self, path):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (unicode or Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
RETURNS (Vocab): The modified `Vocab` object.
"""
path = util.ensure_path(path) path = util.ensure_path(path)
with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_: with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
strings_list = ujson.load(file_) strings_list = ujson.load(file_)
@ -340,6 +289,23 @@ cdef class Vocab:
self.strings[string] self.strings[string]
self.load_lexemes(path / 'lexemes.bin') self.load_lexemes(path / 'lexemes.bin')
def to_bytes(self, **exclude):
"""Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `Vocab` object.
"""
raise NotImplementedError()
def from_bytes(self, bytes_data, **exclude):
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded.
RETURNS (Vocab): The `Vocab` object.
"""
raise NotImplementedError()
def lexemes_to_bytes(self, **exclude): def lexemes_to_bytes(self, **exclude):
cdef hash_t key cdef hash_t key
cdef size_t addr cdef size_t addr
@ -365,9 +331,7 @@ cdef class Vocab:
return byte_string return byte_string
def lexemes_from_bytes(self, bytes bytes_data): def lexemes_from_bytes(self, bytes bytes_data):
""" """Load the binary vocabulary data from the given string."""
Load the binary vocabulary data from the given string.
"""
cdef LexemeC* lexeme cdef LexemeC* lexeme
cdef hash_t key cdef hash_t key
cdef unicode py_str cdef unicode py_str
@ -393,14 +357,10 @@ cdef class Vocab:
# Deprecated --- delete these once stable # Deprecated --- delete these once stable
def dump_vectors(self, out_loc): def dump_vectors(self, out_loc):
""" """Save the word vectors to a binary file.
Save the word vectors to a binary file.
Arguments: loc (Path): The path to save to.
loc (Path): The path to save to. """
Returns:
None
#"""
cdef int32_t vec_len = self.vectors_length cdef int32_t vec_len = self.vectors_length
cdef int32_t word_len cdef int32_t word_len
cdef bytes word_str cdef bytes word_str
@ -424,17 +384,14 @@ cdef class Vocab:
def load_vectors(self, file_): def load_vectors(self, file_):
""" """Load vectors from a text-based file.
Load vectors from a text-based file.
Arguments: file_ (buffer): The file to read from. Entries should be separated by
file_ (buffer): The file to read from. Entries should be separated by newlines, newlines, and each entry should be whitespace delimited. The first value of the entry
and each entry should be whitespace delimited. The first value of the entry should be the word string, and subsequent entries should be the values of the
should be the word string, and subsequent entries should be the values of the vector.
vector.
Returns: RETURNS (int): The length of the vectors loaded.
vec_len (int): The length of the vectors loaded.
""" """
cdef LexemeC* lexeme cdef LexemeC* lexeme
cdef attr_t orth cdef attr_t orth
@ -464,14 +421,11 @@ cdef class Vocab:
return vec_len return vec_len
def load_vectors_from_bin_loc(self, loc): def load_vectors_from_bin_loc(self, loc):
""" """Load vectors from the location of a binary file.
Load vectors from the location of a binary file.
Arguments: loc (unicode): The path of the binary file to load from.
loc (unicode): The path of the binary file to load from.
Returns: RETURNS (int): The length of the vectors loaded.
vec_len (int): The length of the vectors loaded.
""" """
cdef CFile file_ = CFile(loc, b'rb') cdef CFile file_ = CFile(loc, b'rb')
cdef int32_t word_len cdef int32_t word_len
@ -526,12 +480,10 @@ cdef class Vocab:
def resize_vectors(self, int new_size): def resize_vectors(self, int new_size):
""" """Set vectors_length to a new size, and allocate more memory for the
Set vectors_length to a new size, and allocate more memory for the Lexeme `Lexeme` vectors if necessary. The memory will be zeroed.
vectors if necessary. The memory will be zeroed.
Arguments: new_size (int): The new size of the vectors.
new_size (int): The new size of the vectors.
""" """
cdef hash_t key cdef hash_t key
cdef size_t addr cdef size_t addr
@ -633,237 +585,3 @@ class VectorReadError(Exception):
"Vector size: %d\n" "Vector size: %d\n"
"Max size: %d\n" "Max size: %d\n"
"Min size: 1\n" % (loc, size, MAX_VEC_SIZE)) "Min size: 1\n" % (loc, size, MAX_VEC_SIZE))
#
#Deprecated --- delete these once stable
#
# def dump_vectors(self, out_loc):
# """
# Save the word vectors to a binary file.
#
# Arguments:
# loc (Path): The path to save to.
# Returns:
# None
# #"""
# cdef int32_t vec_len = self.vectors_length
# cdef int32_t word_len
# cdef bytes word_str
# cdef char* chars
#
# cdef Lexeme lexeme
# cdef CFile out_file = CFile(out_loc, 'wb')
# for lexeme in self:
# word_str = lexeme.orth_.encode('utf8')
# vec = lexeme.c.vector
# word_len = len(word_str)
#
# out_file.write_from(&word_len, 1, sizeof(word_len))
# out_file.write_from(&vec_len, 1, sizeof(vec_len))
#
# chars = <char*>word_str
# out_file.write_from(chars, word_len, sizeof(char))
# out_file.write_from(vec, vec_len, sizeof(float))
# out_file.close()
#
#
#
# def load_vectors(self, file_):
# """
# Load vectors from a text-based file.
#
# Arguments:
# file_ (buffer): The file to read from. Entries should be separated by newlines,
# and each entry should be whitespace delimited. The first value of the entry
# should be the word string, and subsequent entries should be the values of the
# vector.
#
# Returns:
# vec_len (int): The length of the vectors loaded.
# """
# cdef LexemeC* lexeme
# cdef attr_t orth
# cdef int32_t vec_len = -1
# cdef double norm = 0.0
#
# whitespace_pattern = re.compile(r'\s', re.UNICODE)
#
# for line_num, line in enumerate(file_):
# pieces = line.split()
# word_str = " " if whitespace_pattern.match(line) else pieces.pop(0)
# if vec_len == -1:
# vec_len = len(pieces)
# elif vec_len != len(pieces):
# raise VectorReadError.mismatched_sizes(file_, line_num,
# vec_len, len(pieces))
# orth = self.strings[word_str]
# lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth)
# lexeme.vector = <float*>self.mem.alloc(vec_len, sizeof(float))
# for i, val_str in enumerate(pieces):
# lexeme.vector[i] = float(val_str)
# norm = 0.0
# for i in range(vec_len):
# norm += lexeme.vector[i] * lexeme.vector[i]
# lexeme.l2_norm = sqrt(norm)
# self.vectors_length = vec_len
# return vec_len
#
# def load_vectors_from_bin_loc(self, loc):
# """
# Load vectors from the location of a binary file.
#
# Arguments:
# loc (unicode): The path of the binary file to load from.
#
# Returns:
# vec_len (int): The length of the vectors loaded.
# """
# cdef CFile file_ = CFile(loc, b'rb')
# cdef int32_t word_len
# cdef int32_t vec_len = 0
# cdef int32_t prev_vec_len = 0
# cdef float* vec
# cdef Address mem
# cdef attr_t string_id
# cdef bytes py_word
# cdef vector[float*] vectors
# cdef int line_num = 0
# cdef Pool tmp_mem = Pool()
# while True:
# try:
# file_.read_into(&word_len, sizeof(word_len), 1)
# except IOError:
# break
# file_.read_into(&vec_len, sizeof(vec_len), 1)
# if prev_vec_len != 0 and vec_len != prev_vec_len:
# raise VectorReadError.mismatched_sizes(loc, line_num,
# vec_len, prev_vec_len)
# if 0 >= vec_len >= MAX_VEC_SIZE:
# raise VectorReadError.bad_size(loc, vec_len)
#
# chars = <char*>file_.alloc_read(tmp_mem, word_len, sizeof(char))
# vec = <float*>file_.alloc_read(self.mem, vec_len, sizeof(float))
#
# string_id = self.strings[chars[:word_len]]
# # Insert words into vocab to add vector.
# self.get_by_orth(self.mem, string_id)
# while string_id >= vectors.size():
# vectors.push_back(EMPTY_VEC)
# assert vec != NULL
# vectors[string_id] = vec
# line_num += 1
# cdef LexemeC* lex
# cdef size_t lex_addr
# cdef double norm = 0.0
# cdef int i
# for orth, lex_addr in self._by_orth.items():
# lex = <LexemeC*>lex_addr
# if lex.lower < vectors.size():
# lex.vector = vectors[lex.lower]
# norm = 0.0
# for i in range(vec_len):
# norm += lex.vector[i] * lex.vector[i]
# lex.l2_norm = sqrt(norm)
# else:
# lex.vector = EMPTY_VEC
# self.vectors_length = vec_len
# return vec_len
#
#
#def write_binary_vectors(in_loc, out_loc):
# cdef CFile out_file = CFile(out_loc, 'wb')
# cdef Address mem
# cdef int32_t word_len
# cdef int32_t vec_len
# cdef char* chars
# with bz2.BZ2File(in_loc, 'r') as file_:
# for line in file_:
# pieces = line.split()
# word = pieces.pop(0)
# mem = Address(len(pieces), sizeof(float))
# vec = <float*>mem.ptr
# for i, val_str in enumerate(pieces):
# vec[i] = float(val_str)
#
# word_len = len(word)
# vec_len = len(pieces)
#
# out_file.write_from(&word_len, 1, sizeof(word_len))
# out_file.write_from(&vec_len, 1, sizeof(vec_len))
#
# chars = <char*>word
# out_file.write_from(chars, len(word), sizeof(char))
# out_file.write_from(vec, vec_len, sizeof(float))
#
#
# def resize_vectors(self, int new_size):
# """
# Set vectors_length to a new size, and allocate more memory for the Lexeme
# vectors if necessary. The memory will be zeroed.
#
# Arguments:
# new_size (int): The new size of the vectors.
# """
# cdef hash_t key
# cdef size_t addr
# if new_size > self.vectors_length:
# for key, addr in self._by_hash.items():
# lex = <LexemeC*>addr
# lex.vector = <float*>self.mem.realloc(lex.vector,
# new_size * sizeof(lex.vector[0]))
# self.vectors_length = new_size
#
#
#
# def dump(self, loc=None):
# """
# Save the lexemes binary data to the given location, or
# return a byte-string with the data if loc is None.
#
# Arguments:
# loc (Path or None): The path to save to, or None.
# """
# if loc is None:
# return self.to_bytes()
# else:
# return self.to_disk(loc)
#
# def load_lexemes(self, loc):
# """
# Load the binary vocabulary data from the given location.
#
# Arguments:
# loc (Path): The path to load from.
#
# Returns:
# None
# """
# fp = CFile(loc, 'rb',
# on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc))
# cdef LexemeC* lexeme = NULL
# cdef SerializedLexemeC lex_data
# cdef hash_t key
# cdef unicode py_str
# cdef attr_t orth = 0
# assert sizeof(orth) == sizeof(lexeme.orth)
# i = 0
# while True:
# try:
# fp.read_into(&orth, 1, sizeof(orth))
# except IOError:
# break
# lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
# # Copy data from the file into the lexeme
# fp.read_into(&lex_data.data, 1, sizeof(lex_data.data))
# Lexeme.c_from_bytes(lexeme, lex_data)
#
# lexeme.vector = EMPTY_VEC
# py_str = self.strings[lexeme.orth]
# key = hash_string(py_str)
# self._by_hash.set(key, lexeme)
# self._by_orth.set(lexeme.orth, lexeme)
# self.length += 1
# i += 1
# fp.close()

View File

@ -80,6 +80,7 @@
} }
], ],
"ALPHA": true,
"V_CSS": "1.6", "V_CSS": "1.6",
"V_JS": "1.2", "V_JS": "1.2",
"DEFAULT_SYNTAX": "python", "DEFAULT_SYNTAX": "python",

View File

@ -34,17 +34,17 @@ mixin src(url)
+a(url) +a(url)
block block
| #[+icon("code", 16).o-icon--inline.u-color-subtle] | #[+icon("code", 16).o-icon--inline.u-color-theme]
//- API link (with added tag and automatically generated path) //- API link (with added tag and automatically generated path)
path - [string] path to API docs page relative to /docs/api/ path - [string] path to API docs page relative to /docs/api/
mixin api(path) mixin api(path)
+a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block +a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block.u-nowrap
block block
| #[+icon("book", 18).o-icon--inline.u-color-subtle] | #[+icon("book", 18).o-icon--inline.u-color-theme]
//- Help icon with tooltip //- Help icon with tooltip
@ -104,15 +104,31 @@ mixin button(url, trusted, ...style)
language - [string] language for syntax highlighting (default: "python") language - [string] language for syntax highlighting (default: "python")
supports basic relevant languages available for PrismJS supports basic relevant languages available for PrismJS
mixin code(label, language) mixin code(label, language, icon)
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}")&attributes(attributes) pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : "")&attributes(attributes)
if label if label
h4.u-text-label.u-text-label--dark=label h4.u-text-label.u-text-label--dark=label
if icon
- var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
.c-code-block__icon(class=classes[icon] || "")
+icon(icon, 18)
code.c-code-block__content code.c-code-block__content
block block
//- Code blocks to display old/new versions
mixin code-old()
+code(false, false, "reject").o-block-small
block
mixin code-new()
+code(false, false, "accept").o-block-small
block
//- CodePen embed //- CodePen embed
slug - [string] ID of CodePen demo (taken from URL) slug - [string] ID of CodePen demo (taken from URL)
height - [integer] height of demo embed iframe height - [integer] height of demo embed iframe
@ -164,6 +180,16 @@ mixin tag()
block block
//- "Requires model" tag with tooltip and list of capabilities
...capabs - [string] Required model capabilities, e.g. "vectors".
mixin tag-model(...capabs)
- var intro = "To use this functionality, spaCy needs a model to be installed"
- var ext = capabs.length ? " that supports the following capabilities: " + capabs.join(', ') : ""
+tag Requires model
+help(intro + ext + ".").u-color-theme
//- List //- List
type - [string] "numbers", "letters", "roman" (bulleted list if none set) type - [string] "numbers", "letters", "roman" (bulleted list if none set)
start - [integer] start number start - [integer] start number

View File

@ -9,6 +9,9 @@ nav.c-nav.u-text.js-nav(class=landing ? "c-nav--theme" : null)
.u-text-label.u-padding-small.u-hidden-xs=SUBSECTION .u-text-label.u-padding-small.u-hidden-xs=SUBSECTION
ul.c-nav__menu ul.c-nav__menu
if ALPHA
- var NAVIGATION = { "Usage": "/docs/usage", "Reference": "/docs/api" }
each url, item in NAVIGATION each url, item in NAVIGATION
li.c-nav__menu__item(class=(url == "/") ? "u-hidden-xs" : null) li.c-nav__menu__item(class=(url == "/") ? "u-hidden-xs" : null)
+a(url)=item +a(url)=item

View File

@ -10,6 +10,14 @@ main.o-main.o-main--sidebar.o-main--aside
if tag if tag
+tag=tag +tag=tag
if ALPHA
+infobox("⚠️ You are viewing the spaCy v2.0 alpha docs")
| This page is part of the alpha documentation for spaCy v2.0
| and does not reflect the state of the latest stable release.
| #[+a("#") See here] for more information on how to install
| and test the new version. To read the official docs for
| v1.x, #[+a("https://spacy.io/docs") go here].
!=yield !=yield
+grid.o-content.u-text +grid.o-content.u-text

View File

@ -35,7 +35,10 @@ html(lang="en")
link(rel="shortcut icon" href="/assets/img/favicon.ico") link(rel="shortcut icon" href="/assets/img/favicon.ico")
link(rel="icon" type="image/x-icon" href="/assets/img/favicon.ico") link(rel="icon" type="image/x-icon" href="/assets/img/favicon.ico")
if SUBSECTION == "usage" if ALPHA && SECTION == "docs"
link(href="/assets/css/style_green.css?v#{V_CSS}" rel="stylesheet")
else if SUBSECTION == "usage"
link(href="/assets/css/style_red.css?v#{V_CSS}" rel="stylesheet") link(href="/assets/css/style_red.css?v#{V_CSS}" rel="stylesheet")
else else

View File

@ -13,6 +13,17 @@
white-space: pre white-space: pre
direction: ltr direction: ltr
&.c-code-block--has-icon
padding: 0
display: flex
.c-code-block__icon
padding: 0 0 0 1rem
display: flex
justify-content: center
align-items: center
border-left: 6px solid
//- Code block content //- Code block content
@ -26,8 +37,8 @@
*:not(.c-code-block) > code *:not(.c-code-block) > code
font: normal 600 0.8em/#{1} $font-code font: normal 600 0.8em/#{1} $font-code
background: rgba($color-front, 0.05) background: darken($color-theme-light, 5)
box-shadow: 1px 1px 0 rgba($color-front, 0.1) box-shadow: 1px 1px 0 rgba($color-front, 0.05)
text-shadow: 1px 1px 0 rgba($color-back, 0.5) text-shadow: 1px 1px 0 rgba($color-back, 0.5)
color: $color-front color: $color-front
padding: 0.1em 0.5em padding: 0.1em 0.5em

View File

@ -13,7 +13,7 @@
background: rgba($color-subtle-light, 0.35) background: rgba($color-subtle-light, 0.35)
&.c-table__row--foot &.c-table__row--foot
background: rgba($color-theme, 0.025) background: $color-theme-light
border-top: 2px solid $color-theme border-top: 2px solid $color-theme
.c-table__cell:first-child .c-table__cell:first-child

View File

@ -11,9 +11,8 @@
background: $color-front background: $color-front
border-radius: 2px border-radius: 2px
color: $color-back color: $color-back
font-family: inherit font: normal 1.3rem/#{1.25} $font-primary
font-size: 1.3rem text-transform: none
line-height: 1.25
opacity: 0 opacity: 0
padding: 0.5em 0.75em padding: 0.5em 0.75em
transform: translateX(-50%) translateY(-2px) transform: translateX(-50%) translateY(-2px)

View File

@ -26,8 +26,7 @@ $font-code: 'Source Code Pro', Consolas, 'Andale Mono', Menlo, Monaco, Courier,
// Colors // Colors
$colors: ( blue: #09a3d5, red: #d9515d ) $colors: ( blue: #09a3d5, red: #d9515d, green: #08c35e )
$colors-light: (blue: #cceaf4, red: #f9d7da)
$color-back: #fff !default $color-back: #fff !default
$color-front: #1a1e23 !default $color-front: #1a1e23 !default
@ -35,7 +34,7 @@ $color-dark: lighten($color-front, 20) !default
$color-theme: map-get($colors, $theme) $color-theme: map-get($colors, $theme)
$color-theme-dark: darken(map-get($colors, $theme), 5) $color-theme-dark: darken(map-get($colors, $theme), 5)
$color-theme-light: map-get($colors-light, $theme) $color-theme-light: rgba($color-theme, 0.05)
$color-subtle: #ddd !default $color-subtle: #ddd !default
$color-subtle-light: #f6f6f6 !default $color-subtle-light: #f6f6f6 !default

View File

@ -0,0 +1,4 @@
//- 💫 STYLESHEET (GREEN)
$theme: green
@import style

View File

@ -30,5 +30,11 @@
<symbol id="help" viewBox="0 0 24 24"> <symbol id="help" viewBox="0 0 24 24">
<path d="M12 6c2.203 0 3.984 1.781 3.984 3.984 0 2.484-3 2.766-3 5.016h-1.969c0-3.234 3-3 3-5.016 0-1.078-0.938-1.969-2.016-1.969s-2.016 0.891-2.016 1.969h-1.969c0-2.203 1.781-3.984 3.984-3.984zM12 20.016c4.406 0 8.016-3.609 8.016-8.016s-3.609-8.016-8.016-8.016-8.016 3.609-8.016 8.016 3.609 8.016 8.016 8.016zM12 2.016c5.531 0 9.984 4.453 9.984 9.984s-4.453 9.984-9.984 9.984-9.984-4.453-9.984-9.984 4.453-9.984 9.984-9.984zM11.016 18v-2.016h1.969v2.016h-1.969z"/> <path d="M12 6c2.203 0 3.984 1.781 3.984 3.984 0 2.484-3 2.766-3 5.016h-1.969c0-3.234 3-3 3-5.016 0-1.078-0.938-1.969-2.016-1.969s-2.016 0.891-2.016 1.969h-1.969c0-2.203 1.781-3.984 3.984-3.984zM12 20.016c4.406 0 8.016-3.609 8.016-8.016s-3.609-8.016-8.016-8.016-8.016 3.609-8.016 8.016 3.609 8.016 8.016 8.016zM12 2.016c5.531 0 9.984 4.453 9.984 9.984s-4.453 9.984-9.984 9.984-9.984-4.453-9.984-9.984 4.453-9.984 9.984-9.984zM11.016 18v-2.016h1.969v2.016h-1.969z"/>
</symbol> </symbol>
<symbol id="reject" viewBox="0 0 24 24">
<path d="M18.984 6.422l-5.578 5.578 5.578 5.578-1.406 1.406-5.578-5.578-5.578 5.578-1.406-1.406 5.578-5.578-5.578-5.578 1.406-1.406 5.578 5.578 5.578-5.578z"/>
</symbol>
<symbol id="accept" viewBox="0 0 24 24">
<path d="M9 16.172l10.594-10.594 1.406 1.406-12 12-5.578-5.578 1.406-1.406z"/>
</symbol>
</defs> </defs>
</svg> </svg>

(Image diff not rendered: the SVG icon sprite grows from 5.4 KiB to 5.8 KiB, and a new 216 KiB binary image file is added.)

View File

@ -2,8 +2,13 @@
"sidebar": { "sidebar": {
"Introduction": { "Introduction": {
"Facts & Figures": "./", "Facts & Figures": "./",
"Languages": "language-models", "Languages": "language-models"
"Philosophy": "philosophy" },
"Top-level": {
"spacy": "spacy",
"displacy": "displacy",
"Utility Functions": "util",
"Command line": "cli"
}, },
"Classes": { "Classes": {
"Doc": "doc", "Doc": "doc",
@ -21,9 +26,6 @@
"GoldParse": "goldparse" "GoldParse": "goldparse"
}, },
"Other": { "Other": {
"Command line": "cli",
"displaCy": "displacy",
"Utility Functions": "util",
"Annotation Specs": "annotation", "Annotation Specs": "annotation",
"Feature Scheme": "features" "Feature Scheme": "features"
} }
@ -43,6 +45,26 @@
"title": "Philosophy" "title": "Philosophy"
}, },
"spacy": {
"title": "spaCy top-level functions",
"next": "displacy"
},
"displacy": {
"title": "displaCy",
"tag": "module",
"next": "util"
},
"util": {
"title": "Utility Functions",
"next": "cli"
},
"cli": {
"title": "Command Line Interface"
},
"language": { "language": {
"title": "Language", "title": "Language",
"tag": "class" "tag": "class"
@ -113,20 +135,6 @@
"tag": "class" "tag": "class"
}, },
"cli": {
"title": "Command Line Interface",
"next": "displacy"
},
"displacy": {
"title": "displaCy",
"tag": "module"
},
"util": {
"title": "Utility Functions"
},
"annotation": { "annotation": {
"title": "Annotation Specifications" "title": "Annotation Specifications"
}, },

View File

@ -71,6 +71,44 @@ include _annotation/_dep-labels
include _annotation/_named-entities include _annotation/_named-entities
+h(3, "biluo") BILUO Scheme
p
| spaCy translates character offsets into the BILUO scheme, in order to
| decide the cost of each action given the current state of the entity
| recognizer. The costs are then used to calculate the gradient of the
| loss, to train the model.
+aside("Why BILUO, not IOB?")
| There are several coding schemes for encoding entity annotations as
| token tags. These coding schemes are equally expressive, but not
| necessarily equally learnable.
| #[+a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth]
| showed that the minimal #[strong Begin], #[strong In], #[strong Out]
| scheme was more difficult to learn than the #[strong BILUO] scheme that
| we use, which explicitly marks boundary tokens.
+table([ "Tag", "Description" ])
+row
+cell #[code #[span.u-color-theme B] EGIN]
+cell The first token of a multi-token entity.
+row
+cell #[code #[span.u-color-theme I] N]
+cell An inner token of a multi-token entity.
+row
+cell #[code #[span.u-color-theme L] AST]
+cell The final token of a multi-token entity.
+row
+cell #[code #[span.u-color-theme U] NIT]
+cell A single-token entity.
+row
+cell #[code #[span.u-color-theme O] UT]
+cell A non-entity token.
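p
| A rough sketch of the mapping described above, assuming the
| #[code biluo_tags_from_offsets] helper in #[code spacy.gold] and a
| tokenization that lines up with the entity boundaries:

+code("Example").
from spacy.gold import biluo_tags_from_offsets
doc = nlp(u'I like London and Berlin.')
entities = [(7, 13, 'LOC'), (18, 24, 'LOC')]
tags = biluo_tags_from_offsets(doc, entities)
# tags == ['O', 'O', 'U-LOC', 'O', 'U-LOC', 'O']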
+h(2, "json-input") JSON input format for training +h(2, "json-input") JSON input format for training
p p

View File

@ -10,11 +10,11 @@ p
+aside("Why python -m?") +aside("Why python -m?")
| The problem with a global entry point is that it's resolved by looking up | The problem with a global entry point is that it's resolved by looking up
| entries in your #[code PATH] environment variable. This can give you | entries in your #[code PATH] environment variable. This can give you
| unexpected results, especially when using #[code virtualenv]. For | unexpected results, like executing the wrong spaCy installation
| instance, you may have spaCy installed on your system but not in your | (especially when using #[code virtualenv]). #[code python -m] prevents
| current environment. The command will then execute the wrong | fallbacks to system modules and makes sure the correct spaCy version is
| spaCy installation. #[code python -m] prevents fallbacks to system modules | used. If you hate typing it every time, we recommend creating an
| and makes sure the correct version of spaCy is used. | #[code alias] instead.
+h(2, "download") Download +h(2, "download") Download
@ -45,13 +45,24 @@ p
+cell flag +cell flag
+cell Show help message and available arguments. +cell Show help message and available arguments.
+infobox("Important note")
| The #[code download] command is mostly intended as a convenient,
|  interactive wrapper: it performs compatibility checks and prints
| detailed messages in case things go wrong. It's #[strong not recommended]
| to use this command as part of an automated process. If you know which
| model your project needs, you should consider a
| #[+a("/docs/usage/models#download-pip") direct download via pip], or
|  uploading the model to a local PyPI installation and fetching it straight
| from there. This will also allow you to add it as a versioned package
| dependency to your project.
+h(2, "link") Link +h(2, "link") Link
p p
| Create a #[+a("/docs/usage/models#usage") shortcut link] for a model, | Create a #[+a("/docs/usage/models#usage") shortcut link] for a model,
| either a Python package or a local directory. This will let you load | either a Python package or a local directory. This will let you load
| models from any location via #[code spacy.load()]. | models from any location using a custom name via
| #[+api("spacy#load") #[code spacy.load()]].
+code(false, "bash"). +code(false, "bash").
python -m spacy link [origin] [link_name] [--force] python -m spacy link [origin] [link_name] [--force]
@ -92,7 +103,7 @@ p
+row +row
+cell #[code model] +cell #[code model]
+cell positional +cell positional
+cell Shortcut link of model (optional). +cell A model, i.e. shortcut link, package name or path (optional).
+row +row
+cell #[code --markdown], #[code -md] +cell #[code --markdown], #[code -md]
@ -114,7 +125,7 @@ p
| the input file. Currently only supports #[code .conllu]. | the input file. Currently only supports #[code .conllu].
+code(false, "bash"). +code(false, "bash").
python -m spacy convert [input_file] [output_dir] [--n_sents] [--morphology] python -m spacy convert [input_file] [output_dir] [--n-sents] [--morphology]
+table(["Argument", "Type", "Description"]) +table(["Argument", "Type", "Description"])
+row +row
@ -128,7 +139,7 @@ p
+cell Output directory for converted JSON file. +cell Output directory for converted JSON file.
+row +row
+cell #[code --n_sents], #[code -n] +cell #[code --n-sents], #[code -n]
+cell option +cell option
+cell Number of sentences per document. +cell Number of sentences per document.
@ -191,7 +202,7 @@ p
| #[+a("/docs/api/annotation#json-input") JSON format]. | #[+a("/docs/api/annotation#json-input") JSON format].
+code(false, "bash"). +code(false, "bash").
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner] python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--parser-L1] [--no-tagger] [--no-parser] [--no-ner]
+table(["Argument", "Type", "Description"]) +table(["Argument", "Type", "Description"])
+row +row
@ -215,27 +226,37 @@ p
+cell Location of JSON-formatted dev data (optional). +cell Location of JSON-formatted dev data (optional).
+row +row
+cell #[code --n_iter], #[code -n] +cell #[code --n-iter], #[code -n]
+cell option +cell option
+cell Number of iterations (default: #[code 15]). +cell Number of iterations (default: #[code 15]).
+row +row
+cell #[code --parser_L1], #[code -L] +cell #[code --nsents]
+cell option
+cell Number of sentences (default: #[code 0]).
+row
+cell #[code --parser-L1], #[code -L]
+cell option +cell option
+cell L1 regularization penalty for parser (default: #[code 0.0]). +cell L1 regularization penalty for parser (default: #[code 0.0]).
+row +row
+cell #[code --no_tagger], #[code -T] +cell #[code --use-gpu], #[code -g]
+cell flag
+cell Use GPU.
+row
+cell #[code --no-tagger], #[code -T]
+cell flag +cell flag
+cell Don't train tagger. +cell Don't train tagger.
+row +row
+cell #[code --no_parser], #[code -P] +cell #[code --no-parser], #[code -P]
+cell flag +cell flag
+cell Don't train parser. +cell Don't train parser.
+row +row
+cell #[code --no_ner], #[code -N] +cell #[code --no-ner], #[code -N]
+cell flag +cell flag
+cell Don't train NER. +cell Don't train NER.

View File

@ -4,32 +4,6 @@ include ../../_includes/_mixins
p Annotate syntactic dependencies on #[code Doc] objects. p Annotate syntactic dependencies on #[code Doc] objects.
+h(2, "load") DependencyParser.load
+tag classmethod
p Load the statistical model from the supplied path.
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell #[code Path]
+cell The path to load from.
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell The vocabulary. Must be shared by the documents to be processed.
+row
+cell #[code require]
+cell bool
+cell Whether to raise an error if the files are not found.
+footrow
+cell return
+cell #[code DependencyParser]
+cell The newly constructed object.
+h(2, "init") DependencyParser.__init__ +h(2, "init") DependencyParser.__init__
+tag method +tag method
@ -47,7 +21,7 @@ p Create a #[code DependencyParser].
+cell The statistical model. +cell The statistical model.
+footrow +footrow
+cell return +cell returns
+cell #[code DependencyParser] +cell #[code DependencyParser]
+cell The newly constructed object. +cell The newly constructed object.
@ -65,7 +39,7 @@ p
+cell The document to be processed. +cell The document to be processed.
+footrow +footrow
+cell return +cell returns
+cell #[code None] +cell #[code None]
+cell - +cell -
@ -93,7 +67,7 @@ p Process a stream of documents.
| parallel. | parallel.
+footrow +footrow
+cell yield +cell yields
+cell #[code Doc] +cell #[code Doc]
+cell Documents, in order. +cell Documents, in order.
@ -114,7 +88,7 @@ p Update the statistical model.
+cell The gold-standard annotations, to calculate the loss. +cell The gold-standard annotations, to calculate the loss.
+footrow +footrow
+cell return +cell returns
+cell int +cell int
+cell The loss on this example. +cell The loss on this example.
@ -130,6 +104,6 @@ p Set up a stepwise state, to introspect and control the transition sequence.
+cell The document to step through. +cell The document to step through.
+footrow +footrow
+cell return +cell returns
+cell #[code StepwiseState] +cell #[code StepwiseState]
+cell A state object, to step through the annotation process. +cell A state object, to step through the annotation process.

View File

@ -8,7 +8,7 @@ p
| #[+a("/docs/usage/visualizers") visualizing spaCy]. | #[+a("/docs/usage/visualizers") visualizing spaCy].
+h(2, "serve") serve +h(2, "serve") displacy.serve
+tag method +tag method
p p
@ -60,7 +60,7 @@ p
+cell Port to serve visualization. +cell Port to serve visualization.
+cell #[code 5000] +cell #[code 5000]
+h(2, "render") render +h(2, "render") displacy.render
+tag method +tag method
p Render a dependency parse tree or named entity visualization. p Render a dependency parse tree or named entity visualization.
@ -112,7 +112,7 @@ p Render a dependency parse tree or named entity visualization.
+cell #[code {}] +cell #[code {}]
+footrow +footrow
+cell return +cell returns
+cell unicode +cell unicode
+cell Rendered HTML markup. +cell Rendered HTML markup.
+cell +cell
@ -218,7 +218,7 @@ p
+cell #[code colors] +cell #[code colors]
+cell dict +cell dict
+cell +cell
| Color overrides. Entity types in lowercase should be mapped to | Color overrides. Entity types in uppercase should be mapped to
| color names or values. | color names or values.
+cell #[code {}] +cell #[code {}]

View File

@ -4,9 +4,508 @@ include ../../_includes/_mixins
p A container for accessing linguistic annotations. p A container for accessing linguistic annotations.
p
| A #[code Doc] is a sequence of #[+api("token") #[code Token]] objects.
| Access sentences and named entities, export annotations to numpy arrays,
| losslessly serialize to compressed binary strings. The #[code Doc] object
| holds an array of #[code TokenC] structs. The Python-level #[code Token]
| and #[+api("span") #[code Span]] objects are views of this array, i.e.
| they don't own the data themselves.
+aside-code("Example").
# Construction 1
doc = nlp(u'Some text')
# Construction 2
from spacy.tokens import Doc
doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
spaces=[True, False, False])
+h(2, "init") Doc.__init__
+tag method
p
| Construct a #[code Doc] object. The most common way to get a #[code Doc]
| object is via the #[code nlp] object.
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code words]
+cell -
+cell A list of strings to add to the container.
+row
+cell #[code spaces]
+cell -
+cell
| A list of boolean values indicating whether each word has a
| subsequent space. Must have the same length as #[code words], if
| specified. Defaults to a sequence of #[code True].
+footrow
+cell returns
+cell #[code Doc]
+cell The newly constructed object.
+h(2, "getitem") Doc.__getitem__
+tag method
p
| Get a #[+api("token") #[code Token]] object at position #[code i], where
| #[code i] is an integer. Negative indexing is supported, and follows the
| usual Python semantics, i.e. #[code doc[-2]] is #[code doc[len(doc) - 2]].
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
assert doc[0].text == 'Give'
assert doc[-1].text == '.'
span = doc[1:3]
assert span.text == 'it back'
+table(["Name", "Type", "Description"])
+row
+cell #[code i]
+cell int
+cell The index of the token.
+footrow
+cell returns
+cell #[code Token]
+cell The token at #[code doc[i]].
p
| Get a #[+api("span") #[code Span]] object, starting at position
| #[code start] (token index) and ending at position #[code end] (token
| index).
p
| For instance, #[code doc[2:5]] produces a span consisting of tokens 2, 3
| and 4. Stepped slices (e.g. #[code doc[start : end : step]]) are not
| supported, as #[code Span] objects must be contiguous (cannot have gaps).
| You can use negative indices and open-ended ranges, which have their
| normal Python semantics.
+table(["Name", "Type", "Description"])
+row
+cell #[code start_end]
+cell tuple
+cell The slice of the document to get.
+footrow
+cell returns
+cell #[code Span]
+cell The span at #[code doc[start : end]].
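p
| A short sketch of the slice semantics described above; negative indices
| and open-ended ranges follow normal Python behaviour:

+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
span = doc[2:5]
assert span.text == 'back! He'
assert doc[-3:].text == 'He pleaded.'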
+h(2, "iter") Doc.__iter__
+tag method
p
| Iterate over #[code Token] objects, from which the annotations can be
| easily accessed.
+aside-code("Example").
doc = nlp(u'Give it back')
assert [t.text for t in doc] == [u'Give', u'it', u'back']
p
| This is the main way of accessing #[+api("token") #[code Token]] objects,
| which are the main way annotations are accessed from Python. If
| faster-than-Python speeds are required, you can instead access the
| annotations as a numpy array, or access the underlying C data directly
| from Cython.
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A #[code Token] object.
+h(2, "len") Doc.__len__
+tag method
p Get the number of tokens in the document.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
assert len(doc) == 7
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell int
+cell The number of tokens in the document.
+h(2, "similarity") Doc.similarity
+tag method
+tag-model("vectors")
p
| Make a semantic similarity estimate. The default estimate is cosine
| similarity using an average of word vectors.
+aside-code("Example").
apples = nlp(u'I like apples')
oranges = nlp(u'I like oranges')
apples_oranges = apples.similarity(oranges)
oranges_apples = oranges.similarity(apples)
assert apples_oranges == oranges_apples
+table(["Name", "Type", "Description"])
+row
+cell #[code other]
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell returns
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "count_by") Doc.count_by
+tag method
p
| Count the frequencies of a given attribute. Produces a dict of
| #[code {attr (int): count (ints)}] frequencies, keyed by the values
| of the given attribute ID.
+aside-code("Example").
from spacy.attrs import ORTH
doc = nlp(u'apple apple orange banana')
assert doc.count_by(ORTH) == {2087L: 2, 7024L: 1, 119552L: 1}
doc.to_array([ORTH])
# array([[2087], [2087], [7024], [119552]])
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_id]
+cell int
+cell The attribute ID
+footrow
+cell returns
+cell dict
+cell A dictionary mapping attributes to integer counts.
+h(2, "to_array") Doc.to_array
+tag method
p
| Export the document annotations to a numpy array of shape #[code N*M]
| where #[code N] is the length of the document and #[code M] is the number
| of attribute IDs to export. The values will be 32-bit integers.
+aside-code("Example").
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
doc = nlp(text)
# All strings mapped to integers, for easy export to numpy
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_ids]
+cell ints
+cell A list of attribute ID ints.
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+cell
| The exported attributes as a 2D numpy array, with one row per
| token and one column per attribute.
+h(2, "from_array") Doc.from_array
+tag method
p
| Load attributes from a numpy array. Write to a #[code Doc] object, from
| an #[code (M, N)] array of attributes.
+aside-code("Example").
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
from spacy.tokens import Doc
doc = nlp(text)
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
doc2 = Doc(doc.vocab)
doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)
assert doc.text == doc2.text
+table(["Name", "Type", "Description"])
+row
+cell #[code attrs]
+cell ints
+cell A list of attribute ID ints.
+row
+cell #[code array]
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+cell The attribute values to load.
+footrow
+cell returns
+cell #[code Doc]
+cell Itself.
+h(2, "to_bytes") Doc.to_bytes
+tag method
p Serialize, i.e. export the document contents to a binary string.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
doc_bytes = doc.to_bytes()
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell bytes
+cell
| A losslessly serialized copy of the #[code Doc], including all
| annotations.
+h(2, "from_bytes") Doc.from_bytes
+tag method
p Deserialize, i.e. import the document contents from a binary string.
+aside-code("Example").
from spacy.tokens import Doc
text = u'Give it back! He pleaded.'
doc = nlp(text)
bytes = doc.to_bytes()
doc2 = Doc(doc.vocab).from_bytes(bytes)
assert doc.text == doc2.text
+table(["Name", "Type", "Description"])
+row
+cell #[code data]
+cell bytes
+cell The string to load from.
+footrow
+cell returns
+cell #[code Doc]
+cell The #[code Doc] object.
+h(2, "merge") Doc.merge
+tag method
p
| Retokenize the document, such that the span at
| #[code doc.text[start_idx : end_idx]] is merged into a single token. If
| #[code start_idx] and #[code end_idx] do not mark start and end token
| boundaries, the document remains unchanged.
+aside-code("Example").
doc = nlp(u'Los Angeles start.')
doc.merge(0, len('Los Angeles'), 'NNP', 'Los Angeles', 'GPE')
assert [t.text for t in doc] == [u'Los Angeles', u'start', u'.']
+table(["Name", "Type", "Description"])
+row
+cell #[code start_idx]
+cell int
+cell The character index of the start of the slice to merge.
+row
+cell #[code end_idx]
+cell int
+cell The character index after the end of the slice to merge.
+row
+cell #[code **attributes]
+cell -
+cell
| Attributes to assign to the merged token. By default,
| attributes are inherited from the syntactic root token of
| the span.
+footrow
+cell returns
+cell #[code Token]
+cell
| The newly merged token, or #[code None] if the start and end
| indices did not fall at token boundaries
+h(2, "print_tree") Doc.print_tree
+tag method
+tag-model("parse")
p
| Returns the parse trees in JSON (dict) format. Especially useful for
| web applications.
+aside-code("Example").
doc = nlp(u'Alice ate the pizza.')
trees = doc.print_tree()
# {'modifiers': [
# {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
# {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'},
# {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}
# ], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}
+table(["Name", "Type", "Description"])
+row
+cell #[code light]
+cell bool
+cell Don't include lemmas or entities.
+row
+cell #[code flat]
+cell bool
+cell Don't include arcs or modifiers.
+footrow
+cell returns
+cell dict
+cell Parse tree as dict.
+h(2, "ents") Doc.ents
+tag property
+tag-model("NER")
p
| Iterate over the entities in the document. Yields named-entity
| #[code Span] objects, if the entity recognizer has been applied to the
| document.
+aside-code("Example").
tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
ents = list(tokens.ents)
assert ents[0].label == 346
assert ents[0].label_ == 'PERSON'
assert ents[0].text == 'Mr. Best'
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Span]
+cell Entities in the document.
+h(2, "noun_chunks") Doc.noun_chunks
+tag property
+tag-model("parse")
p
| Iterate over the base noun phrases in the document. Yields base
| noun-phrase #[code Span] objects, if the document has been syntactically
| parsed. A base noun phrase, or "NP chunk", is a noun phrase that does not
| permit other NPs to be nested within it, so no NP-level coordination, no
| prepositional phrases, and no relative clauses.
+aside-code("Example").
doc = nlp(u'A phrase with another phrase occurs.')
chunks = list(doc.noun_chunks)
assert chunks[0].text == "A phrase"
assert chunks[1].text == "another phrase"
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Span]
+cell Noun chunks in the document.
+h(2, "sents") Doc.sents
+tag property
+tag-model("parse")
p
| Iterate over the sentences in the document. Sentence spans have no label.
| To improve accuracy on informal texts, spaCy calculates sentence boundaries
| from the syntactic dependency parse. If the parser is disabled,
| the #[code sents] iterator will be unavailable.
+aside-code("Example").
doc = nlp(u"This is a sentence. Here's another...")
sents = list(doc.sents)
assert len(sents) == 2
assert [s.root.text for s in sents] == ["is", "'s"]
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Span]
+cell Sentences in the document.
+h(2, "has_vector") Doc.has_vector
+tag property
+tag-model("vectors")
p
| A boolean value indicating whether a word vector is associated with the
| object.
+aside-code("Example").
doc = nlp(u'I like apples')
assert doc.has_vector
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell bool
+cell Whether the document has a vector data attached.
+h(2, "vector") Doc.vector
+tag property
+tag-model("vectors")
p
| A real-valued meaning representation. Defaults to an average of the
| token vectors.
+aside-code("Example").
doc = nlp(u'I like apples')
assert doc.vector.dtype == 'float32'
assert doc.vector.shape == (300,)
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the document's semantics.
+h(2, "vector_norm") Doc.vector_norm
+tag property
+tag-model("vectors")
p
| The L2 norm of the document's vector representation.
+aside-code("Example").
doc1 = nlp(u'I like apples')
doc2 = nlp(u'I like oranges')
doc1.vector_norm # 4.54232424414368
doc2.vector_norm # 3.304373298575751
assert doc1.vector_norm != doc2.vector_norm
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell float
+cell The L2 norm of the vector representation.
+h(2, "attributes") Attributes +h(2, "attributes") Attributes
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row
+cell #[code text]
+cell unicode
+cell A unicode representation of the document text.
+row
+cell #[code text_with_ws]
+cell unicode
+cell
| An alias of #[code Doc.text], provided for duck-type compatibility
| with #[code Span] and #[code Token].
+row +row
+cell #[code mem] +cell #[code mem]
+cell #[code Pool] +cell #[code Pool]
@ -17,6 +516,11 @@ p A container for accessing linguistic annotations.
+cell #[code Vocab] +cell #[code Vocab]
+cell The store of lexical types. +cell The store of lexical types.
+row
+cell #[code tensor]
+cell object
+cell Container for dense vector representations.
+row +row
+cell #[code user_data] +cell #[code user_data]
+cell - +cell -
@ -59,358 +563,3 @@ p A container for accessing linguistic annotations.
+cell +cell
| A dictionary that allows customisation of properties of | A dictionary that allows customisation of properties of
| #[code Span] children. | #[code Span] children.
+h(2, "init") Doc.__init__
+tag method
p Construct a #[code Doc] object.
+aside("Note")
| The most common way to get a #[code Doc] object is via the #[code nlp]
| object. This method is usually only used for deserialization or preset
| tokenization.
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code words]
+cell -
+cell A list of strings to add to the container.
+row
+cell #[code spaces]
+cell -
+cell
| A list of boolean values indicating whether each word has a
| subsequent space. Must have the same length as #[code words], if
| specified. Defaults to a sequence of #[code True].
+footrow
+cell return
+cell #[code Doc]
+cell The newly constructed object.
+h(2, "getitem") Doc.__getitem__
+tag method
p Get a #[code Token] object.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
assert doc[0].text == 'Give'
assert doc[-1].text == '.'
span = doc[1:1]
assert span.text == 'it back'
+table(["Name", "Type", "Description"])
+row
+cell #[code i]
+cell int
+cell The index of the token.
+footrow
+cell return
+cell #[code Token]
+cell The token at #[code doc[i]].
p Get a #[code Span] object.
+table(["Name", "Type", "Description"])
+row
+cell #[code start_end]
+cell tuple
+cell The slice of the document to get.
+footrow
+cell return
+cell #[code Span]
+cell The span at #[code doc[start : end]].
+h(2, "iter") Doc.__iter__
+tag method
p Iterate over #[code Token] objects.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Token]
+cell A #[code Token] object.
+h(2, "len") Doc.__len__
+tag method
p Get the number of tokens in the document.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell int
+cell The number of tokens in the document.
+h(2, "similarity") Doc.similarity
+tag method
p
| Make a semantic similarity estimate. The default estimate is cosine
| similarity using an average of word vectors.
+table(["Name", "Type", "Description"])
+row
+cell #[code other]
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell return
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "to_array") Doc.to_array
+tag method
p
| Export the document annotations to a numpy array of shape #[code N*M]
| where #[code N] is the length of the document and #[code M] is the number
| of attribute IDs to export. The values will be 32-bit integers.
+aside-code("Example").
from spacy import attrs
doc = nlp(text)
# All strings mapped to integers, for easy export to numpy
np_array = doc.to_array([attrs.LOWER, attrs.POS,
attrs.ENT_TYPE, attrs.IS_ALPHA])
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_ids]
+cell ints
+cell A list of attribute ID ints.
+footrow
+cell return
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+cell
| The exported attributes as a 2D numpy array, with one row per
| token and one column per attribute.
+h(2, "count_by") Doc.count_by
+tag method
p Count the frequencies of a given attribute.
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_id]
+cell int
+cell The attribute ID
+footrow
+cell return
+cell dict
+cell A dictionary mapping attributes to integer counts.
+h(2, "from_array") Doc.from_array
+tag method
p Load attributes from a numpy array.
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_ids]
+cell ints
+cell A list of attribute ID ints.
+row
+cell #[code values]
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+cell The attribute values to load.
+footrow
+cell return
+cell #[code None]
+cell -
+h(2, "to_bytes") Doc.to_bytes
+tag method
p Export the document contents to a binary string.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell bytes
+cell
| A losslessly serialized copy of the #[code Doc] including all
| annotations.
+h(2, "from_bytes") Doc.from_bytes
+tag method
p Import the document contents from a binary string.
+table(["Name", "Type", "Description"])
+row
+cell #[code byte_string]
+cell bytes
+cell The string to load from.
+footrow
+cell return
+cell #[code Doc]
+cell The #[code self] variable.
+h(2, "merge") Doc.merge
+tag method
p
| Retokenize the document, such that the span at
| #[code doc.text[start_idx : end_idx]] is merged into a single token. If
| #[code start_idx] and #[end_idx] do not mark start and end token
| boundaries, the document remains unchanged.
+table(["Name", "Type", "Description"])
+row
+cell #[code start_idx]
+cell int
+cell The character index of the start of the slice to merge.
+row
+cell #[code end_idx]
+cell int
+cell The character index after the end of the slice to merge.
+row
+cell #[code **attributes]
+cell -
+cell
| Attributes to assign to the merged token. By default,
| attributes are inherited from the syntactic root token of
| the span.
+footrow
+cell return
+cell #[code Token]
+cell
| The newly merged token, or None if the start and end
| indices did not fall at token boundaries
+h(2, "read_bytes") Doc.read_bytes
+tag staticmethod
p A static method, used to read serialized #[code Doc] objects from a file.
+aside-code("Example").
from spacy.tokens.doc import Doc
loc = 'test_serialize.bin'
with open(loc, 'wb') as file_:
file_.write(nlp(u'This is a document.').to_bytes())
file_.write(nlp(u'This is another.').to_bytes())
docs = []
with open(loc, 'rb') as file_:
for byte_string in Doc.read_bytes(file_):
docs.append(Doc(nlp.vocab).from_bytes(byte_string))
assert len(docs) == 2
+table(["Name", "Type", "Description"])
+row
+cell file
+cell buffer
+cell A binary buffer to read the serialized annotations from.
+footrow
+cell yield
+cell bytes
+cell Binary strings from with documents can be loaded.
+h(2, "text") Doc.text
+tag property
p A unicode representation of the document text.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell unicode
+cell The original verbatim text of the document.
+h(2, "text_with_ws") Doc.text_with_ws
+tag property
p
| An alias of #[code Doc.text], provided for duck-type compatibility with
| #[code Span] and #[code Token].
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell unicode
+cell The original verbatim text of the document.
+h(2, "sents") Doc.sents
+tag property
p Iterate over the sentences in the document.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Span]
+cell Sentences in the document.
+h(2, "ents") Doc.ents
+tag property
p Iterate over the entities in the document.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Span]
+cell Entities in the document.
+h(2, "noun_chunks") Doc.noun_chunks
+tag property
p
| Iterate over the base noun phrases in the document. A base noun phrase,
| or "NP chunk", is a noun phrase that does not permit other NPs to be
| nested within it.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Span]
+cell Noun chunks in the document
+h(2, "vector") Doc.vector
+tag property
p
| A real-valued meaning representation. Defaults to an average of the
| token vectors.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the document's semantics.
+h(2, "has_vector") Doc.has_vector
+tag property
p
| A boolean value indicating whether a word vector is associated with the
| object.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell bool
+cell Whether the document has a vector data attached.
View File
@ -4,32 +4,6 @@ include ../../_includes/_mixins
p Annotate named entities on #[code Doc] objects. p Annotate named entities on #[code Doc] objects.
+h(2, "load") EntityRecognizer.load
+tag classmethod
p Load the statistical model from the supplied path.
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell #[code Path]
+cell The path to load from.
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell The vocabulary. Must be shared by the documents to be processed.
+row
+cell #[code require]
+cell bool
+cell Whether to raise an error if the files are not found.
+footrow
+cell return
+cell #[code EntityRecognizer]
+cell The newly constructed object.
+h(2, "init") EntityRecognizer.__init__ +h(2, "init") EntityRecognizer.__init__
+tag method +tag method
@ -47,7 +21,7 @@ p Create an #[code EntityRecognizer].
+cell The statistical model. +cell The statistical model.
+footrow +footrow
+cell return +cell returns
+cell #[code EntityRecognizer] +cell #[code EntityRecognizer]
+cell The newly constructed object. +cell The newly constructed object.
@ -63,7 +37,7 @@ p Apply the entity recognizer, setting the NER tags onto the #[code Doc] object.
+cell The document to be processed. +cell The document to be processed.
+footrow +footrow
+cell return +cell returns
+cell #[code None] +cell #[code None]
+cell - +cell -
@ -91,7 +65,7 @@ p Process a stream of documents.
| parallel. | parallel.
+footrow +footrow
+cell yield +cell yields
+cell #[code Doc] +cell #[code Doc]
+cell Documents, in order. +cell Documents, in order.
@ -112,7 +86,7 @@ p Update the statistical model.
+cell The gold-standard annotations, to calculate the loss. +cell The gold-standard annotations, to calculate the loss.
+footrow +footrow
+cell return +cell returns
+cell int +cell int
+cell The loss on this example. +cell The loss on this example.
@ -128,6 +102,6 @@ p Set up a stepwise state, to introspect and control the transition sequence.
+cell The document to step through. +cell The document to step through.
+footrow +footrow
+cell return +cell returns
+cell #[code StepwiseState] +cell #[code StepwiseState]
+cell A state object, to step through the annotation process. +cell A state object, to step through the annotation process.
View File
@ -4,6 +4,72 @@ include ../../_includes/_mixins
p Collection for training annotations. p Collection for training annotations.
+h(2, "init") GoldParse.__init__
+tag method
p Create a GoldParse.
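p
| The sketch below shows both accepted formats for the #[code entities]
| argument described in the table below; the example text, character
| offsets and #[code 'LOC'] label are purely illustrative.
+aside-code("Example").
from spacy.gold import GoldParse
doc = nlp(u'I like London')
# entities as (start_char, end_char, label) offsets into doc.text ...
gold = GoldParse(doc, entities=[(7, 13, 'LOC')])
# ... or the same annotation as per-token BILUO tag strings
gold = GoldParse(doc, entities=['O', 'O', 'U-LOC'])
assert len(gold) == len(doc)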
+table(["Name", "Type", "Description"])
+row
+cell #[code doc]
+cell #[code Doc]
+cell The document the annotations refer to.
+row
+cell #[code words]
+cell iterable
+cell A sequence of unicode word strings.
+row
+cell #[code tags]
+cell iterable
+cell A sequence of strings, representing tag annotations.
+row
+cell #[code heads]
+cell iterable
+cell A sequence of integers, representing syntactic head offsets.
+row
+cell #[code deps]
+cell iterable
+cell A sequence of strings, representing the syntactic relation types.
+row
+cell #[code entities]
+cell iterable
+cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions.
+footrow
+cell returns
+cell #[code GoldParse]
+cell The newly constructed object.
+h(2, "len") GoldParse.__len__
+tag method
p Get the number of gold-standard tokens.
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell int
+cell The number of gold-standard tokens.
+h(2, "is_projective") GoldParse.is_projective
+tag property
p
| Whether the provided syntactic annotations form a projective dependency
| tree.
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell bool
+cell Whether annotations form projective tree.
+h(2, "attributes") Attributes +h(2, "attributes") Attributes
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
@ -37,67 +103,57 @@ p Collection for training annotations.
+cell list +cell list
+cell The alignment from gold tokenization to candidate tokenization. +cell The alignment from gold tokenization to candidate tokenization.
+h(2, "init") GoldParse.__init__
+tag method
p Create a GoldParse. +h(2, "util") Utilities
+h(3, "biluo_tags_from_offsets") gold.biluo_tags_from_offsets
+tag function
p
| Encode labelled spans into per-token tags, using the
| #[+a("/docs/api/annotation#biluo") BILUO scheme] (Begin/In/Last/Unit/Out).
p
| Returns a list of unicode strings, describing the tags. Each tag string
| will be of the form either #[code ""], #[code "O"] or
| #[code "{action}-{label}"], where action is one of #[code "B"],
| #[code "I"], #[code "L"], #[code "U"]. The string #[code &quot;-&quot;]
| is used where the entity offsets don't align with the tokenization in the
| #[code Doc] object. The training algorithm will view these as missing
| values. #[code O] denotes a non-entity token. #[code B] denotes the
| beginning of a multi-token entity, #[code I] the inside of an entity
| of three or more tokens, and #[code L] the end of an entity of two or
| more tokens. #[code U] denotes a single-token entity.
+aside-code("Example").
from spacy.gold import biluo_tags_from_offsets
text = 'I like London.'
entities = [(len('I like '), len('I like London'), 'LOC')]
doc = tokenizer(text)
tags = biluo_tags_from_offsets(doc, entities)
assert tags == ['O', 'O', 'U-LOC', 'O']
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code doc] +cell #[code doc]
+cell #[code Doc] +cell #[code Doc]
+cell The document the annotations refer to. +cell
| The document that the entity offsets refer to. The output tags
+row | will refer to the token boundaries within the document.
+cell #[code words]
+cell -
+cell A sequence of unicode word strings.
+row
+cell #[code tags]
+cell -
+cell A sequence of strings, representing tag annotations.
+row
+cell #[code heads]
+cell -
+cell A sequence of integers, representing syntactic head offsets.
+row
+cell #[code deps]
+cell -
+cell A sequence of strings, representing the syntactic relation types.
+row +row
+cell #[code entities] +cell #[code entities]
+cell - +cell iterable
+cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions. +cell
| A sequence of #[code (start, end, label)] triples. #[code start]
| and #[code end] should be character-offset integers denoting the
| slice into the original string.
+footrow +footrow
+cell return +cell returns
+cell #[code GoldParse] +cell list
+cell The newly constructed object. +cell
| Unicode strings, describing the
| #[+a("/docs/api/annotation#biluo") BILUO] tags.
+h(2, "len") GoldParse.__len__
+tag method
p Get the number of gold-standard tokens.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell int
+cell The number of gold-standard tokens.
+h(2, "is_projective") GoldParse.is_projective
+tag property
p
| Whether the provided syntactic annotations form a projective dependency
| tree.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell bool
+cell Whether annotations form projective tree.
View File
@ -2,79 +2,69 @@
include ../../_includes/_mixins include ../../_includes/_mixins
p A text processing pipeline. p
| A text-processing pipeline. Usually you'll load this once per process,
| and pass the instance around your application.
+h(2, "attributes") Attributes +h(2, "init") Language.__init__
+tag method
p Initialise a #[code Language] object.
+aside-code("Example").
from spacy.language import Language
nlp = Language(pipeline=['token_vectors', 'tags',
'dependencies'])
from spacy.lang.en import English
nlp = English()
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code vocab] +cell #[code vocab]
+cell #[code Vocab] +cell #[code Vocab]
+cell A container for the lexical types. +cell
| A #[code Vocab] object. If #[code True], a vocab is created via
+row | #[code Language.Defaults.create_vocab].
+cell #[code tokenizer]
+cell #[code Tokenizer]
+cell Find word boundaries and create #[code Doc] object.
+row
+cell #[code tagger]
+cell #[code Tagger]
+cell Annotate #[code Doc] objects with POS tags.
+row
+cell #[code parser]
+cell #[code DependencyParser]
+cell Annotate #[code Doc] objects with syntactic dependencies.
+row
+cell #[code entity]
+cell #[code EntityRecognizer]
+cell Annotate #[code Doc] objects with named entities.
+row
+cell #[code matcher]
+cell #[code Matcher]
+cell Rule-based sequence matcher.
+row +row
+cell #[code make_doc] +cell #[code make_doc]
+cell #[code lambda text: Doc] +cell callable
+cell Create a #[code Doc] object from unicode text. +cell
| A function that takes text and returns a #[code Doc] object.
| Usually a #[code Tokenizer].
+row +row
+cell #[code pipeline] +cell #[code pipeline]
+cell - +cell list
+cell Sequence of annotation functions. +cell
| A list of annotation processes or IDs of annotation processes,
| e.g. a #[code Tagger] object, or #[code 'tagger']. IDs are looked
| up in #[code Language.Defaults.factories].
+h(2, "init") Language.__init__
+tag method
p Create or load the pipeline.
+table(["Name", "Type", "Description"])
+row +row
+cell #[code **overrides] +cell #[code meta]
+cell - +cell dict
+cell Keyword arguments indicating which defaults to override. +cell
| Custom meta data for the #[code Language] class. Is written to by
| models to add model meta data.
+footrow +footrow
+cell return +cell returns
+cell #[code Language] +cell #[code Language]
+cell The newly constructed object. +cell The newly constructed object.
+h(2, "call") Language.__call__ +h(2, "call") Language.__call__
+tag method +tag method
p Apply the pipeline to a single text. p
| Apply the pipeline to some text. The text can span multiple sentences,
| and can contain arbitrary whitespace. Alignment into the original string
| is preserved.
+aside-code("Example"). +aside-code("Example").
from spacy.en import English doc = nlp(u'An example sentence. Another sentence.')
nlp = English() assert (doc[0].text, doc[0].head.tag_) == ('An', 'NN')
doc = nlp('An example sentence. Another example sentence.')
doc[0].orth_, doc[0].head.tag_
# ('An', 'NN')
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
@ -83,24 +73,104 @@ p Apply the pipeline to a single text.
+cell The text to be processed. +cell The text to be processed.
+row +row
+cell #[code tag] +cell #[code **disabled]
+cell bool +cell -
+cell Whether to apply the part-of-speech tagger. +cell Elements of the pipeline that should not be run.
+row
+cell #[code parse]
+cell bool
+cell Whether to apply the syntactic dependency parser.
+row
+cell #[code entity]
+cell bool
+cell Whether to apply the named entity recognizer.
+footrow +footrow
+cell return +cell returns
+cell #[code Doc] +cell #[code Doc]
+cell A container for accessing the linguistic annotations. +cell A container for accessing the annotations.
+h(2, "update") Language.update
+tag method
p Update the models in the pipeline.
+aside-code("Example").
with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
for epoch in trainer.epochs(gold):
for docs, golds in epoch:
state = nlp.update(docs, golds, sgd=optimizer)
+table(["Name", "Type", "Description"])
+row
+cell #[code docs]
+cell iterable
+cell A batch of #[code Doc] objects.
+row
+cell #[code golds]
+cell iterable
+cell A batch of #[code GoldParse] objects.
+row
+cell #[code drop]
+cell float
+cell The dropout rate.
+row
+cell #[code sgd]
+cell callable
+cell An optimizer.
+footrow
+cell returns
+cell dict
+cell Results from the update.
+h(2, "begin_training") Language.begin_training
+tag contextmanager
p
| Allocate models, pre-process training data and acquire a trainer and
| optimizer. Used as a contextmanager.
+aside-code("Example").
with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
for epoch in trainer.epochs(gold):
for docs, golds in epoch:
state = nlp.update(docs, golds, sgd=optimizer)
+table(["Name", "Type", "Description"])
+row
+cell #[code gold_tuples]
+cell iterable
+cell Gold-standard training data.
+row
+cell #[code **cfg]
+cell -
+cell Config parameters.
+footrow
+cell yields
+cell tuple
+cell A trainer and an optimizer.
+h(2, "use_params") Language.use_params
+tag contextmanager
+tag method
p
| Replace weights of models in the pipeline with those provided in the
| params dictionary. Can be used as a contextmanager, in which case, models
| go back to their original weights after the block.
+aside-code("Example").
with nlp.use_params(optimizer.averages):
nlp.to_disk('/tmp/checkpoint')
+table(["Name", "Type", "Description"])
+row
+cell #[code params]
+cell dict
+cell A dictionary of parameters keyed by model ID.
+row
+cell #[code **cfg]
+cell -
+cell Config parameters.
+h(2, "pipe") Language.pipe +h(2, "pipe") Language.pipe
+tag method +tag method
@ -133,22 +203,142 @@ p
+cell The number of texts to buffer. +cell The number of texts to buffer.
+footrow +footrow
+cell yield +cell yields
+cell #[code Doc] +cell #[code Doc]
+cell Containers for accessing the linguistic annotations. +cell Documents in the order of the original text.
+h(2, "save_to_directory") Language.save_to_directory +h(2, "to_disk") Language.to_disk
+tag method +tag method
p Save the #[code Vocab], #[code StringStore] and pipeline to a directory. p Save the current state to a directory.
+aside-code("Example").
nlp.to_disk('/path/to/models')
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code path] +cell #[code path]
+cell string or pathlib path +cell unicode or #[code Path]
+cell Path to save the model. +cell
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being saved.
+h(2, "from_disk") Language.from_disk
+tag method
p Loads state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
from spacy.language import Language
nlp = Language().from_disk('/path/to/models')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+footrow +footrow
+cell return +cell returns
+cell #[code None] +cell #[code Language]
+cell The modified #[code Language] object.
+h(2, "to_bytes") Language.to_bytes
+tag method
p Serialize the current state to a binary string.
+aside-code("Example").
nlp_bytes = nlp.to_bytes()
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell - +cell -
+cell Named attributes to prevent from being serialized.
+footrow
+cell returns
+cell bytes
+cell The serialized form of the #[code Language] object.
+h(2, "from_bytes") Language.from_bytes
+tag method
p Load state from a binary string.
+aside-code("Example").
from spacy.lang.en import English
nlp_bytes = nlp.to_bytes()
nlp2 = English()
nlp2.from_bytes(nlp_bytes)
+table(["Name", "Type", "Description"])
+row
+cell #[code bytes_data]
+cell bytes
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+footrow
+cell returns
+cell #[code Language]
+cell The #[code Language] object.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A container for the lexical types.
+row
+cell #[code make_doc]
+cell #[code lambda text: Doc]
+cell Create a #[code Doc] object from unicode text.
+row
+cell #[code pipeline]
+cell list
+cell Sequence of annotation functions.
+row
+cell #[code meta]
+cell dict
+cell
| Custom meta data for the Language class. If a model is loaded,
| contains meta data of the model.
+h(2, "class-attributes") Class attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code Defaults]
+cell class
+cell
| Settings, data and factory methods for creating the
| #[code nlp] object and processing pipeline.
+row
+cell #[code lang]
+cell unicode
+cell
| Two-letter language ID, i.e.
| #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code].
View File
@ -2,7 +2,154 @@
include ../../_includes/_mixins include ../../_includes/_mixins
p An entry in the vocabulary. p
| An entry in the vocabulary. A #[code Lexeme] has no string context; it's
| a word-type, as opposed to a word token. It therefore has no
| part-of-speech tag, dependency parse, or lemma (if lemmatization depends
| on the part-of-speech tag).
+h(2, "init") Lexeme.__init__
+tag method
p Create a #[code Lexeme] object.
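p
| Note that in practice you'll rarely call the constructor directly; the
| sketch below assumes the usual route of looking the lexeme up in the
| vocabulary.
+aside-code("Example").
apple = nlp.vocab[u'apple']
assert apple.text == u'apple'
assert apple.orth == nlp.vocab.strings[u'apple']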
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell The parent vocabulary.
+row
+cell #[code orth]
+cell int
+cell The orth id of the lexeme.
+footrow
+cell returns
+cell #[code Lexeme]
+cell The newly constructed object.
+h(2, "set_flag") Lexeme.set_flag
+tag method
p Change the value of a boolean flag.
+aside-code("Example").
COOL_FLAG = nlp.vocab.add_flag(lambda text: False)
nlp.vocab[u'spaCy'].set_flag(COOL_FLAG, True)
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to set.
+row
+cell #[code value]
+cell bool
+cell The new value of the flag.
+h(2, "check_flag") Lexeme.check_flag
+tag method
p Check the value of a boolean flag.
+aside-code("Example").
is_my_library = lambda text: text in ['spaCy', 'Thinc']
MY_LIBRARY = nlp.vocab.add_flag(is_my_library)
assert nlp.vocab[u'spaCy'].check_flag(MY_LIBRARY) == True
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to query.
+footrow
+cell returns
+cell bool
+cell The value of the flag.
+h(2, "similarity") Lexeme.similarity
+tag method
+tag-model("vectors")
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
+aside-code("Example").
apple = nlp.vocab[u'apple']
orange = nlp.vocab[u'orange']
apple_orange = apple.similarity(orange)
orange_apple = orange.similarity(apple)
assert apple_orange == orange_apple
+table(["Name", "Type", "Description"])
+row
+cell other
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell returns
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "has_vector") Lexeme.has_vector
+tag property
+tag-model("vectors")
p
| A boolean value indicating whether a word vector is associated with the
| lexeme.
+aside-code("Example").
apple = nlp.vocab[u'apple']
assert apple.has_vector
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell bool
+cell Whether the lexeme has a vector data attached.
+h(2, "vector") Lexeme.vector
+tag property
+tag-model("vectors")
p A real-valued meaning representation.
+aside-code("Example").
apple = nlp.vocab[u'apple']
assert apple.vector.dtype == 'float32'
assert apple.vector.shape == (300,)
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the lexeme's semantics.
+h(2, "vector_norm") Lexeme.vector_norm
+tag property
+tag-model("vectors")
p The L2 norm of the lexeme's vector representation.
+aside-code("Example").
apple = nlp.vocab[u'apple']
pasta = nlp.vocab[u'pasta']
apple.vector_norm # 7.1346845626831055
pasta.vector_norm # 7.759851932525635
assert apple.vector_norm != pasta.vector_norm
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell float
+cell The L2 norm of the vector representation.
+h(2, "attributes") Attributes +h(2, "attributes") Attributes
@ -12,6 +159,16 @@ p An entry in the vocabulary.
+cell #[code Vocab] +cell #[code Vocab]
+cell +cell
+row
+cell #[code text]
+cell unicode
+cell Verbatim text content.
+row
+cell #[code lex_id]
+cell int
+cell ID of the lexeme's lexical type.
+row +row
+cell #[code lower] +cell #[code lower]
+cell int +cell int
@ -124,116 +281,9 @@ p An entry in the vocabulary.
+row +row
+cell #[code prob] +cell #[code prob]
+cell float +cell float
+cell Smoothed log probability estimate of token's type. +cell Smoothed log probability estimate of lexeme's type.
+row +row
+cell #[code sentiment] +cell #[code sentiment]
+cell float +cell float
+cell A scalar value indicating the positivity or negativity of the token. +cell A scalar value indicating the positivity or negativity of the lexeme.
+row
+cell #[code lex_id]
+cell int
+cell ID of the token's lexical type.
+row
+cell #[code text]
+cell unicode
+cell Verbatim text content.
+h(2, "init") Lexeme.__init__
+tag method
p Create a #[code Lexeme] object.
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell The parent vocabulary.
+row
+cell #[code orth]
+cell int
+cell The orth id of the lexeme.
+footrow
+cell return
+cell #[code Lexeme]
+cell The newly constructed object.
+h(2, "set_flag") Lexeme.set_flag
+tag method
p Change the value of a boolean flag.
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to set.
+row
+cell #[code value]
+cell bool
+cell The new value of the flag.
+footrow
+cell return
+cell #[code None]
+cell -
+h(2, "check_flag") Lexeme.check_flag
+tag method
p Check the value of a boolean flag.
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to query.
+footrow
+cell return
+cell bool
+cell The value of the flag.
+h(2, "similarity") Lexeme.similarity
+tag method
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
+table(["Name", "Type", "Description"])
+row
+cell #[code other]
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell return
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "vector") Lexeme.vector
+tag property
p A real-valued meaning representation.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A real-valued meaning representation.
+h(2, "has_vector") Lexeme.has_vector
+tag property
p A boolean value indicating whether a word vector is associated with the object.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell bool
+cell Whether a word vector is associated with the object.
View File
@ -4,31 +4,26 @@ include ../../_includes/_mixins
p Match sequences of tokens, based on pattern rules. p Match sequences of tokens, based on pattern rules.
+h(2, "load") Matcher.load +infobox("⚠️ Deprecation note")
+tag classmethod | As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
| are deprecated and have been replaced with a simpler
p Load the matcher and patterns from a file path. | #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
| patterns and a callback for a given match ID. #[code Matcher.get_entity]
+table(["Name", "Type", "Description"]) | is now called #[+api("matcher#get") #[code matcher.get]].
+row | #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
+cell #[code path] | and #[code Matcher.has_entity] (now redundant) have been removed.
+cell #[code Path]
+cell Path to a JSON-formatted patterns file.
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell The vocabulary that the documents to match over will refer to.
+footrow
+cell return
+cell #[code Matcher]
+cell The newly constructed object.
+h(2, "init") Matcher.__init__ +h(2, "init") Matcher.__init__
+tag method +tag method
p Create the Matcher. p Create the rule-based #[code Matcher].
+aside-code("Example").
from spacy.matcher import Matcher
from spacy.attrs import LOWER
patterns = {"HelloWorld": [{LOWER: "hello"}, {LOWER: "world"}]}
matcher = Matcher(nlp.vocab)
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
@ -41,17 +36,38 @@ p Create the Matcher.
+row +row
+cell #[code patterns] +cell #[code patterns]
+cell dict +cell dict
+cell Patterns to add to the matcher. +cell Patterns to add to the matcher, keyed by ID.
+footrow +footrow
+cell return +cell returns
+cell #[code Matcher] +cell #[code Matcher]
+cell The newly constructed object. +cell The newly constructed object.
+h(2, "call") Matcher.__call__ +h(2, "call") Matcher.__call__
+tag method +tag method
p Find all token sequences matching the supplied patterns on the Doc. p Find all token sequences matching the supplied patterns on the #[code Doc].
+aside-code("Example").
from spacy.matcher import Matcher
from spacy.attrs import LOWER
matcher = Matcher(nlp.vocab)
pattern = [{LOWER: "hello"}, {LOWER: "world"}]
matcher.add("HelloWorld", on_match=None, pattern)
doc = nlp(u'hello world!')
matches = matcher(doc)
+infobox("Important note")
| By default, the matcher #[strong does not perform any action] on matches,
| like tagging matched phrases with entity types. Instead, actions need to
| be specified when #[strong adding patterns or entities], by
| passing in a callback function as the #[code on_match] argument on
| #[+api("matcher#add") #[code add]]. This allows you to define custom
| actions per pattern within the same matcher. For example, you might only
| want to merge some entity types, and set custom flags for other matched
| patterns. For more details and examples, see the usage workflow on
| #[+a("/docs/usage/rule-based-matching") rule-based matching].
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
@ -60,23 +76,28 @@ p Find all token sequences matching the supplied patterns on the Doc.
+cell The document to match over. +cell The document to match over.
+footrow +footrow
+cell return +cell returns
+cell list +cell list
+cell +cell
| A list of#[code (entity_key, label_id, start, end)] tuples, | A list of #[code (match_id, start, end)] tuples, describing the
| describing the matches. A match tuple describes a | matches. A match tuple describes a span #[code doc[start:end]].
| #[code span doc[start:end]]. The #[code label_id] and | The #[code match_id] is the ID of the added match pattern.
| #[code entity_key] are both integers.
+h(2, "pipe") Matcher.pipe +h(2, "pipe") Matcher.pipe
+tag method +tag method
p Match a stream of documents, yielding them in turn. p Match a stream of documents, yielding them in turn.
+aside-code("Example").
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
for doc in matcher.pipe(texts, batch_size=50, n_threads=4):
pass
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code docs] +cell #[code docs]
+cell - +cell iterable
+cell A stream of documents. +cell A stream of documents.
+row +row
@ -93,87 +114,132 @@ p Match a stream of documents, yielding them in turn.
| multi-threading. | multi-threading.
+footrow +footrow
+cell yield +cell yields
+cell #[code Doc] +cell #[code Doc]
+cell Documents, in order. +cell Documents, in order.
+h(2, "add_entity") Matcher.add_entity +h(2, "len") Matcher.__len__
+tag method +tag method
p Add an entity to the matcher. p
| Get the number of rules added to the matcher. Note that this only returns
| the number of rules (identical with the number of IDs), not the number
| of individual patterns.
+aside-code("Example").
matcher = Matcher(nlp.vocab)
assert len(matcher) == 0
matcher.add('Rule', None, [{ORTH: 'test'}])
assert len(matcher) == 1
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell int
+cell The number of rules.
+h(2, "contains") Matcher.__contains__
+tag method
p Check whether the matcher contains rules for a match ID.
+aside-code("Example").
matcher = Matcher(nlp.vocab)
assert 'Rule' not in matcher
matcher.add('Rule', None, [{ORTH: 'test'}])
assert 'Rule' in matcher
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code entity_key] +cell #[code key]
+cell unicode / int
+cell An ID for the entity.
+row
+cell #[code attrs]
+cell -
+cell Attributes to associate with the Matcher.
+row
+cell #[code if_exists]
+cell unicode +cell unicode
+cell +cell The match ID.
| #[code 'raise'], #[code 'ignore'] or #[code 'update']. Controls +footrow
| what happens if the entity ID already exists. Defaults to +cell returns
| #[code 'raise']. +cell int
+cell Whether the matcher contains rules for this match ID.
+h(2, "add") Matcher.add
+tag method
p
| Add a rule to the matcher, consisting of an ID key, one or more patterns, and
| a callback function to act on the matches. The callback function will
| receive the arguments #[code matcher], #[code doc], #[code i] and
| #[code matches]. If a pattern already exists for the given ID, the
| patterns will be extended. An #[code on_match] callback will be
| overwritten.
+aside-code("Example").
def on_match(matcher, doc, id, matches):
print('Matched!', matches)
matcher = Matcher(nlp.vocab)
matcher.add('HelloWorld', on_match, [{LOWER: "hello"}, {LOWER: "world"}])
matcher.add('GoogleMaps', on_match, [{ORTH: "Google"}, {ORTH: "Maps"}])
doc = nlp(u'HELLO WORLD on Google Maps.')
matches = matcher(doc)
+table(["Name", "Type", "Description"])
+row +row
+cell #[code acceptor] +cell #[code match_id]
+cell - +cell unicode
+cell Callback function to filter matches of the entity. +cell An ID for the thing you're matching.
+row +row
+cell #[code on_match] +cell #[code on_match]
+cell - +cell callable or #[code None]
+cell Callback function to act on matches of the entity. +cell
| Callback function to act on matches. Takes the arguments
| #[code matcher], #[code doc], #[code i] and #[code matches].
+footrow +row
+cell return +cell #[code *patterns]
+cell #[code None] +cell list
+cell - +cell
| Match pattern. A pattern consists of a list of dicts, where each
| dict describes a token.
+h(2, "add_pattern") Matcher.add_pattern +h(2, "remove") Matcher.remove
+tag method +tag method
p Add a pattern to the matcher. p
| Remove a rule from the matcher. A #[code KeyError] is raised if the match
| ID does not exist.
+aside-code("Example").
matcher.add('Rule', None, [{ORTH: 'test'}])
assert 'Rule' in matcher
matcher.remove('Rule')
assert 'Rule' not in matcher
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code entity_key] +cell #[code key]
+cell unicode / int +cell unicode
+cell An ID for the entity. +cell The ID of the match rule.
+row +h(2, "get") Matcher.get
+cell #[code token_specs]
+cell -
+cell Description of the pattern to be matched.
+row
+cell #[code label]
+cell unicode / int
+cell Label to assign to the matched pattern. Defaults to #[code ""].
+footrow
+cell return
+cell #[code None]
+cell -
+h(2, "has_entity") Matcher.has_entity
+tag method +tag method
p Check whether the matcher has an entity. p
| Retrieve the pattern stored for a key. Returns the rule as an
| #[code (on_match, patterns)] tuple containing the callback and available
| patterns.
+aside-code("Example").
pattern = [{ORTH: 'test'}]
matcher.add('Rule', None, pattern)
(on_match, patterns) = matcher.get('Rule')
assert patterns == [pattern]
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code entity_key] +cell #[code key]
+cell unicode / int +cell unicode
+cell The entity key to check. +cell The ID of the match rule.
+footrow +footrow
+cell return +cell returns
+cell bool +cell tuple
+cell Whether the matcher has the entity. +cell The rule, as an #[code (on_match, patterns)] tuple.
View File
@ -0,0 +1,95 @@
//- 💫 DOCS > API > SPACY
include ../../_includes/_mixins
+h(2, "load") spacy.load
+tag function
+tag-model
p
| Load a model via its #[+a("/docs/usage/models#usage") shortcut link],
| the name of an installed
| #[+a("/docs/usage/saving-loading#generating") model package], a unicode
| path or a #[code Path]-like object. spaCy will try resolving the load
| argument in this order. The #[code Language] class to initialise will be
| determined based on the model's settings.
+aside-code("Example").
nlp = spacy.load('en') # shortcut link
nlp = spacy.load('en_core_web_sm') # package
nlp = spacy.load('/path/to/en') # unicode path
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
+infobox("⚠️ Deprecation note")
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
| will also raise an error if no model could be loaded and never just
| return an empty #[code Language] object. If you need a blank language,
| you need to import it explicitly (#[code from spacy.lang.en import English])
| or use #[+api("util#get_lang_class") #[code util.get_lang_class]].
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode or #[code Path]
+cell Model to load, i.e. shortcut link, package name or path.
+footrow
+cell returns
+cell #[code Language]
+cell A #[code Language] object with the loaded model.
+h(2, "info") spacy.info
+tag function
p
| The same as the #[+api("cli#info") #[code info] command]. Pretty-print
| information about your installation, models and local setup from within
| spaCy. To get the model meta data as a dictionary instead, you can
| use the #[code meta] attribute on your #[code nlp] object with a
| loaded model, e.g. #[code nlp.meta].
+aside-code("Example").
spacy.info()
spacy.info('en')
spacy.info('de', markdown=True)
+table(["Name", "Type", "Description"])
+row
+cell #[code model]
+cell unicode
+cell A model, i.e. shortcut link, package name or path (optional).
+row
+cell #[code markdown]
+cell bool
+cell Print information as Markdown.
+h(2, "explain") spacy.explain
+tag function
p
| Get a description for a given POS tag, dependency label or entity type.
| For a list of available terms, see
| #[+src(gh("spacy", "spacy/glossary.py")) glossary.py].
+aside-code("Example").
spacy.explain('NORP')
# Nationalities or religious or political groups
doc = nlp(u'Hello world')
for word in doc:
print(word.text, word.tag_, spacy.explain(word.tag_))
# Hello UH interjection
# world NN noun, singular or mass
+table(["Name", "Type", "Description"])
+row
+cell #[code term]
+cell unicode
+cell Term to explain.
+footrow
+cell returns
+cell unicode
+cell The explanation, or #[code None] if not found in the glossary.
View File
@ -2,66 +2,18 @@
include ../../_includes/_mixins include ../../_includes/_mixins
p A slice from a #[code Doc] object. p A slice from a #[+api("doc") #[code Doc]] object.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code doc]
+cell #[code Doc]
+cell The parent document.
+row
+cell #[code start]
+cell int
+cell The token offset for the start of the span.
+row
+cell #[code end]
+cell int
+cell The token offset for the end of the span.
+row
+cell #[code start_char]
+cell int
+cell The character offset for the start of the span.
+row
+cell #[code end_char]
+cell int
+cell The character offset for the end of the span.
+row
+cell #[code label]
+cell int
+cell The span's label.
+row
+cell #[code label_]
+cell unicode
+cell The span's label.
+row
+cell #[code lemma_]
+cell unicode
+cell The span's lemma.
+row
+cell #[code ent_id]
+cell int
+cell The integer ID of the named entity the token is an instance of.
+row
+cell #[code ent_id_]
+cell unicode
+cell The string ID of the named entity the token is an instance of.
+h(2, "init") Span.__init__ +h(2, "init") Span.__init__
+tag method +tag method
p Create a Span object from the #[code slice doc[start : end]]. p Create a Span object from the #[code slice doc[start : end]].
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
span = doc[1:4]
assert [t.text for t in span] == [u'it', u'back', u'!']
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code doc] +cell #[code doc]
@ -89,7 +41,7 @@ p Create a Span object from the #[code slice doc[start : end]].
+cell A meaning representation of the span. +cell A meaning representation of the span.
+footrow +footrow
+cell return +cell returns
+cell #[code Span] +cell #[code Span]
+cell The newly constructed object. +cell The newly constructed object.
@ -98,6 +50,11 @@ p Create a Span object from the #[code slice doc[start : end]].
p Get a #[code Token] object. p Get a #[code Token] object.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
span = doc[1:4]
assert span[1].text == 'back'
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code i] +cell #[code i]
@ -105,12 +62,17 @@ p Get a #[code Token] object.
+cell The index of the token within the span. +cell The index of the token within the span.
+footrow +footrow
+cell return +cell returns
+cell #[code Token] +cell #[code Token]
+cell The token at #[code span[i]]. +cell The token at #[code span[i]].
p Get a #[code Span] object. p Get a #[code Span] object.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
span = doc[1:4]
assert span[1:3].text == 'back!'
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code start_end] +cell #[code start_end]
@ -118,7 +80,7 @@ p Get a #[code Span] object.
+cell The slice of the span to get. +cell The slice of the span to get.
+footrow +footrow
+cell return +cell returns
+cell #[code Span] +cell #[code Span]
+cell The span at #[code span[start : end]]. +cell The span at #[code span[start : end]].
@ -127,9 +89,14 @@ p Get a #[code Span] object.
p Iterate over #[code Token] objects. p Iterate over #[code Token] objects.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
span = doc[1:4]
assert [t.text for t in span] == ['it', 'back', '!']
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+footrow +footrow
+cell yield +cell yields
+cell #[code Token] +cell #[code Token]
+cell A #[code Token] object. +cell A #[code Token] object.
@ -138,19 +105,33 @@ p Iterate over #[code Token] objects.
p Get the number of tokens in the span. p Get the number of tokens in the span.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
span = doc[1:4]
assert len(span) == 3
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+footrow +footrow
+cell return +cell returns
+cell int +cell int
+cell The number of tokens in the span. +cell The number of tokens in the span.
+h(2, "similarity") Span.similarity +h(2, "similarity") Span.similarity
+tag method +tag method
+tag-model("vectors")
p p
| Make a semantic similarity estimate. The default estimate is cosine | Make a semantic similarity estimate. The default estimate is cosine
| similarity using an average of word vectors. | similarity using an average of word vectors.
+aside-code("Example").
doc = nlp(u'green apples and red oranges')
green_apples = doc[:2]
red_oranges = doc[3:]
apples_oranges = green_apples.similarity(red_oranges)
oranges_apples = red_oranges.similarity(green_apples)
assert apples_oranges == oranges_apples
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code other] +cell #[code other]
@ -160,7 +141,7 @@ p
| #[code Span], #[code Token] and #[code Lexeme] objects. | #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow +footrow
+cell return +cell returns
+cell float +cell float
+cell A scalar similarity score. Higher is more similar. +cell A scalar similarity score. Higher is more similar.
@ -178,87 +159,205 @@ p Retokenize the document, such that the span is merged into a single token.
| are inherited from the syntactic root token of the span. | are inherited from the syntactic root token of the span.
+footrow +footrow
+cell return +cell returns
+cell #[code Token] +cell #[code Token]
+cell The newly merged token. +cell The newly merged token.
+h(2, "text") Span.text
+tag property
p A unicode representation of the span text.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell unicode
+cell The original verbatim text of the span.
+h(2, "text_with_ws") Span.text_with_ws
+tag property
p
| The text content of the span with a trailing whitespace character if the
| last token has one.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell unicode
+cell The text content of the span (with trailing whitespace).
+h(2, "sent") Span.sent
+tag property
p The sentence span that this span is a part of.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell #[code Span]
+cell The sentence this is part of.
+h(2, "root") Span.root +h(2, "root") Span.root
+tag property +tag property
+tag-model("parse")
p p
| The token within the span that's highest in the parse tree. If there's a | The token within the span that's highest in the parse tree. If there's a
| tie, the earliest is preferred. | tie, the earliest is preferred.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
i, like, new, york, in_, autumn, dot = range(len(doc))
assert doc[new].head.text == 'York'
assert doc[york].head.text == 'like'
new_york = doc[new&#58;york+1]
assert new_york.root.text == 'York'
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+footrow +footrow
+cell return +cell returns
+cell #[code Token] +cell #[code Token]
+cell The root token. +cell The root token.
+h(2, "lefts") Span.lefts +h(2, "lefts") Span.lefts
+tag property +tag property
+tag-model("parse")
p Tokens that are to the left of the span, whose head is within the span. p Tokens that are to the left of the span, whose head is within the span.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
lefts = [t.text for t in doc[3:7].lefts]
assert lefts == [u'New']
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+footrow +footrow
+cell yield +cell yields
+cell #[code Token] +cell #[code Token]
+cell A left-child of a token of the span. +cell A left-child of a token of the span.
+h(2, "rights") Span.rights +h(2, "rights") Span.rights
+tag property +tag property
+tag-model("parse")
p Tokens that are to the right of the span, whose head is within the span. p Tokens that are to the right of the span, whose head is within the span.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
rights = [t.text for t in doc[2:4].rights]
assert rights == [u'in']
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+footrow +footrow
+cell yield +cell yields
+cell #[code Token] +cell #[code Token]
+cell A right-child of a token of the span. +cell A right-child of a token of the span.
+h(2, "subtree") Span.subtree +h(2, "subtree") Span.subtree
+tag property +tag property
+tag-model("parse")
p Tokens within the span and tokens that descend from them. p Tokens within the span and tokens that descend from them.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
subtree = [t.text for t in doc[:3].subtree]
assert subtree == [u'Give', u'it', u'back', u'!']
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+footrow +footrow
+cell yield +cell yields
+cell #[code Token] +cell #[code Token]
+cell A descendant of a token within the span. +cell A descendant of a token within the span.
+h(2, "has_vector") Span.has_vector
+tag property
+tag-model("vectors")
p
| A boolean value indicating whether a word vector is associated with the
| object.
+aside-code("Example").
doc = nlp(u'I like apples')
assert doc[1:].has_vector
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell bool
+cell Whether the span has vector data attached.
+h(2, "vector") Span.vector
+tag property
+tag-model("vectors")
p
| A real-valued meaning representation. Defaults to an average of the
| token vectors.
+aside-code("Example").
doc = nlp(u'I like apples')
assert doc[1:].vector.dtype == 'float32'
assert doc[1:].vector.shape == (300,)
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the span's semantics.
+h(2, "vector_norm") Span.vector_norm
+tag property
+tag-model("vectors")
p
| The L2 norm of the span's vector representation.
+aside-code("Example").
doc = nlp(u'I like apples')
doc[1:].vector_norm # 4.800883928527915
doc[2:].vector_norm # 6.895897646384268
assert doc[1:].vector_norm != doc[2:].vector_norm
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell float
+cell The L2 norm of the vector representation.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code doc]
+cell #[code Doc]
+cell The parent document.
+row
+cell #[code sent]
+cell #[code Span]
+cell The sentence span that this span is a part of.
+row
+cell #[code start]
+cell int
+cell The token offset for the start of the span.
+row
+cell #[code end]
+cell int
+cell The token offset for the end of the span.
+row
+cell #[code start_char]
+cell int
+cell The character offset for the start of the span.
+row
+cell #[code end_char]
+cell int
+cell The character offset for the end of the span.
+row
+cell #[code text]
+cell unicode
+cell A unicode representation of the span text.
+row
+cell #[code text_with_ws]
+cell unicode
+cell
| The text content of the span with a trailing whitespace character
| if the last token has one.
+row
+cell #[code label]
+cell int
+cell The span's label.
+row
+cell #[code label_]
+cell unicode
+cell The span's label.
+row
+cell #[code lemma_]
+cell unicode
+cell The span's lemma.
+row
+cell #[code ent_id]
+cell int
+cell The integer ID of the named entity the token is an instance of.
+row
+cell #[code ent_id_]
+cell unicode
+cell The string ID of the named entity the token is an instance of.

View File

@ -7,16 +7,22 @@ p Map strings to and from integer IDs.
+h(2, "init") StringStore.__init__ +h(2, "init") StringStore.__init__
+tag method +tag method
p Create the #[code StringStore]. p
| Create the #[code StringStore]. Note that a newly initialised store will
| always include an empty string #[code ''] at position #[code 0].
+aside-code("Example").
from spacy.strings import StringStore
stringstore = StringStore([u'apple', u'orange'])
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code strings] +cell #[code strings]
+cell - +cell iterable
+cell A sequence of unicode strings to add to the store. +cell A sequence of unicode strings to add to the store.
+footrow +footrow
+cell return +cell returns
+cell #[code StringStore] +cell #[code StringStore]
+cell The newly constructed object. +cell The newly constructed object.
@ -25,9 +31,13 @@ p Create the #[code StringStore].
p Get the number of strings in the store. p Get the number of strings in the store.
+aside-code("Example").
stringstore = StringStore([u'apple', u'orange'])
assert len(stringstore) == 2
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+footrow +footrow
+cell return +cell returns
+cell int +cell int
+cell The number of strings in the store. +cell The number of strings in the store.
@ -36,22 +46,32 @@ p Get the number of strings in the store.
p Retrieve a string from a given integer ID, or vice versa. p Retrieve a string from a given integer ID, or vice versa.
+aside-code("Example").
stringstore = StringStore([u'apple', u'orange'])
int_id = stringstore[u'apple'] # 1
assert stringstore[int_id] == u'apple'
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code string_or_id] +cell #[code string_or_id]
+cell bytes / unicode / int +cell bytes, unicode or int
+cell The value to encode. +cell The value to encode.
+footrow +footrow
+cell return +cell returns
+cell unicode / int +cell unicode or int
+cell The value to retrieved. +cell The value to be retrieved.
+h(2, "contains") StringStore.__contains__ +h(2, "contains") StringStore.__contains__
+tag method +tag method
p Check whether a string is in the store. p Check whether a string is in the store.
+aside-code("Example").
stringstore = StringStore([u'apple', u'orange'])
assert u'apple' in stringstore == True
assert u'cherry' in stringstore == False
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code string] +cell #[code string]
@ -59,49 +79,108 @@ p Check whether a string is in the store.
+cell The string to check. +cell The string to check.
+footrow +footrow
+cell return +cell returns
+cell bool +cell bool
+cell Whether the store contains the string. +cell Whether the store contains the string.
+h(2, "iter") StringStore.__iter__ +h(2, "iter") StringStore.__iter__
+tag method +tag method
p Iterate over the strings in the store, in order. p
| Iterate over the strings in the store, in order. Note that a newly
| initialised store will always include an empty string #[code ''] at
| position #[code 0].
+aside-code("Example").
stringstore = StringStore([u'apple', u'orange'])
all_strings = [s for s in stringstore]
assert all_strings == [u'', u'apple', u'orange']
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+footrow +footrow
+cell yield +cell yields
+cell unicode +cell unicode
+cell A string in the store. +cell A string in the store.
+h(2, "dump") StringStore.dump +h(2, "to_disk") StringStore.to_disk
+tag method +tag method
p Save the strings to a JSON file. p Save the current state to a directory.
+aside-code("Example").
stringstore.to_disk('/path/to/strings')
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code file] +cell #[code path]
+cell buffer +cell unicode or #[code Path]
+cell The file to save the strings. +cell
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+footrow +h(2, "from_disk") StringStore.from_disk
+cell return
+cell #[code None]
+cell -
+h(2, "load") StringStore.load
+tag method +tag method
p Load the strings from a JSON file. p Loads state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
from spacy.strings import StringStore
stringstore = StringStore().from_disk('/path/to/strings')
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code file] +cell #[code path]
+cell buffer +cell unicode or #[code Path]
+cell The file from which to load the strings. +cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
+footrow +footrow
+cell return +cell returns
+cell #[code None] +cell #[code StringStore]
+cell The modified #[code StringStore] object.
+h(2, "to_bytes") StringStore.to_bytes
+tag method
p Serialize the current state to a binary string.
+aside-code("Example").
store_bytes = stringstore.to_bytes()
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell - +cell -
+cell Named attributes to prevent from being serialized.
+footrow
+cell returns
+cell bytes
+cell The serialized form of the #[code StringStore] object.
+h(2, "from_bytes") StringStore.from_bytes
+tag method
p Load state from a binary string.
+aside-code("Example").
from spacy.strings import StringStore
store_bytes = stringstore.to_bytes()
new_store = StringStore().from_bytes(store_bytes)
+table(["Name", "Type", "Description"])
+row
+cell #[code bytes_data]
+cell bytes
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+footrow
+cell returns
+cell #[code StringStore]
+cell The #[code StringStore] object.

View File

@ -4,32 +4,6 @@ include ../../_includes/_mixins
p Annotate part-of-speech tags on #[code Doc] objects. p Annotate part-of-speech tags on #[code Doc] objects.
+h(2, "load") Tagger.load
+tag classmethod
p Load the statistical model from the supplied path.
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell #[code Path]
+cell The path to load from.
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell The vocabulary. Must be shared by the documents to be processed.
+row
+cell #[code require]
+cell bool
+cell Whether to raise an error if the files are not found.
+footrow
+cell return
+cell #[code Tagger]
+cell The newly constructed object.
+h(2, "init") Tagger.__init__ +h(2, "init") Tagger.__init__
+tag method +tag method
@ -47,7 +21,7 @@ p Create a #[code Tagger].
+cell The statistical model. +cell The statistical model.
+footrow +footrow
+cell return +cell returns
+cell #[code Tagger] +cell #[code Tagger]
+cell The newly constructed object. +cell The newly constructed object.
@ -63,7 +37,7 @@ p Apply the tagger, setting the POS tags onto the #[code Doc] object.
+cell The tokens to be tagged. +cell The tokens to be tagged.
+footrow +footrow
+cell return +cell returns
+cell #[code None] +cell #[code None]
+cell - +cell -
@ -91,7 +65,7 @@ p Tag a stream of documents.
| parallel. | parallel.
+footrow +footrow
+cell yield +cell yields
+cell #[code Doc] +cell #[code Doc]
+cell Documents, in order. +cell Documents, in order.
@ -112,6 +86,6 @@ p Update the statistical model, with tags supplied for the given document.
+cell Manager for the gold-standard tags. +cell Manager for the gold-standard tags.
+footrow +footrow
+cell return +cell returns
+cell int +cell int
+cell Number of tags predicted correctly. +cell Number of tags predicted correctly.

View File

@ -4,9 +4,296 @@ include ../../_includes/_mixins
p An individual token — i.e. a word, punctuation symbol, whitespace, etc. p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
+h(2, "init") Token.__init__
+tag method
p Construct a #[code Token] object.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
token = doc[0]
assert token.text == u'Give'
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code doc]
+cell #[code Doc]
+cell The parent document.
+row
+cell #[code offset]
+cell int
+cell The index of the token within the document.
+footrow
+cell returns
+cell #[code Token]
+cell The newly constructed object.
+h(2, "len") Token.__len__
+tag method
p The number of unicode characters in the token, i.e. #[code token.text].
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
token = doc[0]
assert len(token) == 4
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell int
+cell The number of unicode characters in the token.
+h(2, "check_flag") Token.check_flag
+tag method
p Check the value of a boolean flag.
+aside-code("Example").
from spacy.attrs import IS_TITLE
doc = nlp(u'Give it back! He pleaded.')
token = doc[0]
assert token.check_flag(IS_TITLE) == True
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to check.
+footrow
+cell returns
+cell bool
+cell Whether the flag is set.
+h(2, "similarity") Token.similarity
+tag method
+tag-model("vectors")
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
+aside-code("Example").
apples, _, oranges = nlp(u'apples and oranges')
apples_oranges = apples.similarity(oranges)
oranges_apples = oranges.similarity(apples)
assert apples_oranges == oranges_apples
+table(["Name", "Type", "Description"])
+row
+cell other
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell returns
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "nbor") Token.nbor
+tag method
p Get a neighboring token.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
give_nbor = doc[0].nbor()
assert give_nbor.text == u'it'
+table(["Name", "Type", "Description"])
+row
+cell #[code i]
+cell int
+cell The relative position of the token to get. Defaults to #[code 1].
+footrow
+cell returns
+cell #[code Token]
+cell The token at position #[code self.doc[self.i+i]].
+h(2, "is_ancestor") Token.is_ancestor
+tag method
+tag-model("parse")
p
| Check whether this token is a parent, grandparent, etc. of another
| in the dependency tree.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
give = doc[0]
it = doc[1]
assert give.is_ancestor(it)
+table(["Name", "Type", "Description"])
+row
+cell descendant
+cell #[code Token]
+cell Another token.
+footrow
+cell returns
+cell bool
+cell Whether this token is the ancestor of the descendant.
+h(2, "ancestors") Token.ancestors
+tag property
+tag-model("parse")
p A sequence of the token's syntactic ancestors (parents, grandparents, etc.).
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
it_ancestors = doc[1].ancestors
assert [t.text for t in it_ancestors] == [u'Give']
he_ancestors = doc[4].ancestors
assert [t.text for t in he_ancestors] == [u'pleaded']
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell
| A sequence of ancestor tokens such that
| #[code ancestor.is_ancestor(self)].
+h(2, "conjuncts") Token.conjuncts
+tag property
+tag-model("parse")
p A sequence of coordinated tokens, not including the token itself.
+aside-code("Example").
doc = nlp(u'I like apples and oranges')
apples_conjuncts = doc[2].conjuncts
assert [t.text for t in apples_conjuncts] == [u'oranges']
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A coordinated token.
+h(2, "children") Token.children
+tag property
+tag-model("parse")
p A sequence of the token's immediate syntactic children.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
give_children = doc[0].children
assert [t.text for t in give_children] == [u'it', u'back', u'!']
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A child token such that #[code child.head==self].
+h(2, "subtree") Token.subtree
+tag property
+tag-model("parse")
p A sequence containing the token and all the token's syntactic descendants.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
give_subtree = doc[0].subtree
assert [t.text for t in give_subtree] == [u'Give', u'it', u'back', u'!']
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A descendant token such that #[code self.is_ancestor(descendant)].
+h(2, "has_vector") Token.has_vector
+tag property
+tag-model("vectors")
p
| A boolean value indicating whether a word vector is associated with the
| token.
+aside-code("Example").
doc = nlp(u'I like apples')
apples = doc[2]
assert apples.has_vector
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell bool
+cell Whether the token has vector data attached.
+h(2, "vector") Token.vector
+tag property
+tag-model("vectors")
p A real-valued meaning representation.
+aside-code("Example").
doc = nlp(u'I like apples')
apples = doc[2]
assert apples.vector.dtype == 'float32'
assert apples.vector.shape == (300,)
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the token's semantics.
+h(2, "vector_norm") Span.vector_norm
+tag property
+tag-model("vectors")
p The L2 norm of the token's vector representation.
+aside-code("Example").
doc = nlp(u'I like apples and pasta')
apples = doc[2]
pasta = doc[4]
apples.vector_norm # 6.89589786529541
pasta.vector_norm # 7.759851932525635
assert apples.vector_norm != pasta.vector_norm
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell float
+cell The L2 norm of the vector representation.
+h(2, "attributes") Attributes +h(2, "attributes") Attributes
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row
+cell #[code text]
+cell unicode
+cell Verbatim text content.
+row
+cell #[code text_with_ws]
+cell unicode
+cell Text content, with trailing space character if present.
+row
+cell #[code whitespace]
+cell int
+cell Trailing space character if present.
+row
+cell #[code whitespace_]
+cell unicode
+cell Trailing space character if present.
+row +row
+cell #[code vocab] +cell #[code vocab]
+cell #[code Vocab] +cell #[code Vocab]
@ -17,14 +304,31 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
+cell #[code Doc] +cell #[code Doc]
+cell The parent document. +cell The parent document.
+row
+cell #[code head]
+cell #[code Token]
+cell The syntactic parent, or "governor", of this token.
+row
+cell #[code left_edge]
+cell #[code Token]
+cell The leftmost token of this token's syntactic descendants.
+row
+cell #[code right_edge]
+cell #[code Token]
+cell The rightmost token of this token's syntactic descendants.
+row +row
+cell #[code i] +cell #[code i]
+cell int +cell int
+cell The index of the token within the parent document. +cell The index of the token within the parent document.
+row +row
+cell #[code ent_type] +cell #[code ent_type]
+cell int +cell int
+cell Named entity type. +cell Named entity type.
+row +row
+cell #[code ent_type_] +cell #[code ent_type_]
+cell unicode +cell unicode
@ -42,19 +346,23 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
+cell unicode +cell unicode
+cell +cell
| IOB code of named entity tag. #[code "B"] | IOB code of named entity tag. #[code "B"]
| means the token begins an entity, #[code "I"] means it inside an | means the token begins an entity, #[code "I"] means it is inside
| entity, #[code "O"] means it is outside an entity, and | an entity, #[code "O"] means it is outside an entity, and
| #[code ""] means no entity tag is set. | #[code ""] means no entity tag is set.
+row +row
+cell #[code ent_id] +cell #[code ent_id]
+cell int +cell int
+cell ID of the entity the token is an instance of, if any. +cell
| ID of the entity the token is an instance of, if any. Usually
| assigned by patterns in the Matcher.
+row +row
+cell #[code ent_id_] +cell #[code ent_id_]
+cell unicode +cell unicode
+cell ID of the entity the token is an instance of, if any. +cell
| ID of the entity the token is an instance of, if any. Usually
| assigned by patterns in the Matcher.
+row +row
+cell #[code lemma] +cell #[code lemma]
@ -229,232 +537,3 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
+cell #[code lex_id] +cell #[code lex_id]
+cell int +cell int
+cell ID of the token's lexical type. +cell ID of the token's lexical type.
+row
+cell #[code text]
+cell unicode
+cell Verbatim text content.
+row
+cell #[code text_with_ws]
+cell unicode
+cell Text content, with trailing space character if present.
+row
+cell #[code whitespace]
+cell int
+cell Trailing space character if present.
+row
+cell #[code whitespace_]
+cell unicode
+cell Trailing space character if present.
+h(2, "init") Token.__init__
+tag method
p Construct a #[code Token] object.
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code doc]
+cell #[code Doc]
+cell The parent document.
+row
+cell #[code offset]
+cell int
+cell The index of the token within the document.
+footrow
+cell return
+cell #[code Token]
+cell The newly constructed object.
+h(2, "len") Token.__len__
+tag method
p Get the number of unicode characters in the token.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell int
+cell The number of unicode characters in the token.
+h(2, "check_flag") Token.check_flag
+tag method
p Check the value of a boolean flag.
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to check.
+footrow
+cell return
+cell bool
+cell Whether the flag is set.
+h(2, "nbor") Token.nbor
+tag method
p Get a neighboring token.
+table(["Name", "Type", "Description"])
+row
+cell #[code i]
+cell int
+cell The relative position of the token to get. Defaults to #[code 1].
+footrow
+cell return
+cell #[code Token]
+cell The token at position #[code self.doc[self.i+i]]
+h(2, "similarity") Token.similarity
+tag method
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
+table(["Name", "Type", "Description"])
+row
+cell other
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell return
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "is_ancestor") Token.is_ancestor
+tag method
p
| Check whether this token is a parent, grandparent, etc. of another
| in the dependency tree.
+table(["Name", "Type", "Description"])
+row
+cell descendant
+cell #[code Token]
+cell Another token.
+footrow
+cell return
+cell bool
+cell Whether this token is the ancestor of the descendant.
+h(2, "vector") Token.vector
+tag property
p A real-valued meaning representation.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the token's semantics.
+h(2, "has_vector") Token.has_vector
+tag property
p
| A boolean value indicating whether a word vector is associated with the
| object.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell bool
+cell Whether the token has a vector data attached.
+h(2, "head") Token.head
+tag property
p The syntactic parent, or "governor", of this token.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell #[code Token]
+cell The head.
+h(2, "conjuncts") Token.conjuncts
+tag property
p A sequence of coordinated tokens, including the token itself.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Token]
+cell A coordinated token.
+h(2, "children") Token.children
+tag property
p A sequence of the token's immediate syntactic children.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Token]
+cell A child token such that #[code child.head==self].
+h(2, "subtree") Token.subtree
+tag property
p A sequence of all the token's syntactic descendents.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Token]
+cell A descendant token such that #[code self.is_ancestor(descendant)].
+h(2, "left_edge") Token.left_edge
+tag property
p The leftmost token of this token's syntactic descendants.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell #[code Token]
+cell The first token such that #[code self.is_ancestor(token)].
+h(2, "right_edge") Token.right_edge
+tag property
p The rightmost token of this token's syntactic descendents.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell #[code Token]
+cell The last token such that #[code self.is_ancestor(token)].
+h(2, "ancestors") Token.ancestors
+tag property
p The rightmost token of this token's syntactic descendants.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Token]
+cell
| A sequence of ancestor tokens such that
| #[code ancestor.is_ancestor(self)].

View File

@ -6,6 +6,283 @@ p
| Segment text, and create #[code Doc] objects with the discovered segment | Segment text, and create #[code Doc] objects with the discovered segment
| boundaries. | boundaries.
+h(2, "init") Tokenizer.__init__
+tag method
p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.
+aside-code("Example").
# Construction 1
from spacy.tokenizer import Tokenizer
tokenizer = Tokenizer(nlp.vocab)
# Construction 2
from spacy.lang.en import English
tokenizer = English().Defaults.create_tokenizer(nlp)
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code rules]
+cell dict
+cell Exceptions and special-cases for the tokenizer.
+row
+cell #[code prefix_search]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).search] to match prefixes.
+row
+cell #[code suffix_search]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).search] to match suffixes.
+row
+cell #[code infix_finditer]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).finditer] to find infixes.
+row
+cell #[code token_match]
+cell callable
+cell A boolean function matching strings to be recognised as tokens.
+footrow
+cell returns
+cell #[code Tokenizer]
+cell The newly constructed object.
+h(2, "call") Tokenizer.__call__
+tag method
p Tokenize a string.
+aside-code("Example").
tokens = tokenizer(u'This is a sentence')
assert len(tokens) == 4
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to tokenize.
+footrow
+cell returns
+cell #[code Doc]
+cell A container for linguistic annotations.
+h(2, "pipe") Tokenizer.pipe
+tag method
p Tokenize a stream of texts.
+aside-code("Example").
texts = [u'One document.', u'...', u'Lots of documents']
for doc in tokenizer.pipe(texts, batch_size=50):
    pass
+table(["Name", "Type", "Description"])
+row
+cell #[code texts]
+cell -
+cell A sequence of unicode texts.
+row
+cell #[code batch_size]
+cell int
+cell The number of texts to accumulate in an internal buffer.
+row
+cell #[code n_threads]
+cell int
+cell
| The number of threads to use, if the implementation supports
| multi-threading. The default tokenizer is single-threaded.
+footrow
+cell yields
+cell #[code Doc]
+cell A sequence of Doc objects, in order.
+h(2, "find_infix") Tokenizer.find_infix
+tag method
p Find internal split points of the string.
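+aside-code("Example").
# illustrative sketch, assuming the default English infix rules,
# which split on a hyphen between two letters
matches = tokenizer.find_infix(u'hard-working')
assert [m.group() for m in matches] == [u'-']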
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to split.
+footrow
+cell returns
+cell list
+cell
| A list of #[code re.MatchObject] objects that have #[code .start()]
| and #[code .end()] methods, denoting the placement of internal
| segment separators, e.g. hyphens.
+h(2, "find_prefix") Tokenizer.find_prefix
+tag method
p
| Find the length of a prefix that should be segmented from the string, or
| #[code None] if no prefix rules match.
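+aside-code("Example").
# illustrative sketch, assuming the default English punctuation rules,
# where an opening bracket is segmented as a one-character prefix
assert tokenizer.find_prefix(u'(Hello') == 1
assert tokenizer.find_prefix(u'Hello') is None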
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to segment.
+footrow
+cell returns
+cell int / #[code None]
+cell The length of the prefix if present, otherwise #[code None].
+h(2, "find_suffix") Tokenizer.find_suffix
+tag method
p
| Find the length of a suffix that should be segmented from the string, or
| #[code None] if no suffix rules match.
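+aside-code("Example").
# illustrative sketch, assuming the default English punctuation rules,
# where a trailing exclamation mark is a one-character suffix
assert tokenizer.find_suffix(u'Hello!') == 1
assert tokenizer.find_suffix(u'Hello') is None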
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to segment.
+footrow
+cell returns
+cell int / #[code None]
+cell The length of the suffix if present, otherwise #[code None].
+h(2, "add_special_case") Tokenizer.add_special_case
+tag method
p
| Add a special-case tokenization rule. This mechanism is also used to add
| custom tokenizer exceptions to the language data. See the usage workflow
| on #[+a("/docs/usage/adding-languages#tokenizer-exceptions") adding languages]
| for more details and examples.
+aside-code("Example").
from spacy.attrs import ORTH, LEMMA
case = [{ORTH: u"do"}, {ORTH: u"n't", LEMMA: u"not"}]
tokenizer.add_special_case(u"don't", case)
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to specially tokenize.
+row
+cell #[code token_attrs]
+cell iterable
+cell
| A sequence of dicts, where each dict describes a token and its
| attributes. The #[code ORTH] fields of the attributes must
| exactly match the string when they are concatenated.
+h(2, "to_disk") Tokenizer.to_disk
+tag method
p Save the current state to a directory.
+aside-code("Example").
tokenizer.to_disk('/path/to/tokenizer')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+h(2, "from_disk") Tokenizer.from_disk
+tag method
p Loads state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
from spacy.tokenizer import Tokenizer
tokenizer = Tokenizer(nlp.vocab)
tokenizer = tokenizer.from_disk('/path/to/tokenizer')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
+footrow
+cell returns
+cell #[code Tokenizer]
+cell The modified #[code Tokenizer] object.
+h(2, "to_bytes") Tokenizer.to_bytes
+tag method
p Serialize the current state to a binary string.
+aside-code("Example").
tokenizer_bytes = tokenizer.to_bytes()
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being serialized.
+footrow
+cell returns
+cell bytes
+cell The serialized form of the #[code Tokenizer] object.
+h(2, "from_bytes") Tokenizer.from_bytes
+tag method
p Load state from a binary string.
+aside-code("Example").
from spacy.tokenizer import Tokenizer
tokenizer_bytes = tokenizer.to_bytes()
new_tokenizer = Tokenizer(nlp.vocab)
new_tokenizer.from_bytes(tokenizer_bytes)
+table(["Name", "Type", "Description"])
+row
+cell #[code bytes_data]
+cell bytes
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+footrow
+cell returns
+cell #[code Tokenizer]
+cell The #[code Tokenizer] object.
+h(2, "attributes") Attributes +h(2, "attributes") Attributes
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
@ -35,215 +312,3 @@ p
| A function to find internal segment separators, e.g. hyphens. | A function to find internal segment separators, e.g. hyphens.
| Returns a (possibly empty) list of #[code re.MatchObject] | Returns a (possibly empty) list of #[code re.MatchObject]
| objects. | objects.
+h(2, "load") Tokenizer.load
+tag classmethod
p Load a #[code Tokenizer], reading unsupplied components from the path.
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell #[code Path]
+cell The path to load from.
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code rules]
+cell dict
+cell Exceptions and special-cases for the tokenizer.
+row
+cell #[code prefix_search]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).search] to match prefixes.
+row
+cell #[code suffix_search]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).search] to match suffixes.
+row
+cell #[code infix_finditer]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).finditer] to find infixes.
+footrow
+cell return
+cell #[code Tokenizer]
+cell The newly constructed object.
+h(2, "init") Tokenizer.__init__
+tag method
p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code rules]
+cell dict
+cell Exceptions and special-cases for the tokenizer.
+row
+cell #[code prefix_search]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).search] to match prefixes.
+row
+cell #[code suffix_search]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).search] to match suffixes.
+row
+cell #[code infix_finditer]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).finditer] to find infixes.
+footrow
+cell return
+cell #[code Tokenizer]
+cell The newly constructed object.
+h(2, "call") Tokenizer.__call__
+tag method
p Tokenize a string.
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to tokenize.
+footrow
+cell return
+cell #[code Doc]
+cell A container for linguistic annotations.
+h(2, "pipe") Tokenizer.pipe
+tag method
p Tokenize a stream of texts.
+table(["Name", "Type", "Description"])
+row
+cell #[code texts]
+cell -
+cell A sequence of unicode texts.
+row
+cell #[code batch_size]
+cell int
+cell The number of texts to accumulate in an internal buffer.
+row
+cell #[code n_threads]
+cell int
+cell
| The number of threads to use, if the implementation supports
| multi-threading. The default tokenizer is single-threaded.
+footrow
+cell yield
+cell #[code Doc]
+cell A sequence of Doc objects, in order.
+h(2, "find_infix") Tokenizer.find_infix
+tag method
p Find internal split points of the string.
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to split.
+footrow
+cell return
+cell #[code List[re.MatchObject]]
+cell
| A list of objects that have #[code .start()] and #[code .end()]
| methods, denoting the placement of internal segment separators,
| e.g. hyphens.
+h(2, "find_prefix") Tokenizer.find_prefix
+tag method
p
| Find the length of a prefix that should be segmented from the string, or
| #[code None] if no prefix rules match.
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to segment.
+footrow
+cell return
+cell int / #[code None]
+cell The length of the prefix if present, otherwise #[code None].
+h(2, "find_suffix") Tokenizer.find_suffix
+tag method
p
| Find the length of a suffix that should be segmented from the string, or
| #[code None] if no suffix rules match.
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to segment.
+footrow
+cell return
+cell int / #[code None]
+cell The length of the suffix if present, otherwise #[code None].
+h(2, "add_special_case") Tokenizer.add_special_case
+tag method
p Add a special-case tokenization rule.
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to specially tokenize.
+row
+cell #[code token_attrs]
+cell -
+cell
| A sequence of dicts, where each dict describes a token and its
| attributes. The #[code ORTH] fields of the attributes must
| exactly match the string when they are concatenated.
+footrow
+cell return
+cell #[code None]
+cell -

View File

@ -14,7 +14,7 @@ p
| recommend having additional tests in place if your application depends on | recommend having additional tests in place if your application depends on
| any of spaCy's utilities. | any of spaCy's utilities.
+h(2, "get_data_path") get_data_path +h(2, "get_data_path") util.get_data_path
+tag function +tag function
p p
@ -28,11 +28,11 @@ p
+cell Only return path if it exists, otherwise return #[code None]. +cell Only return path if it exists, otherwise return #[code None].
+footrow +footrow
+cell return +cell returns
+cell #[code Path] / #[code None] +cell #[code Path] / #[code None]
+cell Data path or #[code None]. +cell Data path or #[code None].
+h(2, "set_data_path") set_data_path +h(2, "set_data_path") util.set_data_path
+tag function +tag function
p p
@ -49,7 +49,7 @@ p
+cell unicode or #[code Path] +cell unicode or #[code Path]
+cell Path to new data directory. +cell Path to new data directory.
+h(2, "get_lang_class") get_lang_class +h(2, "get_lang_class") util.get_lang_class
+tag function +tag function
p p
@ -70,11 +70,11 @@ p
+cell Two-letter language code, e.g. #[code 'en']. +cell Two-letter language code, e.g. #[code 'en'].
+footrow +footrow
+cell return +cell returns
+cell #[code Language] +cell #[code Language]
+cell Language class. +cell Language class.
+h(2, "resolve_model_path") resolve_model_path +h(2, "resolve_model_path") util.resolve_model_path
+tag function +tag function
p Resolve a model name or string to a model path. p Resolve a model name or string to a model path.
@ -90,11 +90,11 @@ p Resolve a model name or string to a model path.
+cell Package name, shortcut link or model path. +cell Package name, shortcut link or model path.
+footrow +footrow
+cell return +cell returns
+cell #[code Path] +cell #[code Path]
+cell Path to model data directory. +cell Path to model data directory.
+h(2, "is_package") is_package +h(2, "is_package") util.is_package
+tag function +tag function
p p
@ -112,11 +112,11 @@ p
+cell Name of package. +cell Name of package.
+footrow +footrow
+cell return +cell returns
+cell #[code bool] +cell #[code bool]
+cell #[code True] if installed package, #[code False] if not. +cell #[code True] if installed package, #[code False] if not.
+h(2, "get_model_package_path") get_model_package_path +h(2, "get_model_package_path") util.get_model_package_path
+tag function +tag function
p p
@ -134,11 +134,11 @@ p
+cell Name of installed package. +cell Name of installed package.
+footrow +footrow
+cell return +cell returns
+cell #[code Path] +cell #[code Path]
+cell Path to model data directory. +cell Path to model data directory.
+h(2, "parse_package_meta") parse_package_meta +h(2, "parse_package_meta") util.parse_package_meta
+tag function +tag function
p p
@ -163,11 +163,31 @@ p
+cell If #[code True], raise error if no #[code meta.json] is found. +cell If #[code True], raise error if no #[code meta.json] is found.
+footrow +footrow
+cell return +cell returns
+cell dict / #[code None] +cell dict / #[code None]
+cell Model meta data or #[code None]. +cell Model meta data or #[code None].
+h(2, "update_exc") update_exc +h(2, "is_in_jupyter") util.is_in_jupyter
+tag function
p
| Check if user is running spaCy from a #[+a("https://jupyter.org") Jupyter]
| notebook by detecting the IPython kernel. Mainly used for the
| #[+api("displacy") #[code displacy]] visualizer.
+aside-code("Example").
html = '&lt;h1&gt;Hello world!&lt;/h1&gt;'
if util.is_in_jupyter():
    from IPython.core.display import display, HTML
    display(HTML(html))
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell bool
+cell #[code True] if in Jupyter, #[code False] if not.
+h(2, "update_exc") util.update_exc
+tag function +tag function
p p
@ -194,12 +214,12 @@ p
+cell Exception dictionaries to add to the base exceptions, in order. +cell Exception dictionaries to add to the base exceptions, in order.
+footrow +footrow
+cell return +cell returns
+cell dict +cell dict
+cell Combined tokenizer exceptions. +cell Combined tokenizer exceptions.
+h(2, "prints") prints +h(2, "prints") util.prints
+tag function +tag function
p p

View File

@ -7,59 +7,6 @@ p
| #[code Vocab] instance also provides access to the #[code StringStore], | #[code Vocab] instance also provides access to the #[code StringStore],
| and owns underlying C-data that is shared between #[code Doc] objects. | and owns underlying C-data that is shared between #[code Doc] objects.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code strings]
+cell #[code StringStore]
+cell A table managing the string-to-int mapping.
+row
+cell #[code vectors_length]
+cell int
+cell The dimensionality of the word vectors, if present.
+h(2, "load") Vocab.load
+tag classmethod
p Load the vocabulary from a path.
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell #[code Path]
+cell The path to load from.
+row
+cell #[code lex_attr_getters]
+cell dict
+cell
| A dictionary mapping attribute IDs to functions to compute them.
| Defaults to #[code None].
+row
+cell #[code lemmatizer]
+cell -
+cell A lemmatizer. Defaults to #[code None].
+row
+cell #[code tag_map]
+cell dict
+cell
| A dictionary mapping fine-grained tags to coarse-grained
| parts-of-speech, and optionally morphological attributes.
+row
+cell #[code oov_prob]
+cell float
+cell The default probability for out-of-vocabulary words.
+footrow
+cell return
+cell #[code Vocab]
+cell The newly constructed object.
+h(2, "init") Vocab.__init__ +h(2, "init") Vocab.__init__
+tag method +tag method
@ -73,11 +20,6 @@ p Create the vocabulary.
| A dictionary mapping attribute IDs to functions to compute them. | A dictionary mapping attribute IDs to functions to compute them.
| Defaults to #[code None]. | Defaults to #[code None].
+row
+cell #[code lemmatizer]
+cell -
+cell A lemmatizer. Defaults to #[code None].
+row +row
+cell #[code tag_map] +cell #[code tag_map]
+cell dict +cell dict
@ -86,23 +28,34 @@ p Create the vocabulary.
| parts-of-speech, and optionally morphological attributes. | parts-of-speech, and optionally morphological attributes.
+row +row
+cell #[code oov_prob] +cell #[code lemmatizer]
+cell float +cell object
+cell The default probability for out-of-vocabulary words. +cell A lemmatizer. Defaults to #[code None].
+row
+cell #[code strings]
+cell #[code StringStore]
+cell
| A #[code StringStore] that maps strings to integers, and vice
| versa.
+footrow +footrow
+cell return +cell returns
+cell #[code Vocab] +cell #[code Vocab]
+cell The newly constructed object. +cell The newly constructed object.
+h(2, "len") Vocab.__len__ +h(2, "len") Vocab.__len__
+tag method +tag method
p Get the number of lexemes in the vocabulary. p Get the current number of lexemes in the vocabulary.
+aside-code("Example").
doc = nlp(u'This is a sentence.')
assert len(nlp.vocab) > 0
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+footrow +footrow
+cell return +cell returns
+cell int +cell int
+cell The number of lexemes in the vocabulary. +cell The number of lexemes in the vocabulary.
@ -113,6 +66,10 @@ p
| Retrieve a lexeme, given an int ID or a unicode string. If a previously | Retrieve a lexeme, given an int ID or a unicode string. If a previously
| unseen unicode string is given, a new lexeme is created and stored. | unseen unicode string is given, a new lexeme is created and stored.
+aside-code("Example").
apple = nlp.vocab.strings['apple']
assert nlp.vocab[apple] == nlp.vocab[u'apple']
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code id_or_string] +cell #[code id_or_string]
@ -120,25 +77,37 @@ p
+cell The integer ID of a word, or its unicode string. +cell The integer ID of a word, or its unicode string.
+footrow +footrow
+cell return +cell returns
+cell #[code Lexeme] +cell #[code Lexeme]
+cell The lexeme indicated by the given ID. +cell The lexeme indicated by the given ID.
+h(2, "iter") Span.__iter__ +h(2, "iter") Vocab.__iter__
+tag method +tag method
p Iterate over the lexemes in the vocabulary. p Iterate over the lexemes in the vocabulary.
+aside-code("Example").
stop_words = (lex for lex in nlp.vocab if lex.is_stop)
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+footrow +footrow
+cell yield +cell yields
+cell #[code Lexeme] +cell #[code Lexeme]
+cell An entry in the vocabulary. +cell An entry in the vocabulary.
+h(2, "contains") Vocab.__contains__ +h(2, "contains") Vocab.__contains__
+tag method +tag method
p Check whether the string has an entry in the vocabulary. p
| Check whether the string has an entry in the vocabulary. To get the ID
| for a given string, you need to look it up in
| #[+api("vocab#attributes") #[code vocab.strings]].
+aside-code("Example").
apple = nlp.vocab.strings['apple']
oov = nlp.vocab.strings['dskfodkfos']
assert apple in nlp.vocab
assert oov not in nlp.vocab
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
@ -147,32 +116,27 @@ p Check whether the string has an entry in the vocabulary.
+cell The ID string. +cell The ID string.
+footrow +footrow
+cell return +cell returns
+cell bool +cell bool
+cell Whether the string has an entry in the vocabulary. +cell Whether the string has an entry in the vocabulary.
+h(2, "resize_vectors") Vocab.resize_vectors
+tag method
p
| Set #[code vectors_length] to a new size, and allocate more memory for
| the #[code Lexeme] vectors if necessary. The memory will be zeroed.
+table(["Name", "Type", "Description"])
+row
+cell #[code new_size]
+cell int
+cell The new size of the vectors.
+footrow
+cell return
+cell #[code None]
+cell -
+h(2, "add_flag") Vocab.add_flag +h(2, "add_flag") Vocab.add_flag
+tag method +tag method
p Set a new boolean flag to words in the vocabulary. p
| Set a new boolean flag to words in the vocabulary. The #[code flag_getter]
| function will be called over the words currently in the vocab, and then
| applied to new words as they occur. You'll then be able to access the flag
| value on each token, using #[code token.check_flag(flag_id)].
+aside-code("Example").
def is_my_product(text):
    products = [u'spaCy', u'Thinc', u'displaCy']
    return text in products
MY_PRODUCT = nlp.vocab.add_flag(is_my_product)
doc = nlp(u'I like spaCy')
assert doc[2].check_flag(MY_PRODUCT) == True
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
@ -189,90 +153,104 @@ p Set a new boolean flag to words in the vocabulary.
| available bit will be chosen. | available bit will be chosen.
+footrow +footrow
+cell return +cell returns
+cell int +cell int
+cell The integer ID by which the flag value can be checked. +cell The integer ID by which the flag value can be checked.
+h(2, "dump") Vocab.dump +h(2, "to_disk") Vocab.to_disk
+tag method +tag method
p Save the lexemes binary data to the given location. p Save the current state to a directory.
+aside-code("Example").
nlp.vocab.to_disk('/path/to/vocab')
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code loc] +cell #[code path]
+cell #[code Path] +cell unicode or #[code Path]
+cell The path to load from.
+footrow
+cell return
+cell #[code None]
+cell -
+h(2, "load_lexemes") Vocab.load_lexemes
+tag method
p
+table(["Name", "Type", "Description"])
+row
+cell #[code loc]
+cell unicode
+cell Path to load the lexemes.bin file from.
+footrow
+cell return
+cell #[code None]
+cell -
+h(2, "dump_vectors") Vocab.dump_vectors
+tag method
p Save the word vectors to a binary file.
+table(["Name", "Type", "Description"])
+row
+cell #[code loc]
+cell #[code Path]
+cell The path to save to.
+footrow
+cell return
+cell #[code None]
+cell -
+h(2, "load_vectors") Vocab.load_vectors
+tag method
p Load vectors from a text-based file.
+table(["Name", "Type", "Description"])
+row
+cell #[code file_]
+cell buffer
+cell +cell
| The file to read from. Entries should be separated by newlines, | A path to a directory, which will be created if it doesn't exist.
| and each entry should be whitespace delimited. The first value | Paths may be either strings or #[code Path]-like objects.
| of the entry should be the word string, and subsequent entries
| should be the values of the vector.
+footrow +h(2, "from_disk") Vocab.from_disk
+cell return
+cell int
+cell The length of the vectors loaded.
+h(2, "load_vectors_from_bin_loc") Vocab.load_vectors_from_bin_loc
+tag method +tag method
p Load vectors from the location of a binary file. p Loads state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
from spacy.vocab import Vocab
vocab = Vocab().from_disk('/path/to/vocab')
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code loc] +cell #[code path]
+cell unicode +cell unicode or #[code Path]
+cell The path of the binary file to load from. +cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
+footrow +footrow
+cell return +cell returns
+cell int +cell #[code Vocab]
+cell The length of the vectors loaded. +cell The modified #[code Vocab] object.
+h(2, "to_bytes") Vocab.to_bytes
+tag method
p Serialize the current state to a binary string.
+aside-code("Example").
vocab_bytes = nlp.vocab.to_bytes()
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being serialized.
+footrow
+cell returns
+cell bytes
+cell The serialized form of the #[code Vocab] object.
+h(2, "from_bytes") Vocab.from_bytes
+tag method
p Load state from a binary string.
+aside-code("Example").
from spacy.vocab import Vocab
vocab_bytes = nlp.vocab.to_bytes()
vocab = Vocab()
vocab.from_bytes(vocab_bytes)
+table(["Name", "Type", "Description"])
+row
+cell #[code bytes_data]
+cell bytes
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+footrow
+cell returns
+cell #[code Vocab]
+cell The #[code Vocab] object.
+h(2, "attributes") Attributes
+aside-code("Example").
apple_id = nlp.vocab.strings['apple']
assert type(apple_id) == int
PERSON = nlp.vocab.strings['PERSON']
assert type(PERSON) == int
+table(["Name", "Type", "Description"])
+row
+cell #[code strings]
+cell #[code StringStore]
+cell A table managing the string-to-int mapping.

View File

@ -56,20 +56,22 @@ p
from ...attrs import LANG from ...attrs import LANG
from ...util import update_exc from ...util import update_exc
# create Defaults class in the module scope (necessary for pickling!)
class XxxxxDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'xx' # language ISO code
    # optional: replace flags with custom functions, e.g. like_num()
    lex_attr_getters.update(LEX_ATTRS)
    # merge base exceptions and custom tokenizer exceptions
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = set(STOP_WORDS)
# create actual Language class
class Xxxxx(Language): class Xxxxx(Language):
lang = 'xx' # language ISO code lang = 'xx' # language ISO code
Defaults = XxxxxDefaults # override defaults
# override defaults
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'xx' # language ISO code
# optional: replace flags with custom functions, e.g. like_num()
lex_attr_getters.update(LEX_ATTRS)
# merge base exceptions and custom tokenizer exceptions
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
# set default export - this allows the language class to be lazy-loaded # set default export - this allows the language class to be lazy-loaded
__all__ = ['Xxxxx'] __all__ = ['Xxxxx']

View File

@ -141,11 +141,11 @@ p
include ../api/_annotation/_named-entities include ../api/_annotation/_named-entities
+aside("Install") +aside("Install")
| The #[+api("load") spacy.load()] function configures a pipeline that | The #[+api("load") #[code spacy.load()]] function configures a pipeline that
| includes all of the available annotators for the given ID. In the example | includes all of the available annotators for the given ID. In the example
| above, the #[code 'en'] ID tells spaCy to load the default English | above, the #[code 'en'] ID tells spaCy to load the default English
| pipeline. If you have installed the data with | pipeline. If you have installed the data with
| #[code python -m spacy.en.download] this will include the entity | #[code python -m spacy download en], this will include the entity
| recognition model. | recognition model.
+h(2, "updating") Training and updating +h(2, "updating") Training and updating

View File

@ -4,58 +4,190 @@ include ../../_includes/_mixins
p p
| spaCy features a rule-matching engine that operates over tokens, similar | spaCy features a rule-matching engine that operates over tokens, similar
| to regular expressions. The rules can refer to token annotations and | to regular expressions. The rules can refer to token annotations (e.g.
| flags, and matches support callbacks to accept, modify and/or act on the | the token #[code text] or #[code tag_], and flags (e.g. #[code IS_PUNCT]).
| match. The rule matcher also allows you to associate patterns with | The rule matcher also lets you pass in a custom callback
| entity IDs, to allow some basic entity linking or disambiguation. | to act on matches for example, to merge entities and apply custom labels.
| You can also associate patterns with entity IDs, to allow some basic
| entity linking or disambiguation.
p Here's a minimal example. We first add a pattern that specifies three tokens: +aside("What about \"real\" regular expressions?")
+list("numbers") +h(2, "adding-patterns") Adding patterns
+item A token whose lower-case form matches "hello"
+item A token whose #[code is_punct] flag is set to #[code True]
+item A token whose lower-case form matches "world"
p p
| Once we've added the pattern, we can use the #[code matcher] as a | Let's say we want to enable spaCy to find a combination of three tokens:
| callable, to receive a list of #[code (ent_id, start, end)] tuples.
| Note that #[code LOWER] and #[code IS_PUNCT] are data attributes +list("numbers")
| of #[code spacy.attrs]. +item
| A token whose #[strong lower-case form matches "hello"], e.g. "Hello"
| or "HELLO".
+item
| A token whose #[strong #[code is_punct] flag is set to #[code True]],
| i.e. any punctuation.
+item
| A token whose #[strong lower-case form matches "world"], e.g. "World"
| or "WORLD".
+code. +code.
from spacy.matcher import Matcher [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}]
matcher = Matcher(nlp.vocab)
matcher.add_pattern("HelloWorld", [{LOWER: "hello"}, {IS_PUNCT: True}, {LOWER: "world"}])
doc = nlp(u'Hello, world!') p
| First, we initialise the #[code Matcher] with a vocab. The matcher must
| always share the same vocab with the documents it will operate on. We
| can now call #[+api("matcher#add") #[code matcher.add()]] with an ID and
| our custom pattern. The second argument lets you pass in an optional
| callback function to invoke on a successful match. For now, we set it
| to #[code None].
+code.
import spacy
from spacy.matcher import Matcher
from spacy.attrs import LOWER, IS_PUNCT # don't forget to import the attrs!
nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)
# add match ID "HelloWorld" with no callback and one pattern
matcher.add('HelloWorld', None,
    [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}])
doc = nlp(u'Hello, world! Hello world!')
matches = matcher(doc) matches = matcher(doc)
p p
| The returned matches include the ID, to let you associate the matches | The matcher returns a list of #[code (match_id, start, end)] tuples, in
| with the patterns. You can also group multiple patterns together, which | this case #[code [('HelloWorld', 0, 3)]], which maps to the span
| is useful when you have a knowledge base of entities you want to match, | #[code doc[0:3]] of our original document. Optionally, we could also
| and you want to write multiple patterns for each entity. | choose to add more than one pattern, for example to also match sequences
| without punctuation between "hello" and "world":
+h(2, "entities-patterns") Entities and patterns
+code. +code.
matcher.add_entity( matcher.add('HelloWorld', None,
"GoogleNow", # Entity ID -- Helps you act on the match. [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
{"ent_type": "PRODUCT", "wiki_en": "Google_Now"}, # Arbitrary attributes (optional) [{LOWER: 'hello'}, {LOWER: 'world'}])
)
p
| By default, the matcher will only return the matches and
| #[strong not do anything else], like merge entities or assign labels.
| This is all up to you and can be defined individually for each pattern,
| by passing in a callback function as the #[code on_match] argument on
| #[code add()]. This is useful, because it lets you write entirely custom
| and #[strong pattern-specific logic]. For example, you might want to
| merge #[em some] patterns into one token, while adding entity labels for
| other pattern types. You shouldn't have to create different matchers for
| each of those processes.
+h(2, "on_match") Adding #[code on_match] rules
p
| To move on to a more realistic example, let's say you're working with a
| large corpus of blog articles, and you want to match all mentions of
| "Google I/O" (which spaCy tokenizes as #[code ['Google', 'I', '/', 'O']]).
| To be safe, you match on the tokens' uppercase forms, so the pattern will
| still work if someone has written it as "Google i/o". You also add a
| second pattern with an added #[code {IS_DIGIT: True}] token, which will
| make sure you also match on "Google I/O 2017". If your pattern matches,
| spaCy should execute your custom callback function #[code add_event_ent].
+code.
import spacy
from spacy.matcher import Matcher
from spacy.attrs import ORTH, UPPER, LOWER, IS_DIGIT
nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)
# Get the ID of the 'EVENT' entity type. This is required to set an entity.
EVENT = nlp.vocab.strings['EVENT']
def add_event_ent(matcher, doc, i, matches):
    # Get the current match and create a tuple of entity label, start and end.
    # Append the entity to the doc's entities. (Don't overwrite doc.ents!)
    match_id, start, end = matches[i]
    doc.ents += ((EVENT, start, end),)
matcher.add('GoogleIO', add_event_ent,
            [{ORTH: 'Google'}, {UPPER: 'I'}, {ORTH: '/'}, {UPPER: 'O'}],
            [{ORTH: 'Google'}, {UPPER: 'I'}, {ORTH: '/'}, {UPPER: 'O'}, {IS_DIGIT: True}])
p
| In addition to mentions of "Google I/O", your data also contains some
| annoying pre-processing artefacts, like leftover HTML line breaks
| (e.g. #[code &lt;br&gt;] or #[code &lt;BR/&gt;]). While you're at it,
| you want to merge those into one token and flag them, to make sure you
| can easily ignore them later. So you add a second pattern and pass in a
| function #[code merge_and_flag]:
+code.
# Add a new custom flag to the vocab, which is always False by default.
# BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span.
BAD_HTML_FLAG = nlp.vocab.add_flag(lambda text: False)
def merge_and_flag(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    span = doc[start : end]
    span.merge(is_stop=True) # merge (and mark it as a stop word, just in case)
    span.set_flag(BAD_HTML_FLAG, True) # set BAD_HTML_FLAG
matcher.add('BAD_HTML', merge_and_flag,
            [{ORTH: '&lt;'}, {LOWER: 'br'}, {ORTH: '&gt;'}],
            [{ORTH: '&lt;'}, {LOWER: 'br/'}, {ORTH: '&gt;'}])
+aside("Tip: Visualizing matches")
| When working with entities, you can use #[+api("displacy") displaCy]
| to quickly generate a NER visualization from your updated #[code Doc],
| which can be exported as an HTML file:
+code.o-no-block.
from spacy import displacy
html = displacy.render(doc, style='ent', page=True,
options={'ents': ['EVENT']})
| For more info and examples, see the usage workflow on
| #[+a("/docs/usage/visualizers") visualizing spaCy].
p
| We can now call the matcher on our documents. The patterns will be
| matched in the order they occur in the text.
+code.
doc = nlp(LOTS_OF_TEXT)
matcher(doc)
+h(3, "on_match-callback") The callback function
p
| The matcher will first collect all matches over the document. It will
| then iterate over the matches, look up the callback for the match ID
| that was matched, and invoke it. When the callback is invoked, it is
| passed four arguments: the matcher itself, the document, the position of
| the current match, and the total list of matches. This allows you to
| write callbacks that consider the entire set of matched phrases, so that
| you can resolve overlaps and other conflicts in whatever way you prefer.
+table(["Argument", "Type", "Description"])
+row
+cell #[code matcher]
+cell #[code Matcher]
+cell The matcher instance.
+row
+cell #[code doc]
+cell #[code Doc]
+cell The document the matcher was used on.
+row
+cell #[code i]
+cell int
+cell Index of the current match (#[code matches[i]]).
+row
+cell #[code matches]
+cell list
+cell
| A list of #[code (match_id, start, end)] tuples, describing the
| matches. A match tuple describes a span #[code doc[start:end]].
| The #[code match_id] is the ID of the added match pattern.
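p
| For example, a callback along these lines (a sketch, not part of spaCy
| itself) could keep only the longest of several overlapping matches:
+code.
def keep_longest(matcher, doc, i, matches):
    # only act on the last match, so the full list of matches is available
    if i != len(matches) - 1:
        return
    seen_tokens = set()
    # sort by span length, longest first, and skip overlapping spans
    for match_id, start, end in sorted(matches, key=lambda m: m[2] - m[1], reverse=True):
        if seen_tokens.isdisjoint(range(start, end)):
            seen_tokens.update(range(start, end))
            print(doc[start:end].text)   # keep this match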
+h(2, "quantifiers") Using quantifiers +h(2, "quantifiers") Using quantifiers
@ -82,78 +214,4 @@ p
p
| There are no nested or scoped quantifiers. You can build those
| behaviours with #[code on_match] callbacks.
+h(2, "acceptor-functions") Acceptor functions
p
| The #[code acceptor] keyword of #[code matcher.add_entity()] allows you to
| pass a function to reject or modify matches. The function you pass should
| take five arguments: #[code doc], #[code ent_id], #[code label], #[code start],
| and #[code end]. You can return a falsey value to reject the match, or
| return a 4-tuple #[code (ent_id, label, start, end)].
+code.
from spacy.tokens.doc import Doc
def trim_title(doc, ent_id, label, start, end):
if doc[start].check_flag(IS_TITLE_TERM):
return (ent_id, label, start+1, end)
else:
return (ent_id, label, start, end)
titles = set(title.lower() for title in [u'Mr.', 'Dr.', 'Ms.', u'Admiral'])
IS_TITLE_TERM = matcher.vocab.add_flag(lambda string: string.lower() in titles)
matcher.add_entity('PersonName', acceptor=trim_title)
matcher.add_pattern('PersonName', [{LOWER: 'mr.'}, {LOWER: 'cruise'}])
matcher.add_pattern('PersonName', [{LOWER: 'dr.'}, {LOWER: 'seuss'}])
doc = Doc(matcher.vocab, words=[u'Mr.', u'Cruise', u'likes', 'Dr.', u'Seuss'])
for ent_id, label, start, end in matcher(doc):
print(doc[start:end].text)
# Cruise
# Seuss
p
| Passing an #[code acceptor] function allows you to match patterns with
| arbitrary logic that can't easily be expressed by a finite-state machine.
| You can look at the entirety of the
| matched phrase, and its context in the document, and decide to move
| the boundaries or reject the match entirely.
+h(2, "callback-functions") Callback functions
p
| In spaCy &lt;1.0, the #[code Matcher] automatically tagged matched phrases
| with entity types. Since spaCy 1.0, the matcher no longer acts on matches
| automatically. By default, the match list is returned for the user to act on.
| However, it's often more convenient to register the required actions as a
| callback. You can do this by passing a function to the #[code on_match]
| keyword argument of #[code matcher.add_entity].
+aside-code("Example").
def merge_phrases(matcher, doc, i, matches):
'''
Merge a phrase. We have to be careful here because we'll change the token indices.
To avoid problems, merge all the phrases once we're called on the last match.
'''
if i != len(matches)-1:
return None
# Get Span objects
spans = [(ent_id, label, doc[start : end]) for ent_id, label, start, end in matches]
for ent_id, label, span in spans:
span.merge(label=label, tag='NNP' if label else span.root.tag_)
matcher.add_entity('GoogleNow', on_match=merge_phrases)
matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
doc = Doc(matcher.vocab, words=[u'Google', u'Now', u'is', u'being', u'rebranded'])
matcher(doc)
print([w.text for w in doc])
# [u'Google Now', u'is', u'being', u'rebranded']
p
| The matcher will first collect all matches over the document. It will
| then iterate over the matches, look up the callback for the entity ID
| that was matched, and invoke it. When the callback is invoked, it is
| passed four arguments: the matcher itself, the document, the position of
| the current match, and the total list of matches. This allows you to
| write callbacks that consider the entire set of matched phrases, so that
| you can resolve overlaps and other conflicts in whatever way you prefer.

View File

@ -2,9 +2,218 @@
include ../../_includes/_mixins
p
| We also rewrote a large part of the documentation and usage workflows,
| and added more examples.
+h(2, "features") New features +h(2, "features") New features
+h(3, "features-displacy") displaCy visualizer with Jupyter support
+aside-code("Example").
from spacy import displacy
doc = nlp(u'This is a sentence about Facebook.')
displacy.serve(doc, style='dep') # run the web server
html = displacy.render(doc, style='ent') # generate HTML
p
| Our popular dependency and named entity visualizers are now an official
| part of the spaCy library! displaCy can run a simple web server, or
| generate raw HTML markup or SVG files to be exported. You can pass in one
| or more docs, and customise the style. displaCy also auto-detects whether
| you're running #[+a("https://jupyter.org") Jupyter] and will render the
| visualizations in your notebook.
+infobox
| #[strong API:] #[+api("displacy") #[code displacy]]
| #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizing spaCy]
+h(3, "features-loading") Loading
+aside-code("Example").
nlp = spacy.load('en') # shortcut link
nlp = spacy.load('en_core_web_sm') # package
nlp = spacy.load('/path/to/en') # unicode path
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
p
| The improved #[code spacy.load] makes loading models easier and more
| transparent. You can load a model by supplying its
| #[+a("/docs/usage/models#usage") shortcut link], the name of an installed
| #[+a("/docs/usage/saving-loading#generating") model package], a unicode
| path or a #[code Path]-like object. spaCy will try resolving the load
| argument in this order. The #[code path] keyword argument is now deprecated.
p
| The #[code Language] class to initialise will be determined based on the
| model's settings. If no model is found, spaCy will let you know and won't
| just return an empty #[code Language] object anymore. If you want a blank
| language, you can always import the class directly, e.g.
| #[code from spacy.lang.en import English].
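p
| For example (a minimal sketch, no model data required):
+code.
from spacy.lang.en import English   # import the language class directly
nlp = English()                     # blank English pipeline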
+infobox
| #[strong API:] #[+api("spacy#load") #[code spacy.load]]
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
+h(3, "features-language") Improved language data and processing pipelines
+aside-code("Example").
from spacy.language import Language
nlp = Language(pipeline=['token_vectors', 'tags',
'dependencies'])
+infobox
| #[strong API:] #[+api("language") #[code Language]]
| #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages]
+h(3, "features-lemmatizer") Simple lookup-based lemmatization
+aside-code("Example").
LOOKUP = {
"aba": "abar",
"ababa": "abar",
"ababais": "abar",
"ababan": "abar",
"ababanes": "ababán"
}
p
| spaCy now supports simple lookup-based lemmatization. The data is stored
| in a dictionary mapping a string to its lemma. To determine a token's
| lemma, spaCy simply looks it up in the table. The lookup lemmatizer can
| be imported from #[code spacy.lemmatizerlookup]. It's initialised with
| the lookup table, and should be returned by the #[code create_lemmatizer]
| classmethod of the language's defaults.
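p
| As a rough sketch (the class and method names below follow the
| description above, but check the API docs for the exact signatures), a
| language's defaults could hook up a lookup table like this:
+code.
from spacy.lemmatizerlookup import Lemmatizer
from spacy.lang.es import Spanish
LOOKUP = {"aba": "abar", "ababa": "abar"}
class CustomSpanishDefaults(Spanish.Defaults):   # hypothetical defaults subclass
    @classmethod
    def create_lemmatizer(cls, nlp=None):
        return Lemmatizer(LOOKUP)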
+infobox
| #[strong API:] #[+api("language") #[code Language]]
| #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages]
+h(3, "features-matcher") Revised matcher API
+aside-code("Example").
from spacy.matcher import Matcher
from spacy.attrs import LOWER, IS_PUNCT
matcher = Matcher(nlp.vocab)
matcher.add('HelloWorld', None,
[{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
[{LOWER: 'hello'}, {LOWER: 'world'}])
assert len(matcher) == 1
assert 'HelloWorld' in matcher
p
| Patterns can now be added to the matcher by calling
| #[+api("matcher-add") #[code matcher.add()]] with a match ID, an optional
| callback function to be invoked on each match, and one or more patterns.
| This allows you to write powerful, pattern-specific logic using only one
| matcher. For example, you might only want to merge some entity types,
| and set custom flags for other matched patterns.
+infobox
| #[strong API:] #[+api("matcher") #[code Matcher]]
| #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching]
+h(3, "features-serializer") Serialization
+infobox
| #[strong API:] #[+api("serializer") #[code Serializer]]
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
+h(3, "features-models") Neural network models for English, German, French and Spanish
+infobox
| #[strong Details:] #[+src(gh("spacy-models")) spacy-models]
| #[strong Usage:] #[+a("/docs/usage/models") Models]
+h(2, "incompat") Backwards incompatibilities +h(2, "incompat") Backwards incompatibilities
+table(["Old", "New"])
+row
+cell #[code Language.save_to_directory]
+cell #[+api("language#to_disk") #[code Language.to_disk]]
+row
+cell #[code Tokenizer.load]
+cell
| #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]]
| #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]]
+row
+cell #[code Tagger.load]
+cell
| #[+api("tagger#from_disk") #[code Tagger.from_disk]]
| #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
+row
+cell #[code DependencyParser.load]
+cell
| #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
| #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
+row
+cell #[code EntityRecognizer.load]
+cell
| #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
| #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
+row
+cell
| #[code Vocab.load]
| #[code Vocab.load_lexemes]
| #[code Vocab.load_vectors]
| #[code Vocab.load_vectors_from_bin_loc]
+cell
| #[+api("vocab#from_disk") #[code Vocab.from_disk]]
| #[+api("vocab#from_bytes") #[code Vocab.from_bytes]]
+row
+cell
| #[code Vocab.dump]
| #[code Vocab.dump_vectors]
+cell
| #[+api("vocab#to_disk") #[code Vocab.to_disk]]
| #[+api("vocab#to_bytes") #[code Vocab.to_bytes]]
+row
+cell
| #[code StringStore.load]
+cell
| #[+api("stringstore#from_disk") #[code StringStore.from_disk]]
| #[+api("stringstore#from_bytes") #[code StringStore.from_bytes]]
+row
+cell
| #[code StringStore.dump]
+cell
| #[+api("stringstore#to_disk") #[code StringStore.to_disk]]
| #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]]
+row
+cell #[code Matcher.load]
+cell -
+row
+cell
| #[code Matcher.add_pattern]
| #[code Matcher.add_entity]
+cell #[+api("matcher#add") #[code Matcher.add]]
+row
+cell #[code Matcher.get_entity]
+cell #[+api("matcher#get") #[code Matcher.get]]
+row
+cell #[code Matcher.has_entity]
+cell #[+api("matcher#contains") #[code Matcher.__contains__]]
+row
+cell #[code Doc.read_bytes]
+cell
+row
+cell #[code Token.is_ancestor_of]
+cell #[+api("token#is_ancestor") #[code Token.is_ancestor]]
+h(2, "migrating") Migrating from spaCy 1.x +h(2, "migrating") Migrating from spaCy 1.x