Merge branch 'develop' of https://github.com/explosion/spaCy into develop
@@ -173,13 +173,13 @@ class Language(object):
             flat_list.append(pipe)
         self.pipeline = flat_list

-    def __call__(self, text, **disabled):
+    def __call__(self, text, disable=[]):
         """Apply the pipeline to some text. The text can span multiple sentences,
         and can contain arbitrary whitespace. Alignment into the original string
         is preserved.

         text (unicode): The text to be processed.
-        **disabled: Elements of the pipeline that should not be run.
+        disable (list): Names of the pipeline components to disable.
         RETURNS (Doc): A container for accessing the annotations.

         EXAMPLE:
@@ -190,7 +190,7 @@ class Language(object):
         doc = self.make_doc(text)
         for proc in self.pipeline:
             name = getattr(proc, 'name', None)
-            if name in disabled and not disabled[name]:
+            if name in disable:
                 continue
             proc(doc)
         return doc
@@ -322,7 +322,7 @@ class Language(object):
         except StopIteration:
             pass

-    def pipe(self, texts, n_threads=2, batch_size=1000, **disabled):
+    def pipe(self, texts, n_threads=2, batch_size=1000, disable=[]):
         """Process texts as a stream, and yield `Doc` objects in order. Supports
         GIL-free multi-threading.

@@ -330,7 +330,7 @@ class Language(object):
         n_threads (int): The number of worker threads to use. If -1, OpenMP will
             decide how many to use at run time. Default is 2.
         batch_size (int): The number of texts to buffer.
-        **disabled: Pipeline components to exclude.
+        disable (list): Names of the pipeline components to disable.
         YIELDS (Doc): Documents in the order of the original text.

         EXAMPLE:
@@ -342,7 +342,7 @@ class Language(object):
         docs = texts
         for proc in self.pipeline:
             name = getattr(proc, 'name', None)
-            if name in disabled and not disabled[name]:
+            if name in disable:
                 continue
             if hasattr(proc, 'pipe'):
                 docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
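At the call site, the reworked keyword reads as follows; a minimal sketch, assuming an installed English model and that the pipeline components expose the names 'tagger', 'parser' and 'ner':

```python
import spacy

nlp = spacy.load('en')  # assumes the English model is installed

# Run the whole pipeline...
doc = nlp(u'This is a sentence.')

# ...or skip components by name with the new `disable` list, instead of
# the old keyword-argument style (**disabled).
doc = nlp(u'This is a sentence.', disable=['parser', 'ner'])

# The same keyword applies to the streaming API.
texts = [u'One document.', u'...and another.']
for doc in nlp.pipe(texts, batch_size=50, disable=['ner']):
    print(doc[0].tag_)  # tagged, but no entities were set
```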
@@ -352,12 +352,14 @@ class Language(object):
         for doc in docs:
             yield doc

-    def to_disk(self, path, **exclude):
-        """Save the current state to a directory.
+    def to_disk(self, path, disable=[]):
+        """Save the current state to a directory. If a model is loaded, this
+        will include the model.

         path (unicode or Path): A path to a directory, which will be created if
             it doesn't exist. Paths may be either strings or `Path`-like objects.
-        **exclude: Named attributes to prevent from being saved.
+        disable (list): Names of pipeline components to disable and prevent
+            from being saved.

         EXAMPLE:
             >>> nlp.to_disk('/path/to/models')
@@ -369,7 +371,7 @@ class Language(object):
             raise IOError("Output path must be a directory")
         props = {}
         for name, value in self.__dict__.items():
-            if name in exclude:
+            if name in disable:
                 continue
             if hasattr(value, 'to_disk'):
                 value.to_disk(path / name)
@@ -378,13 +380,14 @@ class Language(object):
         with (path / 'props.pickle').open('wb') as file_:
             dill.dump(props, file_)

-    def from_disk(self, path, **exclude):
+    def from_disk(self, path, disable=[]):
         """Loads state from a directory. Modifies the object in place and
-        returns it.
+        returns it. If the saved `Language` object contains a model, the
+        model will be loaded.

         path (unicode or Path): A path to a directory. Paths may be either
             strings or `Path`-like objects.
-        **exclude: Named attributes to prevent from being loaded.
+        disable (list): Names of the pipeline components to disable.
         RETURNS (Language): The modified `Language` object.

         EXAMPLE:
@@ -393,35 +396,36 @@ class Language(object):
         """
         path = util.ensure_path(path)
         for name in path.iterdir():
-            if name not in exclude and hasattr(self, str(name)):
+            if name not in disable and hasattr(self, str(name)):
                 getattr(self, name).from_disk(path / name)
         with (path / 'props.pickle').open('rb') as file_:
             bytes_data = file_.read()
-        self.from_bytes(bytes_data, **exclude)
+        self.from_bytes(bytes_data, disable)
         return self

-    def to_bytes(self, **exclude):
+    def to_bytes(self, disable=[]):
         """Serialize the current state to a binary string.

-        **exclude: Named attributes to prevent from being serialized.
+        disable (list): Names of pipeline components to disable and prevent
+            from being serialized.
         RETURNS (bytes): The serialized form of the `Language` object.
         """
         props = dict(self.__dict__)
-        for key in exclude:
+        for key in disable:
             if key in props:
                 props.pop(key)
         return dill.dumps(props, -1)

-    def from_bytes(self, bytes_data, **exclude):
+    def from_bytes(self, bytes_data, disable=[]):
         """Load state from a binary string.

         bytes_data (bytes): The data to load from.
-        **exclude: Named attributes to prevent from being loaded.
+        disable (list): Names of the pipeline components to disable.
         RETURNS (Language): The `Language` object.
         """
         props = dill.loads(bytes_data)
         for key, value in props.items():
-            if key not in exclude:
+            if key not in disable:
                 setattr(self, key, value)
         return self
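Taken together, the four methods above aim at round trips like the following; a sketch assuming a loaded `nlp` object, and note that whether a bare `Language()` can restore every component depends on code outside this diff:

```python
from spacy.language import Language

# Disk round trip: `disable` keeps the named components out of the saved state.
nlp.to_disk('/path/to/model', disable=['ner'])
nlp2 = Language().from_disk('/path/to/model')

# Bytes round trip, e.g. for shipping a pipeline between processes.
data = nlp.to_bytes(disable=['ner'])
nlp3 = Language().from_bytes(data)
```

Two details worth flagging in the implementation: `to_bytes()` pickles `self.__dict__` with dill, so any unpicklable attribute will fail at serialization time, and the mutable `disable=[]` default is only safe as long as the argument is never mutated.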
@@ -174,6 +174,7 @@ def get_async(stream, numpy_array):
     array.set(numpy_array, stream=stream)
     return array


 def itershuffle(iterable, bufsize=1000):
     """Shuffle an iterator. This works by holding `bufsize` items back
     and yielding them sometime later. Obviously, this is not unbiased --
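The docstring is truncated above, but it describes a buffered, and therefore biased, shuffle. A minimal self-contained sketch of that technique, not necessarily the implementation in this commit:

```python
import random

def itershuffle(iterable, bufsize=1000):
    """Approximately shuffle an iterator by holding up to `bufsize` items
    back and releasing them in random order. Biased: an item can only move
    a bounded distance away from its original position."""
    buf = []
    for item in iterable:
        buf.append(item)
        if len(buf) >= bufsize:
            # Buffer full: emit one random element to make room.
            yield buf.pop(random.randrange(len(buf)))
    # Input exhausted: flush the remainder, shuffled.
    random.shuffle(buf)
    for item in buf:
        yield item
```

For example, `list(itershuffle(range(10), bufsize=4))` yields all ten numbers in a roughly shuffled order without ever buffering more than four.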
@@ -37,7 +37,8 @@ mixin svg(file, name, width, height)
     size - [integer] icon width and height (default: 20)

 mixin icon(name, size)
-    +svg("icons", name, size || 20).o-icon&attributes(attributes)
+    - var size = size || 20
+    +svg("icons", name, size).o-icon(style="min-width: #{size}px")&attributes(attributes)


 //- Pro/Con/Neutral icon

@@ -185,3 +186,14 @@ mixin landing-header()
 mixin landing-badge(url, graphic, alt, size)
     +a(url)(aria-label=alt title=alt).c-landing__badge
         +svg("graphics", graphic, size || 225)
+
+
+//- Under construction (temporary)
+    Marks sections that still need to be completed for the v2.0 release.
+
+mixin under-construction()
+    +infobox("🚧 Under construction")
+        | This section is still being written and will be updated for the v2.0
+        | release. Is there anything that you think should definitely be mentioned
+        | or explained here? Any examples you'd like to see? #[strong Let us know]
+        | on the #[+a(gh("spacy") + "/issues") v2.0 alpha thread] on GitHub!
@@ -178,7 +178,7 @@ mixin label()
 //- Tag

 mixin tag()
-    span.u-text-tag.u-text-tag--spaced(aria-hidden="true")
+    span.u-text-tag.u-text-tag--spaced(aria-hidden="true")&attributes(attributes)
         block


@@ -192,6 +192,17 @@ mixin tag-model(...capabs)
     +help(intro + ext + ".").u-color-theme


+//- "New" tag to label features new in a specific version
+    By using a separate mixin with a version ID, it becomes easy to quickly
+    enable/disable tags without having to modify the markup in the docs.
+    version - [string or integer] version number, without "v" prefix
+
+mixin tag-new(version)
+    - var version = (typeof version == 'number') ? version.toFixed(1) : version
+    +tag(data-tooltip="This feature is new and was introduced in spaCy v#{version}.")
+        | v#{version}
+
+
 //- List
     type - [string] "numbers", "letters", "roman" (bulleted list if none set)
     start - [integer] start number
@@ -6,9 +6,17 @@ include _sidebar

 main.o-main.o-main--sidebar.o-main--aside
     article.o-content
-        +h(1)=title
-        if tag
-            +tag=tag
+        +grid.o-no-block
+            +grid-col(source ? "two-thirds" : "full")
+                +h(1)=title
+                if tag
+                    +tag=tag
+
+            if source
+                +grid-col("third").u-text-right
+                    .o-inline-list
+                        +button(gh("spacy", source), false, "secondary").u-text-tag Source #[+icon("code", 14)]

         if ALPHA
             +infobox("⚠️ You are viewing the spaCy v2.0 alpha docs")
@@ -1,128 +1,128 @@
-<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-1 -1 907 737" width="906" height="736">
+<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-1 -1 907 737" width="906" height="746">
     <style>
-        .text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
-        .text-medium { fill: #1a1e23; font: 17px "Source Sans Pro" }
-        .text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro" }
-        .text-code { fill: #1a1e23; font: 600 12px "Source Code Pro" }
+        .svg__architecture__text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
+        .svg__architecture__text-medium { fill: #1a1e23; font: 17px "Source Sans Pro" }
+        .svg__architecture__text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro" }
+        .svg__architecture__text-code { fill: #1a1e23; font: 600 12px "Source Code Pro" }
     </style>
     <ellipse cx="404" cy="203" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8"/>
-    <text class="text-large" transform="translate(362.5 206.5)" width="81" height="40">Language</text>
+    <text class="svg__architecture__text-large" transform="translate(362.5 206.5)" width="81" height="40">Language</text>
     <path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M345 432v242.8" stroke-dasharray="2 2"/>
     <path fill="#82b366" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M345 680.8l-4-8 4 2 4-2z"/>
     <rect fill="#fff" transform="translate(324 535.5)" width="37" height="18"/>
-    <text class="text-small" dy="1em" style="fill: #82b366" transform="translate(324 535.5)" width="37" height="18">MAKES</text>
+    <text class="svg__architecture__text-small" dy="1em" style="fill: #82b366" transform="translate(324 535.5)" width="37" height="18">MAKES</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M457 434l100.5 80"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M562.3 517.6l-8.8-1.8 4-2 1-4.3z"/>
     <rect fill="#f6f6f6" transform="translate(424.5 462.5)" width="158" height="18"/>
-    <text class="text-code" dy="1em" dx="0.5em" transform="translate(424.5 462.5)" width="158" height="18">nlp.vocab.morphology</text>
+    <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(424.5 462.5)" width="158" height="18">nlp.vocab.morphology</text>
     <ellipse cx="404" cy="399" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="74.8" ry="49.8"/>
-    <text class="text-large" dy="1em" transform="translate(377.5 386.5)" width="51" height="22">Vocab</text>
+    <text class="svg__architecture__text-large" dy="1em" transform="translate(377.5 386.5)" width="51" height="22">Vocab</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 253v87.8"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 346.8l-4-8 4 2 4-2z"/>
     <rect fill="#f6f6f6" transform="translate(364.5 285.5)" width="79" height="18" />
-    <text class="text-code" dy="1em" dx="0.5em" transform="translate(364.5 285.5)" width="79" height="18">nlp.vocab</text>
+    <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(364.5 285.5)" width="79" height="18">nlp.vocab</text>
     <ellipse cx="743" cy="399" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8"/>
-    <text class="text-large" transform="translate(694.5 386.5)" dy="1em" width="95" height="22">StringStore</text>
+    <text class="svg__architecture__text-large" transform="translate(694.5 386.5)" dy="1em" width="95" height="22">StringStore</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M478 399h181.8"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M665.8 399l-8 4 2-4-2-4z"/>
     <rect fill="#f6f6f6" transform="translate(498.5 388.5)" width="137" height="18"/>
-    <text class="text-code" dy="1em" dx="0.5em" transform="translate(498.5 388.5)" width="137" height="18">nlp.vocab.strings</text>
+    <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(498.5 388.5)" width="137" height="18">nlp.vocab.strings</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M108 244l235.6 115.4"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M349 362h-9l3.6-2.6V355z"/>
     <rect fill="#f6f6f6" transform="translate(141.5 284.5)" width="151" height="18" />
-    <text class="text-code" dy="1em" dx="0.5em" transform="translate(141.5 284.5)" width="151" height="18">nlp.tokenizer.vocab</text>
+    <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(141.5 284.5)" width="151" height="18">nlp.tokenizer.vocab</text>
     <path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M38.7 159.3H104l33 43.6-32.8 43.5H38.7L6 203z"/>
-    <text class="text-large" transform="translate(30.5 190.5)" dy="1em" width="80" height="22">Tokenizer</text>
+    <text class="svg__architecture__text-large" transform="translate(30.5 190.5)" dy="1em" width="80" height="22">Tokenizer</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M329 203v-1H145.2"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M139.2 202l8-4-2 4 2 4z"/>
     <rect fill="#f6f6f6" transform="translate(188.5 191.5)" width="115" height="18"/>
-    <text class="text-code" dy="1em" dx="0.5em" transform="translate(188.5 191.5)" width="115" height="18">nlp.make_doc()</text>
+    <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(188.5 191.5)" width="115" height="18">nlp.make_doc()</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M478 203h83v-4h105.8"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M672.8 199l-8 4 2-4-2-4z"/>
     <rect fill="#f6f6f6" transform="translate(512.5 191.5)" width="101" height="18"/>
-    <text class="text-code" dy="1em" dx="0.5em" transform="translate(512.5 191.5)" width="101" height="18">nlp.pipeline</text>
+    <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(512.5 191.5)" width="101" height="18">nlp.pipeline</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M709 242.8L464.4 359.4"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M459 362l5.5-7v4.4l3.5 2.8z"/>
     <rect fill="#f6f6f6" transform="translate(505.5 297.5)" width="166" height="18"/>
-    <text class="text-code" dy="1em" dx="0.5em" transform="translate(505.5 297.5)" width="166" height="18">nlp.pipeline[i].vocab</text>
+    <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(505.5 297.5)" width="166" height="18">nlp.pipeline[i].vocab</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M275.3 34.6L288.6 1h54L329 34.6z"/>
-    <text class="text-small" dy="0.85em" transform="translate(301.5 9.5)" width="12" height="14">pt</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(301.5 9.5)" width="12" height="14">pt</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M60.8 34.6L74.3 1h54l-13.6 33.6z"/>
-    <text class="text-small" dy="0.85em" transform="translate(86.5 9.5)" width="14" height="14">en</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(86.5 9.5)" width="14" height="14">en</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M114.4 34.6L128 1h53.8l-13.5 33.6z"/>
-    <text class="text-small" dy="0.85em" transform="translate(140.5 9.5)" width="14" height="14">de</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(140.5 9.5)" width="14" height="14">de</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M168 34.6L181.5 1h54l-13.6 33.6z"/>
-    <text class="text-small" dy="0.85em" transform="translate(196.5 9.5)" width="8" height="14">fr</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(196.5 9.5)" width="8" height="14">fr</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M221.6 34.6L235 1h54l-13.5 33.6z"/>
-    <text class="text-small" dy="0.85em" transform="translate(248.5 9.5)" width="12" height="14">es</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(248.5 9.5)" width="12" height="14">es</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M47 68.3l13.6-33.6h53.8L101 68.3z"/>
-    <text class="text-small" dy="0.85em" transform="translate(75.5 43.5)" width="8" height="14">it</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(75.5 43.5)" width="8" height="14">it</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M100.7 68.3l13.5-33.6H168l-13.4 33.6z"/>
-    <text class="text-small" dy="0.85em" transform="translate(127.5 43.5)" width="12" height="14">nl</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(127.5 43.5)" width="12" height="14">nl</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M154.3 68.3l13.5-33.6h53.8l-13.4 33.6z"/>
-    <text class="text-small" dy="0.85em" transform="translate(180.5 43.5)" width="12" height="14">sv</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(180.5 43.5)" width="12" height="14">sv</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M208 68.3l13.4-33.6h53.8L262 68.3z"/>
-    <text class="text-small" dy="0.85em" transform="translate(236.5 43.5)" width="8" height="14">fi</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(236.5 43.5)" width="8" height="14">fi</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M261.5 68.3L275 34.7h54l-13.6 33.6z"/>
-    <text class="text-small" dy="0.85em" transform="translate(286.5 43.5)" width="16" height="14">nb</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(286.5 43.5)" width="16" height="14">nb</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M33.4 102L47 68.2h53.7L87.3 102z"/>
-    <text class="text-small" dy="0.85em" transform="translate(58.5 77.5)" width="16" height="14">hu</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(58.5 77.5)" width="16" height="14">hu</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M87 102l13.5-33.7h53.8L141 102z"/>
-    <text class="text-small" dy="0.85em" transform="translate(112.5 77.5)" width="14" height="14">he</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(112.5 77.5)" width="14" height="14">he</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M140.6 102L154 68.2h54L194.4 102z"/>
-    <text class="text-small" dy="0.85em" transform="translate(165.5 77.5)" width="16" height="14">bn</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(165.5 77.5)" width="16" height="14">bn</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M194.2 102l13.5-33.7h53.8L248 102z"/>
-    <text transform="translate(221.5 77.5)" class="text-small" dy="0.85em" width="10" height="14">ja</text>
+    <text transform="translate(221.5 77.5)" class="svg__architecture__text-small" dy="0.85em" width="10" height="14">ja</text>
     <path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M247.8 102l13.5-33.7H315L301.8 102z"/>
-    <text class="text-small" dy="0.85em" transform="translate(273.5 77.5)" width="14" height="14">zh</text>
+    <text class="svg__architecture__text-small" dy="0.85em" transform="translate(273.5 77.5)" width="14" height="14">zh</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M329 51h75v93.8"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 150.8l-4-8 4 2 4-2z"/>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M213 480l109.3-76.3"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M327.2 400.3L323 408l-.8-4.3-4-2z"/>
     <rect fill="#f6f6f6" transform="translate(226.5 431.5)" width="79" height="18"/>
-    <text class="text-code" dy="1em" dx="0.5em" transform="translate(226.5 431.5)" width="79" height="18">doc.vocab</text>
+    <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(226.5 431.5)" width="79" height="18">doc.vocab</text>
     <path fill="none" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M39.6 555.5l.4 121.3" stroke-dasharray="2 2"/>
     <path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M40 682.8l-4-8 4 2 4-2z"/>
     <rect fill="#fff" transform="translate(23.5 604.5)" width="37" height="18"/>
-    <text class="text-small" style="fill: #9673a6" dy="1em" dx="-0.5em" transform="translate(23.5 604.5)" width="37" height="18">MAKES</text>
+    <text class="svg__architecture__text-small" style="fill: #9673a6" dy="1em" dx="-0.5em" transform="translate(23.5 604.5)" width="37" height="18">MAKES</text>
     <path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M1 479.5h283v74.8H1z"/>
-    <text class="text-large" dy="1em" transform="translate(125.5 504.5)" width="32" height="22">Doc</text>
+    <text class="svg__architecture__text-large" dy="1em" transform="translate(125.5 504.5)" width="32" height="22">Doc</text>
     <path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M71 246v117h1v108.8" stroke-dasharray="2 2"/>
     <path fill="#c00" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M72 477.8l-4-8 4 2 4-2z"/>
     <rect fill="#fff" transform="translate(54.5 355.5)" width="37" height="18"/>
-    <text class="text-small" style="fill: #cc0000" dy="1em" dx="-0.5em" transform="translate(54.5 355.5)" width="37" height="18">MAKES</text>
+    <text class="svg__architecture__text-small" style="fill: #cc0000" dy="1em" dx="-0.5em" transform="translate(54.5 355.5)" width="37" height="18">MAKES</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M104 685l.4-121.2"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M104.5 557.8l4 8-4-2-4 2z"/>
     <rect fill="#f6f6f6" transform="translate(62.5 632.5)" width="79" height="18"/>
-    <text class="text-code" dy="1em" dx="0.5em" transform="translate(62.5 632.5)" width="79" height="18">token.doc</text>
+    <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(62.5 632.5)" width="79" height="18">token.doc</text>
     <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M7.2 685h129.6v50H7.2z"/>
-    <text class="text-medium" dy="1em" transform="translate(49.5 700.5)" width="43" height="18">Token</text>
+    <text class="svg__architecture__text-medium" dy="1em" transform="translate(49.5 700.5)" width="43" height="18">Token</text>
     <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M148 685h129.7v50H148z"/>
-    <text class="text-medium" dy="1em" transform="translate(193.5 700.5)" width="37" height="18">Span</text>
+    <text class="svg__architecture__text-medium" dy="1em" transform="translate(193.5 700.5)" width="37" height="18">Span</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M405 686V456.6"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M405 450.6l4 8-4-2-4 2z"/>
     <rect fill="#f6f6f6" transform="translate(356.5 584.5)" width="101" height="18"/>
-    <text class="text-code" dy="1em" dx="0.5em" transform="translate(356.5 584.5)" width="101" height="18">lexeme.vocab</text>
+    <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(356.5 584.5)" width="101" height="18">lexeme.vocab</text>
     <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M296.7 685h155.8v50H296.7z"/>
-    <text class="text-medium" dy="1em" transform="translate(346.5 700.5)" width="55" height="18">Lexeme</text>
+    <text class="svg__architecture__text-medium" dy="1em" transform="translate(346.5 700.5)" width="55" height="18">Lexeme</text>
     <path fill="none" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M180.5 559.3l.5 117.5" stroke-dasharray="2 2"/>
     <path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M181 682.8l-4-8 4 2 4-2z"/>
     <rect fill="#fff" transform="translate(164.5 606.5)" width="37" height="18" />
-    <text class="text-small" style="fill: #9673a6" dy="1em" dx="-0.5em" transform="translate(164.5 606.5)" width="37" height="18">MAKES</text>
+    <text class="svg__architecture__text-small" style="fill: #9673a6" dy="1em" dx="-0.5em" transform="translate(164.5 606.5)" width="37" height="18">MAKES</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M245.3 685V564"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M245.3 557.8l4 8-4-2-4 2z"/>
     <rect fill="#f6f6f6" transform="translate(211.5 633.5)" width="72" height="18"/>
-    <text class="text-code" dy="1em" dx="0.5em" transform="translate(211.5 633.5)" width="72" height="18">span.doc</text>
+    <text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(211.5 633.5)" width="72" height="18">span.doc</text>
     <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.6 112H872l32.8 43.5L872 199h-65.4L774 155.6z"/>
-    <text class="text-medium" dy="1em" transform="translate(794.5 135.5)" width="88" height="38">Dependency <tspan dy="1.25em" dx="-4.1em">Parser</tspan></text>
+    <text class="svg__architecture__text-medium" dy="1em" transform="translate(794.5 135.5)" width="88" height="38">Dependency <tspan dy="1.25em" dx="-4.1em">Parser</tspan></text>
     <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.6 199H872l32.8 43.8-32.8 43.6h-65.4L774 242.8z"/>
-    <text class="text-medium" dy="1em" dx="1.1em" transform="translate(799.5 222.5)" width="78" height="38">Entity <tspan dy="1.25em" dx="-3.75em">Recognizer</tspan></text>
+    <text class="svg__architecture__text-medium" dy="1em" dx="1.1em" transform="translate(799.5 222.5)" width="78" height="38">Entity <tspan dy="1.25em" dx="-3.75em">Recognizer</tspan></text>
     <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M708 155.5h65.6l32.7 43.6-32.7 43.8H708L675.5 199z"/>
-    <text class="text-medium" dy="1em" transform="translate(715.5 189.5)" width="48" height="18">Tagger</text>
+    <text class="svg__architecture__text-medium" dy="1em" transform="translate(715.5 189.5)" width="48" height="18">Tagger</text>
     <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.8 24.5h65.5L905 68 872.3 112h-65.5L774 68z"/>
-    <text class="text-medium" dy="1em" transform="translate(809.5 58.5)" width="58" height="18">Matcher</text>
+    <text class="svg__architecture__text-medium" dy="1em" transform="translate(809.5 58.5)" width="58" height="18">Matcher</text>
     <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M708.6 68H774l32.8 43.5L774 155h-65.4L676 111.6z"/>
-    <text class="text-medium" dy="1em" transform="translate(698.5 101.5)" width="84" height="18">Lemmatizer</text>
+    <text class="svg__architecture__text-medium" dy="1em" transform="translate(698.5 101.5)" width="84" height="18">Lemmatizer</text>
     <ellipse cx="617" cy="555" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8"/>
-    <text class="text-large" dy="1em" transform="translate(565.5 542.5)" width="101" height="22">Morphology</text>
+    <text class="svg__architecture__text-large" dy="1em" transform="translate(565.5 542.5)" width="101" height="22">Morphology</text>
 </svg>
@@ -1,13 +1,13 @@
 <svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="931" height="456" viewBox="-1 -1 932 480" preserveAspectRatio="xMinYMin meet">
     <style>
-        .text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
-        .text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
-        .text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
+        .svg__langdata__text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
+        .svg__langdata__text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
+        .svg__langdata__text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
     </style>
     <path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M610 404h-69.8" stroke-dasharray="1 6" stroke-linecap="round"/>
     <path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M534.2 404l8-4-2 4 2 4z"/>
     <path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M642.7 361.3H708l33 43.6-33 43.5H643L610 405z"/>
-    <text class="text-large" transform="translate(634.5 410)" width="80" height="22">Tokenizer</text>
+    <text class="svg__langdata__text-large" transform="translate(634.5 410)" width="80" height="22">Tokenizer</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M780 303H621v-56.8"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M621 240.2l4 8-4-2-4 2z"/>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M855 253v-20.8"/>
@@ -17,7 +17,7 @@
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M780 303H504v-56.8"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M504 240.2l4 8-4-2-4 2z"/>
     <ellipse cx="855" cy="303" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8"/>
-    <text class="text-large" transform="translate(815 308)" width="119" height="46">Base data</text>
+    <text class="svg__langdata__text-large" transform="translate(815 308)" width="119" height="46">Base data</text>
     <path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100l.4 39.8"/>
     <path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389.5 145.8l-4-8 4 2 4-2z"/>
     <path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100v17h232v22.8"/>
@@ -33,50 +33,50 @@
     <path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100v17H46v22.8"/>
     <path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M46 145.8l-4-8 4 2 4-2z"/>
     <ellipse cx="389" cy="50" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8"/>
-    <text class="text-large" transform="translate(350 42)" width="81" height="46">Language <tspan dy="1.45em" dx="-3.25em">data</tspan></text>
+    <text class="svg__langdata__text-large" transform="translate(350 42)" width="81" height="46">Language <tspan dy="1.45em" dx="-3.25em">data</tspan></text>
     <path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M435 193h15.8"/>
     <path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M456.8 193l-8 4 2-4-2-4z"/>
     <ellipse cx="390" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
-    <text class="text-small" transform="translate(375 187.5)" width="39" height="30">stop <tspan dx="-2.5em" dy="1.25em">words</tspan></text>
+    <text class="svg__langdata__text-small" transform="translate(375 187.5)" width="39" height="30">stop <tspan dx="-2.5em" dy="1.25em">words</tspan></text>
     <path fill="none" stroke="#9673a6" stroke-width="3" stroke-miterlimit="10" d="M472 225l-1.5 133.8" stroke-dasharray="1 6" stroke-linecap="round"/>
     <path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M470.4 364.8l-4-8 4 2 4-2z"/>
     <ellipse cx="504" cy="193" fill="#f5f5f5" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
-    <text class="text-small" transform="translate(481 187.5)" width="85" height="30">lexical <tspan dx="-3.75em" dy="1.25em">attributes</tspan></text>
+    <text class="svg__langdata__text-small" transform="translate(481 187.5)" width="85" height="30">lexical <tspan dx="-3.75em" dy="1.25em">attributes</tspan></text>
     <path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M653 225l5.6 127.8" stroke-dasharray="1 6" stroke-linecap="round"/>
     <path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M659 358.8l-4.5-8 4 2 4-2.2z"/>
     <path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M576 193h-18.8"/>
     <path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M551.2 193l8-4-2 4 2 4z"/>
     <ellipse cx="621" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
-    <text class="text-small" transform="translate(588 187.5)" width="85" height="30">tokenizer <tspan dx="-4.6em" dy="1.25em">exceptions</tspan></text>
+    <text class="svg__langdata__text-small" transform="translate(588 187.5)" width="85" height="30">tokenizer <tspan dx="-4.6em" dy="1.25em">exceptions</tspan></text>
     <path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M690 193h-15.8"/>
     <path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M668.2 193l8-4-2 4 2 4z"/>
     <path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M703 225l-10.3 127.8" stroke-dasharray="1 6" stroke-linecap="round"/>
     <path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M692.2 358.8l-3.4-8.3 4 2.3 4-1.7z"/>
     <ellipse cx="735" cy="193" fill="#f5f5f5" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
-    <text class="text-small" transform="translate(705 182)" width="53" height="46">prefixes, <tspan dy="1.25em" dx="-3.9em">suffixes,</tspan> <tspan dy="1.25em" dx="-3.6em">infixes</tspan>
+    <text class="svg__langdata__text-small" transform="translate(705 182)" width="53" height="46">prefixes, <tspan dy="1.25em" dx="-3.9em">suffixes,</tspan> <tspan dy="1.25em" dx="-3.6em">infixes</tspan>
     </text>
     <path fill="none" stroke="#d79b00" stroke-width="3" stroke-miterlimit="10" d="M280 238v114.8" stroke-dasharray="1 6" stroke-linecap="round"/>
     <path fill="#d79b00" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M280 358.8l-4-8 4 2 4-2z"/>
     <ellipse cx="280" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
-    <text class="text-small" transform="translate(254 187.5)" width="71" height="30">lemma <tspan dy="1.25em" dx="-2.7em">data</tspan></text>
+    <text class="svg__langdata__text-small" transform="translate(254 187.5)" width="71" height="30">lemma <tspan dy="1.25em" dx="-2.7em">data</tspan></text>
     <path fill="none" stroke="#d79b00" stroke-width="3" stroke-miterlimit="10" d="M346 404h53.8" stroke-dasharray="1 6" stroke-linecap="round"/>
     <path fill="#d79b00" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M405.8 404l-8 4 2-4-2-4z"/>
     <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M247.7 361.3H313l33 43.6-33 43.5h-65.3L215 405z"/>
-    <text class="text-large" transform="translate(232 410)" width="100" height="22">Lemmatizer</text>
+    <text class="svg__langdata__text-large" transform="translate(232 410)" width="100" height="22">Lemmatizer</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M823 193h-34.8"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M782.2 193l8-4-2 4 2 4z"/>
     <ellipse cx="855" cy="193" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="31.5" ry="31.5"/>
-    <text class="text-tiny" transform="translate(838 189)" width="50" height="30">char <tspan dy="1.1em" dx="-2.75em">classes</tspan></text>
+    <text class="svg__langdata__text-tiny" transform="translate(838 189)" width="50" height="30">char <tspan dy="1.1em" dx="-2.75em">classes</tspan></text>
     <path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M408 367h124v74H408z"/>
-    <text class="text-large" transform="translate(443.5 410)" width="51" height="22">Token</text>
+    <text class="svg__langdata__text-large" transform="translate(443.5 410)" width="51" height="22">Token</text>
     <path fill="none" stroke="#666" stroke-width="3" stroke-miterlimit="10" d="M131 225l-21 122.2" stroke-dasharray="1 6" stroke-linecap="round"/>
     <path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M109 353l-2.5-8.5 3.6 2.7 4.4-1.3z"/>
     <ellipse cx="163" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
-    <text class="text-small" transform="translate(139 187.5)" width="45" height="30">morph <tspan dy="1.25em" dx="-2.8em">rules</tspan></text>
+    <text class="svg__langdata__text-small" transform="translate(139 187.5)" width="45" height="30">morph <tspan dy="1.25em" dx="-2.8em">rules</tspan></text>
     <path fill="none" stroke="#666" stroke-width="3" stroke-miterlimit="10" d="M78 225l15.4 122" stroke-dasharray="1 6" stroke-linecap="round"/>
     <path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M94.2 353l-5-7.5 4.2 1.5 3.7-2.5z"/>
     <ellipse cx="46" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
-    <text class="text-small" transform="translate(33 187.5)" width="27" height="30">tag <tspan dy="1.25em" dx="-1.8em">map</tspan></text>
+    <text class="svg__langdata__text-small" transform="translate(33 187.5)" width="27" height="30">tag <tspan dy="1.25em" dx="-1.8em">map</tspan></text>
     <ellipse cx="101" cy="405" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.5" ry="49.5"/>
-    <text class="text-large" transform="translate(49.5 410)" width="100" height="22">Morphology</text>
+    <text class="svg__langdata__text-large" transform="translate(49.5 410)" width="100" height="22">Morphology</text>
 </svg>
@@ -1,30 +1,30 @@
 <svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 923 200" width="923" height="200">
     <style>
-        .text { fill: #1a1e23; font: 20px "Source Sans Pro" }
-        .text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro" }
-        .text-code { fill: #1a1e23; font: 600 16px "Source Code Pro" }
+        .svg__pipeline__text { fill: #1a1e23; font: 20px "Source Sans Pro" }
+        .svg__pipeline__text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro" }
+        .svg__pipeline__text-code { fill: #1a1e23; font: 600 16px "Source Code Pro" }
     </style>
     <rect width="601" height="127" x="159" y="21" fill="none" stroke="#09a3d5" stroke-width="3" rx="19.1" stroke-dasharray="3 6" ry="19.1"/>
     <path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M801 55h120v60H801z"/>
-    <text class="text" dy="0.75em" width="28" height="19" transform="translate(846.5 75.5)">Doc</text>
+    <text class="svg__pipeline__text" dy="0.75em" width="28" height="19" transform="translate(846.5 75.5)">Doc</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M121.2 84.7h29.4"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M156.6 84.7l-8 4 2-4-2-4z"/>
     <path fill="#f5f5f5" stroke="#999" stroke-width="2" d="M1 55h120v60H1z"/>
-    <text class="text" dy="0.85em" width="34" height="22" transform="translate(43.5 73.5)">Text</text>
+    <text class="svg__pipeline__text" dy="0.85em" width="34" height="22" transform="translate(43.5 73.5)">Text</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M760 84.7h33"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M799 84.7l-8 4 2-4-2-4z"/>
     <rect width="75" height="39" x="422" y="1" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="5.8" ry="5.8"/>
-    <text class="text-code" dy="0.8em" dx="0.1em" width="29" height="17" transform="translate(444.5 11.5)">nlp</text>
+    <text class="svg__pipeline__text-code" dy="0.8em" dx="0.1em" width="29" height="17" transform="translate(444.5 11.5)">nlp</text>
     <path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M176 58h103.3L296 88l-16.8 30H176l16.8-30z"/>
-    <text class="text-small" dy="0.75em" dx="-0.25em" width="58" height="14" transform="translate(206.5 80.5)">tokenizer</text>
+    <text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="58" height="14" transform="translate(206.5 80.5)">tokenizer</text>
     <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M314 58h103.3L434 88l-16.8 30H314l16.8-30z"/>
-    <text class="text-small" dy="0.75em" dx="-0.25em" width="62" height="14" transform="translate(342.5 80.5)">vectorizer</text>
+    <text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="62" height="14" transform="translate(342.5 80.5)">vectorizer</text>
     <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M296.5 88.2h24.7"/>
     <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M327.2 88.2l-8 4 2-4-2-4z"/>
     <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M416 58h103.3L536 88l-16.8 30H416l16.8-30z"/>
-    <text class="text-small" dy="0.75em" dx="-0.25em" width="40" height="14" transform="translate(455.5 80.5)">tagger</text>
+    <text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="40" height="14" transform="translate(455.5 80.5)">tagger</text>
     <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M519 58h103.3L639 88l-16.8 30H519l16.8-30z"/>
-    <text class="text-small" dy="0.75em" dx="-0.25em" width="40" height="14" transform="translate(558.5 80.5)">parser</text>
+    <text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="40" height="14" transform="translate(558.5 80.5)">parser</text>
     <path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M622 58h103.3L742 88l-16.8 30H622l16.8-30z"/>
-    <text class="text-small" dy="0.75em" dx="-0.25em" width="20" height="14" transform="translate(671.5 80.5)">ner</text>
+    <text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="20" height="14" transform="translate(671.5 80.5)">ner</text>
 </svg>
website/assets/img/docs/vocab_stringstore.svg (new file, 77 lines)
@@ -0,0 +1,77 @@
+<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-10 -10 582 365" width="572" height="355">
+    <style>
+        .svg__vocab__text { fill: #1a1e23; font: 18px "Source Sans Pro" }
+        .svg__vocab__text-large { fill: #fff; font: bold 18px "Source Sans Pro"; text-transform: uppercase }
+        .svg__vocab__text-box { fill: #fff; font: bold 12px "Source Code Pro" }
+        .svg__vocab__text-code { fill: #1a1e23; font: bold 12px "Source Code Pro" }
+    </style>
+    <rect width="570" height="88" x="1" y="135" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="13.2" ry="13.2"/>
+    <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 164h100v40H444z"/>
+    <text class="svg__vocab__text" dy="1em" transform="translate(477.5 174.5)" width="31" height="17">3572</text>
+    <rect width="52" height="20" x="468" y="152" fill="#666" rx="3" ry="3"/>
+    <text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(471.5 155.5)">Lexeme</text>
+    <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M76 164h100v40H76z"/>
+    <text class="svg__vocab__text" dy="1em" width="23" height="17" transform="translate(113.5 174.5)">508</text>
+    <rect width="52" height="20" x="100" y="152" fill="#666" rx="3" ry="3"/>
+    <text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(103.5 155.5)">Lexeme</text>
+    <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M263 164h100v40H263z"/>
+    <text class="svg__vocab__text" dy="1em" width="23" height="17" transform="translate(300.5 174.5)">949</text>
+    <rect width="52" height="20" x="287" y="152" fill="#666" rx="3" ry="3"/>
+    <text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(290.5 155.5)">Lexeme</text>
+    <rect width="570" height="88" x="1" y="246" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="13.2" ry="13.2"/>
+    <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 275h100v40H444z"/>
+    <text class="svg__vocab__text" dy="1em" width="55" height="17" transform="translate(465.5 285.5)">"coffee"</text>
+    <rect width="52" height="20" x="468" y="263" fill="#666" rx="3" ry="3"/>
+    <text class="svg__vocab__text-box" dy="0.9em" width="28" height="12" transform="translate(479.5 266.5)">3672</text>
+    <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M76 275h100v40H76z"/>
+    <text class="svg__vocab__text" dy="1em" width="17" height="17" transform="translate(116.5 285.5)">"I"</text>
+    <rect width="52" height="20" x="100" y="263" fill="#666" rx="3" ry="3"/>
+    <text class="svg__vocab__text-box" dy="0.9em" width="22" height="12" transform="translate(114.5 266.5)">508</text>
+    <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M263 275h100v40H263z"/>
+    <text class="svg__vocab__text" dy="1em" width="41" height="17" transform="translate(291.5 285.5)">"love"</text>
+    <rect width="52" height="20" x="287" y="263" fill="#666" rx="3" ry="3"/>
+    <text class="svg__vocab__text-box" dy="0.9em" width="22" height="12" transform="translate(301.5 266.5)">949</text>
+    <rect width="570" height="110" x="1" y="1" fill="#e1d5e7" stroke="#9673a6" stroke-width="2" rx="16.5" ry="16.5"/>
+    <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M263 60h-78.8"/>
+    <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M178.2 60l8-4-2 4 2 4z"/>
+    <rect fill="#E1D5E7" width="50" height="12" transform="translate(202.5 53.5)"/>
+    <text class="svg__vocab__text-code" dx="0.5em" dy="1em" width="50" height="12" transform="translate(202.5 53.5)">nsubj</text>
+    <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M363 60h72.8"/>
+    <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M441.8 60l-8 4 2-4-2-4z"/>
+    <rect fill="#E1D5E7" width="43" height="12" transform="translate(375.5 54.5)"/>
+    <text class="svg__vocab__text-code" dx="0.5em" dy="1em" width="43" height="12" transform="translate(375.5 54.5)">dobj</text>
+    <rect width="50" height="88" x="1" y="246" fill="#666" stroke="#666" stroke-width="2" rx="7.5" ry="7.5"/>
+    <text class="svg__vocab__text-large" dx="-0.25em" dy="1em" width="53" height="36" transform="rotate(-90 162 155)">String</text>
+    <text class="svg__vocab__text-large" dy="2em" width="53" height="36" transform="rotate(-90 162 155)">Store</text>
+    <rect width="50" height="88" x="1" y="135" fill="#82b366" stroke="#82b366" stroke-width="2" rx="7.5" ry="7.5"/>
+    <text class="svg__vocab__text-large" dx="-0.25em" dy="0.9em" width="47" height="17" transform="rotate(-90 109.5 93)">Vocab</text>
+    <rect width="50" height="110" x="1" y="1" fill="#9673a6" stroke="#9673a6" stroke-width="2" rx="7.5" ry="7.5"/>
+    <text class="svg__vocab__text-large" dx="-0.25em" dy="0.9em" width="31" height="17" transform="rotate(-90 44 27.5)">Doc</text>
+    <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M263 27h100v66H263z"/>
+    <text class="svg__vocab__text" dy="1em" width="31" height="33" transform="translate(296.5 42.5)">love</text>
+    <text class="svg__vocab__text-code" dy="2.8em" width="31" height="33" transform="translate(296.5 42.5)">VERB</text>
+    <rect width="50" height="20" x="288" y="16" fill="#666" rx="3" ry="3"/>
+    <text class="svg__vocab__text-box" dy="0.9em" transform="translate(294.5 19.5)">Token</text>
+    <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M76 27h100v66H76z"/>
+    <text class="svg__vocab__text" dx="0.8em" dy="1em" width="29" height="33" transform="translate(110.5 42.5)">I</text>
+    <text class="svg__vocab__text-code" dy="2.8em" width="29" height="33" transform="translate(110.5 42.5)">PRON</text>
+    <rect width="50" height="20" x="105" y="17" fill="#666" rx="3" ry="3"/>
+    <text class="svg__vocab__text-box" dy="0.9em" width="36" height="12" transform="translate(111.5 20.5)">Token</text>
+    <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 27h100v66H444z"/>
+    <text class="svg__vocab__text" dy="1em" width="45" height="33" transform="translate(470.5 42.5)">coffee</text>
+    <text class="svg__vocab__text-code" dx="0.6em" dy="2.8em" width="45" height="33" transform="translate(470.5 42.5)">NOUN</text>
+    <rect width="50" height="20" x="469" y="16" fill="#666" rx="3" ry="3"/>
+    <text class="svg__vocab__text-box" dy="0.9em" width="36" height="12" transform="translate(475.5 19.5)">Token</text>
+    <path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M126 141.8v-38.6"/>
+    <path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M126 149.8l-2.7-8h5.4zM126 95.2l2.7 8h-5.4z"/>
+    <path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M126 214.2v38.6"/>
+    <path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M126 206.2l2.7 8h-5.4zM126 260.8l-2.7-8h5.4z"/>
+    <path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M313 103.2v38.6"/>
+    <path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M313 95.2l2.7 8h-5.4zM313 149.8l-2.7-8h5.4z"/>
+    <path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M313 214.2v38.6"/>
+    <path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M313 206.2l2.7 8h-5.4zM313 260.8l-2.7-8h5.4z"/>
+    <path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M494 214.2v38.6"/>
+    <path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M494 206.2l2.7 8h-5.4zM494 260.8l-2.7-8h5.4z"/>
+    <path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M494 103.2v38.6"/>
+    <path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M494 95.2l2.7 8h-5.4zM494 149.8l-2.7-8h5.4z"/>
+</svg>
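The new graphic encodes the relationship the docs describe: tokens and lexemes carry integer IDs, while the shared StringStore maps IDs back to text. Roughly, in code (the concrete IDs in the graphic are illustrative; real values differ per model):

```python
import spacy

nlp = spacy.load('en')
doc = nlp(u'I love coffee')

coffee_id = nlp.vocab.strings[u'coffee']          # text -> integer ID
assert nlp.vocab.strings[coffee_id] == u'coffee'  # integer ID -> text
assert doc[2].orth == coffee_id                   # the token stores the ID...
assert doc[2].orth_ == u'coffee'                  # ...and exposes the text view
```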
@@ -24,7 +24,8 @@
         "Vocab": "vocab",
         "StringStore": "stringstore",
         "GoldParse": "goldparse",
-        "GoldCorpus": "goldcorpus"
+        "GoldCorpus": "goldcorpus",
+        "Binder": "binder"
     },

     "Other": {
         "Annotation Specs": "annotation"
@@ -47,62 +48,74 @@

     "spacy": {
         "title": "spaCy top-level functions",
         "source": "spacy/__init__.py",
         "next": "displacy"
     },

     "displacy": {
         "title": "displaCy",
         "tag": "module",
         "source": "spacy/displacy",
         "next": "util"
     },

     "util": {
         "title": "Utility Functions",
         "source": "spacy/util.py",
         "next": "cli"
     },

     "cli": {
-        "title": "Command Line Interface"
+        "title": "Command Line Interface",
+        "source": "spacy/cli"
     },

     "language": {
         "title": "Language",
-        "tag": "class"
+        "tag": "class",
+        "source": "spacy/language.py"
     },

     "doc": {
         "title": "Doc",
-        "tag": "class"
+        "tag": "class",
+        "source": "spacy/tokens/doc.pyx"
     },

     "token": {
         "title": "Token",
-        "tag": "class"
+        "tag": "class",
+        "source": "spacy/tokens/token.pyx"
     },

     "span": {
         "title": "Span",
-        "tag": "class"
+        "tag": "class",
+        "source": "spacy/tokens/span.pyx"
     },

     "lexeme": {
         "title": "Lexeme",
-        "tag": "class"
+        "tag": "class",
+        "source": "spacy/lexeme.pyx"
     },

     "vocab": {
         "title": "Vocab",
-        "tag": "class"
+        "tag": "class",
+        "source": "spacy/vocab.pyx"
     },

     "stringstore": {
         "title": "StringStore",
-        "tag": "class"
+        "tag": "class",
+        "source": "spacy/strings.pyx"
     },

     "matcher": {
         "title": "Matcher",
-        "tag": "class"
+        "tag": "class",
+        "source": "spacy/matcher.pyx"
     },

     "dependenyparser": {

@@ -122,7 +135,8 @@

     "tokenizer": {
         "title": "Tokenizer",
-        "tag": "class"
+        "tag": "class",
+        "source": "spacy/tokenizer.pyx"
     },

     "tagger": {

@@ -132,11 +146,18 @@

     "goldparse": {
         "title": "GoldParse",
-        "tag": "class"
+        "tag": "class",
+        "source": "spacy/gold.pyx"
     },

     "goldcorpus": {
         "title": "GoldCorpus",
-        "tag": "class"
+        "tag": "class",
+        "source": "spacy/gold.pyx"
     },
+
+    "binder": {
+        "title": "Binder",
+        "tag": "class"
+    },
website/docs/api/binder.jade (new file, 5 lines)
@@ -0,0 +1,5 @@
+//- 💫 DOCS > API > BINDER
+
+include ../../_includes/_mixins
+
++under-construction
@@ -166,7 +166,7 @@ p
     | #[+a("/docs/api/annotation#json-input") JSON format].

 +code(false, "bash").
-    python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--parser-L1] [--no-tagger] [--no-parser] [--no-ner]
+    python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]

 +table(["Argument", "Type", "Description"])
     +row
@@ -192,18 +192,13 @@ p
     +row
         +cell #[code --n-iter], #[code -n]
         +cell option
-        +cell Number of iterations (default: #[code 15]).
+        +cell Number of iterations (default: #[code 20]).

     +row
-        +cell #[code --n_sents], #[code -ns]
+        +cell #[code --n-sents], #[code -ns]
         +cell option
         +cell Number of sentences (default: #[code 0]).

-    +row
-        +cell #[code --parser-L1], #[code -L]
-        +cell option
-        +cell L1 regularization penalty for parser (default: #[code 0.0]).
-
     +row
         +cell #[code --use-gpu], #[code -G]
         +cell flag
@@ -220,7 +215,7 @@ p
         +cell Don't train parser.

     +row
-        +cell #[code --no-ner], #[code -N]
+        +cell #[code --no-entities], #[code -N]
         +cell flag
         +cell Don't train NER.
@ -229,6 +224,106 @@ p
|
|||
+cell flag
|
||||
+cell Show help message and available arguments.
|
||||
|
||||
+h(3, "train-hyperparams") Environment variables for hyperparameters
|
||||
|
||||
p
|
||||
| spaCy lets you set hyperparameters for training via environment variables.
|
||||
| This is useful, because it keeps the command simple and allows you to
|
||||
| #[+a("https://askubuntu.com/questions/17536/how-do-i-create-a-permanent-bash-alias/17537#17537") create an alias]
|
||||
| for your custom #[code train] command while still being able to easily
|
||||
| tweak the hyperparameters. For example:
|
||||
|
||||
+code(false, "bash").
|
||||
parser_hidden_depth=2 parser_maxout_pieces=1 train-parser
|
||||
|
||||
+under-construction
|
||||
|
||||
+table(["Name", "Description", "Default"])
|
||||
+row
|
||||
+cell #[code dropout_from]
|
||||
+cell
|
||||
+cell #[code 0.2]
|
||||
|
||||
+row
|
||||
+cell #[code dropout_to]
|
||||
+cell
|
||||
+cell #[code 0.2]
|
||||
|
||||
+row
|
||||
+cell #[code dropout_decay]
|
||||
+cell
|
||||
+cell #[code 0.0]
|
||||
|
||||
+row
|
||||
+cell #[code batch_from]
|
||||
+cell
|
||||
+cell #[code 1]
|
||||
|
||||
+row
|
||||
+cell #[code batch_to]
|
||||
+cell
|
||||
+cell #[code 64]
|
||||
|
||||
+row
|
||||
+cell #[code batch_compound]
|
||||
+cell
|
||||
+cell #[code 1.001]
|
||||
|
||||
+row
|
||||
+cell #[code token_vector_width]
|
||||
+cell
|
||||
+cell #[code 128]
|
||||
|
||||
+row
|
||||
+cell #[code embed_size]
|
||||
+cell
|
||||
+cell #[code 7500]
|
||||
|
||||
+row
|
||||
+cell #[code parser_maxout_pieces]
|
||||
+cell
|
||||
+cell #[code 2]
|
||||
|
||||
+row
|
||||
+cell #[code parser_hidden_depth]
|
||||
+cell
|
||||
+cell #[code 1]
|
||||
|
||||
+row
|
||||
+cell #[code hidden_width]
|
||||
+cell
|
||||
+cell #[code 128]
|
||||
|
||||
+row
|
||||
+cell #[code learn_rate]
|
||||
+cell
|
||||
+cell #[code 0.001]
|
||||
|
||||
+row
|
||||
+cell #[code optimizer_B1]
|
||||
+cell
|
||||
+cell #[code 0.9]
|
||||
|
||||
+row
|
||||
+cell #[code optimizer_B2]
|
||||
+cell
|
||||
+cell #[code 0.999]
|
||||
|
||||
+row
|
||||
+cell #[code optimizer_eps]
|
||||
+cell
|
||||
+cell #[code 1e-08]
|
||||
|
||||
+row
|
||||
+cell #[code L2_penalty]
|
||||
+cell
|
||||
+cell #[code 1e-06]
|
||||
|
||||
+row
|
||||
+cell #[code grad_norm_clip]
|
||||
+cell
|
||||
+cell #[code 1.0]
|
||||
|
||||
+h(2, "package") Package
|
||||
|
||||
p
|
||||
|
|
|
@ -10,6 +10,7 @@ p
|
|||
|
||||
+h(2, "serve") displacy.serve
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Serve a dependency parse tree or named entity visualization to view it
|
||||
|
@ -71,6 +72,7 @@ p
|
|||
|
||||
+h(2, "render") displacy.render
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Render a dependency parse tree or named entity visualization.
|
||||
|
||||
|
|
|
@ -255,6 +255,7 @@ p
|
|||
|
||||
+h(2, "to_disk") Doc.to_disk
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Save the current state to a directory.
|
||||
|
||||
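p
    | A minimal usage sketch (the path is illustrative):

+aside-code("Example").
    doc.to_disk('/path/to/doc')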
|
@ -271,12 +272,14 @@ p Save the current state to a directory.
|
|||
|
||||
+h(2, "from_disk") Doc.from_disk
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Loads state from a directory. Modifies the object in place and returns it.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens import Doc
|
||||
doc = Doc().from_disk('/path/to/doc')
|
||||
from spacy.vocab import Vocab
|
||||
doc = Doc(Vocab()).from_disk('/path/to/doc')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
|
|
|
@ -8,6 +8,7 @@ p
|
|||
|
||||
+h(2, "init") GoldCorpus.__init__
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Create a #[code GoldCorpus].
|
||||
|
||||
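p
    | A minimal sketch of creating a corpus from training and development
    | data in spaCy's JSON format. The paths are illustrative, and the
    | two-argument form is an assumption based on the CLI's
    | #[code train_data]/#[code dev_data] pair:

+aside-code("Example").
    from spacy.gold import GoldCorpus
    corpus = GoldCorpus('/path/to/train.json', '/path/to/dev.json')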
|
|
|
@ -73,15 +73,26 @@ p
|
|||
+cell The text to be processed.
|
||||
|
||||
+row
|
||||
+cell #[code **disabled]
|
||||
+cell -
|
||||
+cell Elements of the pipeline that should not be run.
|
||||
+cell #[code disable]
|
||||
+cell list
|
||||
+cell
|
||||
| Names of pipeline components to
|
||||
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Doc]
|
||||
+cell A container for accessing the annotations.
|
||||
|
||||
+infobox("⚠️ Deprecation note")
|
||||
.o-block
|
||||
| Pipeline components to prevent from being loaded can now be added as
|
||||
| a list to #[code disable], instead of specifying one keyword argument
|
||||
| per component.
|
||||
|
||||
+code-new doc = nlp(u"I don't want parsed", disable=['parser'])
|
||||
+code-old doc = nlp(u"I don't want parsed", parse=False)
|
||||
|
||||
+h(2, "pipe") Language.pipe
|
||||
+tag method
|
||||
|
||||
|
@ -112,6 +123,13 @@ p
|
|||
+cell int
|
||||
+cell The number of texts to buffer.
|
||||
|
||||
+row
|
||||
+cell #[code disable]
|
||||
+cell list
|
||||
+cell
|
||||
| Names of pipeline components to
|
||||
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
|
||||
|
||||
+footrow
|
||||
+cell yields
|
||||
+cell #[code Doc]
|
||||
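p
    | A minimal sketch of streaming texts with one component disabled.
    | The texts and settings are illustrative:

+aside-code("Example").
    texts = [u'One document.', u'Another document.']
    for doc in nlp.pipe(texts, batch_size=50, n_threads=4, disable=['parser']):
        assert not doc.is_parsed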
|
@ -227,8 +245,11 @@ p
|
|||
|
||||
+h(2, "to_disk") Language.to_disk
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Save the current state to a directory.
|
||||
p
|
||||
| Save the current state to a directory. If a model is loaded, this will
|
||||
| #[strong include the model].
|
||||
|
||||
+aside-code("Example").
|
||||
nlp.to_disk('/path/to/models')
|
||||
|
@ -242,14 +263,21 @@ p Save the current state to a directory.
|
|||
| Paths may be either strings or #[code Path]-like objects.
|
||||
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being saved.
|
||||
+cell #[code disable]
|
||||
+cell list
|
||||
+cell
|
||||
| Names of pipeline components to
|
||||
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable]
|
||||
| and prevent from being saved.
|
||||
|
||||
+h(2, "from_disk") Language.from_disk
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Loads state from a directory. Modifies the object in place and returns it.
|
||||
p
|
||||
| Loads state from a directory. Modifies the object in place and returns
|
||||
| it. If the saved #[code Language] object contains a model, the
|
||||
| #[strong model will be loaded].
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.language import Language
|
||||
|
@ -264,15 +292,28 @@ p Loads state from a directory. Modifies the object in place and returns it.
|
|||
| #[code Path]-like objects.
|
||||
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being loaded.
|
||||
+cell #[code disable]
|
||||
+cell list
|
||||
+cell
|
||||
| Names of pipeline components to
|
||||
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Language]
|
||||
+cell The modified #[code Language] object.
|
||||
|
||||
+infobox("⚠️ Deprecation note")
|
||||
.o-block
|
||||
| As of spaCy v2.0, the #[code save_to_directory] method has been
|
||||
| renamed to #[code to_disk], to improve consistency across classes.
|
||||
| Pipeline components to prevent from being loaded can now be added as
|
||||
| a list to #[code disable], instead of specifying one keyword argument
|
||||
| per component.
|
||||
|
||||
+code-new nlp = English().from_disk('/model', disable=['tagger', 'ner'])
|
||||
+code-old nlp = spacy.load('en', tagger=False, entity=False)
|
||||
|
||||
+h(2, "to_bytes") Language.to_bytes
|
||||
+tag method
|
||||
|
||||
|
@ -283,9 +324,12 @@ p Serialize the current state to a binary string.
|
|||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being serialized.
|
||||
+cell #[code disable]
|
||||
+cell list
|
||||
+cell
|
||||
| Names of pipeline components to
|
||||
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable]
|
||||
| and prevent from being serialized.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
|
@ -310,15 +354,26 @@ p Load state from a binary string.
|
|||
+cell The data to load from.
|
||||
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being loaded.
|
||||
+cell #[code disable]
|
||||
+cell list
|
||||
+cell
|
||||
| Names of pipeline components to
|
||||
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Language]
|
||||
+cell The #[code Language] object.
|
||||
|
||||
+infobox("⚠️ Deprecation note")
|
||||
.o-block
|
||||
| Pipeline components to prevent from being loaded can now be added as
|
||||
| a list to #[code disable], instead of specifying one keyword argument
|
||||
| per component.
|
||||
|
||||
+code-new nlp = English().from_bytes(bytes, disable=['tagger', 'ner'])
|
||||
+code-old nlp = English().from_bytes(bytes, tagger=False, entity=False)
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
|
@ -327,6 +382,11 @@ p Load state from a binary string.
|
|||
+cell #[code Vocab]
|
||||
+cell A container for the lexical types.
|
||||
|
||||
+row
|
||||
+cell #[code tokenizer]
|
||||
+cell #[code Tokenizer]
|
||||
+cell The tokenizer.
|
||||
|
||||
+row
|
||||
+cell #[code make_doc]
|
||||
+cell #[code lambda text: Doc]
|
||||
|
|
|
@ -212,62 +212,74 @@ p The L2 norm of the lexeme's vector representation.
|
|||
+row
|
||||
+cell #[code is_alpha]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.isalpha()].
|
||||
+cell
|
||||
| Does the lexeme consist of alphabetic characters? Equivalent to
|
||||
| #[code lexeme.text.isalpha()].
|
||||
|
||||
+row
|
||||
+cell #[code is_ascii]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code [any(ord(c) >= 128 for c in word.orth_)]].
|
||||
+cell
|
||||
| Does the lexeme consist of ASCII characters? Equivalent to
|
||||
| #[code all(ord(c) < 128 for c in lexeme.text)].
|
||||
|
||||
+row
|
||||
+cell #[code is_digit]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.isdigit()].
|
||||
+cell
|
||||
| Does the lexeme consist of digits? Equivalent to
|
||||
| #[code lexeme.text.isdigit()].
|
||||
|
||||
+row
|
||||
+cell #[code is_lower]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.islower()].
|
||||
+cell
|
||||
| Is the lexeme in lowercase? Equivalent to
|
||||
| #[code lexeme.text.islower()].
|
||||
|
||||
+row
|
||||
+cell #[code is_title]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.istitle()].
|
||||
+cell
|
||||
| Is the lexeme in titlecase? Equivalent to
|
||||
| #[code lexeme.text.istitle()].
|
||||
|
||||
+row
|
||||
+cell #[code is_punct]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.ispunct()].
|
||||
+cell Is the lexeme punctuation?
|
||||
|
||||
+row
|
||||
+cell #[code is_space]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.isspace()].
|
||||
+cell
|
||||
| Does the lexeme consist of whitespace characters? Equivalent to
|
||||
| #[code lexeme.text.isspace()].
|
||||
|
||||
+row
|
||||
+cell #[code like_url]
|
||||
+cell bool
|
||||
+cell Does the word resemble a URL?
|
||||
+cell Does the lexeme resemble a URL?
|
||||
|
||||
+row
|
||||
+cell #[code like_num]
|
||||
+cell bool
|
||||
+cell Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc.
|
||||
+cell Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc.
|
||||
|
||||
+row
|
||||
+cell #[code like_email]
|
||||
+cell bool
|
||||
+cell Does the word resemble an email address?
|
||||
+cell Does the lexeme resemble an email address?
|
||||
|
||||
+row
|
||||
+cell #[code is_oov]
|
||||
+cell bool
|
||||
+cell Is the word out-of-vocabulary?
|
||||
+cell Is the lexeme out-of-vocabulary?
|
||||
|
||||
+row
|
||||
+cell #[code is_stop]
|
||||
+cell bool
|
||||
+cell Is the word part of a "stop list"?
|
||||
+cell Is the lexeme part of a "stop list"?
|
||||
|
||||
+row
|
||||
+cell #[code lang]
|
||||
|
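p
    | A quick sketch of checking a few of these flags on a lexeme. The
    | word is illustrative:

+aside-code("Example").
    apple = nlp.vocab[u'apple']
    assert apple.is_alpha and apple.is_lower
    assert not apple.is_digit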
|
|
@ -5,13 +5,14 @@ include ../../_includes/_mixins
|
|||
p Match sequences of tokens, based on pattern rules.
|
||||
|
||||
+infobox("⚠️ Deprecation note")
|
||||
| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
|
||||
| are deprecated and have been replaced with a simpler
|
||||
| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
|
||||
| patterns and a callback for a given match ID. #[code Matcher.get_entity]
|
||||
| is now called #[+api("matcher#get") #[code matcher.get]].
|
||||
| #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
|
||||
| and #[code Matcher.has_entity] (now redundant) have been removed.
|
||||
.o-block
|
||||
| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
|
||||
| are deprecated and have been replaced with a simpler
|
||||
| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
|
||||
| patterns and a callback for a given match ID. #[code Matcher.get_entity]
|
||||
| is now called #[+api("matcher#get") #[code matcher.get]].
|
||||
| #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
|
||||
| and #[code Matcher.has_entity] (now redundant) have been removed.
|
||||
|
||||
+h(2, "init") Matcher.__init__
|
||||
+tag method
|
||||
|
@ -56,17 +57,6 @@ p Find all token sequences matching the supplied patterns on the #[code Doc].
|
|||
doc = nlp(u'hello world!')
|
||||
matches = matcher(doc)
|
||||
|
||||
+infobox("Important note")
|
||||
| By default, the matcher #[strong does not perform any action] on matches,
|
||||
| like tagging matched phrases with entity types. Instead, actions need to
|
||||
| be specified when #[strong adding patterns or entities], by
|
||||
| passing in a callback function as the #[code on_match] argument on
|
||||
| #[+api("matcher#add") #[code add]]. This allows you to define custom
|
||||
| actions per pattern within the same matcher. For example, you might only
|
||||
| want to merge some entity types, and set custom flags for other matched
|
||||
| patterns. For more details and examples, see the usage workflow on
|
||||
| #[+a("/docs/usage/rule-based-matching") rule-based matching].
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code doc]
|
||||
|
@ -81,6 +71,17 @@ p Find all token sequences matching the supplied patterns on the #[code Doc].
|
|||
| matches. A match tuple describes a span #[code doc[start:end]].
|
||||
| The #[code match_id] is the ID of the added match pattern.
|
||||
|
||||
+infobox("Important note")
|
||||
| By default, the matcher #[strong does not perform any action] on matches,
|
||||
| like tagging matched phrases with entity types. Instead, actions need to
|
||||
| be specified when #[strong adding patterns or entities], by
|
||||
| passing in a callback function as the #[code on_match] argument on
|
||||
| #[+api("matcher#add") #[code add]]. This allows you to define custom
|
||||
| actions per pattern within the same matcher. For example, you might only
|
||||
| want to merge some entity types, and set custom flags for other matched
|
||||
| patterns. For more details and examples, see the usage workflow on
|
||||
| #[+a("/docs/usage/rule-based-matching") rule-based matching].
|
||||
|
||||
+h(2, "pipe") Matcher.pipe
|
||||
+tag method
|
||||
|
||||
|
@ -118,6 +119,7 @@ p Match a stream of documents, yielding them in turn.
|
|||
|
||||
+h(2, "len") Matcher.__len__
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Get the number of rules added to the matcher. Note that this only returns
|
||||
|
@ -138,6 +140,7 @@ p
|
|||
|
||||
+h(2, "contains") Matcher.__contains__
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Check whether the matcher contains rules for a match ID.
|
||||
|
||||
|
@ -159,6 +162,7 @@ p Check whether the matcher contains rules for a match ID.
|
|||
|
||||
+h(2, "add") Matcher.add
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Add a rule to the matcher, consisting of an ID key, one or more patterns, and
|
||||
|
@ -198,8 +202,23 @@ p
|
|||
| Match pattern. A pattern consists of a list of dicts, where each
|
||||
| dict describes a token.
|
||||
|
||||
+infobox("⚠️ Deprecation note")
|
||||
.o-block
|
||||
| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
|
||||
| are deprecated and have been replaced with a simpler
|
||||
| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
|
||||
| patterns and a callback for a given match ID.
|
||||
|
||||
+code-new.
|
||||
matcher.add('GoogleNow', merge_phrases, [{ORTH: 'Google'}, {ORTH: 'Now'}])
|
||||
|
||||
+code-old.
|
||||
matcher.add_entity('GoogleNow', on_match=merge_phrases)
|
||||
matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
|
||||
|
||||
+h(2, "remove") Matcher.remove
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Remove a rule from the matcher. A #[code KeyError] is raised if the match
|
||||
|
@ -219,6 +238,7 @@ p
|
|||
|
||||
+h(2, "get") Matcher.get
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Retrieve the pattern stored for a key. Returns the rule as an
|
||||
|
|
|
@ -20,12 +20,7 @@ p
|
|||
nlp = spacy.load('/path/to/en') # unicode path
|
||||
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
|
||||
|
||||
+infobox("⚠️ Deprecation note")
|
||||
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
|
||||
| will also raise an error if no model could be loaded and never just
|
||||
| return an empty #[code Language] object. If you need a blank language,
|
||||
| you need to import it explicitly (#[code from spacy.lang.en import English])
|
||||
| or use #[+api("util#get_lang_class") #[code util.get_lang_class]].
|
||||
nlp = spacy.load('en', disable=['parser', 'tagger'])
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
|
@ -34,15 +29,28 @@ p
|
|||
+cell Model to load, i.e. shortcut link, package name or path.
|
||||
|
||||
+row
|
||||
+cell #[code **overrides]
|
||||
+cell -
|
||||
+cell Override or disable components.
|
||||
+cell #[code disable]
|
||||
+cell list
|
||||
+cell
|
||||
| Names of pipeline components to
|
||||
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Language]
|
||||
+cell A #[code Language] object with the loaded model.
|
||||
|
||||
+infobox("⚠️ Deprecation note")
|
||||
.o-block
|
||||
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
|
||||
| will also raise an error if no model could be loaded and never just
|
||||
| return an empty #[code Language] object. If you need a blank language,
|
||||
| you need to import it explicitly (#[code from spacy.lang.en import English])
|
||||
| or use #[+api("util#get_lang_class") #[code util.get_lang_class]].
|
||||
|
||||
+code-new nlp = spacy.load('/model')
|
||||
+code-old nlp = spacy.load('en', path='/model')
|
||||
|
||||
+h(2, "info") spacy.info
|
||||
+tag function
|
||||
|
||||
|
@ -98,3 +106,37 @@ p
|
|||
+cell returns
|
||||
+cell unicode
|
||||
+cell The explanation, or #[code None] if not found in the glossary.
|
||||
|
||||
+h(2, "set_factory") spacy.set_factory
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Set a factory that returns a custom
|
||||
| #[+a("/docs/usage/language-processing-pipeline") processing pipeline]
|
||||
| component. Factories are useful for creating stateful components,
| especially ones which depend on shared data.
|
||||
|
||||
+aside-code("Example").
|
||||
def my_factory(vocab):
|
||||
def my_component(doc):
|
||||
return doc
|
||||
return my_component
|
||||
|
||||
spacy.set_factory('my_factory', my_factory)
|
||||
nlp = Language(pipeline=['my_factory'])
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code factory_id]
|
||||
+cell unicode
|
||||
+cell
|
||||
| Unique name of factory. If added to a new pipeline, spaCy will
|
||||
| look up the factory for this ID and use it to create the
|
||||
| component.
|
||||
|
||||
+row
|
||||
+cell #[code factory]
|
||||
+cell callable
|
||||
+cell
|
||||
| Callable that takes a #[code Vocab] object and returns a pipeline
|
||||
| component.
|
||||
|
|
|
@ -104,6 +104,7 @@ p
|
|||
|
||||
+h(2, "to_disk") StringStore.to_disk
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Save the current state to a directory.
|
||||
|
||||
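p
    | A minimal usage sketch (the path is illustrative):

+aside-code("Example").
    stringstore.to_disk('/path/to/strings')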
|
@ -118,8 +119,9 @@ p Save the current state to a directory.
|
|||
| A path to a directory, which will be created if it doesn't exist.
|
||||
| Paths may be either strings or #[code Path]-like objects.
|
||||
|
||||
+h(2, "from_disk") Tokenizer.from_disk
|
||||
+h(2, "from_disk") StringStore.from_disk
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Loads state from a directory. Modifies the object in place and returns it.
|
||||
|
||||
|
@ -137,10 +139,10 @@ p Loads state from a directory. Modifies the object in place and returns it.
|
|||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Tokenizer]
|
||||
+cell The modified #[code Tokenizer] object.
|
||||
+cell #[code StringStore]
|
||||
+cell The modified #[code StringStore] object.
|
||||
|
||||
+h(2, "to_bytes") Tokenizer.to_bytes
|
||||
+h(2, "to_bytes") StringStore.to_bytes
|
||||
+tag method
|
||||
|
||||
p Serialize the current state to a binary string.
|
||||
|
@ -157,9 +159,9 @@ p Serialize the current state to a binary string.
|
|||
+footrow
|
||||
+cell returns
|
||||
+cell bytes
|
||||
+cell The serialized form of the #[code Tokenizer] object.
|
||||
+cell The serialized form of the #[code StringStore] object.
|
||||
|
||||
+h(2, "from_bytes") Tokenizer.from_bytes
|
||||
+h(2, "from_bytes") StringStore.from_bytes
|
||||
+tag method
|
||||
|
||||
p Load state from a binary string.
|
||||
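p
    | A minimal round-trip sketch. The variable names are illustrative:

+aside-code("Example").
    from spacy.strings import StringStore
    store_bytes = stringstore.to_bytes()
    new_store = StringStore().from_bytes(store_bytes)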
|
|
|
@ -370,116 +370,131 @@ p The L2 norm of the token's vector representation.
|
|||
+cell #[code lemma]
|
||||
+cell int
|
||||
+cell
|
||||
| Base form of the word, with no inflectional suffixes.
|
||||
| Base form of the token, with no inflectional suffixes.
|
||||
|
||||
+row
|
||||
+cell #[code lemma_]
|
||||
+cell unicode
|
||||
+cell Base form of the word, with no inflectional suffixes.
|
||||
+cell Base form of the token, with no inflectional suffixes.
|
||||
|
||||
+row
|
||||
+cell #[code lower]
|
||||
+cell int
|
||||
+cell Lower-case form of the word.
|
||||
+cell Lower-case form of the token.
|
||||
|
||||
+row
|
||||
+cell #[code lower_]
|
||||
+cell unicode
|
||||
+cell Lower-case form of the word.
|
||||
+cell Lower-case form of the token.
|
||||
|
||||
+row
|
||||
+cell #[code shape]
|
||||
+cell int
|
||||
+cell Transform of the word's string, to show orthographic features.
|
||||
+cell
|
||||
| Transform of the token's string, to show orthographic features.
|
||||
| For example, "Xxxx" or "dd".
|
||||
|
||||
+row
|
||||
+cell #[code shape_]
|
||||
+cell unicode
|
||||
+cell A transform of the word's string, to show orthographic features.
|
||||
| Transform of the token's string, to show orthographic features.
|
||||
| For example, "Xxxx" or "dd".
|
||||
|
||||
+row
|
||||
+cell #[code prefix]
|
||||
+cell int
|
||||
+cell Integer ID of a length-N substring from the start of the
|
||||
| word. Defaults to #[code N=1].
|
||||
| token. Defaults to #[code N=1].
|
||||
|
||||
+row
|
||||
+cell #[code prefix_]
|
||||
+cell unicode
|
||||
+cell
|
||||
| A length-N substring from the start of the word. Defaults to
|
||||
| A length-N substring from the start of the token. Defaults to
|
||||
| #[code N=1].
|
||||
|
||||
+row
|
||||
+cell #[code suffix]
|
||||
+cell int
|
||||
+cell
|
||||
| Length-N substring from the end of the word. Defaults to #[code N=3].
|
||||
| Length-N substring from the end of the token. Defaults to #[code N=3].
|
||||
|
||||
+row
|
||||
+cell #[code suffix_]
|
||||
+cell unicode
|
||||
+cell Length-N substring from the end of the word. Defaults to #[code N=3].
|
||||
+cell Length-N substring from the end of the token. Defaults to #[code N=3].
|
||||
|
||||
+row
|
||||
+cell #[code is_alpha]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.isalpha()].
|
||||
+cell
|
||||
| Does the token consist of alphabetic characters? Equivalent to
|
||||
| #[code token.text.isalpha()].
|
||||
|
||||
+row
|
||||
+cell #[code is_ascii]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code [any(ord(c) >= 128 for c in word.orth_)]].
|
||||
+cell
|
||||
| Does the token consist of ASCII characters? Equivalent to
|
||||
| #[code all(ord(c) < 128 for c in token.text)].
|
||||
|
||||
+row
|
||||
+cell #[code is_digit]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.isdigit()].
|
||||
+cell
|
||||
| Does the token consist of digits? Equivalent to
|
||||
| #[code token.text.isdigit()].
|
||||
|
||||
+row
|
||||
+cell #[code is_lower]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.islower()].
|
||||
+cell
|
||||
| Is the token in lowercase? Equivalent to
|
||||
| #[code token.text.islower()].
|
||||
|
||||
+row
|
||||
+cell #[code is_title]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.istitle()].
|
||||
+cell
|
||||
| Is the token in titlecase? Equivalent to
|
||||
| #[code token.text.istitle()].
|
||||
|
||||
+row
|
||||
+cell #[code is_punct]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.ispunct()].
|
||||
+cell Is the token punctuation?
|
||||
|
||||
+row
|
||||
+cell #[code is_space]
|
||||
+cell bool
|
||||
+cell Equivalent to #[code word.orth_.isspace()].
|
||||
+cell
|
||||
| Does the token consist of whitespace characters? Equivalent to
|
||||
| #[code token.text.isspace()].
|
||||
|
||||
+row
|
||||
+cell #[code like_url]
|
||||
+cell bool
|
||||
+cell Does the word resemble a URL?
|
||||
+cell Does the token resemble a URL?
|
||||
|
||||
+row
|
||||
+cell #[code like_num]
|
||||
+cell bool
|
||||
+cell Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc.
|
||||
+cell Does the token represent a number? e.g. "10.9", "10", "ten", etc.
|
||||
|
||||
+row
|
||||
+cell #[code like_email]
|
||||
+cell bool
|
||||
+cell Does the word resemble an email address?
|
||||
+cell Does the token resemble an email address?
|
||||
|
||||
+row
|
||||
+cell #[code is_oov]
|
||||
+cell bool
|
||||
+cell Is the word out-of-vocabulary?
|
||||
+cell Is the token out-of-vocabulary?
|
||||
|
||||
+row
|
||||
+cell #[code is_stop]
|
||||
+cell bool
|
||||
+cell Is the word part of a "stop list"?
|
||||
+cell Is the token part of a "stop list"?
|
||||
|
||||
+row
|
||||
+cell #[code pos]
|
||||
|
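p
    | A short sketch of a few of these attributes in action. The example
    | text is illustrative:

+aside-code("Example").
    doc = nlp(u'Give it back')
    give = doc[0]
    assert give.prefix_ == u'G'
    assert give.suffix_ == u'ive'
    assert give.is_title and give.is_alpha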
|
|
@ -198,91 +198,6 @@ p
|
|||
| attributes. The #[code ORTH] fields of the attributes must
|
||||
| exactly match the string when they are concatenated.
|
||||
|
||||
+h(2, "to_disk") Tokenizer.to_disk
|
||||
+tag method
|
||||
|
||||
p Save the current state to a directory.
|
||||
|
||||
+aside-code("Example").
|
||||
tokenizer.to_disk('/path/to/tokenizer')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell unicode or #[code Path]
|
||||
+cell
|
||||
| A path to a directory, which will be created if it doesn't exist.
|
||||
| Paths may be either strings or #[code Path]-like objects.
|
||||
|
||||
+h(2, "from_disk") Tokenizer.from_disk
|
||||
+tag method
|
||||
|
||||
p Loads state from a directory. Modifies the object in place and returns it.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokenizer import Tokenizer
|
||||
tokenizer = Tokenizer(nlp.vocab)
|
||||
tokenizer = tokenizer.from_disk('/path/to/tokenizer')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell unicode or #[code Path]
|
||||
+cell
|
||||
| A path to a directory. Paths may be either strings or
|
||||
| #[code Path]-like objects.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Tokenizer]
|
||||
+cell The modified #[code Tokenizer] object.
|
||||
|
||||
+h(2, "to_bytes") Tokenizer.to_bytes
|
||||
+tag method
|
||||
|
||||
p Serialize the current state to a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
tokenizer_bytes = tokenizer.to_bytes()
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being serialized.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bytes
|
||||
+cell The serialized form of the #[code Tokenizer] object.
|
||||
|
||||
+h(2, "from_bytes") Tokenizer.from_bytes
|
||||
+tag method
|
||||
|
||||
p Load state from a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokenizer import Tokenizer
|
||||
tokenizer_bytes = tokenizer.to_bytes()
|
||||
new_tokenizer = Tokenizer(nlp.vocab)
|
||||
new_tokenizer.from_bytes(tokenizer_bytes)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code bytes_data]
|
||||
+cell bytes
|
||||
+cell The data to load from.
|
||||
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being loaded.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Tokenizer]
|
||||
+cell The #[code Tokenizer] object.
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
|
|
|
@ -76,6 +76,7 @@ p
|
|||
|
||||
+h(2, "resolve_model_path") util.resolve_model_path
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
||||
p Resolve a model name or string to a model path.
|
||||
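p
    | For instance, assuming #[code 'en'] is an installed shortcut link
    | or package, as described above:

+aside-code("Example").
    from spacy import util
    model_path = util.resolve_model_path('en')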
|
||||
|
@ -169,6 +170,7 @@ p
|
|||
|
||||
+h(2, "is_in_jupyter") util.is_in_jupyter
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Check if user is running spaCy from a #[+a("https://jupyter.org") Jupyter]
|
||||
|
@ -221,6 +223,7 @@ p
|
|||
|
||||
+h(2, "prints") util.prints
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Print a formatted, text-wrapped message with optional title. If a text
|
||||
|
|
|
@ -159,6 +159,7 @@ p
|
|||
|
||||
+h(2, "to_disk") Vocab.to_disk
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Save the current state to a directory.
|
||||
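p
    | A minimal usage sketch (the path is illustrative):

+aside-code("Example").
    nlp.vocab.to_disk('/path/to/vocab')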
|
||||
|
@ -175,6 +176,7 @@ p Save the current state to a directory.
|
|||
|
||||
+h(2, "from_disk") Vocab.from_disk
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Loads state from a directory. Modifies the object in place and returns it.
|
||||
|
||||
|
|
|
@ -80,7 +80,7 @@
|
|||
},
|
||||
|
||||
"customizing-tokenizer": {
|
||||
"title": "Customizing the tokenizer",
|
||||
"title": "Customising the tokenizer",
|
||||
"next": "rule-based-matching"
|
||||
},
|
||||
|
||||
|
|
|
@ -48,3 +48,13 @@ p
|
|||
+cell ner
|
||||
+cell #[+api("entityrecognizer") #[code EntityRecognizer]]
|
||||
+cell #[code Doc.ents], #[code Doc[i].ent_iob], #[code Doc[i].ent_type]
|
||||
|
||||
p
|
||||
| The processing pipeline always #[strong depends on the statistical model]
|
||||
| and its capabilities. For example, a pipeline can only include an entity
|
||||
| recognizer component if the model includes data to make predictions of
|
||||
| entity labels. This is why each model will specify the pipeline to use
|
||||
| in its meta data, as a simple list containing the component names:
|
||||
|
||||
+code(false, "json").
|
||||
"pipeline": ["vectorizer", "tagger", "parser", "ner"]
|
||||
|
|
|
@ -22,10 +22,10 @@ p
|
|||
| untrusted sources.
|
||||
|
||||
p
|
||||
| All container classes and pipeline components, i.e.
|
||||
for cls in ["Doc", "Language", "Tokenizer", "Tagger", "DependencyParser", "EntityRecognizer", "Vocab", "StringStore"]
|
||||
| #[+api(cls.toLowerCase()) #[code=cls]],
|
||||
| have the following methods available:
|
||||
| All container classes, i.e. #[+api("language") #[code Language]],
|
||||
| #[+api("doc") #[code Doc]], #[+api("vocab") #[code Vocab]] and
|
||||
| #[+api("stringstore") #[code StringStore]] have the following methods
|
||||
| available:
|
||||
|
||||
+table(["Method", "Returns", "Example"])
|
||||
- style = [1, 0, 1]
|
||||
|
@ -34,7 +34,35 @@ p
|
|||
+annotation-row(["to_disk", "-", "nlp.to_disk('/path')"], style)
|
||||
+annotation-row(["from_disk", "object", "nlp.from_disk('/path')"], style)
|
||||
|
||||
p
|
||||
| For example, if you've processed a very large document, you can use
|
||||
| #[+api("doc#to_disk") #[code Doc.to_disk]] to save it to a file on your
|
||||
| local machine. This will save the document and its tokens, as well as
|
||||
| the vocabulary associated with the #[code Doc].
|
||||
|
||||
+aside("Why saving the vocab?")
|
||||
| Saving the vocabulary with the #[code Doc] is important, because the
|
||||
| #[code Vocab] holds the context-independent information about the words,
|
||||
| tags and labels, and their #[strong integer IDs]. If the #[code Vocab]
|
||||
| wasn't saved with the #[code Doc], spaCy wouldn't know how to resolve
|
||||
| those IDs – for example, the word text or the dependency labels. You
|
||||
| might be saving #[code 446] for "whale", but in a different vocabulary,
|
||||
| this ID could map to "VERB". Similarly, if your document was processed by
|
||||
| a German model, its vocab will include the specific
|
||||
| #[+a("/docs/api/annotation#dependency-parsing-german") German dependency labels].
|
||||
|
||||
+code.
|
||||
moby_dick = open('moby_dick.txt', 'r').read() # read a large document
|
||||
doc = nlp(moby_dick) # process it
|
||||
doc.to_disk('/moby_dick.bin') # save the processed Doc
|
||||
|
||||
p
|
||||
| If you need it again later, you can load it back into an empty #[code Doc]
|
||||
| with an empty #[code Vocab] by calling
|
||||
| #[+api("doc#from_disk") #[code from_disk()]]:
|
||||
|
||||
+code.
|
||||
from spacy.tokens import Doc # to create empty Doc
|
||||
from spacy.vocab import Vocab # to create empty Vocab
|
||||
|
||||
doc = Doc(Vocab()).from_disk('/moby_dick.bin') # load processed Doc
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
//- 💫 DOCS > USAGE > SPACY 101 > TRAINING
|
||||
|
||||
p
|
||||
+under-construction
|
||||
|
|
92
website/docs/usage/_spacy-101/_vocab-stringstore.jade
Normal file
|
@ -0,0 +1,92 @@
|
|||
//- 💫 DOCS > USAGE > SPACY 101 > VOCAB & STRINGSTORE
|
||||
|
||||
p
|
||||
| Whenever possible, spaCy tries to store data in a vocabulary, the
|
||||
| #[+api("vocab") #[code Vocab]], that will be
|
||||
| #[strong shared by multiple documents]. To save memory, spaCy also
|
||||
| encodes all strings to #[strong integer IDs] – in this case for example,
|
||||
| "coffee" has the ID #[code 3672]. Entity labels like "ORG" and
|
||||
| part-of-speech tags like "VERB" are also encoded. Internally, spaCy
|
||||
| only "speaks" in integer IDs.
|
||||
|
||||
+aside
|
||||
| #[strong Token]: A word, punctuation mark etc. #[em in context], including
|
||||
| its attributes, tags and dependencies.#[br]
|
||||
| #[strong Lexeme]: A "word type" with no context. Includes the word shape
|
||||
| and flags, e.g. if it's lowercase, a digit or punctuation.#[br]
|
||||
| #[strong Doc]: A processed container of tokens in context.#[br]
|
||||
| #[strong Vocab]: The collection of lexemes.#[br]
|
||||
| #[strong StringStore]: The dictionary mapping integer IDs to strings, for
|
||||
| example #[code 3572] → "coffee".
|
||||
|
||||
+image
|
||||
include ../../../assets/img/docs/vocab_stringstore.svg
|
||||
.u-text-right
|
||||
+button("/assets/img/docs/vocab_stringstore.svg", false, "secondary").u-text-tag View large graphic
|
||||
|
||||
p
|
||||
| If you process lots of documents containing the word "coffee" in all
|
||||
| kinds of different contexts, storing the exact string "coffee" every time
|
||||
| would take up way too much space. So instead, spaCy assigns it an ID
|
||||
| and stores it in the #[+api("stringstore") #[code StringStore]]. You can
|
||||
| think of the #[code StringStore] as a
|
||||
| #[strong lookup table that works in both directions] – you can look up a
|
||||
| string to get its ID, or an ID to get its string:
|
||||
|
||||
+code.
|
||||
doc = nlp(u'I like coffee')
|
||||
assert doc.vocab.strings[u'coffee'] == 3572
|
||||
assert doc.vocab.strings[3572] == u'coffee'
|
||||
|
||||
p
|
||||
| Now that all strings are encoded, the entries in the vocabulary
|
||||
| #[strong don't need to include the word text] themselves. Instead,
|
||||
| they can look it up in the #[code StringStore] via its integer ID. Each
|
||||
| entry in the vocabulary, also called #[+api("lexeme") #[code Lexeme]],
|
||||
| contains the #[strong context-independent] information about a word.
|
||||
| For example, no matter if "love" is used as a verb or a noun in some
|
||||
| context, its spelling and whether it consists of alphabetic characters
|
||||
| won't ever change.
|
||||
|
||||
+code.
|
||||
for word in doc:
|
||||
lexeme = doc.vocab[word.text]
|
||||
print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
|
||||
lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_)
|
||||
|
||||
+aside
|
||||
| #[strong Text]: The original text of the lexeme.#[br]
|
||||
| #[strong Orth]: The integer ID of the lexeme.#[br]
|
||||
| #[strong Shape]: The abstract word shape of the lexeme.#[br]
|
||||
| #[strong Prefix]: By default, the first letter of the word string.#[br]
|
||||
| #[strong Suffix]: By default, the last three letters of the word string.#[br]
|
||||
| #[strong is alpha]: Does the lexeme consist of alphabetic characters?#[br]
|
||||
| #[strong is digit]: Does the lexeme consist of digits?#[br]
|
||||
| #[strong is title]: Is the lexeme in titlecase?#[br]
|
||||
| #[strong Lang]: The language of the parent vocabulary.
|
||||
|
||||
+table(["text", "orth", "shape", "prefix", "suffix", "is_alpha", "is_digit", "is_title", "lang"])
|
||||
- var style = [0, 1, 1, 0, 0, 1, 1, 1, 0]
|
||||
+annotation-row(["I", 508, "X", "I", "I", true, false, true, "en"], style)
|
||||
+annotation-row(["love", 949, "xxxx", "l", "ove", true, false, false, "en"], style)
|
||||
+annotation-row(["coffee", 3572, "xxxx", "c", "ffe", true, false, false, "en"], style)
|
||||
|
||||
p
|
||||
| The specific entries in the vocabulary and their IDs don't really matter –
|
||||
| #[strong as long as they match]. That's why you always need to make sure
|
||||
| all objects you create have access to the same vocabulary. If they don't,
|
||||
| the IDs won't match and spaCy will either produce very confusing results,
|
||||
| or fail altogether.
|
||||
|
||||
+code.
|
||||
from spacy.tokens import Doc
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
doc = nlp(u'I like coffee') # original Doc
|
||||
new_doc = Doc(Vocab(), words=['I', 'like', 'coffee']) # new Doc with empty Vocab
|
||||
assert doc.vocab.strings[u'coffee'] == 3572 # ID in vocab of Doc
|
||||
assert new_doc.vocab.strings[u'coffee'] == 446 # ID in vocab of new Doc
|
||||
|
||||
p
|
||||
| Even though both #[code Doc] objects contain the same words, the internal
|
||||
| integer IDs are very different.
|
|
@ -107,7 +107,6 @@ p
|
|||
.u-text-right
|
||||
+button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic
|
||||
|
||||
|
||||
+table(["File name", "Variables", "Description"])
|
||||
+row
|
||||
+cell #[+src(gh("spacy-dev-resources", "templates/new_language/stop_words.py")) stop_words.py]
|
||||
|
@ -439,7 +438,7 @@ p
|
|||
|
||||
+h(3, "morph-rules") Morph rules
|
||||
|
||||
//- TODO: write morph rules section
|
||||
+under-construction
|
||||
|
||||
+h(2, "testing") Testing the new language tokenizer
|
||||
|
||||
|
@ -631,7 +630,7 @@ p
|
|||
| trains the model using #[+a("https://radimrehurek.com/gensim/") Gensim].
|
||||
| The #[code vectors.bin] file should consist of one word and vector per line.
|
||||
|
||||
+aside-code("your_data_directory", "yaml").
|
||||
//-+aside-code("your_data_directory", "yaml").
|
||||
├── vocab/
|
||||
| ├── lexemes.bin
|
||||
| ├── strings.json
|
||||
|
@ -662,4 +661,4 @@ p
|
|||
| model use the using spaCy's #[+api("cli#train") #[code train]] command:
|
||||
|
||||
+code(false, "bash").
|
||||
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner]
|
||||
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]
|
||||
|
|
|
@ -17,6 +17,8 @@ p
|
|||
| #[+a("http://deeplearning.net/software/theano/") Theano] is also
|
||||
| supported.
|
||||
|
||||
+under-construction
|
||||
|
||||
+code("Runtime usage").
|
||||
def count_entity_sentiment(nlp, texts):
|
||||
'''Compute the net document sentiment for each entity in the texts.'''
|
||||
|
@ -153,7 +155,9 @@ p
|
|||
| adding another LSTM layer, using attention mechanism, using character
|
||||
| features, etc.
|
||||
|
||||
+h(2, "attribute-hooks") Attribute hooks (experimental)
|
||||
+h(2, "attribute-hooks") Attribute hooks
|
||||
|
||||
+under-construction
|
||||
|
||||
p
|
||||
| Earlier, we saw how to store data in the new generic #[code user_data]
|
||||
|
|
|
@ -322,8 +322,9 @@ p
|
|||
| If you don't need a particular component of the pipeline – for
|
||||
| example, the tagger or the parser, you can disable loading it. This can
|
||||
| sometimes make a big difference and improve loading speed. Disabled
|
||||
| component names can be provided to #[code spacy.load], #[code from_disk]
|
||||
| or the #[code nlp] object itself as a list:
|
||||
| component names can be provided to #[+api("spacy#load") #[code spacy.load]],
|
||||
| #[+api("language#from_disk") #[code Language.from_disk]] or the
|
||||
| #[code nlp] object itself as a list:
|
||||
|
||||
+code.
|
||||
nlp = spacy.load('en', disable=['parser', 'tagger'])
|
||||
|
|
|
@ -35,7 +35,7 @@ p
|
|||
assert doc[0].text == u'Peach'
|
||||
assert doc[1].text == u'emoji'
|
||||
assert doc[-1].text == u'🍑'
|
||||
assert doc[17:19] == u'outranking eggplant'
|
||||
assert doc[17:19].text == u'outranking eggplant'
|
||||
assert list(doc.noun_chunks)[0].text == u'Peach emoji'
|
||||
|
||||
sentences = list(doc.sents)
|
||||
|
|
|
@ -2,16 +2,18 @@
|
|||
|
||||
include ../../_includes/_mixins
|
||||
|
||||
+under-construction
|
||||
|
||||
+h(2, "multithreading") Multi-threading with #[code .pipe()]
|
||||
|
||||
p
|
||||
| If you have a sequence of documents to process, you should use the
|
||||
| #[+api("language#pipe") #[code .pipe()]] method. The method takes an
|
||||
| iterator of texts, and accumulates an internal buffer,
|
||||
| #[+api("language#pipe") #[code Language.pipe()]] method. The method takes
|
||||
| an iterator of texts, and accumulates an internal buffer,
|
||||
| which it works on in parallel. It then yields the documents in order,
|
||||
| one-by-one. After a long and bitter struggle, the global interpreter
|
||||
| lock was freed around spaCy's main parsing loop in v0.100.3. This means
|
||||
| that the #[code .pipe()] method will be significantly faster in most
|
||||
| that #[code .pipe()] will be significantly faster in most
|
||||
| practical situations, because it allows shared memory parallelism.
|
||||
|
||||
+code.
|
||||
|
@ -20,23 +22,27 @@ p
|
|||
|
||||
p
|
||||
| To make full use of the #[code .pipe()] function, you might want to
|
||||
| brush up on Python generators. Here are a few quick hints:
|
||||
| brush up on #[strong Python generators]. Here are a few quick hints:
|
||||
|
||||
+list
|
||||
+item
|
||||
| Generator comprehensions can be written
|
||||
| (#[code item for item in sequence])
|
||||
| Generator comprehensions can be written as
|
||||
| #[code (item for item in sequence)].
|
||||
|
||||
+item
|
||||
| The #[code itertools] built-in library and the #[code cytoolz]
|
||||
| package provide a lot of handy generator tools
|
||||
| The
|
||||
| #[+a("https://docs.python.org/2/library/itertools.html") #[code itertools] built-in library]
|
||||
| and the
|
||||
| #[+a("https://github.com/pytoolz/cytoolz") #[code cytoolz] package]
|
||||
| provide a lot of handy #[strong generator tools].
|
||||
|
||||
+item
|
||||
| Often you'll have an input stream that pairs text with some
|
||||
| important metadata, e.g. a JSON document. To pair up the metadata
|
||||
| with the processed #[code Doc] object, you should use the tee
|
||||
| function to split the generator in two, and then #[code izip] the
|
||||
| extra stream to the document stream.
|
||||
| important meta data, e.g. a JSON document. To
|
||||
| #[strong pair up the meta data] with the processed #[code Doc]
|
||||
| object, you should use the #[code itertools.tee] function to split
|
||||
| the generator in two, and then #[code izip] the extra stream to the
|
||||
| document stream, as shown in the sketch below.
|
||||
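p
    | A minimal sketch of that pattern, assuming Python 3, where
    | #[code zip] is lazy. On Python 2, use #[code itertools.izip]
    | instead:

+code.
    import itertools

    data = [(u'First text', {'id': 1}), (u'Second text', {'id': 2})]
    stream1, stream2 = itertools.tee(data)       # split the stream in two
    texts = (text for text, meta in stream1)     # raw texts only
    metadata = (meta for text, meta in stream2)  # meta data only
    for doc, meta in zip(nlp.pipe(texts), metadata):
        print(doc.text, meta['id'])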
|
||||
+h(2, "own-annotations") Bringing your own annotations
|
||||
|
||||
|
|
|
@ -4,6 +4,8 @@ include ../../_includes/_mixins
|
|||
|
||||
+h(2, "features") Features
|
||||
|
||||
+under-construction
|
||||
|
||||
+aside
|
||||
| If one of spaCy's functionalities #[strong needs a model], it means that
|
||||
| you need to have one of the available
|
||||
|
@ -91,17 +93,35 @@ p
|
|||
|
||||
include _spacy-101/_tokenization
|
||||
|
||||
+infobox
|
||||
| To learn more about how spaCy's tokenizer and its rules work in detail,
|
||||
| how to #[strong customise] it and how to #[strong add your own tokenizer]
|
||||
| to a processing pipeline, see the usage guide on
|
||||
| #[+a("/docs/usage/customizing-tokenizer") customising the tokenizer].
|
||||
|
||||
+h(3, "annotations-pos-deps") Part-of-speech tags and dependencies
|
||||
+tag-model("dependency parse")
|
||||
|
||||
include _spacy-101/_pos-deps
|
||||
|
||||
+infobox
|
||||
| To learn more about #[strong part-of-speech tagging] and rule-based
|
||||
| morphology, and how to #[strong navigate and use the parse tree]
|
||||
| effectively, see the usage guides on
|
||||
| #[+a("/docs/usage/pos-tagging") part-of-speech tagging] and
|
||||
| #[+a("/docs/usage/dependency-parse") using the dependency parse].
|
||||
|
||||
+h(3, "annotations-ner") Named Entities
|
||||
+tag-model("named entities")
|
||||
|
||||
include _spacy-101/_named-entities
|
||||
|
||||
+infobox
|
||||
| To learn more about entity recognition in spaCy, how to
|
||||
| #[strong add your own entities] to a document and how to train and update
|
||||
| the entity predictions of a model, see the usage guide on
|
||||
| #[+a("/docs/usage/entity-recognition") named entity recognition].
|
||||
|
||||
+h(2, "vectors-similarity") Word vectors and similarity
|
||||
+tag-model("vectors")
|
||||
|
||||
|
@ -109,20 +129,43 @@ include _spacy-101/_similarity
|
|||
|
||||
include _spacy-101/_word-vectors
|
||||
|
||||
+infobox
|
||||
| To learn more about word vectors, how to #[strong customise them] and
|
||||
| how to load #[strong your own vectors] into spaCy, see the usage
|
||||
| guide on
|
||||
| #[+a("/docs/usage/word-vectors-similarities") using word vectors and semantic similarities].
|
||||
|
||||
+h(2, "pipelines") Pipelines
|
||||
|
||||
include _spacy-101/_pipelines
|
||||
|
||||
+infobox
|
||||
| To learn more about #[strong how processing pipelines work] in detail,
|
||||
| how to enable and disable their components, and how to
|
||||
| #[strong create your own], see the usage guide on
|
||||
| #[+a("/docs/usage/language-processing-pipeline") language processing pipelines].
|
||||
|
||||
+h(2, "vocab-stringstore") Vocab, lexemes and the string store
|
||||
|
||||
include _spacy-101/_vocab-stringstore
|
||||
|
||||
+h(2, "serialization") Serialization
|
||||
|
||||
include _spacy-101/_serialization
|
||||
|
||||
+infobox
|
||||
| To learn more about #[strong serialization] and how to
|
||||
| #[strong save and load your own models], see the usage guide on
|
||||
| #[+a("/docs/usage/saving-loading") saving, loading and data serialization].
|
||||
|
||||
+h(2, "training") Training
|
||||
|
||||
include _spacy-101/_training
|
||||
|
||||
+h(2, "architecture") Architecture
|
||||
|
||||
+under-construction
|
||||
|
||||
+image
|
||||
include ../../assets/img/docs/architecture.svg
|
||||
.u-text-right
|
||||
|
|
|
@ -64,44 +64,10 @@ p
|
|||
| predicts the new category with minimal difference from the previous
|
||||
| output.
|
||||
|
||||
+h(2, "saving-loading") Saving and loading
|
||||
|
||||
p
|
||||
| After training our model, you'll usually want to save its state, and load
|
||||
| it back later. You can do this with the #[code Language.save_to_directory()]
|
||||
| method:
|
||||
|
||||
+code.
|
||||
nlp.save_to_directory('/home/me/data/en_technology')
|
||||
|
||||
p
|
||||
| To make the model more convenient to deploy, we recommend wrapping it as
|
||||
| a Python package, so that you can install it via pip and load it as a
|
||||
| module. spaCy comes with a handy #[+api("cli#package") #[code package]]
|
||||
| CLI command to create all required files and directories.
|
||||
|
||||
+code(false, "bash").
|
||||
python -m spacy package /home/me/data/en_technology /home/me/my_models
|
||||
|
||||
p
|
||||
| To build the package and create a #[code .tar.gz] archive, run
|
||||
| #[code python setup.py sdist] from within its directory.
|
||||
|
||||
+infobox("Saving and loading models")
|
||||
| For more information and a detailed guide on how to package your model,
|
||||
| see the documentation on
|
||||
| #[+a("/docs/usage/saving-loading") saving and loading models].
|
||||
|
||||
p
|
||||
| After you've generated and installed the package, you'll be able to
|
||||
| load the model as follows:
|
||||
|
||||
+code.
|
||||
import en_technology
|
||||
nlp = en_technology.load()
|
||||
|
||||
+h(2, "example") Example: Adding and training an #[code ANIMAL] entity
|
||||
|
||||
+under-construction
|
||||
|
||||
p
|
||||
| This script shows how to add a new entity type to an existing pre-trained
|
||||
| NER model. To keep the example short and simple, only four sentences are
|
||||
|
@ -170,5 +136,33 @@ p
|
|||
|
||||
p
|
||||
| After training your model, you can
|
||||
| #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend wrapping
|
||||
| models as Python packages, for ease of deployment.
|
||||
| #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend
|
||||
| wrapping models as Python packages, for ease of deployment.
|
||||
|
||||
+h(2, "saving-loading") Saving and loading
|
||||
|
||||
p
|
||||
| After training our model, you'll usually want to save its state, and load
|
||||
| it back later. You can do this with the
|
||||
| #[+api("language#to_disk") #[code Language.to_disk()]] method:
|
||||
|
||||
+code.
|
||||
nlp.to_disk('/home/me/data/en_technology')
|
||||
|
||||
p
|
||||
| To make the model more convenient to deploy, we recommend wrapping it as
|
||||
| a Python package, so that you can install it via pip and load it as a
|
||||
| module. spaCy comes with a handy #[+api("cli#package") #[code package]]
|
||||
| CLI command to create all required files and directories.
|
||||
|
||||
+code(false, "bash").
|
||||
python -m spacy package /home/me/data/en_technology /home/me/my_models
|
||||
|
||||
p
|
||||
| To build the package and create a #[code .tar.gz] archive, run
|
||||
| #[code python setup.py sdist] from within its directory.
|
||||
|
||||
+infobox("Saving and loading models")
|
||||
| For more information and a detailed guide on how to package your model,
|
||||
| see the documentation on
|
||||
| #[+a("/docs/usage/saving-loading#models") saving and loading models].
|
||||
|
|
|
@ -81,59 +81,3 @@ p.o-inline-list
|
|||
|
||||
p
|
||||
+button(gh("spaCy", "examples/training/train_parser.py"), false, "secondary") Full example
|
||||
|
||||
+h(2, "feature-templates") Customizing the feature extraction
|
||||
|
||||
p
|
||||
| spaCy currently uses linear models for the tagger, parser and entity
|
||||
| recognizer, with weights learned using the
|
||||
| #[+a("https://explosion.ai/blog/part-of-speech-pos-tagger-in-python") Averaged Perceptron algorithm].
|
||||
|
||||
+aside("Linear Model Feature Scheme")
|
||||
| For a list of the available feature atoms, see the #[+a("/docs/api/features") Linear Model Feature Scheme].
|
||||
|
||||
p
|
||||
| Because it's a linear model, it's important for accuracy to build
|
||||
| conjunction features out of the atomic predictors. Let's say you have
|
||||
| two atomic predictors asking, "What is the part-of-speech of the
|
||||
| previous token?", and "What is the part-of-speech of the previous
|
||||
| previous token?". These predictors will introduce a number of features,
|
||||
| e.g. #[code Prev-pos=NN], #[code Prev-pos=VBZ], etc. A conjunction
|
||||
| template introduces features such as #[code Prev-pos=NN&Prev-prev-pos=VBZ].
|
||||
|
||||
p
|
||||
| The feature extraction proceeds in two passes. In the first pass, we
|
||||
| fill an array with the values of all of the atomic predictors. In the
|
||||
| second pass, we iterate over the feature templates, and fill a small
|
||||
| temporary array with the predictors that will be combined into a
|
||||
| conjunction feature. Finally, we hash this array into a 64-bit integer,
|
||||
| using the MurmurHash algorithm. You can see this at work in the
|
||||
| #[+a(gh("thinc", "thinc/linear/features.pyx", "94dbe06fd3c8f24d86ab0f5c7984e52dbfcdc6cb")) #[code thinc.linear.features]] module.
|
||||
|
||||
p
|
||||
| It's very easy to change the feature templates, to create novel
|
||||
| combinations of the existing atomic predictors. There's currently no API
|
||||
| available to add new atomic predictors, though. You'll have to create a
|
||||
| subclass of the model, and write your own #[code set_featuresC] method.
|
||||
|
||||
p
|
||||
| The feature templates are passed in using the #[code features] keyword
|
||||
| argument to the constructors of the #[+api("tagger") #[code Tagger]],
|
||||
| #[+api("dependencyparser") #[code DependencyParser]] and
|
||||
| #[+api("entityrecognizer") #[code EntityRecognizer]]:
|
||||
|
||||
+code.
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.pipeline import Tagger
|
||||
from spacy.tagger import P2_orth, P1_orth
|
||||
from spacy.tagger import P2_cluster, P1_cluster, W_orth, N1_orth, N2_orth
|
||||
|
||||
vocab = Vocab(tag_map={'N': {'pos': 'NOUN'}, 'V': {'pos': 'VERB'}})
|
||||
tagger = Tagger(vocab, features=[(P2_orth, P2_cluster), (P1_orth, P1_cluster),
|
||||
(P2_orth,), (P1_orth,), (W_orth,),
|
||||
(N1_orth,), (N2_orth,)])
|
||||
|
||||
p
|
||||
| Custom feature templates can be passed to the #[code DependencyParser]
|
||||
| and #[code EntityRecognizer] as well, also using the #[code features]
|
||||
| keyword argument of the constructor.
|
||||
|
|
|
@ -50,9 +50,10 @@ p
|
|||
|
||||
p
|
||||
| spaCy's serialization API has been made consistent across classes and
|
||||
| objects. All container classes and pipeline components now have a
|
||||
| #[code to_bytes()], #[code from_bytes()], #[code to_disk()] and
|
||||
| #[code from_disk()] method that supports the Pickle protocol.
|
||||
| objects. All container classes, i.e. #[code Language], #[code Doc],
|
||||
| #[code Vocab] and #[code StringStore] now have a #[code to_bytes()],
|
||||
| #[code from_bytes()], #[code to_disk()] and #[code from_disk()] method
|
||||
| that supports the Pickle protocol.
|
||||
|
||||
p
|
||||
| The improved #[code spacy.load] makes loading models easier and more
|
||||
|
|
|
@ -334,7 +334,7 @@ p
|
|||
| token #[code <script src="malicious-code.js"><script>].
|
||||
| Instead of relying on the server to render and sanitize HTML, you
|
||||
| can do this on the client in JavaScript. displaCy.js creates
|
||||
| the SVG markup as DOM nodes and will never insert raw HTML.
|
||||
| the markup as DOM nodes and will never insert raw HTML.
|
||||
|
||||
p
|
||||
| The #[code parse_deps] function takes a #[code Doc] object and returns
|
||||
|
|
|
@ -23,41 +23,20 @@ p
|
|||
include _spacy-101/_similarity
|
||||
include _spacy-101/_word-vectors
|
||||
|
||||
|
||||
+h(2, "custom") Customising word vectors
|
||||
|
||||
+under-construction
|
||||
|
||||
p
|
||||
| By default, #[+api("token#vector") #[code Token.vector]] returns the
|
||||
| vector for its underlying #[+api("lexeme") #[code Lexeme]], while
|
||||
| #[+api("doc#vector") #[code Doc.vector]] and
|
||||
| #[+api("span#vector") #[code Span.vector]] return an average of the
|
||||
| vectors of their tokens.
|
||||
|
||||
p
|
||||
| You can customize these
|
||||
| vectors of their tokens. You can customize these
|
||||
| behaviours by modifying the #[code doc.user_hooks],
|
||||
| #[code doc.user_span_hooks] and #[code doc.user_token_hooks]
|
||||
| dictionaries.
|
||||
|
||||
+code("Example").
|
||||
# TODO
|
||||
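p
    | As a minimal sketch of the idea: the #[code 'vector'] hook name is
    | real, but the max-pooling strategy is an illustrative choice, not a
    | library default.

+code.
    import numpy

    def doc_vector_max(doc):
        # hypothetical hook: pool token vectors with an element-wise max
        return numpy.max([token.vector for token in doc], axis=0)

    doc = nlp(u'The quick brown fox')
    doc.user_hooks['vector'] = doc_vector_max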
|
||||
p
|
||||
| You can load new word vectors from a file-like buffer using the
|
||||
| #[code vocab.load_vectors()] method. The file should be a
|
||||
| whitespace-delimited text file, where the word is in the first column,
|
||||
| and subsequent columns provide the vector data. For faster loading, you
|
||||
| can use the #[code vocab.vectors_from_bin_loc()] method, which accepts a
|
||||
| path to a binary file written by #[code vocab.dump_vectors()].
|
||||
|
||||
+code("Example").
|
||||
# TODO
|
||||
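p
    | A hedged sketch, assuming a whitespace-delimited vectors file as
    | described above. The path is illustrative:

+code.
    with open('/path/to/vectors.txt', 'r') as file_:
        nlp.vocab.load_vectors(file_)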
|
||||
p
|
||||
| You can also load vectors from memory by writing to the
|
||||
| #[+api("lexeme#vector") #[code Lexeme.vector]] property. If the vectors
|
||||
| you are writing are of different dimensionality
|
||||
| from the ones currently loaded, you should first call
|
||||
| #[code vocab.resize_vectors(new_size)].
|
||||
|
||||
+h(2, "similarity") Similarity
|
||||
|
||||
+under-construction
|
||||
|
|