From a8e58e04efc5b57a2425595eaf1e049c23a37352 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 27 May 2017 17:57:10 +0200 Subject: [PATCH 01/25] Add symbols class to punctuation rules to handle emoji (see #1088) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently doesn't work for Hungarian, because of conflicts with the custom punctuation rules. Also doesn't take multi-character emoji like πŸ‘©πŸ½β€πŸ’» into account. --- spacy/lang/bn/punctuation.py | 10 +++++----- spacy/lang/char_classes.py | 5 +++-- spacy/lang/punctuation.py | 11 ++++++----- spacy/tests/tokenizer/test_exceptions.py | 12 +++++++++--- 4 files changed, 23 insertions(+), 15 deletions(-) diff --git a/spacy/lang/bn/punctuation.py b/spacy/lang/bn/punctuation.py index 66b7d967c..96485dd55 100644 --- a/spacy/lang/bn/punctuation.py +++ b/spacy/lang/bn/punctuation.py @@ -1,8 +1,8 @@ # coding: utf8 from __future__ import unicode_literals -from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS -from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES +from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS +from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS _currency = r"\$|Β’|Β£|€|Β₯|ΰΈΏ|৳" @@ -10,16 +10,16 @@ _quotes = QUOTES.replace("'", '') _list_punct = LIST_PUNCT + 'ΰ₯€ ΰ₯₯'.strip().split() -_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES) +_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS) -_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + +_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS + [r'(?<=[0-9])\+', r'(?<=Β°[FfCcKk])\.', r'(?<=[0-9])(?:{})'.format(_currency), r'(?<=[0-9])(?:{})'.format(UNITS), r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%Β²\-\)\]\+', QUOTES]), _currency)]) -_infixes = (LIST_ELLIPSES + +_infixes = (LIST_ELLIPSES + LIST_ICONS + [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA), diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 5b81eddde..bec685646 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -20,7 +20,6 @@ _upper = [_latin_upper] _lower = [_latin_lower] _uncased = [_bengali, _hebrew] - ALPHA = merge_char_classes(_upper + _lower + _uncased) ALPHA_LOWER = merge_char_classes(_lower + _uncased) ALPHA_UPPER = merge_char_classes(_upper + _uncased) @@ -33,13 +32,14 @@ _currency = r'\$ Β£ € Β₯ ΰΈΏ US\$ C\$ A\$' _punct = r'… , : ; \! \? 
ΒΏ Β‘ \( \) \[ \] \{ \} < > _ # \* &' _quotes = r'\' \'\' " ” β€œ `` ` β€˜ Β΄ β€š , β€ž Β» Β«' _hyphens = '- – β€” -- ---' - +_other_symbols = r'[\p{So}]' UNITS = merge_chars(_units) CURRENCY = merge_chars(_currency) QUOTES = merge_chars(_quotes) PUNCT = merge_chars(_punct) HYPHENS = merge_chars(_hyphens) +ICONS = _other_symbols LIST_UNITS = split_chars(_units) LIST_CURRENCY = split_chars(_currency) @@ -47,3 +47,4 @@ LIST_QUOTES = split_chars(_quotes) LIST_PUNCT = split_chars(_punct) LIST_HYPHENS = split_chars(_hyphens) LIST_ELLIPSES = [r'\.\.+', '…'] +LIST_ICONS = [_other_symbols] diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py index 74bb28f5f..680f5cff0 100644 --- a/spacy/lang/punctuation.py +++ b/spacy/lang/punctuation.py @@ -2,15 +2,16 @@ from __future__ import unicode_literals from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY -from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES -from .char_classes import CURRENCY, UNITS +from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS +from .char_classes import QUOTES, CURRENCY, UNITS _prefixes = (['Β§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + - LIST_CURRENCY) + LIST_CURRENCY + LIST_ICONS) -_suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + +_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS + + ["'s", "'S", "’s", "’S"] + [r'(?<=[0-9])\+', r'(?<=Β°[FfCcKk])\.', r'(?<=[0-9])(?:{})'.format(CURRENCY), @@ -19,7 +20,7 @@ _suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QU r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)]) -_infixes = (LIST_ELLIPSES + +_infixes = (LIST_ELLIPSES + LIST_ICONS + [r'(?<=[0-9])[+\-\*^](?=[0-9-])', r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py index aab27714e..70fb103dc 100644 --- a/spacy/tests/tokenizer/test_exceptions.py +++ b/spacy/tests/tokenizer/test_exceptions.py @@ -1,7 +1,4 @@ # coding: utf-8 -"""Test that tokenizer exceptions and emoticons are handled correctly.""" - - from __future__ import unicode_literals import pytest @@ -39,3 +36,12 @@ def test_tokenizer_handles_emoticons(tokenizer): def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length): tokens = tokenizer(text) assert len(tokens) == length + + +@pytest.mark.parametrize('text,length', [('can you still dunk?πŸ•πŸ”πŸ˜΅LOL', 8), + ('iπŸ’™you', 3), ('🀘🀘yay!', 4)]) +def test_tokenizer_handles_emoji(tokenizer, text, length): + exceptions = ["hu"] + tokens = tokenizer(text) + if tokens[0].lang_ not in exceptions: + assert len(tokens) == length From e05bcd6aa838a7098c699a920e92628296961927 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 27 May 2017 17:57:46 +0200 Subject: [PATCH 02/25] Update docs to reflect flattened model meta.json Don't use "setup" key and instead, keep "lang" on root level and add "pipeline". 
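For reference, a minimal sketch of the flattened layout this change documents (the field values below are illustrative placeholders taken from the docs excerpt in the diff, not a real model):

    {
        "name": "example_model",
        "lang": "en",
        "pipeline": ["token_vectors", "tagger"]
    }

"lang" and "pipeline" now live at the root of meta.json rather than under a "setup" key.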
--- .../usage/language-processing-pipeline.jade | 22 ++++++++----------- website/docs/usage/saving-loading.jade | 18 +++++++-------- 2 files changed, 17 insertions(+), 23 deletions(-) diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/docs/usage/language-processing-pipeline.jade index ce23a1666..1392fc2f8 100644 --- a/website/docs/usage/language-processing-pipeline.jade +++ b/website/docs/usage/language-processing-pipeline.jade @@ -19,19 +19,17 @@ p p | When you load a model, spaCy first consults the model's - | #[+a("/docs/usage/saving-loading#models-generating") meta.json] for its - | #[code setup] details. This typically includes the ID of a language class, + | #[+a("/docs/usage/saving-loading#models-generating") meta.json]. The + | meta typically includes the model details, the ID of a language class, | and an optional list of pipeline components. spaCy then does the | following: +aside-code("meta.json (excerpt)", "json"). { "name": "example_model", + "lang": "en" "description": "Example model for spaCy", - "setup": { - "lang": "en", - "pipeline": ["token_vectors", "tagger"] - } + "pipeline": ["token_vectors", "tagger"] } +list("numbers") @@ -287,17 +285,15 @@ p p | In the model package's meta.json, specify the language class and pipeline - | IDs in #[code setup]: + | IDs: +code("meta.json (excerpt)", "json"). { - "name": "my_sentiment_model", + "name": "sentiment_model", + "lang": "en", "version": "1.0.0", "spacy_version": ">=2.0.0,<3.0.0", - "setup": { - "lang": "en", - "pipeline": ["vectorizer", "sentiment"] - } + "pipeline": ["vectorizer", "sentiment"] } p @@ -307,7 +303,7 @@ p | by your custom #[code "sentiment"] factory. +code. - nlp = spacy.load('my_sentiment_model') + nlp = spacy.load('en_sentiment_model') doc = nlp(u'I love pizza') assert doc.sentiment diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index 477db925c..1ecb7d7ee 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -74,16 +74,14 @@ p +aside-code("meta.json", "json"). { "name": "example_model", + "lang": "en", "version": "1.0.0", "spacy_version": ">=2.0.0,<3.0.0", "description": "Example model for spaCy", "author": "You", "email": "you@example.com", "license": "CC BY-SA 3.0", - "setup": { - "lang": "en", - "pipeline": ["token_vectors", "tagger"] - } + "pipeline": ["token_vectors", "tagger"] } +code(false, "bash"). @@ -110,9 +108,9 @@ p +h(3, "models-custom") Customising the model setup p - | The meta.json includes a #[code setup] key that lets you customise how - | the model should be initialised and loaded. You can define the language - | data to be loaded and the + | The meta.json includes the model details, like name, requirements and + | license, and lets you customise how the model should be initialised and + | loaded. You can define the language data to be loaded and the | #[+a("/docs/usage/language-processing-pipeline") processing pipeline] to | execute. @@ -183,9 +181,9 @@ p p | To load a model from a data directory, you can use | #[+api("spacy#load") #[code spacy.load()]] with the local path. This will - | look for a meta.json in the directory and use the #[code setup] details - | to initialise a #[code Language] class with a processing pipeline and - | load in the model data. + | look for a meta.json in the directory and use the #[code lang] and + | #[code pipeline] settings to initialise a #[code Language] class with a + | processing pipeline and load in the model data. +code. 
nlp = spacy.load('/path/to/model') From 0d33ead507bfc79ac341fd9b0bbe3a1e8aacc1d9 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 27 May 2017 17:58:06 +0200 Subject: [PATCH 03/25] Fix initialisation of Doc in lightning tour example --- website/docs/usage/lightning-tour.jade | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index 4a9a2315f..eefb7a11a 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -129,13 +129,14 @@ p +code. import spacy from spacy.tokens.doc import Doc + from spacy.vocab import Vocab nlp = spacy.load('en') moby_dick = open('moby_dick.txt', 'r') doc = nlp(moby_dick) doc.to_disk('/moby_dick.bin') - new_doc = Doc().from_disk('/moby_dick.bin') + new_doc = Doc(Vocab()).from_disk('/moby_dick.bin') +infobox | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading] From 22bf5f63bfb4a37fc8b01724c121d2abbfecaf6e Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 27 May 2017 17:58:18 +0200 Subject: [PATCH 04/25] Update Matcher docs and add social media analysis example --- website/docs/usage/rule-based-matching.jade | 119 +++++++++++++++++++- 1 file changed, 115 insertions(+), 4 deletions(-) diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade index a54b70b89..fde6da6ef 100644 --- a/website/docs/usage/rule-based-matching.jade +++ b/website/docs/usage/rule-based-matching.jade @@ -11,7 +11,7 @@ p | You can also associate patterns with entity IDs, to allow some basic | entity linking or disambiguation. -+aside("What about \"real\" regular expressions?") +//-+aside("What about \"real\" regular expressions?") +h(2, "adding-patterns") Adding patterns @@ -119,7 +119,7 @@ p +code. # Add a new custom flag to the vocab, which is always False by default. # BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span. - BAD_HTML_FLAG = doc.vocab.add_flag(lambda text: False) + BAD_HTML_FLAG = nlp.vocab.add_flag(lambda text: False) def merge_and_flag(matcher, doc, i, matches): match_id, start, end = matches[i] @@ -221,7 +221,7 @@ p +cell match 0 or 1 times +cell optional, max one -+h(3, "quantifiers-example1") Quantifiers example: Using linguistic annotations ++h(2, "example1") Example: Using linguistic annotations p | Let's say you're analysing user comments and you want to find out what @@ -283,7 +283,7 @@ p # set manual=True to make displaCy render straight from a dictionary displacy.serve(matched_sents, style='ent', manual=True) -+h(3, "quantifiers-example2") Quantifiers example: Phone numbers ++h(2, "example2") Example: Phone numbers p | Phone numbers can have many different formats and matching them is often @@ -320,3 +320,114 @@ p | It'll produce more predictable results, is much easier to modify and | extend, and doesn't require any training data – only a set of | test cases. + ++h(2, "example3") Example: Hashtags and emoji on social media + +p + | Social media posts, especially tweets, can be difficult to work with. + | They're very short and often contain various emoji and hashtags. By only + | looking at the plain text, you'll lose a lot of valuable semantic + | information. + +p + | Let's say you've extracted a large sample of social media posts on a + | specific topic, for example posts mentioning a brand name or product. 
+ | As the first step of your data exploration, you want to filter out posts + | containing certain emoji and use them to assign a general sentiment + | score, based on whether the expressed emotion is positive or negative, + | e.g. #[span.o-icon.o-icon--inline πŸ˜€] or #[span.o-icon.o-icon--inline 😞]. + | You also want to find, merge and label hashtags like + | #[code #MondayMotivation], to be able to ignore or analyse them later. + ++aside("Note on sentiment analysis") + | Ultimately, sentiment analysis is not always #[em that] easy. In + | addition to the emoji, you'll also want to take specific words into + | account and check the #[code subtree] for intensifiers like "very", to + | increase the sentiment score. At some point, you might also want to train + | a sentiment model. However, the approach described in this example is + | very useful for #[strong bootstrapping rules to gather training data]. + | It's also an incredibly fast way to gather first insights into your data + | – with about 1 million tweets, you'd be looking at a processing time of + | #[strong under 1 minute]. + +p + | By default, spaCy's tokenizer will split emoji into separate tokens. This + | means that you can create a pattern for one or more emoji tokens. In this + | case, a sequence of identical emoji should be treated as one instance. + | Valid hashtags usually consist of a #[code #], plus a sequence of + | ASCII characters with no whitespace, making them easy to match as well. + ++code. + from spacy.lang.en import English + from spacy.matcher import Matcher + + nlp = English() # we only want the tokenizer, so no need to load a model + matcher = Matcher(nlp.vocab) + + pos_emoji = [u'πŸ˜€', u'πŸ˜ƒ', u'πŸ˜‚', u'🀣', u'😊', u'😍'] # positive emoji + neg_emoji = [u'😞', u'😠', u'😩', u'😒', u'😭', u'πŸ˜’'] # negative emoji + + # add patterns to match one or more emoji tokens + pos_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in pos_emoji] + neg_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in neg_emoji] + + matcher.add('HAPPY', label_sentiment, *pos_patterns) # add positive pattern + matcher.add('SAD', label_sentiment, *neg_patterns) # add negative pattern + + # add pattern to merge valid hashtag, i.e. '#' plus any ASCII token + matcher.add('HASHTAG', merge_hashtag, [{'ORTH': '#'}, {'IS_ASCII': True}]) + +p + | Because the #[code on_match] callback receives the ID of each match, you + | can use the same function to handle the sentiment assignment for both + | the positive and negative pattern. To keep it simple, we'll either add + | or subtract #[code 0.1] points – this way, the score will also reflect + | combinations of emoji, even positive #[em and] negative ones. + +p + | With a library like + | #[+a("https://github.com/bcongdon/python-emojipedia") Emojipedia], + | we can also retrieve a short description for each emoji – for example, + | #[span.o-icon.o-icon--inline 😍]'s official title is "Smiling Face With + | Heart-Eyes". Assigning it to the merged token's norm will make it + | available as #[code token.norm_]. + ++code. 
+ from emojipedia import Emojipedia # installation: pip install emojipedia + + def label_sentiment(matcher, doc, i, matches): + match_id, start, end = matches[i] + if match_id is 'HAPPY': + doc.sentiment += 0.1 # add 0.1 for positive sentiment + elif match_id is 'SAD': + doc.sentiment -= 0.1 # subtract 0.1 for negative sentiment + span = doc[start : end] + emoji = Emojipedia.search(span[0].text) # get data for emoji + span.merge(norm=emoji.title) # merge span and set NORM to emoji title + +p + | To label the hashtags, we first need to add a new custom flag. + | #[code IS_HASHTAG] will be the flag's ID, which you can use to assign it + | to the hashtag's span, and check its value via a token's + | #[+api("token#check_flag") #[code code check_flag()]] method. On each + | match, we merge the hashtag and assign the flag. + ++code. + # Add a new custom flag to the vocab, which is always False by default + IS_HASHTAG = nlp.vocab.add_flag(lambda text: False) + + def merge_hashtag(matcher, doc, i, matches): + match_id, start, end = matches[i] + span = doc[start : end] + span.merge() # merge hashtag + span.set_flag(IS_HASHTAG, True) # set IS_HASHTAG to True + +p + | To process a stream of social media posts, we can use + | #[+api("language#pipe") #[code Language.pipe()]], which will return a + | stream of #[code Doc] objects that we can pass to + | #[+api("matcher#pipe") #[code Matcher.pipe()]]. + ++code. + docs = nlp.pipe(LOTS_OF_TWEETS) + matches = matcher.pipe(docs) From 086a06e7d750da5852a447effdb32a376bd86ec7 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 27 May 2017 20:01:46 +0200 Subject: [PATCH 05/25] Fix CLI docstrings and add command as first argument Workaround for Plac --- spacy/__init__.py | 6 +++++- spacy/cli/convert.py | 5 +++-- spacy/cli/download.py | 7 ++++--- spacy/cli/info.py | 2 +- spacy/cli/link.py | 5 +++-- spacy/cli/package.py | 5 +++-- spacy/cli/train.py | 6 ++++-- 7 files changed, 23 insertions(+), 13 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 8dc0937f5..6beb7955e 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import importlib from .compat import basestring_ -from .cli.info import info +from .cli.info import info as cli_info from .glossary import explain from .deprecated import resolve_load_name from . import util @@ -20,3 +20,7 @@ def load(name, **overrides): overrides['meta'] = meta overrides['path'] = model_path return cls(**overrides) + + +def info(model=None, markdown=False): + return cli_info(None, model, markdown) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index e95ffd08b..82b39bba2 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -24,8 +24,9 @@ CONVERTERS = { n_sents=("Number of sentences per doc", "option", "n", float), morphology=("Enable appending morphology to tags", "flag", "m", bool) ) -def convert(_, input_file, output_dir, n_sents, morphology): - """Convert files into JSON format for use with train command and other +def convert(cmd, input_file, output_dir, n_sents, morphology): + """ + Convert files into JSON format for use with train command and other experiment management functions. """ input_path = Path(input_file) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index fdcacb891..b6e5549da 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -17,8 +17,9 @@ from .. import about direct=("force direct download. 
Needs model name with version and won't " "perform compatibility check", "flag", "d", bool) ) -def download(model, direct=False): - """Download compatible model from default download path using pip. Model +def download(cmd, model, direct=False): + """ + Download compatible model from default download path using pip. Model can be shortcut, model name or, if --direct flag is set, full model name with version. """ @@ -31,7 +32,7 @@ def download(model, direct=False): version = get_version(model_name, compatibility) download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version)) try: - link(model_name, model, force=True) + link(None, model_name, model, force=True) except: # Dirty, but since spacy.download and the auto-linking is mostly # a convenience wrapper, it's best to show a success message and diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 6f7467521..75aac10c7 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -14,7 +14,7 @@ from .. import util model=("optional: shortcut link of model", "positional", None, str), markdown=("generate Markdown for GitHub issues", "flag", "md", str) ) -def info(model=None, markdown=False): +def info(cmd, model=None, markdown=False): """Print info about spaCy installation. If a model shortcut link is speficied as an argument, print model information. Flag --markdown prints details in Markdown for easy copy-pasting to GitHub issues. diff --git a/spacy/cli/link.py b/spacy/cli/link.py index 1feef8bce..9aecdabfe 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -14,8 +14,9 @@ from .. import util link_name=("name of shortuct link to create", "positional", None, str), force=("force overwriting of existing link", "flag", "f", bool) ) -def link(origin, link_name, force=False): - """Create a symlink for models within the spacy/data directory. Accepts +def link(cmd, origin, link_name, force=False): + """ + Create a symlink for models within the spacy/data directory. Accepts either the name of a pip package, or the local path to the model data directory. Linking models allows loading them via spacy.load(link_name). """ diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 9acd0a2fa..1c3128d99 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -18,8 +18,9 @@ from .. import about meta=("path to meta.json", "option", "m", str), force=("force overwriting of existing folder in output directory", "flag", "f", bool) ) -def package(input_dir, output_dir, meta, force): - """Generate Python package for model data, including meta and required +def package(cmd, input_dir, output_dir, meta=None, force=False): + """ + Generate Python package for model data, including meta and required installation files. A new directory will be created in the specified output directory, and model data will be copied over. """ diff --git a/spacy/cli/train.py b/spacy/cli/train.py index ed146cb24..25b53e49d 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -32,9 +32,11 @@ from .. import displacy no_parser=("Don't train parser", "flag", "P", bool), no_entities=("Don't train NER", "flag", "N", bool) ) -def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, +def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, use_gpu=False, no_tagger=False, no_parser=False, no_entities=False): - """Train a model. Expects data in spaCy's JSON format.""" + """ + Train a model. Expects data in spaCy's JSON format. 
+ """ n_sents = n_sents or None output_path = util.ensure_path(output_dir) train_path = util.ensure_path(train_data) From 1203959625954fc1164485883ff49e9b5f3b43c3 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 27 May 2017 20:02:01 +0200 Subject: [PATCH 06/25] Add pipeline setting to meta.json generator --- spacy/cli/package.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 1c3128d99..e78a4eeb4 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -43,7 +43,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False): meta = util.read_json(meta_path) else: meta = generate_meta() - validate_meta(meta, ['lang', 'name', 'version']) + meta = validate_meta(meta, ['lang', 'name', 'version']) model_name = meta['lang'] + '_' + meta['name'] model_name_v = model_name + '-' + meta['version'] @@ -86,20 +86,32 @@ def generate_meta(): ('email', 'Author email', False), ('url', 'Author website', False), ('license', 'License', 'CC BY-NC 3.0')] - prints("Enter the package settings for your model.", title="Generating meta.json") meta = {} for setting, desc, default in settings: response = util.get_raw_input(desc, default) meta[setting] = default if response == '' and default else response + meta['pipeline'] = generate_pipeline() return meta +def generate_pipeline(): + prints("If set to 'True', the default pipeline is used. If set to 'False', " + "the pipeline will be disabled. Components should be specified as a " + "comma-separated list of component names, e.g. vectorizer, tagger, " + "parser, ner. For more information, see the docs on processing pipelines.", + title="Enter your model's pipeline components") + pipeline = util.get_raw_input("Pipeline components", True) + replace = {'True': True, 'False': False} + return replace[pipeline] if pipeline in replace else pipeline.split(', ') + + def validate_meta(meta, keys): for key in keys: if key not in meta or meta[key] == '': prints("This setting is required to build your package.", title='No "%s" setting found in meta.json' % key, exits=1) + return meta def get_template(filepath): From ae11c8d60f07f5f9257a347f51b72d93aaea3699 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 27 May 2017 20:02:20 +0200 Subject: [PATCH 07/25] Add emoji sentiment to lightning tour matcher example --- website/docs/usage/lightning-tour.jade | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index eefb7a11a..7de486070 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -149,9 +149,14 @@ p nlp = spacy.load('en') matcher = Matcher(nlp.vocab) - # match "Google I/O" or "Google i/o" - pattern = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}] - matcher.add('GoogleIO', None, pattern) + + def set_sentiment(matcher, doc, i, matches): + doc.sentiment += 0.1 + + pattern1 = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}] + pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['πŸ˜€', 'πŸ˜‚', '🀣', '😍']] + matcher.add('GoogleIO', None, pattern1) # match "Google I/O" or "Google i/o" + matcher.add('HAPPY', set_sentiment, pattern2) # match one or more happy emoji matches = nlp(LOTS_OF TEXT) +infobox From 7cc9c3e9a6f28422485eb2a054d12850481aeb71 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 15:44:42 -0500 Subject: [PATCH 08/25] Fix convert CLI --- spacy/cli/convert.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index e95ffd08b..ac608a64a 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -39,4 +39,4 @@ def convert(_, input_file, output_dir, n_sents, morphology): prints("Can't find converter for %s" % input_path.parts[-1], title="Unknown format", exits=1) CONVERTERS[file_ext](input_path, output_path, - n_sents=n_sents, morphology=morphology) + n_sents=n_sents, use_morphology=morphology) From 34bbad8e0e115e412e857c71d5f4d0b3ab339681 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 15:46:06 -0500 Subject: [PATCH 09/25] Add __reduce__ methods on parser subclasses. Fixes pickling. --- spacy/pipeline.pyx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 98b79d709..724891c9b 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -335,6 +335,9 @@ cdef class NeuralDependencyParser(NeuralParser): name = 'parser' TransitionSystem = ArcEager + def __reduce__(self): + return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None) + cdef class NeuralEntityRecognizer(NeuralParser): name = 'entity' @@ -342,6 +345,10 @@ cdef class NeuralEntityRecognizer(NeuralParser): nr_feature = 6 + def __reduce__(self): + return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None) + + cdef class BeamDependencyParser(BeamParser): TransitionSystem = ArcEager From 5e4312feede7c2511b4d61a5723077c1b16c142d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 15:47:02 -0500 Subject: [PATCH 10/25] Evaluate loaded class, to ensure save/load works --- spacy/cli/train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index b25cdcbd5..7bbda5a47 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -84,11 +84,11 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, pbar.update(len(docs)) with nlp.use_params(optimizer.averages): - scorer = nlp.evaluate(corpus.dev_docs(nlp, gold_preproc=False)) with (output_path / ('model%d.pickle' % i)).open('wb') as file_: dill.dump(nlp, file_, -1) - - + with (output_path / ('model%d.pickle' % i)).open('rb') as file_: + nlp_loaded = dill.load(file_) + scorer = nlp_loaded.evaluate(corpus.dev_docs(nlp_loaded, gold_preproc=False)) print_progress(i, losses, scorer.scores) finally: print("Saving model...") From 655ca58c16880c50661039c4db7181b4700cd0e5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 15:49:37 -0500 Subject: [PATCH 11/25] Clarifying change to StateC.clone --- spacy/syntax/_state.pxd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd index 4b2b47270..0b29412bf 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/syntax/_state.pxd @@ -335,17 +335,18 @@ cdef cppclass StateC: this._break = this._b_i void clone(const StateC* src) nogil: + this.length = src.length memcpy(this._sent, src._sent, this.length * sizeof(TokenC)) memcpy(this._stack, src._stack, this.length * sizeof(int)) memcpy(this._buffer, src._buffer, this.length * sizeof(int)) memcpy(this._ents, src._ents, this.length * sizeof(Entity)) memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0])) - this.length = src.length this._b_i = src._b_i this._s_i = src._s_i this._e_i = src._e_i this._break = src._break this.offset = src.offset + this._empty_token = src._empty_token void fast_forward() nogil: # space 
token attachement policy: From 99316fa631efd86a5ab5d68b11654c7366ece650 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 15:50:21 -0500 Subject: [PATCH 12/25] Use ordered dict to specify actions --- spacy/syntax/arc_eager.pyx | 14 ++++++++------ spacy/syntax/ner.pyx | 31 ++++++++++++++++++++++--------- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index f7c1c7922..2e424c1a9 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -9,6 +9,7 @@ import ctypes from libc.stdint cimport uint32_t from libc.string cimport memcpy from cymem.cymem cimport Pool +from collections import OrderedDict from .stateclass cimport StateClass from ._state cimport StateC, is_space_token @@ -312,12 +313,13 @@ cdef class ArcEager(TransitionSystem): @classmethod def get_actions(cls, **kwargs): actions = kwargs.get('actions', - { - SHIFT: [''], - REDUCE: [''], - RIGHT: [], - LEFT: [], - BREAK: ['ROOT']}) + OrderedDict(( + (SHIFT, ['']), + (REDUCE, ['']), + (RIGHT, []), + (LEFT, []), + (BREAK, ['ROOT']) + ))) seen_actions = set() for label in kwargs.get('left_labels', []): if label.upper() != 'ROOT': diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index af42eded4..f8db0a433 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -2,6 +2,7 @@ from __future__ import unicode_literals from thinc.typedefs cimport weight_t +from collections import OrderedDict from .stateclass cimport StateClass from ._state cimport StateC @@ -51,17 +52,29 @@ cdef bint _entity_is_sunk(StateClass st, Transition* golds) nogil: cdef class BiluoPushDown(TransitionSystem): + def __init__(self, *args, **kwargs): + TransitionSystem.__init__(self, *args, **kwargs) + + def __reduce__(self): + labels_by_action = OrderedDict() + cdef Transition t + for trans in self.c[:self.n_moves]: + label_str = self.strings[trans.label] + labels_by_action.setdefault(trans.move, []).append(label_str) + return (BiluoPushDown, (self.strings, labels_by_action), + None, None) + @classmethod def get_actions(cls, **kwargs): actions = kwargs.get('actions', - { - MISSING: [''], - BEGIN: [], - IN: [], - LAST: [], - UNIT: [], - OUT: [''] - }) + OrderedDict(( + (MISSING, ['']), + (BEGIN, []), + (IN, []), + (LAST, []), + (UNIT, []), + (OUT, ['']) + ))) seen_entities = set() for entity_type in kwargs.get('entity_types', []): if entity_type in seen_entities: @@ -90,7 +103,7 @@ cdef class BiluoPushDown(TransitionSystem): def move_name(self, int move, int label): if move == OUT: return 'O' - elif move == 'MISSING': + elif move == MISSING: return 'M' else: return MOVE_NAMES[move] + '-' + self.strings[label] From 8de9829f094fbf1ed418c527236218667baa1989 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 15:50:40 -0500 Subject: [PATCH 13/25] Don't overwrite model in initialization, when loading --- spacy/_ml.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/_ml.py b/spacy/_ml.py index f589704a6..ac7849bbb 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -19,6 +19,8 @@ import numpy def _init_for_precomputed(W, ops): + if (W**2).sum() != 0.: + return reshaped = W.reshape((W.shape[1], W.shape[0] * W.shape[2])) ops.xavier_uniform_init(reshaped) W[:] = reshaped.reshape(W.shape) @@ -247,6 +249,7 @@ def doc2feats(cols=None): model.cols = cols return model + def print_shape(prefix): def forward(X, drop=0.): return X, lambda dX, **kwargs: dX From 3eea5383a1adc179ed7d7feb2c957b1d78f0171b Mon Sep 17 00:00:00 2001 From: Matthew 
Honnibal Date: Sat, 27 May 2017 15:51:55 -0500 Subject: [PATCH 14/25] Add move_names property to parser --- spacy/syntax/nn_parser.pyx | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 35966d536..6db6e5ae1 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -518,6 +518,14 @@ cdef class Parser: xp.add.at(d_tokvecs, ids, d_state_features * active_feats) + @property + def move_names(self): + names = [] + for i in range(self.moves.n_moves): + name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label) + names.append(name) + return names + def get_batch_model(self, batch_size, tokvecs, stream, dropout): lower, upper = self.model state2vec = precompute_hiddens(batch_size, tokvecs, From 7ebd26b8aae34464c3b02cbc9b497bfe0ebfa7d2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 15:52:20 -0500 Subject: [PATCH 15/25] Use ordered dict to specify transitions --- spacy/syntax/transition_system.pyx | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 07102aeb0..211b2c950 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -5,7 +5,7 @@ from __future__ import unicode_literals from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF from cymem.cymem cimport Pool from thinc.typedefs cimport weight_t -from collections import defaultdict +from collections import defaultdict, OrderedDict from ..structs cimport TokenC from .stateclass cimport StateClass @@ -26,7 +26,7 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: cdef class TransitionSystem: - def __init__(self, StringStore string_table, dict labels_by_action): + def __init__(self, StringStore string_table, labels_by_action): self.mem = Pool() self.strings = string_table self.n_moves = 0 @@ -34,14 +34,14 @@ cdef class TransitionSystem: self.c = self.mem.alloc(self._size, sizeof(Transition)) - for action, label_strs in sorted(labels_by_action.items()): + for action, label_strs in labels_by_action.items(): for label_str in label_strs: self.add_action(int(action), label_str) self.root_label = self.strings['ROOT'] self.init_beam_state = _init_state def __reduce__(self): - labels_by_action = {} + labels_by_action = OrderedDict() cdef Transition t for trans in self.c[:self.n_moves]: label_str = self.strings[trans.label] @@ -77,6 +77,11 @@ cdef class TransitionSystem: history.append(i) action.do(state.c, action.label) break + else: + print(gold.words) + print(gold.ner) + print(history) + raise ValueError("Could not find gold move") return history cdef int initialize_state(self, StateC* state) nogil: From b03fb2d7b068f4752fda7cb5783d3c08dd0adb63 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 00:03:16 +0200 Subject: [PATCH 16/25] Update 101 and usage docs --- website/assets/img/docs/pipeline.svg | 2 +- website/docs/usage/_spacy-101/_vocab-stringstore.jade | 4 +++- website/docs/usage/lightning-tour.jade | 2 ++ website/docs/usage/rule-based-matching.jade | 2 +- 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/website/assets/img/docs/pipeline.svg b/website/assets/img/docs/pipeline.svg index e42c2362f..2ff00d787 100644 --- a/website/assets/img/docs/pipeline.svg +++ b/website/assets/img/docs/pipeline.svg @@ -2,7 +2,7 @@ diff --git a/website/docs/usage/_spacy-101/_vocab-stringstore.jade b/website/docs/usage/_spacy-101/_vocab-stringstore.jade index 
3f551c9e1..dd300b5b9 100644 --- a/website/docs/usage/_spacy-101/_vocab-stringstore.jade +++ b/website/docs/usage/_spacy-101/_vocab-stringstore.jade @@ -89,4 +89,6 @@ p p | Even though both #[code Doc] objects contain the same words, the internal - | integer IDs are very different. + | integer IDs are very different. The same applies for all other strings, + | like the annotation scheme. To avoid mismatched IDs, spaCy will always + | export the vocab if you save a #[code Doc] or #[code nlp] object. diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index 7de486070..8cf651be0 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -139,6 +139,8 @@ p new_doc = Doc(Vocab()).from_disk('/moby_dick.bin') +infobox + | #[strong API:] #[+api("language") #[code Language]], + | #[+api("doc") #[code Doc]] | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading] +h(2, "rule-matcher") Match text with token rules diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade index fde6da6ef..1fd398ad9 100644 --- a/website/docs/usage/rule-based-matching.jade +++ b/website/docs/usage/rule-based-matching.jade @@ -345,7 +345,7 @@ p | account and check the #[code subtree] for intensifiers like "very", to | increase the sentiment score. At some point, you might also want to train | a sentiment model. However, the approach described in this example is - | very useful for #[strong bootstrapping rules to gather training data]. + | very useful for #[strong bootstrapping rules to collect training data]. | It's also an incredibly fast way to gather first insights into your data | – with about 1 million tweets, you'd be looking at a processing time of | #[strong under 1 minute]. From db116cbedabccb65a100898a3d285e1c2ee804a6 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 00:03:31 +0200 Subject: [PATCH 17/25] Update tokenization 101 and add illustration --- website/assets/img/docs/tokenization.svg | 123 ++++++++++++++++++ .../docs/usage/_spacy-101/_tokenization.jade | 44 +++++++ website/docs/usage/spacy-101.jade | 7 +- 3 files changed, 171 insertions(+), 3 deletions(-) create mode 100644 website/assets/img/docs/tokenization.svg diff --git a/website/assets/img/docs/tokenization.svg b/website/assets/img/docs/tokenization.svg new file mode 100644 index 000000000..cc185a3a7 --- /dev/null +++ b/website/assets/img/docs/tokenization.svg @@ -0,0 +1,123 @@ + + + + + β€œLet’s + + + go + + + to + + + N.Y.!” + + + β€œ + + + Let’s + + + go + + + to + + + N.Y.!” + + β€œ + + + Let + + + go + + + to + + + N.Y.!” + + + ’s + + + β€œ + + + Let + + + go + + + to + + + N.Y.! + + + ’s + + + ” + + + β€œ + + + Let + + + go + + + to + + + N.Y. + + + ’s + + + ” + + + ! + + β€œ + + Let + + go + + to + + N.Y. + + ’s + + ” + + ! + + EXCEPTION + + PREFIX + + SUFFIX + + SUFFIX + + EXCEPTION + + DONE + diff --git a/website/docs/usage/_spacy-101/_tokenization.jade b/website/docs/usage/_spacy-101/_tokenization.jade index 64e3f5881..95a9cc520 100644 --- a/website/docs/usage/_spacy-101/_tokenization.jade +++ b/website/docs/usage/_spacy-101/_tokenization.jade @@ -16,3 +16,47 @@ p +row for cell in ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "for", "$", "1", "billion"] +cell=cell + +p + | Fist, the raw text is split on whitespace characters, similar to + | #[code text.split(' ')]. Then, the tokenizer processes the text from + | left to right. 
On each substring, it performs two checks: + ++list("numbers") + +item + | #[strong Does the substring match a tokenizer exception rule?] For + | example, "don't" does not contain whitespace, but should be split + | into two tokens, "do" and "n't", while "U.K." should always + | remain one token. + +item + | #[strong Can a prefix, suffix or infixes be split off?]. For example + | punctuation like commas, periods, hyphens or quotes. + +p + | If there's a match, the rule is applied and the tokenizer continues its + | loop, starting with the newly split substrings. This way, spaCy can split + | #[strong complex, nested tokens] like combinations of abbreviations and + | multiple punctuation marks. + ++aside + | #[strong Tokenizer exception:] Special-case rule to split a string into + | several tokens or prevent a token from being split when punctuation rules + | are applied.#[br] + | #[strong Prefix:] Character(s) at the beginning, e.g. + | #[code $], #[code (], #[code β€œ], #[code ΒΏ].#[br] + | #[strong Suffix:] Character(s) at the end, e.g. + | #[code km], #[code )], #[code ”], #[code !].#[br] + | #[strong Infix:] Character(s) in between, e.g. + | #[code -], #[code --], #[code /], #[code …].#[br] + ++image + include ../../../assets/img/docs/tokenization.svg + .u-text-right + +button("/assets/img/docs/tokenization.svg", false, "secondary").u-text-tag View large graphic + +p + | While punctuation rules are usually pretty general, tokenizer exceptions + | strongly depend on the specifics of the individual language. This is + | why each #[+a("/docs/api/language-models") available language] has its + | own subclass like #[code English] or #[code German], that loads in lists + | of hard-coded data and exception rules. diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index 7c6525004..8b2d0c17e 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -94,9 +94,10 @@ p include _spacy-101/_tokenization +infobox - | To learn more about how spaCy's tokenizer and its rules work in detail, - | how to #[strong customise] it and how to #[strong add your own tokenizer] - | to a processing pipeline, see the usage guide on + | To learn more about how spaCy's tokenization rules work in detail, + | how to #[strong customise and replace] the default tokenizer and how to + | #[strong add language-specific data], see the usage guides on + | #[+a("/docs/usage/adding-languages") adding languages] and | #[+a("/docs/usage/customizing-tokenizer") customising the tokenizer]. +h(3, "annotations-pos-deps") Part-of-speech tags and dependencies From c8543c823792710dae5b0c6d77dc31c53fec177c Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 00:04:04 +0200 Subject: [PATCH 18/25] Fix formatting and docstrings and remove deprecated function --- spacy/util.py | 22 +++++++++------------- spacy/vocab.pyx | 2 -- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index e42bde810..a30b35a06 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -177,10 +177,13 @@ def get_async(stream, numpy_array): def itershuffle(iterable, bufsize=1000): """Shuffle an iterator. This works by holding `bufsize` items back - and yielding them sometime later. Obviously, this is not unbiased -- + and yielding them sometime later. Obviously, this is not unbiased – but should be good enough for batching. Larger bufsize means less bias. - From https://gist.github.com/andres-erbsen/1307752 + + iterable (iterable): Iterator to shuffle. 
+ bufsize (int): Items to hold back. + YIELDS (iterable): The shuffled iterator. """ iterable = iter(iterable) buf = [] @@ -315,17 +318,16 @@ def normalize_slice(length, start, stop, step=None): def compounding(start, stop, compound): - '''Yield an infinite series of compounding values. Each time the + """Yield an infinite series of compounding values. Each time the generator is called, a value is produced by multiplying the previous value by the compound rate. - EXAMPLE - + EXAMPLE: >>> sizes = compounding(1., 10., 1.5) >>> assert next(sizes) == 1. >>> assert next(sizes) == 1 * 1.5 >>> assert next(sizes) == 1.5 * 1.5 - ''' + """ def clip(value): return max(value, stop) if (start>stop) else min(value, stop) curr = float(start) @@ -335,7 +337,7 @@ def compounding(start, stop, compound): def decaying(start, stop, decay): - '''Yield an infinite series of linearly decaying values.''' + """Yield an infinite series of linearly decaying values.""" def clip(value): return max(value, stop) if (start>stop) else min(value, stop) nr_upd = 1. @@ -344,12 +346,6 @@ def decaying(start, stop, decay): nr_upd += 1 -def check_renamed_kwargs(renamed, kwargs): - for old, new in renamed.items(): - if old in kwargs: - raise TypeError("Keyword argument %s now renamed to %s" % (old, new)) - - def read_json(location): """Open and load JSON from file. diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index d7d27a3e4..55fde0123 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -53,8 +53,6 @@ cdef class Vocab: vice versa. RETURNS (Vocab): The newly constructed vocab object. """ - util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs) - lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} tag_map = tag_map if tag_map is not None else {} if lemmatizer in (None, True, False): From c1983621fbe34659b9243b1af603ed9b85495ac6 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 00:22:00 +0200 Subject: [PATCH 19/25] Update util functions for model loading --- spacy/__init__.py | 12 +--- spacy/cli/info.py | 10 +++- spacy/cli/link.py | 2 +- spacy/util.py | 111 +++++++++++++++++++++++++------------ website/docs/api/util.jade | 90 ++++++++++++++++-------------- 5 files changed, 132 insertions(+), 93 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 6beb7955e..f9e29037f 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,9 +1,6 @@ # coding: utf8 from __future__ import unicode_literals -import importlib - -from .compat import basestring_ from .cli.info import info as cli_info from .glossary import explain from .deprecated import resolve_load_name @@ -12,14 +9,7 @@ from . import util def load(name, **overrides): name = resolve_load_name(name, **overrides) - model_path = util.resolve_model_path(name) - meta = util.parse_package_meta(model_path) - if 'lang' not in meta: - raise IOError('No language setting found in model meta.') - cls = util.get_lang_class(meta['lang']) - overrides['meta'] = meta - overrides['path'] = model_path - return cls(**overrides) + return util.load_model(name) def info(model=None, markdown=False): diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 75aac10c7..70f054d84 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -20,8 +20,14 @@ def info(cmd, model=None, markdown=False): prints details in Markdown for easy copy-pasting to GitHub issues. 
""" if model: - model_path = util.resolve_model_path(model) - meta = util.parse_package_meta(model_path) + if util.is_package(model): + model_path = util.get_package_path(model) + else: + model_path = util.get_data_path() / model + meta_path = model_path / 'meta.json' + if not meta_path.is_file(): + prints(meta_path, title="Can't find model meta.json", exits=1) + meta = read_json(meta_path) if model_path.resolve() != model_path: meta['link'] = path2str(model_path) meta['source'] = path2str(model_path.resolve()) diff --git a/spacy/cli/link.py b/spacy/cli/link.py index 9aecdabfe..66824c042 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -21,7 +21,7 @@ def link(cmd, origin, link_name, force=False): directory. Linking models allows loading them via spacy.load(link_name). """ if util.is_package(origin): - model_path = util.get_model_package_path(origin) + model_path = util.get_package_path(model) else: model_path = Path(origin) if not model_path.exists(): diff --git a/spacy/util.py b/spacy/util.py index a30b35a06..25fe198f4 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -78,27 +78,86 @@ def ensure_path(path): return path -def resolve_model_path(name): - """Resolve a model name or string to a model path. +def load_model(name): + """Load a model from a shortcut link, package or data path. name (unicode): Package name, shortcut link or model path. - RETURNS (Path): Path to model data directory. + RETURNS (Language): `Language` class with the loaded model. """ data_path = get_data_path() if not data_path or not data_path.exists(): raise IOError("Can't find spaCy data path: %s" % path2str(data_path)) if isinstance(name, basestring_): - if (data_path / name).exists(): # in data dir or shortcut link - return (data_path / name) - if is_package(name): # installed as a package - return get_model_package_path(name) - if Path(name).exists(): # path to model - return Path(name) - elif hasattr(name, 'exists'): # Path or Path-like object - return name + if (data_path / name).exists(): # in data dir or shortcut + return load_model_from_path(data_path / name) + if is_package(name): # installed as package + return load_model_from_pkg(name) + if Path(name).exists(): # path to model data directory + return load_data_from_path(Path(name)) + elif hasattr(name, 'exists'): # Path or Path-like to model data + return load_data_from_path(name) raise IOError("Can't find model '%s'" % name) +def load_model_from_init_py(init_file): + """Helper function to use in the `load()` method of a model package's + __init__.py. + + init_file (unicode): Path to model's __init__.py, i.e. `__file__`. + RETURNS (Language): `Language` class with loaded model. + """ + model_path = Path(init_file).parent + return load_data_from_path(model_path, package=True) + + +def load_model_from_path(model_path): + """Import and load a model package from its file path. + + path (unicode or Path): Path to package directory. + RETURNS (Language): `Language` class with loaded model. + """ + model_path = ensure_path(model_path) + spec = importlib.util.spec_from_file_location('model', model_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module.load() + + +def load_model_from_pkg(name): + """Import and load a model package. + + name (unicode): Name of model package installed via pip. + RETURNS (Language): `Language` class with loaded model. 
+ """ + module = importlib.import_module(name) + return module.load() + + +def load_data_from_path(model_path, package=False): + """Initialie a `Language` class with a loaded model from a model data path. + + model_path (unicode or Path): Path to model data directory. + package (bool): Does the path point to the parent package directory? + RETURNS (Language): `Language` class with loaded model. + """ + model_path = ensure_path(model_path) + meta_path = model_path / 'meta.json' + if not meta_path.is_file(): + raise IOError("Could not read meta.json from %s" % location) + meta = read_json(location) + for setting in ['lang', 'name', 'version']: + if setting not in meta: + raise IOError('No %s setting found in model meta.json' % setting) + if package: + model_data_path = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version']) + model_path = model_path / model_data_path + if not model_path.exists(): + raise ValueError("Can't find model directory: %s" % path2str(model_path)) + cls = get_lang_class(meta['lang']) + nlp = cls(pipeline=meta.get('pipeline', True)) + return nlp.from_disk(model_path) + + def is_package(name): """Check if string maps to a package installed via pip. @@ -112,36 +171,16 @@ def is_package(name): return False -def get_model_package_path(package_name): - """Get path to a model package installed via pip. +def get_package_path(name): + """Get the path to an installed package. - package_name (unicode): Name of installed package. - RETURNS (Path): Path to model data directory. + name (unicode): Package name. + RETURNS (Path): Path to installed package. """ # Here we're importing the module just to find it. This is worryingly # indirect, but it's otherwise very difficult to find the package. - # Python's installation and import rules are very complicated. pkg = importlib.import_module(package_name) - package_path = Path(pkg.__file__).parent.parent - meta = parse_package_meta(package_path / package_name) - model_name = '%s-%s' % (package_name, meta['version']) - return package_path / package_name / model_name - - -def parse_package_meta(package_path, require=True): - """Check if a meta.json exists in a package and return its contents. - - package_path (Path): Path to model package directory. - require (bool): If True, raise error if no meta.json is found. - RETURNS (dict or None): Model meta.json data or None. - """ - location = package_path / 'meta.json' - if location.is_file(): - return read_json(location) - elif require: - raise IOError("Could not read meta.json from %s" % location) - else: - return None + return Path(pkg.__file__).parent def is_in_jupyter(): diff --git a/website/docs/api/util.jade b/website/docs/api/util.jade index 717abf34a..3e132b7b4 100644 --- a/website/docs/api/util.jade +++ b/website/docs/api/util.jade @@ -1,12 +1,10 @@ -//- πŸ’« DOCS > API > ANNOTATION SPECS +//- πŸ’« DOCS > API > UTIL include ../../_includes/_mixins p | spaCy comes with a small collection of utility functions located in | #[+src(gh("spaCy", "spacy/util.py")) spacy/util.py]. - -+infobox("Important note") | Because utility functions are mostly intended for | #[strong internal use within spaCy], their behaviour may change with | future releases. The functions documented on this page should be safe @@ -74,15 +72,23 @@ p +cell #[code Language] +cell Language class. -+h(2, "resolve_model_path") util.resolve_model_path ++h(2, "load_model") util.load_model +tag function +tag-new(2) -p Resolve a model name or string to a model path. 
+p + | Load a model from a shortcut link, package or data path. If called with a + | shortcut link or package name, spaCy will assume the model is a Python + | package and import and call its #[code load()] method. If called with a + | path, spaCy will assume it's a data directory, read the language and + | pipeline settings from the meta.json and initialise a #[code Language] + | class. The model data will then be loaded in via + | #[+api("language#from_disk") #[code Language.from_disk()]]. +aside-code("Example"). - model_path = util.resolve_model_path('en') - model_path = util.resolve_model_path('/path/to/en') + nlp = util.load_model('en') + nlp = util.load_model('en_core_web_sm') + nlp = util.load_model('/path/to/data') +table(["Name", "Type", "Description"]) +row @@ -92,8 +98,33 @@ p Resolve a model name or string to a model path. +footrow +cell returns - +cell #[code Path] - +cell Path to model data directory. + +cell #[code Language] + +cell #[code Language] class with the loaded model. + ++h(2, "load_model_from_init_py") util.load_model_from_init_py + +tag function + +tag-new(2) + +p + | A helper function to use in the #[code load()] method of a model package's + | #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py]. + ++aside-code("Example"). + from spacy.util import load_model_from_init_py + + def load(): + return load_model_from_init_py(__file__) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code init_file] + +cell unicode + +cell Path to model's __init__.py, i.e. #[code __file__]. + + +footrow + +cell returns + +cell #[code Language] + +cell #[code Language] class with the loaded model. +h(2, "is_package") util.is_package +tag function @@ -117,16 +148,18 @@ p +cell #[code bool] +cell #[code True] if installed package, #[code False] if not. -+h(2, "get_model_package_path") util.get_model_package_path ++h(2, "get_package_path") util.get_package_path +tag function + +tag-new(2) p - | Get path to a #[+a("/docs/usage/models") model package] installed via pip. - | Currently imports the package to find it and parse its meta data. + | Get path to an installed package. Mainly used to resolve the location of + | #[+a("/docs/usage/models") model packages]. Currently imports the package + | to find its path. +aside-code("Example"). - util.get_model_package_path('en_core_web_sm') - # /usr/lib/python3.6/site-packages/en_core_web_sm/en_core_web_sm-1.2.0 + util.get_package_path('en_core_web_sm') + # /usr/lib/python3.6/site-packages/en_core_web_sm +table(["Name", "Type", "Description"]) +row @@ -137,37 +170,8 @@ p +footrow +cell returns +cell #[code Path] - +cell Path to model data directory. - -+h(2, "parse_package_meta") util.parse_package_meta - +tag function - -p - | Check if a #[code meta.json] exists in a model package and return its - | contents. - -+aside-code("Example"). - if util.is_package('en_core_web_sm'): - path = util.get_model_package_path('en_core_web_sm') - meta = util.parse_package_meta(path, require=True) - # {'name': 'core_web_sm', 'lang': 'en', ...} - -+table(["Name", "Type", "Description"]) - +row - +cell #[code package_path] - +cell #[code Path] +cell Path to model package directory. - +row - +cell #[code require] - +cell #[code bool] - +cell If #[code True], raise error if no #[code meta.json] is found. - - +footrow - +cell returns - +cell dict / #[code None] - +cell Model meta data or #[code None]. 
- +h(2, "is_in_jupyter") util.is_in_jupyter +tag function +tag-new(2) From eb703f7656a85fa3a7bf01877edd3b9bfd7f7e7d Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 00:32:43 +0200 Subject: [PATCH 20/25] Update API docs --- website/docs/api/_data.json | 3 ++- website/docs/api/spacy.jade | 11 ++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/website/docs/api/_data.json b/website/docs/api/_data.json index f6a6a7e31..2af9bca1b 100644 --- a/website/docs/api/_data.json +++ b/website/docs/api/_data.json @@ -158,7 +158,8 @@ "binder": { "title": "Binder", - "tag": "class" + "tag": "class", + "source": "spacy/tokens/binder.pyx" }, "annotation": { diff --git a/website/docs/api/spacy.jade b/website/docs/api/spacy.jade index f2fcfde2c..a45307378 100644 --- a/website/docs/api/spacy.jade +++ b/website/docs/api/spacy.jade @@ -11,8 +11,13 @@ p | the name of an installed | #[+a("/docs/usage/saving-loading#generating") model package], a unicode | path or a #[code Path]-like object. spaCy will try resolving the load - | argument in this order. The #[code Language] class to initialise will be - | determined based on the model's settings. + | argument in this order. If a model is loaded from a shortcut link or + | package name, spaCy will assume it's a Python package and import it and + | call the model's own #[code load()] method. If a model is loaded from a + | path, spaCy will assume it's a data directory, read the language and + | pipeline settings off the meta.json and initialise the #[code Language] + | class. The data will be loaded in via + | #[+api("language#from_disk") #[code Language.from_disk()]]. +aside-code("Example"). nlp = spacy.load('en') # shortcut link @@ -20,7 +25,7 @@ p nlp = spacy.load('/path/to/en') # unicode path nlp = spacy.load(Path('/path/to/en')) # pathlib Path - nlp = spacy.load('en', disable['parser', 'tagger']) + nlp = spacy.load('en', disable=['parser', 'tagger']) +table(["Name", "Type", "Description"]) +row From 01a7b10319cf8e73a0c88faf8de8f8ecb1426dfa Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 00:32:54 +0200 Subject: [PATCH 21/25] Add fallback fonts to illustrations --- website/assets/img/docs/architecture.svg | 8 ++++---- website/assets/img/docs/language_data.svg | 6 +++--- website/assets/img/docs/pipeline.svg | 6 +++--- website/assets/img/docs/tokenization.svg | 4 ++-- website/assets/img/docs/vocab_stringstore.svg | 8 ++++---- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/website/assets/img/docs/architecture.svg b/website/assets/img/docs/architecture.svg index f586b75eb..c1d12d79b 100644 --- a/website/assets/img/docs/architecture.svg +++ b/website/assets/img/docs/architecture.svg @@ -1,9 +1,9 @@ Language diff --git a/website/assets/img/docs/language_data.svg b/website/assets/img/docs/language_data.svg index b74fffba6..31e1a1b29 100644 --- a/website/assets/img/docs/language_data.svg +++ b/website/assets/img/docs/language_data.svg @@ -1,8 +1,8 @@ diff --git a/website/assets/img/docs/pipeline.svg b/website/assets/img/docs/pipeline.svg index 2ff00d787..8f9dc6dac 100644 --- a/website/assets/img/docs/pipeline.svg +++ b/website/assets/img/docs/pipeline.svg @@ -1,8 +1,8 @@ diff --git a/website/assets/img/docs/tokenization.svg b/website/assets/img/docs/tokenization.svg index cc185a3a7..f5b164725 100644 --- a/website/assets/img/docs/tokenization.svg +++ b/website/assets/img/docs/tokenization.svg @@ -1,7 +1,7 @@ diff --git a/website/assets/img/docs/vocab_stringstore.svg b/website/assets/img/docs/vocab_stringstore.svg 
index f660a8604..644453737 100644 --- a/website/assets/img/docs/vocab_stringstore.svg +++ b/website/assets/img/docs/vocab_stringstore.svg @@ -1,9 +1,9 @@ From 33e332e67ce7163982806dc5b45a97c6de697486 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 00:57:59 +0200 Subject: [PATCH 22/25] Remove unused export --- spacy/lang/en/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index 7b7d4e1bb..7e1da789b 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -35,4 +35,4 @@ class English(Language): Defaults = EnglishDefaults -__all__ = ['English', 'EnglishDefaults'] +__all__ = ['English'] From 84189c1cab1f8534597cbdf740a8ba51ac1d086a Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 00:58:59 +0200 Subject: [PATCH 23/25] Add 'xx' language ID for multi-language support Allows models to specify their language ID as 'xx'. --- spacy/lang/xx/__init__.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 spacy/lang/xx/__init__.py diff --git a/spacy/lang/xx/__init__.py b/spacy/lang/xx/__init__.py new file mode 100644 index 000000000..fef8c9d59 --- /dev/null +++ b/spacy/lang/xx/__init__.py @@ -0,0 +1,26 @@ +# coding: utf8 +from __future__ import unicode_literals + + +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ...language import Language +from ...attrs import LANG +from ...util import update_exc + + +class MultiLanguageDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'xx' + + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) + + +class MultiLanguage(Language): + """Language class to be used for models that support multiple languages. + This module allows models to specify their language ID as 'xx'. + """ + lang = 'xx' + Defaults = MultiLanguageDefaults + + +__all__ = ['MultiLanguage'] From eb5a8be9ade339d7c0a9c01e8075c9ee6827f749 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 01:15:44 +0200 Subject: [PATCH 24/25] Update language overview and add section on 'xx' lang class --- website/docs/api/language-models.jade | 43 +++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/website/docs/api/language-models.jade b/website/docs/api/language-models.jade index 0990de358..74007f228 100644 --- a/website/docs/api/language-models.jade +++ b/website/docs/api/language-models.jade @@ -2,7 +2,10 @@ include ../../_includes/_mixins -p spaCy currently supports the following languages and capabilities: +p + | spaCy currently provides models for the following languages and + | capabilities: + +aside-code("Download language models", "bash"). python -m spacy download en @@ -22,12 +25,16 @@ p spaCy currently supports the following languages and capabilities: +row +cell French #[code fr] - each icon in [ "pro", "pro", "con", "pro", "con", "pro", "pro", "con" ] + each icon in [ "pro", "con", "con", "pro", "con", "pro", "pro", "con" ] +cell.u-text-center #[+procon(icon)] -+h(2, "available") Available models + +row + +cell Spanish #[code es] + each icon in [ "pro", "pro", "con", "pro", "pro", "pro", "pro", "con" ] + +cell.u-text-center #[+procon(icon)] -include ../usage/_models-list +p + +button("/docs/usage/models", true, "primary") See available models +h(2, "alpha-support") Alpha tokenization support @@ -52,9 +59,35 @@ p | #[+a("https://github.com/mocobeta/janome") Janome]. 
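As an aside, not part of the patch: languages with alpha tokenization support can usually be tried out through their base Language subclasses, which provide tokenization only. The import path below follows the spacy/lang layout referenced in the table that follows; the class name and example sentence are assumptions, and Chinese and Japanese additionally need their external tokenizers (Jieba and Janome) installed.

    # Sketch: tokenization-only usage of an alpha-support language (assumed class name).
    from spacy.lang.sv import Swedish

    nlp = Swedish()                        # base Language subclass – no statistical models
    doc = nlp(u'Det hΓ€r Γ€r en mening.')    # placeholder Swedish sentence
    print([token.text for token in doc])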
+table([ "Language", "Code", "Source" ]) - each language, code in { es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian BokmΓ₯l", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" } + each language, code in { it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian BokmΓ₯l", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" } +row +cell #{language} +cell #[code=code] +cell +src(gh("spaCy", "spacy/lang/" + code)) lang/#{code} + ++h(2, "multi-language") Multi-language support + +tag-new(2) + +p + | As of v2.0, spaCy supports models trained on more than one language. This + | is especially useful for named entity recognition. The language ID used + | for multi-language or language-neutral models is #[code xx]. The + | language class, a generic subclass containing only the base language data, + | can be found in #[+src(gh("spaCy", "spacy/lang/xx")) lang/xx]. + +p + | To load your model with the neutral, multi-language class, simply set + | #[code "language": "xx"] in your + | #[+a("/docs/usage/saving-loading#models-generating") model package]'s + | meta.json. You can also import the class directly, or call + | #[+api("util#get_lang_class") #[code util.get_lang_class()]] for + | lazy-loading. + ++code("Standard import"). + from spacy.lang.xx import MultiLanguage + nlp = MultiLanguage() + ++code("With lazy-loading"). + from spacy.util import get_lang_class + nlp = get_lang_class('xx') From 10d05c2b9274073da0edac0379e3a42d97816992 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 01:30:12 +0200 Subject: [PATCH 25/25] Fix typos, wording and formatting --- .../docs/usage/_spacy-101/_similarity.jade | 2 +- .../usage/language-processing-pipeline.jade | 2 +- website/docs/usage/spacy-101.jade | 10 ++- website/docs/usage/v2.jade | 85 +++++++++---------- 4 files changed, 49 insertions(+), 50 deletions(-) diff --git a/website/docs/usage/_spacy-101/_similarity.jade b/website/docs/usage/_spacy-101/_similarity.jade index c99bc9658..6eed1eb7f 100644 --- a/website/docs/usage/_spacy-101/_similarity.jade +++ b/website/docs/usage/_spacy-101/_similarity.jade @@ -5,7 +5,7 @@ p | #[strong how similar they are]. Predicting similarity is useful for | building recommendation systems or flagging duplicates. For example, you | can suggest a user content that's similar to what they're currently - | looking at, or label a support ticket as a duplicate, if it's very + | looking at, or label a support ticket as a duplicate if it's very | similar to an already existing one. p diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/docs/usage/language-processing-pipeline.jade index 1392fc2f8..ffad01ead 100644 --- a/website/docs/usage/language-processing-pipeline.jade +++ b/website/docs/usage/language-processing-pipeline.jade @@ -144,7 +144,7 @@ p +table(["Argument", "Type", "Description"]) +row +cell #[code vocab] - +cell #[coce Vocab] + +cell #[code Vocab] +cell | Shared data between components, including strings, morphology, | vectors etc. diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index 8b2d0c17e..6a1f780dc 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -65,7 +65,7 @@ p | spaCy provides a variety of linguistic annotations to give you insights | into a text's grammatical structure. This includes the word types, | i.e. 
the parts of speech, and how the words are related to each other.
-    | For example, if you're analysing text, it makes a #[em huge] difference
+    | For example, if you're analysing text, it makes a huge difference
     | whether a noun is the subject of a sentence, or the object – or whether
     | "google" is used as a verb, or refers to the website or company in a
     | specific context.
@@ -119,9 +119,11 @@ include _spacy-101/_named-entities
 
 +infobox
     | To learn more about entity recognition in spaCy, how to
-    | #[strong add your own entities] to a document and how to train and update
-    | the entity predictions of a model, see the usage guide on
-    | #[+a("/docs/usage/entity-recognition") named entity recognition].
+    | #[strong add your own entities] to a document and how to
+    | #[strong train and update] the entity predictions of a model, see the
+    | usage guides on
+    | #[+a("/docs/usage/entity-recognition") named entity recognition] and
+    | #[+a("/docs/usage/training-ner") training the named entity recognizer].
 
 +h(2, "vectors-similarity") Word vectors and similarity
     +tag-model("vectors")
diff --git a/website/docs/usage/v2.jade b/website/docs/usage/v2.jade
index 23b234c43..25aae8706 100644
--- a/website/docs/usage/v2.jade
+++ b/website/docs/usage/v2.jade
@@ -20,19 +20,18 @@ p
         nlp = Language(pipeline=['my_factory', mycomponent])
 
 p
-    | It's now much easier to customise the pipeline with your own components.
-    | Components are functions that receive a #[code Doc] object, modify and
-    | return it. If your component is stateful, you'll want to create a new one
-    | for each pipeline. You can do that by defining and registering a factory
-    | which receives the shared #[code Vocab] object and returns a component.
-
-p
-    | spaCy's default components – the vectorizer, tagger, parser and entity
-    | recognizer, can be added to your pipeline by using their string IDs.
-    | This way, you won't have to worry about finding and implementing them –
-    | to use the default tagger, simply add #[code "tagger"] to the pipeline,
+    | It's now much easier to #[strong customise the pipeline] with your own
+    | components, functions that receive a #[code Doc] object, modify and
+    | return it. If your component is stateful, you can define and register a
+    | factory which receives the shared #[code Vocab] object and returns a
+    | component. spaCy's default components can be added to your pipeline by
+    | using their string IDs. This way, you won't have to worry about finding
+    | and implementing them – simply add #[code "tagger"] to the pipeline,
     | and spaCy will know what to do.
 
++image
+    include ../../assets/img/docs/pipeline.svg
+
 +infobox
     | #[strong API:] #[+api("language") #[code Language]]
     | #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text]
@@ -96,11 +95,10 @@ p
     | #[code Language] class, or load a model that initialises one. This allows
     | languages to contain more custom data, e.g. lemmatizer lookup tables, or
     | complex regular expressions. The language data has also been tidied up
-    | and simplified. It's now also possible to overwrite the functions that
-    | compute lexical attributes like #[code like_num], and supply
-    | language-specific syntax iterators, e.g. to determine noun chunks. spaCy
-    | now also supports simple lookup-based lemmatization. The data is stored
-    | in a dictionary mapping a string to its lemma.
+    | and simplified. spaCy now also supports simple lookup-based lemmatization.
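To make the custom-pipeline paragraph above concrete, here is a minimal sketch – an editorial illustration, not part of the patch, with invented names – of the two kinds of extension it describes: a plain component that receives a Doc, modifies it and returns it, and a factory that receives the shared Vocab and returns a component.

    # Hypothetical names: 'print_length' and 'my_factory' are not part of spaCy.
    def print_length(doc):
        # A pipeline component: take the Doc, do some work, return the Doc.
        print("Doc has %d tokens" % len(doc))
        return doc

    def my_factory(vocab):
        # A factory: receive the shared Vocab and return a (possibly stateful) component.
        def component(doc):
            # state derived from vocab could be used here
            return doc
        return component

How components and factories are registered and added to a pipeline is covered in the processing-pipelines usage guide linked from the infobox above.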
+ ++image + include ../../assets/img/docs/language_data.svg +infobox | #[strong API:] #[+api("language") #[code Language]] @@ -111,13 +109,10 @@ p +aside-code("Example"). from spacy.matcher import Matcher - from spacy.attrs import LOWER, IS_PUNCT matcher = Matcher(nlp.vocab) - matcher.add('HelloWorld', None, - [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}], - [{LOWER: 'hello'}, {LOWER: 'world'}]) + matcher.add('HEARTS', None, [{'ORTH': '❀️', 'OP': '+'}]) assert len(matcher) == 1 - assert 'HelloWorld' in matcher + assert 'HEARTS' in matcher p | Patterns can now be added to the matcher by calling @@ -157,28 +152,8 @@ p +cell #[+api("language#to_disk") #[code Language.to_disk]] +row - +cell #[code Tokenizer.load] - +cell - | #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]] - | #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]] - - +row - +cell #[code Tagger.load] - +cell - | #[+api("tagger#from_disk") #[code Tagger.from_disk]] - | #[+api("tagger#from_bytes") #[code Tagger.from_bytes]] - - +row - +cell #[code DependencyParser.load] - +cell - | #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]] - | #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]] - - +row - +cell #[code EntityRecognizer.load] - +cell - | #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]] - | #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]] + +cell #[code Language.create_make_doc] + +cell #[+api("language#attributes") #[code Language.tokenizer]] +row +cell @@ -212,6 +187,28 @@ p | #[+api("stringstore#to_disk") #[code StringStore.to_disk]] | #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]] + +row + +cell #[code Tokenizer.load] + +cell - + + +row + +cell #[code Tagger.load] + +cell + | #[+api("tagger#from_disk") #[code Tagger.from_disk]] + | #[+api("tagger#from_bytes") #[code Tagger.from_bytes]] + + +row + +cell #[code DependencyParser.load] + +cell + | #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]] + | #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]] + + +row + +cell #[code EntityRecognizer.load] + +cell + | #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]] + | #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]] + +row +cell #[code Matcher.load] +cell - @@ -232,7 +229,7 @@ p +row +cell #[code Doc.read_bytes] - +cell + +cell #[+api("binder") #[code Binder]] +row +cell #[code Token.is_ancestor_of]