From 3d22fcaf0b3c7e4114153b5b3e1d8eb078fa8e44 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 26 May 2017 14:02:59 -0500
Subject: [PATCH 01/10] Return None from parser if there are no annotations

---
 spacy/syntax/nn_parser.pyx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 35966d536..b7aca26b8 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -432,6 +432,8 @@ cdef class Parser:
                                          0.0)
         todo = [(s, g) for (s, g) in zip(states, golds)
                 if not s.is_final() and g is not None]
+        if not todo:
+            return None
 
         backprops = []
         d_tokvecs = state2vec.ops.allocate(tokvecs.shape)

From 73a643d32a20d8c4a109bf3a92dff645c370bd17 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 27 May 2017 08:20:13 -0500
Subject: [PATCH 02/10] Don't randomise pipeline for training, and don't update if no gradient

---
 spacy/language.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index e4c18f8ca..7adae0ed5 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -212,18 +212,17 @@ class Language(object):
         """
         tok2vec = self.pipeline[0]
         feats = tok2vec.doc2feats(docs)
-        procs = list(self.pipeline[1:])
-        random.shuffle(procs)
         grads = {}
         def get_grads(W, dW, key=None):
            grads[key] = (W, dW)
-        for proc in procs:
+        for proc in self.pipeline[1:]:
            if not hasattr(proc, 'update'):
                continue
            tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
            d_tokvecses = proc.update((docs, tokvecses), golds,
                                      drop=drop, sgd=get_grads, losses=losses)
-           bp_tokvecses(d_tokvecses, sgd=sgd)
+           if d_tokvecses is not None:
+               bp_tokvecses(d_tokvecses, sgd=sgd)
         for key, (W, dW) in grads.items():
            sgd(W, dW, key=key)
         # Clear the tensor variable, to free GPU memory.

From de13fe030548acf86e759e2c16c85712ab8e30bb Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 27 May 2017 08:20:32 -0500
Subject: [PATCH 03/10] Remove length cap on sentences

---
 spacy/cli/train.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index b25cdcbd5..ed146cb24 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -70,12 +70,12 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
     optimizer = nlp.begin_training(lambda: corpus.train_tuples, use_gpu=use_gpu)
-    print("Itn.\tDep. Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
+    print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
     try:
         for i in range(n_iter):
             with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar:
                 train_docs = corpus.train_docs(nlp, projectivize=True,
-                                               gold_preproc=False, max_length=1000)
+                                               gold_preproc=False, max_length=0)
                 losses = {}
                 for batch in minibatch(train_docs, size=batch_sizes):
                     docs, golds = zip(*batch)

From a8e58e04efc5b57a2425595eaf1e049c23a37352 Mon Sep 17 00:00:00 2001
From: ines
Date: Sat, 27 May 2017 17:57:10 +0200
Subject: [PATCH 04/10] Add symbols class to punctuation rules to handle emoji (see #1088)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently doesn't work for Hungarian, because of conflicts with the custom
punctuation rules. Also doesn't take multi-character emoji like 👩🏽‍💻 into
account.
---
 spacy/lang/bn/punctuation.py             | 10 +++++-----
 spacy/lang/char_classes.py               |  5 +++--
 spacy/lang/punctuation.py                | 11 ++++++-----
 spacy/tests/tokenizer/test_exceptions.py | 12 +++++++++---
 4 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/spacy/lang/bn/punctuation.py b/spacy/lang/bn/punctuation.py
index 66b7d967c..96485dd55 100644
--- a/spacy/lang/bn/punctuation.py
+++ b/spacy/lang/bn/punctuation.py
@@ -1,8 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS
-from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
+from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS
 
 _currency = r"\$|¢|£|€|¥|฿|৳"
 _quotes = QUOTES.replace("'", '')
 _list_punct = LIST_PUNCT + '। ॥'.strip().split()
 
-_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES)
+_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)
 
-_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES +
+_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
              [r'(?<=[0-9])\+',
               r'(?<=°[FfCcKk])\.',
               r'(?<=[0-9])(?:{})'.format(_currency),
               r'(?<=[0-9])(?:{})'.format(UNITS),
               r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)])
 
-_infixes = (LIST_ELLIPSES +
+_infixes = (LIST_ELLIPSES + LIST_ICONS +
             [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
              r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
              r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py
index 5b81eddde..bec685646 100644
--- a/spacy/lang/char_classes.py
+++ b/spacy/lang/char_classes.py
@@ -20,7 +20,6 @@
 _upper = [_latin_upper]
 _lower = [_latin_lower]
 _uncased = [_bengali, _hebrew]
-
 ALPHA = merge_char_classes(_upper + _lower + _uncased)
 ALPHA_LOWER = merge_char_classes(_lower + _uncased)
 ALPHA_UPPER = merge_char_classes(_upper + _uncased)
@@ -33,13 +32,14 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
 _punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
 _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «'
 _hyphens = '- – — -- ---'
-
+_other_symbols = r'[\p{So}]'
 
 UNITS = merge_chars(_units)
 CURRENCY = merge_chars(_currency)
 QUOTES = merge_chars(_quotes)
 PUNCT = merge_chars(_punct)
 HYPHENS = merge_chars(_hyphens)
+ICONS = _other_symbols
 
 LIST_UNITS = split_chars(_units)
 LIST_CURRENCY = split_chars(_currency)
@@ -47,3 +47,4 @@ LIST_QUOTES = split_chars(_quotes)
 LIST_PUNCT = split_chars(_punct)
 LIST_HYPHENS = split_chars(_hyphens)
 LIST_ELLIPSES = [r'\.\.+', '…']
+LIST_ICONS = [_other_symbols]
diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py
index 74bb28f5f..680f5cff0 100644
--- a/spacy/lang/punctuation.py
+++ b/spacy/lang/punctuation.py
@@ -2,15 +2,16 @@
 from __future__ import unicode_literals
 
 from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
-from .char_classes import CURRENCY, UNITS
+from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
+from .char_classes import QUOTES, CURRENCY, UNITS
 
 
 _prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
-             LIST_CURRENCY)
+             LIST_CURRENCY + LIST_ICONS)
 
 
-_suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
+_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
+             ["'s", "'S", "’s", "’S"] +
              [r'(?<=[0-9])\+',
               r'(?<=°[FfCcKk])\.',
               r'(?<=[0-9])(?:{})'.format(CURRENCY),
@@ -19,7 +20,7 @@ _suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QU
               r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)])
 
 
-_infixes = (LIST_ELLIPSES +
+_infixes = (LIST_ELLIPSES + LIST_ICONS +
             [r'(?<=[0-9])[+\-\*^](?=[0-9-])',
              r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
              r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py
index aab27714e..70fb103dc 100644
--- a/spacy/tests/tokenizer/test_exceptions.py
+++ b/spacy/tests/tokenizer/test_exceptions.py
@@ -1,7 +1,4 @@
 # coding: utf-8
-"""Test that tokenizer exceptions and emoticons are handled correctly."""
-
-
 from __future__ import unicode_literals
 
 import pytest
@@ -39,3 +36,12 @@ def test_tokenizer_handles_emoticons(tokenizer):
 def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
     tokens = tokenizer(text)
     assert len(tokens) == length
+
+
+@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
+                                         ('i💙you', 3), ('🤘🤘yay!', 4)])
+def test_tokenizer_handles_emoji(tokenizer, text, length):
+    exceptions = ["hu"]
+    tokens = tokenizer(text)
+    if tokens[0].lang_ not in exceptions:
+        assert len(tokens) == length

From e05bcd6aa838a7098c699a920e92628296961927 Mon Sep 17 00:00:00 2001
From: ines
Date: Sat, 27 May 2017 17:57:46 +0200
Subject: [PATCH 05/10] Update docs to reflect flattened model meta.json

Don't use "setup" key and instead, keep "lang" on root level and add
"pipeline".
---
 .../usage/language-processing-pipeline.jade | 22 ++++++++-----------
 website/docs/usage/saving-loading.jade      | 18 +++++++--------
 2 files changed, 17 insertions(+), 23 deletions(-)

diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/docs/usage/language-processing-pipeline.jade
index ce23a1666..1392fc2f8 100644
--- a/website/docs/usage/language-processing-pipeline.jade
+++ b/website/docs/usage/language-processing-pipeline.jade
@@ -19,19 +19,17 @@ p
 
 p
   | When you load a model, spaCy first consults the model's
-  | #[+a("/docs/usage/saving-loading#models-generating") meta.json] for its
-  | #[code setup] details. This typically includes the ID of a language class,
+  | #[+a("/docs/usage/saving-loading#models-generating") meta.json]. The
+  | meta typically includes the model details, the ID of a language class,
   | and an optional list of pipeline components. spaCy then does the
   | following:
 
 +aside-code("meta.json (excerpt)", "json").
     {
        "name": "example_model",
+       "lang": "en"
        "description": "Example model for spaCy",
-       "setup": {
-           "lang": "en",
-           "pipeline": ["token_vectors", "tagger"]
-       }
+       "pipeline": ["token_vectors", "tagger"]
    }
 
 +list("numbers")
@@ -287,17 +285,15 @@ p
 
 p
   | In the model package's meta.json, specify the language class and pipeline
-  | IDs in #[code setup]:
+  | IDs:
 
 +code("meta.json (excerpt)", "json").
     {
-       "name": "my_sentiment_model",
+       "name": "sentiment_model",
+       "lang": "en",
        "version": "1.0.0",
        "spacy_version": ">=2.0.0,<3.0.0",
-       "setup": {
-           "lang": "en",
-           "pipeline": ["vectorizer", "sentiment"]
-       }
+       "pipeline": ["vectorizer", "sentiment"]
    }
 
 p
@@ -307,7 +303,7 @@ p
   | by your custom #[code "sentiment"] factory.
 
 +code.
-    nlp = spacy.load('my_sentiment_model')
+    nlp = spacy.load('en_sentiment_model')
     doc = nlp(u'I love pizza')
     assert doc.sentiment
diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade
index 477db925c..1ecb7d7ee 100644
--- a/website/docs/usage/saving-loading.jade
+++ b/website/docs/usage/saving-loading.jade
@@ -74,16 +74,14 @@ p
 +aside-code("meta.json", "json").
     {
        "name": "example_model",
+       "lang": "en",
        "version": "1.0.0",
        "spacy_version": ">=2.0.0,<3.0.0",
        "description": "Example model for spaCy",
        "author": "You",
        "email": "you@example.com",
        "license": "CC BY-SA 3.0",
-       "setup": {
-           "lang": "en",
-           "pipeline": ["token_vectors", "tagger"]
-       }
+       "pipeline": ["token_vectors", "tagger"]
    }
 
 +code(false, "bash").
@@ -110,9 +108,9 @@ p
 +h(3, "models-custom") Customising the model setup
 
 p
-  | The meta.json includes a #[code setup] key that lets you customise how
-  | the model should be initialised and loaded. You can define the language
-  | data to be loaded and the
+  | The meta.json includes the model details, like name, requirements and
+  | license, and lets you customise how the model should be initialised and
+  | loaded. You can define the language data to be loaded and the
   | #[+a("/docs/usage/language-processing-pipeline") processing pipeline] to
   | execute.
@@ -183,9 +181,9 @@ p
 p
   | To load a model from a data directory, you can use
   | #[+api("spacy#load") #[code spacy.load()]] with the local path. This will
-  | look for a meta.json in the directory and use the #[code setup] details
-  | to initialise a #[code Language] class with a processing pipeline and
-  | load in the model data.
+  | look for a meta.json in the directory and use the #[code lang] and
+  | #[code pipeline] settings to initialise a #[code Language] class with a
+  | processing pipeline and load in the model data.
 
 +code.
     nlp = spacy.load('/path/to/model')

From 0d33ead507bfc79ac341fd9b0bbe3a1e8aacc1d9 Mon Sep 17 00:00:00 2001
From: ines
Date: Sat, 27 May 2017 17:58:06 +0200
Subject: [PATCH 06/10] Fix initialisation of Doc in lightning tour example

---
 website/docs/usage/lightning-tour.jade | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade
index 4a9a2315f..eefb7a11a 100644
--- a/website/docs/usage/lightning-tour.jade
+++ b/website/docs/usage/lightning-tour.jade
@@ -129,13 +129,14 @@ p
 +code.
     import spacy
     from spacy.tokens.doc import Doc
+    from spacy.vocab import Vocab
 
     nlp = spacy.load('en')
     moby_dick = open('moby_dick.txt', 'r')
     doc = nlp(moby_dick)
     doc.to_disk('/moby_dick.bin')
 
-    new_doc = Doc().from_disk('/moby_dick.bin')
+    new_doc = Doc(Vocab()).from_disk('/moby_dick.bin')
 
 +infobox
   | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]

From 22bf5f63bfb4a37fc8b01724c121d2abbfecaf6e Mon Sep 17 00:00:00 2001
From: ines
Date: Sat, 27 May 2017 17:58:18 +0200
Subject: [PATCH 07/10] Update Matcher docs and add social media analysis example

---
 website/docs/usage/rule-based-matching.jade | 119 +++++++++++++++++++-
 1 file changed, 115 insertions(+), 4 deletions(-)

diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade
index a54b70b89..fde6da6ef 100644
--- a/website/docs/usage/rule-based-matching.jade
+++ b/website/docs/usage/rule-based-matching.jade
@@ -11,7 +11,7 @@ p
   | You can also associate patterns with entity IDs, to allow some basic
   | entity linking or disambiguation.
 
-+aside("What about \"real\" regular expressions?")
+//-+aside("What about \"real\" regular expressions?")
 
 +h(2, "adding-patterns") Adding patterns
 
@@ -119,7 +119,7 @@ p
 +code.
     # Add a new custom flag to the vocab, which is always False by default.
     # BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span.
-    BAD_HTML_FLAG = doc.vocab.add_flag(lambda text: False)
+    BAD_HTML_FLAG = nlp.vocab.add_flag(lambda text: False)
 
     def merge_and_flag(matcher, doc, i, matches):
         match_id, start, end = matches[i]
@@ -221,7 +221,7 @@ p
         +cell match 0 or 1 times
         +cell optional, max one
 
-+h(3, "quantifiers-example1") Quantifiers example: Using linguistic annotations
++h(2, "example1") Example: Using linguistic annotations
 
 p
   | Let's say you're analysing user comments and you want to find out what
@@ -283,7 +283,7 @@ p
     # set manual=True to make displaCy render straight from a dictionary
     displacy.serve(matched_sents, style='ent', manual=True)
 
-+h(3, "quantifiers-example2") Quantifiers example: Phone numbers
++h(2, "example2") Example: Phone numbers
 
 p
   | Phone numbers can have many different formats and matching them is often
@@ -320,3 +320,114 @@ p
   | It'll produce more predictable results, is much easier to modify and
   | extend, and doesn't require any training data – only a set of
   | test cases.
+
++h(2, "example3") Example: Hashtags and emoji on social media
+
+p
+  | Social media posts, especially tweets, can be difficult to work with.
+  | They're very short and often contain various emoji and hashtags. By only
+  | looking at the plain text, you'll lose a lot of valuable semantic
+  | information.
+
+p
+  | Let's say you've extracted a large sample of social media posts on a
+  | specific topic, for example posts mentioning a brand name or product.
+  | As the first step of your data exploration, you want to filter out posts
+  | containing certain emoji and use them to assign a general sentiment
+  | score, based on whether the expressed emotion is positive or negative,
+  | e.g. #[span.o-icon.o-icon--inline 😀] or #[span.o-icon.o-icon--inline 😞].
+  | You also want to find, merge and label hashtags like
+  | #[code #MondayMotivation], to be able to ignore or analyse them later.
+
++aside("Note on sentiment analysis")
+  | Ultimately, sentiment analysis is not always #[em that] easy. In
+  | addition to the emoji, you'll also want to take specific words into
+  | account and check the #[code subtree] for intensifiers like "very", to
+  | increase the sentiment score. At some point, you might also want to train
+  | a sentiment model. However, the approach described in this example is
+  | very useful for #[strong bootstrapping rules to gather training data].
+  | It's also an incredibly fast way to gather first insights into your data
+  | – with about 1 million tweets, you'd be looking at a processing time of
+  | #[strong under 1 minute].
+
+p
+  | By default, spaCy's tokenizer will split emoji into separate tokens. This
+  | means that you can create a pattern for one or more emoji tokens. In this
+  | case, a sequence of identical emoji should be treated as one instance.
+  | Valid hashtags usually consist of a #[code #], plus a sequence of
+  | ASCII characters with no whitespace, making them easy to match as well.
+
++code.
+    from spacy.lang.en import English
+    from spacy.matcher import Matcher
+
+    nlp = English() # we only want the tokenizer, so no need to load a model
+    matcher = Matcher(nlp.vocab)
+
+    pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍'] # positive emoji
+    neg_emoji = [u'😞', u'😠', u'😩', u'😢', u'😭', u'😒'] # negative emoji
+
+    # add patterns to match one or more emoji tokens
+    pos_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in pos_emoji]
+    neg_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in neg_emoji]
+
+    matcher.add('HAPPY', label_sentiment, *pos_patterns) # add positive pattern
+    matcher.add('SAD', label_sentiment, *neg_patterns) # add negative pattern
+
+    # add pattern to merge valid hashtag, i.e. '#' plus any ASCII token
+    matcher.add('HASHTAG', merge_hashtag, [{'ORTH': '#'}, {'IS_ASCII': True}])
+
+p
+  | Because the #[code on_match] callback receives the ID of each match, you
+  | can use the same function to handle the sentiment assignment for both
+  | the positive and negative pattern. To keep it simple, we'll either add
+  | or subtract #[code 0.1] points – this way, the score will also reflect
+  | combinations of emoji, even positive #[em and] negative ones.
+
+p
+  | With a library like
+  | #[+a("https://github.com/bcongdon/python-emojipedia") Emojipedia],
+  | we can also retrieve a short description for each emoji – for example,
+  | #[span.o-icon.o-icon--inline 😍]'s official title is "Smiling Face With
+  | Heart-Eyes". Assigning it to the merged token's norm will make it
+  | available as #[code token.norm_].
+
++code.
+    from emojipedia import Emojipedia # installation: pip install emojipedia
+
+    def label_sentiment(matcher, doc, i, matches):
+        match_id, start, end = matches[i]
+        if match_id is 'HAPPY':
+            doc.sentiment += 0.1 # add 0.1 for positive sentiment
+        elif match_id is 'SAD':
+            doc.sentiment -= 0.1 # subtract 0.1 for negative sentiment
+        span = doc[start : end]
+        emoji = Emojipedia.search(span[0].text) # get data for emoji
+        span.merge(norm=emoji.title) # merge span and set NORM to emoji title
+
+p
+  | To label the hashtags, we first need to add a new custom flag.
+  | #[code IS_HASHTAG] will be the flag's ID, which you can use to assign it
+  | to the hashtag's span, and check its value via a token's
+  | #[+api("token#check_flag") #[code code check_flag()]] method. On each
+  | match, we merge the hashtag and assign the flag.
+
++code.
+    # Add a new custom flag to the vocab, which is always False by default
+    IS_HASHTAG = nlp.vocab.add_flag(lambda text: False)
+
+    def merge_hashtag(matcher, doc, i, matches):
+        match_id, start, end = matches[i]
+        span = doc[start : end]
+        span.merge() # merge hashtag
+        span.set_flag(IS_HASHTAG, True) # set IS_HASHTAG to True
+
+p
+  | To process a stream of social media posts, we can use
+  | #[+api("language#pipe") #[code Language.pipe()]], which will return a
+  | stream of #[code Doc] objects that we can pass to
+  | #[+api("matcher#pipe") #[code Matcher.pipe()]].
+
++code.
+    docs = nlp.pipe(LOTS_OF_TWEETS)
+    matches = matcher.pipe(docs)

From 086a06e7d750da5852a447effdb32a376bd86ec7 Mon Sep 17 00:00:00 2001
From: ines
Date: Sat, 27 May 2017 20:01:46 +0200
Subject: [PATCH 08/10] Fix CLI docstrings and add command as first argument

Workaround for Plac
---
 spacy/__init__.py     | 6 +++++-
 spacy/cli/convert.py  | 5 +++--
 spacy/cli/download.py | 7 ++++---
 spacy/cli/info.py     | 2 +-
 spacy/cli/link.py     | 5 +++--
 spacy/cli/package.py  | 5 +++--
 spacy/cli/train.py    | 6 ++++--
 7 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/spacy/__init__.py b/spacy/__init__.py
index 8dc0937f5..6beb7955e 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import importlib
 
 from .compat import basestring_
-from .cli.info import info
+from .cli.info import info as cli_info
 from .glossary import explain
 from .deprecated import resolve_load_name
 from . import util
@@ -20,3 +20,7 @@ def load(name, **overrides):
     overrides['meta'] = meta
     overrides['path'] = model_path
     return cls(**overrides)
+
+
+def info(model=None, markdown=False):
+    return cli_info(None, model, markdown)
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index e95ffd08b..82b39bba2 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -24,8 +24,9 @@ CONVERTERS = {
     n_sents=("Number of sentences per doc", "option", "n", float),
     morphology=("Enable appending morphology to tags", "flag", "m", bool)
 )
-def convert(_, input_file, output_dir, n_sents, morphology):
-    """Convert files into JSON format for use with train command and other
+def convert(cmd, input_file, output_dir, n_sents, morphology):
+    """
+    Convert files into JSON format for use with train command and other
     experiment management functions.
     """
     input_path = Path(input_file)
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index fdcacb891..b6e5549da 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -17,8 +17,9 @@ from .. import about
     direct=("force direct download. Needs model name with version and won't "
             "perform compatibility check", "flag", "d", bool)
 )
-def download(model, direct=False):
-    """Download compatible model from default download path using pip. Model
+def download(cmd, model, direct=False):
+    """
+    Download compatible model from default download path using pip. Model
     can be shortcut, model name or, if --direct flag is set, full model name
     with version.
     """
@@ -31,7 +32,7 @@ def download(model, direct=False):
     version = get_version(model_name, compatibility)
     download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
     try:
-        link(model_name, model, force=True)
+        link(None, model_name, model, force=True)
     except:
         # Dirty, but since spacy.download and the auto-linking is mostly
         # a convenience wrapper, it's best to show a success message and
diff --git a/spacy/cli/info.py b/spacy/cli/info.py
index 6f7467521..75aac10c7 100644
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@@ -14,7 +14,7 @@ from .. import util
     model=("optional: shortcut link of model", "positional", None, str),
     markdown=("generate Markdown for GitHub issues", "flag", "md", str)
 )
-def info(model=None, markdown=False):
+def info(cmd, model=None, markdown=False):
     """Print info about spaCy installation. If a model shortcut link is
     speficied as an argument, print model information. Flag --markdown
     prints details in Markdown for easy copy-pasting to GitHub issues.
diff --git a/spacy/cli/link.py b/spacy/cli/link.py
index 1feef8bce..9aecdabfe 100644
--- a/spacy/cli/link.py
+++ b/spacy/cli/link.py
@@ -14,8 +14,9 @@ from .. import util
     link_name=("name of shortuct link to create", "positional", None, str),
     force=("force overwriting of existing link", "flag", "f", bool)
 )
-def link(origin, link_name, force=False):
-    """Create a symlink for models within the spacy/data directory. Accepts
+def link(cmd, origin, link_name, force=False):
+    """
+    Create a symlink for models within the spacy/data directory. Accepts
     either the name of a pip package, or the local path to the model data
     directory. Linking models allows loading them via spacy.load(link_name).
     """
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index 9acd0a2fa..1c3128d99 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -18,8 +18,9 @@ from .. import about
     meta=("path to meta.json", "option", "m", str),
     force=("force overwriting of existing folder in output directory", "flag", "f", bool)
 )
-def package(input_dir, output_dir, meta, force):
-    """Generate Python package for model data, including meta and required
+def package(cmd, input_dir, output_dir, meta=None, force=False):
+    """
+    Generate Python package for model data, including meta and required
     installation files. A new directory will be created in the specified
     output directory, and model data will be copied over.
     """
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index ed146cb24..25b53e49d 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -32,9 +32,11 @@ from .. import displacy
     no_parser=("Don't train parser", "flag", "P", bool),
     no_entities=("Don't train NER", "flag", "N", bool)
 )
-def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
-          use_gpu=False, no_tagger=False, no_parser=False, no_entities=False):
-    """Train a model. Expects data in spaCy's JSON format."""
+def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
+          use_gpu=False, no_tagger=False, no_parser=False, no_entities=False):
+    """
+    Train a model. Expects data in spaCy's JSON format.
+    """
     n_sents = n_sents or None
     output_path = util.ensure_path(output_dir)
     train_path = util.ensure_path(train_data)

From 1203959625954fc1164485883ff49e9b5f3b43c3 Mon Sep 17 00:00:00 2001
From: ines
Date: Sat, 27 May 2017 20:02:01 +0200
Subject: [PATCH 09/10] Add pipeline setting to meta.json generator

---
 spacy/cli/package.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index 1c3128d99..e78a4eeb4 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -43,7 +43,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False):
         meta = util.read_json(meta_path)
     else:
         meta = generate_meta()
-    validate_meta(meta, ['lang', 'name', 'version'])
+    meta = validate_meta(meta, ['lang', 'name', 'version'])
 
     model_name = meta['lang'] + '_' + meta['name']
     model_name_v = model_name + '-' + meta['version']
@@ -86,20 +86,32 @@ def generate_meta():
         ('email', 'Author email', False),
         ('url', 'Author website', False),
         ('license', 'License', 'CC BY-NC 3.0')]
-    prints("Enter the package settings for your model.", title="Generating meta.json")
     meta = {}
     for setting, desc, default in settings:
         response = util.get_raw_input(desc, default)
         meta[setting] = default if response == '' and default else response
+    meta['pipeline'] = generate_pipeline()
     return meta
 
 
+def generate_pipeline():
+    prints("If set to 'True', the default pipeline is used. If set to 'False', "
+           "the pipeline will be disabled. Components should be specified as a "
+           "comma-separated list of component names, e.g. vectorizer, tagger, "
+           "parser, ner. For more information, see the docs on processing pipelines.",
+           title="Enter your model's pipeline components")
+    pipeline = util.get_raw_input("Pipeline components", True)
+    replace = {'True': True, 'False': False}
+    return replace[pipeline] if pipeline in replace else pipeline.split(', ')
+
+
 def validate_meta(meta, keys):
     for key in keys:
         if key not in meta or meta[key] == '':
             prints("This setting is required to build your package.",
                    title='No "%s" setting found in meta.json' % key, exits=1)
+    return meta
 
 
 def get_template(filepath):

From ae11c8d60f07f5f9257a347f51b72d93aaea3699 Mon Sep 17 00:00:00 2001
From: ines
Date: Sat, 27 May 2017 20:02:20 +0200
Subject: [PATCH 10/10] Add emoji sentiment to lightning tour matcher example

---
 website/docs/usage/lightning-tour.jade | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade
index eefb7a11a..7de486070 100644
--- a/website/docs/usage/lightning-tour.jade
+++ b/website/docs/usage/lightning-tour.jade
@@ -149,9 +149,14 @@ p
     nlp = spacy.load('en')
     matcher = Matcher(nlp.vocab)
-    # match "Google I/O" or "Google i/o"
-    pattern = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
-    matcher.add('GoogleIO', None, pattern)
+
+    def set_sentiment(matcher, doc, i, matches):
+        doc.sentiment += 0.1
+
+    pattern1 = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
+    pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']]
+    matcher.add('GoogleIO', None, pattern1) # match "Google I/O" or "Google i/o"
+    matcher.add('HAPPY', set_sentiment, pattern2) # match one or more happy emoji
     matches = nlp(LOTS_OF TEXT)
 
 +infobox