Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2017-05-27 16:34:28 -05:00
commit 49235017bf
17 changed files with 209 additions and 67 deletions

View File

@@ -4,7 +4,7 @@ from __future__ import unicode_literals
import importlib
from .compat import basestring_
from .cli.info import info
from .cli.info import info as cli_info
from .glossary import explain
from .deprecated import resolve_load_name
from . import util
@@ -20,3 +20,7 @@ def load(name, **overrides):
overrides['meta'] = meta
overrides['path'] = model_path
return cls(**overrides)
def info(model=None, markdown=False):
return cli_info(None, model, markdown)
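A standalone sketch, not part of this diff: with the wrapper above, the CLI's info command becomes callable straight from Python, without plac injecting the first argument (assuming an 'en' shortcut link already exists):

import spacy

spacy.info()              # print details about the spaCy installation
spacy.info('en')          # print details about the linked 'en' model
spacy.info(markdown=True) # format the details for a GitHub issue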

View File

@@ -24,8 +24,9 @@ CONVERTERS = {
n_sents=("Number of sentences per doc", "option", "n", float),
morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
def convert(_, input_file, output_dir, n_sents, morphology):
"""Convert files into JSON format for use with train command and other
def convert(cmd, input_file, output_dir, n_sents, morphology):
"""
Convert files into JSON format for use with train command and other
experiment management functions.
"""
input_path = Path(input_file)

View File

@@ -17,8 +17,9 @@ from .. import about
direct=("force direct download. Needs model name with version and won't "
"perform compatibility check", "flag", "d", bool)
)
def download(model, direct=False):
"""Download compatible model from default download path using pip. Model
def download(cmd, model, direct=False):
"""
Download compatible model from default download path using pip. Model
can be shortcut, model name or, if --direct flag is set, full model name
with version.
"""
@@ -31,7 +32,7 @@ def download(model, direct=False):
version = get_version(model_name, compatibility)
download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
try:
link(model_name, model, force=True)
link(None, model_name, model, force=True)
except:
# Dirty, but since spacy.download and the auto-linking is mostly
# a convenience wrapper, it's best to show a success message and

View File

@@ -14,7 +14,7 @@ from .. import util
model=("optional: shortcut link of model", "positional", None, str),
markdown=("generate Markdown for GitHub issues", "flag", "md", str)
)
def info(model=None, markdown=False):
def info(cmd, model=None, markdown=False):
"""Print info about spaCy installation. If a model shortcut link is
specified as an argument, print model information. Flag --markdown
prints details in Markdown for easy copy-pasting to GitHub issues.

View File

@@ -14,8 +14,9 @@ from .. import util
link_name=("name of shortcut link to create", "positional", None, str),
force=("force overwriting of existing link", "flag", "f", bool)
)
def link(origin, link_name, force=False):
"""Create a symlink for models within the spacy/data directory. Accepts
def link(cmd, origin, link_name, force=False):
"""
Create a symlink for models within the spacy/data directory. Accepts
either the name of a pip package, or the local path to the model data
directory. Linking models allows loading them via spacy.load(link_name).
"""

View File

@@ -18,8 +18,9 @@ from .. import about
meta=("path to meta.json", "option", "m", str),
force=("force overwriting of existing folder in output directory", "flag", "f", bool)
)
def package(input_dir, output_dir, meta, force):
"""Generate Python package for model data, including meta and required
def package(cmd, input_dir, output_dir, meta=None, force=False):
"""
Generate Python package for model data, including meta and required
installation files. A new directory will be created in the specified
output directory, and model data will be copied over.
"""
@@ -42,7 +43,7 @@ def package(input_dir, output_dir, meta, force):
meta = util.read_json(meta_path)
else:
meta = generate_meta()
validate_meta(meta, ['lang', 'name', 'version'])
meta = validate_meta(meta, ['lang', 'name', 'version'])
model_name = meta['lang'] + '_' + meta['name']
model_name_v = model_name + '-' + meta['version']
@@ -85,20 +86,32 @@ def generate_meta():
('email', 'Author email', False),
('url', 'Author website', False),
('license', 'License', 'CC BY-NC 3.0')]
prints("Enter the package settings for your model.", title="Generating meta.json")
meta = {}
for setting, desc, default in settings:
response = util.get_raw_input(desc, default)
meta[setting] = default if response == '' and default else response
meta['pipeline'] = generate_pipeline()
return meta
def generate_pipeline():
prints("If set to 'True', the default pipeline is used. If set to 'False', "
"the pipeline will be disabled. Components should be specified as a "
"comma-separated list of component names, e.g. vectorizer, tagger, "
"parser, ner. For more information, see the docs on processing pipelines.",
title="Enter your model's pipeline components")
pipeline = util.get_raw_input("Pipeline components", True)
replace = {'True': True, 'False': False}
return replace[pipeline] if pipeline in replace else pipeline.split(', ')
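# For illustration only (not part of the commit): how responses to the
# pipeline prompt above are interpreted by the line before this comment:
#   'True'               -> True   (use the default pipeline)
#   'False'              -> False  (disable the pipeline)
#   'vectorizer, tagger' -> ['vectorizer', 'tagger']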
def validate_meta(meta, keys):
for key in keys:
if key not in meta or meta[key] == '':
prints("This setting is required to build your package.",
title='No "%s" setting found in meta.json' % key, exits=1)
return meta
def get_template(filepath):

View File

@@ -32,9 +32,11 @@ from .. import displacy
no_parser=("Don't train parser", "flag", "P", bool),
no_entities=("Don't train NER", "flag", "N", bool)
)
def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
use_gpu=False, no_tagger=False, no_parser=False, no_entities=False):
"""Train a model. Expects data in spaCy's JSON format."""
"""
Train a model. Expects data in spaCy's JSON format.
"""
n_sents = n_sents or None
output_path = util.ensure_path(output_dir)
train_path = util.ensure_path(train_data)
@@ -70,12 +72,12 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
optimizer = nlp.begin_training(lambda: corpus.train_tuples, use_gpu=use_gpu)
print("Itn.\tDep. Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
try:
for i in range(n_iter):
with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar:
train_docs = corpus.train_docs(nlp, projectivize=True,
gold_preproc=False, max_length=1000)
gold_preproc=False, max_length=0)
losses = {}
for batch in minibatch(train_docs, size=batch_sizes):
docs, golds = zip(*batch)
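A standalone sketch, not part of this diff: batch_sizes is defined outside this hunk, but a compounding schedule like the one below (hypothetical start, stop and compound values) is the kind of value minibatch expects for size:

def compounding(start, stop, compound):
    # yield an infinite series of compounding values, capped at stop
    curr = float(start)
    while True:
        yield min(curr, stop)
        curr *= compound

batch_sizes = compounding(1., 32., 1.001)  # batches grow from 1 towards 32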

View File

@@ -1,8 +1,8 @@
# coding: utf8
from __future__ import unicode_literals
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS
_currency = r"\$|¢|£|€|¥|฿|৳"
@@ -10,16 +10,16 @@ _quotes = QUOTES.replace("'", '')
_list_punct = LIST_PUNCT + '। ॥'.strip().split()
_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES)
_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)
_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES +
_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
[r'(?<=[0-9])\+',
r'(?<=°[FfCcKk])\.',
r'(?<=[0-9])(?:{})'.format(_currency),
r'(?<=[0-9])(?:{})'.format(UNITS),
r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)])
_infixes = (LIST_ELLIPSES +
_infixes = (LIST_ELLIPSES + LIST_ICONS +
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),

View File

@@ -20,7 +20,6 @@ _upper = [_latin_upper]
_lower = [_latin_lower]
_uncased = [_bengali, _hebrew]
ALPHA = merge_char_classes(_upper + _lower + _uncased)
ALPHA_LOWER = merge_char_classes(_lower + _uncased)
ALPHA_UPPER = merge_char_classes(_upper + _uncased)
@@ -33,13 +32,14 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
_punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
_quotes = r'\' \'\' " ” “ `` ` ´ , „ » «'
_hyphens = '- — -- ---'
_other_symbols = r'[\p{So}]'
UNITS = merge_chars(_units)
CURRENCY = merge_chars(_currency)
QUOTES = merge_chars(_quotes)
PUNCT = merge_chars(_punct)
HYPHENS = merge_chars(_hyphens)
ICONS = _other_symbols
LIST_UNITS = split_chars(_units)
LIST_CURRENCY = split_chars(_currency)
@@ -47,3 +47,4 @@ LIST_QUOTES = split_chars(_quotes)
LIST_PUNCT = split_chars(_punct)
LIST_HYPHENS = split_chars(_hyphens)
LIST_ELLIPSES = [r'\.\.+', '…']
LIST_ICONS = [_other_symbols]
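A standalone sketch, not part of this diff: the new ICONS class is built on \p{So}, the Unicode "Symbol, other" category covering emoji and pictographs, which needs a Unicode-property-aware engine such as the third-party regex module:

import regex

icons = regex.compile(r'[\p{So}]')
assert icons.search(u'🤘yay!')          # emoji fall under Symbol, other
assert icons.search(u'a + b') is None   # '+' is Symbol, math, not \p{So}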

View File

@@ -2,15 +2,16 @@
from __future__ import unicode_literals
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
from .char_classes import CURRENCY, UNITS
from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
from .char_classes import QUOTES, CURRENCY, UNITS
_prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
LIST_CURRENCY)
LIST_CURRENCY + LIST_ICONS)
_suffixes = (["'s", "'S", "s", "S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
["'s", "'S", "s", "S"] +
[r'(?<=[0-9])\+',
r'(?<=°[FfCcKk])\.',
r'(?<=[0-9])(?:{})'.format(CURRENCY),
@@ -19,7 +20,7 @@ _suffixes = (["'s", "'S", "s", "S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QU
r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)])
_infixes = (LIST_ELLIPSES +
_infixes = (LIST_ELLIPSES + LIST_ICONS +
[r'(?<=[0-9])[+\-\*^](?=[0-9-])',
r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),

View File

@@ -212,18 +212,17 @@ class Language(object):
"""
tok2vec = self.pipeline[0]
feats = tok2vec.doc2feats(docs)
procs = list(self.pipeline[1:])
random.shuffle(procs)
grads = {}
def get_grads(W, dW, key=None):
grads[key] = (W, dW)
for proc in procs:
for proc in self.pipeline[1:]:
if not hasattr(proc, 'update'):
continue
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
d_tokvecses = proc.update((docs, tokvecses), golds,
drop=drop, sgd=get_grads, losses=losses)
bp_tokvecses(d_tokvecses, sgd=sgd)
if d_tokvecses is not None:
bp_tokvecses(d_tokvecses, sgd=sgd)
for key, (W, dW) in grads.items():
sgd(W, dW, key=key)
# Clear the tensor variable, to free GPU memory.

View File

@@ -432,6 +432,8 @@ cdef class Parser:
0.0)
todo = [(s, g) for (s, g) in zip(states, golds)
if not s.is_final() and g is not None]
if not todo:
return None
backprops = []
d_tokvecs = state2vec.ops.allocate(tokvecs.shape)

View File

@@ -1,7 +1,4 @@
# coding: utf-8
"""Test that tokenizer exceptions and emoticons are handled correctly."""
from __future__ import unicode_literals
import pytest
@@ -39,3 +36,12 @@ def test_tokenizer_handles_emoticons(tokenizer):
def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
tokens = tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
('i💙you', 3), ('🤘🤘yay!', 4)])
def test_tokenizer_handles_emoji(tokenizer, text, length):
exceptions = ["hu"]
tokens = tokenizer(text)
if tokens[0].lang_ not in exceptions:
assert len(tokens) == length
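A standalone sketch, not part of this diff, of the behaviour the new test asserts, checked directly against the English tokenizer (the exceptions list above guards languages such as Hungarian, whose rules differ):

from spacy.lang.en import English

nlp = English()  # tokenizer only, no model needed
tokens = nlp(u'i💙you')
assert [t.text for t in tokens] == [u'i', u'💙', u'you']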

View File

@@ -19,19 +19,17 @@ p
p
| When you load a model, spaCy first consults the model's
| #[+a("/docs/usage/saving-loading#models-generating") meta.json] for its
| #[code setup] details. This typically includes the ID of a language class,
| #[+a("/docs/usage/saving-loading#models-generating") meta.json]. The
| meta typically includes the model details, the ID of a language class,
| and an optional list of pipeline components. spaCy then does the
| following:
+aside-code("meta.json (excerpt)", "json").
{
"name": "example_model",
"lang": "en"
"description": "Example model for spaCy",
"setup": {
"lang": "en",
"pipeline": ["token_vectors", "tagger"]
}
"pipeline": ["token_vectors", "tagger"]
}
+list("numbers")
@@ -287,17 +285,15 @@ p
p
| In the model package's meta.json, specify the language class and pipeline
| IDs in #[code setup]:
| IDs:
+code("meta.json (excerpt)", "json").
{
"name": "my_sentiment_model",
"name": "sentiment_model",
"lang": "en",
"version": "1.0.0",
"spacy_version": "&gt;=2.0.0,&lt;3.0.0",
"setup": {
"lang": "en",
"pipeline": ["vectorizer", "sentiment"]
}
"pipeline": ["vectorizer", "sentiment"]
}
p
@@ -307,7 +303,7 @@ p
| by your custom #[code "sentiment"] factory.
+code.
nlp = spacy.load('my_sentiment_model')
nlp = spacy.load('en_sentiment_model')
doc = nlp(u'I love pizza')
assert doc.sentiment

View File

@@ -129,13 +129,14 @@ p
+code.
import spacy
from spacy.tokens.doc import Doc
from spacy.vocab import Vocab
nlp = spacy.load('en')
moby_dick = open('moby_dick.txt', 'r').read()
doc = nlp(moby_dick)
doc.to_disk('/moby_dick.bin')
new_doc = Doc().from_disk('/moby_dick.bin')
new_doc = Doc(Vocab()).from_disk('/moby_dick.bin')
+infobox
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
@@ -148,9 +149,14 @@ p
nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)
# match "Google I/O" or "Google i/o"
pattern = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
matcher.add('GoogleIO', None, pattern)
def set_sentiment(matcher, doc, i, matches):
doc.sentiment += 0.1
pattern1 = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']]
matcher.add('GoogleIO', None, pattern1) # match "Google I/O" or "Google i/o"
matcher.add('HAPPY', set_sentiment, pattern2) # match one or more happy emoji
doc = nlp(LOTS_OF_TEXT)
matches = matcher(doc)
+infobox

View File

@@ -11,7 +11,7 @@ p
| You can also associate patterns with entity IDs, to allow some basic
| entity linking or disambiguation.
+aside("What about \"real\" regular expressions?")
//-+aside("What about \"real\" regular expressions?")
+h(2, "adding-patterns") Adding patterns
@@ -119,7 +119,7 @@ p
+code.
# Add a new custom flag to the vocab, which is always False by default.
# BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span.
BAD_HTML_FLAG = doc.vocab.add_flag(lambda text: False)
BAD_HTML_FLAG = nlp.vocab.add_flag(lambda text: False)
def merge_and_flag(matcher, doc, i, matches):
match_id, start, end = matches[i]
@@ -221,7 +221,7 @@ p
+cell match 0 or 1 times
+cell optional, max one
+h(3, "quantifiers-example1") Quantifiers example: Using linguistic annotations
+h(2, "example1") Example: Using linguistic annotations
p
| Let's say you're analysing user comments and you want to find out what
@@ -283,7 +283,7 @@ p
# set manual=True to make displaCy render straight from a dictionary
displacy.serve(matched_sents, style='ent', manual=True)
+h(3, "quantifiers-example2") Quantifiers example: Phone numbers
+h(2, "example2") Example: Phone numbers
p
| Phone numbers can have many different formats and matching them is often
@@ -320,3 +320,114 @@ p
| It'll produce more predictable results, is much easier to modify and
| extend, and doesn't require any training data, only a set of
| test cases.
+h(2, "example3") Example: Hashtags and emoji on social media
p
| Social media posts, especially tweets, can be difficult to work with.
| They're very short and often contain various emoji and hashtags. By only
| looking at the plain text, you'll lose a lot of valuable semantic
| information.
p
| Let's say you've extracted a large sample of social media posts on a
| specific topic, for example posts mentioning a brand name or product.
| As the first step of your data exploration, you want to filter out posts
| containing certain emoji and use them to assign a general sentiment
| score, based on whether the expressed emotion is positive or negative,
| e.g. #[span.o-icon.o-icon--inline 😀] or #[span.o-icon.o-icon--inline 😞].
| You also want to find, merge and label hashtags like
| #[code #MondayMotivation], to be able to ignore or analyse them later.
+aside("Note on sentiment analysis")
| Ultimately, sentiment analysis is not always #[em that] easy. In
| addition to the emoji, you'll also want to take specific words into
| account and check the #[code subtree] for intensifiers like "very", to
| increase the sentiment score. At some point, you might also want to train
| a sentiment model. However, the approach described in this example is
| very useful for #[strong bootstrapping rules to gather training data].
| It's also an incredibly fast way to gather first insights into your data:
| with about 1 million tweets, you'd be looking at a processing time of
| #[strong under 1 minute].
p
| By default, spaCy's tokenizer will split emoji into separate tokens. This
| means that you can create a pattern for one or more emoji tokens. In this
| case, a sequence of identical emoji should be treated as one instance.
| Valid hashtags usually consist of a #[code #], plus a sequence of
| ASCII characters with no whitespace, making them easy to match as well.
+code.
from spacy.lang.en import English
from spacy.matcher import Matcher
nlp = English() # we only want the tokenizer, so no need to load a model
matcher = Matcher(nlp.vocab)
pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍'] # positive emoji
neg_emoji = [u'😞', u'😠', u'😩', u'😢', u'😭', u'😒'] # negative emoji
# add patterns to match one or more emoji tokens
pos_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in pos_emoji]
neg_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in neg_emoji]
matcher.add('HAPPY', label_sentiment, *pos_patterns) # add positive pattern
matcher.add('SAD', label_sentiment, *neg_patterns) # add negative pattern
# add pattern to merge valid hashtag, i.e. '#' plus any ASCII token
matcher.add('HASHTAG', merge_hashtag, [{'ORTH': '#'}, {'IS_ASCII': True}])
p
| Because the #[code on_match] callback receives the ID of each match, you
| can use the same function to handle the sentiment assignment for both
| the positive and negative pattern. To keep it simple, we'll either add
| or subtract #[code 0.1] points; this way, the score will also reflect
| combinations of emoji, even positive #[em and] negative ones.
p
| With a library like
| #[+a("https://github.com/bcongdon/python-emojipedia") Emojipedia],
| we can also retrieve a short description for each emoji; for example,
| #[span.o-icon.o-icon--inline 😍]'s official title is "Smiling Face With
| Heart-Eyes". Assigning it to the merged token's norm will make it
| available as #[code token.norm_].
+code.
from emojipedia import Emojipedia # installation: pip install emojipedia
def label_sentiment(matcher, doc, i, matches):
match_id, start, end = matches[i]
if match_id == 'HAPPY':
doc.sentiment += 0.1 # add 0.1 for positive sentiment
elif match_id == 'SAD':
doc.sentiment -= 0.1 # subtract 0.1 for negative sentiment
span = doc[start : end]
emoji = Emojipedia.search(span[0].text) # get data for emoji
span.merge(norm=emoji.title) # merge span and set NORM to emoji title
p
| To label the hashtags, we first need to add a new custom flag.
| #[code IS_HASHTAG] will be the flag's ID, which you can use to assign it
| to the hashtag's span, and check its value via a token's
| #[+api("token#check_flag") #[code code check_flag()]] method. On each
| match, we merge the hashtag and assign the flag.
+code.
# Add a new custom flag to the vocab, which is always False by default
IS_HASHTAG = nlp.vocab.add_flag(lambda text: False)
def merge_hashtag(matcher, doc, i, matches):
match_id, start, end = matches[i]
span = doc[start : end]
span.merge() # merge hashtag
span.set_flag(IS_HASHTAG, True) # set IS_HASHTAG to True
p
| To process a stream of social media posts, we can use
| #[+api("language#pipe") #[code Language.pipe()]], which will return a
| stream of #[code Doc] objects that we can pass to
| #[+api("matcher#pipe") #[code Matcher.pipe()]].
+code.
docs = nlp.pipe(LOTS_OF_TWEETS)
matches = matcher.pipe(docs)

View File

@@ -74,16 +74,14 @@ p
+aside-code("meta.json", "json").
{
"name": "example_model",
"lang": "en",
"version": "1.0.0",
"spacy_version": "&gt;=2.0.0,&lt;3.0.0",
"description": "Example model for spaCy",
"author": "You",
"email": "you@example.com",
"license": "CC BY-SA 3.0",
"setup": {
"lang": "en",
"pipeline": ["token_vectors", "tagger"]
}
"pipeline": ["token_vectors", "tagger"]
}
+code(false, "bash").
@@ -110,9 +108,9 @@ p
+h(3, "models-custom") Customising the model setup
p
| The meta.json includes a #[code setup] key that lets you customise how
| the model should be initialised and loaded. You can define the language
| data to be loaded and the
| The meta.json includes the model details, like name, requirements and
| license, and lets you customise how the model should be initialised and
| loaded. You can define the language data to be loaded and the
| #[+a("/docs/usage/language-processing-pipeline") processing pipeline] to
| execute.
@@ -183,9 +181,9 @@ p
p
| To load a model from a data directory, you can use
| #[+api("spacy#load") #[code spacy.load()]] with the local path. This will
| look for a meta.json in the directory and use the #[code setup] details
| to initialise a #[code Language] class with a processing pipeline and
| load in the model data.
| look for a meta.json in the directory and use the #[code lang] and
| #[code pipeline] settings to initialise a #[code Language] class with a
| processing pipeline and load in the model data.
+code.
nlp = spacy.load('/path/to/model')