mirror of https://github.com/explosion/spaCy.git
synced 2025-02-03 13:14:11 +03:00

Merge branch 'develop' into feature/fix-matcher-operators
commit a928ae2f35

52  examples/pipeline/custom_attr_methods.py  Normal file

@@ -0,0 +1,52 @@
# coding: utf-8
"""This example contains several snippets of methods that can be set via custom
Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like
they're "bound" to the object and are partially applied – i.e. the object
they're called on is passed in as the first argument."""
from __future__ import unicode_literals

from spacy.lang.en import English
from spacy.tokens import Doc, Span
from spacy import displacy
from pathlib import Path


def to_html(doc, output='/tmp', style='dep'):
    """Doc method extension for saving the current state as a displaCy
    visualization.
    """
    # generate filename from first six non-punct tokens
    file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html'
    output_path = Path(output) / file_name
    html = displacy.render(doc, style=style, page=True)  # render markup
    output_path.open('w', encoding='utf-8').write(html)  # save to file
    print('Saved HTML to {}'.format(output_path))


Doc.set_extension('to_html', method=to_html)

nlp = English()
doc = nlp(u"This is a sentence about Apple.")
# add entity manually for demo purposes, to make it work without a model
doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])]
doc._.to_html(style='ent')


def overlap_tokens(doc, other_doc):
    """Get the tokens from the original Doc that are also in the comparison Doc.
    """
    overlap = []
    other_tokens = [token.text for token in other_doc]
    for token in doc:
        if token.text in other_tokens:
            overlap.append(token)
    return overlap


Doc.set_extension('overlap', method=overlap_tokens)

nlp = English()
doc1 = nlp(u"Peach emoji is where it has always been.")
doc2 = nlp(u"Peach is the superior emoji.")
tokens = doc1._.overlap(doc2)
print(tokens)
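
For contrast with the method extensions registered above (this note is not part of the committed file), the same set_extension API also accepts a getter, which is evaluated lazily on attribute access instead of being called explicitly. A minimal sketch using only the spaCy v2.0 API shown in this commit:

from spacy.lang.en import English
from spacy.tokens import Doc

# getter-based extension: computed on access, no call needed
Doc.set_extension('token_count', getter=lambda doc: len(doc))

nlp = English()
doc = nlp(u"A short example sentence.")
print(doc._.token_count)  # 5 tokens, computed lazily on access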

108  examples/pipeline/custom_component_countries_api.py  Normal file

@@ -0,0 +1,108 @@
# coding: utf-8
from __future__ import unicode_literals

import requests

from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span, Token


class RESTCountriesComponent(object):
    """Example of a spaCy v2.0 pipeline component that requests all countries
    via the REST Countries API, merges country names into one token, assigns
    entity labels and sets attributes on country tokens, e.g. the capital and
    lat/lng coordinates. Can be extended with more details from the API.

    REST Countries API: https://restcountries.eu
    API License: Mozilla Public License MPL 2.0
    """
    name = 'rest_countries'  # component name, will show up in the pipeline

    def __init__(self, nlp, label='GPE'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        # Make request once on initialisation and store the data
        r = requests.get('https://restcountries.eu/rest/v2/all')
        r.raise_for_status()  # make sure requests raises an error if it fails
        countries = r.json()

        # Convert API response to dict keyed by country name for easy lookup
        # This could also be extended using the alternative and foreign language
        # names provided by the API
        self.countries = {c['name']: c for c in countries}
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher with Doc patterns for each country name
        patterns = [nlp(c) for c in self.countries.keys()]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('COUNTRIES', None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        # If no default value is set, it defaults to None.
        Token.set_extension('is_country', default=False)
        Token.set_extension('country_capital')
        Token.set_extension('country_latlng')
        Token.set_extension('country_flag')

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_country == True.
        Doc.set_extension('has_country', getter=self.has_country)
        Span.set_extension('has_country', getter=self.has_country)


    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        matches = self.matcher(doc)
        spans = []  # keep the spans for later so we can merge them afterwards
        for _, start, end in matches:
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Set custom attribute on each token of the entity
            # Can be extended with other data returned by the API, like
            # currencies, country code, flag, calling code etc.
            for token in entity:
                token._.set('is_country', True)
                token._.set('country_capital', self.countries[entity.text]['capital'])
                token._.set('country_latlng', self.countries[entity.text]['latlng'])
                token._.set('country_flag', self.countries[entity.text]['flag'])
            # Overwrite doc.ents and add entity – be careful not to replace!
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities – otherwise, it would cause mismatched
            # indices!
            span.merge()
        return doc  # don't forget to return the Doc!

    def has_country(self, tokens):
        """Getter for Doc and Span attributes. Returns True if one of the tokens
        is a country. Since the getter is only called when we access the
        attribute, we can refer to the Token's 'is_country' attribute here,
        which is already set in the processing step."""
        return any([t._.get('is_country') for t in tokens])


# For simplicity, we start off with only the blank English Language class and
# no model or pre-defined pipeline loaded.

nlp = English()
rest_countries = RESTCountriesComponent(nlp)  # initialise component
nlp.add_pipe(rest_countries)  # add it to the pipeline

doc = nlp(u"Some text about Colombia and the Czech Republic")

print('Pipeline', nlp.pipe_names)  # pipeline contains component name
print('Doc has countries', doc._.has_country)  # Doc contains countries
for token in doc:
    if token._.is_country:
        print(token.text, token._.country_capital, token._.country_latlng,
              token._.country_flag)  # country data
print('Entities', [(e.text, e.label_) for e in doc.ents])  # all countries are entities
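
Because has_country is registered on both Doc and Span through the same getter, the check also works on arbitrary slices of the document. A short, illustrative continuation of the example above:

span = doc[2:4]  # slice covering "about Colombia"
print('Span has country', span._.has_country)  # True, a country token is inside the span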

85  examples/pipeline/custom_component_entities.py  Normal file

@@ -0,0 +1,85 @@
# coding: utf-8
from __future__ import unicode_literals

from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span, Token


class TechCompanyRecognizer(object):
    """Example of a spaCy v2.0 pipeline component that sets entity annotations
    based on list of single or multiple-word company names. Companies are
    labelled as ORG and their spans are merged into one token. Additionally,
    ._.has_tech_org and ._.is_tech_org is set on the Doc/Span and Token
    respectively."""
    name = 'tech_companies'  # component name, will show up in the pipeline

    def __init__(self, nlp, companies=tuple(), label='ORG'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher – it can now take Doc objects as patterns,
        # so even if the list of companies is long, it's very efficient
        patterns = [nlp(org) for org in companies]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('TECH_ORGS', None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        Token.set_extension('is_tech_org', default=False)

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_tech_org == True.
        Doc.set_extension('has_tech_org', getter=self.has_tech_org)
        Span.set_extension('has_tech_org', getter=self.has_tech_org)

    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        matches = self.matcher(doc)
        spans = []  # keep the spans for later so we can merge them afterwards
        for _, start, end in matches:
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Set custom attribute on each token of the entity
            for token in entity:
                token._.set('is_tech_org', True)
            # Overwrite doc.ents and add entity – be careful not to replace!
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities – otherwise, it would cause mismatched
            # indices!
            span.merge()
        return doc  # don't forget to return the Doc!

    def has_tech_org(self, tokens):
        """Getter for Doc and Span attributes. Returns True if one of the tokens
        is a tech org. Since the getter is only called when we access the
        attribute, we can refer to the Token's 'is_tech_org' attribute here,
        which is already set in the processing step."""
        return any([t._.get('is_tech_org') for t in tokens])


# For simplicity, we start off with only the blank English Language class and
# no model or pre-defined pipeline loaded.

nlp = English()
companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple']  # etc.
component = TechCompanyRecognizer(nlp, companies)  # initialise component
nlp.add_pipe(component, last=True)  # add it to the pipeline as the last element

doc = nlp(u"Alphabet Inc. is the company behind Google.")

print('Pipeline', nlp.pipe_names)  # pipeline contains component name
print('Tokens', [t.text for t in doc])  # company names from the list are merged
print('Doc has_tech_org', doc._.has_tech_org)  # Doc contains tech orgs
print('Token 0 is_tech_org', doc[0]._.is_tech_org)  # "Alphabet Inc." is a tech org
print('Token 1 is_tech_org', doc[1]._.is_tech_org)  # "is" is not
print('Entities', [(e.text, e.label_) for e in doc.ents])  # all orgs are entities
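
The call above adds the component with last=True; the reworked pipeline API later in this diff also supports before, after and first. A hedged sketch, assuming a hypothetical pre-trained model whose pipeline already contains an 'ner' component:

# import spacy
# nlp_model = spacy.load('en_core_web_sm')  # hypothetical model package
# component = TechCompanyRecognizer(nlp_model, companies)
# nlp_model.add_pipe(component, before='ner')  # run ahead of the statistical NER
# print(nlp_model.pipe_names)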

@@ -6,7 +6,7 @@ To achieve that, it duplicates some of spaCy's internal functionality.

Specifically, in this example, we don't use spaCy's built-in Language class to
wire together the Vocab, Tokenizer and EntityRecognizer. Instead, we write
our own simle Pipeline class, so that it's easier to see how the pieces
our own simple Pipeline class, so that it's easier to see how the pieces
interact.

Input data:

@@ -142,16 +142,15 @@ def train(nlp, train_examples, dev_examples, nr_epoch=5):
            inputs, annots = zip(*batch)
            nlp.update(list(inputs), list(annots), sgd, losses=losses)
        scores = nlp.evaluate(dev_examples)
        report_scores(i, losses['ner'], scores)
        scores = nlp.evaluate(dev_examples)
        report_scores(channels, i+1, loss, scores)
        report_scores(i+1, losses['ner'], scores)


def report_scores(i, loss, scores):
    precision = '%.2f' % scores['ents_p']
    recall = '%.2f' % scores['ents_r']
    f_measure = '%.2f' % scores['ents_f']
    print('%d %s %s %s' % (int(loss), precision, recall, f_measure))
    print('Epoch %d: %d %s %s %s' % (
        i, int(loss), precision, recall, f_measure))


def read_examples(path):

@@ -7,7 +7,7 @@ if __name__ == '__main__':
    import plac
    import sys
    from spacy.cli import download, link, info, package, train, convert, model
    from spacy.cli import profile, evaluate
    from spacy.cli import profile, evaluate, validate
    from spacy.util import prints

    commands = {

@@ -20,6 +20,7 @@ if __name__ == '__main__':
        'package': package,
        'model': model,
        'profile': profile,
        'validate': validate
    }
    if len(sys.argv) == 1:
        prints(', '.join(commands), title="Available commands", exits=1)

@@ -311,7 +311,7 @@ def link_vectors_to_models(vocab):

def Tok2Vec(width, embed_size, **kwargs):
    pretrained_dims = kwargs.get('pretrained_dims', 0)
    cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3)
    cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
                                 '*': reapply}):

@@ -7,3 +7,4 @@ from .train import train
from .evaluate import evaluate
from .convert import convert
from .model import model
from .validate import validate

@@ -4,7 +4,7 @@ from __future__ import unicode_literals
import plac
from pathlib import Path

from .converters import conllu2json, iob2json
from .converters import conllu2json, iob2json, conll_ner2json
from ..util import prints

# Converters are matched by file extension. To add a converter, add a new entry

@@ -12,9 +12,10 @@ from ..util import prints
# from /converters.

CONVERTERS = {
    '.conllu': conllu2json,
    '.conll': conllu2json,
    '.iob': iob2json,
    'conllu': conllu2json,
    'conll': conllu2json,
    'ner': conll_ner2json,
    'iob': iob2json,
}


@@ -22,9 +23,11 @@ CONVERTERS = {
    input_file=("input file", "positional", None, str),
    output_dir=("output directory for converted file", "positional", None, str),
    n_sents=("Number of sentences per doc", "option", "n", int),
    converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
    morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
            converter='auto'):
    """
    Convert files into JSON format for use with train command and other
    experiment management functions.

@@ -35,9 +38,11 @@ def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
        prints(input_path, title="Input file not found", exits=1)
    if not output_path.exists():
        prints(output_path, title="Output directory not found", exits=1)
    file_ext = input_path.suffix
    if not file_ext in CONVERTERS:
        prints("Can't find converter for %s" % input_path.parts[-1],
               title="Unknown format", exits=1)
    CONVERTERS[file_ext](input_path, output_path,
                         n_sents=n_sents, use_morphology=morphology)
    if converter == 'auto':
        converter = input_path.suffix[1:]
    if not converter in CONVERTERS:
        prints("Can't find converter for %s" % converter,
               title="Unknown format", exits=1)
    func = CONVERTERS[converter]
    func(input_path, output_path,
         n_sents=n_sents, use_morphology=morphology)
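
For reference, the updated signature can also be driven from Python rather than the command line; a minimal sketch with hypothetical paths (the leading cmd argument is supplied by plac when run as a CLI and is unused here):

# from spacy.cli import convert
# convert(None, 'train.iob', 'output_dir', converter='iob')   # pick the converter explicitly
# convert(None, 'train.conllu', 'output_dir')                 # converter='auto' falls back to the file suffix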

@@ -1,2 +1,3 @@
from .conllu2json import conllu2json
from .iob2json import iob2json
from .conll_ner2json import conll_ner2json

50  spacy/cli/converters/conll_ner2json.py  Normal file

@@ -0,0 +1,50 @@
# coding: utf8
from __future__ import unicode_literals

from ...compat import json_dumps, path2str
from ...util import prints
from ...gold import iob_to_biluo


def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False):
    """
    Convert files in the CoNLL-2003 NER format into JSON format for use with train cli.
    """
    docs = read_conll_ner(input_path)

    output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
    output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
    output_file = output_path / output_filename
    with output_file.open('w', encoding='utf-8') as f:
        f.write(json_dumps(docs))
    prints("Created %d documents" % len(docs),
           title="Generated output file %s" % path2str(output_file))


def read_conll_ner(input_path):
    text = input_path.open('r', encoding='utf-8').read()
    i = 0
    delimit_docs = '-DOCSTART- -X- O O'
    output_docs = []
    for doc in text.strip().split(delimit_docs):
        doc = doc.strip()
        if not doc:
            continue
        output_doc = []
        for sent in doc.split('\n\n'):
            sent = sent.strip()
            if not sent:
                continue
            lines = [line.strip() for line in sent.split('\n') if line.strip()]
            words, tags, chunks, iob_ents = zip(*[line.split() for line in lines])
            biluo_ents = iob_to_biluo(iob_ents)
            output_doc.append({'tokens': [
                {'orth': w, 'tag': tag, 'ner': ent} for (w, tag, ent) in
                zip(words, tags, biluo_ents)
            ]})
        output_docs.append({
            'id': len(output_docs),
            'paragraphs': [{'sentences': output_doc}]
        })
        output_doc = []
    return output_docs
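
For orientation, read_conll_ner above expects whitespace-separated four-column lines (word, POS tag, chunk tag, IOB entity tag), blank lines between sentences and a '-DOCSTART- -X- O O' line between documents. An illustrative fragment in that CoNLL-2003 style (not shipped with the commit):

-DOCSTART- -X- O O

U.N. NNP B-NP B-ORG
official NN I-NP O
Ekeus NNP I-NP B-PER
heads VBZ B-VP O
for IN B-PP O
Baghdad NNP B-NP B-LOC
. . O O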

@@ -44,7 +44,7 @@ numpy.random.seed(0)
    version=("Model version", "option", "V", str),
    meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path)
)
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
          use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False,
          gold_preproc=False, version="0.0.0", meta_path=None):
    """

@@ -68,6 +68,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
    if not isinstance(meta, dict):
        prints("Expected dict but got: {}".format(type(meta)),
               title="Not a valid meta.json format", exits=1)
    meta.setdefault('lang', lang)
    meta.setdefault('name', 'unnamed')

    pipeline = ['tagger', 'parser', 'ner']
    if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger')

@@ -88,9 +90,13 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
    n_train_words = corpus.count_train()

    lang_class = util.get_lang_class(lang)
    nlp = lang_class(pipeline=pipeline)
    nlp = lang_class()
    meta['pipeline'] = pipeline
    nlp.meta.update(meta)
    if vectors:
        util.load_model(vectors, vocab=nlp.vocab)
    for name in pipeline:
        nlp.add_pipe(nlp.create_pipe(name), name=name)
    optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
    nlp._optimizer = None


@@ -112,17 +118,33 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
                util.set_env_log(False)
                epoch_model_path = output_path / ('model%d' % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = lang_class(pipeline=pipeline)
                nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
                scorer = nlp_loaded.evaluate(
                    list(corpus.dev_docs(
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                dev_docs = list(corpus.dev_docs(
                        nlp_loaded,
                        gold_preproc=gold_preproc)))
                        gold_preproc=gold_preproc))
                nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                start_time = timer()
                scorer = nlp_loaded.evaluate(dev_docs)
                end_time = timer()
                if use_gpu < 0:
                    gpu_wps = None
                    cpu_wps = nwords/(end_time-start_time)
                else:
                    gpu_wps = nwords/(end_time-start_time)
                    with Model.use_device('cpu'):
                        nlp_loaded = util.load_model_from_path(epoch_model_path)
                        dev_docs = list(corpus.dev_docs(
                            nlp_loaded, gold_preproc=gold_preproc))
                        start_time = timer()
                        scorer = nlp_loaded.evaluate(dev_docs)
                        end_time = timer()
                        cpu_wps = nwords/(end_time-start_time)
                acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
                with acc_loc.open('w') as file_:
                    file_.write(json_dumps(scorer.scores))
                meta_loc = output_path / ('model%d' % i) / 'meta.json'
                meta['accuracy'] = scorer.scores
                meta['speed'] = {'nwords': nwords, 'cpu':cpu_wps, 'gpu': gpu_wps}
                meta['lang'] = nlp.lang
                meta['pipeline'] = pipeline
                meta['spacy_version'] = '>=%s' % about.__version__

@@ -132,7 +154,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
                with meta_loc.open('w') as file_:
                    file_.write(json_dumps(meta))
                util.set_env_log(True)
            print_progress(i, losses, scorer.scores)
            print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps)
    finally:
        print("Saving model...")
        try:

@@ -153,16 +175,17 @@ def _render_parses(i, to_render):
        file_.write(html)


def print_progress(itn, losses, dev_scores, wps=0.0):
def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
    scores = {}
    for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
                'ents_p', 'ents_r', 'ents_f', 'wps']:
                'ents_p', 'ents_r', 'ents_f', 'cpu_wps', 'gpu_wps']:
        scores[col] = 0.0
    scores['dep_loss'] = losses.get('parser', 0.0)
    scores['ner_loss'] = losses.get('ner', 0.0)
    scores['tag_loss'] = losses.get('tagger', 0.0)
    scores.update(dev_scores)
    scores['wps'] = wps
    scores['cpu_wps'] = cpu_wps
    scores['gpu_wps'] = gpu_wps or 0.0
    tpl = '\t'.join((
        '{:d}',
        '{dep_loss:.3f}',

@@ -173,7 +196,9 @@ def print_progress(itn, losses, dev_scores, wps=0.0):
        '{ents_f:.3f}',
        '{tags_acc:.3f}',
        '{token_acc:.3f}',
        '{wps:.1f}'))
        '{cpu_wps:.1f}',
        '{gpu_wps:.1f}',
    ))
    print(tpl.format(itn, **scores))

123  spacy/cli/validate.py  Normal file

@@ -0,0 +1,123 @@
# coding: utf8
from __future__ import unicode_literals

import requests
import pkg_resources
from pathlib import Path

from ..compat import path2str, locale_escape
from ..util import prints, get_data_path, read_json
from .. import about


def validate(cmd):
    """Validate that the currently installed version of spaCy is compatible
    with the installed models. Should be run after `pip install -U spacy`.
    """
    r = requests.get(about.__compatibility__)
    if r.status_code != 200:
        prints("Couldn't fetch compatibility table.",
               title="Server error (%d)" % r.status_code, exits=1)
    compat = r.json()['spacy']
    all_models = set()
    for spacy_v, models in dict(compat).items():
        all_models.update(models.keys())
        for model, model_vs in models.items():
            compat[spacy_v][model] = [reformat_version(v) for v in model_vs]

    current_compat = compat[about.__version__]
    model_links = get_model_links(current_compat)
    model_pkgs = get_model_pkgs(current_compat, all_models)
    incompat_links = {l for l, d in model_links.items() if not d['compat']}
    incompat_models = {d['name'] for _, d in model_pkgs.items() if not d['compat']}
    incompat_models.update([d['name'] for _, d in model_links.items() if not d['compat']])
    na_models = [m for m in incompat_models if m not in current_compat]
    update_models = [m for m in incompat_models if m in current_compat]

    prints(path2str(Path(__file__).parent.parent),
           title="Installed models (spaCy v{})".format(about.__version__))
    if model_links or model_pkgs:
        print(get_row('TYPE', 'NAME', 'MODEL', 'VERSION', ''))
        for name, data in model_pkgs.items():
            print(get_model_row(current_compat, name, data, 'package'))
        for name, data in model_links.items():
            print(get_model_row(current_compat, name, data, 'link'))
    else:
        prints("No models found in your current environment.", exits=0)

    if update_models:
        cmd = ' python -m spacy download {}'
        print("\n Use the following commands to update the model packages:")
        print('\n'.join([cmd.format(pkg) for pkg in update_models]))

    if na_models:
        prints("The following models are not available for spaCy v{}: {}"
               .format(about.__version__, ', '.join(na_models)))

    if incompat_links:
        prints("You may also want to overwrite the incompatible links using "
               "the `spacy link` command with `--force`, or remove them from "
               "the data directory. Data path: {}"
               .format(path2str(get_data_path())))


def get_model_links(compat):
    links = {}
    data_path = get_data_path()
    if data_path:
        models = [p for p in data_path.iterdir() if is_model_path(p)]
        for model in models:
            meta_path = Path(model) / 'meta.json'
            if not meta_path.exists():
                continue
            meta = read_json(meta_path)
            link = model.parts[-1]
            name = meta['lang'] + '_' + meta['name']
            links[link] = {'name': name, 'version': meta['version'],
                           'compat': is_compat(compat, name, meta['version'])}
    return links


def get_model_pkgs(compat, all_models):
    pkgs = {}
    for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
        package = pkg_name.replace('-', '_')
        if package in all_models:
            version = pkg_data.version
            pkgs[pkg_name] = {'name': package, 'version': version,
                              'compat': is_compat(compat, package, version)}
    return pkgs


def get_model_row(compat, name, data, type='package'):
    tpl_row = ' {:<10}' + (' {:<20}' * 4)
    tpl_red = '\x1b[38;5;1m{}\x1b[0m'
    tpl_green = '\x1b[38;5;2m{}\x1b[0m'
    if data['compat']:
        comp = tpl_green.format(locale_escape('✔', errors='ignore'))
        version = tpl_green.format(data['version'])
    else:
        comp = '--> {}'.format(compat.get(data['name'], ['n/a'])[0])
        version = tpl_red.format(data['version'])
    return get_row(type, name, data['name'], version, comp)


def get_row(*args):
    tpl_row = ' {:<10}' + (' {:<20}' * 4)
    return tpl_row.format(*args)


def is_model_path(model_path):
    exclude = ['cache', 'pycache', '__pycache__']
    name = model_path.parts[-1]
    return model_path.is_dir() and name not in exclude and not name.startswith('.')


def is_compat(compat, name, version):
    return name in compat and version in compat[name]


def reformat_version(version):
    if version.endswith('-alpha'):
        return version.replace('-alpha', 'a0')
    return version.replace('-alpha', 'a')
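
To make the small helpers at the bottom concrete, a doctest-style sketch of what they compute (the compatibility dict here is illustrative, not taken from the live table):

# >>> reformat_version('2.0.0-alpha')
# '2.0.0a0'
# >>> reformat_version('1.2.0')
# '1.2.0'
# >>> is_compat({'en_core_web_sm': ['2.0.0a0']}, 'en_core_web_sm', '2.0.0a0')
# True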

@@ -6,6 +6,7 @@ import ftfy
import sys
import ujson
import itertools
import locale

from thinc.neural.util import copy_array


@@ -113,3 +114,12 @@ def import_file(name, loc):
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def locale_escape(string, errors='replace'):
    '''
    Mangle non-supported characters, for savages with ascii terminals.
    '''
    encoding = locale.getpreferredencoding()
    string = string.encode(encoding, errors).decode('utf8')
    return string

@@ -213,7 +213,7 @@ class GoldCorpus(object):
        train_tuples = self.train_tuples
        if projectivize:
            train_tuples = nonproj.preprocess_training_data(
                self.train_tuples)
                self.train_tuples, label_freq_cutoff=100)
        random.shuffle(train_tuples)
        gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
                                        max_length=max_length,

@@ -16,15 +16,13 @@ from ...util import update_exc
class BengaliDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'bn'

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    tag_map = TAG_MAP
    stop_words = STOP_WORDS
    lemma_rules = LEMMA_RULES

    prefixes = tuple(TOKENIZER_PREFIXES)
    suffixes = tuple(TOKENIZER_SUFFIXES)
    infixes = tuple(TOKENIZER_INFIXES)
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES


class Bengali(Language):

@@ -15,9 +15,8 @@ class DanishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'da'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = set(STOP_WORDS)
    stop_words = STOP_WORDS


class Danish(Language):

@@ -12,7 +12,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups


@@ -22,16 +21,12 @@ class GermanDefaults(Language.Defaults):
    lex_attr_getters[LANG] = lambda text: 'de'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
                                         NORM_EXCEPTIONS, BASE_NORMS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    infixes = tuple(TOKENIZER_INFIXES)
    tag_map = dict(TAG_MAP)
    stop_words = set(STOP_WORDS)
    syntax_iterators = dict(SYNTAX_ITERATORS)

    @classmethod
    def create_lemmatizer(cls, nlp=None):
        return Lemmatizer(LOOKUP)
    infixes = TOKENIZER_INFIXES
    tag_map = TAG_MAP
    stop_words = STOP_WORDS
    syntax_iterators = SYNTAX_ITERATORS
    lemma_lookup = LOOKUP


class German(Language):

@@ -7,7 +7,7 @@ from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .morph_rules import MORPH_RULES
from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC, LOOKUP
from .syntax_iterators import SYNTAX_ITERATORS

from ..tokenizer_exceptions import BASE_EXCEPTIONS

@@ -23,15 +23,15 @@ class EnglishDefaults(Language.Defaults):
    lex_attr_getters[LANG] = lambda text: 'en'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
                                         BASE_NORMS, NORM_EXCEPTIONS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    tag_map = dict(TAG_MAP)
    stop_words = set(STOP_WORDS)
    morph_rules = dict(MORPH_RULES)
    lemma_rules = dict(LEMMA_RULES)
    lemma_index = dict(LEMMA_INDEX)
    lemma_exc = dict(LEMMA_EXC)
    syntax_iterators = dict(SYNTAX_ITERATORS)
    tag_map = TAG_MAP
    stop_words = STOP_WORDS
    morph_rules = MORPH_RULES
    lemma_rules = LEMMA_RULES
    lemma_index = LEMMA_INDEX
    lemma_exc = LEMMA_EXC
    lemma_lookup = LOOKUP
    syntax_iterators = SYNTAX_ITERATORS


class English(Language):

@@ -10,7 +10,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups


@@ -19,15 +18,11 @@ class SpanishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'es'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    tag_map = dict(TAG_MAP)
    stop_words = set(STOP_WORDS)
    sytax_iterators = dict(SYNTAX_ITERATORS)

    @classmethod
    def create_lemmatizer(cls, nlp=None):
        return Lemmatizer(LOOKUP)
    tag_map = TAG_MAP
    stop_words = STOP_WORDS
    sytax_iterators = SYNTAX_ITERATORS
    lemma_lookup = LOOKUP


class Spanish(Language):

@@ -15,9 +15,8 @@ class FinnishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'fi'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = set(STOP_WORDS)
    stop_words = STOP_WORDS


class Finnish(Language):

@@ -11,7 +11,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups


@@ -21,17 +20,13 @@ class FrenchDefaults(Language.Defaults):
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: 'fr'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = set(STOP_WORDS)
    infixes = tuple(TOKENIZER_INFIXES)
    suffixes = tuple(TOKENIZER_SUFFIXES)
    stop_words = STOP_WORDS
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    token_match = TOKEN_MATCH
    syntax_iterators = dict(SYNTAX_ITERATORS)

    @classmethod
    def create_lemmatizer(cls, nlp=None):
        return Lemmatizer(LOOKUP)
    syntax_iterators = SYNTAX_ITERATORS
    lemma_lookup = LOOKUP


class French(Language):

@@ -12,9 +12,8 @@ from ...util import update_exc
class HebrewDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'he'

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
    stop_words = set(STOP_WORDS)
    stop_words = STOP_WORDS


class Hebrew(Language):

@@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups


@@ -18,17 +17,13 @@ class HungarianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'hu'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = set(STOP_WORDS)
    prefixes = tuple(TOKENIZER_PREFIXES)
    suffixes = tuple(TOKENIZER_SUFFIXES)
    infixes = tuple(TOKENIZER_INFIXES)
    stop_words = STOP_WORDS
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES
    token_match = TOKEN_MATCH

    @classmethod
    def create_lemmatizer(cls, nlp=None):
        return Lemmatizer(LOOKUP)
    lemma_lookup = LOOKUP


class Hungarian(Language):

@@ -11,7 +11,6 @@ from .syntax_iterators import SYNTAX_ITERATORS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG
from ...util import update_exc


@@ -19,19 +18,14 @@ from ...util import update_exc
class IndonesianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'id'

    lex_attr_getters.update(LEX_ATTRS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = set(STOP_WORDS)
    prefixes = tuple(TOKENIZER_PREFIXES)
    suffixes = tuple(TOKENIZER_SUFFIXES)
    infixes = tuple(TOKENIZER_INFIXES)
    syntax_iterators = dict(SYNTAX_ITERATORS)

    @classmethod
    def create_lemmatizer(cls, nlp=None):
        return Lemmatizer(LOOKUP)
    stop_words = STOP_WORDS
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES
    syntax_iterators = SYNTAX_ITERATORS
    lemma_lookup = LOOKUP


class Indonesian(Language):

@@ -16,8 +16,7 @@ _num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
              'sembilanbelas', 'duapuluh', 'seratus', 'seribu', 'sejuta',
              'ribu', 'rb', 'juta', 'jt', 'miliar', 'biliun', 'triliun',
              'kuadriliun', 'kuintiliun', 'sekstiliun', 'septiliun', 'oktiliun',
              'noniliun', 'desiliun',
              ]
              'noniliun', 'desiliun']


def like_num(text):

@@ -7,7 +7,6 @@ from .lemmatizer import LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups


@@ -16,13 +15,9 @@ class ItalianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'it'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
    stop_words = set(STOP_WORDS)

    @classmethod
    def create_lemmatizer(cls, nlp=None):
        return Lemmatizer(LOOKUP)
    stop_words = STOP_WORDS
    lemma_lookup = LOOKUP


class Italian(Language):

@@ -16,9 +16,8 @@ class NorwegianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'nb'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = set(STOP_WORDS)
    stop_words = STOP_WORDS


class Norwegian(Language):

@@ -16,9 +16,8 @@ class DutchDefaults(Language.Defaults):
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: 'nl'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
    stop_words = set(STOP_WORDS)
    stop_words = STOP_WORDS


class Dutch(Language):

@@ -15,9 +15,8 @@ class PolishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'pl'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = set(STOP_WORDS)
    stop_words = STOP_WORDS


class Polish(Language):

@@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups


@@ -19,13 +18,9 @@ class PortugueseDefaults(Language.Defaults):
    lex_attr_getters[LANG] = lambda text: 'pt'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
    lex_attr_getters.update(LEX_ATTRS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = set(STOP_WORDS)

    @classmethod
    def create_lemmatizer(cls, nlp=None):
        return Lemmatizer(LOOKUP)
    stop_words = STOP_WORDS
    lemma_lookup = LOOKUP


class Portuguese(Language):

@@ -9,7 +9,6 @@ from .lemmatizer import LEMMA_RULES, LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups


@@ -18,13 +17,10 @@ class SwedishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'sv'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = set(STOP_WORDS)

    @classmethod
    def create_lemmatizer(cls, nlp=None):
        return Lemmatizer(LOOKUP)
    stop_words = STOP_WORDS
    lemma_rules = LEMMA_RULES
    lemma_lookup = LOOKUP


class Swedish(Language):

@@ -12,24 +12,27 @@ from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups


class ThaiDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'th'
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    tag_map = dict(TAG_MAP)
    stop_words = set(STOP_WORDS)
    tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
    tag_map = TAG_MAP
    stop_words = STOP_WORDS


class Thai(Language):
    lang = 'th'
    Defaults = ThaiDefaults
    def make_doc(self, text):
        try:
            from pythainlp.tokenize import word_tokenize
        except ImportError:
            raise ImportError("The Thai tokenizer requires the PyThaiNLP library: "
                              "https://github.com/wannaphongcom/pythainlp/")
        words = [x for x in list(word_tokenize(text,"newmm"))]
        return Doc(self.vocab, words=words, spaces=[False]*len(words))
    lang = 'th'
    Defaults = ThaiDefaults

    def make_doc(self, text):
        try:
            from pythainlp.tokenize import word_tokenize
        except ImportError:
            raise ImportError("The Thai tokenizer requires the PyThaiNLP library: "
                              "https://github.com/wannaphongcom/pythainlp/")
        words = [x for x in list(word_tokenize(text,"newmm"))]
        return Doc(self.vocab, words=words, spaces=[False]*len(words))


__all__ = ['Thai']

@@ -13,7 +13,6 @@ class MultiLanguageDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'xx'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)

@ -1,12 +1,9 @@
|
|||
# coding: utf8
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
from contextlib import contextmanager
|
||||
import dill
|
||||
|
||||
import numpy
|
||||
from thinc.neural import Model
|
||||
from thinc.neural.ops import NumpyOps, CupyOps
|
||||
from thinc.neural.optimizers import Adam, SGD
|
||||
from thinc.neural.optimizers import Adam
|
||||
import random
|
||||
import ujson
|
||||
from collections import OrderedDict
|
||||
|
@ -17,30 +14,27 @@ from .vocab import Vocab
|
|||
from .tagger import Tagger
|
||||
from .lemmatizer import Lemmatizer
|
||||
from .syntax.parser import get_templates
|
||||
from .syntax import nonproj
|
||||
|
||||
from .pipeline import NeuralDependencyParser, EntityRecognizer
|
||||
from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer
|
||||
from .pipeline import NeuralLabeller
|
||||
from .pipeline import SimilarityHook
|
||||
from .pipeline import TextCategorizer
|
||||
from . import about
|
||||
from .pipeline import NeuralDependencyParser, TokenVectorEncoder, NeuralTagger
|
||||
from .pipeline import NeuralEntityRecognizer, SimilarityHook, TextCategorizer
|
||||
|
||||
from .compat import json_dumps, izip
|
||||
from .scorer import Scorer
|
||||
from ._ml import link_vectors_to_models
|
||||
from .attrs import IS_STOP
|
||||
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from .lang.tokenizer_exceptions import TOKEN_MATCH
|
||||
from .lang.tag_map import TAG_MAP
|
||||
from .lang.lex_attrs import LEX_ATTRS
|
||||
from . import util
|
||||
from .scorer import Scorer
|
||||
from ._ml import link_vectors_to_models
|
||||
from . import about
|
||||
|
||||
|
||||
class BaseDefaults(object):
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules)
|
||||
return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules,
|
||||
cls.lemma_lookup)
|
||||
|
||||
@classmethod
|
||||
def create_vocab(cls, nlp=None):
|
||||
|
@ -70,59 +64,7 @@ class BaseDefaults(object):
|
|||
prefix_search=prefix_search, suffix_search=suffix_search,
|
||||
infix_finditer=infix_finditer, token_match=token_match)
|
||||
|
||||
@classmethod
|
||||
def create_tagger(cls, nlp=None, **cfg):
|
||||
if nlp is None:
|
||||
return NeuralTagger(cls.create_vocab(nlp), **cfg)
|
||||
else:
|
||||
return NeuralTagger(nlp.vocab, **cfg)
|
||||
|
||||
@classmethod
|
||||
def create_parser(cls, nlp=None, **cfg):
|
||||
if nlp is None:
|
||||
return NeuralDependencyParser(cls.create_vocab(nlp), **cfg)
|
||||
else:
|
||||
return NeuralDependencyParser(nlp.vocab, **cfg)
|
||||
|
||||
@classmethod
|
||||
def create_entity(cls, nlp=None, **cfg):
|
||||
if nlp is None:
|
||||
return NeuralEntityRecognizer(cls.create_vocab(nlp), **cfg)
|
||||
else:
|
||||
return NeuralEntityRecognizer(nlp.vocab, **cfg)
|
||||
|
||||
@classmethod
|
||||
def create_pipeline(cls, nlp=None, disable=tuple()):
|
||||
meta = nlp.meta if nlp is not None else {}
|
||||
# Resolve strings, like "cnn", "lstm", etc
|
||||
pipeline = []
|
||||
for entry in meta.get('pipeline', []):
|
||||
if entry in disable or getattr(entry, 'name', entry) in disable:
|
||||
continue
|
||||
factory = cls.Defaults.factories[entry]
|
||||
pipeline.append(factory(nlp, **meta.get(entry, {})))
|
||||
return pipeline
|
||||
|
||||
factories = {
|
||||
'make_doc': create_tokenizer,
|
||||
'tensorizer': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
|
||||
'tagger': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
|
||||
'parser': lambda nlp, **cfg: [
|
||||
NeuralDependencyParser(nlp.vocab, **cfg),
|
||||
nonproj.deprojectivize],
|
||||
'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
|
||||
'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)],
|
||||
'textcat': lambda nlp, **cfg: [TextCategorizer(nlp.vocab, **cfg)],
|
||||
# Temporary compatibility -- delete after pivot
|
||||
'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
|
||||
'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
|
||||
'dependencies': lambda nlp, **cfg: [
|
||||
NeuralDependencyParser(nlp.vocab, **cfg),
|
||||
nonproj.deprojectivize,
|
||||
],
|
||||
'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
|
||||
}
|
||||
|
||||
pipe_names = ['tensorizer', 'tagger', 'parser', 'ner']
|
||||
token_match = TOKEN_MATCH
|
||||
prefixes = tuple(TOKENIZER_PREFIXES)
|
||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||
|
@ -136,6 +78,7 @@ class BaseDefaults(object):
|
|||
lemma_rules = {}
|
||||
lemma_exc = {}
|
||||
lemma_index = {}
|
||||
lemma_lookup = {}
|
||||
morph_rules = {}
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
syntax_iterators = {}
|
||||
|
@ -152,8 +95,17 @@ class Language(object):
|
|||
Defaults = BaseDefaults
|
||||
lang = None
|
||||
|
||||
def __init__(self, vocab=True, make_doc=True, pipeline=None,
|
||||
meta={}, disable=tuple(), **kwargs):
|
||||
factories = {
|
||||
'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp),
|
||||
'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg),
|
||||
'tagger': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg),
|
||||
'parser': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg),
|
||||
'ner': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg),
|
||||
'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
|
||||
'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg)
|
||||
}
|
||||
|
||||
def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs):
|
||||
"""Initialise a Language object.
|
||||
|
||||
vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
|
||||
|
@ -179,28 +131,7 @@ class Language(object):
|
|||
factory = self.Defaults.create_tokenizer
|
||||
make_doc = factory(self, **meta.get('tokenizer', {}))
|
||||
self.tokenizer = make_doc
|
||||
if pipeline is True:
|
||||
self.pipeline = self.Defaults.create_pipeline(self, disable)
|
||||
elif pipeline:
|
||||
# Careful not to do getattr(p, 'name', None) here
|
||||
# If we had disable=[None], we'd disable everything!
|
||||
self.pipeline = [p for p in pipeline
|
||||
if p not in disable
|
||||
and getattr(p, 'name', p) not in disable]
|
||||
# Resolve strings, like "cnn", "lstm", etc
|
||||
for i, entry in enumerate(self.pipeline):
|
||||
if entry in self.Defaults.factories:
|
||||
factory = self.Defaults.factories[entry]
|
||||
self.pipeline[i] = factory(self, **meta.get(entry, {}))
|
||||
else:
|
||||
self.pipeline = []
|
||||
flat_list = []
|
||||
for pipe in self.pipeline:
|
||||
if isinstance(pipe, list):
|
||||
flat_list.extend(pipe)
|
||||
else:
|
||||
flat_list.append(pipe)
|
||||
self.pipeline = flat_list
|
||||
self.pipeline = []
|
||||
self._optimizer = None
|
||||
|
||||
@property
|
||||
|
@ -214,11 +145,7 @@ class Language(object):
|
|||
self._meta.setdefault('email', '')
|
||||
self._meta.setdefault('url', '')
|
||||
self._meta.setdefault('license', '')
|
||||
pipeline = []
|
||||
for component in self.pipeline:
|
||||
if hasattr(component, 'name'):
|
||||
pipeline.append(component.name)
|
||||
self._meta['pipeline'] = pipeline
|
||||
self._meta['pipeline'] = self.pipe_names
|
||||
return self._meta
|
||||
|
||||
@meta.setter
|
||||
|
@ -228,34 +155,144 @@ class Language(object):
|
|||
# Conveniences to access pipeline components
|
||||
@property
|
||||
def tensorizer(self):
|
||||
return self.get_component('tensorizer')
|
||||
return self.get_pipe('tensorizer')
|
||||
|
||||
@property
|
||||
def tagger(self):
|
||||
return self.get_component('tagger')
|
||||
return self.get_pipe('tagger')
|
||||
|
||||
@property
|
||||
def parser(self):
|
||||
return self.get_component('parser')
|
||||
return self.get_pipe('parser')
|
||||
|
||||
@property
|
||||
def entity(self):
|
||||
return self.get_component('ner')
|
||||
return self.get_pipe('ner')
|
||||
|
||||
@property
|
||||
def matcher(self):
|
||||
return self.get_component('matcher')
|
||||
return self.get_pipe('matcher')
|
||||
|
||||
def get_component(self, name):
|
||||
if self.pipeline in (True, None):
|
||||
return None
|
||||
for proc in self.pipeline:
|
||||
if hasattr(proc, 'name') and proc.name.endswith(name):
|
||||
return proc
|
||||
return None
|
||||
@property
|
||||
def pipe_names(self):
|
||||
"""Get names of available pipeline components.
|
||||
|
||||
RETURNS (list): List of component name strings, in order.
|
||||
"""
|
||||
return [pipe_name for pipe_name, _ in self.pipeline]
|
||||
|
||||
def get_pipe(self, name):
|
||||
"""Get a pipeline component for a given component name.
|
||||
|
||||
name (unicode): Name of pipeline component to get.
|
||||
RETURNS (callable): The pipeline component.
|
||||
"""
|
||||
for pipe_name, component in self.pipeline:
|
||||
if pipe_name == name:
|
||||
return component
|
||||
msg = "No component '{}' found in pipeline. Available names: {}"
|
||||
raise KeyError(msg.format(name, self.pipe_names))
|
||||
|
||||
def create_pipe(self, name, config=dict()):
|
||||
"""Create a pipeline component from a factory.
|
||||
|
||||
name (unicode): Factory name to look up in `Language.factories`.
|
||||
config (dict): Configuration parameters to initialise component.
|
||||
RETURNS (callable): Pipeline component.
|
||||
"""
|
||||
if name not in self.factories:
|
||||
raise KeyError("Can't find factory for '{}'.".format(name))
|
||||
factory = self.factories[name]
|
||||
return factory(self, **config)
|
||||
|
||||
def add_pipe(self, component, name=None, before=None, after=None,
|
||||
first=None, last=None):
|
||||
"""Add a component to the processing pipeline. Valid components are
|
||||
callables that take a `Doc` object, modify it and return it. Only one of
|
||||
before, after, first or last can be set. Default behaviour is "last".
|
||||
|
||||
component (callable): The pipeline component.
|
||||
name (unicode): Name of pipeline component. Overwrites existing
|
||||
component.name attribute if available. If no name is set and
|
||||
the component exposes no name attribute, component.__name__ is
|
||||
used. An error is raised if the name already exists in the pipeline.
|
||||
before (unicode): Component name to insert component directly before.
|
||||
after (unicode): Component name to insert component directly after.
|
||||
first (bool): Insert component first / not first in the pipeline.
|
||||
last (bool): Insert component last / not last in the pipeline.
|
||||
|
||||
EXAMPLE:
|
||||
>>> nlp.add_pipe(component, before='ner')
|
||||
>>> nlp.add_pipe(component, name='custom_name', last=True)
|
||||
"""
|
||||
if name is None:
|
||||
if hasattr(component, 'name'):
|
||||
name = component.name
|
||||
elif hasattr(component, '__name__'):
|
||||
name = component.__name__
|
||||
elif hasattr(component, '__class__') and hasattr(component.__class__, '__name__'):
|
||||
name = component.__class__.__name__
|
||||
else:
|
||||
name = repr(component)
|
||||
if name in self.pipe_names:
|
||||
raise ValueError("'{}' already exists in pipeline.".format(name))
|
||||
if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2:
|
||||
msg = ("Invalid constraints. You can only set one of the "
|
||||
"following: before, after, first, last.")
|
||||
raise ValueError(msg)
|
||||
pipe = (name, component)
|
||||
if last or not any([first, before, after]):
|
||||
self.pipeline.append(pipe)
|
||||
elif first:
|
||||
self.pipeline.insert(0, pipe)
|
||||
elif before and before in self.pipe_names:
|
||||
self.pipeline.insert(self.pipe_names.index(before), pipe)
|
||||
elif after and after in self.pipe_names:
|
||||
self.pipeline.insert(self.pipe_names.index(after), pipe)
|
||||
else:
|
||||
msg = "Can't find '{}' in pipeline. Available names: {}"
|
||||
unfound = before or after
|
||||
raise ValueError(msg.format(unfound, self.pipe_names))
|
||||
|
||||
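add_pipe() resolves the component name from component.name, __name__ or the class name, and accepts at most one of before, after, first or last. A minimal usage sketch against a blank Language, mirroring the new pipe-method tests further down (merge_entities and 'noop' are illustrative names):

from spacy.language import Language

def merge_entities(doc):   # hypothetical no-op component: takes a Doc, returns it
    return doc

nlp = Language()
nlp.add_pipe(merge_entities)                            # appended last by default
nlp.add_pipe(lambda doc: doc, name='noop', first=True)  # explicit name, insert first
assert nlp.pipe_names == ['noop', 'merge_entities']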
def replace_pipe(self, name, component):
|
||||
"""Replace a component in the pipeline.
|
||||
|
||||
name (unicode): Name of the component to replace.
|
||||
component (callable): Pipeline component.
|
||||
"""
|
||||
if name not in self.pipe_names:
|
||||
msg = "Can't find '{}' in pipeline. Available names: {}"
|
||||
raise ValueError(msg.format(name, self.pipe_names))
|
||||
self.pipeline[self.pipe_names.index(name)] = (name, component)
|
||||
|
||||
def rename_pipe(self, old_name, new_name):
|
||||
"""Rename a pipeline component.
|
||||
|
||||
old_name (unicode): Name of the component to rename.
|
||||
new_name (unicode): New name of the component.
|
||||
"""
|
||||
if old_name not in self.pipe_names:
|
||||
msg = "Can't find '{}' in pipeline. Available names: {}"
|
||||
raise ValueError(msg.format(old_name, self.pipe_names))
|
||||
if new_name in self.pipe_names:
|
||||
msg = "'{}' already exists in pipeline. Existing names: {}"
|
||||
raise ValueError(msg.format(new_name, self.pipe_names))
|
||||
i = self.pipe_names.index(old_name)
|
||||
self.pipeline[i] = (new_name, self.pipeline[i][1])
|
||||
|
||||
def remove_pipe(self, name):
|
||||
"""Remove a component from the pipeline.
|
||||
|
||||
name (unicode): Name of the component to remove.
|
||||
RETURNS (tuple): A `(name, component)` tuple of the removed component.
|
||||
"""
|
||||
if name not in self.pipe_names:
|
||||
msg = "Can't find '{}' in pipeline. Available names: {}"
|
||||
raise ValueError(msg.format(name, self.pipe_names))
|
||||
return self.pipeline.pop(self.pipe_names.index(name))
|
||||
|
||||
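replace_pipe(), rename_pipe() and remove_pipe() all operate on the (name, component) tuples by name. A small sketch of the round trip on a blank Language, with 'my_component' and 'custom_name' as illustrative names:

from spacy.language import Language

nlp = Language()
nlp.add_pipe(lambda doc: doc, name='my_component')
nlp.replace_pipe('my_component', lambda doc: doc)   # swap in a new callable under the same name
nlp.rename_pipe('my_component', 'custom_name')      # keep the component, change its key
name, component = nlp.remove_pipe('custom_name')    # returns the removed (name, component) tuple
assert name == 'custom_name' and not nlp.pipe_names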
def __call__(self, text, disable=[]):
|
||||
"""'Apply the pipeline to some text. The text can span multiple sentences,
|
||||
"""Apply the pipeline to some text. The text can span multiple sentences,
|
||||
and can contain arbitrary whitespace. Alignment into the original string
|
||||
is preserved.
|
||||
|
||||
|
@ -269,8 +306,7 @@ class Language(object):
|
|||
('An', 'NN')
|
||||
"""
|
||||
doc = self.make_doc(text)
|
||||
for proc in self.pipeline:
|
||||
name = getattr(proc, 'name', None)
|
||||
for name, proc in self.pipeline:
|
||||
if name in disable:
|
||||
continue
|
||||
doc = proc(doc)
|
||||
|
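Note that disable now matches against the registered pipeline names, i.e. the first element of each (name, proc) tuple, rather than a component's own .name attribute. For example, assuming an nlp object whose pipeline actually contains 'tagger' and 'parser':

doc = nlp(u'This is a sentence.', disable=['tagger', 'parser'])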
@ -308,7 +344,7 @@ class Language(object):
|
|||
grads[key] = (W, dW)
|
||||
pipes = list(self.pipeline)
|
||||
random.shuffle(pipes)
|
||||
for proc in pipes:
|
||||
for name, proc in pipes:
|
||||
if not hasattr(proc, 'update'):
|
||||
continue
|
||||
proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses)
|
||||
|
@ -322,7 +358,7 @@ class Language(object):
|
|||
docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
|
||||
YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects.
|
||||
"""
|
||||
for proc in self.pipeline:
|
||||
for name, proc in self.pipeline:
|
||||
if hasattr(proc, 'preprocess_gold'):
|
||||
docs_golds = proc.preprocess_gold(docs_golds)
|
||||
for doc, gold in docs_golds:
|
||||
|
@ -354,7 +390,7 @@ class Language(object):
|
|||
|
||||
get_gold_tuples (function): Function returning gold data
|
||||
**cfg: Config parameters.
|
||||
returns: An optimizer
|
||||
RETURNS: An optimizer
|
||||
"""
|
||||
# Populate vocab
|
||||
if get_gold_tuples is not None:
|
||||
|
@ -371,7 +407,7 @@ class Language(object):
|
|||
else:
|
||||
device = None
|
||||
link_vectors_to_models(self.vocab)
|
||||
for proc in self.pipeline:
|
||||
for name, proc in self.pipeline:
|
||||
if hasattr(proc, 'begin_training'):
|
||||
context = proc.begin_training(get_gold_tuples(),
|
||||
pipeline=self.pipeline)
|
||||
|
@ -393,7 +429,7 @@ class Language(object):
|
|||
docs, golds = zip(*docs_golds)
|
||||
docs = list(docs)
|
||||
golds = list(golds)
|
||||
for pipe in self.pipeline:
|
||||
for name, pipe in self.pipeline:
|
||||
if not hasattr(pipe, 'pipe'):
|
||||
for doc in docs:
|
||||
pipe(doc)
|
||||
|
@ -419,7 +455,7 @@ class Language(object):
|
|||
>>> with nlp.use_params(optimizer.averages):
|
||||
>>> nlp.to_disk('/tmp/checkpoint')
|
||||
"""
|
||||
contexts = [pipe.use_params(params) for pipe
|
||||
contexts = [pipe.use_params(params) for name, pipe
|
||||
in self.pipeline if hasattr(pipe, 'use_params')]
|
||||
# TODO: Having trouble with contextlib
|
||||
# Workaround: these aren't actually context managers atm.
|
||||
|
@ -466,8 +502,7 @@ class Language(object):
|
|||
yield (doc, context)
|
||||
return
|
||||
docs = (self.make_doc(text) for text in texts)
|
||||
for proc in self.pipeline:
|
||||
name = getattr(proc, 'name', None)
|
||||
for name, proc in self.pipeline:
|
||||
if name in disable:
|
||||
continue
|
||||
if hasattr(proc, 'pipe'):
|
||||
|
@ -495,14 +530,14 @@ class Language(object):
|
|||
('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)),
|
||||
('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
|
||||
))
|
||||
for proc in self.pipeline:
|
||||
for name, proc in self.pipeline:
|
||||
if not hasattr(proc, 'name'):
|
||||
continue
|
||||
if proc.name in disable:
|
||||
if name in disable:
|
||||
continue
|
||||
if not hasattr(proc, 'to_disk'):
|
||||
continue
|
||||
serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
|
||||
serializers[name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
|
||||
serializers['vocab'] = lambda p: self.vocab.to_disk(p)
|
||||
util.to_disk(path, serializers, {p: False for p in disable})
|
||||
|
||||
|
@ -526,14 +561,12 @@ class Language(object):
|
|||
('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)),
|
||||
('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
|
||||
))
|
||||
for proc in self.pipeline:
|
||||
if not hasattr(proc, 'name'):
|
||||
continue
|
||||
if proc.name in disable:
|
||||
for name, proc in self.pipeline:
|
||||
if name in disable:
|
||||
continue
|
||||
if not hasattr(proc, 'to_disk'):
|
||||
continue
|
||||
deserializers[proc.name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
|
||||
deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
|
||||
exclude = {p: False for p in disable}
|
||||
if not (path / 'vocab').exists():
|
||||
exclude['vocab'] = True
|
||||
|
@ -552,8 +585,8 @@ class Language(object):
|
|||
('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
|
||||
('meta', lambda: ujson.dumps(self.meta))
|
||||
))
|
||||
for i, proc in enumerate(self.pipeline):
|
||||
if getattr(proc, 'name', None) in disable:
|
||||
for i, (name, proc) in enumerate(self.pipeline):
|
||||
if name in disable:
|
||||
continue
|
||||
if not hasattr(proc, 'to_bytes'):
|
||||
continue
|
||||
|
@ -572,8 +605,8 @@ class Language(object):
|
|||
('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)),
|
||||
('meta', lambda b: self.meta.update(ujson.loads(b)))
|
||||
))
|
||||
for i, proc in enumerate(self.pipeline):
|
||||
if getattr(proc, 'name', None) in disable:
|
||||
for i, (name, proc) in enumerate(self.pipeline):
|
||||
if name in disable:
|
||||
continue
|
||||
if not hasattr(proc, 'from_bytes'):
|
||||
continue
|
||||
|
|
|
@ -10,20 +10,23 @@ class Lemmatizer(object):
|
|||
def load(cls, path, index=None, exc=None, rules=None):
|
||||
return cls(index or {}, exc or {}, rules or {})
|
||||
|
||||
def __init__(self, index, exceptions, rules):
|
||||
self.index = index
|
||||
self.exc = exceptions
|
||||
self.rules = rules
|
||||
def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
|
||||
self.index = index if index is not None else {}
|
||||
self.exc = exceptions if exceptions is not None else {}
|
||||
self.rules = rules if rules is not None else {}
|
||||
self.lookup_table = lookup if lookup is not None else {}
|
||||
|
||||
def __call__(self, string, univ_pos, morphology=None):
|
||||
if univ_pos == NOUN:
|
||||
if univ_pos in (NOUN, 'NOUN', 'noun'):
|
||||
univ_pos = 'noun'
|
||||
elif univ_pos == VERB:
|
||||
elif univ_pos in (VERB, 'VERB', 'verb'):
|
||||
univ_pos = 'verb'
|
||||
elif univ_pos == ADJ:
|
||||
elif univ_pos in (ADJ, 'ADJ', 'adj'):
|
||||
univ_pos = 'adj'
|
||||
elif univ_pos == PUNCT:
|
||||
elif univ_pos in (PUNCT, 'PUNCT', 'punct'):
|
||||
univ_pos = 'punct'
|
||||
else:
|
||||
return set([string.lower()])
|
||||
# See Issue #435 for example of where this logic is required.
|
||||
if self.is_base_form(univ_pos, morphology):
|
||||
return set([string.lower()])
|
||||
|
@ -77,6 +80,11 @@ class Lemmatizer(object):
|
|||
def punct(self, string, morphology=None):
|
||||
return self(string, 'punct', morphology)
|
||||
|
||||
def lookup(self, string):
|
||||
if string in self.lookup_table:
|
||||
return self.lookup_table[string]
|
||||
return string
|
||||
|
||||
|
||||
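lookup() returns the table entry if present and otherwise the string unchanged. A minimal sketch, mirroring the lookup fixture used in the new Doc-creation tests below:

from spacy.lemmatizer import Lemmatizer

lemmatizer = Lemmatizer(lookup={'dogs': 'dog', 'mice': 'mouse'})
assert lemmatizer.lookup('dogs') == 'dog'
assert lemmatizer.lookup('dogses') == 'dogses'   # unknown strings come back unchanged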
def lemmatize(string, index, exceptions, rules):
|
||||
string = string.lower()
|
||||
|
|
|
@ -1,19 +0,0 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .lemmatizer import Lemmatizer
|
||||
|
||||
|
||||
class Lemmatizer(Lemmatizer):
|
||||
@classmethod
|
||||
def load(cls, path, lookup):
|
||||
return cls(lookup or {})
|
||||
|
||||
def __init__(self, lookup):
|
||||
self.lookup = lookup
|
||||
|
||||
def __call__(self, string, univ_pos, morphology=None):
|
||||
try:
|
||||
return set([self.lookup[string]])
|
||||
except:
|
||||
return set([string])
|
|
@ -35,6 +35,8 @@ cdef class Morphology:
|
|||
cdef RichTagC* rich_tags
|
||||
cdef PreshMapArray _cache
|
||||
|
||||
cdef int assign_untagged(self, TokenC* token) except -1
|
||||
|
||||
cdef int assign_tag(self, TokenC* token, tag) except -1
|
||||
|
||||
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
|
||||
|
|
|
@ -42,7 +42,7 @@ cdef class Morphology:
|
|||
self.tag_names = tuple(sorted(tag_map.keys()))
|
||||
self.reverse_index = {}
|
||||
|
||||
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
|
||||
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
|
||||
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
||||
self.tag_map[tag_str] = dict(attrs)
|
||||
attrs = _normalize_props(attrs)
|
||||
|
@ -52,6 +52,10 @@ cdef class Morphology:
|
|||
self.rich_tags[i].morph = 0
|
||||
self.rich_tags[i].pos = attrs[POS]
|
||||
self.reverse_index[self.rich_tags[i].name] = i
|
||||
# Add a 'null' tag, which we can reference when assign morphology to
|
||||
# untagged tokens.
|
||||
self.rich_tags[self.n_tags].id = self.n_tags
|
||||
|
||||
self._cache = PreshMapArray(self.n_tags)
|
||||
self.exc = {}
|
||||
if exc is not None:
|
||||
|
@ -62,6 +66,15 @@ cdef class Morphology:
|
|||
return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
|
||||
self.exc), None, None)
|
||||
|
||||
cdef int assign_untagged(self, TokenC* token) except -1:
|
||||
"""Set morphological attributes on a token without a POS tag. Uses
|
||||
the lemmatizer's lookup() method, which looks up the string in the
|
||||
table provided by the language data as lemma_lookup (if available)."""
|
||||
if token.lemma == 0:
|
||||
orth_str = self.strings[token.lex.orth]
|
||||
lemma = self.lemmatizer.lookup(orth_str)
|
||||
token.lemma = self.strings.add(lemma)
|
||||
|
||||
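assign_untagged() is called from Doc.push_back (see the doc.pyx hunk further down), so lookup lemmas are available as soon as a Doc is created, before any tagger has run. A sketch of the observable behaviour, assuming a Vocab built with a lookup lemmatizer:

from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.lemmatizer import Lemmatizer

vocab = Vocab(lemmatizer=Lemmatizer(lookup={'dogs': 'dog'}))
doc = Doc(vocab, words=['dogs', 'dogses'])
assert doc[0].lemma_ == 'dog'      # found in the lookup table
assert doc[1].lemma_ == 'dogses'   # not in the table: falls back to the string itself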
cdef int assign_tag(self, TokenC* token, tag) except -1:
|
||||
if isinstance(tag, basestring):
|
||||
tag = self.strings.add(tag)
|
||||
|
@ -72,7 +85,7 @@ cdef class Morphology:
|
|||
token.tag = tag
|
||||
|
||||
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
|
||||
if tag_id >= self.n_tags:
|
||||
if tag_id > self.n_tags:
|
||||
raise ValueError("Unknown tag ID: %s" % tag_id)
|
||||
# TODO: It's pretty arbitrary to put this logic here. I guess the justification
|
||||
# is that this is where the specific word and the tag interact. Still,
|
||||
|
@ -151,8 +164,6 @@ cdef class Morphology:
|
|||
cdef unicode py_string = self.strings[orth]
|
||||
if self.lemmatizer is None:
|
||||
return self.strings.add(py_string.lower())
|
||||
if univ_pos not in (NOUN, VERB, ADJ, PUNCT):
|
||||
return self.strings.add(py_string.lower())
|
||||
cdef set lemma_strings
|
||||
cdef unicode lemma_string
|
||||
lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
|
||||
|
|
|
@ -28,6 +28,7 @@ from thinc.neural._classes.difference import Siamese, CauchySimilarity
|
|||
from .tokens.doc cimport Doc
|
||||
from .syntax.parser cimport Parser as LinearParser
|
||||
from .syntax.nn_parser cimport Parser as NeuralParser
|
||||
from .syntax import nonproj
|
||||
from .syntax.parser import get_templates as get_feature_templates
|
||||
from .syntax.beam_parser cimport BeamParser
|
||||
from .syntax.ner cimport BiluoPushDown
|
||||
|
@ -157,11 +158,13 @@ class BaseThincComponent(object):
|
|||
|
||||
def to_bytes(self, **exclude):
|
||||
"""Serialize the pipe to a bytestring."""
|
||||
serialize = OrderedDict((
|
||||
('cfg', lambda: json_dumps(self.cfg)),
|
||||
('model', lambda: self.model.to_bytes()),
|
||||
('vocab', lambda: self.vocab.to_bytes())
|
||||
))
|
||||
serialize = OrderedDict()
|
||||
serialize['cfg'] = lambda: json_dumps(self.cfg)
|
||||
if self.model in (True, False, None):
|
||||
serialize['model'] = lambda: self.model
|
||||
else:
|
||||
serialize['model'] = self.model.to_bytes
|
||||
serialize['vocab'] = self.vocab.to_bytes
|
||||
return util.to_bytes(serialize, exclude)
|
||||
|
||||
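Skipping model.to_bytes() while the model is still the True/False/None placeholder is what lets an untrained component be serialized; the new test_serialize_empty_model.py below exercises exactly this. A sketch, assuming TextCategorizer relies on this base implementation:

from spacy.lang.en import English
from spacy.pipeline import TextCategorizer

nlp = English()
textcat = TextCategorizer(nlp.vocab, labels=['ENTITY', 'ACTION', 'MODIFIER'])
data = textcat.to_bytes()   # works even though no model has been created or trained yet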
def from_bytes(self, bytes_data, **exclude):
|
||||
|
@ -182,11 +185,11 @@ class BaseThincComponent(object):
|
|||
|
||||
def to_disk(self, path, **exclude):
|
||||
"""Serialize the pipe to disk."""
|
||||
serialize = OrderedDict((
|
||||
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
|
||||
('vocab', lambda p: self.vocab.to_disk(p)),
|
||||
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
|
||||
))
|
||||
serialize = OrderedDict()
|
||||
serialize['cfg'] = lambda p: p.open('w').write(json_dumps(self.cfg))
|
||||
serialize['vocab'] = lambda p: self.vocab.to_disk(p)
|
||||
if self.model not in (None, True, False):
|
||||
serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes())
|
||||
util.to_disk(path, serialize, exclude)
|
||||
|
||||
def from_disk(self, path, **exclude):
|
||||
|
@ -437,13 +440,16 @@ class NeuralTagger(BaseThincComponent):
|
|||
yield
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
serialize = OrderedDict((
|
||||
('model', lambda: self.model.to_bytes()),
|
||||
('vocab', lambda: self.vocab.to_bytes()),
|
||||
('tag_map', lambda: msgpack.dumps(self.vocab.morphology.tag_map,
|
||||
use_bin_type=True,
|
||||
encoding='utf8'))
|
||||
))
|
||||
serialize = OrderedDict()
|
||||
if self.model in (None, True, False):
|
||||
serialize['model'] = lambda: self.model
|
||||
else:
|
||||
serialize['model'] = self.model.to_bytes
|
||||
serialize['vocab'] = self.vocab.to_bytes
|
||||
|
||||
serialize['tag_map'] = lambda: msgpack.dumps(self.vocab.morphology.tag_map,
|
||||
use_bin_type=True,
|
||||
encoding='utf8')
|
||||
return util.to_bytes(serialize, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
|
@ -778,11 +784,19 @@ cdef class DependencyParser(LinearParser):
|
|||
if isinstance(label, basestring):
|
||||
label = self.vocab.strings[label]
|
||||
|
||||
@property
|
||||
def postprocesses(self):
|
||||
return [nonproj.deprojectivize]
|
||||
|
||||
|
||||
cdef class NeuralDependencyParser(NeuralParser):
|
||||
name = 'parser'
|
||||
TransitionSystem = ArcEager
|
||||
|
||||
@property
|
||||
def postprocesses(self):
|
||||
return [nonproj.deprojectivize]
|
||||
|
||||
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
|
||||
for target in []:
|
||||
labeller = NeuralLabeller(self.vocab, target=target)
|
||||
|
@ -823,6 +837,11 @@ cdef class BeamDependencyParser(BeamParser):
|
|||
if isinstance(label, basestring):
|
||||
label = self.vocab.strings[label]
|
||||
|
||||
@property
|
||||
def postprocesses(self):
|
||||
return [nonproj.deprojectivize]
|
||||
|
||||
|
||||
|
||||
__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'BeamDependencyParser',
|
||||
'BeamEntityRecognizer', 'TokenVectorEnoder']
|
||||
|
|
|
@ -241,8 +241,8 @@ cdef class Parser:
|
|||
def Model(cls, nr_class, **cfg):
|
||||
depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
|
||||
token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128))
|
||||
hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 128))
|
||||
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 1))
|
||||
hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 200))
|
||||
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2))
|
||||
embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
|
||||
hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
|
||||
hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
|
||||
|
@ -779,6 +779,14 @@ cdef class Parser:
|
|||
for i in range(doc.length):
|
||||
doc.c[i] = state.c._sent[i]
|
||||
self.moves.finalize_doc(doc)
|
||||
for hook in self.postprocesses:
|
||||
for doc in docs:
|
||||
hook(doc)
|
||||
|
||||
@property
|
||||
def postprocesses(self):
|
||||
# Available for subclasses, e.g. to deprojectivize
|
||||
return []
|
||||
|
||||
def add_label(self, label):
|
||||
resized = False
|
||||
|
@ -792,16 +800,25 @@ cdef class Parser:
|
|||
if self.model not in (True, False, None) and resized:
|
||||
# Weights are stored in (nr_out, nr_in) format, so we're basically
|
||||
# just adding rows here.
|
||||
smaller = self.model[-1]._layers[-1]
|
||||
larger = Affine(self.moves.n_moves, smaller.nI)
|
||||
copy_array(larger.W[:smaller.nO], smaller.W)
|
||||
copy_array(larger.b[:smaller.nO], smaller.b)
|
||||
self.model[-1]._layers[-1] = larger
|
||||
if self.model[-1].is_noop:
|
||||
smaller = self.model[1]
|
||||
dims = dict(self.model[1]._dims)
|
||||
dims['nO'] = self.moves.n_moves
|
||||
larger = self.model[1].__class__(**dims)
|
||||
copy_array(larger.W[:, :smaller.nO], smaller.W)
|
||||
copy_array(larger.b[:smaller.nO], smaller.b)
|
||||
self.model = (self.model[0], larger, self.model[2])
|
||||
else:
|
||||
smaller = self.model[-1]._layers[-1]
|
||||
larger = Affine(self.moves.n_moves, smaller.nI)
|
||||
copy_array(larger.W[:smaller.nO], smaller.W)
|
||||
copy_array(larger.b[:smaller.nO], smaller.b)
|
||||
self.model[-1]._layers[-1] = larger
|
||||
|
||||
def begin_training(self, gold_tuples, pipeline=None, **cfg):
|
||||
if 'model' in cfg:
|
||||
self.model = cfg['model']
|
||||
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
|
||||
gold_tuples = nonproj.preprocess_training_data(gold_tuples, label_freq_cutoff=100)
|
||||
actions = self.moves.get_actions(gold_parses=gold_tuples)
|
||||
for action, labels in actions.items():
|
||||
for label in labels:
|
||||
|
|
|
@ -58,8 +58,9 @@ def en_vocab():
|
|||
|
||||
|
||||
@pytest.fixture
|
||||
def en_parser():
|
||||
return util.get_lang_class('en').Defaults.create_parser()
|
||||
def en_parser(en_vocab):
|
||||
nlp = util.get_lang_class('en')(en_vocab)
|
||||
return nlp.create_pipe('parser')
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
|
37
spacy/tests/doc/test_creation.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
'''Test Doc sets up tokens correctly.'''
|
||||
from __future__ import unicode_literals
|
||||
import pytest
|
||||
|
||||
from ...vocab import Vocab
|
||||
from ...tokens.doc import Doc
|
||||
from ...lemmatizer import Lemmatizer
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def lemmatizer():
|
||||
return Lemmatizer(lookup={'dogs': 'dog', 'boxen': 'box', 'mice': 'mouse'})
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def vocab(lemmatizer):
|
||||
return Vocab(lemmatizer=lemmatizer)
|
||||
|
||||
|
||||
def test_empty_doc(vocab):
|
||||
doc = Doc(vocab)
|
||||
assert len(doc) == 0
|
||||
|
||||
|
||||
def test_single_word(vocab):
|
||||
doc = Doc(vocab, words=['a'])
|
||||
assert doc.text == 'a '
|
||||
doc = Doc(vocab, words=['a'], spaces=[False])
|
||||
assert doc.text == 'a'
|
||||
|
||||
|
||||
def test_lookup_lemmatization(vocab):
|
||||
doc = Doc(vocab, words=['dogs', 'dogses'])
|
||||
assert doc[0].text == 'dogs'
|
||||
assert doc[0].lemma_ == 'dog'
|
||||
assert doc[1].text == 'dogses'
|
||||
assert doc[1].lemma_ == 'dogses'
|
13
spacy/tests/lang/de/test_lemma.py
Normal file
|
@ -0,0 +1,13 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize('string,lemma', [('Abgehängten', 'Abgehängte'),
|
||||
('engagierte', 'engagieren'),
|
||||
('schließt', 'schließen'),
|
||||
('vorgebenden', 'vorgebend')])
|
||||
def test_lemmatizer_lookup_assigns(de_tokenizer, string, lemma):
|
||||
tokens = de_tokenizer(string)
|
||||
assert tokens[0].lemma_ == lemma
|
|
@ -57,6 +57,5 @@ def test_en_lemmatizer_punct(en_lemmatizer):
|
|||
def test_en_lemmatizer_lemma_assignment(EN):
|
||||
text = "Bananas in pyjamas are geese."
|
||||
doc = EN.make_doc(text)
|
||||
assert all(t.lemma_ == '' for t in doc)
|
||||
EN.tagger(doc)
|
||||
assert all(t.lemma_ != '' for t in doc)
|
||||
|
|
|
@ -22,14 +22,14 @@ def vocab():
|
|||
@pytest.fixture
|
||||
def parser(vocab):
|
||||
parser = NeuralDependencyParser(vocab)
|
||||
parser.cfg['token_vector_width'] = 4
|
||||
parser.cfg['hidden_width'] = 6
|
||||
parser.cfg['token_vector_width'] = 8
|
||||
parser.cfg['hidden_width'] = 30
|
||||
parser.cfg['hist_size'] = 0
|
||||
parser.add_label('left')
|
||||
parser.begin_training([], **parser.cfg)
|
||||
sgd = Adam(NumpyOps(), 0.001)
|
||||
|
||||
for i in range(30):
|
||||
for i in range(10):
|
||||
losses = {}
|
||||
doc = Doc(vocab, words=['a', 'b', 'c', 'd'])
|
||||
gold = GoldParse(doc, heads=[1, 1, 3, 3],
|
||||
|
@ -37,6 +37,8 @@ def parser(vocab):
|
|||
parser.update([doc], [gold], sgd=sgd, losses=losses)
|
||||
return parser
|
||||
|
||||
def test_init_parser(parser):
|
||||
pass
|
||||
|
||||
def test_add_label(parser):
|
||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
import spacy
|
||||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
@pytest.mark.models
|
||||
def test_beam_parse():
|
||||
nlp = spacy.load('en_core_web_sm')
|
||||
doc = nlp(u'Australia is a country', disable=['ner'])
|
||||
ents = nlp.entity(doc, beam_width=2)
|
||||
print(ents)
|
||||
|
||||
@pytest.mark.models('en')
|
||||
def test_beam_parse(EN):
|
||||
doc = EN(u'Australia is a country', disable=['ner'])
|
||||
ents = EN.entity(doc, beam_width=2)
|
||||
print(ents)
|
||||
|
|
|
@ -35,7 +35,7 @@ def parser(vocab):
|
|||
def test_no_sentences(parser):
|
||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
||||
doc = parser(doc)
|
||||
assert len(list(doc.sents)) == 2
|
||||
assert len(list(doc.sents)) >= 1
|
||||
|
||||
|
||||
def test_sents_1(parser):
|
||||
|
@ -64,7 +64,7 @@ def test_sents_1_3(parser):
|
|||
doc[1].sent_start = True
|
||||
doc[3].sent_start = True
|
||||
doc = parser(doc)
|
||||
assert len(list(doc.sents)) == 4
|
||||
assert len(list(doc.sents)) >= 3
|
||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
||||
doc[1].sent_start = True
|
||||
doc[2].sent_start = False
|
||||
|
|
0
spacy/tests/pipeline/__init__.py
Normal file
84
spacy/tests/pipeline/test_pipe_methods.py
Normal file
|
@ -0,0 +1,84 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
from ...language import Language
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def nlp():
|
||||
return Language()
|
||||
|
||||
|
||||
def new_pipe(doc):
|
||||
return doc
|
||||
|
||||
|
||||
def test_add_pipe_no_name(nlp):
|
||||
nlp.add_pipe(new_pipe)
|
||||
assert 'new_pipe' in nlp.pipe_names
|
||||
|
||||
|
||||
def test_add_pipe_duplicate_name(nlp):
|
||||
nlp.add_pipe(new_pipe, name='duplicate_name')
|
||||
with pytest.raises(ValueError):
|
||||
nlp.add_pipe(new_pipe, name='duplicate_name')
|
||||
|
||||
|
||||
@pytest.mark.parametrize('name', ['parser'])
|
||||
def test_add_pipe_first(nlp, name):
|
||||
nlp.add_pipe(new_pipe, name=name, first=True)
|
||||
assert nlp.pipeline[0][0] == name
|
||||
|
||||
|
||||
@pytest.mark.parametrize('name1,name2', [('parser', 'lambda_pipe')])
|
||||
def test_add_pipe_last(nlp, name1, name2):
|
||||
nlp.add_pipe(lambda doc: doc, name=name2)
|
||||
nlp.add_pipe(new_pipe, name=name1, last=True)
|
||||
assert nlp.pipeline[0][0] != name1
|
||||
assert nlp.pipeline[-1][0] == name1
|
||||
|
||||
|
||||
def test_cant_add_pipe_first_and_last(nlp):
|
||||
with pytest.raises(ValueError):
|
||||
nlp.add_pipe(new_pipe, first=True, last=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('name', ['my_component'])
|
||||
def test_get_pipe(nlp, name):
|
||||
with pytest.raises(KeyError):
|
||||
nlp.get_pipe(name)
|
||||
nlp.add_pipe(new_pipe, name=name)
|
||||
assert nlp.get_pipe(name) == new_pipe
|
||||
|
||||
|
||||
@pytest.mark.parametrize('name,replacement', [('my_component', lambda doc: doc)])
|
||||
def test_replace_pipe(nlp, name, replacement):
|
||||
with pytest.raises(ValueError):
|
||||
nlp.replace_pipe(name, new_pipe)
|
||||
nlp.add_pipe(new_pipe, name=name)
|
||||
nlp.replace_pipe(name, replacement)
|
||||
assert nlp.get_pipe(name) != new_pipe
|
||||
assert nlp.get_pipe(name) == replacement
|
||||
|
||||
|
||||
@pytest.mark.parametrize('old_name,new_name', [('old_pipe', 'new_pipe')])
|
||||
def test_rename_pipe(nlp, old_name, new_name):
|
||||
with pytest.raises(ValueError):
|
||||
nlp.rename_pipe(old_name, new_name)
|
||||
nlp.add_pipe(new_pipe, name=old_name)
|
||||
nlp.rename_pipe(old_name, new_name)
|
||||
assert nlp.pipeline[0][0] == new_name
|
||||
|
||||
|
||||
@pytest.mark.parametrize('name', ['my_component'])
|
||||
def test_remove_pipe(nlp, name):
|
||||
with pytest.raises(ValueError):
|
||||
nlp.remove_pipe(name)
|
||||
nlp.add_pipe(new_pipe, name=name)
|
||||
assert len(nlp.pipeline) == 1
|
||||
removed_name, removed_component = nlp.remove_pipe(name)
|
||||
assert not len(nlp.pipeline)
|
||||
assert removed_name == name
|
||||
assert removed_component == new_pipe
|
|
@ -7,6 +7,7 @@ from ..util import get_doc
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_issue589():
|
||||
vocab = Vocab()
|
||||
vocab.strings.set_frozen(True)
|
||||
|
|
9
spacy/tests/serialize/test_serialize_empty_model.py
Normal file
|
@ -0,0 +1,9 @@
|
|||
import spacy
|
||||
import spacy.lang.en
|
||||
from spacy.pipeline import TextCategorizer
|
||||
|
||||
def test_bytes_serialize_issue_1105():
|
||||
nlp = spacy.lang.en.English()
|
||||
tokenizer = nlp.tokenizer
|
||||
textcat = TextCategorizer(tokenizer.vocab, labels=['ENTITY', 'ACTION', 'MODIFIER'])
|
||||
textcat_bytes = textcat.to_bytes()
|
53
spacy/tests/test_underscore.py
Normal file
|
@ -0,0 +1,53 @@
|
|||
from mock import Mock
|
||||
from ..tokens.underscore import Underscore
|
||||
|
||||
|
||||
def test_create_doc_underscore():
|
||||
doc = Mock()
|
||||
doc.doc = doc
|
||||
uscore = Underscore(Underscore.doc_extensions, doc)
|
||||
assert uscore._doc is doc
|
||||
assert uscore._start is None
|
||||
assert uscore._end is None
|
||||
|
||||
|
||||
def test_doc_underscore_getattr_setattr():
|
||||
doc = Mock()
|
||||
doc.doc = doc
|
||||
doc.user_data = {}
|
||||
Underscore.doc_extensions['hello'] = (False, None, None, None)
|
||||
doc._ = Underscore(Underscore.doc_extensions, doc)
|
||||
assert doc._.hello == False
|
||||
doc._.hello = True
|
||||
assert doc._.hello == True
|
||||
|
||||
|
||||
def test_create_span_underscore():
|
||||
span = Mock(doc=Mock(), start=0, end=2)
|
||||
uscore = Underscore(Underscore.span_extensions, span,
|
||||
start=span.start, end=span.end)
|
||||
assert uscore._doc is span.doc
|
||||
assert uscore._start is span.start
|
||||
assert uscore._end is span.end
|
||||
|
||||
|
||||
def test_span_underscore_getter_setter():
|
||||
span = Mock(doc=Mock(), start=0, end=2)
|
||||
Underscore.span_extensions['hello'] = (None, None,
|
||||
lambda s: (s.start, 'hi'),
|
||||
lambda s, value: setattr(s, 'start',
|
||||
value))
|
||||
span._ = Underscore(Underscore.span_extensions, span,
|
||||
start=span.start, end=span.end)
|
||||
|
||||
assert span._.hello == (0, 'hi')
|
||||
span._.hello = 1
|
||||
assert span._.hello == (1, 'hi')
|
||||
|
||||
|
||||
def test_token_underscore_method():
|
||||
token = Mock(doc=Mock(), idx=7, say_cheese=lambda token: 'cheese')
|
||||
Underscore.token_extensions['hello'] = (None, token.say_cheese,
|
||||
None, None)
|
||||
token._ = Underscore(Underscore.token_extensions, token, start=token.idx)
|
||||
assert token._.hello() == 'cheese'
|
|
@ -30,7 +30,7 @@ from ..util import normalize_slice
|
|||
from ..compat import is_config
|
||||
from .. import about
|
||||
from .. import util
|
||||
|
||||
from .underscore import Underscore
|
||||
|
||||
DEF PADDING = 5
|
||||
|
||||
|
@ -64,6 +64,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
|
|||
else:
|
||||
return Lexeme.get_struct_attr(token.lex, feat_name)
|
||||
|
||||
|
||||
def _get_chunker(lang):
|
||||
try:
|
||||
cls = util.get_lang_class(lang)
|
||||
|
@ -73,6 +74,7 @@ def _get_chunker(lang):
|
|||
return None
|
||||
return cls.Defaults.syntax_iterators.get(u'noun_chunks')
|
||||
|
||||
|
||||
cdef class Doc:
|
||||
"""A sequence of Token objects. Access sentences and named entities, export
|
||||
annotations to numpy arrays, losslessly serialize to compressed binary strings.
|
||||
|
@ -87,6 +89,21 @@ cdef class Doc:
|
|||
>>> from spacy.tokens import Doc
|
||||
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
|
||||
"""
|
||||
@classmethod
|
||||
def set_extension(cls, name, default=None, method=None,
|
||||
getter=None, setter=None):
|
||||
nr_defined = sum(t is not None for t in (default, getter, setter, method))
|
||||
assert nr_defined == 1
|
||||
Underscore.doc_extensions[name] = (default, method, getter, setter)
|
||||
|
||||
@classmethod
|
||||
def get_extension(cls, name):
|
||||
return Underscore.doc_extensions.get(name)
|
||||
|
||||
@classmethod
|
||||
def has_extension(cls, name):
|
||||
return name in Underscore.doc_extensions
|
||||
|
||||
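set_extension() registers exactly one of default, method, getter or setter, and the values then become readable and writable through Doc._. A minimal sketch with an illustrative extension name:

from spacy.vocab import Vocab
from spacy.tokens import Doc

Doc.set_extension('is_greeting', default=False)   # 'is_greeting' is just an illustrative name
doc = Doc(Vocab(), words=['hello'])
assert doc._.is_greeting is False                 # falls back to the registered default
doc._.is_greeting = True                          # stored in doc.user_data
assert doc._.is_greeting is True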
def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
|
||||
"""Create a Doc object.
|
||||
|
||||
|
@ -159,6 +176,10 @@ cdef class Doc:
|
|||
self.is_tagged = True
|
||||
self.is_parsed = True
|
||||
|
||||
@property
|
||||
def _(self):
|
||||
return Underscore(Underscore.doc_extensions, self)
|
||||
|
||||
def __getitem__(self, object i):
|
||||
"""Get a `Token` or `Span` object.
|
||||
|
||||
|
@ -512,6 +533,8 @@ cdef class Doc:
|
|||
assert t.lex.orth != 0
|
||||
t.spacy = has_space
|
||||
self.length += 1
|
||||
# Set morphological attributes, e.g. by lemma, if possible
|
||||
self.vocab.morphology.assign_untagged(t)
|
||||
self._py_tokens.append(None)
|
||||
return t.idx + t.lex.length + t.spacy
|
||||
|
||||
|
|
|
@ -17,10 +17,24 @@ from ..attrs cimport IS_PUNCT, IS_SPACE
|
|||
from ..lexeme cimport Lexeme
|
||||
from ..compat import is_config
|
||||
from .. import about
|
||||
from .underscore import Underscore
|
||||
|
||||
|
||||
cdef class Span:
|
||||
"""A slice from a Doc object."""
|
||||
@classmethod
|
||||
def set_extension(cls, name, default=None, method=None,
|
||||
getter=None, setter=None):
|
||||
Underscore.span_extensions[name] = (default, method, getter, setter)
|
||||
|
||||
@classmethod
|
||||
def get_extension(cls, name):
|
||||
return Underscore.span_extensions.get(name)
|
||||
|
||||
@classmethod
|
||||
def has_extension(cls, name):
|
||||
return name in Underscore.span_extensions
|
||||
|
||||
def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None,
|
||||
vector_norm=None):
|
||||
"""Create a `Span` object from the slice `doc[start : end]`.
|
||||
|
@ -111,6 +125,10 @@ cdef class Span:
|
|||
for i in range(self.start, self.end):
|
||||
yield self.doc[i]
|
||||
|
||||
@property
|
||||
def _(self):
|
||||
return Underscore(Underscore.span_extensions, self,
|
||||
start=self.start_char, end=self.end_char)
|
||||
def as_doc(self):
|
||||
'''Create a Doc object view of the Span's data.
|
||||
|
||||
|
|
|
@ -20,10 +20,24 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST
|
|||
from ..attrs cimport LEMMA, POS, TAG, DEP
|
||||
from ..compat import is_config
|
||||
from .. import about
|
||||
from .underscore import Underscore
|
||||
|
||||
|
||||
cdef class Token:
|
||||
"""An individual token – i.e. a word, punctuation symbol, whitespace, etc."""
|
||||
@classmethod
|
||||
def set_extension(cls, name, default=None, method=None,
|
||||
getter=None, setter=None):
|
||||
Underscore.token_extensions[name] = (default, method, getter, setter)
|
||||
|
||||
@classmethod
|
||||
def get_extension(cls, name):
|
||||
return Underscore.token_extensions.get(name)
|
||||
|
||||
@classmethod
|
||||
def has_extension(cls, name):
|
||||
return name in Underscore.token_extensions
|
||||
|
||||
def __cinit__(self, Vocab vocab, Doc doc, int offset):
|
||||
"""Construct a `Token` object.
|
||||
|
||||
|
@ -87,6 +101,11 @@ cdef class Token:
|
|||
else:
|
||||
raise ValueError(op)
|
||||
|
||||
@property
|
||||
def _(self):
|
||||
return Underscore(Underscore.token_extensions, self,
|
||||
start=self.idx, end=None)
|
||||
|
||||
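Token extensions follow the same pattern as Doc and Span; the getter receives the Token itself. A sketch with an illustrative attribute name, assuming Token is importable from spacy.tokens:

from spacy.vocab import Vocab
from spacy.tokens import Doc, Token

Token.set_extension('reversed', getter=lambda token: token.text[::-1])  # illustrative name
doc = Doc(Vocab(), words=['spacy'])
assert doc[0]._.reversed == 'ycaps'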
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
|
||||
"""Check the value of a boolean flag.
|
||||
|
||||
|
|
50
spacy/tokens/underscore.py
Normal file
|
@ -0,0 +1,50 @@
|
|||
import functools
|
||||
|
||||
class Underscore(object):
|
||||
doc_extensions = {}
|
||||
span_extensions = {}
|
||||
token_extensions = {}
|
||||
|
||||
def __init__(self, extensions, obj, start=None, end=None):
|
||||
object.__setattr__(self, '_extensions', extensions)
|
||||
object.__setattr__(self, '_obj', obj)
|
||||
# Assumption is that for doc values, _start and _end will both be None
|
||||
# Span will set non-None values for _start and _end
|
||||
# Token will have _start be non-None, _end be None
|
||||
# This lets us key everything into the doc.user_data dictionary,
|
||||
# (see _get_key), and lets us use a single Underscore class.
|
||||
object.__setattr__(self, '_doc', obj.doc)
|
||||
object.__setattr__(self, '_start', start)
|
||||
object.__setattr__(self, '_end', end)
|
||||
|
||||
def __getattr__(self, name):
|
||||
if name not in self._extensions:
|
||||
raise AttributeError(name)
|
||||
default, method, getter, setter = self._extensions[name]
|
||||
if getter is not None:
|
||||
return getter(self._obj)
|
||||
elif method is not None:
|
||||
return functools.partial(method, self._obj)
|
||||
else:
|
||||
return self._doc.user_data.get(self._get_key(name), default)
|
||||
|
||||
def __setattr__(self, name, value):
|
||||
if name not in self._extensions:
|
||||
raise AttributeError(name)
|
||||
default, method, getter, setter = self._extensions[name]
|
||||
if setter is not None:
|
||||
return setter(self._obj, value)
|
||||
else:
|
||||
self._doc.user_data[self._get_key(name)] = value
|
||||
|
||||
def set(self, name, value):
|
||||
return self.__setattr__(name, value)
|
||||
|
||||
def get(self, name):
|
||||
return self.__getattr__(name)
|
||||
|
||||
def has(self, name):
|
||||
return name in self._extensions
|
||||
|
||||
def _get_key(self, name):
|
||||
return ('._.', name, self._start, self._end)
|
|
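Because Underscore keys everything into doc.user_data via _get_key(), extension values travel with the Doc rather than with the Underscore instance. A sketch of where a written value actually ends up ('note' is an illustrative name):

from spacy.vocab import Vocab
from spacy.tokens import Doc

Doc.set_extension('note', default=None)            # illustrative extension name
doc = Doc(Vocab(), words=['a'])
doc._.note = 'checked'
assert doc.user_data[('._.', 'note', None, None)] == 'checked'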
@ -135,7 +135,18 @@ def load_model_from_path(model_path, meta=False, **overrides):
|
|||
if not meta:
|
||||
meta = get_model_meta(model_path)
|
||||
cls = get_lang_class(meta['lang'])
|
||||
nlp = cls(pipeline=meta.get('pipeline', True), meta=meta, **overrides)
|
||||
nlp = cls(meta=meta, **overrides)
|
||||
pipeline = meta.get('pipeline', [])
|
||||
disable = overrides.get('disable', [])
|
||||
if pipeline is True:
|
||||
pipeline = nlp.Defaults.pipe_names
|
||||
elif pipeline in (False, None):
|
||||
pipeline = []
|
||||
for name in pipeline:
|
||||
if name not in disable:
|
||||
config = meta.get('pipeline_args', {}).get(name, {})
|
||||
component = nlp.create_pipe(name, config=config)
|
||||
nlp.add_pipe(component, name=name)
|
||||
return nlp.from_disk(model_path)
|
||||
|
||||
|
||||
|
|
|
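The loader now expects meta.json to carry a list of factory names (plus optional per-component settings) instead of a pre-built pipeline flag. Roughly the shape it reads, with illustrative values:

# illustrative values only; the loader reads these fields from the model's meta.json
meta = {
    'lang': 'en',
    'pipeline': ['tagger', 'parser', 'ner'],   # factory names, instantiated in order via create_pipe()
    'pipeline_args': {'tagger': {}},           # optional per-component config passed to the factory
}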
@ -149,6 +149,10 @@ mixin code(label, language, prompt, height, icon, wrap)
|
|||
|
||||
//- Code blocks to display old/new versions
|
||||
|
||||
mixin code-wrapper()
|
||||
span.u-inline-block.u-padding-top.u-width-full
|
||||
block
|
||||
|
||||
mixin code-old()
|
||||
+code(false, false, false, false, "reject").o-block-small
|
||||
block
|
||||
|
|
|
@ -113,6 +113,22 @@ p
|
|||
+cell flag
|
||||
+cell Show help message and available arguments.
|
||||
|
||||
+h(3, "validate") Validate
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Find all models installed in the current environment (both packages and
|
||||
| shortcut links) and check whether they are compatible with the currently
|
||||
| installed version of spaCy. Should be run after upgrading spaCy via
|
||||
| #[code pip install -U spacy] to ensure that all installed models
|
||||
| can be used with the new version. The command is also useful to detect
|
||||
| out-of-sync model links resulting from links created in different virtual
|
||||
| environments. Prints a list of models, the installed versions, the latest
|
||||
| compatible version (if out of date) and the commands for updating.
|
||||
|
||||
+code(false, "bash", "$").
|
||||
spacy validate
|
||||
|
||||
+h(3, "convert") Convert
|
||||
|
||||
p
|
||||
|
|
|
@ -43,6 +43,20 @@ p
|
|||
+cell #[code Language]
|
||||
+cell A #[code Language] object with the loaded model.
|
||||
|
||||
p
|
||||
| Essentially, #[code spacy.load()] is a convenience wrapper that reads
|
||||
| the language ID and pipeline components from a model's #[code meta.json],
|
||||
| initialises the #[code Language] class, loads in the model data and
|
||||
| returns it.
|
||||
|
||||
+code("Abstract example").
|
||||
cls = util.get_lang_class(lang) # get language for ID, e.g. 'en'
|
||||
nlp = cls() # initialise the language
|
||||
for name in pipeline:
|
||||
component = nlp.create_pipe(name) # create each pipeline component
|
||||
nlp.add_pipe(component) # add component to pipeline
|
||||
nlp.from_disk(model_data_path) # load in model data
|
||||
|
||||
+infobox("Deprecation note", "⚠️")
|
||||
.o-block
|
||||
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
|
||||
|
@ -141,37 +155,3 @@ p
|
|||
+cell returns
|
||||
+cell unicode
|
||||
+cell The explanation, or #[code None] if not found in the glossary.
|
||||
|
||||
+h(3, "spacy.set_factory") spacy.set_factory
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Set a factory that returns a custom
|
||||
| #[+a("/usage/processing-pipelines") processing pipeline]
|
||||
| component. Factories are useful for creating stateful components, especially ones which depend on shared data.
|
||||
|
||||
+aside-code("Example").
|
||||
def my_factory(vocab):
|
||||
def my_component(doc):
|
||||
return doc
|
||||
return my_component
|
||||
|
||||
spacy.set_factory('my_factory', my_factory)
|
||||
nlp = Language(pipeline=['my_factory'])
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code factory_id]
|
||||
+cell unicode
|
||||
+cell
|
||||
| Unique name of factory. If added to a new pipeline, spaCy will
|
||||
| look up the factory for this ID and use it to create the
|
||||
| component.
|
||||
|
||||
+row
|
||||
+cell #[code factory]
|
||||
+cell callable
|
||||
+cell
|
||||
| Callable that takes a #[code Vocab] object and returns a pipeline
|
||||
| component.
|
||||
|
|
|
@ -138,6 +138,109 @@ p Get the number of tokens in the document.
|
|||
+cell int
|
||||
+cell The number of tokens in the document.
|
||||
|
||||
+h(2, "set_extension") Doc.set_extension
|
||||
+tag classmethod
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Define a custom attribute on the #[code Doc] which becomes available via
|
||||
| #[code Doc._]. For details, see the documentation on
|
||||
| #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes].
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens import Doc
|
||||
city_getter = lambda doc: doc.text in ('New York', 'Paris', 'Berlin')
|
||||
Doc.set_extension('has_city', getter=city_getter)
|
||||
doc = nlp(u'I like New York')
|
||||
assert doc._.has_city
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell
|
||||
| Name of the attribute to set by the extension. For example,
|
||||
| #[code 'my_attr'] will be available as #[code doc._.my_attr].
|
||||
|
||||
+row
|
||||
+cell #[code default]
|
||||
+cell -
|
||||
+cell
|
||||
| Optional default value of the attribute if no getter or method
|
||||
| is defined.
|
||||
|
||||
+row
|
||||
+cell #[code method]
|
||||
+cell callable
|
||||
+cell
|
||||
| Set a custom method on the object, for example
|
||||
| #[code doc._.compare(other_doc)].
|
||||
|
||||
+row
|
||||
+cell #[code getter]
|
||||
+cell callable
|
||||
+cell
|
||||
| Getter function that takes the object and returns an attribute
|
||||
| value. Is called when the user accesses the #[code ._] attribute.
|
||||
|
||||
+row
|
||||
+cell #[code setter]
|
||||
+cell callable
|
||||
+cell
|
||||
| Setter function that takes the #[code Doc] and a value, and
|
||||
| modifies the object. Is called when the user writes to the
|
||||
| #[code Doc._] attribute.
|
||||
|
||||
+h(2, "get_extension") Doc.get_extension
|
||||
+tag classmethod
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Look up a previously registered extension by name. Returns a 4-tuple
|
||||
| #[code.u-break (default, method, getter, setter)] if the extension is
|
||||
| registered. Raises a #[code KeyError] otherwise.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens import Doc
|
||||
Doc.set_extension('is_city', default=False)
|
||||
extension = Doc.get_extension('is_city')
|
||||
assert extension == (False, None, None, None)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell Name of the extension.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell tuple
|
||||
+cell
|
||||
| A #[code.u-break (default, method, getter, setter)] tuple of the
|
||||
| extension.
|
||||
|
||||
+h(2, "has_extension") Doc.has_extension
|
||||
+tag classmethod
|
||||
+tag-new(2)
|
||||
|
||||
p Check whether an extension has been registered on the #[code Doc] class.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens import Doc
|
||||
Doc.set_extension('is_city', default=False)
|
||||
assert Doc.has_extension('is_city')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell Name of the extension to check.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether the extension has been registered.
|
||||
|
||||
+h(2, "char_span") Doc.char_span
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
|
|
@ -4,7 +4,14 @@ include ../_includes/_mixins
|
|||
|
||||
p
|
||||
| Usually you'll load this once per process as #[code nlp] and pass the
|
||||
| instance around your application.
|
||||
| instance around your application. The #[code Language] class is created
|
||||
| when you call #[+api("spacy#load") #[code spacy.load()]] and contains
|
||||
| the shared vocabulary and #[+a("/usage/adding-languages") language data],
|
||||
| optional model data loaded from a #[+a("/models") model package] or
|
||||
| a path, and a #[+a("/usage/processing-pipelines") processing pipeline]
|
||||
| containing components like the tagger or parser that are called on a
|
||||
| document in order. You can also add your own processing pipeline
|
||||
| components that take a #[code Doc] object, modify it and return it.
|
||||
|
||||
+h(2, "init") Language.__init__
|
||||
+tag method
|
||||
|
@ -12,9 +19,9 @@ p
|
|||
p Initialise a #[code Language] object.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.language import Language
|
||||
nlp = Language(pipeline=['token_vectors', 'tags',
|
||||
'dependencies'])
|
||||
nlp = Language(Vocab())
|
||||
|
||||
from spacy.lang.en import English
|
||||
nlp = English()
|
||||
|
@ -34,14 +41,6 @@ p Initialise a #[code Language] object.
|
|||
| A function that takes text and returns a #[code Doc] object.
|
||||
| Usually a #[code Tokenizer].
|
||||
|
||||
+row
|
||||
+cell #[code pipeline]
|
||||
+cell list
|
||||
+cell
|
||||
| A list of annotation processes or IDs of annotation, processes,
|
||||
| e.g. a #[code Tagger] object, or #[code 'tagger']. IDs are looked
|
||||
| up in #[code Language.Defaults.factories].
|
||||
|
||||
+row
|
||||
+cell #[code meta]
|
||||
+cell dict
|
||||
|
@ -235,7 +234,6 @@ p
|
|||
| Can be called before training to pre-process gold data. By default, it
|
||||
| handles nonprojectivity and adds missing tags to the tag map.
|
||||
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code docs_golds]
|
||||
|
@ -247,6 +245,177 @@ p
|
|||
+cell tuple
|
||||
+cell Tuples of #[code Doc] and #[code GoldParse] objects.
|
||||
|
||||
+h(2, "create_pipe") Language.create_pipe
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Create a pipeline component from a factory.
|
||||
|
||||
+aside-code("Example").
|
||||
parser = nlp.create_pipe('parser')
|
||||
nlp.add_pipe(parser)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell
|
||||
| Factory name to look up in
|
||||
| #[+api("language#class-attributes") #[code Language.factories]].
|
||||
|
||||
+row
|
||||
+cell #[code config]
|
||||
+cell dict
|
||||
+cell Configuration parameters to initialise component.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell callable
|
||||
+cell The pipeline component.
|
||||
|
||||
+h(2, "add_pipe") Language.add_pipe
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Add a component to the processing pipeline. Valid components are
|
||||
| callables that take a #[code Doc] object, modify it and return it. Only
|
||||
| one of #[code before], #[code after], #[code first] or #[code last] can
|
||||
| be set. Default behaviour is #[code last=True].
|
||||
|
||||
+aside-code("Example").
|
||||
def component(doc):
|
||||
# modify Doc and return it
|
||||
return doc
|
||||
|
||||
nlp.add_pipe(component, before='ner')
|
||||
nlp.add_pipe(component, name='custom_name', last=True)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code component]
|
||||
+cell callable
|
||||
+cell The pipeline component.
|
||||
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell
|
||||
| Name of pipeline component. Overwrites existing
|
||||
| #[code component.name] attribute if available. If no #[code name]
|
||||
| is set and the component exposes no name attribute,
|
||||
| #[code component.__name__] is used. An error is raised if the
|
||||
| name already exists in the pipeline.
|
||||
|
||||
+row
|
||||
+cell #[code before]
|
||||
+cell unicode
|
||||
+cell Component name to insert component directly before.
|
||||
|
||||
+row
|
||||
+cell #[code after]
|
||||
+cell unicode
|
||||
+cell Component name to insert component directly after:
|
||||
|
||||
+row
|
||||
+cell #[code first]
|
||||
+cell bool
|
||||
+cell Insert component first / not first in the pipeline.
|
||||
|
||||
+row
|
||||
+cell #[code last]
|
||||
+cell bool
|
||||
+cell Insert component last / not last in the pipeline.
|
||||
|
||||
+h(2, "get_pipe") Language.get_pipe
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Get a pipeline component for a given component name.
|
||||
|
||||
+aside-code("Example").
|
||||
parser = nlp.get_pipe('parser')
|
||||
custom_component = nlp.get_pipe('custom_component')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell Name of the pipeline component to get.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell callable
|
||||
+cell The pipeline component.
|
||||
|
||||
+h(2, "replace_pipe") Language.replace_pipe
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Replace a component in the pipeline.
|
||||
|
||||
+aside-code("Example").
|
||||
nlp.replace_pipe('parser', my_custom_parser)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell Name of the component to replace.
|
||||
|
||||
+row
|
||||
+cell #[code component]
|
||||
+cell callable
|
||||
+cell The pipeline component to insert.
|
||||
|
||||
|
||||
+h(2, "rename_pipe") Language.rename_pipe
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Rename a component in the pipeline. Useful to create custom names for
|
||||
| pre-defined and pre-loaded components. To change the default name of
|
||||
| a component added to the pipeline, you can also use the #[code name]
|
||||
| argument on #[+api("language#add_pipe") #[code add_pipe]].
|
||||
|
||||
+aside-code("Example").
|
||||
nlp.rename_pipe('parser', 'spacy_parser')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code old_name]
|
||||
+cell unicode
|
||||
+cell Name of the component to rename.
|
||||
|
||||
+row
|
||||
+cell #[code new_name]
|
||||
+cell unicode
|
||||
+cell New name of the component.
|
||||
|
||||
+h(2, "remove_pipe") Language.remove_pipe
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Remove a component from the pipeline. Returns the removed component name
|
||||
| and component function.
|
||||
|
||||
+aside-code("Example").
|
||||
name, component = nlp.remove_pipe('parser')
|
||||
assert name == 'parser'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell Name of the component to remove.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell tuple
|
||||
+cell A #[code (name, component)] tuple of the removed component.
|
||||
|
||||
+h(2, "to_disk") Language.to_disk
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
@ -399,7 +568,15 @@ p Load state from a binary string.
|
|||
+row
|
||||
+cell #[code pipeline]
|
||||
+cell list
|
||||
+cell Sequence of annotation functions.
|
||||
+cell
|
||||
| List of #[code (name, component)] tuples describing the current
|
||||
| processing pipeline, in order.
|
||||
|
||||
+row
|
||||
+cell #[code pipe_names]
|
||||
+tag-new(2)
|
||||
+cell list
|
||||
+cell List of pipeline component names, in order.
|
||||
|
||||
+row
|
||||
+cell #[code meta]
|
||||
|
@ -424,3 +601,12 @@ p Load state from a binary string.
|
|||
+cell
|
||||
| Two-letter language ID, i.e.
|
||||
| #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code].
|
||||
|
||||
+row
|
||||
+cell #[code factories]
|
||||
+tag-new(2)
|
||||
+cell dict
|
||||
+cell
|
||||
| Factories that create pre-defined pipeline components, e.g. the
|
||||
| tagger, parser or entity recognizer, keyed by their component
|
||||
| name.
|
||||
|
|
|
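The factories registry documented above is what create_pipe() consults, so the pre-defined component names double as factory IDs. A sketch of the relationship, assuming 'tagger' is among the registered factories:

from spacy.language import Language

nlp = Language()
assert 'tagger' in Language.factories      # pre-defined factories are keyed by component name
tagger = nlp.create_pipe('tagger')         # look up the factory and instantiate the component
nlp.add_pipe(tagger)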
@ -116,6 +116,109 @@ p Get the number of tokens in the span.
|
|||
+cell int
|
||||
+cell The number of tokens in the span.
|
||||
|
||||
+h(2, "set_extension") Span.set_extension
|
||||
+tag classmethod
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Define a custom attribute on the #[code Span] which becomes available via
|
||||
| #[code Span._]. For details, see the documentation on
|
||||
| #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes].
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens import Span
|
||||
city_getter = lambda span: span.text in ('New York', 'Paris', 'Berlin')
|
||||
Span.set_extension('has_city', getter=city_getter)
|
||||
doc = nlp(u'I like New York in Autumn')
|
||||
assert doc[1:4]._.has_city
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell
|
||||
| Name of the attribute to set by the extension. For example,
|
||||
| #[code 'my_attr'] will be available as #[code span._.my_attr].
|
||||
|
||||
+row
|
||||
+cell #[code default]
|
||||
+cell -
|
||||
+cell
|
||||
| Optional default value of the attribute if no getter or method
|
||||
| is defined.
|
||||
|
||||
+row
|
||||
+cell #[code method]
|
||||
+cell callable
|
||||
+cell
|
||||
| Set a custom method on the object, for example
|
||||
| #[code span._.compare(other_span)].
|
||||
|
||||
+row
|
||||
+cell #[code getter]
|
||||
+cell callable
|
||||
+cell
|
||||
| Getter function that takes the object and returns an attribute
|
||||
| value. Is called when the user accesses the #[code ._] attribute.
|
||||
|
||||
+row
|
||||
+cell #[code setter]
|
||||
+cell callable
|
||||
+cell
|
||||
| Setter function that takes the #[code Span] and a value, and
|
||||
| modifies the object. Is called when the user writes to the
|
||||
| #[code Span._] attribute.
|
||||
|
||||
+h(2, "get_extension") Span.get_extension
|
||||
+tag classmethod
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Look up a previously registered extension by name. Returns a 4-tuple
|
||||
| #[code.u-break (default, method, getter, setter)] if the extension is
|
||||
| registered. Raises a #[code KeyError] otherwise.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens import Span
|
||||
Span.set_extension('is_city', default=False)
|
||||
extension = Span.get_extension('is_city')
|
||||
assert extension == (False, None, None, None)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell Name of the extension.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell tuple
|
||||
+cell
|
||||
| A #[code.u-break (default, method, getter, setter)] tuple of the
|
||||
| extension.
|
||||
|
||||
+h(2, "has_extension") Span.has_extension
|
||||
+tag classmethod
|
||||
+tag-new(2)
|
||||
|
||||
p Check whether an extension has been registered on the #[code Span] class.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens import Span
|
||||
Span.set_extension('is_city', default=False)
|
||||
assert Span.has_extension('is_city')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell Name of the extension to check.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether the extension has been registered.
|
||||
|
||||
+h(2, "similarity") Span.similarity
|
||||
+tag method
|
||||
+tag-model("vectors")
|
||||
|
|
|
@ -51,6 +51,109 @@ p The number of unicode characters in the token, i.e. #[code token.text].
|
|||
+cell int
|
||||
+cell The number of unicode characters in the token.
|
||||
|
||||
+h(2, "set_extension") Token.set_extension
|
||||
+tag classmethod
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Define a custom attribute on the #[code Token] which becomes available
|
||||
| via #[code Token._]. For details, see the documentation on
|
||||
| #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes].
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens import Token
|
||||
fruit_getter = lambda token: token.text in ('apple', 'pear', 'banana')
|
||||
Token.set_extension('is_fruit', getter=fruit_getter)
|
||||
doc = nlp(u'I have an apple')
|
||||
assert doc[3]._.is_fruit
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell
|
||||
| Name of the attribute to set by the extension. For example,
|
||||
| #[code 'my_attr'] will be available as #[code token._.my_attr].
|
||||
|
||||
+row
|
||||
+cell #[code default]
|
||||
+cell -
|
||||
+cell
|
||||
| Optional default value of the attribute if no getter or method
|
||||
| is defined.
|
||||
|
||||
+row
|
||||
+cell #[code method]
|
||||
+cell callable
|
||||
+cell
|
||||
| Set a custom method on the object, for example
|
||||
| #[code token._.compare(other_token)].
|
||||
|
||||
+row
|
||||
+cell #[code getter]
|
||||
+cell callable
|
||||
+cell
|
||||
| Getter function that takes the object and returns an attribute
|
||||
| value. Is called when the user accesses the #[code ._] attribute.
|
||||
|
||||
+row
|
||||
+cell #[code setter]
|
||||
+cell callable
|
||||
+cell
|
||||
| Setter function that takes the #[code Token] and a value, and
|
||||
| modifies the object. Is called when the user writes to the
|
||||
| #[code Token._] attribute.
|
||||
|
||||
+h(2, "get_extension") Token.get_extension
|
||||
+tag classmethod
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Look up a previously registered extension by name. Returns a 4-tuple
|
||||
| #[code.u-break (default, method, getter, setter)] if the extension is
|
||||
| registered. Raises a #[code KeyError] otherwise.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens import Token
|
||||
Token.set_extension('is_fruit', default=False)
|
||||
extension = Token.get_extension('is_fruit')
|
||||
assert extension == (False, None, None, None)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell Name of the extension.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell tuple
|
||||
+cell
|
||||
| A #[code.u-break (default, method, getter, setter)] tuple of the
|
||||
| extension.
|
||||
|
||||
+h(2, "has_extension") Token.has_extension
|
||||
+tag classmethod
|
||||
+tag-new(2)
|
||||
|
||||
p Check whether an extension has been registered on the #[code Token] class.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens import Token
|
||||
Token.set_extension('is_fruit', default=False)
|
||||
assert Token.has_extension('is_fruit')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell Name of the extension to check.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether the extension has been registered.
|
||||
|
||||
+h(2, "check_flag") Token.check_flag
|
||||
+tag method
|
||||
|
||||
|
|
|
@ -143,6 +143,9 @@
|
|||
|
||||
//- Layout
|
||||
|
||||
.u-width-full
|
||||
width: 100%
|
||||
|
||||
.u-float-left
|
||||
float: left
|
||||
margin-right: 1rem
|
||||
|
@ -166,6 +169,9 @@
|
|||
.u-padding-medium
|
||||
padding: 1.8rem
|
||||
|
||||
.u-padding-top
|
||||
padding-top: 2rem
|
||||
|
||||
.u-inline-block
|
||||
display: inline-block
|
||||
|
||||
|
|
|
@ -25,7 +25,7 @@
|
|||
display: inline-block
|
||||
font-size: 0.6em
|
||||
font-weight: bold
|
||||
padding-right: 1.25rem
|
||||
padding-right: 1em
|
||||
margin-left: -3.75rem
|
||||
text-align: right
|
||||
width: 2.5rem
|
||||
|
|
|
@ -456,24 +456,11 @@ p
|
|||
}
|
||||
|
||||
p
|
||||
| To add a lookup lemmatizer to your language, import the #[code LOOKUP]
|
||||
| table and #[code Lemmatizer], and create a new classmethod:
|
||||
| To provide a lookup lemmatizer for your language, import the lookup table
|
||||
| and add it to the #[code Language] class as #[code lemma_lookup]:
|
||||
|
||||
|
||||
+code("__init__py (excerpt)").
|
||||
# other imports here, plus lookup table and lookup lemmatizer
|
||||
from .lemmatizer import LOOKUP
|
||||
from ...lemmatizerlookup import Lemmatizer
|
||||
|
||||
class Xxxxx(Language):
|
||||
lang = 'xx'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
# other language defaults here
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
+code.
|
||||
lemma_lookup = dict(LOOKUP)
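p
    |  For context, here's a minimal sketch of where this line usually lives in a
    |  language's #[code __init__.py] – #[code Xxxxx] and #[code 'xx'] are
    |  placeholders for your language, not a real language class:

+code("__init__.py (sketch)").
    # other imports here, plus the lookup table
    from .lemmatizer import LOOKUP

    class Xxxxx(Language):
        lang = 'xx'

        class Defaults(Language.Defaults):
            # other language defaults here
            lemma_lookup = dict(LOOKUP)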
|
||||
|
||||
+h(3, "tag-map") Tag map
|
||||
|
||||
|
|
|
@ -103,10 +103,10 @@
|
|||
"title": "Language Processing Pipelines",
|
||||
"next": "vectors-similarity",
|
||||
"menu": {
|
||||
"How pipelines work": "pipelines",
|
||||
"Examples": "examples",
|
||||
"How Pipelines Work": "pipelines",
|
||||
"Custom Components": "custom-components",
|
||||
"Developing Extensions": "extensions",
|
||||
"Multi-threading": "multithreading",
|
||||
"User Hooks": "user-hooks",
|
||||
"Serialization": "serialization"
|
||||
}
|
||||
},
|
||||
|
@ -195,6 +195,7 @@
|
|||
"teaser": "Full code examples you can modify and run.",
|
||||
"next": "resources",
|
||||
"menu": {
|
||||
"Pipeline": "pipeline",
|
||||
"Matching": "matching",
|
||||
"Training": "training",
|
||||
"Deep Learning": "deep-learning"
|
||||
|
|
369
website/usage/_processing-pipelines/_custom-components.jade
Normal file
|
@ -0,0 +1,369 @@
|
|||
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > CUSTOM COMPONENTS
|
||||
|
||||
p
|
||||
| A component receives a #[code Doc] object and can modify it – for example,
|
||||
| by using the current weights to make a prediction and set some annotation
|
||||
| on the document. By adding a component to the pipeline, you'll get access
|
||||
| to the #[code Doc] at any point #[strong during processing] – instead of
|
||||
| only being able to modify it afterwards.
|
||||
|
||||
+aside-code("Example").
|
||||
def my_component(doc):
|
||||
# do something to the doc here
|
||||
return doc
|
||||
|
||||
+table(["Argument", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code doc]
|
||||
+cell #[code Doc]
|
||||
+cell The #[code Doc] object processed by the previous component.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell #[code Doc]
|
||||
+cell The #[code Doc] object processed by this pipeline component.
|
||||
|
||||
p
|
||||
| Custom components can be added to the pipeline using the
|
||||
| #[+api("language#add_pipe") #[code add_pipe]] method. Optionally, you
|
||||
| can specify a component to add it #[strong before or after], tell
|
||||
| spaCy to add it #[strong first or last] in the pipeline, or define a
|
||||
| #[strong custom name]. If no name is set and no #[code name] attribute
|
||||
| is present on your component, the function name is used.
|
||||
|
||||
+code("Adding pipeline components").
|
||||
def my_component(doc):
|
||||
print("After tokenization, this doc has %s tokens." % len(doc))
|
||||
if len(doc) < 10:
|
||||
print("This is a pretty short document.")
|
||||
return doc
|
||||
|
||||
nlp = spacy.load('en')
|
||||
nlp.add_pipe(my_component, name='print_info', first=True)
|
||||
print(nlp.pipe_names) # ['print_info', 'tagger', 'parser', 'ner']
|
||||
doc = nlp(u"This is a sentence.")
|
||||
|
||||
p
|
||||
| Of course, you can also wrap your component as a class to allow
|
||||
| initialising it with custom settings and holding state within the component.
|
||||
| This is useful for #[strong stateful components], especially ones which
|
||||
| #[strong depend on shared data].
|
||||
|
||||
+code.
|
||||
class MyComponent(object):
|
||||
name = 'print_info'
|
||||
|
||||
def __init__(self, vocab, short_limit=10):
|
||||
self.vocab = vocab
|
||||
self.short_limit = short_limit
|
||||
|
||||
def __call__(self, doc):
|
||||
if len(doc) < self.short_limit:
|
||||
print("This is a pretty short document.")
|
||||
return doc
|
||||
|
||||
my_component = MyComponent(nlp.vocab, short_limit=25)
|
||||
nlp.add_pipe(my_component, first=True)
|
||||
|
||||
+h(3, "custom-components-attributes")
|
||||
| Extension attributes on #[code Doc], #[code Span] and #[code Token]
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| As of v2.0, spaCy allows you to set any custom attributes and methods
|
||||
| on the #[code Doc], #[code Span] and #[code Token], which become
|
||||
| available as #[code Doc._], #[code Span._] and #[code Token._] – for
|
||||
| example, #[code Token._.my_attr]. This lets you store additional
|
||||
| information relevant to your application, add new features and
|
||||
| functionality to spaCy, and implement your own models trained with other
|
||||
| machine learning libraries. It also lets you take advantage of spaCy's
|
||||
| data structures and the #[code Doc] object as the "single source of
|
||||
| truth".
|
||||
|
||||
+aside("Why ._?")
|
||||
| Writing to a #[code ._] attribute instead of to the #[code Doc] directly
|
||||
| keeps a clearer separation and makes it easier to ensure backwards
|
||||
| compatibility. For example, if you've implemented your own #[code .coref]
|
||||
| property and spaCy claims it one day, it'll break your code. Similarly,
|
||||
| just by looking at the code, you'll immediately know what's built-in and
|
||||
| what's custom – for example, #[code doc.sentiment] is spaCy, while
|
||||
| #[code doc._.sent_score] isn't.
|
||||
|
||||
p
|
||||
| There are three main types of extensions, which can be defined using the
|
||||
| #[+api("doc#set_extension") #[code Doc.set_extension]],
|
||||
| #[+api("span#set_extension") #[code Span.set_extension]] and
|
||||
| #[+api("token#set_extension") #[code Token.set_extension]] methods.
|
||||
|
||||
+list("numbers")
|
||||
+item #[strong Attribute extensions].
|
||||
| Set a default value for an attribute, which can be overwritten
|
||||
| manually at any time. Attribute extensions work like "normal"
|
||||
| variables and are the quickest way to store arbitrary information
|
||||
| on a #[code Doc], #[code Span] or #[code Token].
|
||||
|
||||
+code-wrapper
|
||||
+code.
|
||||
Doc.set_extension('hello', default=True)
|
||||
assert doc._.hello
|
||||
doc._.hello = False
|
||||
|
||||
+item #[strong Property extensions].
|
||||
| Define a getter and an optional setter function. If no setter is
|
||||
| provided, the extension is immutable. Since the getter and setter
|
||||
| functions are only called when you #[em retrieve] the attribute,
|
||||
| you can also access values of previously added attribute extensions.
|
||||
| For example, a #[code Doc] getter can average over #[code Token]
|
||||
| attributes. For #[code Span] extensions, you'll almost always want
|
||||
| to use a property – otherwise, you'd have to write to
|
||||
| #[em every possible] #[code Span] in the #[code Doc] to set up the
|
||||
| values correctly.
|
||||
|
||||
+code-wrapper
|
||||
+code.
|
||||
Doc.set_extension('hello', getter=get_hello_value, setter=set_hello_value)
|
||||
assert doc._.hello
|
||||
doc._.hello = 'Hi!'
|
||||
|
||||
+item #[strong Method extensions].
|
||||
| Assign a function that becomes available as an object method. Method
|
||||
| extensions are always immutable. For more details and implementation
|
||||
| ideas, see
|
||||
| #[+a("/usage/examples#custom-components-attr-methods") these examples].
|
||||
|
||||
+code-wrapper
|
||||
+code.o-no-block.
|
||||
Doc.set_extension('hello', method=lambda doc, name: 'Hi {}!'.format(name))
|
||||
assert doc._.hello('Bob') == 'Hi Bob!'
|
||||
|
||||
p
|
||||
| Before you can access a custom extension, you need to register it using
|
||||
| the #[code set_extension] method on the object you want
|
||||
| to add it to, e.g. the #[code Doc]. Keep in mind that extensions are
|
||||
| always #[strong added globally] and not just on a particular instance.
|
||||
| If an attribute of the same name
|
||||
| already exists, or if you're trying to access an attribute that hasn't
|
||||
| been registered, spaCy will raise an #[code AttributeError].
|
||||
|
||||
+code("Example").
|
||||
from spacy.tokens import Doc, Span, Token
|
||||
|
||||
fruits = ['apple', 'pear', 'banana', 'orange', 'strawberry']
|
||||
is_fruit_getter = lambda token: token.text in fruits
|
||||
has_fruit_getter = lambda obj: any([t.text in fruits for t in obj])
|
||||
|
||||
Token.set_extension('is_fruit', getter=is_fruit_getter)
|
||||
Doc.set_extension('has_fruit', getter=has_fruit_getter)
|
||||
Span.set_extension('has_fruit', getter=has_fruit_getter)
|
||||
|
||||
+aside-code("Usage example").
|
||||
doc = nlp(u"I have an apple and a melon")
|
||||
assert doc[3]._.is_fruit # get Token attributes
|
||||
assert not doc[0]._.is_fruit
|
||||
assert doc._.has_fruit # get Doc attributes
|
||||
assert doc[1:4]._.has_fruit # get Span attributes
|
||||
|
||||
p
|
||||
| Once you've registered your custom attribute, you can also use the
|
||||
| built-in #[code set], #[code get] and #[code has] methods to modify and
|
||||
| retrieve the attributes. This is especially useful if you want to pass in
|
||||
| a string instead of calling #[code doc._.my_attr].
|
||||
|
||||
+table(["Method", "Description", "Valid for", "Example"])
|
||||
+row
|
||||
+cell #[code ._.set()]
|
||||
+cell Set a value for an attribute.
|
||||
+cell Attributes, mutable properties.
|
||||
+cell #[code.u-break token._.set('my_attr', True)]
|
||||
|
||||
+row
|
||||
+cell #[code ._.get()]
|
||||
+cell Get the value of an attribute.
|
||||
+cell Attributes, mutable properties, immutable properties, methods.
|
||||
+cell #[code.u-break my_attr = span._.get('my_attr')]
|
||||
|
||||
+row
|
||||
+cell #[code ._.has()]
|
||||
+cell Check if an attribute exists.
|
||||
+cell Attributes, mutable properties, immutable properties, methods.
|
||||
+cell #[code.u-break doc._.has('my_attr')]
|
||||
|
||||
+infobox("How the ._ is implemented")
|
||||
| Extension definitions – the defaults, methods, getters and setters you
|
||||
| pass in to #[code set_extension] – are stored in class attributes on the
|
||||
| #[code Underscore] class. If you write to an extension attribute, e.g.
|
||||
| #[code doc._.hello = True], the data is stored within the
|
||||
| #[+api("doc#attributes") #[code Doc.user_data]] dictionary. To keep the
|
||||
| underscore data separate from your other dictionary entries, the string
|
||||
| #[code "._."] is placed before the name, in a tuple.
|
||||
|
||||
+h(4, "component-example1") Example: Custom sentence segmentation logic
|
||||
|
||||
p
|
||||
| Let's say you want to implement custom logic to improve spaCy's sentence
|
||||
| boundary detection. Currently, sentence segmentation is based on the
|
||||
| dependency parse, which doesn't always produce ideal results. The custom
|
||||
| logic should therefore be applied #[strong after] tokenization, but
|
||||
| #[strong before] the dependency parsing – this way, the parser can also
|
||||
| take advantage of the sentence boundaries.
|
||||
|
||||
+code.
|
||||
def sbd_component(doc):
|
||||
for i, token in enumerate(doc[:-2]):
|
||||
# define sentence start if period + titlecase token
|
||||
if token.text == '.' and doc[i+1].is_title:
|
||||
doc[i+1].sent_start = True
|
||||
return doc
|
||||
|
||||
nlp = spacy.load('en')
|
||||
nlp.add_pipe(sbd_component, before='parser') # insert before the parser
|
||||
|
||||
+h(4, "component-example2")
|
||||
| Example: Pipeline component for entity matching and tagging with
|
||||
| custom attributes
|
||||
|
||||
p
|
||||
| This example shows how to create a spaCy extension that takes a
|
||||
| terminology list (in this case, single- and multi-word company names),
|
||||
| matches the occurrences in a document, labels them as #[code ORG] entities,
|
||||
| merges the tokens and sets custom #[code is_tech_org] and
|
||||
| #[code has_tech_org] attributes. For efficient matching, the example uses
|
||||
| the #[+api("phrasematcher") #[code PhraseMatcher]] which accepts
|
||||
| #[code Doc] objects as match patterns and works well for large
|
||||
| terminology lists. It also ensures your patterns will always match, even
|
||||
| when you customise spaCy's tokenization rules. When you call #[code nlp]
|
||||
| on a text, the custom pipeline component is applied to the #[code Doc].
|
||||
|
||||
+github("spacy", "examples/pipeline/custom_component_entities.py", false, 500)
|
||||
|
||||
p
|
||||
| Wrapping this functionality in a
|
||||
| pipeline component allows you to reuse the module with different
|
||||
| settings, and have all pre-processing taken care of when you call
|
||||
| #[code nlp] on your text and receive a #[code Doc] object.
|
||||
|
||||
+h(4, "component-example3")
|
||||
| Example: Pipeline component for GPE entities and country meta data via a
|
||||
| REST API
|
||||
|
||||
p
|
||||
| This example shows the implementation of a pipeline component
|
||||
| that fetches country meta data via the
|
||||
| #[+a("https://restcountries.eu") REST Countries API] sets entity
|
||||
| annotations for countries, merges entities into one token and
|
||||
| sets custom attributes on the #[code Doc], #[code Span] and
|
||||
| #[code Token] – for example, the capital, latitude/longitude coordinates
|
||||
| and even the country flag.
|
||||
|
||||
+github("spacy", "examples/pipeline/custom_component_countries_api.py", false, 500)
|
||||
|
||||
p
|
||||
| In this case, all data can be fetched on initialisation in one request.
|
||||
| However, if you're working with text that contains incomplete country
|
||||
| names, spelling mistakes or foreign-language versions, you could also
|
||||
| implement a #[code like_country]-style getter function that makes a
|
||||
| request to the search API endpoint and returns the best-matching
|
||||
| result.
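p
    |  A hedged sketch of what such a getter could look like – the endpoint URL
    |  and the #[code like_country] attribute name are illustrative only, not
    |  part of the example above:

+code.
    import requests
    from spacy.tokens import Token

    def like_country_getter(token):
        # query the country search endpoint on demand (one request per access!)
        r = requests.get('https://restcountries.eu/rest/v2/name/{}'.format(token.text))
        return r.status_code == 200 and len(r.json()) > 0

    Token.set_extension('like_country', getter=like_country_getter)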
|
||||
|
||||
+h(4, "custom-components-usage-ideas") Other usage ideas
|
||||
|
||||
+list
|
||||
+item
|
||||
| #[strong Adding new features and hooking in models]. For example,
|
||||
| a sentiment analysis model, or your preferred solution for
|
||||
| lemmatization or sentiment analysis. spaCy's built-in tagger,
|
||||
| parser and entity recognizer respect annotations that were already
|
||||
| set on the #[code Doc] in a previous step of the pipeline.
|
||||
+item
|
||||
| #[strong Integrating other libraries and APIs]. For example, your
|
||||
| pipeline component can write additional information and data
|
||||
| directly to the #[code Doc] or #[code Token] as custom attributes,
|
||||
| while making sure no information is lost in the process. This can
|
||||
| be output generated by other libraries and models, or an external
|
||||
| service with a REST API.
|
||||
+item
|
||||
| #[strong Debugging and logging]. For example, a component which
|
||||
| stores and/or exports relevant information about the current state
|
||||
| of the processed document, and can be inserted at any point of your
|
||||
| pipeline – see the minimal sketch below.
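p
    |  A minimal, illustrative sketch of such a debugging component – the
    |  component name and the logged details are arbitrary choices:

+code("Debug logging component (sketch)").
    def debug_component(doc):
        # log some basic facts about the current state of the Doc
        print('Tokens: {}, entities: {}'.format(len(doc), len(doc.ents)))
        return doc

    nlp.add_pipe(debug_component, name='debug_logger', last=True)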
|
||||
|
||||
+infobox("Developing third-party extensions")
|
||||
| The new pipeline management and custom attributes finally make it easy
|
||||
| to develop your own spaCy extensions and plugins and share them with
|
||||
| others. Extensions can claim their own #[code ._] namespace and exist as
|
||||
| standalone packages. If you're developing a tool or library and want to
|
||||
| make it easy for others to use it with spaCy and add it to their
|
||||
| pipeline, all you have to do is expose a function that takes a
|
||||
| #[code Doc], modifies it and returns it. For more details and
|
||||
| #[strong best practices], see the section on
|
||||
| #[+a("#extensions") developing spaCy extensions].
|
||||
|
||||
+h(3, "custom-components-user-hooks") User hooks
|
||||
|
||||
p
|
||||
| While it's generally recommended to use the #[code Doc._], #[code Span._]
|
||||
| and #[code Token._] proxies to add your own custom attributes, spaCy
|
||||
| offers a few exceptions to allow #[strong customising the built-in methods]
|
||||
| like #[+api("doc#similarity") #[code Doc.similarity]] or
|
||||
| #[+api("doc#vector") #[code Doc.vector]]. with your own hooks, which can
|
||||
| rely on statistical models you train yourself. For instance, you can
|
||||
| provide your own on-the-fly sentence segmentation algorithm or document
|
||||
| similarity method.
|
||||
|
||||
p
|
||||
| Hooks let you customize some of the behaviours of the #[code Doc],
|
||||
| #[code Span] or #[code Token] objects by adding a component to the
|
||||
| pipeline. For instance, to customize the
|
||||
| #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a
|
||||
| component that sets a custom function to
|
||||
| #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity]
|
||||
| method will check the #[code user_hooks] dict, and delegate to your
|
||||
| function if you've set one. Similar results can be achieved by setting
|
||||
| functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks].
|
||||
|
||||
+aside("Implementation note")
|
||||
| The hooks live on the #[code Doc] object because the #[code Span] and
|
||||
| #[code Token] objects are created lazily, and don't own any data. They
|
||||
| just proxy to their parent #[code Doc]. This turns out to be convenient
|
||||
| here — we only have to worry about installing hooks in one place.
|
||||
|
||||
+table(["Name", "Customises"])
|
||||
+row
|
||||
+cell #[code user_hooks]
|
||||
+cell
|
||||
+api("doc#vector") #[code Doc.vector]
|
||||
+api("doc#has_vector") #[code Doc.has_vector]
|
||||
+api("doc#vector_norm") #[code Doc.vector_norm]
|
||||
+api("doc#sents") #[code Doc.sents]
|
||||
|
||||
+row
|
||||
+cell #[code user_token_hooks]
|
||||
+cell
|
||||
+api("token#similarity") #[code Token.similarity]
|
||||
+api("token#vector") #[code Token.vector]
|
||||
+api("token#has_vector") #[code Token.has_vector]
|
||||
+api("token#vector_norm") #[code Token.vector_norm]
|
||||
+api("token#conjuncts") #[code Token.conjuncts]
|
||||
|
||||
+row
|
||||
+cell #[code user_span_hooks]
|
||||
+cell
|
||||
+api("span#similarity") #[code Span.similarity]
|
||||
+api("span#vector") #[code Span.vector]
|
||||
+api("span#has_vector") #[code Span.has_vector]
|
||||
+api("span#vector_norm") #[code Span.vector_norm]
|
||||
+api("span#root") #[code Span.root]
|
||||
|
||||
+code("Add custom similarity hooks").
|
||||
class SimilarityModel(object):
|
||||
def __init__(self, model):
|
||||
self._model = model
|
||||
|
||||
def __call__(self, doc):
|
||||
doc.user_hooks['similarity'] = self.similarity
|
||||
doc.user_span_hooks['similarity'] = self.similarity
|
||||
doc.user_token_hooks['similarity'] = self.similarity
        return doc
|
||||
|
||||
def similarity(self, obj1, obj2):
|
||||
y = self._model([obj1.vector, obj2.vector])
|
||||
return float(y[0])
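p
    |  To use the hooks, the component only needs to be added to the pipeline –
    |  the #[code model] below is a stand-in for whatever similarity model you
    |  bring yourself:

+code.
    similarity = SimilarityModel(model)  # model is assumed to exist
    nlp.add_pipe(similarity)
    doc1 = nlp(u'The weather is nice today.')
    doc2 = nlp(u'It is quite sunny outside.')
    print(doc1.similarity(doc2))  # delegates to the custom hook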
|
|
@ -1,126 +0,0 @@
|
|||
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > EXAMPLES
|
||||
|
||||
p
|
||||
| To see real-world examples of pipeline factories and components in action,
|
||||
| you can have a look at the source of spaCy's built-in components, e.g.
|
||||
| the #[+api("tagger") #[code Tagger]], #[+api("parser") #[code Parser]] or
|
||||
| #[+api("entityrecognizer") #[code EntityRecongnizer]].
|
||||
|
||||
+h(3, "example1") Example: Custom sentence segmentation logic
|
||||
|
||||
p
|
||||
| Let's say you want to implement custom logic to improve spaCy's sentence
|
||||
| boundary detection. Currently, sentence segmentation is based on the
|
||||
| dependency parse, which doesn't always produce ideal results. The custom
|
||||
| logic should therefore be applied #[strong after] tokenization, but
|
||||
| #[strong before] the dependency parsing – this way, the parser can also
|
||||
| take advantage of the sentence boundaries.
|
||||
|
||||
+code.
|
||||
def sbd_component(doc):
|
||||
for i, token in enumerate(doc[:-2]):
|
||||
# define sentence start if period + titlecase token
|
||||
if token.text == '.' and doc[i+1].is_title:
|
||||
doc[i+1].sent_start = True
|
||||
return doc
|
||||
|
||||
p
|
||||
| In this case, we simply want to add the component to the existing
|
||||
| pipeline of the English model. We can do this by inserting it at index 0
|
||||
| of #[code nlp.pipeline]:
|
||||
|
||||
+code.
|
||||
nlp = spacy.load('en')
|
||||
nlp.pipeline.insert(0, sbd_component)
|
||||
|
||||
p
|
||||
| When you call #[code nlp] on some text, spaCy will tokenize it to create
|
||||
| a #[code Doc] object, and first call #[code sbd_component] on it, followed
|
||||
| by the model's default pipeline.
|
||||
|
||||
+h(3, "example2") Example: Sentiment model
|
||||
|
||||
p
|
||||
| Let's say you have trained your own document sentiment model on English
|
||||
| text. After tokenization, you want spaCy to first execute the
|
||||
| #[strong default tensorizer], followed by a custom
|
||||
| #[strong sentiment component] that adds a #[code .sentiment]
|
||||
| property to the #[code Doc], containing your model's sentiment prediction.
|
||||
|
||||
p
|
||||
| Your component class will have a #[code from_disk()] method that spaCy
|
||||
| calls to load the model data. When called, the component will compute
|
||||
| the sentiment score, add it to the #[code Doc] and return the modified
|
||||
| document. Optionally, the component can include an #[code update()] method
|
||||
| to allow training the model.
|
||||
|
||||
+code.
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
|
||||
class SentimentComponent(object):
|
||||
def __init__(self, vocab):
|
||||
self.weights = None
|
||||
|
||||
def __call__(self, doc):
|
||||
doc.sentiment = sum(self.weights*doc.vector) # set sentiment property
|
||||
return doc
|
||||
|
||||
def from_disk(self, path): # path = model path + factory ID ('sentiment')
|
||||
self.weights = pickle.load(Path(path) / 'weights.bin') # load weights
|
||||
return self
|
||||
|
||||
def update(self, doc, gold): # update weights – allows training!
|
||||
prediction = sum(self.weights*doc.vector)
|
||||
self.weights -= 0.001*doc.vector*(prediction-gold.sentiment)
|
||||
|
||||
p
|
||||
| The factory will initialise the component with the #[code Vocab] object.
|
||||
| To be able to add it to your model's pipeline as #[code 'sentiment'],
|
||||
| it also needs to be registered via
|
||||
| #[+api("spacy#set_factory") #[code set_factory()]].
|
||||
|
||||
+code.
|
||||
def sentiment_factory(vocab):
|
||||
component = SentimentComponent(vocab) # initialise component
|
||||
return component
|
||||
|
||||
spacy.set_factory('sentiment', sentiment_factory)
|
||||
|
||||
p
|
||||
| The above code should be #[strong shipped with your model]. You can use
|
||||
| the #[+api("cli#package") #[code package]] command to create all required
|
||||
| files and directories. The model package will include an
|
||||
| #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) #[code __init__.py]]
|
||||
| with a #[code load()] method, that will initialise the language class with
|
||||
| the model's pipeline and call the #[code from_disk()] method to load
|
||||
| the model data.
|
||||
|
||||
p
|
||||
| In the model package's meta.json, specify the language class and pipeline
|
||||
| IDs:
|
||||
|
||||
+code("meta.json (excerpt)", "json").
|
||||
{
|
||||
"name": "sentiment_model",
|
||||
"lang": "en",
|
||||
"version": "1.0.0",
|
||||
"spacy_version": ">=2.0.0,<3.0.0",
|
||||
"pipeline": ["tensorizer", "sentiment"]
|
||||
}
|
||||
|
||||
p
|
||||
| When you load your new model, spaCy will call the model's #[code load()]
|
||||
| method. This will return a #[code Language] object with a pipeline
|
||||
| containing the default tensorizer, and the sentiment component returned
|
||||
| by your custom #[code "sentiment"] factory.
|
||||
|
||||
+code.
|
||||
nlp = spacy.load('en_sentiment_model')
|
||||
doc = nlp(u'I love pizza')
|
||||
assert doc.sentiment
|
||||
|
||||
+infobox("Saving and loading models")
|
||||
| For more information and a detailed guide on how to package your model,
|
||||
| see the documentation on
|
||||
| #[+a("/usage/training#saving-loading") saving and loading models].
|
110
website/usage/_processing-pipelines/_extensions.jade
Normal file
|
@ -0,0 +1,110 @@
|
|||
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > DEVELOPING EXTENSIONS
|
||||
|
||||
p
|
||||
| We're very excited about all the new possibilities for community
|
||||
| extensions and plugins in spaCy v2.0, and we can't wait to see what
|
||||
| you build with it! To get you started, here are a few tips, tricks and
|
||||
| best practices:
|
||||
|
||||
+list
|
||||
+item
|
||||
| Make sure to choose a #[strong descriptive and specific name] for
|
||||
| your pipeline component class, and set it as its #[code name]
|
||||
| attribute. Avoid names that are too common or likely to clash with
|
||||
| built-in or a user's other custom components. While it's fine to call
|
||||
| your package "spacy_my_extension", avoid component names including
|
||||
| "spacy", since this can easily lead to confusion.
|
||||
|
||||
+code-wrapper
|
||||
+code-new name = 'myapp_lemmatizer'
|
||||
+code-old name = 'lemmatizer'
|
||||
|
||||
+item
|
||||
| When writing to #[code Doc], #[code Token] or #[code Span] objects,
|
||||
| #[strong use getter functions] wherever possible, and avoid setting
|
||||
| values explicitly. Tokens and spans don't own any data themselves,
|
||||
| so you should provide a function that allows them to compute the
|
||||
| values instead of writing static properties to individual objects.
|
||||
|
||||
+code-wrapper
|
||||
+code-new.
|
||||
is_fruit = lambda token: token.text in ('apple', 'orange')
|
||||
Token.set_extension('is_fruit', getter=is_fruit)
|
||||
+code-old.
|
||||
token._.set_extension('is_fruit', default=False)
|
||||
if token.text in ('apple', 'orange'):
|
||||
token._.set('is_fruit', True)
|
||||
|
||||
+item
|
||||
| Always add your custom attributes to the #[strong global] #[code Doc],
|
||||
| #[code Token] or #[code Span] objects, not a particular instance of
|
||||
| them. Add the attributes #[strong as early as possible], e.g. in
|
||||
| your extension's #[code __init__] method or in the global scope of
|
||||
| your module. This means that in the case of namespace collisions,
|
||||
| the user will see an error immediately, not just when they run their
|
||||
| pipeline.
|
||||
|
||||
+code-wrapper
|
||||
+code-new.
|
||||
from spacy.tokens import Doc
|
||||
def __init__(self, attr='my_attr'):
|
||||
Doc.set_extension(attr, getter=self.get_doc_attr)
|
||||
+code-old.
|
||||
def __call__(doc):
|
||||
doc.set_extension('my_attr', getter=self.get_doc_attr)
|
||||
|
||||
+item
|
||||
| If your extension is setting properties on the #[code Doc],
|
||||
| #[code Token] or #[code Span], include an option to
|
||||
| #[strong let the user change those attribute names]. This makes
|
||||
| it easier to avoid namespace collisions and accommodate users with
|
||||
| different naming preferences. We recommend adding an #[code attrs]
|
||||
| argument to the #[code __init__] method of your class so you can
|
||||
| write the names to class attributes and reuse them across your
|
||||
| component.
|
||||
|
||||
+code-wrapper
|
||||
+code-new Doc.set_extension(self.doc_attr, default='some value')
|
||||
+code-old Doc.set_extension('my_doc_attr', default='some value')
|
||||
|
||||
+item
|
||||
| Ideally, extensions should be #[strong standalone packages] with
|
||||
| spaCy and optionally, other packages specified as a dependency. They
|
||||
| can freely assign to their own #[code ._] namespace, but should stick
|
||||
| to that. If your extension's only job is to provide a better
|
||||
| #[code .similarity] implementation, and your docs state this
|
||||
| explicitly, there's no problem with writing to the
|
||||
| #[+a("#custom-components-user-hooks") #[code user_hooks]], and
|
||||
| overwriting spaCy's built-in method. However, a third-party
|
||||
| extension should #[strong never silently overwrite built-ins], or
|
||||
| attributes set by other extensions.
|
||||
|
||||
+item
|
||||
| If you're looking to publish a model that depends on a custom
|
||||
| pipeline component, you can either #[strong require it] in the model
|
||||
| package's dependencies, or – if the component is specific and
|
||||
| lightweight – choose to #[strong ship it with your model package]
|
||||
| and add it to the #[code Language] instance returned by the
|
||||
| model's #[code load()] method. For examples of this, check out the
|
||||
| implementations of spaCy's
|
||||
| #[+api("util#load_model_from_init_py") #[code load_model_from_init_py()]]
|
||||
| and #[+api("util#load_model_from_path") #[code load_model_from_path()]]
|
||||
| utility functions.
|
||||
|
||||
+code-wrapper
|
||||
+code-new.
|
||||
nlp.add_pipe(my_custom_component)
|
||||
return nlp.from_disk(model_path)
|
||||
|
||||
+item
|
||||
| Once you're ready to share your extension with others, make sure to
|
||||
| #[strong add docs and installation instructions] (you can
|
||||
| always link to this page for more info). Make it easy for others to
|
||||
| install and use your extension, for example by uploading it to
|
||||
| #[+a("https://pypi.python.org") PyPi]. If you're sharing your code on
|
||||
| GitHub, don't forget to tag it
|
||||
| with #[+a("https://github.com/search?q=topic%3Aspacy") #[code spacy]]
|
||||
| and #[+a("https://github.com/search?q=topic%3Aspacy-pipeline") #[code spacy-pipeline]]
|
||||
| to help people find it. If you post it on Twitter, feel free to tag
|
||||
| #[+a("https://twitter.com/" + SOCIAL.twitter) @#{SOCIAL.twitter}]
|
||||
| so we can check it out.
|
|
@ -11,7 +11,7 @@ p
|
|||
|
||||
p
|
||||
| When you load a model, spaCy first consults the model's
|
||||
| #[+a("/usage/saving-loading#models-generating") meta.json]. The
|
||||
| #[+a("/usage/saving-loading#models-generating") #[code meta.json]]. The
|
||||
| meta typically includes the model details, the ID of a language class,
|
||||
| and an optional list of pipeline components. spaCy then does the
|
||||
| following:
|
||||
|
@ -21,24 +21,26 @@ p
|
|||
"name": "example_model",
|
||||
"lang": "en"
|
||||
"description": "Example model for spaCy",
|
||||
"pipeline": ["tensorizer", "tagger"]
|
||||
"pipeline": ["tagger", "parser"]
|
||||
}
|
||||
|
||||
+list("numbers")
|
||||
+item
|
||||
| Look up #[strong pipeline IDs] in the available
|
||||
| #[strong pipeline factories].
|
||||
+item
|
||||
| Initialise the #[strong pipeline components] by calling their
|
||||
| factories with the #[code Vocab] as an argument. This gives each
|
||||
| factory and component access to the pipeline's shared data, like
|
||||
| strings, morphology and annotation scheme.
|
||||
+item
|
||||
| Load the #[strong language class and data] for the given ID via
|
||||
| #[+api("util.get_lang_class") #[code get_lang_class]].
|
||||
| #[+api("util.get_lang_class") #[code get_lang_class]] and initialise
|
||||
| it. The #[code Language] class contains the shared vocabulary,
|
||||
| tokenization rules and the language-specific annotation scheme.
|
||||
+item
|
||||
| Pass the path to the #[strong model data] to the #[code Language]
|
||||
| class and return it.
|
||||
| Iterate over the #[strong pipeline names] and create each component
|
||||
| using #[+api("language#create_pipe") #[code create_pipe]], which
|
||||
| looks them up in #[code Language.factories].
|
||||
+item
|
||||
| Add each pipeline component to the pipeline in order, using
|
||||
| #[+api("language#add_pipe") #[code add_pipe]].
|
||||
+item
|
||||
| Make the #[strong model data] available to the #[code Language] class
|
||||
| by calling #[+api("language#from_disk") #[code from_disk]] with the
|
||||
| path to the model data directory.
|
||||
|
||||
p
|
||||
| So when you call this...
|
||||
|
@ -47,12 +49,12 @@ p
|
|||
nlp = spacy.load('en')
|
||||
|
||||
p
|
||||
| ... the model tells spaCy to use the pipeline
|
||||
| ... the model tells spaCy to use the language #[code "en"] and the pipeline
|
||||
| #[code.u-break ["tensorizer", "tagger", "parser", "ner"]]. spaCy will
|
||||
| then look up each string in its internal factories registry and
|
||||
| initialise the individual components. It'll then load
|
||||
| #[code spacy.lang.en.English], pass it the path to the model's data
|
||||
| directory, and return it for you to use as the #[code nlp] object.
|
||||
| then initialise #[code spacy.lang.en.English], and create each pipeline
|
||||
| component and add it to the processing pipeline. It'll then load in the
|
||||
| model's data from its data directory and return the modified
|
||||
| #[code Language] class for you to use as the #[code nlp] object.
|
||||
|
||||
p
|
||||
| Fundamentally, a #[+a("/models") spaCy model] consists of three
|
||||
|
@ -73,9 +75,12 @@ p
|
|||
pipeline = ['tensorizer', 'tagger', 'parser', 'ner']
|
||||
data_path = 'path/to/en_core_web_sm/en_core_web_sm-2.0.0'
|
||||
|
||||
cls = spacy.util.get_lang_class(lang) # 1. get Language instance, e.g. English()
|
||||
nlp = cls(pipeline=pipeline) # 2. initialise it with the pipeline
|
||||
nlp.from_disk(model_data_path) # 3. load in the binary data
|
||||
cls = spacy.util.get_lang_class(lang) # 1. get the Language class, e.g. English
|
||||
nlp = cls() # 2. initialise it
|
||||
for name in pipeline:
|
||||
component = nlp.create_pipe(name) # 3. create the pipeline components
|
||||
nlp.add_pipe(component) # 4. add the component to the pipeline
|
||||
nlp.from_disk(model_data_path) # 5. load in the binary data
|
||||
|
||||
p
|
||||
| When you call #[code nlp] on a text, spaCy will #[strong tokenize] it and
|
||||
|
@ -87,124 +92,23 @@ p
|
|||
| document, which is then processed by the component next in the pipeline.
|
||||
|
||||
+code("The pipeline under the hood").
|
||||
doc = nlp.make_doc(u'This is a sentence')
|
||||
for proc in nlp.pipeline:
|
||||
doc = proc(doc)
|
||||
|
||||
+h(3, "creating") Creating pipeline components and factories
|
||||
doc = nlp.make_doc(u'This is a sentence') # create a Doc from raw text
|
||||
for name, proc in nlp.pipeline: # iterate over components in order
|
||||
doc = proc(doc) # apply each component
|
||||
|
||||
p
|
||||
| spaCy lets you customise the pipeline with your own components. Components
|
||||
| are functions that receive a #[code Doc] object, modify and return it.
|
||||
| If your component is stateful, you'll want to create a new one for each
|
||||
| pipeline. You can do that by defining and registering a factory which
|
||||
| receives the shared #[code Vocab] object and returns a component.
|
||||
|
||||
+h(4, "creating-component") Creating a component
|
||||
|
||||
p
|
||||
| A component receives a #[code Doc] object and
|
||||
| #[strong performs the actual processing] – for example, using the current
|
||||
| weights to make a prediction and set some annotation on the document. By
|
||||
| adding a component to the pipeline, you'll get access to the #[code Doc]
|
||||
| at any point #[strong during] processing – instead of only being able to
|
||||
| modify it afterwards.
|
||||
|
||||
+aside-code("Example").
|
||||
def my_component(doc):
|
||||
# do something to the doc here
|
||||
return doc
|
||||
|
||||
+table(["Argument", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code doc]
|
||||
+cell #[code Doc]
|
||||
+cell The #[code Doc] object processed by the previous component.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell #[code Doc]
|
||||
+cell The #[code Doc] object processed by this pipeline component.
|
||||
|
||||
p
|
||||
| When creating a new #[code Language] class, you can pass it a list of
|
||||
| pipeline component functions to execute in that order. You can also
|
||||
| add it to an existing pipeline by modifying #[code nlp.pipeline] – just
|
||||
| be careful not to overwrite a pipeline or its components by accident!
|
||||
| The current processing pipeline is available as #[code nlp.pipeline],
|
||||
| which returns a list of #[code (name, component)] tuples, or
|
||||
| #[code nlp.pipe_names], which only returns a list of human-readable
|
||||
| component names.
|
||||
|
||||
+code.
|
||||
# Create a new Language object with a pipeline
|
||||
from spacy.language import Language
|
||||
nlp = Language(pipeline=[my_component])
|
||||
nlp.pipeline
|
||||
# [('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)]
|
||||
nlp.pipe_names
|
||||
# ['tagger', 'parser', 'ner']
|
||||
|
||||
# Modify an existing pipeline
|
||||
nlp = spacy.load('en')
|
||||
nlp.pipeline.append(my_component)
|
||||
|
||||
+h(4, "creating-factory") Creating a factory
|
||||
|
||||
p
|
||||
| A factory is a #[strong function that returns a pipeline component].
|
||||
| It's called with the #[code Vocab] object, to give it access to the
|
||||
| shared data between components – for example, the strings, morphology,
|
||||
| vectors or annotation scheme. Factories are useful for creating
|
||||
| #[strong stateful components], especially ones which
|
||||
| #[strong depend on shared data].
|
||||
|
||||
+aside-code("Example").
|
||||
def my_factory(vocab):
|
||||
# load some state
|
||||
def my_component(doc):
|
||||
# process the doc
|
||||
return doc
|
||||
return my_component
|
||||
|
||||
+table(["Argument", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell
|
||||
| Shared data between components, including strings, morphology,
|
||||
| vectors etc.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell callable
|
||||
+cell The pipeline component.
|
||||
|
||||
p
|
||||
| By creating a factory, you're essentially telling spaCy how to get the
|
||||
| pipeline component #[strong once the vocab is available]. Factories need to
|
||||
| be registered via #[+api("spacy#set_factory") #[code set_factory()]] and
|
||||
| by assigning them a unique ID. This ID can be added to the pipeline as a
|
||||
| string. When creating a pipeline, you're free to mix strings and
|
||||
| callable components:
|
||||
|
||||
+code.
|
||||
spacy.set_factory('my_factory', my_factory)
|
||||
nlp = Language(pipeline=['my_factory', my_other_component])
|
||||
|
||||
p
|
||||
| If spaCy comes across a string in the pipeline, it will try to resolve it
|
||||
| by looking it up in the available factories. The factory will then be
|
||||
| initialised with the #[code Vocab]. Providing factory names instead of
|
||||
| callables also makes it easy to specify them in the model's
|
||||
| #[+a("/usage/saving-loading#models-generating") meta.json]. If you're
|
||||
| training your own model and want to use one of spaCy's default components,
|
||||
| you won't have to worry about finding and implementing it either – to use
|
||||
| the default tagger, simply add #[code "tagger"] to the pipeline, and
|
||||
| #[strong spaCy will know what to do].
|
||||
|
||||
+infobox("Important note")
|
||||
| Because factories are #[strong resolved on initialisation] of the
|
||||
| #[code Language] class, it's #[strong not possible] to add them to the
|
||||
| pipeline afterwards, e.g. by modifying #[code nlp.pipeline]. This only
|
||||
| works with individual component functions. To use factories, you need to
|
||||
| create a new #[code Language] object, or generate a
|
||||
| #[+a("/usage/training#models-generating") model package] with
|
||||
| a custom pipeline.
|
||||
|
||||
+h(3, "disabling") Disabling pipeline components
|
||||
+h(3, "disabling") Disabling and modifying pipeline components
|
||||
|
||||
p
|
||||
| If you don't need a particular component of the pipeline – for
|
||||
|
@ -217,16 +121,19 @@ p
|
|||
+code.
|
||||
nlp = spacy.load('en', disable=['parser', 'tagger'])
|
||||
nlp = English().from_disk('/model', disable=['tensorizer', 'ner'])
|
||||
doc = nlp(u"I don't want parsed", disable=['parser'])
|
||||
|
||||
p
|
||||
| Note that you can't write directly to #[code nlp.pipeline], as this list
|
||||
| holds the #[em actual components], not the IDs. However, if you know the
|
||||
| order of the components, you can still slice the list:
|
||||
| You can also use the #[+api("language#remove_pipe") #[code remove_pipe]]
|
||||
| method to remove pipeline components from an existing pipeline, the
|
||||
| #[+api("language#rename_pipe") #[code rename_pipe]] method to rename them,
|
||||
| or the #[+api("language#replace_pipe") #[code replace_pipe]] method
|
||||
| to replace them with a custom component entirely (more details on this
|
||||
| in the section on #[+a("#custom-components") custom components]).
|
||||
|
||||
+code.
|
||||
nlp = spacy.load('en')
|
||||
nlp.pipeline = nlp.pipeline[:2] # only use the first two components
|
||||
nlp.remove_pipe('parser')
|
||||
nlp.rename_pipe('ner', 'entityrecognizer')
|
||||
nlp.replace_pipe('tagger', my_custom_tagger)
|
||||
|
||||
+infobox("Important note: disabling pipeline components")
|
||||
.o-block
|
||||
|
@ -234,12 +141,14 @@ p
|
|||
| processing pipeline components, the #[code parser], #[code tagger]
|
||||
| and #[code entity] keyword arguments have been replaced with
|
||||
| #[code disable], which takes a list of pipeline component names.
|
||||
| This lets you disable both default and custom components when loading
|
||||
| This lets you disable pre-defined components when loading
|
||||
| a model, or initialising a Language class via
|
||||
| #[+api("language-from_disk") #[code from_disk]].
|
||||
|
||||
+code-new.
|
||||
nlp = spacy.load('en', disable=['tagger', 'ner'])
|
||||
doc = nlp(u"I don't want parsed", disable=['parser'])
|
||||
nlp = spacy.load('en', disable=['ner'])
|
||||
nlp.remove_pipe('parser')
|
||||
doc = nlp(u"I don't want parsed")
|
||||
+code-old.
|
||||
nlp = spacy.load('en', tagger=False, entity=False)
|
||||
doc = nlp(u"I don't want parsed", parse=False)
|
||||
|
|
|
@ -1,61 +0,0 @@
|
|||
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > ATTRIBUTE HOOKS
|
||||
|
||||
p
|
||||
| Hooks let you customize some of the behaviours of the #[code Doc],
|
||||
| #[code Span] or #[code Token] objects by adding a component to the
|
||||
| pipeline. For instance, to customize the
|
||||
| #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a
|
||||
| component that sets a custom function to
|
||||
| #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity]
|
||||
| method will check the #[code user_hooks] dict, and delegate to your
|
||||
| function if you've set one. Similar results can be achieved by setting
|
||||
| functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks].
|
||||
|
||||
+code("Polymorphic similarity example").
|
||||
span.similarity(doc)
|
||||
token.similarity(span)
|
||||
doc1.similarity(doc2)
|
||||
|
||||
p
|
||||
| By default, this just averages the vectors for each document, and
|
||||
| computes their cosine. Obviously, spaCy should make it easy for you to
|
||||
| install your own similarity model. This introduces a tricky design
|
||||
| challenge. The current solution is to add three more dicts to the
|
||||
| #[code Doc] object:
|
||||
|
||||
+aside("Implementation note")
|
||||
| The hooks live on the #[code Doc] object because the #[code Span] and
|
||||
| #[code Token] objects are created lazily, and don't own any data. They
|
||||
| just proxy to their parent #[code Doc]. This turns out to be convenient
|
||||
| here — we only have to worry about installing hooks in one place.
|
||||
|
||||
+table(["Name", "Description"])
|
||||
+row
|
||||
+cell #[code user_hooks]
|
||||
+cell Customise behaviour of #[code doc.vector], #[code doc.has_vector], #[code doc.vector_norm] or #[code doc.sents]
|
||||
|
||||
+row
|
||||
+cell #[code user_token_hooks]
|
||||
+cell Customise behaviour of #[code token.similarity], #[code token.vector], #[code token.has_vector], #[code token.vector_norm] or #[code token.conjuncts]
|
||||
|
||||
+row
|
||||
+cell #[code user_span_hooks]
|
||||
+cell Customise behaviour of #[code span.similarity], #[code span.vector], #[code span.has_vector], #[code span.vector_norm] or #[code span.root]
|
||||
|
||||
p
|
||||
| To sum up, here's an example of hooking in custom #[code .similarity()]
|
||||
| methods:
|
||||
|
||||
+code("Add custom similarity hooks").
|
||||
class SimilarityModel(object):
|
||||
def __init__(self, model):
|
||||
self._model = model
|
||||
|
||||
def __call__(self, doc):
|
||||
doc.user_hooks['similarity'] = self.similarity
|
||||
doc.user_span_hooks['similarity'] = self.similarity
|
||||
doc.user_token_hooks['similarity'] = self.similarity
|
||||
|
||||
def similarity(self, obj1, obj2):
|
||||
y = self._model([obj1.vector, obj2.vector])
|
||||
return float(y[0])
|
|
@ -175,7 +175,7 @@ p
|
|||
|
||||
+code.
|
||||
import spacy
|
||||
from spacy.tokens.doc import Doc
|
||||
from spacy.tokens import Doc
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
nlp = spacy.load('en')
|
||||
|
|
|
@ -61,7 +61,7 @@ p
|
|||
output_path.open('w', encoding='utf-8').write(svg)
|
||||
|
||||
p
|
||||
| The above code will generate the dependency visualizations and them to
|
||||
| The above code will generate the dependency visualizations as
|
||||
| two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg].
|
||||
|
||||
|
||||
|
|
|
@ -2,6 +2,44 @@
|
|||
|
||||
include ../_includes/_mixins
|
||||
|
||||
+section("pipeline")
|
||||
+h(3, "custom-components-entities") Custom pipeline components and attribute extensions
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| This example shows the implementation of a pipeline component
|
||||
| that sets entity annotations based on a list of single or
|
||||
| multiple-word company names, merges entities into one token and
|
||||
| sets custom attributes on the #[code Doc], #[code Span] and
|
||||
| #[code Token].
|
||||
|
||||
+github("spacy", "examples/pipeline/custom_component_entities.py")
|
||||
|
||||
+h(3, "custom-components-api")
|
||||
| Custom pipeline components and attribute extensions via a REST API
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| This example shows the implementation of a pipeline component
|
||||
| that fetches country meta data via the
|
||||
| #[+a("https://restcountries.eu") REST Countries API] sets entity
|
||||
| annotations for countries, merges entities into one token and
|
||||
| sets custom attributes on the #[code Doc], #[code Span] and
|
||||
| #[code Token] – for example, the capital, latitude/longitude
|
||||
| coordinates and the country flag.
|
||||
|
||||
+github("spacy", "examples/pipeline/custom_component_countries_api.py")
|
||||
|
||||
+h(3, "custom-components-attr-methods") Custom method extensions
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| A collection of snippets showing examples of extensions adding
|
||||
| custom methods to the #[code Doc], #[code Token] and
|
||||
| #[code Span].
|
||||
|
||||
+github("spacy", "examples/pipeline/custom_attr_methods.py")
|
||||
|
||||
+section("matching")
|
||||
+h(3, "matcher") Using spaCy's rule-based matcher
|
||||
|
||||
|
|
|
@ -8,18 +8,18 @@ include _spacy-101/_pipelines
|
|||
+h(2, "pipelines") How pipelines work
|
||||
include _processing-pipelines/_pipelines
|
||||
|
||||
+section("examples")
|
||||
+h(2, "examples") Examples
|
||||
include _processing-pipelines/_examples
|
||||
+section("custom-components")
|
||||
+h(2, "custom-components") Creating custom pipeline components
|
||||
include _processing-pipelines/_custom-components
|
||||
|
||||
+section("extensions")
|
||||
+h(2, "extensions") Developing spaCy extensions
|
||||
include _processing-pipelines/_extensions
|
||||
|
||||
+section("multithreading")
|
||||
+h(2, "multithreading") Multi-threading
|
||||
include _processing-pipelines/_multithreading
|
||||
|
||||
+section("user-hooks")
|
||||
+h(2, "user-hooks") User hooks
|
||||
include _processing-pipelines/_user-hooks
|
||||
|
||||
+section("serialization")
|
||||
+h(2, "serialization") Serialization
|
||||
include _processing-pipelines/_serialization
|
||||
|
|
|
@ -102,30 +102,36 @@ p
|
|||
+h(3, "features-pipelines") Improved processing pipelines
|
||||
|
||||
+aside-code("Example").
|
||||
# Modify an existing pipeline
|
||||
nlp = spacy.load('en')
|
||||
nlp.pipeline.append(my_component)
|
||||
# Set custom attributes
|
||||
Doc.set_extension('my_attr', default=False)
|
||||
Token.set_extension('my_attr', getter=my_token_getter)
|
||||
assert doc._.my_attr, token._.my_attr
|
||||
|
||||
# Register a factory to create a component
|
||||
spacy.set_factory('my_factory', my_factory)
|
||||
nlp = Language(pipeline=['my_factory', mycomponent])
|
||||
# Add components to the pipeline
|
||||
my_component = lambda doc: doc
|
||||
nlp.add_pipe(my_component)
|
||||
|
||||
p
|
||||
| It's now much easier to #[strong customise the pipeline] with your own
|
||||
| components, functions that receive a #[code Doc] object, modify and
|
||||
| return it. If your component is stateful, you can define and register a
|
||||
| factory which receives the shared #[code Vocab] object and returns a
|
||||
| component. spaCy's default components can be added to your pipeline by
|
||||
| using their string IDs. This way, you won't have to worry about finding
|
||||
| and implementing them – simply add #[code "tagger"] to the pipeline,
|
||||
| and spaCy will know what to do.
|
||||
| components: functions that receive a #[code Doc] object, modify and
|
||||
| return it. Extensions let you write any
|
||||
| #[strong attributes, properties and methods] to the #[code Doc],
|
||||
| #[code Token] and #[code Span]. You can add data, implement new
|
||||
| features, integrate other libraries with spaCy or plug in your own
|
||||
| machine learning models.
|
||||
|
||||
+image
|
||||
include ../assets/img/pipeline.svg
|
||||
|
||||
+infobox
|
||||
| #[+label-inline API:] #[+api("language") #[code Language]]
|
||||
| #[+label-inline Usage:] #[+a("/usage/language-processing-pipeline") Processing text]
|
||||
| #[+label-inline API:] #[+api("language") #[code Language]],
|
||||
| #[+api("doc#set_extension") #[code Doc.set_extension]],
|
||||
| #[+api("span#set_extension") #[code Span.set_extension]],
|
||||
| #[+api("token#set_extension") #[code Token.set_extension]]
|
||||
| #[+label-inline Usage:]
|
||||
| #[+a("/usage/processing-pipelines") Processing pipelines]
|
||||
| #[+label-inline Code:]
|
||||
| #[+src("/usage/examples#section-pipeline") Pipeline examples]
|
||||
|
||||
+h(3, "features-text-classification") Text classification
|
||||
|
||||
|
@ -478,15 +484,16 @@ p
|
|||
p
|
||||
| If you've been using custom pipeline components, check out the new
|
||||
| guide on #[+a("/usage/language-processing-pipelines") processing pipelines].
|
||||
| Appending functions to the pipeline still works – but you might be able
|
||||
| to make this more convenient by registering "component factories".
|
||||
| Components of the processing pipeline can now be disabled by passing a
|
||||
| list of their names to the #[code disable] keyword argument on loading
|
||||
| or processing.
|
||||
| Appending functions to the pipeline still works – but the
|
||||
| #[+api("language#add_pipe") #[code add_pipe]] methods now makes this
|
||||
| much more convenient. Components of the processing pipeline can now
|
||||
| be disabled by passing a list of their names to the #[code disable]
|
||||
| keyword argument on load, or by simply removing them from the
|
||||
| pipeline altogether.
|
||||
|
||||
+code-new.
|
||||
nlp = spacy.load('en', disable=['tagger', 'ner'])
|
||||
doc = nlp(u"I don't want parsed", disable=['parser'])
|
||||
nlp.remove_pipe('parser')
|
||||
+code-old.
|
||||
nlp = spacy.load('en', tagger=False, entity=False)
|
||||
doc = nlp(u"I don't want parsed", parse=False)
|
||||
|
|