Merge pull request #1408 from explosion/feature/dot-underscore

💫 Custom attributes via Doc._, Token._ and Span._
2025-12-12 12:44:29 +03:00 · 2017-10-11 18:35:56 +02:00 · 2017-10-11 18:35:56 +02:00 · 37aa523a8e
commit 37aa523a8e
parent 40dbc85ffa 51519251c2
23 changed files with 1142 additions and 175 deletions
--- a/examples/pipeline/custom_attr_methods.py
+++ b/examples/pipeline/custom_attr_methods.py
@ -0,0 +1,52 @@
 # coding: utf-8
 """This example contains several snippets of methods that can be set via custom
 Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like
 they're "bound" to the object and are partially applied – i.e. the object
 they're called on is passed in as the first argument."""
 from __future__ import unicode_literals
 from spacy.lang.en import English
 from spacy.tokens import Doc, Span
 from spacy import displacy
 from pathlib import Path
 def to_html(doc, output='/tmp', style='dep'):
    """Doc method extension for saving the current state as a displaCy
    visualization.

    doc: The Doc to render (passed in automatically when called via doc._).
    output: Directory the HTML file is written to.
    style: displaCy visualisation style, forwarded to displacy.render.
    """
    # build the filename from the non-punctuation tokens among the first six
    file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html'
    output_path = Path(output) / file_name
    html = displacy.render(doc, style=style, page=True)  # render markup
    # use a context manager so the file handle is flushed and closed
    # deterministically instead of being left to the garbage collector
    with output_path.open('w', encoding='utf-8') as html_file:
        html_file.write(html)  # save to file
    print('Saved HTML to {}'.format(output_path))
 # register to_html as a method extension, so it is callable as doc._.to_html
 Doc.set_extension('to_html', method=to_html)
 nlp = English()
 doc = nlp(u"This is a sentence about Apple.")
 # add entity manually for demo purposes, to make it work without a model
 doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])]
 # the Doc itself is supplied as the first argument; only style is passed here
 doc._.to_html(style='ent')
 def overlap_tokens(doc, other_doc):
    """Get the tokens from the original Doc that are also in the comparison Doc.

    doc: The Doc whose tokens are returned (passed in automatically when
        called via doc._).
    other_doc: The Doc to compare against; tokens are compared by their text.
    RETURNS (list): Tokens of `doc`, in document order, whose text also occurs
        in `other_doc`.
    """
    # collect the comparison texts in a set once, so each membership test is
    # constant-time instead of a scan over a list
    other_texts = {token.text for token in other_doc}
    return [token for token in doc if token.text in other_texts]
 # register overlap_tokens as a method extension named 'overlap'
 Doc.set_extension('overlap', method=overlap_tokens)
 nlp = English()
 doc1 = nlp(u"Peach emoji is where it has always been.")
 doc2 = nlp(u"Peach is the superior emoji.")
 # doc1 is supplied as the first argument, doc2 is passed as other_doc
 tokens = doc1._.overlap(doc2)
 print(tokens)
--- a/examples/pipeline/custom_component_countries_api.py
+++ b/examples/pipeline/custom_component_countries_api.py
@ -0,0 +1,108 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import requests
 from spacy.lang.en import English
 from spacy.matcher import PhraseMatcher
 from spacy.tokens import Doc, Span, Token
 class RESTCountriesComponent(object):
    """Example of a spaCy v2.0 pipeline component that requests all countries
    via the REST Countries API, merges country names into one token, assigns
    entity labels and sets attributes on country tokens, e.g. the capital and
    lat/lng coordinates. Can be extended with more details from the API.
    REST Countries API: https://restcountries.eu
    API License: Mozilla Public License MPL 2.0
    """
    name = 'rest_countries' # component name, will show up in the pipeline
    def __init__(self, nlp, label='GPE', timeout=10):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.

        nlp: The shared Language instance.
        label: Entity label assigned to matched country names.
        timeout: Seconds to wait for the REST Countries API before giving up,
            so a stalled connection cannot block initialisation forever.
        """
        # Make request once on initialisation and store the data
        r = requests.get('https://restcountries.eu/rest/v2/all',
                         timeout=timeout)
        r.raise_for_status()  # make sure requests raises an error if it fails
        countries = r.json()
        # Convert API response to dict keyed by country name for easy lookup
        # This could also be extended using the alternative and foreign language
        # names provided by the API
        self.countries = {c['name']: c for c in countries}
        self.label = nlp.vocab.strings[label]  # get entity label ID
        # Set up the PhraseMatcher with Doc patterns for each country name
        patterns = [nlp(c) for c in self.countries]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('COUNTRIES', None, *patterns)
        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        # If no default value is set, it defaults to None.
        Token.set_extension('is_country', default=False)
        Token.set_extension('country_capital')
        Token.set_extension('country_latlng')
        Token.set_extension('country_flag')
        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_country == True.
        Doc.set_extension('has_country', getter=self.has_country)
        Span.set_extension('has_country', getter=self.has_country)
    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        matches = self.matcher(doc)
        spans = []  # keep the spans for later so we can merge them afterwards
        for _, start, end in matches:
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Look the country record up once per match rather than three
            # times for every token of the entity
            country = self.countries[entity.text]
            # Set custom attribute on each token of the entity
            # Can be extended with other data returned by the API, like
            # currencies, country code, flag, calling code etc.
            for token in entity:
                token._.set('is_country', True)
                token._.set('country_capital', country['capital'])
                token._.set('country_latlng', country['latlng'])
                token._.set('country_flag', country['flag'])
            # Overwrite doc.ents and add entity – be careful not to replace!
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities – otherwise, it would cause mismatched
            # indices!
            span.merge()
        return doc  # don't forget to return the Doc!
    def has_country(self, tokens):
        """Getter for Doc and Span attributes. Returns True if one of the tokens
        is a country. Since the getter is only called when we access the
        attribute, we can refer to the Token's 'is_country' attribute here,
        which is already set in the processing step."""
        # generator expression lets any() stop at the first country token
        return any(t._.get('is_country') for t in tokens)
 # For simplicity, we start off with only the blank English Language class and
 # no model or pre-defined pipeline loaded.
 nlp = English()
 rest_countries = RESTCountriesComponent(nlp)  # initialise component
 nlp.add_pipe(rest_countries) # add it to the pipeline
 doc = nlp(u"Some text about Colombia and the Czech Republic")
 print('Pipeline', nlp.pipe_names)  # pipeline contains component name
 print('Doc has countries', doc._.has_country)  # Doc contains countries
 # only tokens flagged by the component carry the country data
 for token in doc:
    if token._.is_country:
        print(token.text, token._.country_capital, token._.country_latlng,
              token._.country_flag)  # country data
 print('Entities', [(e.text, e.label_) for e in doc.ents])  # all countries are entities
--- a/examples/pipeline/custom_component_entities.py
+++ b/examples/pipeline/custom_component_entities.py
@ -0,0 +1,85 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from spacy.lang.en import English
 from spacy.matcher import PhraseMatcher
 from spacy.tokens import Doc, Span, Token
 class TechCompanyRecognizer(object):
    """Pipeline component example for spaCy v2.0: marks company names from a
    given list (single or multi-word) as ORG entities and merges each match
    into a single token. Tokens get ._.is_tech_org, while Doc and Span get
    ._.has_tech_org."""
    name = 'tech_companies'  # component name, will show up in the pipeline
    def __init__(self, nlp, companies=tuple(), label='ORG'):
        """Set up the matcher and register the custom attributes.

        nlp: The shared Language instance; provides the vocab, the label ID
            and turns each company name into a Doc pattern.
        companies: Iterable of company names to recognise.
        label: Entity label assigned to matched companies.
        """
        # entity label ID, looked up in the shared string store
        self.label = nlp.vocab.strings[label]
        # one Doc pattern per company name for the PhraseMatcher
        org_docs = [nlp(org) for org in companies]
        phrase_matcher = PhraseMatcher(nlp.vocab)
        phrase_matcher.add('TECH_ORGS', None, *org_docs)
        self.matcher = phrase_matcher
        # Token attribute is a plain value with a default, because the
        # component overwrites it for matched tokens.
        Token.set_extension('is_tech_org', default=False)
        # Doc and Span attributes are computed on access from their tokens.
        Doc.set_extension('has_tech_org', getter=self.has_tech_org)
        Span.set_extension('has_tech_org', getter=self.has_tech_org)
    def __call__(self, doc):
        """Annotate matched companies on the Doc and return it, so that the
        next pipeline component can keep processing it.
        """
        to_merge = []  # merged only after all entities have been set
        for _, start, end in self.matcher(doc):
            org_span = Span(doc, start, end, label=self.label)
            to_merge.append(org_span)
            # flag every token that is part of the company name
            for org_token in org_span:
                org_token._.set('is_tech_org', True)
            # extend the existing entities rather than replacing them
            doc.ents = list(doc.ents) + [org_span]
        # Merging changes token indices, so it has to happen after the
        # entities were assigned.
        for org_span in to_merge:
            org_span.merge()
        return doc
    def has_tech_org(self, tokens):
        """Getter shared by Doc and Span: True if any contained token has
        'is_tech_org' set. It runs on attribute access, i.e. after the
        component has already flagged the tokens."""
        return any(tok._.get('is_tech_org') for tok in tokens)
 # For simplicity, we start off with only the blank English Language class and
 # no model or pre-defined pipeline loaded.
 nlp = English()
 companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple']  # etc.
 component = TechCompanyRecognizer(nlp, companies)  # initialise component
 nlp.add_pipe(component, last=True)  # add it to the pipeline as the last element
 # run the pipeline, including the custom component, on a sample text
 doc = nlp(u"Alphabet Inc. is the company behind Google.")
 print('Pipeline', nlp.pipe_names)  # pipeline contains component name
 print('Tokens', [t.text for t in doc])  # company names from the list are merged
 print('Doc has_tech_org', doc._.has_tech_org)  # Doc contains tech orgs
 print('Token 0 is_tech_org', doc[0]._.is_tech_org)  # "Alphabet Inc." is a tech org
 print('Token 1 is_tech_org', doc[1]._.is_tech_org)  # "is" is not
 print('Entities', [(e.text, e.label_) for e in doc.ents])  # all orgs are entities
--- a/spacy/language.py
+++ b/spacy/language.py
@ -226,7 +226,14 @@ class Language(object):
            >>> nlp.add_pipe(component, name='custom_name', last=True)
        """
        if name is None:
-            name = getattr(component, 'name', component.__name__)
+            if hasattr(component, 'name'):
                name = component.name
            elif hasattr(component, '__name__'):
                name = component.__name__
            elif hasattr(component, '__class__') and hasattr(component.__class__, '__name__'):
                name = component.__class__.__name__
            else:
                name = repr(component)
        if name in self.pipe_names:
            raise ValueError("'{}' already exists in pipeline.".format(name))
        if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2:
--- a/spacy/tests/test_underscore.py
+++ b/spacy/tests/test_underscore.py
@ -0,0 +1,53 @@
 from mock import Mock
 from ..tokens.underscore import Underscore
 def test_create_doc_underscore():
    """A Doc-level Underscore stores the doc and leaves both offsets unset."""
    fake_doc = Mock()
    fake_doc.doc = fake_doc  # a Doc is its own `.doc`
    underscore = Underscore(Underscore.doc_extensions, fake_doc)
    assert underscore._doc is fake_doc
    assert underscore._start is None
    assert underscore._end is None
 def test_doc_underscore_getattr_setattr():
    """A default-valued Doc extension reads its default, then the value that
    was written to it."""
    fake_doc = Mock()
    fake_doc.doc = fake_doc
    fake_doc.user_data = {}  # written values are stored here
    # (default, method, getter, setter)
    Underscore.doc_extensions['hello'] = (False, None, None, None)
    fake_doc._ = Underscore(Underscore.doc_extensions, fake_doc)
    assert fake_doc._.hello == False
    fake_doc._.hello = True
    assert fake_doc._.hello == True
 def test_create_span_underscore():
    """A Span-level Underscore keeps the span's doc and both offsets."""
    fake_span = Mock(doc=Mock(), start=0, end=2)
    underscore = Underscore(Underscore.span_extensions, fake_span,
                            start=fake_span.start, end=fake_span.end)
    assert underscore._doc is fake_span.doc
    assert underscore._start is fake_span.start
    assert underscore._end is fake_span.end
 def test_span_underscore_getter_setter():
    """Reads go through the registered getter and writes through the
    registered setter."""
    def read_hello(s):
        return (s.start, 'hi')
    def write_hello(s, value):
        setattr(s, 'start', value)
    fake_span = Mock(doc=Mock(), start=0, end=2)
    # (default, method, getter, setter)
    Underscore.span_extensions['hello'] = (None, None, read_hello, write_hello)
    fake_span._ = Underscore(Underscore.span_extensions, fake_span,
                             start=fake_span.start, end=fake_span.end)
    assert fake_span._.hello == (0, 'hi')
    fake_span._.hello = 1
    assert fake_span._.hello == (1, 'hi')
 def test_token_underscore_method():
    """A method extension is called with the token bound as first argument."""
    fake_token = Mock(doc=Mock(), idx=7, say_cheese=lambda token: 'cheese')
    # (default, method, getter, setter)
    extension = (None, fake_token.say_cheese, None, None)
    Underscore.token_extensions['hello'] = extension
    fake_token._ = Underscore(Underscore.token_extensions, fake_token,
                              start=fake_token.idx)
    assert fake_token._.hello() == 'cheese'
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -30,7 +30,7 @@ from ..util import normalize_slice
 from ..compat import is_config
 from .. import about
 from .. import util
-
+from .underscore import Underscore
 DEF PADDING = 5
@ -64,6 +64,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
    else:
        return Lexeme.get_struct_attr(token.lex, feat_name)
 def _get_chunker(lang):
    try:
        cls = util.get_lang_class(lang)
@ -73,6 +74,7 @@ def _get_chunker(lang):
        return None
    return cls.Defaults.syntax_iterators.get(u'noun_chunks')
 cdef class Doc:
    """A sequence of Token objects. Access sentences and named entities, export
    annotations to numpy arrays, losslessly serialize to compressed binary strings.
@ -87,6 +89,21 @@ cdef class Doc:
        >>> from spacy.tokens import Doc
        >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
    """
    @classmethod
    def set_extension(cls, name, default=None, method=None,
                      getter=None, setter=None):
        """Register a custom extension that becomes available as `Doc._.<name>`.

        name (unicode): Name of the extension.
        default: Default value, used when no getter or method is defined.
            If nothing else is given, the attribute defaults to None.
        method (callable): Exposed as a method, with the Doc bound as the
            first argument.
        getter (callable): Called with the Doc when the attribute is read.
        setter (callable): Called with the Doc and the value when the
            attribute is written.
        RAISES (ValueError): If more than one of a default value, a method
            and a getter/setter are given.
        """
        # A default value, a method and a getter/setter pair are alternative
        # kinds of extension. Passing none of them registers a plain attribute
        # defaulting to None, and a getter may come with a matching setter.
        # Raise explicitly instead of using `assert`, which is stripped when
        # Python runs with -O.
        has_accessor = getter is not None or setter is not None
        nr_kinds = sum((default is not None, method is not None, has_accessor))
        if nr_kinds > 1:
            raise ValueError(
                "Extension '{}' must define only one of: a default value, "
                "a method, or a getter/setter.".format(name))
        Underscore.doc_extensions[name] = (default, method, getter, setter)
    @classmethod
    def get_extension(cls, name):
        """Look up a registered Doc extension by name.

        name (unicode): Name of the extension.
        RETURNS (tuple): The `(default, method, getter, setter)` tuple, or
            None if no Doc extension with that name is registered.
        """
        registry = Underscore.doc_extensions
        return registry.get(name)
    @classmethod
    def has_extension(cls, name):
        """Check whether a Doc extension is registered.

        name (unicode): Name of the extension to check.
        RETURNS (bool): Whether the name is in the Doc extension registry.
        """
        registry = Underscore.doc_extensions
        return name in registry
    def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
        """Create a Doc object.
@ -159,6 +176,10 @@ cdef class Doc:
            self.is_tagged = True
            self.is_parsed = True
    @property
    def _(self):
        """Accessor for the custom extensions registered on the Doc. A new
        Underscore wrapper is created on every access; no start or end offset
        is passed for a Doc."""
        registry = Underscore.doc_extensions
        return Underscore(registry, self)
    def __getitem__(self, object i):
        """Get a `Token` or `Span` object.
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -17,10 +17,24 @@ from ..attrs cimport IS_PUNCT, IS_SPACE
 from ..lexeme cimport Lexeme
 from ..compat import is_config
 from .. import about
 from .underscore import Underscore
 cdef class Span:
    """A slice from a Doc object."""
    @classmethod
    def set_extension(cls, name, default=None, method=None,
                      getter=None, setter=None):
        """Register a custom extension that becomes available as
        `Span._.<name>`.

        name (unicode): Name of the extension.
        default: Default value, used when no getter or method is defined.
        method (callable): Exposed as a method, with the Span bound as the
            first argument.
        getter (callable): Called with the Span when the attribute is read.
        setter (callable): Called with the Span and the value when the
            attribute is written.
        """
        extension = (default, method, getter, setter)
        Underscore.span_extensions[name] = extension
    @classmethod
    def get_extension(cls, name):
        """Look up a registered Span extension by name.

        name (unicode): Name of the extension.
        RETURNS (tuple): The `(default, method, getter, setter)` tuple, or
            None if no Span extension with that name is registered.
        """
        registry = Underscore.span_extensions
        return registry.get(name)
    @classmethod
    def has_extension(cls, name):
        """Check whether a Span extension is registered.

        name (unicode): Name of the extension to check.
        RETURNS (bool): Whether the name is in the Span extension registry.
        """
        registry = Underscore.span_extensions
        return name in registry
    def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None,
                  vector_norm=None):
        """Create a `Span` object from the slice `doc[start : end]`.
@ -111,10 +125,14 @@ cdef class Span:
        for i in range(self.start, self.end):
            yield self.doc[i]
    @property
    def _(self):
        """Accessor for the custom extensions registered on the Span. A new
        Underscore wrapper is created on every access, built from the span's
        character offsets (start_char and end_char)."""
        registry = Underscore.span_extensions
        return Underscore(registry, self,
                          start=self.start_char, end=self.end_char)
    def as_doc(self):
        '''Create a Doc object view of the Span's data.
-        This is mostly useful for C-typed interfaces. 
+        This is mostly useful for C-typed interfaces.
        '''
        cdef Doc doc = Doc(self.doc.vocab)
        doc.length = self.end-self.start
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -20,10 +20,24 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST
 from ..attrs cimport LEMMA, POS, TAG, DEP
 from ..compat import is_config
 from .. import about
 from .underscore import Underscore
 cdef class Token:
    """An individual token – i.e. a word, punctuation symbol, whitespace, etc."""
    @classmethod
    def set_extension(cls, name, default=None, method=None,
                      getter=None, setter=None):
        """Register a custom extension that becomes available as
        `Token._.<name>`.

        name (unicode): Name of the extension.
        default: Default value, used when no getter or method is defined.
        method (callable): Exposed as a method, with the Token bound as the
            first argument.
        getter (callable): Called with the Token when the attribute is read.
        setter (callable): Called with the Token and the value when the
            attribute is written.
        """
        extension = (default, method, getter, setter)
        Underscore.token_extensions[name] = extension
    @classmethod
    def get_extension(cls, name):
        """Look up a registered Token extension by name.

        name (unicode): Name of the extension.
        RETURNS (tuple): The `(default, method, getter, setter)` tuple, or
            None if no Token extension with that name is registered.
        """
        # Token extensions are stored in their own registry by set_extension;
        # reading the Span registry here would miss them or return an
        # unrelated Span extension of the same name.
        return Underscore.token_extensions.get(name)
    @classmethod
    def has_extension(cls, name):
        """Check whether a Token extension is registered.

        name (unicode): Name of the extension to check.
        RETURNS (bool): Whether the name is in the Token extension registry.
        """
        # Check the Token registry that set_extension writes to, not the
        # Span registry.
        return name in Underscore.token_extensions
    def __cinit__(self, Vocab vocab, Doc doc, int offset):
        """Construct a `Token` object.
@ -87,6 +101,11 @@ cdef class Token:
        else:
            raise ValueError(op)
    @property
    def _(self):
        """Accessor for the custom extensions registered on the Token. A new
        Underscore wrapper is created on every access, built from the token's
        character offset (idx) as start and no end."""
        registry = Underscore.token_extensions
        return Underscore(registry, self, start=self.idx, end=None)
    cpdef bint check_flag(self, attr_id_t flag_id) except -1:
        """Check the value of a boolean flag.
@ -266,7 +285,7 @@ cdef class Token:
        def __get__(self):
            if 'vector_norm' in self.doc.user_token_hooks:
                return self.doc.user_token_hooks['vector_norm'](self)
-            vector = self.vector 
+            vector = self.vector
            return numpy.sqrt((vector ** 2).sum())
    property n_lefts:
--- a/spacy/tokens/underscore.py
+++ b/spacy/tokens/underscore.py
@ -0,0 +1,50 @@
 import functools
 class Underscore(object):
    """Proxy behind the `._` attribute. Resolves attribute reads and writes
    against a registry of `(default, method, getter, setter)` tuples and keeps
    plain values in the `user_data` dict of the owning doc."""
    # Class-level registries, one per object type, mapping extension name to
    # its (default, method, getter, setter) tuple.
    doc_extensions = {}
    span_extensions = {}
    token_extensions = {}
    def __init__(self, extensions, obj, start=None, end=None):
        """Wrap `obj` with the given extension registry.

        extensions (dict): Registry to resolve names against.
        obj: The wrapped object; must expose a `.doc` attribute.
        start: First part of the storage key; None for a doc.
        end: Second part of the storage key; None for a doc or token.
        """
        # Write through object.__setattr__, because this class's own
        # __setattr__ only accepts registered extension names.
        # Docs leave start and end as None, spans set both, tokens only set
        # start. That makes the keys built by _get_key distinct inside the
        # shared doc.user_data dict and allows one class for all three.
        store = object.__setattr__
        store(self, '_extensions', extensions)
        store(self, '_obj', obj)
        store(self, '_doc', obj.doc)
        store(self, '_start', start)
        store(self, '_end', end)
    def __getattr__(self, name):
        """Resolve a registered extension: a getter wins over a method, which
        wins over the stored value (falling back to the default). Raises
        AttributeError for unregistered names."""
        registry = self._extensions
        if name not in registry:
            raise AttributeError(name)
        default, method, getter, setter = registry[name]
        if getter is not None:
            return getter(self._obj)
        if method is not None:
            # bind the wrapped object as the method's first argument
            return functools.partial(method, self._obj)
        key = self._get_key(name)
        return self._doc.user_data.get(key, default)
    def __setattr__(self, name, value):
        """Write a registered extension, via its setter if one is defined and
        into doc.user_data otherwise. Raises AttributeError for unregistered
        names."""
        registry = self._extensions
        if name not in registry:
            raise AttributeError(name)
        default, method, getter, setter = registry[name]
        if setter is None:
            self._doc.user_data[self._get_key(name)] = value
            return None
        return setter(self._obj, value)
    def set(self, name, value):
        """Write the extension `name`; same as assigning the attribute."""
        return self.__setattr__(name, value)
    def get(self, name):
        """Read the extension `name` through __getattr__."""
        return self.__getattr__(name)
    def has(self, name):
        """Return whether `name` is in this object's extension registry."""
        return name in self._extensions
    def _get_key(self, name):
        """Build the doc.user_data key for `name` on the wrapped object."""
        return ('._.', name, self._start, self._end)
--- a/website/_includes/_mixins.jade
+++ b/website/_includes/_mixins.jade
@ -149,7 +149,7 @@ mixin code(label, language, prompt, height, icon, wrap)
 //- Code blocks to display old/new versions
-mixin code-compare()
+mixin code-wrapper()
    span.u-inline-block.u-padding-top.u-width-full
        block
--- a/website/api/doc.jade
+++ b/website/api/doc.jade
@ -138,6 +138,109 @@ p Get the number of tokens in the document.
        +cell int
        +cell The number of tokens in the document.
 +h(2, "set_extension") Doc.set_extension
    +tag classmethod
    +tag-new(2)
 p
    |  Define a custom attribute on the #[code Doc] which becomes available via
    |  #[code Doc._]. For details, see the documentation on
    |  #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes].
 +aside-code("Example").
    from spacy.tokens import Doc
    city_getter = lambda doc: any(city in doc.text for city in ('New York', 'Paris', 'Berlin'))
    Doc.set_extension('has_city', getter=city_getter)
    doc = nlp(u'I like New York')
    assert doc._.has_city
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code name]
        +cell unicode
        +cell
            |  Name of the attribute to set by the extension. For example,
            |  #[code 'my_attr'] will be available as #[code doc._.my_attr].
    +row
        +cell #[code default]
        +cell -
        +cell
            |  Optional default value of the attribute if no getter or method
            |  is defined.
    +row
        +cell #[code method]
        +cell callable
        +cell
            |  Set a custom method on the object, for example
            |  #[code doc._.compare(other_doc)].
    +row
        +cell #[code getter]
        +cell callable
        +cell
            |  Getter function that takes the object and returns an attribute
            |  value. Is called when the user accesses the #[code ._] attribute.
    +row
        +cell #[code setter]
        +cell callable
        +cell
            |  Setter function that takes the #[code Doc] and a value, and
            |  modifies the object. Is called when the user writes to the
            |  #[code Doc._] attribute.
 +h(2, "get_extension") Doc.get_extension
    +tag classmethod
    +tag-new(2)
 p
    |  Look up a previously registered extension by name. Returns a 4-tuple
    |  #[code.u-break (default, method, getter, setter)] if the extension is
    |  registered. Returns #[code None] otherwise.
 +aside-code("Example").
    from spacy.tokens import Doc
    Doc.set_extension('is_city', default=False)
    extension = Doc.get_extension('is_city')
    assert extension == (False, None, None, None)
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code name]
        +cell unicode
        +cell Name of the extension.
    +row("foot")
        +cell returns
        +cell tuple
        +cell
            |  A #[code.u-break (default, method, getter, setter)] tuple of the
            |  extension.
 +h(2, "has_extension") Doc.has_extension
    +tag classmethod
    +tag-new(2)
 p Check whether an extension has been registered on the #[code Doc] class.
 +aside-code("Example").
    from spacy.tokens import Doc
    Doc.set_extension('is_city', default=False)
    assert Doc.has_extension('is_city')
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code name]
        +cell unicode
        +cell Name of the extension to check.
    +row("foot")
        +cell returns
        +cell bool
        +cell Whether the extension has been registered.
 +h(2, "char_span") Doc.char_span
    +tag method
    +tag-new(2)
--- a/website/api/span.jade
+++ b/website/api/span.jade
@ -116,6 +116,109 @@ p Get the number of tokens in the span.
        +cell int
        +cell The number of tokens in the span.
 +h(2, "set_extension") Span.set_extension
    +tag classmethod
    +tag-new(2)
 p
    |  Define a custom attribute on the #[code Span] which becomes available via
    |  #[code Span._]. For details, see the documentation on
    |  #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes].
 +aside-code("Example").
    from spacy.tokens import Span
    city_getter = lambda span: any(city in span.text for city in ('New York', 'Paris', 'Berlin'))
    Span.set_extension('has_city', getter=city_getter)
    doc = nlp(u'I like New York in Autumn')
    assert doc[1:4]._.has_city
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code name]
        +cell unicode
        +cell
            |  Name of the attribute to set by the extension. For example,
            |  #[code 'my_attr'] will be available as #[code span._.my_attr].
    +row
        +cell #[code default]
        +cell -
        +cell
            |  Optional default value of the attribute if no getter or method
            |  is defined.
    +row
        +cell #[code method]
        +cell callable
        +cell
            |  Set a custom method on the object, for example
            |  #[code span._.compare(other_span)].
    +row
        +cell #[code getter]
        +cell callable
        +cell
            |  Getter function that takes the object and returns an attribute
            |  value. Is called when the user accesses the #[code ._] attribute.
    +row
        +cell #[code setter]
        +cell callable
        +cell
            |  Setter function that takes the #[code Span] and a value, and
            |  modifies the object. Is called when the user writes to the
            |  #[code Span._] attribute.
 +h(2, "get_extension") Span.get_extension
    +tag classmethod
    +tag-new(2)
 p
    |  Look up a previously registered extension by name. Returns a 4-tuple
    |  #[code.u-break (default, method, getter, setter)] if the extension is
    |  registered. Returns #[code None] otherwise.
 +aside-code("Example").
    from spacy.tokens import Span
    Span.set_extension('is_city', default=False)
    extension = Span.get_extension('is_city')
    assert extension == (False, None, None, None)
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code name]
        +cell unicode
        +cell Name of the extension.
    +row("foot")
        +cell returns
        +cell tuple
        +cell
            |  A #[code.u-break (default, method, getter, setter)] tuple of the
            |  extension.
 +h(2, "has_extension") Span.has_extension
    +tag classmethod
    +tag-new(2)
 p Check whether an extension has been registered on the #[code Span] class.
 +aside-code("Example").
    from spacy.tokens import Span
    Span.set_extension('is_city', default=False)
    assert Span.has_extension('is_city')
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code name]
        +cell unicode
        +cell Name of the extension to check.
    +row("foot")
        +cell returns
        +cell bool
        +cell Whether the extension has been registered.
 +h(2, "similarity") Span.similarity
    +tag method
    +tag-model("vectors")
--- a/website/api/token.jade
+++ b/website/api/token.jade
@ -51,6 +51,109 @@ p The number of unicode characters in the token, i.e. #[code token.text].
        +cell int
        +cell The number of unicode characters in the token.
 +h(2, "set_extension") Token.set_extension
    +tag classmethod
    +tag-new(2)
 p
    |  Define a custom attribute on the #[code Token] which becomes available
    |  via #[code Token._]. For details, see the documentation on
    |  #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes].
 +aside-code("Example").
    from spacy.tokens import Token
    fruit_getter = lambda token: token.text in ('apple', 'pear', 'banana')
    Token.set_extension('is_fruit', getter=fruit_getter)
    doc = nlp(u'I have an apple')
    assert doc[3]._.is_fruit
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code name]
        +cell unicode
        +cell
            |  Name of the attribute to set by the extension. For example,
            |  #[code 'my_attr'] will be available as #[code token._.my_attr].
    +row
        +cell #[code default]
        +cell -
        +cell
            |  Optional default value of the attribute if no getter or method
            |  is defined.
    +row
        +cell #[code method]
        +cell callable
        +cell
            |  Set a custom method on the object, for example
            |  #[code token._.compare(other_token)].
    +row
        +cell #[code getter]
        +cell callable
        +cell
            |  Getter function that takes the object and returns an attribute
            |  value. Is called when the user accesses the #[code ._] attribute.
    +row
        +cell #[code setter]
        +cell callable
        +cell
            |  Setter function that takes the #[code Token] and a value, and
            |  modifies the object. Is called when the user writes to the
            |  #[code Token._] attribute.
 +h(2, "get_extension") Token.get_extension
    +tag classmethod
    +tag-new(2)
 p
    |  Look up a previously registered extension by name. Returns a 4-tuple
    |  #[code.u-break (default, method, getter, setter)] if the extension is
    |  registered. Returns #[code None] otherwise.
 +aside-code("Example").
    from spacy.tokens import Token
    Token.set_extension('is_fruit', default=False)
    extension = Token.get_extension('is_fruit')
    assert extension == (False, None, None, None)
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code name]
        +cell unicode
        +cell Name of the extension.
    +row("foot")
        +cell returns
        +cell tuple
        +cell
            |  A #[code.u-break (default, method, getter, setter)] tuple of the
            |  extension.
 +h(2, "has_extension") Token.has_extension
    +tag classmethod
    +tag-new(2)
 p Check whether an extension has been registered on the #[code Token] class.
 +aside-code("Example").
    from spacy.tokens import Token
    Token.set_extension('is_fruit', default=False)
    assert Token.has_extension('is_fruit')
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code name]
        +cell unicode
        +cell Name of the extension to check.
    +row("foot")
        +cell returns
        +cell bool
        +cell Whether the extension has been registered.
 +h(2, "check_flag") Token.check_flag
    +tag method
--- a/website/usage/_data.json
+++ b/website/usage/_data.json
@ -105,9 +105,9 @@
        "menu": {
            "How Pipelines Work": "pipelines",
            "Custom Components": "custom-components",
            "Developing Extensions": "extensions",
            "Multi-threading": "multithreading",
-            "Serialization": "serialization",
+            "Serialization": "serialization"
            "Developing Extensions": "extensions"
        }
    },
@ -195,6 +195,7 @@
        "teaser": "Full code examples you can modify and run.",
        "next": "resources",
        "menu": {
            "Pipeline": "pipeline",
            "Matching": "matching",
            "Training": "training",
            "Deep Learning": "deep-learning"
--- a/website/usage/_processing-pipelines/_custom-components.jade
+++ b/website/usage/_processing-pipelines/_custom-components.jade
@ -1,12 +1,11 @@
 //- 💫 DOCS > USAGE > PROCESSING PIPELINES > CUSTOM COMPONENTS
 p
-    |  A component receives a #[code Doc] object and
+    |  A component receives a #[code Doc] object and can modify it – for example,
-    |  #[strong performs the actual processing] – for example, using the current
+    |  by using the current weights to make a prediction and set some annotation
-    |  weights to make a prediction and set some annotation on the document. By
+    |  on the document. By adding a component to the pipeline, you'll get access
-    |  adding a component to the pipeline, you'll get access to the #[code Doc]
+    |  to the #[code Doc] at any point #[strong during processing] – instead of
-    |  at any point #[strong during] processing – instead of only being able to
+    |  only being able to modify it afterwards.
    |  modify it afterwards.
 +aside-code("Example").
    def my_component(doc):
@ -27,10 +26,10 @@ p
 p
    |  Custom components can be added to the pipeline using the
    |  #[+api("language#add_pipe") #[code add_pipe]] method. Optionally, you
-    |  can either specify a component to add it before or after, tell spaCy
+    |  can either specify a component to add it #[strong before or after], tell
-    |  to add it first or last in the pipeline, or define a custom name.
+    |  spaCy to add it #[strong first or last] in the pipeline, or define a
-    |  If no name is set and no #[code name] attribute is present on your
+    |  #[strong custom name]. If no name is set and no #[code name] attribute
-    |  component, the function name, e.g. #[code component.__name__] is used.
+    |  is present on your component, the function name is used.
 +code("Adding pipeline components").
    def my_component(doc):
@ -67,7 +66,19 @@ p
    nlp.add_pipe(my_component, first=True)
 +h(3, "custom-components-attributes")
-    |  Setting attributes on the #[code Doc], #[code Span] and #[code Token]
+    |  Extension attributes on #[code Doc], #[code Span] and #[code Token]
    +tag-new(2)
 p
    |  As of v2.0, spaCy allows you to set any custom attributes and methods
    |  on the #[code Doc], #[code Span] and #[code Token], which become
    |  available as #[code Doc._], #[code Span._] and #[code Token._] – for
    |  example, #[code Token._.my_attr]. This lets you store additional
    |  information relevant to your application, add new features and
    |  functionality to spaCy, and implement your own models trained with other
    |  machine learning libraries. It also lets you take advantage of spaCy's
    |  data structures and the #[code Doc] object as the "single source of
    |  truth".
 +aside("Why ._?")
    |  Writing to a #[code ._] attribute instead of to the #[code Doc] directly
@ -78,9 +89,216 @@ p
    |  what's custom – for example, #[code doc.sentiment] is spaCy, while
    |  #[code doc._.sent_score] isn't.
-+under-construction
+p
    |  There are three main types of extensions, which can be defined using the
    |  #[+api("doc#set_extension") #[code Doc.set_extension]],
    |  #[+api("span#set_extension") #[code Span.set_extension]] and
    |  #[+api("token#set_extension") #[code Token.set_extension]] methods.
-+h(3, "custom-components-user-hooks") Other user hooks
+list("numbers")
    +item #[strong Attribute extensions].
        |  Set a default value for an attribute, which can be overwritten
        |  manually at any time. Attribute extensions work like "normal"
        |  variables and are the quickest way to store arbitrary information
        |  on a #[code Doc], #[code Span] or #[code Token].
        +code-wrapper
            +code.
                Doc.set_extension('hello', default=True)
                assert doc._.hello
                doc._.hello = False
    +item #[strong Property extensions].
        |  Define a getter and an optional setter function. If no setter is
        |  provided, the extension is immutable. Since the getter and setter
        |  functions are only called when you #[em retrieve] the attribute,
        |  you can also access values of previously added attribute extensions.
        |  For example, a #[code Doc] getter can average over #[code Token]
        |   attributes. For #[code Span] extensions, you'll almost always want
        |  to use a property – otherwise, you'd have to write to
        |  #[em every possible] #[code Span] in the #[code Doc] to set up the
        |  values correctly.
        +code-wrapper
            +code.
                Doc.set_extension('hello', getter=get_hello_value, setter=set_hello_value)
                assert doc._.hello
                doc._.hello = 'Hi!'
    +item #[strong Method extensions].
        |  Assign a function that becomes available as an object method. Method
        |  extensions are always immutable. For more details and implementation
        |  ideas, see
        |  #[+a("/usage/examples#custom-components-attr-methods") these examples].
        +code-wrapper
            +code.o-no-block.
                Doc.set_extension('hello', method=lambda doc, name: 'Hi {}!'.format(name))
                assert doc._.hello('Bob') == 'Hi Bob!'
 p
    |  Before you can access a custom extension, you need to register it using
    |  the #[code set_extension] method on the object you want
    |  to add it to, e.g. the #[code Doc]. Keep in mind that extensions are
    |  always #[strong added globally] and not just on a particular instance.
    |  If an attribute of the same name
    |  already exists, or if you're trying to access an attribute that hasn't
    |  been registered, spaCy will raise an #[code AttributeError].
 +code("Example").
    from spacy.tokens import Doc, Span, Token
    fruits = ['apple', 'pear', 'banana', 'orange', 'strawberry']
    is_fruit_getter = lambda token: token.text in fruits
    has_fruit_getter = lambda obj: any([t.text in fruits for t in obj])
    Token.set_extension('is_fruit', getter=is_fruit_getter)
    Doc.set_extension('has_fruit', getter=has_fruit_getter)
    Span.set_extension('has_fruit', getter=has_fruit_getter)
 +aside-code("Usage example").
    doc = nlp(u"I have an apple and a melon")
    assert doc[3]._.is_fruit      # get Token attributes
    assert not doc[0]._.is_fruit
    assert doc._.has_fruit        # get Doc attributes
    assert doc[1:4]._.has_fruit   # get Span attributes
 p
    |  Once you've registered your custom attribute, you can also use the
    |  built-in #[code set], #[code get] and #[code has] methods to modify and
    |  retrieve the attributes. This is especially useful if you want to pass in
    |  a string instead of calling #[code doc._.my_attr].
 +table(["Method", "Description", "Valid for", "Example"])
    +row
        +cell #[code ._.set()]
        +cell Set a value for an attribute.
        +cell Attributes, mutable properties.
        +cell #[code.u-break token._.set('my_attr', True)]
    +row
        +cell #[code ._.get()]
        +cell Get the value of an attribute.
        +cell Attributes, mutable properties, immutable properties, methods.
        +cell #[code.u-break my_attr = span._.get('my_attr')]
    +row
        +cell #[code ._.has()]
        +cell Check if an attribute exists.
        +cell Attributes, mutable properties, immutable properties, methods.
        +cell #[code.u-break doc._.has('my_attr')]
 +infobox("How the ._ is implemented")
    |  Extension definitions – the defaults, methods, getters and setters you
    |  pass in to #[code set_extension] – are stored in class attributes on the
    |  #[code Underscore] class. If you write to an extension attribute, e.g.
    |  #[code doc._.hello = True], the data is stored within the
    |  #[+api("doc#attributes") #[code Doc.user_data]] dictionary. To keep the
    |  underscore data separate from your other dictionary entries, the string
    |  #[code "._."] is placed before the name, in a tuple.
 +h(4, "component-example1") Example: Custom sentence segmentation logic
 p
    |  Let's say you want to implement custom logic to improve spaCy's sentence
    |  boundary detection. Currently, sentence segmentation is based on the
    |  dependency parse, which doesn't always produce ideal results. The custom
    |  logic should therefore be applied #[strong after] tokenization, but
    |  #[strong before] the dependency parsing – this way, the parser can also
    |  take advantage of the sentence boundaries.
 +code.
    def sbd_component(doc):
        for i, token in enumerate(doc[:-2]):
            # define sentence start if period + titlecase token
            if token.text == '.' and doc[i+1].is_title:
                doc[i+1].sent_start = True
        return doc
    nlp = spacy.load('en')
    nlp.add_pipe(sbd_component, before='parser')  # insert before the parser
 +h(4, "component-example2")
    |  Example: Pipeline component for entity matching and tagging with
    |  custom attributes
 p
    |  This example shows how to create a spaCy extension that takes a
    |  terminology list (in this case, single- and multi-word company names),
    |  matches the occurrences in a document, labels them as #[code ORG] entities,
    |  merges the tokens and sets custom #[code is_tech_org] and
    |  #[code has_tech_org] attributes. For efficient matching, the example uses
    |  the #[+api("phrasematcher") #[code PhraseMatcher]] which accepts
    |  #[code Doc] objects as match patterns and works well for large
    |  terminology lists. It also ensures your patterns will always match, even
    |  when you customise spaCy's tokenization rules. When you call #[code nlp]
    |  on a text, the custom pipeline component is applied to the #[code Doc].
 +github("spacy", "examples/pipeline/custom_component_entities.py", false, 500)
 p
    |  Wrapping this functionality in a
    |  pipeline component allows you to reuse the module with different
    |  settings, and have all pre-processing taken care of when you call
    |  #[code nlp] on your text and receive a #[code Doc] object.
 +h(4, "component-example3")
    |  Example: Pipeline component for GPE entities and country meta data via a
    |  REST API
 p
    |  This example shows the implementation of a pipeline component
    |  that fetches country meta data via the
    |  #[+a("https://restcountries.eu") REST Countries API], sets entity
    |  annotations for countries, merges entities into one token and
    |  sets custom attributes on the #[code Doc], #[code Span] and
    |  #[code Token] – for example, the capital, latitude/longitude coordinates
    |  and even the country flag.
 +github("spacy", "examples/pipeline/custom_component_countries_api.py", false, 500)
 p
    |  In this case, all data can be fetched on initialisation in one request.
    |  However, if you're working with text that contains incomplete country
    |  names, spelling mistakes or foreign-language versions, you could also
    |  implement a #[code like_country]-style getter function that makes a
    |  request to the search API endpoint and returns the best-matching
    |  result.
 +h(4, "custom-components-usage-ideas") Other usage ideas
 +list
    +item
        |  #[strong Adding new features and hooking in models]. For example,
        |  a sentiment analysis model, or your preferred solution for
        |  lemmatization or sentiment analysis. spaCy's built-in tagger,
        |  parser and entity recognizer respect annotations that were already
        |  set on the #[code Doc] in a previous step of the pipeline.
    +item
        |  #[strong Integrating other libraries and APIs]. For example, your
        |  pipeline component can write additional information and data
        |  directly to the #[code Doc] or #[code Token] as custom attributes,
        |  while making sure no information is lost in the process. This can
        |  be output generated by other libraries and models, or an external
        |  service with a REST API.
    +item
        |  #[strong Debugging and logging]. For example, a component which
        |  stores and/or exports relevant information about the current state
        |  of the processed document, and can be inserted at any point of your
        |  pipeline.
 +infobox("Developing third-party extensions")
    |  The new pipeline management and custom attributes finally make it easy
    |  to develop your own spaCy extensions and plugins and share them with
    |  others. Extensions can claim their own #[code ._] namespace and exist as
    |  standalone packages. If you're developing a tool or library and want to
    |  make it easy for others to use it with spaCy and add it to their
    |  pipeline, all you have to do is expose a function that takes a
    |  #[code Doc], modifies it and returns it. For more details and
    |  #[strong best practices], see the section on
    |  #[+a("#extensions") developing spaCy extensions].
 +h(3, "custom-components-user-hooks") User hooks
 p
    |  While it's generally recommended to use the #[code Doc._], #[code Span._]
--- a/website/usage/_processing-pipelines/_examples.jade
+++ b/website/usage/_processing-pipelines/_examples.jade
@ -1,126 +0,0 @@
 //- 💫 DOCS > USAGE > PROCESSING PIPELINES > EXAMPLES
 p
    |  To see real-world examples of pipeline factories and components in action,
    |  you can have a look at the source of spaCy's built-in components, e.g.
    |  the #[+api("tagger") #[code Tagger]], #[+api("parser") #[code Parser]] or
    |  #[+api("entityrecognizer") #[code EntityRecongnizer]].
 +h(3, "example1") Example: Custom sentence segmentation logic
 p
    |  Let's say you want to implement custom logic to improve spaCy's sentence
    |  boundary detection. Currently, sentence segmentation is based on the
    |  dependency parse, which doesn't always produce ideal results. The custom
    |  logic should therefore be applied #[strong after] tokenization, but
    |  #[strong before] the dependency parsing – this way, the parser can also
    |  take advantage of the sentence boundaries.
 +code.
    def sbd_component(doc):
        for i, token in enumerate(doc[:-2]):
            # define sentence start if period + titlecase token
            if token.text == '.' and doc[i+1].is_title:
                doc[i+1].sent_start = True
        return doc
 p
    |  In this case, we simply want to add the component to the existing
    |  pipeline of the English model. We can do this by inserting it at index 0
    |  of #[code nlp.pipeline]:
 +code.
    nlp = spacy.load('en')
    nlp.pipeline.insert(0, sbd_component)
 p
    |  When you call #[code nlp] on some text, spaCy will tokenize it to create
    |  a #[code Doc] object, and first call #[code sbd_component] on it, followed
    |  by the model's default pipeline.
 +h(3, "example2") Example: Sentiment model
 p
    |  Let's say you have trained your own document sentiment model on English
    |  text. After tokenization, you want spaCy to first execute the
    |  #[strong default tensorizer], followed by a custom
    |  #[strong sentiment component] that adds a #[code .sentiment]
    |  property to the #[code Doc], containing your model's sentiment precition.
 p
    |  Your component class will have a #[code from_disk()] method that spaCy
    |  calls to load the model data. When called, the component will compute
    |  the sentiment score, add it to the #[code Doc] and return the modified
    |  document. Optionally, the component can include an #[code update()] method
    |  to allow training the model.
 +code.
    import pickle
    from pathlib import Path
    class SentimentComponent(object):
        def __init__(self, vocab):
            self.weights = None
        def __call__(self, doc):
            doc.sentiment = sum(self.weights*doc.vector) # set sentiment property
            return doc
        def from_disk(self, path): # path = model path + factory ID ('sentiment')
            self.weights = pickle.load(Path(path) / 'weights.bin') # load weights
            return self
        def update(self, doc, gold): # update weights – allows training!
            prediction = sum(self.weights*doc.vector)
            self.weights -= 0.001*doc.vector*(prediction-gold.sentiment)
 p
    |  The factory will initialise the component with the #[code Vocab] object.
    |  To be able to add it to your model's pipeline as #[code 'sentiment'],
    |  it also needs to be registered via
    |  #[+api("spacy#set_factory") #[code set_factory()]].
 +code.
    def sentiment_factory(vocab):
        component = SentimentComponent(vocab) # initialise component
        return component
    spacy.set_factory('sentiment', sentiment_factory)
 p
    |  The above code should be #[strong shipped with your model]. You can use
    |  the #[+api("cli#package") #[code package]] command to create all required
    |  files and directories. The model package will include an
    |  #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) #[code __init__.py]]
    |  with a #[code load()] method, that will initialise the language class with
    |  the model's pipeline and call the #[code from_disk()] method to load
    |  the model data.
 p
    |  In the model package's meta.json, specify the language class and pipeline
    |  IDs:
 +code("meta.json (excerpt)", "json").
    {
        "name": "sentiment_model",
        "lang": "en",
        "version": "1.0.0",
        "spacy_version": "&gt;=2.0.0,&lt;3.0.0",
        "pipeline": ["tensorizer", "sentiment"]
    }
 p
    |  When you load your new model, spaCy will call the model's #[code load()]
    |  method. This will return a #[code Language] object with a pipeline
    |  containing the default tensorizer, and the sentiment component returned
    |  by your custom #[code "sentiment"] factory.
 +code.
    nlp = spacy.load('en_sentiment_model')
    doc = nlp(u'I love pizza')
    assert doc.sentiment
 +infobox("Saving and loading models")
    |  For more information and a detailed guide on how to package your model,
    |  see the documentation on
    |  #[+a("/usage/training#saving-loading") saving and loading models].
--- a/website/usage/_processing-pipelines/_extensions.jade
+++ b/website/usage/_processing-pipelines/_extensions.jade
@ -1,3 +1,110 @@
 //- 💫 DOCS > USAGE > PROCESSING PIPELINES > DEVELOPING EXTENSIONS
-+under-construction
+p
    |  We're very excited about all the new possibilities for community
    |  extensions and plugins in spaCy v2.0, and we can't wait to see what
    |  you build with it! To get you started, here are a few tips, tricks and
    |  best practices:
 +list
    +item
        |  Make sure to choose a #[strong descriptive and specific name] for
        |  your pipeline component class, and set it as its #[code name]
        |  attribute. Avoid names that are too common or likely to clash with
        |  built-in or a user's other custom components. While it's fine to call
        |  your package "spacy_my_extension", avoid component names including
        |  "spacy", since this can easily lead to confusion.
        +code-wrapper
            +code-new name = 'myapp_lemmatizer'
            +code-old name = 'lemmatizer'
    +item
        |  When writing to #[code Doc], #[code Token] or #[code Span] objects,
        |  #[strong use getter functions] wherever possible, and avoid setting
        |  values explicitly. Tokens and spans don't own any data themselves,
        |  so you should provide a function that allows them to compute the
        |  values instead of writing static properties to individual objects.
        +code-wrapper
            +code-new.
                is_fruit = lambda token: token.text in ('apple', 'orange')
                Token.set_extension('is_fruit', getter=is_fruit)
            +code-old.
                token._.set_extension('is_fruit', default=False)
                if token.text in ('apple', 'orange'):
                    token._.set('is_fruit', True)
    +item
        |  Always add your custom attributes to the #[strong global] #[code Doc],
        |  #[code Token] or #[code Span] objects, not a particular instance of
        |  them. Add the attributes #[strong as early as possible], e.g. in
        |  your extension's #[code __init__] method or in the global scope of
        |  your module. This means that in the case of namespace collisions,
        |  the user will see an error immediately, not just when they run their
        |  pipeline.
        +code-wrapper
            +code-new.
                from spacy.tokens import Doc
                def __init__(attr='my_attr'):
                    Doc.set_extension(attr, getter=self.get_doc_attr)
            +code-old.
                def __call__(doc):
                    doc.set_extension('my_attr', getter=self.get_doc_attr)
    +item
        |  If your extension is setting properties on the #[code Doc],
        |  #[code Token] or #[code Span], include an option to
        |  #[strong let the user change those attribute names]. This makes
        |  it easier to avoid namespace collisions and accommodate users with
        |  different naming preferences. We recommend adding an #[code attrs]
        |  argument to the #[code __init__] method of your class so you can
        |  write the names to class attributes and reuse them across your
        |  component.
        +code-wrapper
            +code-new Doc.set_extension(self.doc_attr, default='some value')
            +code-old Doc.set_extension('my_doc_attr', default='some value')
    +item
        |  Ideally, extensions should be #[strong standalone packages] with
        |  spaCy and optionally, other packages specified as a dependency. They
        |  can freely assign to their own #[code ._] namespace, but should stick
        |  to that. If your extension's only job is to provide a better
        |  #[code .similarity] implementation, and your docs state this
        |  explicitly, there's no problem with writing to the
        |  #[+a("#custom-components-user-hooks") #[code user_hooks]], and
        |  overwriting spaCy's built-in method. However, a third-party
        |  extension should #[strong never silently overwrite built-ins], or
        |  attributes set by other extensions.
    +item
        |  If you're looking to publish a model that depends on a custom
        |  pipeline component, you can either #[strong require it] in the model
        |  package's dependencies, or – if the component is specific and
        |  lightweight – choose to #[strong ship it with your model package]
        |  and add it to the #[code Language] instance returned by the
        |  model's #[code load()] method. For examples of this, check out the
        |  implementations of spaCy's
        |  #[+api("util#load_model_from_init_py") #[code load_model_from_init_py()]]
        |  and  #[+api("util#load_model_from_path") #[code load_model_from_path()]]
        |  utility functions.
        +code-wrapper
            +code-new.
                nlp.add_pipe(my_custom_component)
                return nlp.from_disk(model_path)
    +item
        |  Once you're ready to share your extension with others, make sure to
        |  #[strong add docs and installation instructions] (you can
        |  always link to this page for more info). Make it easy for others to
        |  install and use your extension, for example by uploading it to
        |  #[+a("https://pypi.python.org") PyPI]. If you're sharing your code on
        |  GitHub, don't forget to tag it
        |  with #[+a("https://github.com/search?q=topic%3Aspacy") #[code spacy]]
        |  and #[+a("https://github.com/search?q=topic%3Aspacy-pipeline") #[code spacy-pipeline]]
        |  to help people find it. If you post it on Twitter, feel free to tag
        |  #[+a("https://twitter.com/" + SOCIAL.twitter) @#{SOCIAL.twitter}]
        |  so we can check it out.
--- a/website/usage/_processing-pipelines/_serialization.jade
+++ b/website/usage/_processing-pipelines/_serialization.jade
@ -21,7 +21,7 @@ p
 +code.
    import spacy
-    from spacy.tokens.span import Span
+    from spacy.tokens import Span
    text = u'Netflix is hiring a new VP of global policy'
--- a/website/usage/_spacy-101/_lightning-tour.jade
+++ b/website/usage/_spacy-101/_lightning-tour.jade
@ -175,7 +175,7 @@ p
 +code.
    import spacy
-    from spacy.tokens.doc import Doc
+    from spacy.tokens import Doc
    from spacy.vocab import Vocab
    nlp = spacy.load('en')
--- a/website/usage/_visualizers/_html.jade
+++ b/website/usage/_visualizers/_html.jade
@ -61,7 +61,7 @@ p
        output_path.open('w', encoding='utf-8').write(svg)
 p
-    |  The above code will generate the dependency visualizations and them to
+    |  The above code will generate the dependency visualizations as
    |  two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg].
--- a/website/usage/examples.jade
+++ b/website/usage/examples.jade
@ -2,6 +2,44 @@
 include ../_includes/_mixins
 +section("pipeline")
    +h(3, "custom-components-entities") Custom pipeline components and attribute extensions
        +tag-new(2)
    p
        |  This example shows the implementation of a pipeline component
        |  that sets entity annotations based on a list of single or
        |  multiple-word company names, merges entities into one token and
        |  sets custom attributes on the #[code Doc], #[code Span] and
        |  #[code Token].
    +github("spacy", "examples/pipeline/custom_component_entities.py")
    +h(3, "custom-components-api")
        |  Custom pipeline components and attribute extensions via a REST API
        +tag-new(2)
    p
        |  This example shows the implementation of a pipeline component
        |  that fetches country meta data via the
        |  #[+a("https://restcountries.eu") REST Countries API], sets entity
        |  annotations for countries, merges entities into one token and
        |  sets custom attributes on the #[code Doc], #[code Span] and
        |  #[code Token] – for example, the capital, latitude/longitude
        |  coordinates and the country flag.
    +github("spacy", "examples/pipeline/custom_component_countries_api.py")
    +h(3, "custom-components-attr-methods") Custom method extensions
        +tag-new(2)
    p
        |  A collection of snippets showing examples of extensions adding
        |  custom methods to the #[code Doc], #[code Token] and
        |  #[code Span].
    +github("spacy", "examples/pipeline/custom_attr_methods.py")
 +section("matching")
    +h(3, "matcher") Using spaCy's rule-based matcher
--- a/website/usage/processing-pipelines.jade
+++ b/website/usage/processing-pipelines.jade
@ -12,6 +12,10 @@ include _spacy-101/_pipelines
    +h(2, "custom-components") Creating custom pipeline components
    include _processing-pipelines/_custom-components
 +section("extensions")
    +h(2, "extensions") Developing spaCy extensions
    include _processing-pipelines/_extensions
 +section("multithreading")
    +h(2, "multithreading") Multi-threading
    include _processing-pipelines/_multithreading
@ -19,7 +23,3 @@ include _spacy-101/_pipelines
 +section("serialization")
    +h(2, "serialization") Serialization
    include _processing-pipelines/_serialization
 +section("extensions")
    +h(2, "extensions") Developing spaCy extensions
    include _processing-pipelines/_extensions
--- a/website/usage/v2.jade
+++ b/website/usage/v2.jade
@ -102,30 +102,36 @@ p
    +h(3, "features-pipelines") Improved processing pipelines
    +aside-code("Example").
-        # Modify an existing pipeline
+        # Set custom attributes
-        nlp = spacy.load('en')
+        Doc.set_extension('my_attr', default=False)
-        nlp.pipeline.append(my_component)
+        Token.set_extension('my_attr', getter=my_token_getter)
        assert doc._.my_attr, token._.my_attr
-        # Register a factory to create a component
+        # Add components to the pipeline
-        spacy.set_factory('my_factory', my_factory)
+        my_component = lambda doc: doc
-        nlp = Language(pipeline=['my_factory', mycomponent])
+        nlp.add_pipe(my_component)
    p
        |  It's now much easier to #[strong customise the pipeline] with your own
-        |  components, functions that receive a #[code Doc] object, modify and
+        |  components: functions that receive a #[code Doc] object, modify and
-        |  return it. If your component is stateful, you can define and register a
+        |  return it. Extensions let you write any
-        |  factory which receives the shared #[code Vocab] object and returns a
+        |  #[strong attributes, properties and methods] to the #[code Doc],
-        |  component. spaCy's default components can be added to your pipeline by
+        |  #[code Token] and #[code Span]. You can add data, implement new
-        |  using their string IDs. This way, you won't have to worry about finding
+        |  features, integrate other libraries with spaCy or plug in your own
-        |  and implementing them – simply add #[code "tagger"] to the pipeline,
+        |  machine learning models.
        |  and spaCy will know what to do.
    +image
        include ../assets/img/pipeline.svg
    +infobox
-        |  #[+label-inline API:] #[+api("language") #[code Language]]
+        |  #[+label-inline API:] #[+api("language") #[code Language]],
-        |  #[+label-inline Usage:] #[+a("/usage/language-processing-pipeline") Processing text]
+        |  #[+api("doc#set_extension") #[code Doc.set_extension]],
        |  #[+api("span#set_extension") #[code Span.set_extension]],
        |  #[+api("token#set_extension") #[code Token.set_extension]]
        |  #[+label-inline Usage:]
        |  #[+a("/usage/processing-pipelines") Processing pipelines]
        |  #[+label-inline Code:]
        |  #[+src("/usage/examples#section-pipeline") Pipeline examples]
    +h(3, "features-text-classification") Text classification
@ -478,15 +484,16 @@ p
    p
        |  If you've been using custom pipeline components, check out the new
        |  guide on #[+a("/usage/language-processing-pipelines") processing pipelines].
-        |  Appending functions to the pipeline still works – but you might be able
+        |  Appending functions to the pipeline still works – but the
-        |  to make this more convenient by registering "component factories".
+        |  #[+api("language#add_pipe") #[code add_pipe]] method now makes this
-        |  Components of the processing pipeline can now be disabled by passing a
+        |  much more convenient. Components of the processing pipeline can now
-        |  list of their names to the #[code disable] keyword argument on loading
+        |  be disabled by passing a list of their names to the #[code disable]
-        |  or processing.
+        |  keyword argument on load, or by simply removing them from the
        |  pipeline altogether.
    +code-new.
        nlp = spacy.load('en', disable=['tagger', 'ner'])
-        doc = nlp(u"I don't want parsed", disable=['parser'])
+        nlp.remove_pipe('parser')
    +code-old.
        nlp = spacy.load('en', tagger=False, entity=False)
        doc = nlp(u"I don't want parsed", parse=False)