Merge pull request #1408 from explosion/feature/dot-underscore

💫 Custom attributes via Doc._, Token._ and Span._
Ines Montani 2017-10-11 18:35:56 +02:00 committed by GitHub
commit 37aa523a8e
23 changed files with 1142 additions and 175 deletions

View File

@ -0,0 +1,52 @@
# coding: utf-8
"""This example contains several snippets of methods that can be set via custom
Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like
they're "bound" to the object and are partially applied, i.e. the object
they're called on is passed in as the first argument."""
from __future__ import unicode_literals

from spacy.lang.en import English
from spacy.tokens import Doc, Span
from spacy import displacy
from pathlib import Path


def to_html(doc, output='/tmp', style='dep'):
    """Doc method extension for saving the current state as a displaCy
    visualization.
    """
    # generate filename from first six non-punct tokens
    file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html'
    output_path = Path(output) / file_name
    html = displacy.render(doc, style=style, page=True)  # render markup
    output_path.open('w', encoding='utf-8').write(html)  # save to file
    print('Saved HTML to {}'.format(output_path))


Doc.set_extension('to_html', method=to_html)

nlp = English()
doc = nlp(u"This is a sentence about Apple.")
# add entity manually for demo purposes, to make it work without a model
doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])]
doc._.to_html(style='ent')


def overlap_tokens(doc, other_doc):
    """Get the tokens from the original Doc that are also in the comparison Doc.
    """
    overlap = []
    other_tokens = [token.text for token in other_doc]
    for token in doc:
        if token.text in other_tokens:
            overlap.append(token)
    return overlap


Doc.set_extension('overlap', method=overlap_tokens)

nlp = English()
doc1 = nlp(u"Peach emoji is where it has always been.")
doc2 = nlp(u"Peach is the superior emoji.")
tokens = doc1._.overlap(doc2)
print(tokens)
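As the docstring above notes, method extensions are partially applied: the object is bound as the first argument, and callers only pass the rest. A minimal sketch of that behaviour (assuming spaCy v2.0; the 'greet' attribute is hypothetical):

from spacy.lang.en import English
from spacy.tokens import Doc

def greet(doc, name):
    # `doc` is bound automatically; callers only supply `name`
    return 'Hi {}! This doc has {} tokens.'.format(name, len(doc))

Doc.set_extension('greet', method=greet)

nlp = English()
doc = nlp(u'A short sentence.')
assert doc._.greet('Bob') == 'Hi Bob! This doc has 4 tokens.'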

View File

@ -0,0 +1,108 @@
# coding: utf-8
from __future__ import unicode_literals

import requests

from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span, Token


class RESTCountriesComponent(object):
    """Example of a spaCy v2.0 pipeline component that requests all countries
    via the REST Countries API, merges country names into one token, assigns
    entity labels and sets attributes on country tokens, e.g. the capital and
    lat/lng coordinates. Can be extended with more details from the API.

    REST Countries API: https://restcountries.eu
    API License: Mozilla Public License MPL 2.0
    """
    name = 'rest_countries'  # component name, will show up in the pipeline

    def __init__(self, nlp, label='GPE'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        # Make request once on initialisation and store the data
        r = requests.get('https://restcountries.eu/rest/v2/all')
        r.raise_for_status()  # make sure requests raises an error if it fails
        countries = r.json()

        # Convert API response to dict keyed by country name for easy lookup.
        # This could also be extended using the alternative and foreign
        # language names provided by the API.
        self.countries = {c['name']: c for c in countries}
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher with Doc patterns for each country name
        patterns = [nlp(c) for c in self.countries.keys()]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('COUNTRIES', None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        # If no default value is set, it defaults to None.
        Token.set_extension('is_country', default=False)
        Token.set_extension('country_capital')
        Token.set_extension('country_latlng')
        Token.set_extension('country_flag')

        # Register attributes on Doc and Span via a getter that checks if one
        # of the contained tokens is set to is_country == True.
        Doc.set_extension('has_country', getter=self.has_country)
        Span.set_extension('has_country', getter=self.has_country)

    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if
        matches are found. Return the Doc, so it can be processed by the next
        component in the pipeline, if available.
        """
        matches = self.matcher(doc)
        spans = []  # keep the spans for later so we can merge them afterwards
        for _, start, end in matches:
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Set custom attributes on each token of the entity. Can be
            # extended with other data returned by the API, like currencies,
            # country code, calling code etc.
            for token in entity:
                token._.set('is_country', True)
                token._.set('country_capital', self.countries[entity.text]['capital'])
                token._.set('country_latlng', self.countries[entity.text]['latlng'])
                token._.set('country_flag', self.countries[entity.text]['flag'])
            # Overwrite doc.ents and add the entity; be careful not to replace!
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            # Iterate over all spans and merge them into one token. This is
            # done after setting the entities, otherwise it would cause
            # mismatched indices!
            span.merge()
        return doc  # don't forget to return the Doc!

    def has_country(self, tokens):
        """Getter for Doc and Span attributes. Returns True if one of the
        tokens is a country. Since the getter is only called when we access
        the attribute, we can refer to the Token's 'is_country' attribute
        here, which is already set in the processing step."""
        return any([t._.get('is_country') for t in tokens])


# For simplicity, we start off with only the blank English Language class and
# no model or pre-defined pipeline loaded.
nlp = English()
rest_countries = RESTCountriesComponent(nlp)  # initialise component
nlp.add_pipe(rest_countries)  # add it to the pipeline
doc = nlp(u"Some text about Colombia and the Czech Republic")
print('Pipeline', nlp.pipe_names)  # pipeline contains component name
print('Doc has countries', doc._.has_country)  # Doc contains countries
for token in doc:
    if token._.is_country:
        print(token.text, token._.country_capital, token._.country_latlng,
              token._.country_flag)  # country data
print('Entities', [(e.text, e.label_) for e in doc.ents])  # all countries are entities

View File

@ -0,0 +1,85 @@
# coding: utf-8
from __future__ import unicode_literals

from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span, Token


class TechCompanyRecognizer(object):
    """Example of a spaCy v2.0 pipeline component that sets entity annotations
    based on a list of single or multiple-word company names. Companies are
    labelled as ORG and their spans are merged into one token. Additionally,
    ._.has_tech_org and ._.is_tech_org is set on the Doc/Span and Token
    respectively."""
    name = 'tech_companies'  # component name, will show up in the pipeline

    def __init__(self, nlp, companies=tuple(), label='ORG'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher; it can now take Doc objects as patterns,
        # so even if the list of companies is long, it's very efficient
        patterns = [nlp(org) for org in companies]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('TECH_ORGS', None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        Token.set_extension('is_tech_org', default=False)

        # Register attributes on Doc and Span via a getter that checks if one
        # of the contained tokens is set to is_tech_org == True.
        Doc.set_extension('has_tech_org', getter=self.has_tech_org)
        Span.set_extension('has_tech_org', getter=self.has_tech_org)

    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if
        matches are found. Return the Doc, so it can be processed by the next
        component in the pipeline, if available.
        """
        matches = self.matcher(doc)
        spans = []  # keep the spans for later so we can merge them afterwards
        for _, start, end in matches:
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Set custom attribute on each token of the entity
            for token in entity:
                token._.set('is_tech_org', True)
            # Overwrite doc.ents and add the entity; be careful not to replace!
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            # Iterate over all spans and merge them into one token. This is
            # done after setting the entities, otherwise it would cause
            # mismatched indices!
            span.merge()
        return doc  # don't forget to return the Doc!

    def has_tech_org(self, tokens):
        """Getter for Doc and Span attributes. Returns True if one of the
        tokens is a tech org. Since the getter is only called when we access
        the attribute, we can refer to the Token's 'is_tech_org' attribute
        here, which is already set in the processing step."""
        return any([t._.get('is_tech_org') for t in tokens])


# For simplicity, we start off with only the blank English Language class and
# no model or pre-defined pipeline loaded.
nlp = English()
companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple']  # etc.
component = TechCompanyRecognizer(nlp, companies)  # initialise component
nlp.add_pipe(component, last=True)  # add it to the pipeline as the last element
doc = nlp(u"Alphabet Inc. is the company behind Google.")
print('Pipeline', nlp.pipe_names)  # pipeline contains component name
print('Tokens', [t.text for t in doc])  # company names from the list are merged
print('Doc has_tech_org', doc._.has_tech_org)  # Doc contains tech orgs
print('Token 0 is_tech_org', doc[0]._.is_tech_org)  # "Alphabet Inc." is a tech org
print('Token 1 is_tech_org', doc[1]._.is_tech_org)  # "is" is not
print('Entities', [(e.text, e.label_) for e in doc.ents])  # all orgs are entities

View File

@ -226,7 +226,14 @@ class Language(object):
        >>> nlp.add_pipe(component, name='custom_name', last=True)
        """
        if name is None:
-           name = getattr(component, 'name', component.__name__)
            if hasattr(component, 'name'):
                name = component.name
            elif hasattr(component, '__name__'):
                name = component.__name__
            elif hasattr(component, '__class__') and hasattr(component.__class__, '__name__'):
                name = component.__class__.__name__
            else:
                name = repr(component)
        if name in self.pipe_names:
            raise ValueError("'{}' already exists in pipeline.".format(name))
        if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2:
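A quick sketch of how this name resolution behaves in practice (assuming spaCy v2.0; the component bodies are toy examples):

from spacy.lang.en import English

def my_func_component(doc):
    return doc  # plain function: name falls back to __name__

class MyComponent(object):
    name = 'my_named_component'  # an explicit name attribute wins
    def __call__(self, doc):
        return doc

nlp = English()
nlp.add_pipe(my_func_component)
nlp.add_pipe(MyComponent())
assert nlp.pipe_names == ['my_func_component', 'my_named_component']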

View File

@ -0,0 +1,53 @@
from mock import Mock

from ..tokens.underscore import Underscore


def test_create_doc_underscore():
    doc = Mock()
    doc.doc = doc
    uscore = Underscore(Underscore.doc_extensions, doc)
    assert uscore._doc is doc
    assert uscore._start is None
    assert uscore._end is None


def test_doc_underscore_getattr_setattr():
    doc = Mock()
    doc.doc = doc
    doc.user_data = {}
    Underscore.doc_extensions['hello'] = (False, None, None, None)
    doc._ = Underscore(Underscore.doc_extensions, doc)
    assert doc._.hello == False
    doc._.hello = True
    assert doc._.hello == True


def test_create_span_underscore():
    span = Mock(doc=Mock(), start=0, end=2)
    uscore = Underscore(Underscore.span_extensions, span,
                        start=span.start, end=span.end)
    assert uscore._doc is span.doc
    assert uscore._start is span.start
    assert uscore._end is span.end


def test_span_underscore_getter_setter():
    span = Mock(doc=Mock(), start=0, end=2)
    Underscore.span_extensions['hello'] = (None, None,
                                           lambda s: (s.start, 'hi'),
                                           lambda s, value: setattr(s, 'start',
                                                                    value))
    span._ = Underscore(Underscore.span_extensions, span,
                        start=span.start, end=span.end)
    assert span._.hello == (0, 'hi')
    span._.hello = 1
    assert span._.hello == (1, 'hi')


def test_token_underscore_method():
    token = Mock(doc=Mock(), idx=7, say_cheese=lambda token: 'cheese')
    Underscore.token_extensions['hello'] = (None, token.say_cheese,
                                            None, None)
    token._ = Underscore(Underscore.token_extensions, token, start=token.idx)
    assert token._.hello() == 'cheese'

View File

@ -30,7 +30,7 @@ from ..util import normalize_slice
from ..compat import is_config
from .. import about
from .. import util
from .underscore import Underscore

DEF PADDING = 5

@ -64,6 +64,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
    else:
        return Lexeme.get_struct_attr(token.lex, feat_name)


def _get_chunker(lang):
    try:
        cls = util.get_lang_class(lang)

@ -73,6 +74,7 @@ def _get_chunker(lang):
        return None
    return cls.Defaults.syntax_iterators.get(u'noun_chunks')


cdef class Doc:
    """A sequence of Token objects. Access sentences and named entities, export
    annotations to numpy arrays, losslessly serialize to compressed binary strings.

@ -87,6 +89,21 @@ cdef class Doc:
    >>> from spacy.tokens import Doc
    >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
    """
    @classmethod
    def set_extension(cls, name, default=None, method=None,
                      getter=None, setter=None):
        nr_defined = sum(t is not None for t in (default, getter, setter, method))
        assert nr_defined == 1
        Underscore.doc_extensions[name] = (default, method, getter, setter)

    @classmethod
    def get_extension(cls, name):
        return Underscore.doc_extensions.get(name)

    @classmethod
    def has_extension(cls, name):
        return name in Underscore.doc_extensions

    def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
        """Create a Doc object.

@ -159,6 +176,10 @@ cdef class Doc:
        self.is_tagged = True
        self.is_parsed = True

    @property
    def _(self):
        return Underscore(Underscore.doc_extensions, self)

    def __getitem__(self, object i):
        """Get a `Token` or `Span` object.

View File

@ -17,10 +17,24 @@ from ..attrs cimport IS_PUNCT, IS_SPACE
from ..lexeme cimport Lexeme
from ..compat import is_config
from .. import about
from .underscore import Underscore


cdef class Span:
    """A slice from a Doc object."""
    @classmethod
    def set_extension(cls, name, default=None, method=None,
                      getter=None, setter=None):
        Underscore.span_extensions[name] = (default, method, getter, setter)

    @classmethod
    def get_extension(cls, name):
        return Underscore.span_extensions.get(name)

    @classmethod
    def has_extension(cls, name):
        return name in Underscore.span_extensions

    def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None,
                  vector_norm=None):
        """Create a `Span` object from the slice `doc[start : end]`.

@ -111,6 +125,10 @@ cdef class Span:
        for i in range(self.start, self.end):
            yield self.doc[i]

    @property
    def _(self):
        return Underscore(Underscore.span_extensions, self,
                          start=self.start_char, end=self.end_char)

    def as_doc(self):
        '''Create a Doc object view of the Span's data.

View File

@ -20,10 +20,24 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST
from ..attrs cimport LEMMA, POS, TAG, DEP
from ..compat import is_config
from .. import about
from .underscore import Underscore


cdef class Token:
    """An individual token, i.e. a word, punctuation symbol, whitespace, etc."""
    @classmethod
    def set_extension(cls, name, default=None, method=None,
                      getter=None, setter=None):
        Underscore.token_extensions[name] = (default, method, getter, setter)

    @classmethod
    def get_extension(cls, name):
        return Underscore.token_extensions.get(name)

    @classmethod
    def has_extension(cls, name):
        return name in Underscore.token_extensions

    def __cinit__(self, Vocab vocab, Doc doc, int offset):
        """Construct a `Token` object.

@ -87,6 +101,11 @@ cdef class Token:
        else:
            raise ValueError(op)

    @property
    def _(self):
        return Underscore(Underscore.token_extensions, self,
                          start=self.idx, end=None)

    cpdef bint check_flag(self, attr_id_t flag_id) except -1:
        """Check the value of a boolean flag.

View File

@ -0,0 +1,50 @@
import functools


class Underscore(object):
    doc_extensions = {}
    span_extensions = {}
    token_extensions = {}

    def __init__(self, extensions, obj, start=None, end=None):
        object.__setattr__(self, '_extensions', extensions)
        object.__setattr__(self, '_obj', obj)
        # Assumption is that for doc values, _start and _end will both be None
        # Span will set non-None values for _start and _end
        # Token will have _start be non-None, _end be None
        # This lets us key everything into the doc.user_data dictionary
        # (see _get_key), and lets us use a single Underscore class.
        object.__setattr__(self, '_doc', obj.doc)
        object.__setattr__(self, '_start', start)
        object.__setattr__(self, '_end', end)

    def __getattr__(self, name):
        if name not in self._extensions:
            raise AttributeError(name)
        default, method, getter, setter = self._extensions[name]
        if getter is not None:
            return getter(self._obj)
        elif method is not None:
            return functools.partial(method, self._obj)
        else:
            return self._doc.user_data.get(self._get_key(name), default)

    def __setattr__(self, name, value):
        if name not in self._extensions:
            raise AttributeError(name)
        default, method, getter, setter = self._extensions[name]
        if setter is not None:
            return setter(self._obj, value)
        else:
            self._doc.user_data[self._get_key(name)] = value

    def set(self, name, value):
        return self.__setattr__(name, value)

    def get(self, name):
        return self.__getattr__(name)

    def has(self, name):
        return name in self._extensions

    def _get_key(self, name):
        return ('._.', name, self._start, self._end)
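To see how the pieces fit together, here's a minimal sketch (assuming spaCy v2.0; the attribute name is hypothetical). Values written through ._ end up in doc.user_data under the tuple key built by _get_key, with _start and _end both None for a Doc:

from spacy.lang.en import English
from spacy.tokens import Doc

Doc.set_extension('sentiment_score', default=0.0)

nlp = English()
doc = nlp(u'An example sentence.')
doc._.sentiment_score = 0.9
# Doc-level extensions use (None, None) as the start/end part of the key
assert doc.user_data[('._.', 'sentiment_score', None, None)] == 0.9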

View File

@ -149,7 +149,7 @@ mixin code(label, language, prompt, height, icon, wrap)
//- Code blocks to display old/new versions

-mixin code-compare()
mixin code-wrapper()
    span.u-inline-block.u-padding-top.u-width-full
        block

View File

@ -138,6 +138,109 @@ p Get the number of tokens in the document.
+cell int
+cell The number of tokens in the document.
+h(2, "set_extension") Doc.set_extension
+tag classmethod
+tag-new(2)
p
| Define a custom attribute on the #[code Doc] which becomes available via
| #[code Doc._]. For details, see the documentation on
| #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes].
+aside-code("Example").
from spacy.tokens import Doc
city_getter = lambda doc: any(city in doc.text for city in ('New York', 'Paris', 'Berlin'))
Doc.set_extension('has_city', getter=city_getter)
doc = nlp(u'I like New York')
assert doc._.has_city
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell
| Name of the attribute to set by the extension. For example,
| #[code 'my_attr'] will be available as #[code doc._.my_attr].
+row
+cell #[code default]
+cell -
+cell
| Optional default value of the attribute if no getter or method
| is defined.
+row
+cell #[code method]
+cell callable
+cell
| Set a custom method on the object, for example
| #[code doc._.compare(other_doc)].
+row
+cell #[code getter]
+cell callable
+cell
| Getter function that takes the object and returns an attribute
| value. Is called when the user accesses the #[code ._] attribute.
+row
+cell #[code setter]
+cell callable
+cell
| Setter function that takes the #[code Doc] and a value, and
| modifies the object. Is called when the user writes to the
| #[code Doc._] attribute.
+h(2, "get_extension") Doc.get_extension
+tag classmethod
+tag-new(2)
p
| Look up a previously registered extension by name. Returns a 4-tuple
| #[code.u-break (default, method, getter, setter)] if the extension is
| registered, or #[code None] otherwise.
+aside-code("Example").
from spacy.tokens import Doc
Doc.set_extension('is_city', default=False)
extension = Doc.get_extension('is_city')
assert extension == (False, None, None, None)
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell Name of the extension.
+row("foot")
+cell returns
+cell tuple
+cell
| A #[code.u-break (default, method, getter, setter)] tuple of the
| extension.
+h(2, "has_extension") Doc.has_extension
+tag classmethod
+tag-new(2)
p Check whether an extension has been registered on the #[code Doc] class.
+aside-code("Example").
from spacy.tokens import Doc
Doc.set_extension('is_city', default=False)
assert Doc.has_extension('is_city')
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell Name of the extension to check.
+row("foot")
+cell returns
+cell bool
+cell Whether the extension has been registered.
+h(2, "char_span") Doc.char_span +h(2, "char_span") Doc.char_span
+tag method +tag method
+tag-new(2) +tag-new(2)

View File

@ -116,6 +116,109 @@ p Get the number of tokens in the span.
+cell int
+cell The number of tokens in the span.
+h(2, "set_extension") Span.set_extension
+tag classmethod
+tag-new(2)
p
| Define a custom attribute on the #[code Span] which becomes available via
| #[code Span._]. For details, see the documentation on
| #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes].
+aside-code("Example").
from spacy.tokens import Span
city_getter = lambda span: any(city in span.text for city in ('New York', 'Paris', 'Berlin'))
Span.set_extension('has_city', getter=city_getter)
doc = nlp(u'I like New York in Autumn')
assert doc[1:4]._.has_city
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell
| Name of the attribute to set by the extension. For example,
| #[code 'my_attr'] will be available as #[code span._.my_attr].
+row
+cell #[code default]
+cell -
+cell
| Optional default value of the attribute if no getter or method
| is defined.
+row
+cell #[code method]
+cell callable
+cell
| Set a custom method on the object, for example
| #[code span._.compare(other_span)].
+row
+cell #[code getter]
+cell callable
+cell
| Getter function that takes the object and returns an attribute
| value. Is called when the user accesses the #[code ._] attribute.
+row
+cell #[code setter]
+cell callable
+cell
| Setter function that takes the #[code Span] and a value, and
| modifies the object. Is called when the user writes to the
| #[code Span._] attribute.
+h(2, "get_extension") Span.get_extension
+tag classmethod
+tag-new(2)
p
| Look up a previously registered extension by name. Returns a 4-tuple
| #[code.u-break (default, method, getter, setter)] if the extension is
| registered, or #[code None] otherwise.
+aside-code("Example").
from spacy.tokens import Span
Span.set_extension('is_city', default=False)
extension = Span.get_extension('is_city')
assert extension == (False, None, None, None)
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell Name of the extension.
+row("foot")
+cell returns
+cell tuple
+cell
| A #[code.u-break (default, method, getter, setter)] tuple of the
| extension.
+h(2, "has_extension") Span.has_extension
+tag classmethod
+tag-new(2)
p Check whether an extension has been registered on the #[code Span] class.
+aside-code("Example").
from spacy.tokens import Span
Span.set_extension('is_city', default=False)
assert Span.has_extension('is_city')
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell Name of the extension to check.
+row("foot")
+cell returns
+cell bool
+cell Whether the extension has been registered.
+h(2, "similarity") Span.similarity +h(2, "similarity") Span.similarity
+tag method +tag method
+tag-model("vectors") +tag-model("vectors")

View File

@ -51,6 +51,109 @@ p The number of unicode characters in the token, i.e. #[code token.text].
+cell int
+cell The number of unicode characters in the token.
+h(2, "set_extension") Token.set_extension
+tag classmethod
+tag-new(2)
p
| Define a custom attribute on the #[code Token] which becomes available
| via #[code Token._]. For details, see the documentation on
| #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes].
+aside-code("Example").
from spacy.tokens import Token
fruit_getter = lambda token: token.text in ('apple', 'pear', 'banana')
Token.set_extension('is_fruit', getter=fruit_getter)
doc = nlp(u'I have an apple')
assert doc[3]._.is_fruit
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell
| Name of the attribute to set by the extension. For example,
| #[code 'my_attr'] will be available as #[code token._.my_attr].
+row
+cell #[code default]
+cell -
+cell
| Optional default value of the attribute if no getter or method
| is defined.
+row
+cell #[code method]
+cell callable
+cell
| Set a custom method on the object, for example
| #[code token._.compare(other_token)].
+row
+cell #[code getter]
+cell callable
+cell
| Getter function that takes the object and returns an attribute
| value. Is called when the user accesses the #[code ._] attribute.
+row
+cell #[code setter]
+cell callable
+cell
| Setter function that takes the #[code Token] and a value, and
| modifies the object. Is called when the user writes to the
| #[code Token._] attribute.
+h(2, "get_extension") Token.get_extension
+tag classmethod
+tag-new(2)
p
| Look up a previously registered extension by name. Returns a 4-tuple
| #[code.u-break (default, method, getter, setter)] if the extension is
| registered, or #[code None] otherwise.
+aside-code("Example").
from spacy.tokens import Token
Token.set_extension('is_fruit', default=False)
extension = Token.get_extension('is_fruit')
assert extension == (False, None, None, None)
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell Name of the extension.
+row("foot")
+cell returns
+cell tuple
+cell
| A #[code.u-break (default, method, getter, setter)] tuple of the
| extension.
+h(2, "has_extension") Token.has_extension
+tag classmethod
+tag-new(2)
p Check whether an extension has been registered on the #[code Token] class.
+aside-code("Example").
from spacy.tokens import Token
Token.set_extension('is_fruit', default=False)
assert Token.has_extension('is_fruit')
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell Name of the extension to check.
+row("foot")
+cell returns
+cell bool
+cell Whether the extension has been registered.
+h(2, "check_flag") Token.check_flag +h(2, "check_flag") Token.check_flag
+tag method +tag method

View File

@ -105,9 +105,9 @@
"menu": { "menu": {
"How Pipelines Work": "pipelines", "How Pipelines Work": "pipelines",
"Custom Components": "custom-components", "Custom Components": "custom-components",
"Developing Extensions": "extensions",
"Multi-threading": "multithreading", "Multi-threading": "multithreading",
"Serialization": "serialization", "Serialization": "serialization"
"Developing Extensions": "extensions"
} }
}, },
@ -195,6 +195,7 @@
"teaser": "Full code examples you can modify and run.", "teaser": "Full code examples you can modify and run.",
"next": "resources", "next": "resources",
"menu": { "menu": {
"Pipeline": "pipeline",
"Matching": "matching", "Matching": "matching",
"Training": "training", "Training": "training",
"Deep Learning": "deep-learning" "Deep Learning": "deep-learning"

View File

@ -1,12 +1,11 @@
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > CUSTOM COMPONENTS

p
-| A component receives a #[code Doc] object and
-| #[strong performs the actual processing] for example, using the current
-| weights to make a prediction and set some annotation on the document. By
-| adding a component to the pipeline, you'll get access to the #[code Doc]
-| at any point #[strong during] processing instead of only being able to
-| modify it afterwards.
| A component receives a #[code Doc] object and can modify it, for example
| by using the current weights to make a prediction and set some annotation
| on the document. By adding a component to the pipeline, you'll get access
| to the #[code Doc] at any point #[strong during processing] instead of
| only being able to modify it afterwards.

+aside-code("Example").
def my_component(doc):

@ -27,10 +26,10 @@ p
p
| Custom components can be added to the pipeline using the
| #[+api("language#add_pipe") #[code add_pipe]] method. Optionally, you
-| can either specify a component to add it before or after, tell spaCy
-| to add it first or last in the pipeline, or define a custom name.
-| If no name is set and no #[code name] attribute is present on your
-| component, the function name, e.g. #[code component.__name__] is used.
| can either specify a component to add it #[strong before or after], tell
| spaCy to add it #[strong first or last] in the pipeline, or define a
| #[strong custom name]. If no name is set and no #[code name] attribute
| is present on your component, the function name is used.

+code("Adding pipeline components").
def my_component(doc):

@ -67,7 +66,19 @@ p
nlp.add_pipe(my_component, first=True)

+h(3, "custom-components-attributes")
-| Setting attributes on the #[code Doc], #[code Span] and #[code Token]
| Extension attributes on #[code Doc], #[code Span] and #[code Token]
+tag-new(2)

p
| As of v2.0, spaCy allows you to set any custom attributes and methods
| on the #[code Doc], #[code Span] and #[code Token], which become
| available as #[code Doc._], #[code Span._] and #[code Token._], for
| example #[code Token._.my_attr]. This lets you store additional
| information relevant to your application, add new features and
| functionality to spaCy, and implement your own models trained with other
| machine learning libraries. It also lets you take advantage of spaCy's
| data structures and the #[code Doc] object as the "single source of
| truth".

+aside("Why ._?")
| Writing to a #[code ._] attribute instead of to the #[code Doc] directly

@ -78,9 +89,216 @@ p
| what's custom; for example, #[code doc.sentiment] is spaCy, while
| #[code doc._.sent_score] isn't.

-+under-construction
p
| There are three main types of extensions, which can be defined using the
| #[+api("doc#set_extension") #[code Doc.set_extension]],
| #[+api("span#set_extension") #[code Span.set_extension]] and
| #[+api("token#set_extension") #[code Token.set_extension]] methods.

-+h(3, "custom-components-user-hooks") Other user hooks
+list("numbers")
+item #[strong Attribute extensions].
| Set a default value for an attribute, which can be overwritten
| manually at any time. Attribute extensions work like "normal"
| variables and are the quickest way to store arbitrary information
| on a #[code Doc], #[code Span] or #[code Token].
+code-wrapper
+code.
Doc.set_extension('hello', default=True)
assert doc._.hello
doc._.hello = False
+item #[strong Property extensions].
| Define a getter and an optional setter function. If no setter is
| provided, the extension is immutable. Since the getter and setter
| functions are only called when you #[em retrieve] the attribute,
| you can also access values of previously added attribute extensions.
| For example, a #[code Doc] getter can average over #[code Token]
| attributes. For #[code Span] extensions, you'll almost always want
| to use a property; otherwise, you'd have to write to
| #[em every possible] #[code Span] in the #[code Doc] to set up the
| values correctly (see the sketch after this list).
+code-wrapper
+code.
Doc.set_extension('hello', getter=get_hello_value, setter=set_hello_value)
assert doc._.hello
doc._.hello = 'Hi!'
+item #[strong Method extensions].
| Assign a function that becomes available as an object method. Method
| extensions are always immutable. For more details and implementation
| ideas, see
| #[+a("/usage/examples#custom-components-attr-methods") these examples].
+code-wrapper
+code.o-no-block.
Doc.set_extension('hello', method=lambda doc, name: 'Hi {}!'.format(name))
assert doc._.hello('Bob') == 'Hi Bob!'
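p
| As a quick illustration of the #[code Doc] getter averaging over
| #[code Token] attributes mentioned above, here's a minimal sketch
| (assuming spaCy v2.0; the #[code score] attribute is hypothetical):
+code.
from spacy.lang.en import English
from spacy.tokens import Doc, Token

Token.set_extension('score', default=0.0)  # per-token value
avg = lambda doc: sum(t._.score for t in doc) / len(doc)
Doc.set_extension('avg_score', getter=avg)  # computed on access

nlp = English()
doc = nlp(u'hello world')
doc[0]._.score = 1.0
assert doc._.avg_score == 0.5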
p
| Before you can access a custom extension, you need to register it using
| the #[code set_extension] method on the object you want
| to add it to, e.g. the #[code Doc]. Keep in mind that extensions are
| always #[strong added globally] and not just on a particular instance.
| If an attribute of the same name
| already exists, or if you're trying to access an attribute that hasn't
| been registered, spaCy will raise an #[code AttributeError].
+code("Example").
from spacy.tokens import Doc, Span, Token
fruits = ['apple', 'pear', 'banana', 'orange', 'strawberry']
is_fruit_getter = lambda token: token.text in fruits
has_fruit_getter = lambda obj: any([t.text in fruits for t in obj])
Token.set_extension('is_fruit', getter=is_fruit_getter)
Doc.set_extension('has_fruit', getter=has_fruit_getter)
Span.set_extension('has_fruit', getter=has_fruit_getter)
+aside-code("Usage example").
doc = nlp(u"I have an apple and a melon")
assert doc[3]._.is_fruit # get Token attributes
assert not doc[0]._.is_fruit
assert doc._.has_fruit # get Doc attributes
assert doc[1:4]._.has_fruit # get Span attributes
p
| Once you've registered your custom attribute, you can also use the
| built-in #[code set], #[code get] and #[code has] methods to modify and
| retrieve the attributes. This is especially useful if you want to pass in
| a string instead of calling #[code doc._.my_attr].
+table(["Method", "Description", "Valid for", "Example"])
+row
+cell #[code ._.set()]
+cell Set a value for an attribute.
+cell Attributes, mutable properties.
+cell #[code.u-break token._.set('my_attr', True)]
+row
+cell #[code ._.get()]
+cell Get the value of an attribute.
+cell Attributes, mutable properties, immutable properties, methods.
+cell #[code.u-break my_attr = span._.get('my_attr')]
+row
+cell #[code ._.has()]
+cell Check if an attribute exists.
+cell Attributes, mutable properties, immutable properties, methods.
+cell #[code.u-break doc._.has('my_attr')]
+infobox("How the ._ is implemented")
| Extension definitions (the defaults, methods, getters and setters you
| pass in to #[code set_extension]) are stored in class attributes on the
| #[code Underscore] class. If you write to an extension attribute, e.g.
| #[code doc._.hello = True], the data is stored within the
| #[+api("doc#attributes") #[code Doc.user_data]] dictionary. To keep the
| underscore data separate from your other dictionary entries, the string
| #[code "._."] is placed before the name, in a tuple.
+h(4, "component-example1") Example: Custom sentence segmentation logic
p
| Let's say you want to implement custom logic to improve spaCy's sentence
| boundary detection. Currently, sentence segmentation is based on the
| dependency parse, which doesn't always produce ideal results. The custom
| logic should therefore be applied #[strong after] tokenization, but
| #[strong before] the dependency parsing. This way, the parser can also
| take advantage of the sentence boundaries.
+code.
def sbd_component(doc):
    for i, token in enumerate(doc[:-2]):
        # define sentence start if period + titlecase token
        if token.text == '.' and doc[i+1].is_title:
            doc[i+1].sent_start = True
    return doc

nlp = spacy.load('en')
nlp.add_pipe(sbd_component, before='parser')  # insert before the parser
+h(4, "component-example2")
| Example: Pipeline component for entity matching and tagging with
| custom attributes
p
| This example shows how to create a spaCy extension that takes a
| terminology list (in this case, single- and multi-word company names),
| matches the occurrences in a document, labels them as #[code ORG] entities,
| merges the tokens and sets custom #[code is_tech_org] and
| #[code has_tech_org] attributes. For efficient matching, the example uses
| the #[+api("phrasematcher") #[code PhraseMatcher]] which accepts
| #[code Doc] objects as match patterns and works well for large
| terminology lists. It also ensures your patterns will always match, even
| when you customise spaCy's tokenization rules. When you call #[code nlp]
| on a text, the custom pipeline component is applied to the #[code Doc].
+github("spacy", "examples/pipeline/custom_component_entities.py", false, 500)
p
| Wrapping this functionality in a
| pipeline component allows you to reuse the module with different
| settings, and have all pre-processing taken care of when you call
| #[code nlp] on your text and receive a #[code Doc] object.
+h(4, "component-example3")
| Example: Pipeline component for GPE entities and country meta data via a
| REST API
p
| This example shows the implementation of a pipeline component
| that fetches country meta data via the
| #[+a("https://restcountries.eu") REST Countries API] sets entity
| annotations for countries, merges entities into one token and
| sets custom attributes on the #[code Doc], #[code Span] and
| #[code Token], for example the capital, latitude/longitude coordinates
| and even the country flag.
+github("spacy", "examples/pipeline/custom_component_countries_api.py", false, 500)
p
| In this case, all data can be fetched on initialisation in one request.
| However, if you're working with text that contains incomplete country
| names, spelling mistakes or foreign-language versions, you could also
| implement a #[code like_country]-style getter function that makes a
| request to the search API endpoint and returns the best-matching
| result.
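p
| A rough sketch of that idea (assuming spaCy v2.0 and the API's
| search-by-name endpoint; helper names are hypothetical and error
| handling is omitted):
+code.
import requests
from spacy.tokens import Token

def like_country(token):
    # query the search endpoint with the token's text; 404 means no match
    r = requests.get('https://restcountries.eu/rest/v2/name/' + token.text)
    if r.status_code != 200:
        return False
    return len(r.json()) > 0

Token.set_extension('like_country', getter=like_country)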
+h(4, "custom-components-usage-ideas") Other usage ideas
+list
+item
| #[strong Adding new features and hooking in models]. For example,
| a sentiment analysis model, or your preferred solution for
| lemmatization or sentiment analysis. spaCy's built-in tagger,
| parser and entity recognizer respect annotations that were already
| set on the #[code Doc] in a previous step of the pipeline.
+item
| #[strong Integrating other libraries and APIs]. For example, your
| pipeline component can write additional information and data
| directly to the #[code Doc] or #[code Token] as custom attributes,
| while making sure no information is lost in the process. This can
| be output generated by other libraries and models, or an external
| service with a REST API.
+item
| #[strong Debugging and logging]. For example, a component which
| stores and/or exports relevant information about the current state
| of the processed document, and can be inserted at any point of your
| pipeline.
+infobox("Developing third-party extensions")
| The new pipeline management and custom attributes finally make it easy
| to develop your own spaCy extensions and plugins and share them with
| others. Extensions can claim their own #[code ._] namespace and exist as
| standalone packages. If you're developing a tool or library and want to
| make it easy for others to use it with spaCy and add it to their
| pipeline, all you have to do is expose a function that takes a
| #[code Doc], modifies it and returns it. For more details and
| #[strong best practices], see the section on
| #[+a("#extensions") developing spaCy extensions].
+h(3, "custom-components-user-hooks") User hooks
p
| While it's generally recommended to use the #[code Doc._], #[code Span._]

View File

@ -1,126 +0,0 @@
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > EXAMPLES
p
| To see real-world examples of pipeline factories and components in action,
| you can have a look at the source of spaCy's built-in components, e.g.
| the #[+api("tagger") #[code Tagger]], #[+api("parser") #[code Parser]] or
| #[+api("entityrecognizer") #[code EntityRecongnizer]].
+h(3, "example1") Example: Custom sentence segmentation logic
p
| Let's say you want to implement custom logic to improve spaCy's sentence
| boundary detection. Currently, sentence segmentation is based on the
| dependency parse, which doesn't always produce ideal results. The custom
| logic should therefore be applied #[strong after] tokenization, but
| #[strong before] the dependency parsing. This way, the parser can also
| take advantage of the sentence boundaries.
+code.
def sbd_component(doc):
    for i, token in enumerate(doc[:-2]):
        # define sentence start if period + titlecase token
        if token.text == '.' and doc[i+1].is_title:
            doc[i+1].sent_start = True
    return doc
p
| In this case, we simply want to add the component to the existing
| pipeline of the English model. We can do this by inserting it at index 0
| of #[code nlp.pipeline]:
+code.
nlp = spacy.load('en')
nlp.pipeline.insert(0, sbd_component)
p
| When you call #[code nlp] on some text, spaCy will tokenize it to create
| a #[code Doc] object, and first call #[code sbd_component] on it, followed
| by the model's default pipeline.
+h(3, "example2") Example: Sentiment model
p
| Let's say you have trained your own document sentiment model on English
| text. After tokenization, you want spaCy to first execute the
| #[strong default tensorizer], followed by a custom
| #[strong sentiment component] that adds a #[code .sentiment]
| property to the #[code Doc], containing your model's sentiment prediction.
p
| Your component class will have a #[code from_disk()] method that spaCy
| calls to load the model data. When called, the component will compute
| the sentiment score, add it to the #[code Doc] and return the modified
| document. Optionally, the component can include an #[code update()] method
| to allow training the model.
+code.
import pickle
from pathlib import Path

class SentimentComponent(object):
    def __init__(self, vocab):
        self.weights = None

    def __call__(self, doc):
        doc.sentiment = sum(self.weights*doc.vector)  # set sentiment property
        return doc

    def from_disk(self, path):  # path = model path + factory ID ('sentiment')
        self.weights = pickle.load(Path(path) / 'weights.bin')  # load weights
        return self

    def update(self, doc, gold):  # update weights, allows training!
        prediction = sum(self.weights*doc.vector)
        self.weights -= 0.001*doc.vector*(prediction-gold.sentiment)
p
| The factory will initialise the component with the #[code Vocab] object.
| To be able to add it to your model's pipeline as #[code 'sentiment'],
| it also needs to be registered via
| #[+api("spacy#set_factory") #[code set_factory()]].
+code.
def sentiment_factory(vocab):
    component = SentimentComponent(vocab)  # initialise component
    return component

spacy.set_factory('sentiment', sentiment_factory)
p
| The above code should be #[strong shipped with your model]. You can use
| the #[+api("cli#package") #[code package]] command to create all required
| files and directories. The model package will include an
| #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) #[code __init__.py]]
| with a #[code load()] method, that will initialise the language class with
| the model's pipeline and call the #[code from_disk()] method to load
| the model data.
p
| In the model package's meta.json, specify the language class and pipeline
| IDs:
+code("meta.json (excerpt)", "json").
{
    "name": "sentiment_model",
    "lang": "en",
    "version": "1.0.0",
    "spacy_version": ">=2.0.0,<3.0.0",
    "pipeline": ["tensorizer", "sentiment"]
}
p
| When you load your new model, spaCy will call the model's #[code load()]
| method. This will return a #[code Language] object with a pipeline
| containing the default tensorizer, and the sentiment component returned
| by your custom #[code "sentiment"] factory.
+code.
nlp = spacy.load('en_sentiment_model')
doc = nlp(u'I love pizza')
assert doc.sentiment
+infobox("Saving and loading models")
| For more information and a detailed guide on how to package your model,
| see the documentation on
| #[+a("/usage/training#saving-loading") saving and loading models].

View File

@ -1,3 +1,110 @@
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > DEVELOPING EXTENSIONS

-+under-construction
p
| We're very excited about all the new possibilities for community
| extensions and plugins in spaCy v2.0, and we can't wait to see what
| you build with it! To get you started, here are a few tips, tricks and
| best practices:
+list
+item
| Make sure to choose a #[strong descriptive and specific name] for
| your pipeline component class, and set it as its #[code name]
| attribute. Avoid names that are too common or likely to clash with
| built-in or a user's other custom components. While it's fine to call
| your package "spacy_my_extension", avoid component names including
| "spacy", since this can easily lead to confusion.
+code-wrapper
+code-new name = 'myapp_lemmatizer'
+code-old name = 'lemmatizer'
+item
| When writing to #[code Doc], #[code Token] or #[code Span] objects,
| #[strong use getter functions] wherever possible, and avoid setting
| values explicitly. Tokens and spans don't own any data themselves,
| so you should provide a function that allows them to compute the
| values instead of writing static properties to individual objects.
+code-wrapper
+code-new.
is_fruit = lambda token: token.text in ('apple', 'orange')
Token.set_extension('is_fruit', getter=is_fruit)
+code-old.
token._.set_extension('is_fruit', default=False)
if token.text in ('apple', 'orange'):
    token._.set('is_fruit', True)
+item
| Always add your custom attributes to the #[strong global] #[code Doc],
| #[code Token] or #[code Span] objects, not a particular instance of
| them. Add the attributes #[strong as early as possible], e.g. in
| your extension's #[code __init__] method or in the global scope of
| your module. This means that in the case of namespace collisions,
| the user will see an error immediately, not just when they run their
| pipeline.
+code-wrapper
+code-new.
from spacy.tokens import Doc
def __init__(self, attr='my_attr'):
    Doc.set_extension(attr, getter=self.get_doc_attr)
+code-old.
def __call__(self, doc):
    doc.set_extension('my_attr', getter=self.get_doc_attr)
+item
| If your extension is setting properties on the #[code Doc],
| #[code Token] or #[code Span], include an option to
| #[strong let the user change those attribute names]. This makes
| it easier to avoid namespace collisions and accommodate users with
| different naming preferences. We recommend adding an #[code attrs]
| argument to the #[code __init__] method of your class so you can
| write the names to class attributes and reuse them across your
| component.
+code-wrapper
+code-new Doc.set_extension(self.doc_attr, default='some value')
+code-old Doc.set_extension('my_doc_attr', default='some value')
+item
| Ideally, extensions should be #[strong standalone packages], with
| spaCy (and, optionally, other packages) specified as a dependency. They
| can freely assign to their own #[code ._] namespace, but should stick
| to that. If your extension's only job is to provide a better
| #[code .similarity] implementation, and your docs state this
| explicitly, there's no problem with writing to the
| #[+a("#custom-components-user-hooks") #[code user_hooks]], and
| overwriting spaCy's built-in method. However, a third-party
| extension should #[strong never silently overwrite built-ins], or
| attributes set by other extensions.
+item
| If you're looking to publish a model that depends on a custom
| pipeline component, you can either #[strong require it] in the model
| package's dependencies, or, if the component is specific and
| lightweight, choose to #[strong ship it with your model package]
| and add it to the #[code Language] instance returned by the
| model's #[code load()] method. For examples of this, check out the
| implementations of spaCy's
| #[+api("util#load_model_from_init_py") #[code load_model_from_init_py()]]
| and #[+api("util#load_model_from_path") #[code load_model_from_path()]]
| utility functions.
+code-wrapper
+code-new.
nlp.add_pipe(my_custom_component)
return nlp.from_disk(model_path)
+item
| Once you're ready to share your extension with others, make sure to
| #[strong add docs and installation instructions] (you can
| always link to this page for more info). Make it easy for others to
| install and use your extension, for example by uploading it to
| #[+a("https://pypi.python.org") PyPi]. If you're sharing your code on
| GitHub, don't forget to tag it
| with #[+a("https://github.com/search?q=topic%3Aspacy") #[code spacy]]
| and #[+a("https://github.com/search?q=topic%3Aspacy-pipeline") #[code spacy-pipeline]]
| to help people find it. If you post it on Twitter, feel free to tag
| #[+a("https://twitter.com/" + SOCIAL.twitter) @#{SOCIAL.twitter}]
| so we can check it out.
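p
| To make the #[code user_hooks] point above concrete, here's a minimal
| sketch (assuming spaCy v2.0; the similarity function is a toy stand-in
| for your own model):
+code.
def my_similarity(doc1, doc2):
    return 0.5  # plug in your own comparison model here

def similarity_hook(doc):
    doc.user_hooks['similarity'] = my_similarity
    return doc

nlp.add_pipe(similarity_hook, first=True)
doc1 = nlp(u'I like pizza')
doc2 = nlp(u'I like pasta')
assert doc1.similarity(doc2) == 0.5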

View File

@ -21,7 +21,7 @@ p
+code.
import spacy
-from spacy.tokens.span import Span
from spacy.tokens import Span

text = u'Netflix is hiring a new VP of global policy'

View File

@ -175,7 +175,7 @@ p
+code.
import spacy
-from spacy.tokens.doc import Doc
from spacy.tokens import Doc
from spacy.vocab import Vocab

nlp = spacy.load('en')

View File

@ -61,7 +61,7 @@ p
output_path.open('w', encoding='utf-8').write(svg)

p
-| The above code will generate the dependency visualizations and them to
| The above code will generate the dependency visualizations as
| two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg].

View File

@ -2,6 +2,44 @@
include ../_includes/_mixins
+section("pipeline")
+h(3, "custom-components-entities") Custom pipeline components and attribute extensions
+tag-new(2)
p
| This example shows the implementation of a pipeline component
| that sets entity annotations based on a list of single or
| multiple-word company names, merges entities into one token and
| sets custom attributes on the #[code Doc], #[code Span] and
| #[code Token].
+github("spacy", "examples/pipeline/custom_component_entities.py")
+h(3, "custom-components-api")
| Custom pipeline components and attribute extensions via a REST API
+tag-new(2)
p
| This example shows the implementation of a pipeline component
| that fetches country meta data via the
| #[+a("https://restcountries.eu") REST Countries API] sets entity
| annotations for countries, merges entities into one token and
| sets custom attributes on the #[code Doc], #[code Span] and
| #[code Token], for example the capital, latitude/longitude
| coordinates and the country flag.
+github("spacy", "examples/pipeline/custom_component_countries_api.py")
+h(3, "custom-components-attr-methods") Custom method extensions
+tag-new(2)
p
| A collection of snippets showing examples of extensions adding
| custom methods to the #[code Doc], #[code Token] and
| #[code Span].
+github("spacy", "examples/pipeline/custom_attr_methods.py")
+section("matching") +section("matching")
+h(3, "matcher") Using spaCy's rule-based matcher +h(3, "matcher") Using spaCy's rule-based matcher

View File

@ -12,6 +12,10 @@ include _spacy-101/_pipelines
+h(2, "custom-components") Creating custom pipeline components +h(2, "custom-components") Creating custom pipeline components
include _processing-pipelines/_custom-components include _processing-pipelines/_custom-components
+section("extensions")
+h(2, "extensions") Developing spaCy extensions
include _processing-pipelines/_extensions
+section("multithreading") +section("multithreading")
+h(2, "multithreading") Multi-threading +h(2, "multithreading") Multi-threading
include _processing-pipelines/_multithreading include _processing-pipelines/_multithreading
@ -19,7 +23,3 @@ include _spacy-101/_pipelines
+section("serialization") +section("serialization")
+h(2, "serialization") Serialization +h(2, "serialization") Serialization
include _processing-pipelines/_serialization include _processing-pipelines/_serialization
+section("extensions")
+h(2, "extensions") Developing spaCy extensions
include _processing-pipelines/_extensions

View File

@ -102,30 +102,36 @@ p
+h(3, "features-pipelines") Improved processing pipelines +h(3, "features-pipelines") Improved processing pipelines
+aside-code("Example"). +aside-code("Example").
# Modify an existing pipeline # Set custom attributes
nlp = spacy.load('en') Doc.set_extension('my_attr', default=False)
nlp.pipeline.append(my_component) Token.set_extension('my_attr', getter=my_token_getter)
assert doc._.my_attr, token._.my_attr
# Register a factory to create a component # Add components to the pipeline
spacy.set_factory('my_factory', my_factory) my_component = lambda doc: doc
nlp = Language(pipeline=['my_factory', mycomponent]) nlp.add_pipe(my_component)
p p
| It's now much easier to #[strong customise the pipeline] with your own | It's now much easier to #[strong customise the pipeline] with your own
| components, functions that receive a #[code Doc] object, modify and | components: functions that receive a #[code Doc] object, modify and
| return it. If your component is stateful, you can define and register a | return it. Extensions let you write any
| factory which receives the shared #[code Vocab] object and returns a | #[strong attributes, properties and methods] to the #[code Doc],
|  component. spaCy's default components can be added to your pipeline by | #[code Token] and #[code Span]. You can add data, implement new
| using their string IDs. This way, you won't have to worry about finding | features, integrate other libraries with spaCy or plug in your own
| and implementing them simply add #[code "tagger"] to the pipeline, | machine learning models.
| and spaCy will know what to do.
+image +image
include ../assets/img/pipeline.svg include ../assets/img/pipeline.svg
+infobox +infobox
| #[+label-inline API:] #[+api("language") #[code Language]] | #[+label-inline API:] #[+api("language") #[code Language]],
| #[+label-inline Usage:] #[+a("/usage/language-processing-pipeline") Processing text] | #[+api("doc#set_extension") #[code Doc.set_extension]],
| #[+api("span#set_extension") #[code Span.set_extension]],
| #[+api("token#set_extension") #[code Token.set_extension]]
| #[+label-inline Usage:]
| #[+a("/usage/processing-pipelines") Processing pipelines]
| #[+label-inline Code:]
| #[+src("/usage/examples#section-pipeline") Pipeline examples]
+h(3, "features-text-classification") Text classification +h(3, "features-text-classification") Text classification
@ -478,15 +484,16 @@ p
p
| If you've been using custom pipeline components, check out the new
| guide on #[+a("/usage/language-processing-pipelines") processing pipelines].
-| Appending functions to the pipeline still works but you might be able
-| to make this more convenient by registering "component factories".
-| Components of the processing pipeline can now be disabled by passing a
-| list of their names to the #[code disable] keyword argument on loading
-| or processing.
| Appending functions to the pipeline still works, but the
| #[+api("language#add_pipe") #[code add_pipe]] method now makes this
| much more convenient. Components of the processing pipeline can now
| be disabled by passing a list of their names to the #[code disable]
| keyword argument on load, or by simply removing them from the
| pipeline altogether.

+code-new.
nlp = spacy.load('en', disable=['tagger', 'ner'])
-doc = nlp(u"I don't want parsed", disable=['parser'])
nlp.remove_pipe('parser')

+code-old.
nlp = spacy.load('en', tagger=False, entity=False)
doc = nlp(u"I don't want parsed", parse=False)