Add pipeline component examples

2025-11-09 20:38:06 +03:00 · 2017-10-10 04:26:06 +02:00 · 2017-10-10 04:26:06 +02:00 · 6679117000
commit 6679117000
parent 7a592d01dc
5 changed files with 289 additions and 0 deletions
--- a/examples/pipeline/custom_attr_methods.py
+++ b/examples/pipeline/custom_attr_methods.py
@ -0,0 +1,53 @@
 # coding: utf-8
 """This example contains several snippets of methods that can be set via custom
 Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like
 they're "bound" to the object and are partially applied – i.e. the object
 they're called on is passed in as the first argument."""
 from __future__ import unicode_literals
 from spacy.lang.en import English
 from spacy.tokens.doc import Doc
 from spacy.tokens.span import Span
 from spacy import displacy
 from pathlib import Path
 def to_html(doc, output='/tmp', style='dep'):
    """Doc method extension for saving the current state as a displaCy
    visualization.

    doc: the Doc the method is called on (passed in as the first argument).
    output: directory the HTML file is written to.
    style: displaCy visualisation style, passed on to displacy.render.
    """
    # generate filename from the non-punctuation tokens among the first six
    file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html'
    output_path = Path(output) / file_name
    html = displacy.render(doc, style=style, page=True)  # render markup
    # write via a context manager so the file handle is flushed and closed
    # right away instead of whenever the object happens to be collected
    with output_path.open('w', encoding='utf-8') as html_file:
        html_file.write(html)
    print('Saved HTML to {}'.format(output_path))
 # register to_html as a method extension, available as doc._.to_html
 Doc.set_extension('to_html', method=to_html)
 # blank English pipeline, no model required
 nlp = English()
 doc = nlp(u"This is a sentence about Apple.")
 # add entity manually for demo purposes, to make it work without a model
 # (the span from token 5 up to, but not including, token 6 is "Apple")
 doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])]
 # the Doc itself is passed in as the first argument of to_html
 doc._.to_html(style='ent')
 def overlap_tokens(doc, other_doc):
    """Get the tokens from the original Doc that are also in the comparison Doc.

    doc: the Doc the method is called on (passed in as the first argument).
    other_doc: the Doc to compare against.
    RETURNS: list of tokens from doc, in document order, whose text also
        occurs as the text of a token in other_doc.
    """
    # build the lookup once as a set, so each membership test is O(1)
    # instead of scanning a list for every token
    other_texts = {token.text for token in other_doc}
    return [token for token in doc if token.text in other_texts]
 # register overlap_tokens as a method extension, available as doc._.overlap
 Doc.set_extension('overlap', method=overlap_tokens)
 nlp = English()
 doc1 = nlp(u"Peach emoji is where it has always been.")
 doc2 = nlp(u"Peach is the superior emoji.")
 # doc1 is passed in as the first argument, doc2 as other_doc
 tokens = doc1._.overlap(doc2)
 print(tokens)
--- a/examples/pipeline/custom_component_countries_api.py
+++ b/examples/pipeline/custom_component_countries_api.py
@ -0,0 +1,110 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import requests
 from spacy.lang.en import English
 from spacy.matcher import PhraseMatcher
 from spacy.tokens.doc import Doc
 from spacy.tokens.span import Span
 from spacy.tokens.token import Token
 class RESTCountriesComponent(object):
    """Example of a spaCy v2.0 pipeline component that requests all countries
    via the REST Countries API, merges country names into one token, assigns
    entity labels and sets attributes on country tokens, e.g. the capital and
    lat/lng coordinates. Can be extended with more details from the API.
    REST Countries API: https://restcountries.eu
    API License: Mozilla Public License MPL 2.0
    """
    name = 'rest_countries'  # component name, will show up in the pipeline
    def __init__(self, nlp, label='GPE', timeout=10):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.

        nlp: the shared Language instance.
        label: entity label to assign to matched country names.
        timeout: seconds to wait for the API before requests gives up;
            without a timeout, the request can block indefinitely.
        """
        # Make request once on initialisation and store the data
        r = requests.get('https://restcountries.eu/rest/v2/all',
                         timeout=timeout)
        r.raise_for_status()  # make sure requests raises an error if it fails
        countries = r.json()
        # Convert API response to dict keyed by country name for easy lookup
        # This could also be extended using the alternative and foreign language
        # names provided by the API
        self.countries = {c['name']: c for c in countries}
        self.label = nlp.vocab.strings[label]  # get entity label ID
        # Set up the PhraseMatcher with Doc patterns for each country name
        patterns = [nlp(c) for c in self.countries]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('COUNTRIES', None, *patterns)
        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        # If no default value is set, it defaults to None.
        Token.set_extension('is_country', default=False)
        Token.set_extension('country_capital')
        Token.set_extension('country_latlng')
        Token.set_extension('country_flag')
        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_country == True.
        Doc.set_extension('has_country', getter=self.has_country)
        Span.set_extension('has_country', getter=self.has_country)
    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        matches = self.matcher(doc)
        spans = []  # keep the spans for later so we can merge them afterwards
        for _, start, end in matches:
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Look up the API record once per match instead of once per
            # attribute and token
            country = self.countries[entity.text]
            # Set custom attribute on each token of the entity
            # Can be extended with other data returned by the API, like
            # currencies, country code, flag, calling code etc.
            for token in entity:
                token._.set('is_country', True)
                token._.set('country_capital', country['capital'])
                token._.set('country_latlng', country['latlng'])
                token._.set('country_flag', country['flag'])
            # Overwrite doc.ents and add entity – be careful not to replace!
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities – otherwise, it would cause mismatched
            # indices!
            span.merge()
        return doc  # don't forget to return the Doc!
    def has_country(self, tokens):
        """Getter for Doc and Span attributes. Returns True if one of the tokens
        is a country. Since the getter is only called when we access the
        attribute, we can refer to the Token's 'is_country' attribute here,
        which is already set in the processing step."""
        # generator expression lets any() stop at the first country token
        return any(t._.get('is_country') for t in tokens)
 # For simplicity, we start off with only the blank English Language class and
 # no model or pre-defined pipeline loaded.
 nlp = English()
 # initialising the component performs the API request and registers the
 # custom Token, Doc and Span attributes
 rest_countries = RESTCountriesComponent(nlp)  # initialise component
 nlp.add_pipe(rest_countries) # add it to the pipeline
 doc = nlp(u"Some text about Colombia and the Czech Republic")
 print('Pipeline', nlp.pipe_names)  # pipeline contains component name
 print('Doc has countries', doc._.has_country)  # Doc contains countries
 # only tokens flagged by the component carry the country data
 for token in doc:
    if token._.is_country:
        print(token.text, token._.country_capital, token._.country_latlng,
              token._.country_flag)  # country data
 print('Entities', [(e.text, e.label_) for e in doc.ents])  # all countries are entities
--- a/examples/pipeline/custom_component_entities.py
+++ b/examples/pipeline/custom_component_entities.py
@ -0,0 +1,87 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from spacy.lang.en import English
 from spacy.matcher import PhraseMatcher
 from spacy.tokens.doc import Doc
 from spacy.tokens.span import Span
 from spacy.tokens.token import Token
 class TechCompanyRecognizer(object):
    """Example of a spaCy v2.0 pipeline component that annotates entities
    from a list of single- or multi-word company names. Every match is
    labelled (ORG by default) and merged into a single token. In addition,
    ._.has_tech_org is available on the Doc/Span and ._.is_tech_org on the
    Token."""
    name = 'tech_companies'  # name under which the component is listed
    def __init__(self, nlp, companies=tuple(), label='ORG'):
        """Set up the component. The shared nlp instance provides the vocab
        for the matcher, resolves the label ID and turns each company name
        into a Doc that serves as a phrase pattern.
        """
        self.label = nlp.vocab.strings[label]  # ID of the entity label
        # The PhraseMatcher accepts Doc objects as patterns, so one pattern
        # per company name is all that is needed
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('TECH_ORGS', None, *[nlp(org) for org in companies])
        # Token flag: only a default is registered here, the actual value
        # is overwritten for matched tokens when the component runs.
        Token.set_extension('is_tech_org', default=False)
        # Doc and Span flags are computed on access from the token flags.
        Doc.set_extension('has_tech_org', getter=self.has_tech_org)
        Span.set_extension('has_tech_org', getter=self.has_tech_org)
    def __call__(self, doc):
        """Run the component on a Doc, annotating and merging every matched
        company name. The Doc is returned so that the next pipeline
        component, if any, can continue with it.
        """
        found = []  # collected so the merge can happen in a second pass
        for _, first, last in self.matcher(doc):
            # Labelled span covering the matched company name
            org = Span(doc, first, last, label=self.label)
            found.append(org)
            # Flag every token that is part of the company name
            for tok in org:
                tok._.set('is_tech_org', True)
            # Extend the existing entities rather than replacing them
            ents = list(doc.ents)
            ents.append(org)
            doc.ents = ents
        # Merge only after all entities are set – merging earlier would
        # shift the token indices the remaining matches refer to.
        for org in found:
            org.merge()
        return doc  # the Doc has to be handed back to the pipeline
    def has_tech_org(self, tokens):
        """Getter for the Doc and Span attribute: True if at least one of
        the tokens is flagged as a tech org. It is evaluated on attribute
        access, i.e. after the processing step has set 'is_tech_org' on
        the tokens."""
        for tok in tokens:
            if tok._.get('is_tech_org'):
                return True
        return False
 # For simplicity, we start off with only the blank English Language class and
 # no model or pre-defined pipeline loaded.
 nlp = English()
 companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple']  # etc.
 # initialising the component also registers the custom Token, Doc and Span
 # attributes
 component = TechCompanyRecognizer(nlp, companies)  # initialise component
 nlp.add_pipe(component, last=True)  # add it to the pipeline as the last element
 doc = nlp(u"Alphabet Inc. is the company behind Google.")
 print('Pipeline', nlp.pipe_names)  # pipeline contains component name
 print('Tokens', [t.text for t in doc])  # company names from the list are merged
 print('Doc has_tech_org', doc._.has_tech_org)  # Doc contains tech orgs
 print('Token 0 is_tech_org', doc[0]._.is_tech_org)  # "Alphabet Inc." is a tech org
 print('Token 1 is_tech_org', doc[1]._.is_tech_org)  # "is" is not
 print('Entities', [(e.text, e.label_) for e in doc.ents])  # all orgs are entities
--- a/website/usage/_data.json
+++ b/website/usage/_data.json
@ -195,6 +195,7 @@
        "teaser": "Full code examples you can modify and run.",
        "next": "resources",
        "menu": {
            "Pipeline": "pipeline",
            "Matching": "matching",
            "Training": "training",
            "Deep Learning": "deep-learning"
--- a/website/usage/examples.jade
+++ b/website/usage/examples.jade
@ -2,6 +2,44 @@
 include ../_includes/_mixins
 +section("pipeline")
    +h(3, "custom-components-entities") Custom pipeline components and attribute extensions
        +tag-new(2)
    p
        |  This example shows the implementation of a pipeline component
        |  that sets entity annotations based on a list of single or
        |  multiple-word company names, merges entities into one token and
        |  sets custom attributes on the #[code Doc], #[code Span] and
        |  #[code Token].
    +github("spacy", "examples/pipeline/custom_component_entities.py")
    +h(3, "custom-components-api")
        |  Custom pipeline components and attribute extensions via a REST API
        +tag-new(2)
    p
        |  This example shows the implementation of a pipeline component
        |  that fetches country metadata via the
        |  #[+a("https://restcountries.eu") REST Countries API], sets entity
        |  annotations for countries, merges entities into one token and
        |  sets custom attributes on the #[code Doc], #[code Span] and
        |  #[code Token] – for example, the capital, latitude/longitude
        |  coordinates and the country flag.
    +github("spacy", "examples/pipeline/custom_component_countries_api.py")
    +h(3, "custom-components-attr-methods") Custom method extensions
        +tag-new(2)
    p
        |  A collection of snippets showing examples of extensions adding
        |  custom methods to the #[code Doc], #[code Token] and
        |  #[code Span].
    +github("spacy", "examples/pipeline/custom_attr_methods.py")
 +section("matching")
    +h(3, "matcher") Using spaCy's rule-based matcher