Add pipeline component examples

2024-12-26 09:56:28 +03:00 · 2017-10-10 04:26:06 +02:00 · 2017-10-10 04:26:06 +02:00 · 6679117000
commit 6679117000
parent 7a592d01dc
5 changed files with 289 additions and 0 deletions
--- a/examples/pipeline/custom_attr_methods.py
+++ b/examples/pipeline/custom_attr_methods.py
@ -0,0 +1,53 @@
+# coding: utf-8
+"""This example contains several snippets of methods that can be set via custom
+Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like
+they're "bound" to the object and are partially applied – i.e. the object
+they're called on is passed in as the first argument."""
+from __future__ import unicode_literals
+
+from spacy.lang.en import English
+from spacy.tokens.doc import Doc
+from spacy.tokens.span import Span
+from spacy import displacy
+from pathlib import Path
+
+
+def to_html(doc, output='/tmp', style='dep'):
+    """Doc method extension: render the Doc with displaCy (using the given
+    style) and save the page as an HTML file in the output directory."""
+    # generate filename from the non-punct tokens among the first six tokens
+    file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html'
+    output_path = Path(output) / file_name
+    html = displacy.render(doc, style=style, page=True)  # render markup
+    with output_path.open('w', encoding='utf-8') as f:  # closed on exit
+        f.write(html)  # save to file
+    print('Saved HTML to {}'.format(output_path))
+
+
+Doc.set_extension('to_html', method=to_html)  # callable as doc._.to_html()
+
+nlp = English()
+doc = nlp(u"This is a sentence about Apple.")
+# add entity manually for demo purposes, to make it work without a model
+doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])]
+doc._.to_html(style='ent')  # output is not passed, so '/tmp' is used
+
+
+def overlap_tokens(doc, other_doc):
+    """Get the tokens from the original Doc that are also in the comparison Doc.
+
+    Tokens are compared by their exact text. Returns a list of the matching
+    tokens of doc (not other_doc), in document order, including repeats.
+    """
+    # build the lookup once as a set, so each membership check is O(1)
+    other_texts = {token.text for token in other_doc}
+    return [token for token in doc if token.text in other_texts]
+
+
+Doc.set_extension('overlap', method=overlap_tokens)  # doc._.overlap(other)
+
+nlp = English()
+doc1 = nlp(u"Peach emoji is where it has always been.")
+doc2 = nlp(u"Peach is the superior emoji.")
+tokens = doc1._.overlap(doc2)  # doc1 is passed in as the first argument
+print(tokens)
--- a/examples/pipeline/custom_component_countries_api.py
+++ b/examples/pipeline/custom_component_countries_api.py
@ -0,0 +1,110 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import requests
+
+from spacy.lang.en import English
+from spacy.matcher import PhraseMatcher
+from spacy.tokens.doc import Doc
+from spacy.tokens.span import Span
+from spacy.tokens.token import Token
+
+
+class RESTCountriesComponent(object):
+    """Example of a spaCy v2.0 pipeline component that requests all countries
+    via the REST Countries API, merges country names into one token, assigns
+    entity labels and sets attributes on country tokens, e.g. the capital and
+    lat/lng coordinates. Can be extended with more details from the API.
+
+    REST Countries API: https://restcountries.eu
+    API License: Mozilla Public License MPL 2.0
+    """
+    name = 'rest_countries'  # component name, will show up in the pipeline
+
+    def __init__(self, nlp, label='GPE'):
+        """Initialise the pipeline component. The shared nlp instance is used
+        to initialise the matcher with the shared vocab, get the label ID and
+        generate Doc objects as phrase match patterns.
+        """
+        # Request the data once on initialisation; time out instead of hanging
+        r = requests.get('https://restcountries.eu/rest/v2/all', timeout=30)
+        r.raise_for_status()  # make sure requests raises an error if it fails
+        countries = r.json()
+
+        # Convert API response to dict keyed by country name for easy lookup
+        # This could also be extended using the alternative and foreign language
+        # names provided by the API
+        self.countries = {c['name']: c for c in countries}
+        self.label = nlp.vocab.strings[label]  # get entity label ID
+
+        # Set up the PhraseMatcher with Doc patterns for each country name
+        patterns = [nlp(c) for c in self.countries]
+        self.matcher = PhraseMatcher(nlp.vocab)
+        self.matcher.add('COUNTRIES', None, *patterns)
+
+        # Register attribute on the Token. We'll be overwriting this based on
+        # the matches, so we're only setting a default value, not a getter.
+        # If no default value is set, it defaults to None.
+        Token.set_extension('is_country', default=False)
+        Token.set_extension('country_capital')
+        Token.set_extension('country_latlng')
+        Token.set_extension('country_flag')
+
+        # Register attributes on Doc and Span via a getter that checks if one of
+        # the contained tokens is set to is_country == True.
+        Doc.set_extension('has_country', getter=self.has_country)
+        Span.set_extension('has_country', getter=self.has_country)
+
+    def __call__(self, doc):
+        """Apply the pipeline component on a Doc object and modify it if matches
+        are found. Return the Doc, so it can be processed by the next component
+        in the pipeline, if available.
+        """
+        matches = self.matcher(doc)
+        spans = []  # keep the spans for later so we can merge them afterwards
+        for _, start, end in matches:
+            # Generate Span representing the entity & set label
+            entity = Span(doc, start, end, label=self.label)
+            spans.append(entity)
+            country = self.countries[entity.text]  # look up once per match
+            # Set custom attribute on each token of the entity
+            # Can be extended with other data returned by the API, like
+            # currencies, country code, flag, calling code etc.
+            for token in entity:
+                token._.set('is_country', True)
+                token._.set('country_capital', country['capital'])
+                token._.set('country_latlng', country['latlng'])
+                token._.set('country_flag', country['flag'])
+            # Overwrite doc.ents and add entity – be careful not to replace!
+            doc.ents = list(doc.ents) + [entity]
+        for span in spans:
+            # Iterate over all spans and merge them into one token. This is done
+            # after setting the entities – otherwise, it would cause mismatched
+            # indices!
+            span.merge()
+        return doc  # don't forget to return the Doc!
+
+    def has_country(self, tokens):
+        """Getter for Doc and Span attributes. Returns True if one of the tokens
+        is a country. Since the getter is only called when we access the
+        attribute, we can refer to the Token's 'is_country' attribute here,
+        which is already set in the processing step."""
+        return any(t._.get('is_country') for t in tokens)
+
+
+# For simplicity, we start off with only the blank English Language class and
+# no model or pre-defined pipeline loaded. Initialising the component makes
+# the HTTP request for the country data, so this needs network access.
+nlp = English()
+rest_countries = RESTCountriesComponent(nlp)  # initialise component
+nlp.add_pipe(rest_countries) # add it to the pipeline
+
+doc = nlp(u"Some text about Colombia and the Czech Republic")
+
+print('Pipeline', nlp.pipe_names)  # pipeline contains component name
+print('Doc has countries', doc._.has_country)  # Doc contains countries
+for token in doc:
+    if token._.is_country:
+        print(token.text, token._.country_capital, token._.country_latlng,
+              token._.country_flag)  # country data
+print('Entities', [(e.text, e.label_) for e in doc.ents])  # all countries are entities
--- a/examples/pipeline/custom_component_entities.py
+++ b/examples/pipeline/custom_component_entities.py
@ -0,0 +1,87 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from spacy.lang.en import English
+from spacy.matcher import PhraseMatcher
+from spacy.tokens.doc import Doc
+from spacy.tokens.span import Span
+from spacy.tokens.token import Token
+
+
+class TechCompanyRecognizer(object):
+    """Example of a spaCy v2.0 pipeline component that sets entity annotations
+    based on a list of single or multiple-word company names. Companies are
+    labelled as ORG and their spans are merged into one token. Additionally,
+    ._.has_tech_org and ._.is_tech_org are set on the Doc/Span and Token
+    respectively."""
+    name = 'tech_companies'  # component name, will show up in the pipeline
+
+    def __init__(self, nlp, companies=tuple(), label='ORG'):
+        """Initialise the pipeline component. The shared nlp instance is used
+        to initialise the matcher with the shared vocab, get the label ID and
+        generate Doc objects as phrase match patterns.
+        """
+        self.label = nlp.vocab.strings[label]  # get entity label ID
+
+        # Set up the PhraseMatcher – it can now take Doc objects as patterns,
+        # so even if the list of companies is long, it's very efficient
+        patterns = [nlp(org) for org in companies]
+        self.matcher = PhraseMatcher(nlp.vocab)
+        self.matcher.add('TECH_ORGS', None, *patterns)
+
+        # Register attribute on the Token. We'll be overwriting this based on
+        # the matches, so we're only setting a default value, not a getter.
+        Token.set_extension('is_tech_org', default=False)
+
+        # Register attributes on Doc and Span via a getter that checks if one of
+        # the contained tokens is set to is_tech_org == True.
+        Doc.set_extension('has_tech_org', getter=self.has_tech_org)
+        Span.set_extension('has_tech_org', getter=self.has_tech_org)
+
+    def __call__(self, doc):
+        """Apply the pipeline component on a Doc object and modify it if matches
+        are found. Return the Doc, so it can be processed by the next component
+        in the pipeline, if available.
+        """
+        matches = self.matcher(doc)
+        spans = []  # keep the spans for later so we can merge them afterwards
+        for _, start, end in matches:
+            # Generate Span representing the entity & set label
+            entity = Span(doc, start, end, label=self.label)
+            spans.append(entity)
+            # Set custom attribute on each token of the entity
+            for token in entity:
+                token._.set('is_tech_org', True)
+            # Overwrite doc.ents and add entity – be careful not to replace!
+            doc.ents = list(doc.ents) + [entity]
+        for span in spans:
+            # Iterate over all spans and merge them into one token. This is done
+            # after setting the entities – otherwise, it would cause mismatched
+            # indices!
+            span.merge()
+        return doc  # don't forget to return the Doc!
+
+    def has_tech_org(self, tokens):
+        """Getter for Doc and Span attributes. Returns True if one of the tokens
+        is a tech org. Since the getter is only called when we access the
+        attribute, we can refer to the Token's 'is_tech_org' attribute here,
+        which is already set in the processing step."""
+        return any(t._.get('is_tech_org') for t in tokens)
+
+
+# For simplicity, we start off with only the blank English Language class and
+# no model or pre-defined pipeline loaded.
+
+nlp = English()
+companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple']  # etc.
+component = TechCompanyRecognizer(nlp, companies)  # initialise component
+nlp.add_pipe(component, last=True)  # add it to the pipeline as the last element
+
+doc = nlp(u"Alphabet Inc. is the company behind Google.")  # apply the pipeline
+
+print('Pipeline', nlp.pipe_names)  # pipeline contains component name
+print('Tokens', [t.text for t in doc])  # company names from the list are merged
+print('Doc has_tech_org', doc._.has_tech_org)  # Doc contains tech orgs
+print('Token 0 is_tech_org', doc[0]._.is_tech_org)  # "Alphabet Inc." is a tech org
+print('Token 1 is_tech_org', doc[1]._.is_tech_org)  # "is" is not
+print('Entities', [(e.text, e.label_) for e in doc.ents])  # all orgs are entities
--- a/website/usage/_data.json
+++ b/website/usage/_data.json
@ -195,6 +195,7 @@
        "teaser": "Full code examples you can modify and run.",
        "next": "resources",
        "menu": {
+            "Pipeline": "pipeline",
            "Matching": "matching",
            "Training": "training",
            "Deep Learning": "deep-learning"
--- a/website/usage/examples.jade
+++ b/website/usage/examples.jade
@ -2,6 +2,44 @@

 include ../_includes/_mixins

++section("pipeline")
+    +h(3, "custom-components-entities") Custom pipeline components and attribute extensions
+        +tag-new(2)
+
+    p
+        |  This example shows the implementation of a pipeline component
+        |  that sets entity annotations based on a list of single or
+        |  multiple-word company names, merges entities into one token and
+        |  sets custom attributes on the #[code Doc], #[code Span] and
+        |  #[code Token].
+
+    +github("spacy", "examples/pipeline/custom_component_entities.py")
+
+    +h(3, "custom-components-api")
+        |  Custom pipeline components and attribute extensions via a REST API
+        +tag-new(2)
+
+    p
+        |  This example shows the implementation of a pipeline component
+        |  that fetches country metadata via the
+        |  #[+a("https://restcountries.eu") REST Countries API], sets entity
+        |  annotations for countries, merges entities into one token and
+        |  sets custom attributes on the #[code Doc], #[code Span] and
+        |  #[code Token] – for example, the capital, latitude/longitude
+        |  coordinates and the country flag.
+
+    +github("spacy", "examples/pipeline/custom_component_countries_api.py")
+
+    +h(3, "custom-components-attr-methods") Custom method extensions
+        +tag-new(2)
+
+    p
+        |  A collection of snippets showing examples of extensions adding
+        |  custom methods to the #[code Doc], #[code Token] and
+        |  #[code Span].
+
+    +github("spacy", "examples/pipeline/custom_attr_methods.py")
+
 +section("matching")
    +h(3, "matcher") Using spaCy's rule-based matcher