mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-10 00:02:19 +03:00
Update pipeline component examples to use plac
This commit is contained in:
parent
af28ca1ba0
commit
44f83b35bc
|
@ -1,35 +1,60 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
"""This example contains several snippets of methods that can be set via custom
|
"""This example contains several snippets of methods that can be set via custom
|
||||||
Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like
|
Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like
|
||||||
they're "bound" to the object and are partially applied – i.e. the object
|
they're "bound" to the object and are partially applied – i.e. the object
|
||||||
they're called on is passed in as the first argument."""
|
they're called on is passed in as the first argument.
|
||||||
|
|
||||||
|
* Custom pipeline components: https://alpha.spacy.io//usage/processing-pipelines#custom-components
|
||||||
|
|
||||||
|
Developed for: spaCy 2.0.0a17
|
||||||
|
Last updated for: spaCy 2.0.0a18
|
||||||
|
"""
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import plac
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.tokens import Doc, Span
|
from spacy.tokens import Doc, Span
|
||||||
from spacy import displacy
|
from spacy import displacy
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
@plac.annotations(
    output_dir=("Output directory for saved HTML", "positional", None, Path))
def main(output_dir=None):
    """Demo of custom Doc attribute methods in spaCy v2.0.

    Registers two bound method extensions on Doc and exercises them on
    sample texts. No statistical model is required.

    output_dir: directory the rendered HTML is saved into; when None the
        markup is printed instead (handled inside `to_html`).
    """
    nlp = English()  # start off with blank English class

    # Method extension #1: token overlap between two Docs
    Doc.set_extension('overlap', method=overlap_tokens)
    doc1 = nlp(u"Peach emoji is where it has always been.")
    doc2 = nlp(u"Peach is the superior emoji.")
    print("Text 1:", doc1.text)
    print("Text 2:", doc2.text)
    print("Overlapping tokens:", doc1._.overlap(doc2))

    # Method extension #2: render the Doc via displaCy and save/print it
    Doc.set_extension('to_html', method=to_html)
    doc = nlp(u"This is a sentence about Apple.")
    # add entity manually for demo purposes, to make it work without a model
    doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])]
    print("Text:", doc.text)
    doc._.to_html(output=output_dir, style='ent')
|
||||||
|
|
||||||
|
|
||||||
def to_html(doc, output='/tmp', style='dep'):
    """Doc method extension for saving the current state as a displaCy
    visualization.

    doc (Doc): the document the method is bound to (passed in as the first
        argument because attribute methods are partially applied).
    output: directory the HTML file is written to; the directory is created
        if missing. When None, the markup is printed to stdout instead.
    style (unicode): displaCy rendering style, e.g. 'dep' or 'ent'.
    """
    # generate filename from first six non-punct tokens
    file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html'
    html = displacy.render(doc, style=style, page=True)  # render markup
    if output is not None:
        output_path = Path(output)
        if not output_path.exists():
            output_path.mkdir()
        output_file = output_path / file_name
        # Use a context manager so the handle is closed even on write errors;
        # the previous `output_file.open(...).write(html)` leaked the handle.
        with output_file.open('w', encoding='utf-8') as f:
            f.write(html)  # save to file
        print('Saved HTML to {}'.format(output_file))
    else:
        print(html)
|
|
||||||
|
|
||||||
|
|
||||||
def overlap_tokens(doc, other_doc):
|
def overlap_tokens(doc, other_doc):
|
||||||
|
@ -43,10 +68,10 @@ def overlap_tokens(doc, other_doc):
|
||||||
return overlap
|
return overlap
|
||||||
|
|
||||||
|
|
||||||
Doc.set_extension('overlap', method=overlap_tokens)
|
if __name__ == '__main__':
    # plac turns main()'s signature/annotations into a CLI
    plac.call(main)

    # Expected output:
    # Text 1: Peach emoji is where it has always been.
    # Text 2: Peach is the superior emoji.
    # Overlapping tokens: [Peach, emoji, is, .]
|
||||||
print(tokens)
|
|
||||||
|
|
|
@ -1,21 +1,45 @@
|
||||||
# coding: utf-8
|
#!/usr/bin/env python
|
||||||
|
# coding: utf8
|
||||||
|
"""Example of a spaCy v2.0 pipeline component that requests all countries via
|
||||||
|
the REST Countries API, merges country names into one token, assigns entity
|
||||||
|
labels and sets attributes on country tokens, e.g. the capital and lat/lng
|
||||||
|
coordinates. Can be extended with more details from the API.
|
||||||
|
|
||||||
|
* REST Countries API: https://restcountries.eu (Mozilla Public License MPL 2.0)
|
||||||
|
* Custom pipeline components: https://alpha.spacy.io//usage/processing-pipelines#custom-components
|
||||||
|
|
||||||
|
Developed for: spaCy 2.0.0a17
|
||||||
|
Last updated for: spaCy 2.0.0a18
|
||||||
|
"""
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
import plac
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.matcher import PhraseMatcher
|
from spacy.matcher import PhraseMatcher
|
||||||
from spacy.tokens import Doc, Span, Token
|
from spacy.tokens import Doc, Span, Token
|
||||||
|
|
||||||
|
|
||||||
class RESTCountriesComponent(object):
|
def main():
    """Run the REST Countries pipeline component over a sample text.

    Builds a blank English pipeline, adds the component, and prints the
    country attributes and entities it sets.
    """
    # For simplicity, we start off with only the blank English Language class
    # and no model or pre-defined pipeline loaded.
    nlp = English()
    rest_countries = RESTCountriesComponent(nlp)  # initialise component
    nlp.add_pipe(rest_countries)  # add it to the pipeline
    doc = nlp(u"Some text about Colombia and the Czech Republic")
    print('Pipeline', nlp.pipe_names)  # pipeline contains component name
    print('Doc has countries', doc._.has_country)  # Doc contains countries
    for token in doc:
        if token._.is_country:
            print(token.text, token._.country_capital, token._.country_latlng,
                  token._.country_flag)  # country data
    print('Entities', [(e.text, e.label_) for e in doc.ents])  # entities
|
||||||
|
|
||||||
REST Countries API: https://restcountries.eu
|
|
||||||
API License: Mozilla Public License MPL 2.0
|
class RESTCountriesComponent(object):
|
||||||
|
"""spaCy v2.0 pipeline component that requests all countries via
|
||||||
|
the REST Countries API, merges country names into one token, assigns entity
|
||||||
|
labels and sets attributes on country tokens.
|
||||||
"""
|
"""
|
||||||
name = 'rest_countries' # component name, will show up in the pipeline
|
name = 'rest_countries' # component name, will show up in the pipeline
|
||||||
|
|
||||||
|
@ -90,19 +114,12 @@ class RESTCountriesComponent(object):
|
||||||
return any([t._.get('is_country') for t in tokens])
|
return any([t._.get('is_country') for t in tokens])
|
||||||
|
|
||||||
|
|
||||||
# For simplicity, we start off with only the blank English Language class and
|
if __name__ == '__main__':
    # plac turns main()'s signature into a CLI
    plac.call(main)

    # Expected output:
    # Pipeline ['rest_countries']
    # Doc has countries True
    # Colombia Bogotá [4.0, -72.0] https://restcountries.eu/data/col.svg
    # Czech Republic Prague [49.75, 15.5] https://restcountries.eu/data/cze.svg
    # Entities [('Colombia', 'GPE'), ('Czech Republic', 'GPE')]
|
||||||
print('Pipeline', nlp.pipe_names) # pipeline contains component name
|
|
||||||
print('Doc has countries', doc._.has_country) # Doc contains countries
|
|
||||||
for token in doc:
|
|
||||||
if token._.is_country:
|
|
||||||
print(token.text, token._.country_capital, token._.country_latlng,
|
|
||||||
token._.country_flag) # country data
|
|
||||||
print('Entities', [(e.text, e.label_) for e in doc.ents]) # all countries are entities
|
|
||||||
|
|
|
@ -1,11 +1,45 @@
|
||||||
# coding: utf-8
|
#!/usr/bin/env python
|
||||||
|
# coding: utf8
|
||||||
|
"""Example of a spaCy v2.0 pipeline component that sets entity annotations
|
||||||
|
based on list of single or multiple-word company names. Companies are
|
||||||
|
labelled as ORG and their spans are merged into one token. Additionally,
|
||||||
|
._.has_tech_org and ._.is_tech_org is set on the Doc/Span and Token
|
||||||
|
respectively.
|
||||||
|
|
||||||
|
* Custom pipeline components: https://alpha.spacy.io//usage/processing-pipelines#custom-components
|
||||||
|
|
||||||
|
Developed for: spaCy 2.0.0a17
|
||||||
|
Last updated for: spaCy 2.0.0a18
|
||||||
|
"""
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import plac
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.matcher import PhraseMatcher
|
from spacy.matcher import PhraseMatcher
|
||||||
from spacy.tokens import Doc, Span, Token
|
from spacy.tokens import Doc, Span, Token
|
||||||
|
|
||||||
|
|
||||||
|
@plac.annotations(
    text=("Text to process", "positional", None, str),
    companies=("Names of technology companies", "positional", None, str))
def main(text="Alphabet Inc. is the company behind Google.", *companies):
    """Run the tech-company recogniser component over *text*.

    text (unicode): the text to process.
    companies: optional company names to recognise; a built-in default
        list is used when none are given on the command line.
    """
    # For simplicity, we start off with only the blank English Language class
    # and no model or pre-defined pipeline loaded.
    nlp = English()
    if not companies:  # set default companies if none are set via args
        companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple']  # etc.
    component = TechCompanyRecognizer(nlp, companies)  # initialise component
    nlp.add_pipe(component, last=True)  # add last to the pipeline

    doc = nlp(text)
    print('Pipeline', nlp.pipe_names)  # pipeline contains component name
    print('Tokens', [t.text for t in doc])  # company names from the list are merged
    print('Doc has_tech_org', doc._.has_tech_org)  # Doc contains tech orgs
    print('Token 0 is_tech_org', doc[0]._.is_tech_org)  # "Alphabet Inc." is a tech org
    print('Token 1 is_tech_org', doc[1]._.is_tech_org)  # "is" is not
    print('Entities', [(e.text, e.label_) for e in doc.ents])  # all orgs are entities
|
||||||
|
|
||||||
|
|
||||||
class TechCompanyRecognizer(object):
|
class TechCompanyRecognizer(object):
|
||||||
"""Example of a spaCy v2.0 pipeline component that sets entity annotations
|
"""Example of a spaCy v2.0 pipeline component that sets entity annotations
|
||||||
based on list of single or multiple-word company names. Companies are
|
based on list of single or multiple-word company names. Companies are
|
||||||
|
@ -67,19 +101,13 @@ class TechCompanyRecognizer(object):
|
||||||
return any([t._.get('is_tech_org') for t in tokens])
|
return any([t._.get('is_tech_org') for t in tokens])
|
||||||
|
|
||||||
|
|
||||||
# For simplicity, we start off with only the blank English Language class and
|
if __name__ == '__main__':
    # plac maps the positional annotations on main() to CLI arguments
    plac.call(main)

    # Expected output:
    # Pipeline ['tech_companies']
    # Tokens ['Alphabet Inc.', 'is', 'the', 'company', 'behind', 'Google', '.']
    # Doc has_tech_org True
    # Token 0 is_tech_org True
    # Token 1 is_tech_org False
    # Entities [('Alphabet Inc.', 'ORG'), ('Google', 'ORG')]
|
||||||
print('Pipeline', nlp.pipe_names) # pipeline contains component name
|
|
||||||
print('Tokens', [t.text for t in doc]) # company names from the list are merged
|
|
||||||
print('Doc has_tech_org', doc._.has_tech_org) # Doc contains tech orgs
|
|
||||||
print('Token 0 is_tech_org', doc[0]._.is_tech_org) # "Alphabet Inc." is a tech org
|
|
||||||
print('Token 1 is_tech_org', doc[1]._.is_tech_org) # "is" is not
|
|
||||||
print('Entities', [(e.text, e.label_) for e in doc.ents]) # all orgs are entities
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user