mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-10 00:02:19 +03:00
Update pipeline component examples to use plac
This commit is contained in:
parent
af28ca1ba0
commit
44f83b35bc
|
@ -1,35 +1,60 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
"""This example contains several snippets of methods that can be set via custom
|
"""This example contains several snippets of methods that can be set via custom
|
||||||
Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like
|
Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like
|
||||||
they're "bound" to the object and are partially applied – i.e. the object
|
they're "bound" to the object and are partially applied – i.e. the object
|
||||||
they're called on is passed in as the first argument."""
|
they're called on is passed in as the first argument.
|
||||||
|
|
||||||
|
* Custom pipeline components: https://alpha.spacy.io//usage/processing-pipelines#custom-components
|
||||||
|
|
||||||
|
Developed for: spaCy 2.0.0a17
|
||||||
|
Last updated for: spaCy 2.0.0a18
|
||||||
|
"""
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import plac
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.tokens import Doc, Span
|
from spacy.tokens import Doc, Span
|
||||||
from spacy import displacy
|
from spacy import displacy
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
@plac.annotations(
    output_dir=("Output directory for saved HTML", "positional", None, Path))
def main(output_dir=None):
    """Demo of custom Doc attribute methods in spaCy v2.0.

    Registers two bound method extensions on Doc and exercises them on
    sample texts. No statistical model is required.

    output_dir: directory the rendered HTML is saved into; when None the
        markup is printed instead (handled inside `to_html`).
    """
    nlp = English()  # start off with blank English class

    # Method extension #1: token overlap between two Docs
    Doc.set_extension('overlap', method=overlap_tokens)
    doc1 = nlp(u"Peach emoji is where it has always been.")
    doc2 = nlp(u"Peach is the superior emoji.")
    print("Text 1:", doc1.text)
    print("Text 2:", doc2.text)
    print("Overlapping tokens:", doc1._.overlap(doc2))

    # Method extension #2: render the Doc via displaCy and save/print it
    Doc.set_extension('to_html', method=to_html)
    doc = nlp(u"This is a sentence about Apple.")
    # add entity manually for demo purposes, to make it work without a model
    doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])]
    print("Text:", doc.text)
    doc._.to_html(output=output_dir, style='ent')
|
||||||
|
|
||||||
|
|
||||||
def to_html(doc, output='/tmp', style='dep'):
    """Doc method extension for saving the current state as a displaCy
    visualization.

    doc (Doc): the document the method is bound to (passed in as the first
        argument because attribute methods are partially applied).
    output: directory the HTML file is written to; the directory is created
        if missing. When None, the markup is printed to stdout instead.
    style (unicode): displaCy rendering style, e.g. 'dep' or 'ent'.
    """
    # generate filename from first six non-punct tokens
    file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html'
    html = displacy.render(doc, style=style, page=True)  # render markup
    if output is not None:
        output_path = Path(output)
        if not output_path.exists():
            output_path.mkdir()
        output_file = output_path / file_name
        # Use a context manager so the handle is closed even on write errors;
        # the previous `output_file.open(...).write(html)` leaked the handle.
        with output_file.open('w', encoding='utf-8') as f:
            f.write(html)  # save to file
        print('Saved HTML to {}'.format(output_file))
    else:
        print(html)
|
|
||||||
|
|
||||||
|
|
||||||
def overlap_tokens(doc, other_doc):
|
def overlap_tokens(doc, other_doc):
|
||||||
|
@ -43,10 +68,10 @@ def overlap_tokens(doc, other_doc):
|
||||||
return overlap
|
return overlap
|
||||||
|
|
||||||
|
|
||||||
Doc.set_extension('overlap', method=overlap_tokens)
|
if __name__ == '__main__':
    # plac turns main()'s signature/annotations into a CLI
    plac.call(main)

    # Expected output:
    # Text 1: Peach emoji is where it has always been.
    # Text 2: Peach is the superior emoji.
    # Overlapping tokens: [Peach, emoji, is, .]
|
||||||
print(tokens)
|
|
||||||
|
|
|
@ -1,21 +1,45 @@
|
||||||
# coding: utf-8
|
#!/usr/bin/env python
|
||||||
|
# coding: utf8
|
||||||
|
"""Example of a spaCy v2.0 pipeline component that requests all countries via
|
||||||
|
the REST Countries API, merges country names into one token, assigns entity
|
||||||
|
labels and sets attributes on country tokens, e.g. the capital and lat/lng
|
||||||
|
coordinates. Can be extended with more details from the API.
|
||||||
|
|
||||||
|
* REST Countries API: https://restcountries.eu (Mozilla Public License MPL 2.0)
|
||||||
|
* Custom pipeline components: https://alpha.spacy.io//usage/processing-pipelines#custom-components
|
||||||
|
|
||||||
|
Developed for: spaCy 2.0.0a17
|
||||||
|
Last updated for: spaCy 2.0.0a18
|
||||||
|
"""
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
import plac
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.matcher import PhraseMatcher
|
from spacy.matcher import PhraseMatcher
|
||||||
from spacy.tokens import Doc, Span, Token
|
from spacy.tokens import Doc, Span, Token
|
||||||
|
|
||||||
|
|
||||||
class RESTCountriesComponent(object):
|
def main():
    """Run the REST Countries pipeline component over a sample text.

    Builds a blank English pipeline, adds the component, and prints the
    country attributes and entities it sets.
    """
    # For simplicity, we start off with only the blank English Language class
    # and no model or pre-defined pipeline loaded.
    nlp = English()
    rest_countries = RESTCountriesComponent(nlp)  # initialise component
    nlp.add_pipe(rest_countries)  # add it to the pipeline
    doc = nlp(u"Some text about Colombia and the Czech Republic")
    print('Pipeline', nlp.pipe_names)  # pipeline contains component name
    print('Doc has countries', doc._.has_country)  # Doc contains countries
    for token in doc:
        if token._.is_country:
            print(token.text, token._.country_capital, token._.country_latlng,
                  token._.country_flag)  # country data
    print('Entities', [(e.text, e.label_) for e in doc.ents])  # entities
|
||||||
|
|
||||||
REST Countries API: https://restcountries.eu
|
|
||||||
API License: Mozilla Public License MPL 2.0
|
class RESTCountriesComponent(object):
|
||||||
|
"""spaCy v2.0 pipeline component that requests all countries via
|
||||||
|
the REST Countries API, merges country names into one token, assigns entity
|
||||||
|
labels and sets attributes on country tokens.
|
||||||
"""
|
"""
|
||||||
name = 'rest_countries' # component name, will show up in the pipeline
|
name = 'rest_countries' # component name, will show up in the pipeline
|
||||||
|
|
||||||
|
@ -90,19 +114,12 @@ class RESTCountriesComponent(object):
|
||||||
return any([t._.get('is_country') for t in tokens])
|
return any([t._.get('is_country') for t in tokens])
|
||||||
|
|
||||||
|
|
||||||
# For simplicity, we start off with only the blank English Language class and
|
if __name__ == '__main__':
    # plac turns main()'s signature into a CLI
    plac.call(main)

    # Expected output:
    # Pipeline ['rest_countries']
    # Doc has countries True
    # Colombia Bogotá [4.0, -72.0] https://restcountries.eu/data/col.svg
    # Czech Republic Prague [49.75, 15.5] https://restcountries.eu/data/cze.svg
    # Entities [('Colombia', 'GPE'), ('Czech Republic', 'GPE')]
|
||||||
print('Pipeline', nlp.pipe_names) # pipeline contains component name
|
|
||||||
print('Doc has countries', doc._.has_country) # Doc contains countries
|
|
||||||
for token in doc:
|
|
||||||
if token._.is_country:
|
|
||||||
print(token.text, token._.country_capital, token._.country_latlng,
|
|
||||||
token._.country_flag) # country data
|
|
||||||
print('Entities', [(e.text, e.label_) for e in doc.ents]) # all countries are entities
|
|
||||||
|
|
|
@ -1,11 +1,45 @@
|
||||||
# coding: utf-8
|
#!/usr/bin/env python
|
||||||
|
# coding: utf8
|
||||||
|
"""Example of a spaCy v2.0 pipeline component that sets entity annotations
|
||||||
|
based on list of single or multiple-word company names. Companies are
|
||||||
|
labelled as ORG and their spans are merged into one token. Additionally,
|
||||||
|
._.has_tech_org and ._.is_tech_org is set on the Doc/Span and Token
|
||||||
|
respectively.
|
||||||
|
|
||||||
|
* Custom pipeline components: https://alpha.spacy.io//usage/processing-pipelines#custom-components
|
||||||
|
|
||||||
|
Developed for: spaCy 2.0.0a17
|
||||||
|
Last updated for: spaCy 2.0.0a18
|
||||||
|
"""
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import plac
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.matcher import PhraseMatcher
|
from spacy.matcher import PhraseMatcher
|
||||||
from spacy.tokens import Doc, Span, Token
|
from spacy.tokens import Doc, Span, Token
|
||||||
|
|
||||||
|
|
||||||
|
@plac.annotations(
    text=("Text to process", "positional", None, str),
    companies=("Names of technology companies", "positional", None, str))
def main(text="Alphabet Inc. is the company behind Google.", *companies):
    """Run the tech-company recogniser component over *text*.

    text (unicode): the text to process.
    companies: optional company names to recognise; a built-in default
        list is used when none are given on the command line.
    """
    # For simplicity, we start off with only the blank English Language class
    # and no model or pre-defined pipeline loaded.
    nlp = English()
    if not companies:  # set default companies if none are set via args
        companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple']  # etc.
    component = TechCompanyRecognizer(nlp, companies)  # initialise component
    nlp.add_pipe(component, last=True)  # add last to the pipeline

    doc = nlp(text)
    print('Pipeline', nlp.pipe_names)  # pipeline contains component name
    print('Tokens', [t.text for t in doc])  # company names from the list are merged
    print('Doc has_tech_org', doc._.has_tech_org)  # Doc contains tech orgs
    print('Token 0 is_tech_org', doc[0]._.is_tech_org)  # "Alphabet Inc." is a tech org
    print('Token 1 is_tech_org', doc[1]._.is_tech_org)  # "is" is not
    print('Entities', [(e.text, e.label_) for e in doc.ents])  # all orgs are entities
|
||||||
|
|
||||||
|
|
||||||
class TechCompanyRecognizer(object):
|
class TechCompanyRecognizer(object):
|
||||||
"""Example of a spaCy v2.0 pipeline component that sets entity annotations
|
"""Example of a spaCy v2.0 pipeline component that sets entity annotations
|
||||||
based on list of single or multiple-word company names. Companies are
|
based on list of single or multiple-word company names. Companies are
|
||||||
|
@ -67,19 +101,13 @@ class TechCompanyRecognizer(object):
|
||||||
return any([t._.get('is_tech_org') for t in tokens])
|
return any([t._.get('is_tech_org') for t in tokens])
|
||||||
|
|
||||||
|
|
||||||
# For simplicity, we start off with only the blank English Language class and
|
if __name__ == '__main__':
    # plac maps the positional annotations on main() to CLI arguments
    plac.call(main)

    # Expected output:
    # Pipeline ['tech_companies']
    # Tokens ['Alphabet Inc.', 'is', 'the', 'company', 'behind', 'Google', '.']
    # Doc has_tech_org True
    # Token 0 is_tech_org True
    # Token 1 is_tech_org False
    # Entities [('Alphabet Inc.', 'ORG'), ('Google', 'ORG')]
|
||||||
print('Pipeline', nlp.pipe_names) # pipeline contains component name
|
|
||||||
print('Tokens', [t.text for t in doc]) # company names from the list are merged
|
|
||||||
print('Doc has_tech_org', doc._.has_tech_org) # Doc contains tech orgs
|
|
||||||
print('Token 0 is_tech_org', doc[0]._.is_tech_org) # "Alphabet Inc." is a tech org
|
|
||||||
print('Token 1 is_tech_org', doc[1]._.is_tech_org) # "is" is not
|
|
||||||
print('Entities', [(e.text, e.label_) for e in doc.ents]) # all orgs are entities
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user