mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
Update pipeline component examples to use plac
This commit is contained in:
parent
af28ca1ba0
commit
44f83b35bc
|
@ -1,35 +1,60 @@
|
|||
#!/usr/bin/env python
|
||||
# coding: utf-8
|
||||
"""This example contains several snippets of methods that can be set via custom
|
||||
Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like
|
||||
they're "bound" to the object and are partially applied – i.e. the object
|
||||
they're called on is passed in as the first argument."""
|
||||
they're called on is passed in as the first argument.
|
||||
|
||||
* Custom pipeline components: https://alpha.spacy.io/usage/processing-pipelines#custom-components
|
||||
|
||||
Developed for: spaCy 2.0.0a17
|
||||
Last updated for: spaCy 2.0.0a18
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
from spacy.lang.en import English
|
||||
from spacy.tokens import Doc, Span
|
||||
from spacy import displacy
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
output_dir=("Output directory for saved HTML", "positional", None, Path))
|
||||
def main(output_dir=None):
|
||||
nlp = English() # start off with blank English class
|
||||
|
||||
Doc.set_extension('overlap', method=overlap_tokens)
|
||||
doc1 = nlp(u"Peach emoji is where it has always been.")
|
||||
doc2 = nlp(u"Peach is the superior emoji.")
|
||||
print("Text 1:", doc1.text)
|
||||
print("Text 2:", doc2.text)
|
||||
print("Overlapping tokens:", doc1._.overlap(doc2))
|
||||
|
||||
Doc.set_extension('to_html', method=to_html)
|
||||
doc = nlp(u"This is a sentence about Apple.")
|
||||
# add entity manually for demo purposes, to make it work without a model
|
||||
doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])]
|
||||
print("Text:", doc.text)
|
||||
doc._.to_html(output=output_dir, style='ent')
|
||||
|
||||
|
||||
def to_html(doc, output='/tmp', style='dep'):
|
||||
"""Doc method extension for saving the current state as a displaCy
|
||||
visualization.
|
||||
"""
|
||||
# generate filename from the non-punctuation tokens among the first six
|
||||
file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html'
|
||||
output_path = Path(output) / file_name
|
||||
html = displacy.render(doc, style=style, page=True) # render markup
|
||||
output_path.open('w', encoding='utf-8').write(html) # save to file
|
||||
print('Saved HTML to {}'.format(output_path))
|
||||
|
||||
|
||||
Doc.set_extension('to_html', method=to_html)
|
||||
|
||||
nlp = English()
|
||||
doc = nlp(u"This is a sentence about Apple.")
|
||||
# add entity manually for demo purposes, to make it work without a model
|
||||
doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])]
|
||||
doc._.to_html(style='ent')
|
||||
if output is not None:
|
||||
output_path = Path(output)
|
||||
if not output_path.exists():
|
||||
output_path.mkdir()
|
||||
output_file = Path(output) / file_name
|
||||
output_file.open('w', encoding='utf-8').write(html) # save to file
|
||||
print('Saved HTML to {}'.format(output_file))
|
||||
else:
|
||||
print(html)
|
||||
|
||||
|
||||
def overlap_tokens(doc, other_doc):
|
||||
|
@ -43,10 +68,10 @@ def overlap_tokens(doc, other_doc):
|
|||
return overlap
|
||||
|
||||
|
||||
Doc.set_extension('overlap', method=overlap_tokens)
|
||||
if __name__ == '__main__':
|
||||
plac.call(main)
|
||||
|
||||
nlp = English()
|
||||
doc1 = nlp(u"Peach emoji is where it has always been.")
|
||||
doc2 = nlp(u"Peach is the superior emoji.")
|
||||
tokens = doc1._.overlap(doc2)
|
||||
print(tokens)
|
||||
# Expected output:
|
||||
# Text 1: Peach emoji is where it has always been.
|
||||
# Text 2: Peach is the superior emoji.
|
||||
# Overlapping tokens: [Peach, emoji, is, .]
|
||||
|
|
|
@ -1,21 +1,45 @@
|
|||
# coding: utf-8
|
||||
#!/usr/bin/env python
|
||||
# coding: utf8
|
||||
"""Example of a spaCy v2.0 pipeline component that requests all countries via
|
||||
the REST Countries API, merges country names into one token, assigns entity
|
||||
labels and sets attributes on country tokens, e.g. the capital and lat/lng
|
||||
coordinates. Can be extended with more details from the API.
|
||||
|
||||
* REST Countries API: https://restcountries.eu (Mozilla Public License MPL 2.0)
|
||||
* Custom pipeline components: https://alpha.spacy.io/usage/processing-pipelines#custom-components
|
||||
|
||||
Developed for: spaCy 2.0.0a17
|
||||
Last updated for: spaCy 2.0.0a18
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import requests
|
||||
|
||||
import plac
|
||||
from spacy.lang.en import English
|
||||
from spacy.matcher import PhraseMatcher
|
||||
from spacy.tokens import Doc, Span, Token
|
||||
|
||||
|
||||
class RESTCountriesComponent(object):
|
||||
"""Example of a spaCy v2.0 pipeline component that requests all countries
|
||||
via the REST Countries API, merges country names into one token, assigns
|
||||
entity labels and sets attributes on country tokens, e.g. the capital and
|
||||
lat/lng coordinates. Can be extended with more details from the API.
|
||||
def main():
|
||||
# For simplicity, we start off with only the blank English Language class
|
||||
# and no model or pre-defined pipeline loaded.
|
||||
nlp = English()
|
||||
rest_countries = RESTCountriesComponent(nlp) # initialise component
|
||||
nlp.add_pipe(rest_countries) # add it to the pipeline
|
||||
doc = nlp(u"Some text about Colombia and the Czech Republic")
|
||||
print('Pipeline', nlp.pipe_names) # pipeline contains component name
|
||||
print('Doc has countries', doc._.has_country) # Doc contains countries
|
||||
for token in doc:
|
||||
if token._.is_country:
|
||||
print(token.text, token._.country_capital, token._.country_latlng,
|
||||
token._.country_flag) # country data
|
||||
print('Entities', [(e.text, e.label_) for e in doc.ents]) # entities
|
||||
|
||||
REST Countries API: https://restcountries.eu
|
||||
API License: Mozilla Public License MPL 2.0
|
||||
|
||||
class RESTCountriesComponent(object):
|
||||
"""spaCy v2.0 pipeline component that requests all countries via
|
||||
the REST Countries API, merges country names into one token, assigns entity
|
||||
labels and sets attributes on country tokens.
|
||||
"""
|
||||
name = 'rest_countries' # component name, will show up in the pipeline
|
||||
|
||||
|
@ -90,19 +114,12 @@ class RESTCountriesComponent(object):
|
|||
return any([t._.get('is_country') for t in tokens])
|
||||
|
||||
|
||||
# For simplicity, we start off with only the blank English Language class and
|
||||
# no model or pre-defined pipeline loaded.
|
||||
if __name__ == '__main__':
|
||||
plac.call(main)
|
||||
|
||||
nlp = English()
|
||||
rest_countries = RESTCountriesComponent(nlp) # initialise component
|
||||
nlp.add_pipe(rest_countries) # add it to the pipeline
|
||||
|
||||
doc = nlp(u"Some text about Colombia and the Czech Republic")
|
||||
|
||||
print('Pipeline', nlp.pipe_names) # pipeline contains component name
|
||||
print('Doc has countries', doc._.has_country) # Doc contains countries
|
||||
for token in doc:
|
||||
if token._.is_country:
|
||||
print(token.text, token._.country_capital, token._.country_latlng,
|
||||
token._.country_flag) # country data
|
||||
print('Entities', [(e.text, e.label_) for e in doc.ents]) # all countries are entities
|
||||
# Expected output:
|
||||
# Pipeline ['rest_countries']
|
||||
# Doc has countries True
|
||||
# Colombia Bogotá [4.0, -72.0] https://restcountries.eu/data/col.svg
|
||||
# Czech Republic Prague [49.75, 15.5] https://restcountries.eu/data/cze.svg
|
||||
# Entities [('Colombia', 'GPE'), ('Czech Republic', 'GPE')]
|
||||
|
|
|
@ -1,11 +1,45 @@
|
|||
# coding: utf-8
|
||||
#!/usr/bin/env python
|
||||
# coding: utf8
|
||||
"""Example of a spaCy v2.0 pipeline component that sets entity annotations
|
||||
based on a list of single or multiple-word company names. Companies are
|
||||
labelled as ORG and their spans are merged into one token. Additionally,
|
||||
._.has_tech_org and ._.is_tech_org are set on the Doc/Span and Token
|
||||
respectively.
|
||||
|
||||
* Custom pipeline components: https://alpha.spacy.io/usage/processing-pipelines#custom-components
|
||||
|
||||
Developed for: spaCy 2.0.0a17
|
||||
Last updated for: spaCy 2.0.0a18
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
from spacy.lang.en import English
|
||||
from spacy.matcher import PhraseMatcher
|
||||
from spacy.tokens import Doc, Span, Token
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
text=("Text to process", "positional", None, str),
|
||||
companies=("Names of technology companies", "positional", None, str))
|
||||
def main(text="Alphabet Inc. is the company behind Google.", *companies):
|
||||
# For simplicity, we start off with only the blank English Language class
|
||||
# and no model or pre-defined pipeline loaded.
|
||||
nlp = English()
|
||||
if not companies: # set default companies if none are set via args
|
||||
companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple'] # etc.
|
||||
component = TechCompanyRecognizer(nlp, companies) # initialise component
|
||||
nlp.add_pipe(component, last=True) # add last to the pipeline
|
||||
|
||||
doc = nlp(text)
|
||||
print('Pipeline', nlp.pipe_names) # pipeline contains component name
|
||||
print('Tokens', [t.text for t in doc]) # company names from the list are merged
|
||||
print('Doc has_tech_org', doc._.has_tech_org) # Doc contains tech orgs
|
||||
print('Token 0 is_tech_org', doc[0]._.is_tech_org) # "Alphabet Inc." is a tech org
|
||||
print('Token 1 is_tech_org', doc[1]._.is_tech_org) # "is" is not
|
||||
print('Entities', [(e.text, e.label_) for e in doc.ents]) # all orgs are entities
|
||||
|
||||
|
||||
class TechCompanyRecognizer(object):
|
||||
"""Example of a spaCy v2.0 pipeline component that sets entity annotations
|
||||
based on a list of single or multiple-word company names. Companies are
|
||||
|
@ -67,19 +101,13 @@ class TechCompanyRecognizer(object):
|
|||
return any([t._.get('is_tech_org') for t in tokens])
|
||||
|
||||
|
||||
# For simplicity, we start off with only the blank English Language class and
|
||||
# no model or pre-defined pipeline loaded.
|
||||
if __name__ == '__main__':
|
||||
plac.call(main)
|
||||
|
||||
nlp = English()
|
||||
companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple'] # etc.
|
||||
component = TechCompanyRecognizer(nlp, companies) # initialise component
|
||||
nlp.add_pipe(component, last=True) # add it to the pipeline as the last element
|
||||
|
||||
doc = nlp(u"Alphabet Inc. is the company behind Google.")
|
||||
|
||||
print('Pipeline', nlp.pipe_names) # pipeline contains component name
|
||||
print('Tokens', [t.text for t in doc]) # company names from the list are merged
|
||||
print('Doc has_tech_org', doc._.has_tech_org) # Doc contains tech orgs
|
||||
print('Token 0 is_tech_org', doc[0]._.is_tech_org) # "Alphabet Inc." is a tech org
|
||||
print('Token 1 is_tech_org', doc[1]._.is_tech_org) # "is" is not
|
||||
print('Entities', [(e.text, e.label_) for e in doc.ents]) # all orgs are entities
|
||||
# Expected output:
|
||||
# Pipeline ['tech_companies']
|
||||
# Tokens ['Alphabet Inc.', 'is', 'the', 'company', 'behind', 'Google', '.']
|
||||
# Doc has_tech_org True
|
||||
# Token 0 is_tech_org True
|
||||
# Token 1 is_tech_org False
|
||||
# Entities [('Alphabet Inc.', 'ORG'), ('Google', 'ORG')]
|
||||
|
|
Loading…
Reference in New Issue
Block a user