2017-10-27 03:58:14 +03:00
|
|
|
|
#!/usr/bin/env python
|
2017-10-10 05:26:06 +03:00
|
|
|
|
# coding: utf-8
|
|
|
|
|
"""This example contains several snippets of methods that can be set via custom
|
|
|
|
|
Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like
|
|
|
|
|
they're "bound" to the object and are partially applied – i.e. the object
|
2017-10-27 03:58:14 +03:00
|
|
|
|
they're called on is passed in as the first argument.
|
|
|
|
|
|
2017-11-07 14:00:43 +03:00
|
|
|
|
* Custom pipeline components: https://spacy.io/usage/processing-pipelines#custom-components
|
2017-10-27 03:58:14 +03:00
|
|
|
|
|
2017-11-07 03:22:30 +03:00
|
|
|
|
Compatible with: spaCy v2.0.0+
|
2017-10-27 03:58:14 +03:00
|
|
|
|
"""
|
2017-10-27 04:55:04 +03:00
|
|
|
|
from __future__ import unicode_literals, print_function
|
2017-10-10 05:26:06 +03:00
|
|
|
|
|
2017-10-27 03:58:14 +03:00
|
|
|
|
import plac
|
2017-10-10 05:26:06 +03:00
|
|
|
|
from spacy.lang.en import English
|
2017-10-11 03:30:40 +03:00
|
|
|
|
from spacy.tokens import Doc, Span
|
2017-10-10 05:26:06 +03:00
|
|
|
|
from spacy import displacy
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
|
2017-10-27 03:58:14 +03:00
|
|
|
|
@plac.annotations(
    output_dir=("Output directory for saved HTML", "positional", None, Path),
)
def main(output_dir=None):
    # Demo driver: registers two custom Doc methods and exercises each one.
    # output_dir is forwarded to to_html; when it is None the rendered HTML
    # is printed instead of being saved.
    nlp = English()  # blank English pipeline, no statistical model required

    # --- custom method 1: tokens shared between two Docs ---
    Doc.set_extension('overlap', method=overlap_tokens)
    first_doc = nlp(u"Peach emoji is where it has always been.")
    second_doc = nlp(u"Peach is the superior emoji.")
    print("Text 1:", first_doc.text)
    print("Text 2:", second_doc.text)
    print("Overlapping tokens:", first_doc._.overlap(second_doc))

    # --- custom method 2: render the Doc with displaCy ---
    Doc.set_extension('to_html', method=to_html)
    apple_doc = nlp(u"This is a sentence about Apple.")
    # the entity is set by hand so the demo also works without a model
    org_label = nlp.vocab.strings['ORG']
    org_span = Span(apple_doc, 5, 6, label=org_label)
    apple_doc.ents = [org_span]
    print("Text:", apple_doc.text)
    apple_doc._.to_html(output=output_dir, style='ent')
|
|
|
|
|
|
|
|
|
|
|
2017-10-10 05:26:06 +03:00
|
|
|
|
def to_html(doc, output='/tmp', style='dep'):
    """Doc method extension for saving the current state as a displaCy
    visualization.

    doc (Doc): The document to render. When registered as a method extension
        and called as `doc._.to_html(...)`, it is passed in as the first
        argument.
    output (unicode or Path): Directory to save the HTML file into. It is
        created (including missing parent directories) if it doesn't exist.
        If None, the markup is printed instead of being saved.
    style (unicode): Visualization style, passed through to
        `displacy.render`.
    """
    # build the file name from the non-punctuation tokens among the first six
    words = [w.text for w in doc[:6] if not w.is_punct]
    file_name = '-'.join(words) + '.html'
    html = displacy.render(doc, style=style, page=True)  # render markup
    if output is None:
        print(html)
        return
    output_path = Path(output)
    if not output_path.exists():
        # parents=True: a nested output directory shouldn't make the save fail
        output_path.mkdir(parents=True)
    output_file = output_path / file_name
    # use a context manager so the handle is flushed and closed right away
    # instead of whenever the file object happens to be garbage-collected
    with output_file.open('w', encoding='utf-8') as file_:
        file_.write(html)  # save to file
    print('Saved HTML to {}'.format(output_file))
|
2017-10-10 05:26:06 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def overlap_tokens(doc, other_doc):
    """Get the tokens from the original Doc that are also in the comparison Doc.

    Tokens are matched by their exact `text`.

    doc (Doc): The original document; its tokens are the ones returned.
    other_doc (Doc): The document to compare against.
    RETURNS (list): The tokens of `doc`, in their original order (repeats
        included), whose text also occurs in `other_doc`.
    """
    # collect the comparison texts once in a set, so each membership test
    # below is a constant-time lookup rather than a scan of a list
    other_texts = {token.text for token in other_doc}
    return [token for token in doc if token.text in other_texts]
|
|
|
|
|
|
|
|
|
|
|
2017-10-27 03:58:14 +03:00
|
|
|
|
# Script entry point: main is handed to plac rather than called directly, so
# its output_dir argument (declared in the @plac.annotations decorator on
# main) can be supplied on the command line.
if __name__ == '__main__':
    plac.call(main)
|
2017-10-10 05:26:06 +03:00
|
|
|
|
|
2017-10-27 03:58:14 +03:00
|
|
|
|
# Expected output:
|
|
|
|
|
# Text 1: Peach emoji is where it has always been.
|
|
|
|
|
# Text 2: Peach is the superior emoji.
|
|
|
|
|
# Overlapping tokens: [Peach, emoji, is, .]
|