# coding: utf-8
"""This example contains several snippets of methods that can be set via custom
Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like
they're "bound" to the object and are partially applied – i.e. the object
they're called on is passed in as the first argument."""
from __future__ import unicode_literals

from spacy.lang.en import English
from spacy.tokens import Doc, Span
from spacy import displacy
from pathlib import Path


def to_html(doc, output='/tmp', style='dep'):
    """Doc method extension for saving the current state as a displaCy
    visualization.
    """
    # generate filename from first six non-punct tokens
    file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html'
    output_path = Path(output) / file_name
    html = displacy.render(doc, style=style, page=True)  # render markup
    output_path.open('w', encoding='utf-8').write(html)  # save to file
    print('Saved HTML to {}'.format(output_path))


Doc.set_extension('to_html', method=to_html)

nlp = English()
doc = nlp(u"This is a sentence about Apple.")
# add entity manually for demo purposes, to make it work without a model
doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])]
doc._.to_html(style='ent')
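
# --- Additional sketch (not part of the original example) -----------------
# The module docstring above also mentions Span attributes, so here is the
# same pattern applied to a Span, purely as an illustration. The name
# 'to_text_list' is a hypothetical extension name, not part of spaCy's API.
def to_text_list(span):
    """Span method extension returning the span's token texts as a list."""
    return [token.text for token in span]


Span.set_extension('to_text_list', method=to_text_list)

span_doc = nlp(u"New York is a city.")
print(span_doc[0:2]._.to_text_list())  # ['New', 'York']
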
def overlap_tokens(doc, other_doc):
    """Get the tokens from the original Doc that are also in the comparison Doc.
    """
    overlap = []
    other_tokens = [token.text for token in other_doc]
    for token in doc:
        if token.text in other_tokens:
            overlap.append(token)
    return overlap


Doc.set_extension('overlap', method=overlap_tokens)

nlp = English()
doc1 = nlp(u"Peach emoji is where it has always been.")
doc2 = nlp(u"Peach is the superior emoji.")
tokens = doc1._.overlap(doc2)
print(tokens)
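
# --- Additional sketch (not part of the original example) -----------------
# Token attributes follow the same partially-applied pattern; 'is_in' is a
# hypothetical method name used only for illustration here.
from spacy.tokens import Token


def is_in(token, word_list):
    """Token method extension checking whether the token's text is in a list."""
    return token.text in word_list


Token.set_extension('is_in', method=is_in)

doc3 = nlp(u"Peach emoji")
print(doc3[0]._.is_in(['Peach', 'Apple']))  # True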