mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			86 lines
		
	
	
		
			4.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			86 lines
		
	
	
		
			4.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
# coding: utf-8
 | 
						||
from __future__ import unicode_literals
 | 
						||
 | 
						||
from spacy.lang.en import English
 | 
						||
from spacy.matcher import PhraseMatcher
 | 
						||
from spacy.tokens import Doc, Span, Token
 | 
						||
 | 
						||
 | 
						||
class TechCompanyRecognizer(object):
    """Example of a spaCy v2.0 pipeline component that sets entity annotations
    based on a list of single or multiple-word company names. Companies are
    labelled as ORG and their spans are merged into one token. Additionally,
    ._.has_tech_org and ._.is_tech_org are set on the Doc/Span and Token
    respectively."""
    name = 'tech_companies'  # component name, will show up in the pipeline

    def __init__(self, nlp, companies=tuple(), label='ORG'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.

        nlp: The shared language object; its vocab backs the matcher and it
            is called on every company name to build the pattern Docs.
        companies: Iterable of company names (single or multi-word strings).
        label: Entity label text assigned to every matched span.
        """
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher – it can now take Doc objects as patterns,
        # so even if the list of companies is long, it's very efficient
        patterns = [nlp(org) for org in companies]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('TECH_ORGS', None, *patterns)

        # Custom extensions are registered on the Token/Doc/Span classes, not
        # on this instance, so they are only registered if they don't exist
        # yet. This keeps a second instance of the component (or re-running
        # the setup code) from attempting to register the same name twice.

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        if not Token.has_extension('is_tech_org'):
            Token.set_extension('is_tech_org', default=False)

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_tech_org == True. The getter does
        # not read any instance state, so the first registration serves all
        # instances of the component.
        if not Doc.has_extension('has_tech_org'):
            Doc.set_extension('has_tech_org', getter=self.has_tech_org)
        if not Span.has_extension('has_tech_org'):
            Span.set_extension('has_tech_org', getter=self.has_tech_org)

    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.

        doc: The Doc to annotate; it is modified in place.
        RETURNS: The same Doc object.
        """
        matches = self.matcher(doc)
        spans = []  # keep the spans for later so we can merge them afterwards
        for _, start, end in matches:
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Set custom attribute on each token of the entity
            for token in entity:
                token._.set('is_tech_org', True)
        if spans:
            # Extend doc.ents with all new entities in one assignment instead
            # of rebuilding it for every match – be careful not to replace
            # the entities that are already there!
            doc.ents = list(doc.ents) + spans
        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities – otherwise, it would cause mismatched
            # indices!
            # NOTE(review): later spaCy releases deprecate Span.merge in favour
            # of Doc.retokenize – confirm against the targeted spaCy version.
            span.merge()
        return doc  # don't forget to return the Doc!

    def has_tech_org(self, tokens):
        """Getter for Doc and Span attributes. Returns True if one of the tokens
        is a tech org. Since the getter is only called when we access the
        attribute, we can refer to the Token's 'is_tech_org' attribute here,
        which is already set in the processing step.

        tokens: Iterable of tokens (this getter is registered for Doc and Span).
        RETURNS: True if any token has is_tech_org set, otherwise False.
        """
        # A generator lets any() stop at the first tech org instead of reading
        # the attribute of every token first.
        return any(t._.get('is_tech_org') for t in tokens)
 | 
						||
 | 
						||
 | 
						||
# To keep the example simple, only the blank English Language class is used:
# no model and no pre-defined pipeline are loaded.

nlp = English()
companies = [  # etc.
    'Alphabet Inc.',
    'Google',
    'Netflix',
    'Apple',
]
# Initialise the component, then add it to the pipeline as the last element.
component = TechCompanyRecognizer(nlp, companies)
nlp.add_pipe(component, last=True)

doc = nlp(u"Alphabet Inc. is the company behind Google.")

# The pipeline contains the component name.
print('Pipeline', nlp.pipe_names)

# Company names from the list are merged into single tokens.
token_texts = [t.text for t in doc]
print('Tokens', token_texts)

# The Doc contains tech orgs.
print('Doc has_tech_org', doc._.has_tech_org)

# "Alphabet Inc." is a tech org ...
print('Token 0 is_tech_org', doc[0]._.is_tech_org)
# ... while "is" is not.
print('Token 1 is_tech_org', doc[1]._.is_tech_org)

# All orgs are entities.
entities = [(e.text, e.label_) for e in doc.ents]
print('Entities', entities)
 |