Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 01:48:04 +03:00)

Update information extraction examples

This commit is contained in:
    parent bca5372fb1
    commit daed7ff8fe
Deleted file:
@ -1,59 +0,0 @@
"""Issue #252

Question:

In the documents and tutorials the main thing I haven't found is examples on how to break sentences down into small sub thoughts/chunks. The noun_chunks is handy, but having examples on using the token.head to find small (near-complete) sentence chunks would be neat.

Lets take the example sentence on https://displacy.spacy.io/displacy/index.html

displaCy uses CSS and JavaScript to show you how computers understand language
This sentence has two main parts (XCOMP & CCOMP) according to the breakdown:

[displaCy] uses CSS and Javascript [to + show]
&
show you how computers understand [language]
I'm assuming that we can use the token.head to build these groups. In one of your examples you had the following function.

def dependency_labels_to_root(token):
    '''Walk up the syntactic tree, collecting the arc labels.'''
    dep_labels = []
    while token.head is not token:
        dep_labels.append(token.dep)
        token = token.head
    return dep_labels
"""
from __future__ import print_function, unicode_literals

# Answer:
# The easiest way is to find the head of the subtree you want, and then use the
# `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree` is the
# one that does what you're asking for most directly:

from spacy.en import English
nlp = English()

doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language')
for word in doc:
    if word.dep_ in ('xcomp', 'ccomp'):
        print(''.join(w.text_with_ws for w in word.subtree))

# It'd probably be better for `word.subtree` to return a `Span` object instead
# of a generator over the tokens. If you want the `Span` you can get it via the
# `.right_edge` and `.left_edge` properties. The `Span` object is nice because
# you can easily get a vector, merge it, etc.

doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language')
for word in doc:
    if word.dep_ in ('xcomp', 'ccomp'):
        subtree_span = doc[word.left_edge.i : word.right_edge.i + 1]
        print(subtree_span.text, '|', subtree_span.root.text)
        print(subtree_span.similarity(doc))
        print(subtree_span.similarity(subtree_span.root))


# You might also want to select a head, and then select a start and end position by
# walking along its children. You could then take the `.left_edge` and `.right_edge`
# of those tokens, and use it to calculate a span.
Deleted file:
@ -1,59 +0,0 @@
import plac

from spacy.en import English
from spacy.parts_of_speech import NOUN
from spacy.parts_of_speech import ADP as PREP


def _span_to_tuple(span):
    start = span[0].idx
    end = span[-1].idx + len(span[-1])
    tag = span.root.tag_
    text = span.text
    label = span.label_
    return (start, end, tag, text, label)

def merge_spans(spans, doc):
    # This is a bit awkward atm. What we're doing here is merging the entities,
    # so that each only takes up a single token. But an entity is a Span, and
    # each Span is a view into the doc. When we merge a span, we invalidate
    # the other spans. This will get fixed --- but for now the solution
    # is to gather the information first, before merging.
    tuples = [_span_to_tuple(span) for span in spans]
    for span_tuple in tuples:
        doc.merge(*span_tuple)


def extract_currency_relations(doc):
    merge_spans(doc.ents, doc)
    merge_spans(doc.noun_chunks, doc)

    relations = []
    for money in filter(lambda w: w.ent_type_ == 'MONEY', doc):
        if money.dep_ in ('attr', 'dobj'):
            subject = [w for w in money.head.lefts if w.dep_ == 'nsubj']
            if subject:
                subject = subject[0]
                relations.append((subject, money))
        elif money.dep_ == 'pobj' and money.head.dep_ == 'prep':
            relations.append((money.head.head, money))

    return relations


def main():
    nlp = English()
    texts = [
        u'Net income was $9.4 million compared to the prior year of $2.7 million.',
        u'Revenue exceeded twelve billion dollars, with a loss of $1b.',
    ]

    for text in texts:
        doc = nlp(text)
        relations = extract_currency_relations(doc)
        for r1, r2 in relations:
            print(r1.text, r2.ent_type_, r2.text)


if __name__ == '__main__':
    plac.call(main)
62  examples/information_extraction/entity_relations.py  Normal file
@ -0,0 +1,62 @@
#!/usr/bin/env python
# coding: utf8
"""
A simple example of extracting relations between phrases and entities using
spaCy's named entity recognizer and the dependency parse. Here, we extract
money and currency values (entities labelled as MONEY) and then check the
dependency tree to find the noun phrase they are referring to – for example:
$9.4 million --> Net income.

Last updated for: spaCy 2.0.0a18
"""
from __future__ import unicode_literals, print_function

import plac
import spacy


TEXTS = [
    'Net income was $9.4 million compared to the prior year of $2.7 million.',
    'Revenue exceeded twelve billion dollars, with a loss of $1b.',
]


@plac.annotations(
    model=("Model to load (needs parser and NER)", "positional", None, str))
def main(model='en_core_web_sm'):
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)
    print("Processing %d texts" % len(TEXTS))

    for text in TEXTS:
        doc = nlp(text)
        relations = extract_currency_relations(doc)
        for r1, r2 in relations:
            print('{:<10}\t{}\t{}'.format(r1.text, r2.ent_type_, r2.text))


def extract_currency_relations(doc):
    # merge entities and noun chunks into one token
    for span in [*list(doc.ents), *list(doc.noun_chunks)]:
        span.merge()

    relations = []
    for money in filter(lambda w: w.ent_type_ == 'MONEY', doc):
        if money.dep_ in ('attr', 'dobj'):
            subject = [w for w in money.head.lefts if w.dep_ == 'nsubj']
            if subject:
                subject = subject[0]
                relations.append((subject, money))
        elif money.dep_ == 'pobj' and money.head.dep_ == 'prep':
            relations.append((money.head.head, money))
    return relations


if __name__ == '__main__':
    plac.call(main)

    # Expected output:
    # Net income      MONEY   $9.4 million
    # the prior year  MONEY   $2.7 million
    # Revenue         MONEY   twelve billion dollars
    # a loss          MONEY   1b
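The relation logic in entity_relations.py hinges on a handful of dependency labels ('nsubj', 'attr', 'dobj', 'prep', 'pobj'). A quick sketch like the one below, which is not part of this commit and assumes the en_core_web_sm model is installed, prints each token's arc label and head for the first example sentence, which makes it easier to see why those checks recover the right noun phrase for each MONEY entity.

# Not part of this commit: inspect the dependency arcs that
# extract_currency_relations() relies on. Assumes en_core_web_sm is installed;
# exact labels can vary between parser versions.
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('Net income was $9.4 million compared to the prior year of $2.7 million.')
for token in doc:
    # e.g. "income  nsubj  was", "million  attr  was", "million  pobj  of"
    print('{:<10} {:<8} {:<10}'.format(token.text, token.dep_, token.head.text))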
65  examples/information_extraction/parse_subtrees.py  Normal file
@ -0,0 +1,65 @@
#!/usr/bin/env python
# coding: utf8
"""
This example shows how to navigate the parse tree including subtrees attached
to a word.

Based on issue #252:
"In the documents and tutorials the main thing I haven't found is
examples on how to break sentences down into small sub thoughts/chunks. The
noun_chunks is handy, but having examples on using the token.head to find small
(near-complete) sentence chunks would be neat. Lets take the example sentence:
"displaCy uses CSS and JavaScript to show you how computers understand language"

This sentence has two main parts (XCOMP & CCOMP) according to the breakdown:
[displaCy] uses CSS and Javascript [to + show]
show you how computers understand [language]

I'm assuming that we can use the token.head to build these groups."

Last updated for: spaCy 2.0.0a18
"""
from __future__ import unicode_literals, print_function

import plac
import spacy


@plac.annotations(
    model=("Model to load", "positional", None, str))
def main(model='en_core_web_sm'):
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)

    doc = nlp("displaCy uses CSS and JavaScript to show you how computers "
              "understand language")

    # The easiest way is to find the head of the subtree you want, and then use
    # the `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree`
    # is the one that does what you're asking for most directly:
    for word in doc:
        if word.dep_ in ('xcomp', 'ccomp'):
            print(''.join(w.text_with_ws for w in word.subtree))

    # It'd probably be better for `word.subtree` to return a `Span` object
    # instead of a generator over the tokens. If you want the `Span` you can
    # get it via the `.right_edge` and `.left_edge` properties. The `Span`
    # object is nice because you can easily get a vector, merge it, etc.
    for word in doc:
        if word.dep_ in ('xcomp', 'ccomp'):
            subtree_span = doc[word.left_edge.i : word.right_edge.i + 1]
            print(subtree_span.text, '|', subtree_span.root.text)

    # You might also want to select a head, and then select a start and end
    # position by walking along its children. You could then take the
    # `.left_edge` and `.right_edge` of those tokens, and use it to calculate
    # a span.

if __name__ == '__main__':
    plac.call(main)

    # Expected output:
    # to show you how computers understand language
    # how computers understand language
    # to show you how computers understand language | show
    # how computers understand language | understand
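The closing comment in parse_subtrees.py describes a third option: select a head, walk along its children, and use their .left_edge and .right_edge to compute a span, but it does not show code for it. A rough sketch of that idea follows; it is not part of the committed example, and the choice of head and of which children to keep is purely illustrative.

# Not part of the committed example: pick a head, walk its children, and use
# their .left_edge / .right_edge to compute a span. The filters below are
# illustrative only.
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('displaCy uses CSS and JavaScript to show you how computers '
          'understand language')

for word in doc:
    if word.pos_ == 'VERB':
        # Keep only some of the verb's children, e.g. objects and complements.
        selected = [child for child in word.children
                    if child.dep_ in ('dobj', 'xcomp', 'ccomp')]
        if selected:
            # The edges of the selected subtrees give the span boundaries.
            start = min(child.left_edge.i for child in selected)
            end = max(child.right_edge.i for child in selected) + 1
            print(word.text, '->', doc[start:end].text)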
Modified file:
@ -196,8 +196,8 @@
         "teaser": "Full code examples you can modify and run.",
         "next": "resources",
         "menu": {
+            "Information Extraction": "information-extraction",
             "Pipeline": "pipeline",
-            "Matching": "matching",
             "Training": "training",
             "Deep Learning": "deep-learning"
         }
Modified file:
@ -2,6 +2,37 @@
 
 include ../_includes/_mixins
 
++section("information-extraction")
+    +h(3, "phrase-matcher") Using spaCy's phrase matcher
+        +tag-new(2)
+
+    p
+        |  This example shows how to use the new
+        |  #[+api("phrasematcher") #[code PhraseMatcher]] to efficiently find
+        |  entities from a large terminology list.
+
+    +github("spacy", "examples/information_extraction/phrase_matcher.py")
+
+    +h(3, "entity-relations") Extracting entity relations
+
+    p
+        |  A simple example of extracting relations between phrases and
+        |  entities using spaCy's named entity recognizer and the dependency
+        |  parse. Here, we extract money and currency values (entities labelled
+        |  as #[code MONEY]) and then check the dependency tree to find the
+        |  noun phrase they are referring to – for example: "$9.4 million"
+        |  → "Net income".
+
+    +github("spacy", "examples/information_extraction/entity_relations.py")
+
+    +h(3, "subtrees") Navigating the parse tree and subtrees
+
+    p
+        |  This example shows how to navigate the parse tree including subtrees
+        |  attached to a word.
+
+    +github("spacy", "examples/information_extraction/parse_subtrees.py")
+
 +section("pipeline")
     +h(3, "custom-components-entities") Custom pipeline components and attribute extensions
         +tag-new(2)
@ -40,26 +71,6 @@ include ../_includes/_mixins
 
     +github("spacy", "examples/pipeline/custom_attr_methods.py")
 
-+section("matching")
-    +h(3, "matcher") Using spaCy's rule-based matcher
-
-    p
-        |  This example shows how to use spaCy's rule-based
-        |  #[+api("matcher") #[code Matcher]] to find and label entities across
-        |  documents.
-
-    +github("spacy", "examples/matcher_example.py")
-
-    +h(3, "phrase-matcher") Using spaCy's phrase matcher
-        +tag-new(2)
-
-    p
-        |  This example shows how to use the new
-        |  #[+api("phrasematcher") #[code PhraseMatcher]] to efficiently find
-        |  entities from a large terminology list.
-
-    +github("spacy", "examples/phrase_matcher.py")
-
 +section("training")
     +h(3, "training-ner") Training spaCy's Named Entity Recognizer
 
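The new information-extraction section also links to examples/information_extraction/phrase_matcher.py, which is not part of this diff. As rough orientation only, a minimal sketch of the spaCy 2.x PhraseMatcher usage that entry describes; the terminology list and the 'TECH_TERMS' label are invented for illustration.

# Minimal sketch only -- not the contents of phrase_matcher.py. In spaCy 2.x
# the patterns are Doc objects and matcher.add() takes an optional on_match
# callback (None here). Terms and label are invented for illustration.
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load('en_core_web_sm')
terms = ['natural language processing', 'deep learning', 'named entity recognition']

matcher = PhraseMatcher(nlp.vocab)
matcher.add('TECH_TERMS', None, *[nlp(term) for term in terms])

doc = nlp('She works on natural language processing and deep learning.')
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], '->', doc[start:end].text)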