Mirror of https://github.com/explosion/spaCy.git (synced 2025-04-25 19:33:42 +03:00)
Update information extraction examples
This commit is contained in:
parent bca5372fb1
commit daed7ff8fe
@ -1,59 +0,0 @@ (deleted file)
"""Issue #252

Question:

In the documents and tutorials the main thing I haven't found is examples on
how to break sentences down into small sub thoughts/chunks. The noun_chunks is
handy, but having examples on using the token.head to find small
(near-complete) sentence chunks would be neat.

Lets take the example sentence on https://displacy.spacy.io/displacy/index.html

displaCy uses CSS and JavaScript to show you how computers understand language

This sentence has two main parts (XCOMP & CCOMP) according to the breakdown:

[displaCy] uses CSS and Javascript [to + show]
&
show you how computers understand [language]

I'm assuming that we can use the token.head to build these groups. In one of
your examples you had the following function.

def dependency_labels_to_root(token):
    '''Walk up the syntactic tree, collecting the arc labels.'''
    dep_labels = []
    while token.head is not token:
        dep_labels.append(token.dep)
        token = token.head
    return dep_labels
"""
from __future__ import print_function, unicode_literals

# Answer:
# The easiest way is to find the head of the subtree you want, and then use the
# `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree` is the
# one that does what you're asking for most directly:

from spacy.en import English

nlp = English()

doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language')
for word in doc:
    if word.dep_ in ('xcomp', 'ccomp'):
        print(''.join(w.text_with_ws for w in word.subtree))

# It'd probably be better for `word.subtree` to return a `Span` object instead
# of a generator over the tokens. If you want the `Span` you can get it via the
# `.right_edge` and `.left_edge` properties. The `Span` object is nice because
# you can easily get a vector, merge it, etc.

doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language')
for word in doc:
    if word.dep_ in ('xcomp', 'ccomp'):
        subtree_span = doc[word.left_edge.i : word.right_edge.i + 1]
        print(subtree_span.text, '|', subtree_span.root.text)
        print(subtree_span.similarity(doc))
        print(subtree_span.similarity(subtree_span.root))

# You might also want to select a head, and then select a start and end position by
# walking along its children. You could then take the `.left_edge` and `.right_edge`
# of those tokens, and use it to calculate a span.
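The question above quotes `dependency_labels_to_root` but never runs it. A minimal sketch of how it could be used, assuming a spaCy 2.x install with the `en_core_web_sm` model (this is illustration, not part of the deleted file; it collects the string labels via `.dep_` rather than the integer IDs):

import spacy

def dependency_labels_to_root(token):
    """Walk up the syntactic tree, collecting the arc labels as strings."""
    dep_labels = []
    while token.head is not token:
        dep_labels.append(token.dep_)
        token = token.head
    return dep_labels

nlp = spacy.load('en_core_web_sm')  # assumption: model is installed
doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language')
# Walk up from the last token, "language". With a typical English model this
# prints something like ['dobj', 'ccomp', 'xcomp'], i.e. the path
# language -> understand -> show -> uses.
print(dependency_labels_to_root(doc[-1]))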
@ -1,59 +0,0 @@ (deleted file)
import plac

from spacy.en import English
from spacy.parts_of_speech import NOUN
from spacy.parts_of_speech import ADP as PREP


def _span_to_tuple(span):
    start = span[0].idx
    end = span[-1].idx + len(span[-1])
    tag = span.root.tag_
    text = span.text
    label = span.label_
    return (start, end, tag, text, label)


def merge_spans(spans, doc):
    # This is a bit awkward atm. What we're doing here is merging the entities,
    # so that each only takes up a single token. But an entity is a Span, and
    # each Span is a view into the doc. When we merge a span, we invalidate
    # the other spans. This will get fixed --- but for now the solution
    # is to gather the information first, before merging.
    tuples = [_span_to_tuple(span) for span in spans]
    for span_tuple in tuples:
        doc.merge(*span_tuple)


def extract_currency_relations(doc):
    merge_spans(doc.ents, doc)
    merge_spans(doc.noun_chunks, doc)

    relations = []
    for money in filter(lambda w: w.ent_type_ == 'MONEY', doc):
        if money.dep_ in ('attr', 'dobj'):
            subject = [w for w in money.head.lefts if w.dep_ == 'nsubj']
            if subject:
                subject = subject[0]
                relations.append((subject, money))
        elif money.dep_ == 'pobj' and money.head.dep_ == 'prep':
            relations.append((money.head.head, money))

    return relations


def main():
    nlp = English()
    texts = [
        u'Net income was $9.4 million compared to the prior year of $2.7 million.',
        u'Revenue exceeded twelve billion dollars, with a loss of $1b.',
    ]

    for text in texts:
        doc = nlp(text)
        relations = extract_currency_relations(doc)
        for r1, r2 in relations:
            print(r1.text, r2.ent_type_, r2.text)


if __name__ == '__main__':
    plac.call(main)
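The gather-then-merge workaround in the deleted file was needed because merging one span invalidated the others. In newer spaCy releases (roughly 2.1+), the same step can be written with the retokenizer context manager; a minimal sketch, assuming `filter_spans` is available in `spacy.util`:

from spacy.util import filter_spans

def merge_entities_and_chunks(doc):
    # keep only non-overlapping spans, longest first
    spans = filter_spans(list(doc.ents) + list(doc.noun_chunks))
    # all merges are applied together when the context manager exits,
    # so earlier merges can't invalidate the later spans
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)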
examples/information_extraction/entity_relations.py (new file, 62 lines)
@ -0,0 +1,62 @@
#!/usr/bin/env python
# coding: utf8
"""
A simple example of extracting relations between phrases and entities using
spaCy's named entity recognizer and the dependency parse. Here, we extract
money and currency values (entities labelled as MONEY) and then check the
dependency tree to find the noun phrase they are referring to – for example:
$9.4 million --> Net income.

Last updated for: spaCy 2.0.0a18
"""
from __future__ import unicode_literals, print_function

import plac
import spacy


TEXTS = [
    'Net income was $9.4 million compared to the prior year of $2.7 million.',
    'Revenue exceeded twelve billion dollars, with a loss of $1b.',
]


@plac.annotations(
    model=("Model to load (needs parser and NER)", "positional", None, str))
def main(model='en_core_web_sm'):
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)
    print("Processing %d texts" % len(TEXTS))

    for text in TEXTS:
        doc = nlp(text)
        relations = extract_currency_relations(doc)
        for r1, r2 in relations:
            print('{:<10}\t{}\t{}'.format(r1.text, r2.ent_type_, r2.text))


def extract_currency_relations(doc):
    # merge entities and noun chunks into one token
    for span in [*list(doc.ents), *list(doc.noun_chunks)]:
        span.merge()

    relations = []
    for money in filter(lambda w: w.ent_type_ == 'MONEY', doc):
        if money.dep_ in ('attr', 'dobj'):
            subject = [w for w in money.head.lefts if w.dep_ == 'nsubj']
            if subject:
                subject = subject[0]
                relations.append((subject, money))
        elif money.dep_ == 'pobj' and money.head.dep_ == 'prep':
            relations.append((money.head.head, money))
    return relations


if __name__ == '__main__':
    plac.call(main)

# Expected output:
# Net income      MONEY   $9.4 million
# the prior year  MONEY   $2.7 million
# Revenue         MONEY   twelve billion dollars
# a loss          MONEY   1b
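The `pobj`/`prep` branch in `extract_currency_relations` relies on `money.head.head` reaching the noun the prepositional phrase modifies. A quick way to see why, on an unmerged doc (a sketch for illustration, not part of the committed file; the exact labels depend on the model, assumed here to be `en_core_web_sm`):

import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Net income was $9.4 million compared to the prior year of $2.7 million.')
for token in doc:
    print('{:<10} {:<8} <-- {}'.format(token.text, token.dep_, token.head.text))
# The root of the "$2.7 million" entity is typically "million", attached as
# pobj to "of", which is a prep on "year" -- so head.head lands on the noun
# phrase ("the prior year") once entities and noun chunks have been merged.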
examples/information_extraction/parse_subtrees.py (new file, 65 lines)
@ -0,0 +1,65 @@
#!/usr/bin/env python
# coding: utf8
"""
This example shows how to navigate the parse tree including subtrees attached
to a word.

Based on issue #252:
"In the documents and tutorials the main thing I haven't found is
examples on how to break sentences down into small sub thoughts/chunks. The
noun_chunks is handy, but having examples on using the token.head to find small
(near-complete) sentence chunks would be neat. Lets take the example sentence:
"displaCy uses CSS and JavaScript to show you how computers understand language"

This sentence has two main parts (XCOMP & CCOMP) according to the breakdown:
[displaCy] uses CSS and Javascript [to + show]
show you how computers understand [language]

I'm assuming that we can use the token.head to build these groups."

Last updated for: spaCy 2.0.0a18
"""
from __future__ import unicode_literals, print_function

import plac
import spacy


@plac.annotations(
    model=("Model to load", "positional", None, str))
def main(model='en_core_web_sm'):
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)

    doc = nlp("displaCy uses CSS and JavaScript to show you how computers "
              "understand language")

    # The easiest way is to find the head of the subtree you want, and then use
    # the `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree`
    # is the one that does what you're asking for most directly:
    for word in doc:
        if word.dep_ in ('xcomp', 'ccomp'):
            print(''.join(w.text_with_ws for w in word.subtree))

    # It'd probably be better for `word.subtree` to return a `Span` object
    # instead of a generator over the tokens. If you want the `Span` you can
    # get it via the `.right_edge` and `.left_edge` properties. The `Span`
    # object is nice because you can easily get a vector, merge it, etc.
    for word in doc:
        if word.dep_ in ('xcomp', 'ccomp'):
            subtree_span = doc[word.left_edge.i : word.right_edge.i + 1]
            print(subtree_span.text, '|', subtree_span.root.text)

    # You might also want to select a head, and then select a start and end
    # position by walking along its children. You could then take the
    # `.left_edge` and `.right_edge` of those tokens, and use it to calculate
    # a span.


if __name__ == '__main__':
    plac.call(main)

# Expected output:
# to show you how computers understand language
# how computers understand language
# to show you how computers understand language | show
# how computers understand language | understand
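The closing comment in the file describes a third approach, selecting a head and walking its children before taking `.left_edge` and `.right_edge`, but doesn't show code for it. A rough sketch of what that could look like (the helper name and label choices are illustrative, not from the example; assumes the `en_core_web_sm` model is installed):

import spacy

def span_for_head(head, deps=('dobj', 'ccomp', 'xcomp')):
    """Build a Span covering `head` plus any children with the given labels."""
    start, end = head.i, head.i
    for child in head.children:
        if child.dep_ in deps:
            start = min(start, child.left_edge.i)
            end = max(end, child.right_edge.i)
    return head.doc[start : end + 1]

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language')
for word in doc:
    if word.pos_ == 'VERB':
        # e.g. for "uses" this typically covers the verb, its direct object
        # and the whole "to show ..." clause attached as xcomp
        print(span_for_head(word).text)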
@ -196,8 +196,8 @@
     "teaser": "Full code examples you can modify and run.",
     "next": "resources",
     "menu": {
+        "Information Extraction": "information-extraction",
         "Pipeline": "pipeline",
-        "Matching": "matching",
         "Training": "training",
         "Deep Learning": "deep-learning"
     }
@ -2,6 +2,37 @@
 include ../_includes/_mixins

++section("information-extraction")
+    +h(3, "phrase-matcher") Using spaCy's phrase matcher
+        +tag-new(2)
+
+    p
+        | This example shows how to use the new
+        | #[+api("phrasematcher") #[code PhraseMatcher]] to efficiently find
+        | entities from a large terminology list.
+
+    +github("spacy", "examples/information_extraction/phrase_matcher.py")
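The phrase-matcher example file itself isn't shown in this diff. As a rough sketch of the usage the paragraph above describes, under the spaCy 2.x `PhraseMatcher` API this commit targets (the terminology list and match label are invented):

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load('en_core_web_sm')
matcher = PhraseMatcher(nlp.vocab)
terms = ['Barack Obama', 'Angela Merkel', 'Washington, D.C.']  # hypothetical terminology list
# patterns are Doc objects; make_doc avoids running the full pipeline on each term
matcher.add('TERMS', None, *[nlp.make_doc(term) for term in terms])

doc = nlp(u'Angela Merkel met Barack Obama in Washington, D.C.')
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)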
+
+    +h(3, "entity-relations") Extracting entity relations
+
+    p
+        | A simple example of extracting relations between phrases and
+        | entities using spaCy's named entity recognizer and the dependency
+        | parse. Here, we extract money and currency values (entities labelled
+        | as #[code MONEY]) and then check the dependency tree to find the
+        | noun phrase they are referring to – for example: "$9.4 million"
+        | → "Net income".
+
+    +github("spacy", "examples/information_extraction/entity_relations.py")
+
+    +h(3, "subtrees") Navigating the parse tree and subtrees
+
+    p
+        | This example shows how to navigate the parse tree including subtrees
+        | attached to a word.
+
+    +github("spacy", "examples/information_extraction/parse_subtrees.py")
+
 +section("pipeline")
     +h(3, "custom-components-entities") Custom pipeline components and attribute extensions
         +tag-new(2)
@ -40,26 +71,6 @@ include ../_includes/_mixins
     +github("spacy", "examples/pipeline/custom_attr_methods.py")

-+section("matching")
-    +h(3, "matcher") Using spaCy's rule-based matcher
-
-    p
-        | This example shows how to use spaCy's rule-based
-        | #[+api("matcher") #[code Matcher]] to find and label entities across
-        | documents.
-
-    +github("spacy", "examples/matcher_example.py")
-
-    +h(3, "phrase-matcher") Using spaCy's phrase matcher
-        +tag-new(2)
-
-    p
-        | This example shows how to use the new
-        | #[+api("phrasematcher") #[code PhraseMatcher]] to efficiently find
-        | entities from a large terminology list.
-
-    +github("spacy", "examples/phrase_matcher.py")
-
 +section("training")
     +h(3, "training-ner") Training spaCy's Named Entity Recognizer
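For reference, the rule-based `Matcher` usage that the removed section described looks roughly like this under the spaCy 2.x API (the token pattern and sentence are invented for illustration):

import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
# hypothetical token pattern: "Google" followed by a proper noun, e.g. "Google Maps"
pattern = [{'ORTH': 'Google'}, {'POS': 'PROPN'}]
matcher.add('GOOGLE_PRODUCT', None, pattern)

doc = nlp(u'I looked up the route on Google Maps.')
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)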