Mirror of https://github.com/explosion/spaCy.git (synced 2025-04-25 19:33:42 +03:00)
Update information extraction examples
This commit is contained in:
parent bca5372fb1
commit daed7ff8fe
@ -1,59 +0,0 @@ (deleted file)
"""Issue #252

Question:

In the documents and tutorials the main thing I haven't found is examples on
how to break sentences down into small sub thoughts/chunks. The noun_chunks is
handy, but having examples on using the token.head to find small
(near-complete) sentence chunks would be neat.

Lets take the example sentence on https://displacy.spacy.io/displacy/index.html

displaCy uses CSS and JavaScript to show you how computers understand language

This sentence has two main parts (XCOMP & CCOMP) according to the breakdown:

[displaCy] uses CSS and Javascript [to + show]
&
show you how computers understand [language]

I'm assuming that we can use the token.head to build these groups. In one of
your examples you had the following function.

def dependency_labels_to_root(token):
    '''Walk up the syntactic tree, collecting the arc labels.'''
    dep_labels = []
    while token.head is not token:
        dep_labels.append(token.dep)
        token = token.head
    return dep_labels
"""
from __future__ import print_function, unicode_literals

# Answer:
# The easiest way is to find the head of the subtree you want, and then use the
# `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree` is the
# one that does what you're asking for most directly:

from spacy.en import English

nlp = English()

doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language')
for word in doc:
    if word.dep_ in ('xcomp', 'ccomp'):
        print(''.join(w.text_with_ws for w in word.subtree))

# It'd probably be better for `word.subtree` to return a `Span` object instead
# of a generator over the tokens. If you want the `Span` you can get it via the
# `.right_edge` and `.left_edge` properties. The `Span` object is nice because
# you can easily get a vector, merge it, etc.

doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language')
for word in doc:
    if word.dep_ in ('xcomp', 'ccomp'):
        subtree_span = doc[word.left_edge.i : word.right_edge.i + 1]
        print(subtree_span.text, '|', subtree_span.root.text)
        print(subtree_span.similarity(doc))
        print(subtree_span.similarity(subtree_span.root))

# You might also want to select a head, and then select a start and end position by
# walking along its children. You could then take the `.left_edge` and `.right_edge`
# of those tokens, and use it to calculate a span.
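The question above quotes `dependency_labels_to_root` but never runs it. A minimal sketch of how it could be used, assuming a spaCy 2.x install with the `en_core_web_sm` model (this is illustration, not part of the deleted file; it collects the string labels via `.dep_` rather than the integer IDs):

import spacy

def dependency_labels_to_root(token):
    """Walk up the syntactic tree, collecting the arc labels as strings."""
    dep_labels = []
    while token.head is not token:
        dep_labels.append(token.dep_)
        token = token.head
    return dep_labels

nlp = spacy.load('en_core_web_sm')  # assumption: model is installed
doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language')
# Walk up from the last token, "language". With a typical English model this
# prints something like ['dobj', 'ccomp', 'xcomp'], i.e. the path
# language -> understand -> show -> uses.
print(dependency_labels_to_root(doc[-1]))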
@ -1,59 +0,0 @@ (deleted file)
import plac

from spacy.en import English
from spacy.parts_of_speech import NOUN
from spacy.parts_of_speech import ADP as PREP


def _span_to_tuple(span):
    start = span[0].idx
    end = span[-1].idx + len(span[-1])
    tag = span.root.tag_
    text = span.text
    label = span.label_
    return (start, end, tag, text, label)


def merge_spans(spans, doc):
    # This is a bit awkward atm. What we're doing here is merging the entities,
    # so that each only takes up a single token. But an entity is a Span, and
    # each Span is a view into the doc. When we merge a span, we invalidate
    # the other spans. This will get fixed --- but for now the solution
    # is to gather the information first, before merging.
    tuples = [_span_to_tuple(span) for span in spans]
    for span_tuple in tuples:
        doc.merge(*span_tuple)


def extract_currency_relations(doc):
    merge_spans(doc.ents, doc)
    merge_spans(doc.noun_chunks, doc)

    relations = []
    for money in filter(lambda w: w.ent_type_ == 'MONEY', doc):
        if money.dep_ in ('attr', 'dobj'):
            subject = [w for w in money.head.lefts if w.dep_ == 'nsubj']
            if subject:
                subject = subject[0]
                relations.append((subject, money))
        elif money.dep_ == 'pobj' and money.head.dep_ == 'prep':
            relations.append((money.head.head, money))

    return relations


def main():
    nlp = English()
    texts = [
        u'Net income was $9.4 million compared to the prior year of $2.7 million.',
        u'Revenue exceeded twelve billion dollars, with a loss of $1b.',
    ]

    for text in texts:
        doc = nlp(text)
        relations = extract_currency_relations(doc)
        for r1, r2 in relations:
            print(r1.text, r2.ent_type_, r2.text)


if __name__ == '__main__':
    plac.call(main)
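The gather-then-merge workaround in the deleted file was needed because merging one span invalidated the others. In newer spaCy releases (roughly 2.1+), the same step can be written with the retokenizer context manager; a minimal sketch, assuming `filter_spans` is available in `spacy.util`:

from spacy.util import filter_spans

def merge_entities_and_chunks(doc):
    # keep only non-overlapping spans, longest first
    spans = filter_spans(list(doc.ents) + list(doc.noun_chunks))
    # all merges are applied together when the context manager exits,
    # so earlier merges can't invalidate the later spans
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)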
examples/information_extraction/entity_relations.py (new file, 62 lines)
@ -0,0 +1,62 @@
#!/usr/bin/env python
# coding: utf8
"""
A simple example of extracting relations between phrases and entities using
spaCy's named entity recognizer and the dependency parse. Here, we extract
money and currency values (entities labelled as MONEY) and then check the
dependency tree to find the noun phrase they are referring to – for example:
$9.4 million --> Net income.

Last updated for: spaCy 2.0.0a18
"""
from __future__ import unicode_literals, print_function

import plac
import spacy


TEXTS = [
    'Net income was $9.4 million compared to the prior year of $2.7 million.',
    'Revenue exceeded twelve billion dollars, with a loss of $1b.',
]


@plac.annotations(
    model=("Model to load (needs parser and NER)", "positional", None, str))
def main(model='en_core_web_sm'):
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)
    print("Processing %d texts" % len(TEXTS))

    for text in TEXTS:
        doc = nlp(text)
        relations = extract_currency_relations(doc)
        for r1, r2 in relations:
            print('{:<10}\t{}\t{}'.format(r1.text, r2.ent_type_, r2.text))


def extract_currency_relations(doc):
    # merge entities and noun chunks into one token
    for span in [*list(doc.ents), *list(doc.noun_chunks)]:
        span.merge()

    relations = []
    for money in filter(lambda w: w.ent_type_ == 'MONEY', doc):
        if money.dep_ in ('attr', 'dobj'):
            subject = [w for w in money.head.lefts if w.dep_ == 'nsubj']
            if subject:
                subject = subject[0]
                relations.append((subject, money))
        elif money.dep_ == 'pobj' and money.head.dep_ == 'prep':
            relations.append((money.head.head, money))
    return relations


if __name__ == '__main__':
    plac.call(main)

# Expected output:
# Net income      MONEY   $9.4 million
# the prior year  MONEY   $2.7 million
# Revenue         MONEY   twelve billion dollars
# a loss          MONEY   1b
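The `pobj`/`prep` branch in `extract_currency_relations` relies on `money.head.head` reaching the noun the prepositional phrase modifies. A quick way to see why, on an unmerged doc (a sketch for illustration, not part of the committed file; the exact labels depend on the model, assumed here to be `en_core_web_sm`):

import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Net income was $9.4 million compared to the prior year of $2.7 million.')
for token in doc:
    print('{:<10} {:<8} <-- {}'.format(token.text, token.dep_, token.head.text))
# The root of the "$2.7 million" entity is typically "million", attached as
# pobj to "of", which is a prep on "year" -- so head.head lands on the noun
# phrase ("the prior year") once entities and noun chunks have been merged.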
examples/information_extraction/parse_subtrees.py (new file, 65 lines)
@ -0,0 +1,65 @@
#!/usr/bin/env python
# coding: utf8
"""
This example shows how to navigate the parse tree including subtrees attached
to a word.

Based on issue #252:
"In the documents and tutorials the main thing I haven't found is
examples on how to break sentences down into small sub thoughts/chunks. The
noun_chunks is handy, but having examples on using the token.head to find small
(near-complete) sentence chunks would be neat. Lets take the example sentence:
"displaCy uses CSS and JavaScript to show you how computers understand language"

This sentence has two main parts (XCOMP & CCOMP) according to the breakdown:
[displaCy] uses CSS and Javascript [to + show]
show you how computers understand [language]

I'm assuming that we can use the token.head to build these groups."

Last updated for: spaCy 2.0.0a18
"""
from __future__ import unicode_literals, print_function

import plac
import spacy


@plac.annotations(
    model=("Model to load", "positional", None, str))
def main(model='en_core_web_sm'):
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)

    doc = nlp("displaCy uses CSS and JavaScript to show you how computers "
              "understand language")

    # The easiest way is to find the head of the subtree you want, and then use
    # the `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree`
    # is the one that does what you're asking for most directly:
    for word in doc:
        if word.dep_ in ('xcomp', 'ccomp'):
            print(''.join(w.text_with_ws for w in word.subtree))

    # It'd probably be better for `word.subtree` to return a `Span` object
    # instead of a generator over the tokens. If you want the `Span` you can
    # get it via the `.right_edge` and `.left_edge` properties. The `Span`
    # object is nice because you can easily get a vector, merge it, etc.
    for word in doc:
        if word.dep_ in ('xcomp', 'ccomp'):
            subtree_span = doc[word.left_edge.i : word.right_edge.i + 1]
            print(subtree_span.text, '|', subtree_span.root.text)

    # You might also want to select a head, and then select a start and end
    # position by walking along its children. You could then take the
    # `.left_edge` and `.right_edge` of those tokens, and use it to calculate
    # a span.


if __name__ == '__main__':
    plac.call(main)

# Expected output:
# to show you how computers understand language
# how computers understand language
# to show you how computers understand language | show
# how computers understand language | understand
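The closing comment in the file describes a third approach, selecting a head and walking its children before taking `.left_edge` and `.right_edge`, but doesn't show code for it. A rough sketch of what that could look like (the helper name and label choices are illustrative, not from the example; assumes the `en_core_web_sm` model is installed):

import spacy

def span_for_head(head, deps=('dobj', 'ccomp', 'xcomp')):
    """Build a Span covering `head` plus any children with the given labels."""
    start, end = head.i, head.i
    for child in head.children:
        if child.dep_ in deps:
            start = min(start, child.left_edge.i)
            end = max(end, child.right_edge.i)
    return head.doc[start : end + 1]

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language')
for word in doc:
    if word.pos_ == 'VERB':
        # e.g. for "uses" this typically covers the verb, its direct object
        # and the whole "to show ..." clause attached as xcomp
        print(span_for_head(word).text)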
@ -196,8 +196,8 @@
     "teaser": "Full code examples you can modify and run.",
     "next": "resources",
     "menu": {
+        "Information Extraction": "information-extraction",
         "Pipeline": "pipeline",
-        "Matching": "matching",
         "Training": "training",
         "Deep Learning": "deep-learning"
     }
@ -2,6 +2,37 @@
 include ../_includes/_mixins

++section("information-extraction")
+    +h(3, "phrase-matcher") Using spaCy's phrase matcher
+        +tag-new(2)
+
+    p
+        | This example shows how to use the new
+        | #[+api("phrasematcher") #[code PhraseMatcher]] to efficiently find
+        | entities from a large terminology list.
+
+    +github("spacy", "examples/information_extraction/phrase_matcher.py")
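The phrase-matcher example file itself isn't shown in this diff. As a rough sketch of the usage the paragraph above describes, under the spaCy 2.x `PhraseMatcher` API this commit targets (the terminology list and match label are invented):

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load('en_core_web_sm')
matcher = PhraseMatcher(nlp.vocab)
terms = ['Barack Obama', 'Angela Merkel', 'Washington, D.C.']  # hypothetical terminology list
# patterns are Doc objects; make_doc avoids running the full pipeline on each term
matcher.add('TERMS', None, *[nlp.make_doc(term) for term in terms])

doc = nlp(u'Angela Merkel met Barack Obama in Washington, D.C.')
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)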
+
+    +h(3, "entity-relations") Extracting entity relations
+
+    p
+        | A simple example of extracting relations between phrases and
+        | entities using spaCy's named entity recognizer and the dependency
+        | parse. Here, we extract money and currency values (entities labelled
+        | as #[code MONEY]) and then check the dependency tree to find the
+        | noun phrase they are referring to – for example: "$9.4 million"
+        | → "Net income".
+
+    +github("spacy", "examples/information_extraction/entity_relations.py")
+
+    +h(3, "subtrees") Navigating the parse tree and subtrees
+
+    p
+        | This example shows how to navigate the parse tree including subtrees
+        | attached to a word.
+
+    +github("spacy", "examples/information_extraction/parse_subtrees.py")
+
 +section("pipeline")
     +h(3, "custom-components-entities") Custom pipeline components and attribute extensions
         +tag-new(2)
@ -40,26 +71,6 @@ include ../_includes/_mixins
     +github("spacy", "examples/pipeline/custom_attr_methods.py")

-+section("matching")
-    +h(3, "matcher") Using spaCy's rule-based matcher
-
-    p
-        | This example shows how to use spaCy's rule-based
-        | #[+api("matcher") #[code Matcher]] to find and label entities across
-        | documents.
-
-    +github("spacy", "examples/matcher_example.py")
-
-    +h(3, "phrase-matcher") Using spaCy's phrase matcher
-        +tag-new(2)
-
-    p
-        | This example shows how to use the new
-        | #[+api("phrasematcher") #[code PhraseMatcher]] to efficiently find
-        | entities from a large terminology list.
-
-    +github("spacy", "examples/phrase_matcher.py")
-
 +section("training")
     +h(3, "training-ner") Training spaCy's Named Entity Recognizer
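For reference, the rule-based `Matcher` usage that the removed section described looks roughly like this under the spaCy 2.x API (the token pattern and sentence are invented for illustration):

import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
# hypothetical token pattern: "Google" followed by a proper noun, e.g. "Google Maps"
pattern = [{'ORTH': 'Google'}, {'POS': 'PROPN'}]
matcher.add('GOOGLE_PRODUCT', None, pattern)

doc = nlp(u'I looked up the route on Google Maps.')
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)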