Merge pull request #658 from pokey/master

Add noun_chunks to Span
This commit is contained in:
Matthew Honnibal 2016-11-24 23:33:57 +11:00 committed by GitHub
commit 1f247959f3
3 changed files with 37 additions and 4 deletions

View File

@ -1,13 +1,16 @@
from spacy.parts_of_speech cimport NOUN, PROPN, PRON from spacy.parts_of_speech cimport NOUN, PROPN, PRON
def english_noun_chunks(doc): def english_noun_chunks(obj):
'''Detect base noun phrases from a dependency parse.
Works on both Doc and Span.'''
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
'attr', 'ROOT', 'root'] 'attr', 'ROOT', 'root']
doc = obj.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings[label] for label in labels] np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings['conj'] conj = doc.vocab.strings['conj']
np_label = doc.vocab.strings['NP'] np_label = doc.vocab.strings['NP']
for i, word in enumerate(doc): for i, word in enumerate(obj):
if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps: if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
yield word.left_edge.i, word.i+1, np_label yield word.left_edge.i, word.i+1, np_label
elif word.pos == NOUN and word.dep == conj: elif word.pos == NOUN and word.dep == conj:
@ -25,14 +28,15 @@ def english_noun_chunks(doc):
# extended to the right of the NOUN # extended to the right of the NOUN
# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not # example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
# just "eine Tasse", same for "das Thema Familie" # just "eine Tasse", same for "das Thema Familie"
def german_noun_chunks(doc): def german_noun_chunks(obj):
labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app'] labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app']
doc = obj.doc # Ensure works on both Doc and Span.
np_label = doc.vocab.strings['NP'] np_label = doc.vocab.strings['NP']
np_deps = set(doc.vocab.strings[label] for label in labels) np_deps = set(doc.vocab.strings[label] for label in labels)
close_app = doc.vocab.strings['nk'] close_app = doc.vocab.strings['nk']
rbracket = 0 rbracket = 0
for i, word in enumerate(doc): for i, word in enumerate(obj):
if i < rbracket: if i < rbracket:
continue continue
if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps: if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:

View File

@ -223,6 +223,10 @@ cdef class Doc:
def __repr__(self): def __repr__(self):
return self.__str__() return self.__str__()
@property
def doc(self):
    '''Return this object itself, so it can be reached through a
    `.doc` attribute.'''
    return self
def similarity(self, other): def similarity(self, other):
'''Make a semantic similarity estimate. The default estimate is cosine '''Make a semantic similarity estimate. The default estimate is cosine
similarity using an average of word vectors. similarity using an average of word vectors.

View File

@ -190,6 +190,31 @@ cdef class Span:
def __get__(self): def __get__(self):
return u''.join([t.text_with_ws for t in self]) return u''.join([t.text_with_ws for t in self])
property noun_chunks:
    '''
    Yields base noun-phrase Span objects for this span, if the document
    has been syntactically parsed. A base noun phrase, or 'NP chunk', is
    a noun phrase that does not permit other NPs to be nested within it,
    so no NP-level coordination, no prepositional phrases, and no
    relative clauses.

    Each yielded Span is built on the parent document (self.doc) from the
    (start, end, label) triples produced by self.doc.noun_chunks_iterator,
    which is called with this span as its argument.

    Raises ValueError if the parent document has not been parsed.
    '''
    def __get__(self):
        if not self.doc.is_parsed:
            # Look the language up through the parent document's vocab.
            raise ValueError(
                "noun_chunks requires the dependency parse, which "
                "requires data to be installed. If you haven't done so, run: "
                "\npython -m spacy.%s.download all\n"
                "to install the data" % self.doc.vocab.lang)
        # Accumulate the result before beginning to iterate over it. This
        # prevents the tokenisation from being changed out from under us
        # during the iteration. The tricky thing here is that Span accepts
        # its tokenisation changing, so it's okay once we have the Span
        # objects. See Issue #375
        #
        # The chunks must be created on the parent document, not on this
        # span: a Span wraps a document, and the iterator is obtained from
        # self.doc, so self.doc is the container the offsets belong to.
        spans = [Span(self.doc, start, end, label=label)
                 for start, end, label in self.doc.noun_chunks_iterator(self)]
        for span in spans:
            yield span
property root: property root:
"""The token within the span that's highest in the parse tree. If there's a tie, the earlist is prefered. """The token within the span that's highest in the parse tree. If there's a tie, the earlist is prefered.