commit 1f247959f3
@@ -1,13 +1,16 @@
 from spacy.parts_of_speech cimport NOUN, PROPN, PRON
 
 
-def english_noun_chunks(doc):
+def english_noun_chunks(obj):
+    '''Detect base noun phrases from a dependency parse.
+    Works on both Doc and Span.'''
     labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
               'attr', 'ROOT', 'root']
+    doc = obj.doc  # Ensure works on both Doc and Span.
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings['conj']
     np_label = doc.vocab.strings['NP']
-    for i, word in enumerate(doc):
+    for i, word in enumerate(obj):
         if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
             yield word.left_edge.i, word.i+1, np_label
         elif word.pos == NOUN and word.dep == conj:
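For context, a hedged usage sketch (not part of the commit; the English pipeline setup is an assumption, using the API of this era):

    # Sketch only: assumes an installed English model.
    from spacy.en import English

    nlp = English()
    doc = nlp(u'The quick brown fox jumps over the lazy dog.')

    # String lookups go through obj.doc.vocab, but the token loop runs
    # over obj itself, so a Doc and a Span are both accepted and a Span
    # yields only chunks found among its own tokens.
    print(list(english_noun_chunks(doc)))       # (start, end, np_label) tuples
    print(list(english_noun_chunks(doc[5:9])))  # restricted to the Span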
@@ -25,14 +28,15 @@ def english_noun_chunks(doc):
 # extended to the right of the NOUN
 # example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
 # just "eine Tasse", same for "das Thema Familie"
-def german_noun_chunks(doc):
+def german_noun_chunks(obj):
     labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app']
+    doc = obj.doc  # Ensure works on both Doc and Span.
     np_label = doc.vocab.strings['NP']
     np_deps = set(doc.vocab.strings[label] for label in labels)
     close_app = doc.vocab.strings['nk']
 
     rbracket = 0
-    for i, word in enumerate(doc):
+    for i, word in enumerate(obj):
         if i < rbracket:
             continue
         if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
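The rbracket bookkeeping above is what lets a German chunk extend to the right of its noun ("eine Tasse Tee") without re-emitting the covered tokens. A self-contained toy illustration of the same skip pattern (the predicate and chunk extent are stand-ins, not spaCy's actual rules):

    def toy_chunks(tokens):
        rbracket = 0
        for i, tok in enumerate(tokens):
            if i < rbracket:
                continue              # token already inside the last chunk
            if tok.startswith('N'):   # stand-in for the NOUN/dep test
                rbracket = i + 2      # stand-in for extending rightwards
                yield (i, rbracket)

    print(list(toy_chunks(['eine', 'N-Tasse', 'N-Tee'])))  # -> [(1, 3)]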
@@ -223,6 +223,10 @@ cdef class Doc:
     def __repr__(self):
         return self.__str__()
 
+    @property
+    def doc(self):
+        return self
+
     def similarity(self, other):
         '''Make a semantic similarity estimate. The default estimate is cosine
         similarity using an average of word vectors.
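This trivial-looking property is the other half of the change: with Doc.doc returning self, obj.doc is a no-op on a Doc and a parent lookup on a Span, so the chunk iterators above can normalise their input in one line. A sketch (nlp is an assumed pipeline as before):

    doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers.')
    span = doc[0:2]
    assert doc.doc is doc    # Doc.doc is the identity
    assert span.doc is doc   # Span.doc points at the parent Doc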
@@ -190,6 +190,31 @@ cdef class Span:
         def __get__(self):
             return u''.join([t.text_with_ws for t in self])
 
+    property noun_chunks:
+        '''
+        Yields base noun-phrase #[code Span] objects, if the document
+        has been syntactically parsed. A base noun phrase, or
+        'NP chunk', is a noun phrase that does not permit other NPs to
+        be nested within it – so no NP-level coordination, no prepositional
+        phrases, and no relative clauses. For example:
+        '''
+        def __get__(self):
+            if not self.doc.is_parsed:
+                raise ValueError(
+                    "noun_chunks requires the dependency parse, which "
+                    "requires data to be installed. If you haven't done so, run: "
+                    "\npython -m spacy.%s.download all\n"
+                    "to install the data" % self.vocab.lang)
+            # Accumulate the result before beginning to iterate over it. This prevents
+            # the tokenisation from being changed out from under us during the iteration.
+            # The tricky thing here is that Span accepts its tokenisation changing,
+            # so it's okay once we have the Span objects. See Issue #375
+            spans = []
+            for start, end, label in self.doc.noun_chunks_iterator(self):
+                spans.append(Span(self.doc, start, end, label=label))
+            for span in spans:
+                yield span
+
     property root:
         """The token within the span that's highest in the parse tree. If there's a tie, the earliest is preferred.
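A hedged usage sketch of the new property (pipeline setup assumed as above); note that the chunks are accumulated into a list before being yielded, so retokenisation during the walk cannot invalidate it (Issue #375):

    doc = nlp(u'The quick brown fox jumps over the lazy dog.')
    sent = doc[0:9]
    for chunk in sent.noun_chunks:   # each chunk is a Span
        print(chunk.text)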