Add noun_chunks to Span

2025-09-18 10:02:40 +03:00 · 2016-11-24 10:47:20 +00:00 · 2016-11-24 10:47:20 +00:00 · 3e3bda142d
commit 3e3bda142d
parent a98da29232
3 changed files with 32 additions and 2 deletions
--- a/spacy/syntax/iterators.pyx
+++ b/spacy/syntax/iterators.pyx
@ -1,13 +1,14 @@
 from spacy.parts_of_speech cimport NOUN, PROPN, PRON
-def english_noun_chunks(doc):
+def english_noun_chunks(obj):
    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
              'attr', 'ROOT', 'root']
    doc = obj.doc
    np_deps = [doc.vocab.strings[label] for label in labels]
    conj = doc.vocab.strings['conj']
    np_label = doc.vocab.strings['NP']
-    for i, word in enumerate(doc):
+    for i, word in enumerate(obj):
        if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
            yield word.left_edge.i, word.i+1, np_label
        elif word.pos == NOUN and word.dep == conj:
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -223,6 +223,10 @@ cdef class Doc:
    def __repr__(self):
        '''Return the same string as __str__, so the interactive/debug
        representation of the document is identical to its string form.
        '''
        return self.__str__()
    @property
    def doc(self):
        '''Return this document itself.

        This lets code that reads `obj.doc` (e.g. a noun-chunk iterator that
        is handed either a whole document or a slice of one) treat both kinds
        of object uniformly.
        '''
        return self
    def similarity(self, other):
        '''Make a semantic similarity estimate. The default estimate is cosine
        similarity using an average of word vectors.
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -190,6 +190,31 @@ cdef class Span:
        def __get__(self):
            return u''.join([t.text_with_ws for t in self])
    property noun_chunks:
        '''
        Yields base noun-phrase #[code Span] objects, if the document
        has been syntactically parsed. A base noun phrase, or
        'NP chunk', is a noun phrase that does not permit other NPs to
        be nested within it – so no NP-level coordination, no prepositional
        phrases, and no relative clauses.

        The chunks are produced by the parent document's
        #[code noun_chunks_iterator], which is handed this span.

        Raises ValueError if the parent document has not been parsed.
        '''
        def __get__(self):
            if not self.doc.is_parsed:
                # Look the language up through the parent document: that is
                # the object known to carry the vocab.
                raise ValueError(
                    "noun_chunks requires the dependency parse, which "
                    "requires data to be installed. If you haven't done so, run: "
                    "\npython -m spacy.%s.download all\n"
                    "to install the data" % self.doc.vocab.lang)
            # Accumulate the result before beginning to iterate over it. This prevents
            # the tokenisation from being changed out from under us during the iteration.
            # The tricky thing here is that Span accepts its tokenisation changing,
            # so it's okay once we have the Span objects. See Issue #375
            spans = []
            for start, end, label in self.doc.noun_chunks_iterator(self):
                # The new Span must be anchored on the parent document, not on
                # this span: the (start, end) offsets are treated as token
                # indices into the document.
                spans.append(Span(self.doc, start, end, label=label))
            for span in spans:
                yield span
    property root:
        """The token within the span that's highest in the parse tree. If there's a tie, the earlist is prefered.