* Refactor noun chunk iterators, so that they're simple functions. Install the iterator when the Doc is created, but allow users to write to the noun_chunks_iterator attribute. The iterator functions accept an object and yield (int start, int end, int label) triples.

This commit is contained in:
Matthew Honnibal 2016-05-02 14:25:10 +02:00
parent e526be5602
commit 508fd1f6dc
7 changed files with 57 additions and 131 deletions

View File

@ -383,6 +383,9 @@ cdef class ArcEager(TransitionSystem):
if st._sent[i].head == 0 and st._sent[i].dep == 0:
st._sent[i].dep = self.root_label
def finalize_doc(self, doc):
# Mark the Doc as fully parsed once the transition sequence has finished.
doc.is_parsed = True
cdef int set_valid(self, int* output, const StateC* st) nogil:
cdef bint[N_MOVES] is_valid
is_valid[SHIFT] = Shift.is_valid(st, -1)

View File

@ -1,19 +0,0 @@
from spacy.tokens.doc cimport Doc
# Registry mapping language codes to noun-chunk iterator classes
# (populated in the corresponding .pyx: {'en': ..., 'de': ...}).
cdef dict CHUNKERS
# Base class for document iterators: holds a reference to the Doc being walked.
cdef class DocIterator:
cdef Doc _doc
# English iterator state: current token index, integer id of the 'NP' label,
# the set of dependency-label ids that mark an NP head, and the 'conj' label id.
cdef class EnglishNounChunks(DocIterator):
cdef int i
cdef int _np_label
cdef set _np_deps
cdef int _conjunct
# German iterator state: as above, but with the 'nk' (close apposition)
# label id used to extend chunk spans to the right.
cdef class GermanNounChunks(DocIterator):
cdef int i
cdef int _np_label
cdef set _np_deps
cdef int _close_app

View File

@ -1,55 +1,23 @@
from spacy.structs cimport TokenC
from spacy.tokens.span cimport Span
from spacy.tokens.doc cimport Doc
from spacy.tokens.token cimport Token
from spacy.parts_of_speech cimport NOUN
CHUNKERS = {'en':EnglishNounChunks, 'de':GermanNounChunks}
# base class for document iterators
cdef class DocIterator:
# Base class: stores the Doc and exposes the Python iterator protocol.
def __init__(self, Doc doc):
self._doc = doc
def __iter__(self):
return self
def __next__(self):
# Subclasses must implement the actual iteration step.
raise NotImplementedError
cdef class EnglishNounChunks(DocIterator):
# Iterator yielding one Span per base noun phrase, found by scanning
# tokens left to right and testing POS/dependency labels.
def __init__(self, Doc doc):
super(EnglishNounChunks,self).__init__(doc)
# Dependency labels whose NOUN tokens head a noun chunk.
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
self._np_label = self._doc.vocab.strings['NP']
self._np_deps = set( self._doc.vocab.strings[label] for label in labels )
self._conjunct = self._doc.vocab.strings['conj']
self.i = 0
def __iter__(self):
# Restart from the first token each time iteration begins.
self.i = 0
return super(EnglishNounChunks,self).__iter__()
def __next__(self):
cdef const TokenC* word
cdef widx
while self.i < self._doc.length:
widx = self.i
self.i += 1
word = &self._doc.c[widx]
if word.pos == NOUN:
if word.dep in self._np_deps:
# Chunk spans from the token's left edge through the token itself.
return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
elif word.dep == self._conjunct:
# Follow the conjunct chain via pointer arithmetic (head is a
# relative offset) to the leftmost coordinated head.
head = word+word.head
while head.dep == self._conjunct and head.head < 0:
head += head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in self._np_deps:
return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
raise StopIteration
def english_noun_chunks(doc):
    """Yield (start, end, label) triples for base noun phrases in *doc*.

    start/end are token offsets (end exclusive); label is the integer id
    of the 'NP' span label in the doc's string store.
    """
    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
              'attr', 'root']
    # Use a set for O(1) membership tests per token, consistent with
    # german_noun_chunks (the original built a list here).
    np_deps = set(doc.vocab.strings[label] for label in labels)
    conj = doc.vocab.strings['conj']
    np_label = doc.vocab.strings['NP']
    for i in range(len(doc)):
        word = doc[i]
        if word.pos == NOUN and word.dep in np_deps:
            yield word.left_edge.i, word.i+1, np_label
        elif word.pos == NOUN and word.dep == conj:
            # Climb the conjunct chain toward its leftmost head.
            head = word.head
            while head.dep == conj and head.head.i < head.i:
                head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            if head.dep in np_deps:
                yield word.left_edge.i, word.i+1, np_label
# this iterator extracts spans headed by NOUNs starting from the left-most
@ -58,35 +26,21 @@ cdef class EnglishNounChunks(DocIterator):
# extended to the right of the NOUN
# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
# just "eine Tasse", same for "das Thema Familie"
# NOTE(review): this diff region interleaves lines of the removed
# GermanNounChunks class with lines of the new german_noun_chunks generator;
# the rendering drops the +/- markers, so the statements below do not form
# a single runnable definition.
cdef class GermanNounChunks(DocIterator):
def __init__(self, Doc doc):
super(GermanNounChunks,self).__init__(doc)
labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
self._np_label = self._doc.vocab.strings['NP']
self._np_deps = set( self._doc.vocab.strings[label] for label in labels )
self._close_app = self._doc.vocab.strings['nk']
self.i = 0
# Replacement generator: yields (start, end, label) triples instead of Spans.
def german_noun_chunks(doc):
labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
np_label = doc.vocab.strings['NP']
np_deps = set(doc.vocab.strings[label] for label in labels)
close_app = doc.vocab.strings['nk']
# (removed) __iter__ of the old class:
def __iter__(self):
self.i = 0
return super(GermanNounChunks,self).__iter__()
for word in doc:
if word.pos == NOUN and word.dep in np_deps:
rbracket = word.i+1
# try to extend the span to the right
# to capture close apposition/measurement constructions
for rdep in doc[word.i].rights:
if rdep.pos == NOUN and rdep.dep == close_app:
rbracket = rdep.i+1
# NOTE(review): english_noun_chunks uses word.left_edge.i here;
# confirm Token exposes l_edge as an int, otherwise this is a bug.
yield word.l_edge, rbracket, np_label
# (removed) __next__ of the old class:
def __next__(self):
cdef const TokenC* word
cdef int rbracket
cdef Token rdep
cdef widx
while self.i < self._doc.length:
widx = self.i
self.i += 1
word = &self._doc.c[widx]
if word.pos == NOUN and word.dep in self._np_deps:
rbracket = widx+1
# try to extend the span to the right
# to capture close apposition/measurement constructions
for rdep in self._doc[widx].rights:
if rdep.pos == NOUN and rdep.dep == self._close_app:
rbracket = rdep.i+1
return Span(self._doc, word.l_edge, rbracket, label=self._np_label)
raise StopIteration
# Language-code -> generator-function registry installed onto each new Doc.
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks}

View File

@ -47,8 +47,6 @@ from ._parse_features cimport fill_context
from .stateclass cimport StateClass
from ._state cimport StateC
from spacy.syntax.iterators cimport CHUNKERS, DocIterator, EnglishNounChunks, GermanNounChunks
DEBUG = False
def set_debug(val):
@ -116,7 +114,7 @@ cdef class Parser:
self.parseC(tokens.c, tokens.length, nr_feat, nr_class)
# Check for KeyboardInterrupt etc. Untested
PyErr_CheckSignals()
self._finalize(tokens)
self.moves.finalize_doc(tokens)
def pipe(self, stream, int batch_size=1000, int n_threads=2):
cdef Pool mem = Pool()
@ -142,7 +140,7 @@ cdef class Parser:
raise ValueError("Error parsing doc: %s" % sent_str)
PyErr_CheckSignals()
for doc in queue:
self._finalize(doc)
self.moves.finalize_doc(doc)
yield doc
queue = []
batch_size = len(queue)
@ -155,18 +153,9 @@ cdef class Parser:
raise ValueError("Error parsing doc: %s" % sent_str)
PyErr_CheckSignals()
for doc in queue:
self._finalize(doc)
self.moves.finalize_doc(doc)
yield doc
# Removed in this commit: callers now use self.moves.finalize_doc(doc) instead.
def _finalize(self, Doc doc):
# deprojectivize output
if self._projectivize:
PseudoProjectivity.deprojectivize(doc)
# set annotation-specific iterators
# NOTE(review): falls back to the DocIterator base class when the language
# has no registered chunker.
doc.noun_chunks = CHUNKERS.get(doc.vocab.lang,DocIterator)
# mark doc as parsed
doc.is_parsed = True
cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil:
cdef ExampleC eg
eg.nr_feat = nr_feat
@ -313,6 +302,7 @@ cdef class StepwiseState:
if self.stcls.is_final():
self.parser.moves.finalize_state(self.stcls.c)
self.doc.set_parse(self.stcls.c._sent)
self.parser.moves.finalize_doc(self.doc)
cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions,

View File

@ -53,6 +53,9 @@ cdef class TransitionSystem:
cdef int finalize_state(self, StateC* state) nogil:
pass
def finalize_doc(self, doc):
# Default hook: no-op; subclasses (e.g. ArcEager) override to set doc state.
pass
cdef int preprocess_gold(self, GoldParse gold) except -1:
raise NotImplementedError

View File

@ -3,7 +3,7 @@ import numpy as np
from spacy.attrs import HEAD, DEP
from spacy.symbols import nsubj, dobj, punct, amod, nmod, conj, cc, root
from spacy.en import English
from spacy.syntax.iterators import EnglishNounChunks
from spacy.syntax.iterators import english_noun_chunks
def test_not_nested():
@ -22,9 +22,7 @@ def test_not_nested():
[-2, conj],
[-5, dobj]
], dtype='int32'))
tokens.noun_chunks = EnglishNounChunks
for chunk in tokens.noun_chunks:
print(chunk.text)
tokens.noun_chunks_iterator = english_noun_chunks
word_occurred = {}
for chunk in tokens.noun_chunks:
for word in chunk:

View File

@ -23,6 +23,7 @@ from .span cimport Span
from .token cimport Token
from ..serialize.bits cimport BitArray
from ..util import normalize_slice
from ..syntax.iterators import CHUNKERS
DEF PADDING = 5
@ -81,7 +82,7 @@ cdef class Doc:
self.is_parsed = False
self._py_tokens = []
self._vector = None
self.noun_chunks_iterator = DocIterator(self)
self.noun_chunks_iterator = CHUNKERS.get(self.vocab.lang)
def __getitem__(self, object i):
"""Get a Token or a Span from the Doc.
@ -233,21 +234,17 @@ cdef class Doc:
self.c[start].ent_iob = 3
# Removed in this commit in favour of a plain @property below.
property noun_chunks:
def __get__(self):
"""Yield spans for base noun phrases."""
if not self.is_parsed:
raise ValueError(
"noun_chunks requires the dependency parse, which "
"requires data to be installed. If you haven't done so, run: "
"\npython -m spacy.%s.download all\n"
"to install the data" % self.vocab.lang)
yield from self.noun_chunks_iterator
def __set__(self, DocIterator):
# Setter instantiates the given iterator class around this Doc.
self.noun_chunks_iterator = DocIterator(self)
@property
def noun_chunks(self):
    """Yield spans for base noun phrases."""
    # The chunker needs dependency labels; refuse to run on an unparsed Doc.
    if not self.is_parsed:
        raise ValueError(
            "noun_chunks requires the dependency parse, which "
            "requires data to be installed. If you haven't done so, run: "
            "\npython -m spacy.%s.download all\n"
            "to install the data" % self.vocab.lang)
    # Delegate chunk boundaries to the installed iterator function and wrap
    # each (start, end, label) triple in a Span over this Doc.
    spans = (Span(self, begin, finish, label=tag)
             for begin, finish, tag in self.noun_chunks_iterator(self))
    yield from spans
@property
def sents(self):