Add base class DocIterator for iterators over documents

Add classes for English and German noun chunks. The respective iterator is set on the document by the parser, because it depends on the annotation scheme of the parsing model.
2026-03-06 21:01:34 +03:00 · 2016-03-16 15:53:35 +01:00 · 2016-03-16 15:53:35 +01:00 · 5e2e8e951a
commit 5e2e8e951a
parent 03fb498dbe
11 changed files with 140 additions and 89 deletions
--- a/setup.py
+++ b/setup.py
@ -63,8 +63,7 @@ MOD_NAMES = [
    'spacy.matcher',
    'spacy.syntax.ner',
    'spacy.symbols',
-    'spacy.en.iterators',
-    'spacy.de.iterators']
+    'spacy.syntax.iterators']


 # By subclassing build_extensions we have the actual compiler that will be used
--- a/spacy/de/iterators.pxd
+++ b/spacy/de/iterators.pxd
--- a/spacy/de/iterators.pyx
+++ b/spacy/de/iterators.pyx
@ -1,28 +0,0 @@
-from spacy.structs cimport TokenC
-from spacy.tokens.span cimport Span
-
-from spacy.parts_of_speech cimport NOUN
-
-def noun_chunks(Span sent):
-    # this function extracts spans headed by NOUNs starting from the left-most
-    # syntactic dependent until the NOUN itself
-    # for close apposition and measurement construction, the span is sometimes
-    # extended to the right of the NOUN
-    # example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
-    # just "eine Tasse", same for "das Thema Familie"
-    cdef const TokenC* word
-    strings = sent.doc.vocab.strings
-    labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
-    close_app = strings['nk']
-    np_deps = [strings[label] for label in labels]
-    np_label = strings['NP']
-    for i in range(sent.start, sent.end):
-        word = &sent.doc.c[i]
-        if word.pos == NOUN and word.dep in np_deps:
-            rbracket = i+1
-            # try to extend the span to the right
-            # to capture close apposition/measurement constructions
-            for rdep in sent.doc[i].rights:
-                if rdep.pos == NOUN and rdep.dep == close_app:
-                    rbracket = rdep.i+1
-            yield Span(sent.doc, word.l_edge, rbracket, label=np_label)
--- a/spacy/en/iterators.pxd
+++ b/spacy/en/iterators.pxd
--- a/spacy/en/iterators.pyx
+++ b/spacy/en/iterators.pyx
@ -1,24 +0,0 @@
-from spacy.structs cimport TokenC
-from spacy.tokens.span cimport Span
-
-from spacy.parts_of_speech cimport NOUN
-
-def noun_chunks(Span sent):
-    cdef const TokenC* word
-    strings = sent.doc.vocab.strings
-    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
-    np_deps = [strings[label] for label in labels]
-    conj = strings['conj']
-    np_label = strings['NP']
-    for i in range(sent.start, sent.end):
-        word = &sent.doc.c[i]
-        if word.pos == NOUN and word.dep in np_deps:
-            yield Span(sent.doc, word.l_edge, i+1, label=np_label)
-        elif word.pos == NOUN and word.dep == conj:
-            head = word+word.head
-            while head.dep == conj and head.head < 0:
-                head += head.head
-            # If the head is an NP, and we're coordinated to it, we're an NP
-            if head.dep in np_deps:
-                yield Span(sent.doc, word.l_edge, i+1, label=np_label)
-
--- a/spacy/syntax/iterators.pxd
+++ b/spacy/syntax/iterators.pxd
@ -0,0 +1,16 @@
+
+from spacy.tokens.doc cimport Doc
+
+# Base class for iterators over a document.
+cdef class DocIterator:
+    # the document being iterated over (assigned in __init__)
+    cdef Doc _doc
+
+# Noun chunk iterator for English; the attributes are filled in by __init__
+# in iterators.pyx.
+cdef class EnglishNounChunks(DocIterator):
+    # index of the next token to examine
+    cdef int i
+    # string-store id of the 'NP' span label
+    cdef int _np_label
+    # string-store ids of the dependency labels that head a noun chunk
+    cdef set _np_deps
+    # string-store id of the 'conj' dependency label; __init__ assigns it, so
+    # it has to be declared here (extension types have no instance __dict__)
+    cdef int _conjunct
+
+# Noun chunk iterator for German; the attributes are filled in by __init__
+# in iterators.pyx.
+cdef class GermanNounChunks(DocIterator):
+    # index of the next token to examine
+    cdef int i
+    # string-store id of the 'NP' span label
+    cdef int _np_label
+    # string-store ids of the dependency labels that head a noun chunk
+    cdef set _np_deps
+    # string-store id of the 'nk' label, used to extend a chunk to the right
+    cdef int _close_app
--- a/spacy/syntax/iterators.pyx
+++ b/spacy/syntax/iterators.pyx
@ -0,0 +1,82 @@
+from spacy.structs cimport TokenC
+from spacy.tokens.span cimport Span
+from spacy.tokens.doc cimport Doc
+from spacy.tokens.token cimport Token
+
+from spacy.parts_of_speech cimport NOUN
+
+# Base class for document iterators. It only stores the document; subclasses
+# override __next__ to produce items.
+cdef class DocIterator:
+    def __init__(self, Doc doc):
+        # keep a reference to the document being iterated over
+        self._doc = doc
+
+    def __iter__(self):
+        # the object acts as its own iterator
+        return self
+
+    def __next__(self):
+        # the base class produces no items itself
+        raise NotImplementedError
+
+
+# Iterator over base noun phrases for the English dependency scheme.
+# A NOUN yields the span from its left edge up to and including itself when
+# its dependency label is one of the NP labels, or when it is a conjunct whose
+# chain of leftward 'conj' heads ends in a token carrying one of those labels.
+cdef class EnglishNounChunks(DocIterator):
+    def __init__(self, Doc doc):
+        super(EnglishNounChunks,self).__init__(doc)
+        labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
+        # resolve the labels to string-store ids once, for comparison with
+        # word.dep in __next__
+        self._np_label = self._doc.vocab.strings['NP']
+        self._np_deps = set( self._doc.vocab.strings[label] for label in labels )
+        self._conjunct = self._doc.vocab.strings['conj']
+        self.i = 0
+
+    def __iter__(self):
+        # Restart at the first token. Doc.noun_chunks keeps one instance and
+        # iterates it with `yield from`, which calls iter() on it; without the
+        # reset a second pass would start past the last token and yield nothing.
+        self.i = 0
+        return self
+
+    def __next__(self):
+        # Return the next noun chunk as a Span, or raise StopIteration once
+        # every token has been examined.
+        cdef const TokenC* word
+        cdef const TokenC* head
+        # typed as a C int (a bare `cdef widx` declares a Python object)
+        cdef int widx
+        while self.i < self._doc.length:
+            widx = self.i
+            # advance the cursor before any return so the next call resumes
+            # with the following token
+            self.i += 1
+            word = &self._doc.c[widx]
+            if word.pos == NOUN:
+                if word.dep in self._np_deps:
+                    return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
+                elif word.dep == self._conjunct:
+                    # word.head is an offset relative to the token itself
+                    head = word+word.head
+                    while head.dep == self._conjunct and head.head < 0:
+                        head += head.head
+                    # If the head is an NP, and we're coordinated to it, we're an NP
+                    if head.dep in self._np_deps:
+                        return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
+        raise StopIteration
+
+
+# Iterator over base noun phrases for the German dependency scheme.
+# It extracts spans headed by NOUNs, starting from the left-most syntactic
+# dependent and running up to the NOUN itself.
+# For close apposition and measurement constructions, the span is sometimes
+# extended to the right of the NOUN.
+# Example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
+# just "eine Tasse"; the same holds for "das Thema Familie".
+cdef class GermanNounChunks(DocIterator):
+    def __init__(self, Doc doc):
+        super(GermanNounChunks,self).__init__(doc)
+        labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
+        # resolve the labels to string-store ids once, for comparison with
+        # word.dep in __next__
+        self._np_label = self._doc.vocab.strings['NP']
+        self._np_deps = set( self._doc.vocab.strings[label] for label in labels )
+        self._close_app = self._doc.vocab.strings['nk']
+        self.i = 0
+
+    def __iter__(self):
+        # Restart at the first token. Doc.noun_chunks keeps one instance and
+        # iterates it with `yield from`, which calls iter() on it; without the
+        # reset a second pass would start past the last token and yield nothing.
+        self.i = 0
+        return self
+
+    def __next__(self):
+        # Return the next noun chunk as a Span, or raise StopIteration once
+        # every token has been examined.
+        cdef const TokenC* word
+        cdef int rbracket
+        cdef Token rdep
+        # typed as a C int (a bare `cdef widx` declares a Python object)
+        cdef int widx
+        while self.i < self._doc.length:
+            widx = self.i
+            # advance the cursor before any return so the next call resumes
+            # with the following token
+            self.i += 1
+            word = &self._doc.c[widx]
+            if word.pos == NOUN and word.dep in self._np_deps:
+                rbracket = widx+1
+                # try to extend the span to the right
+                # to capture close apposition/measurement constructions
+                for rdep in self._doc[widx].rights:
+                    if rdep.pos == NOUN and rdep.dep == self._close_app:
+                        rbracket = rdep.i+1
+                return Span(self._doc, word.l_edge, rbracket, label=self._np_label)
+        raise StopIteration
+
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@ -47,6 +47,8 @@ from ._parse_features cimport fill_context
 from .stateclass cimport StateClass
 from ._state cimport StateC

+from spacy.syntax.iterators cimport DocIterator, EnglishNounChunks, GermanNounChunks
+CHUNKERS = {'en':EnglishNounChunks, 'de':GermanNounChunks}


 DEBUG = False
@ -113,12 +115,9 @@ cdef class Parser:
        cdef int nr_feat = self.model.nr_feat
        with nogil:
            self.parseC(tokens.c, tokens.length, nr_feat, nr_class)
-            tokens.is_parsed = True
        # Check for KeyboardInterrupt etc. Untested
        PyErr_CheckSignals()
-        # projectivize output
-        if self._projectivize:
-            PseudoProjectivity.deprojectivize(tokens)
+        self._finalize(tokens)

    def pipe(self, stream, int batch_size=1000, int n_threads=2):
        cdef Pool mem = Pool()
@ -144,7 +143,7 @@ cdef class Parser:
                                raise ValueError("Error parsing doc: %s" % sent_str)
                PyErr_CheckSignals()
                for doc in queue:
-                    doc.is_parsed = True
+                    self._finalize(doc)
                    yield doc
                queue = []
        batch_size = len(queue)
@ -155,10 +154,19 @@ cdef class Parser:
                    with gil:
                        sent_str = queue[i].text
                        raise ValueError("Error parsing doc: %s" % sent_str)
-        for doc in queue:
-            doc.is_parsed = True
-            yield doc
        PyErr_CheckSignals()
+        for doc in queue:
+            self._finalize(doc)
+            yield doc
+
+    def _finalize(self, Doc doc):
+        """Post-process a document after parsing.
+
+        Undoes the projective transformation when it is enabled, attaches the
+        noun chunk iterator matching the vocabulary's language, and flags the
+        document as parsed.
+        """
+        # deprojectivize output
+        if self._projectivize:
+            PseudoProjectivity.deprojectivize(doc)
+        # set annotation-specific iterators
+        # (a class is assigned here, not an instance; when no chunker is
+        # registered for the language the base DocIterator is used instead)
+        doc.noun_chunks = CHUNKERS.get(doc.vocab.lang,DocIterator)
+        # mark doc as parsed
+        doc.is_parsed = True

    cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil:
        cdef ExampleC eg
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@ -7,6 +7,8 @@ from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport attr_t
 from ..attrs cimport attr_id_t

+from spacy.syntax.iterators cimport DocIterator
+

 cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil

@ -42,6 +44,8 @@ cdef class Doc:
    cdef int length
    cdef int max_length

+    cdef DocIterator noun_chunks_iterator
+
    cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1

    cpdef np.ndarray to_array(self, object features)
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -81,6 +81,7 @@ cdef class Doc:
        self.is_parsed = False
        self._py_tokens = []
        self._vector = None
+        self.noun_chunks_iterator = DocIterator(self)

    def __getitem__(self, object i):
        """Get a Token or a Span from the Doc.
@ -231,36 +232,22 @@ cdef class Doc:
                    # Set start as B
                    self.c[start].ent_iob = 3

-    @property
-    def noun_chunks(self):
-        """Yield spans for base noun phrases."""
-        if not self.is_parsed:
-            raise ValueError(
-                "noun_chunks requires the dependency parse, which "
-                "requires data to be installed. If you haven't done so, run: "
-                "\npython -m spacy.en.download all\n"
-                "to install the data")

-        from spacy.en.iterators import noun_chunks as en_noun_chunks
-        from spacy.de.iterators import noun_chunks as de_noun_chunks
+    property noun_chunks:
+        def __get__(self):
+            """Yield spans for base noun phrases.
+
+            The spans are produced by the iterator stored in
+            noun_chunks_iterator. Because this getter is a generator, the
+            is_parsed check below runs when iteration starts, not when the
+            attribute is read.
+
+            Raises ValueError if the document has not been parsed.
+            """
+            if not self.is_parsed:
+                raise ValueError(
+                    "noun_chunks requires the dependency parse, which "
+                    "requires data to be installed. If you haven't done so, run: "
+                    "\npython -m spacy.en.download all\n"
+                    "to install the data")

-        chunk_rules = {'en':en_noun_chunks, 
-                       'de':de_noun_chunks,
-                       }
+            # NOTE(review): the same stored iterator object is used on every
+            # access; whether a second pass starts over depends on that
+            # iterator's __iter__ -- confirm.
+            yield from self.noun_chunks_iterator

-        for sent in self.sents:
-            print(sent)
-            lang = sent.root.lang_
-            chunker = chunk_rules.get(lang,None)
-            if chunker == None:
-                warnings.warn("noun_chunks is not available for language %s." % lang)
-                print(sent.root.orth_)
-                continue
+        def __set__(self, iterator_cls):
+            # The assigned value is an iterator class (e.g. a DocIterator
+            # subclass), not an instance: it is instantiated here with this
+            # document. The parameter is deliberately not named DocIterator,
+            # which would shadow the cimported type of the same name.
+            self.noun_chunks_iterator = iterator_cls(self)

-            for chunk in chunker(sent):
-                yield chunk
-
-        

    @property
    def sents(self):
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -27,7 +27,7 @@ from . import symbols
 from cymem.cymem cimport Address
 from . import util
 from .serialize.packer cimport Packer
-from .attrs cimport PROB
+from .attrs cimport PROB, LANG

 try:
    import copy_reg
@ -105,6 +105,13 @@ cdef class Vocab:
                self._serializer = Packer(self, self.serializer_freqs)
            return self._serializer

+    property lang:
+        def __get__(self):
+            # Value returned by the LANG lexical-attribute getter (called with
+            # the placeholder string '_'), or '' when there are no getters or
+            # none is registered for LANG.
+            if not self.get_lex_attr:
+                return ''
+            lang_getter = self.get_lex_attr.get(LANG,None)
+            if lang_getter:
+                return lang_getter('_')
+            return ''
+
    def __len__(self):
        """The current number of lexemes stored."""
        return self.length