From 5e2e8e951a75348d069d68cade7972c6cff55ee9 Mon Sep 17 00:00:00 2001
From: Wolfgang Seeker
Date: Wed, 16 Mar 2016 15:53:35 +0100
Subject: [PATCH] add base class DocIterator for iterators over documents

Add classes for English and German noun chunks. The respective iterators
are set on the document when it is created by the parser, since they
depend on the annotation scheme of the parsing model.
---
 setup.py                   |  3 +-
 spacy/de/iterators.pxd     |  0
 spacy/de/iterators.pyx     | 28 -------------
 spacy/en/iterators.pxd     |  0
 spacy/en/iterators.pyx     | 24 -----------
 spacy/syntax/iterators.pxd | 17 ++++++++
 spacy/syntax/iterators.pyx | 82 ++++++++++++++++++++++++++++++++++++++
 spacy/syntax/parser.pyx    | 24 +++++++----
 spacy/tokens/doc.pxd       |  4 ++
 spacy/tokens/doc.pyx       | 39 ++++++------------
 spacy/vocab.pyx            |  9 ++++-
 11 files changed, 141 insertions(+), 89 deletions(-)
 delete mode 100644 spacy/de/iterators.pxd
 delete mode 100644 spacy/de/iterators.pyx
 delete mode 100644 spacy/en/iterators.pxd
 delete mode 100644 spacy/en/iterators.pyx
 create mode 100644 spacy/syntax/iterators.pxd
 create mode 100644 spacy/syntax/iterators.pyx

diff --git a/setup.py b/setup.py
index 7449212b9..91a118227 100644
--- a/setup.py
+++ b/setup.py
@@ -63,8 +63,7 @@ MOD_NAMES = [
     'spacy.matcher',
     'spacy.syntax.ner',
     'spacy.symbols',
-    'spacy.en.iterators',
-    'spacy.de.iterators']
+    'spacy.syntax.iterators']
 
 
 # By subclassing build_extensions we have the actual compiler that will be used
diff --git a/spacy/de/iterators.pxd b/spacy/de/iterators.pxd
deleted file mode 100644
index e69de29bb..000000000
diff --git a/spacy/de/iterators.pyx b/spacy/de/iterators.pyx
deleted file mode 100644
index a6321bd57..000000000
--- a/spacy/de/iterators.pyx
+++ /dev/null
@@ -1,28 +0,0 @@
-from spacy.structs cimport TokenC
-from spacy.tokens.span cimport Span
-
-from spacy.parts_of_speech cimport NOUN
-
-def noun_chunks(Span sent):
-    # this function extracts spans headed by NOUNs starting from the left-most
-    # syntactic dependent until the NOUN itself
-    # for close apposition and measurement construction, the span is sometimes
-    # extended to the right of the NOUN
-    # example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
-    # just "eine Tasse", same for "das Thema Familie"
-    cdef const TokenC* word
-    strings = sent.doc.vocab.strings
-    labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
-    close_app = strings['nk']
-    np_deps = [strings[label] for label in labels]
-    np_label = strings['NP']
-    for i in range(sent.start, sent.end):
-        word = &sent.doc.c[i]
-        if word.pos == NOUN and word.dep in np_deps:
-            rbracket = i+1
-            # try to extend the span to the right
-            # to capture close apposition/measurement constructions
-            for rdep in sent.doc[i].rights:
-                if rdep.pos == NOUN and rdep.dep == close_app:
-                    rbracket = rdep.i+1
-            yield Span(sent.doc, word.l_edge, rbracket, label=np_label)
diff --git a/spacy/en/iterators.pxd b/spacy/en/iterators.pxd
deleted file mode 100644
index e69de29bb..000000000
diff --git a/spacy/en/iterators.pyx b/spacy/en/iterators.pyx
deleted file mode 100644
index e4f0fe2a4..000000000
--- a/spacy/en/iterators.pyx
+++ /dev/null
@@ -1,24 +0,0 @@
-from spacy.structs cimport TokenC
-from spacy.tokens.span cimport Span
-
-from spacy.parts_of_speech cimport NOUN
-
-def noun_chunks(Span sent):
-    cdef const TokenC* word
-    strings = sent.doc.vocab.strings
-    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
-    np_deps = [strings[label] for label in labels]
-    conj = strings['conj']
-    np_label = strings['NP']
-    for i in range(sent.start, sent.end):
-        word = &sent.doc.c[i]
-        if word.pos == NOUN and word.dep in np_deps:
-            yield Span(sent.doc, word.l_edge, i+1, label=np_label)
-        elif word.pos == NOUN and word.dep == conj:
-            head = word+word.head
-            while head.dep == conj and head.head < 0:
-                head += head.head
-            # If the head is an NP, and we're coordinated to it, we're an NP
-            if head.dep in np_deps:
-                yield Span(sent.doc, word.l_edge, i+1, label=np_label)
-
diff --git a/spacy/syntax/iterators.pxd b/spacy/syntax/iterators.pxd
new file mode 100644
index 000000000..662f851c8
--- /dev/null
+++ b/spacy/syntax/iterators.pxd
@@ -0,0 +1,17 @@
+
+from spacy.tokens.doc cimport Doc
+
+cdef class DocIterator:
+    cdef Doc _doc
+
+cdef class EnglishNounChunks(DocIterator):
+    cdef int i
+    cdef int _np_label
+    cdef set _np_deps
+    cdef int _conjunct
+
+cdef class GermanNounChunks(DocIterator):
+    cdef int i
+    cdef int _np_label
+    cdef set _np_deps
+    cdef int _close_app
diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx
new file mode 100644
index 000000000..78679b8ce
--- /dev/null
+++ b/spacy/syntax/iterators.pyx
@@ -0,0 +1,82 @@
+from spacy.structs cimport TokenC
+from spacy.tokens.span cimport Span
+from spacy.tokens.doc cimport Doc
+from spacy.tokens.token cimport Token
+
+from spacy.parts_of_speech cimport NOUN
+
+# base class for document iterators
+cdef class DocIterator:
+    def __init__(self, Doc doc):
+        self._doc = doc
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        raise NotImplementedError
+
+
+cdef class EnglishNounChunks(DocIterator):
+    def __init__(self, Doc doc):
+        super(EnglishNounChunks,self).__init__(doc)
+        labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
+        self._np_label = self._doc.vocab.strings['NP']
+        self._np_deps = set( self._doc.vocab.strings[label] for label in labels )
+        self._conjunct = self._doc.vocab.strings['conj']
+        self.i = 0
+
+    def __next__(self):
+        cdef const TokenC* word
+        cdef int widx
+        while self.i < self._doc.length:
+            widx = self.i
+            self.i += 1
+            word = &self._doc.c[widx]
+            if word.pos == NOUN:
+                if word.dep in self._np_deps:
+                    return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
+                elif word.dep == self._conjunct:
+                    head = word+word.head
+                    while head.dep == self._conjunct and head.head < 0:
+                        head += head.head
+                    # If the head is an NP, and we're coordinated to it, we're an NP
+                    if head.dep in self._np_deps:
+                        return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
+        raise StopIteration
+
+
+# this iterator extracts spans headed by NOUNs, starting from the left-most
+# syntactic dependent and running up to the NOUN itself
+# for close apposition and measurement constructions, the span is sometimes
+# extended to the right of the NOUN
+# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
+# just "eine Tasse", same for "das Thema Familie"
+cdef class GermanNounChunks(DocIterator):
+    def __init__(self, Doc doc):
+        super(GermanNounChunks,self).__init__(doc)
+        labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
+        self._np_label = self._doc.vocab.strings['NP']
+        self._np_deps = set( self._doc.vocab.strings[label] for label in labels )
+        self._close_app = self._doc.vocab.strings['nk']
+        self.i = 0
+
+    def __next__(self):
+        cdef const TokenC* word
+        cdef int rbracket
+        cdef Token rdep
+        cdef int widx
+        while self.i < self._doc.length:
+            widx = self.i
+            self.i += 1
+            word = &self._doc.c[widx]
+            if word.pos == NOUN and word.dep in self._np_deps:
+                rbracket = widx+1
+                # try to extend the span to the right
+                # to capture close apposition/measurement constructions
+                for rdep in self._doc[widx].rights:
+                    if rdep.pos == NOUN and rdep.dep == self._close_app:
+                        rbracket = rdep.i+1
+                return Span(self._doc, word.l_edge, rbracket, label=self._np_label)
+        raise StopIteration
+
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index a83c397dc..c7b88d5b8 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -47,6 +47,8 @@ from ._parse_features cimport fill_context
 from .stateclass cimport StateClass
 from ._state cimport StateC
 
+from spacy.syntax.iterators cimport DocIterator, EnglishNounChunks, GermanNounChunks
+CHUNKERS = {'en':EnglishNounChunks, 'de':GermanNounChunks}
 
 DEBUG = False
@@ -113,12 +115,9 @@ cdef class Parser:
         cdef int nr_feat = self.model.nr_feat
         with nogil:
             self.parseC(tokens.c, tokens.length, nr_feat, nr_class)
-        tokens.is_parsed = True
         # Check for KeyboardInterrupt etc. Untested
        PyErr_CheckSignals()
-        # projectivize output
-        if self._projectivize:
-            PseudoProjectivity.deprojectivize(tokens)
+        self._finalize(tokens)
 
     def pipe(self, stream, int batch_size=1000, int n_threads=2):
         cdef Pool mem = Pool()
@@ -144,7 +143,7 @@ cdef class Parser:
                                 raise ValueError("Error parsing doc: %s" % sent_str)
                 PyErr_CheckSignals()
                 for doc in queue:
-                    doc.is_parsed = True
+                    self._finalize(doc)
                     yield doc
                 queue = []
         batch_size = len(queue)
@@ -155,10 +154,19 @@ cdef class Parser:
                 with gil:
                     sent_str = queue[i].text
                     raise ValueError("Error parsing doc: %s" % sent_str)
-        for doc in queue:
-            doc.is_parsed = True
-            yield doc
         PyErr_CheckSignals()
+        for doc in queue:
+            self._finalize(doc)
+            yield doc
+
+    def _finalize(self, Doc doc):
+        # deprojectivize output
+        if self._projectivize:
+            PseudoProjectivity.deprojectivize(doc)
+        # set annotation-specific iterators
+        doc.noun_chunks = CHUNKERS.get(doc.vocab.lang,DocIterator)
+        # mark doc as parsed
+        doc.is_parsed = True
 
     cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil:
         cdef ExampleC eg
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index aa2cf6b54..02b6f29a5 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -7,6 +7,8 @@ from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport attr_t
 from ..attrs cimport attr_id_t
 
+from spacy.syntax.iterators cimport DocIterator
+
 
 cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
 
@@ -42,6 +44,8 @@ cdef class Doc:
     cdef int length
     cdef int max_length
 
+    cdef DocIterator noun_chunks_iterator
+
    cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1
 
     cpdef np.ndarray to_array(self, object features)
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 887b1085f..faed51e23 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -81,6 +81,7 @@ cdef class Doc:
         self.is_parsed = False
         self._py_tokens = []
         self._vector = None
+        self.noun_chunks_iterator = DocIterator(self)
 
     def __getitem__(self, object i):
         """Get a Token or a Span from the Doc.
@@ -231,36 +232,22 @@ cdef class Doc:
         # Set start as B
         self.c[start].ent_iob = 3
 
-    @property
-    def noun_chunks(self):
-        """Yield spans for base noun phrases."""
-        if not self.is_parsed:
-            raise ValueError(
-                "noun_chunks requires the dependency parse, which "
-                "requires data to be installed. If you haven't done so, run: "
-                "\npython -m spacy.en.download all\n"
-                "to install the data")
-        from spacy.en.iterators import noun_chunks as en_noun_chunks
-        from spacy.de.iterators import noun_chunks as de_noun_chunks
+    property noun_chunks:
+        def __get__(self):
+            """Yield spans for base noun phrases."""
+            if not self.is_parsed:
+                raise ValueError(
+                    "noun_chunks requires the dependency parse, which "
+                    "requires data to be installed. If you haven't done so, run: "
+                    "\npython -m spacy.en.download all\n"
+                    "to install the data")
 
-        chunk_rules = {'en':en_noun_chunks,
-                       'de':de_noun_chunks,
-                       }
+            yield from self.noun_chunks_iterator
 
-        for sent in self.sents:
-            print(sent)
-            lang = sent.root.lang_
-            chunker = chunk_rules.get(lang,None)
-            if chunker == None:
-                warnings.warn("noun_chunks is not available for language %s." % lang)
-                print(sent.root.orth_)
-                continue
+        def __set__(self, iterator_type):
+            self.noun_chunks_iterator = iterator_type(self)
 
-            for chunk in chunker(sent):
-                yield chunk
-
-
     @property
     def sents(self):
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index df8a4bbd5..3494d2e40 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -27,7 +27,7 @@ from . import symbols
 from cymem.cymem cimport Address
 from . import util
 from .serialize.packer cimport Packer
-from .attrs cimport PROB
+from .attrs cimport PROB, LANG
 
 try:
     import copy_reg
@@ -105,6 +105,13 @@ cdef class Vocab:
             self._serializer = Packer(self, self.serializer_freqs)
         return self._serializer
 
+    property lang:
+        def __get__(self):
+            langfunc = None
+            if self.get_lex_attr:
+                langfunc = self.get_lex_attr.get(LANG,None)
+            return langfunc('_') if langfunc else ''
+
     def __len__(self):
         """The current number of lexemes stored."""
         return self.length
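
Usage sketch (illustrative only, not part of the patch): after this change the
parser attaches a language-specific iterator in Parser._finalize(), and the
doc.noun_chunks property both yields from that iterator and accepts a
replacement iterator class. In the sketch below, English is the usual entry
point at this point in spaCy's history and assumes the parser data is
installed (see the download message above); QuietNounChunks is a hypothetical
DocIterator subclass invented here to demonstrate the setter protocol.

    from spacy.en import English
    from spacy.syntax.iterators import DocIterator

    nlp = English()
    doc = nlp(u'The quick brown fox jumps over the lazy dog.')

    # Parser._finalize() has set an EnglishNounChunks instance on the doc,
    # so the property yields Span objects labelled 'NP'.
    for chunk in doc.noun_chunks:
        print(chunk)

    # The property setter takes an iterator class and instantiates it with
    # the doc. QuietNounChunks is hypothetical: it yields no chunks at all.
    class QuietNounChunks(DocIterator):
        def __next__(self):
            raise StopIteration

    doc.noun_chunks = QuietNounChunks
    print(list(doc.noun_chunks))    # -> []

Note that each iterator instance is stateful: EnglishNounChunks keeps its
position in self.i, so a second pass over doc.noun_chunks yields nothing
unless the property is assigned a class again.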