mirror of https://github.com/explosion/spaCy.git, synced 2024-12-25 09:26:27 +03:00
add baseclass DocIterator for iterators over documents

add classes for English and German noun chunks; the respective iterators are set on the document when it is created by the parser, as they depend on the annotation scheme of the parsing model
This commit is contained in:
parent 03fb498dbe
commit 5e2e8e951a
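In practice, a parsed Doc now carries its own noun-chunk iterator, attached by the parser in _finalize, and doc.noun_chunks simply walks it. A usage sketch, assuming the English pipeline constructor of this spaCy era (not part of the diff):

    from spacy.en import English

    nlp = English()
    doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers.')
    for np in doc.noun_chunks:    # driven by the EnglishNounChunks iterator
        print(np.text)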
setup.py | 3
@@ -63,8 +63,7 @@ MOD_NAMES = [
     'spacy.matcher',
     'spacy.syntax.ner',
     'spacy.symbols',
-    'spacy.en.iterators',
-    'spacy.de.iterators']
+    'spacy.syntax.iterators']
 
 
 # By subclassing build_extensions we have the actual compiler that will be used
spacy/de/iterators.pyx | 28 (deleted)

@@ -1,28 +0,0 @@
-from spacy.structs cimport TokenC
-from spacy.tokens.span cimport Span
-
-from spacy.parts_of_speech cimport NOUN
-
-def noun_chunks(Span sent):
-    # this function extracts spans headed by NOUNs starting from the left-most
-    # syntactic dependent until the NOUN itself
-    # for close apposition and measurement construction, the span is sometimes
-    # extended to the right of the NOUN
-    # example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
-    # just "eine Tasse", same for "das Thema Familie"
-    cdef const TokenC* word
-    strings = sent.doc.vocab.strings
-    labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
-    close_app = strings['nk']
-    np_deps = [strings[label] for label in labels]
-    np_label = strings['NP']
-    for i in range(sent.start, sent.end):
-        word = &sent.doc.c[i]
-        if word.pos == NOUN and word.dep in np_deps:
-            rbracket = i+1
-            # try to extend the span to the right
-            # to capture close apposition/measurement constructions
-            for rdep in sent.doc[i].rights:
-                if rdep.pos == NOUN and rdep.dep == close_app:
-                    rbracket = rdep.i+1
-            yield Span(sent.doc, word.l_edge, rbracket, label=np_label)
spacy/en/iterators.pyx | 24 (deleted)

@@ -1,24 +0,0 @@
-from spacy.structs cimport TokenC
-from spacy.tokens.span cimport Span
-
-from spacy.parts_of_speech cimport NOUN
-
-def noun_chunks(Span sent):
-    cdef const TokenC* word
-    strings = sent.doc.vocab.strings
-    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
-    np_deps = [strings[label] for label in labels]
-    conj = strings['conj']
-    np_label = strings['NP']
-    for i in range(sent.start, sent.end):
-        word = &sent.doc.c[i]
-        if word.pos == NOUN and word.dep in np_deps:
-            yield Span(sent.doc, word.l_edge, i+1, label=np_label)
-        elif word.pos == NOUN and word.dep == conj:
-            head = word+word.head
-            while head.dep == conj and head.head < 0:
-                head += head.head
-            # If the head is an NP, and we're coordinated to it, we're an NP
-            if head.dep in np_deps:
-                yield Span(sent.doc, word.l_edge, i+1, label=np_label)
-
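The deleted modules implemented noun chunking as per-sentence generator functions. For reference, a rough pure-Python rendering of the English rule above, written against spaCy's public Token attributes (pos_, dep_, left_edge — names assumed by this sketch, not taken from the diff):

    NP_DEPS = {'nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root'}

    def english_noun_chunks(doc):
        for word in doc:
            if word.pos_ == 'NOUN' and word.dep_ in NP_DEPS:
                # span runs from the left-most syntactic dependent to the noun
                yield doc[word.left_edge.i : word.i + 1]
            elif word.pos_ == 'NOUN' and word.dep_ == 'conj':
                # climb the conjunct chain leftwards to find the phrase head
                head = word.head
                while head.dep_ == 'conj' and head.head.i < head.i:
                    head = head.head
                # if the head is an NP and we're coordinated to it, we're an NP too
                if head.dep_ in NP_DEPS:
                    yield doc[word.left_edge.i : word.i + 1]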
spacy/syntax/iterators.pxd | 17 (new file)
@@ -0,0 +1,17 @@
+from spacy.tokens.doc cimport Doc
+
+cdef class DocIterator:
+    cdef Doc _doc
+
+cdef class EnglishNounChunks(DocIterator):
+    cdef int i
+    cdef int _np_label
+    cdef set _np_deps
+    cdef int _conjunct    # string id for the 'conj' label, assigned in the .pyx
+
+cdef class GermanNounChunks(DocIterator):
+    cdef int i
+    cdef int _np_label
+    cdef set _np_deps
+    cdef int _close_app
+
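A .pxd like this is what makes the instance attributes legal at all: a cdef class stores its attributes in a fixed C struct, so every attribute the .pyx assigns (including EnglishNounChunks._conjunct) must be declared here, or compilation fails. A minimal illustration, not part of the commit:

    cdef class Example:
        cdef int declared            # reserves a slot in the C struct

        def __init__(self):
            self.declared = 1        # fine
            # self.undeclared = 2    # would not compile: not a member of Example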
spacy/syntax/iterators.pyx | 82 (new file)
@@ -0,0 +1,82 @@
+from spacy.structs cimport TokenC
+from spacy.tokens.span cimport Span
+from spacy.tokens.doc cimport Doc
+from spacy.tokens.token cimport Token
+
+from spacy.parts_of_speech cimport NOUN
+
+# base class for document iterators
+cdef class DocIterator:
+    def __init__(self, Doc doc):
+        self._doc = doc
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        raise NotImplementedError
+
+
+cdef class EnglishNounChunks(DocIterator):
+    def __init__(self, Doc doc):
+        super(EnglishNounChunks, self).__init__(doc)
+        labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
+        self._np_label = self._doc.vocab.strings['NP']
+        self._np_deps = set(self._doc.vocab.strings[label] for label in labels)
+        self._conjunct = self._doc.vocab.strings['conj']
+        self.i = 0
+
+    def __next__(self):
+        cdef const TokenC* word
+        cdef widx
+        while self.i < self._doc.length:
+            widx = self.i
+            self.i += 1
+            word = &self._doc.c[widx]
+            if word.pos == NOUN:
+                if word.dep in self._np_deps:
+                    return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
+                elif word.dep == self._conjunct:
+                    head = word + word.head
+                    while head.dep == self._conjunct and head.head < 0:
+                        head += head.head
+                    # If the head is an NP, and we're coordinated to it, we're an NP
+                    if head.dep in self._np_deps:
+                        return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
+        raise StopIteration
+
+
+# this iterator extracts spans headed by NOUNs starting from the left-most
+# syntactic dependent until the NOUN itself
+# for close apposition and measurement construction, the span is sometimes
+# extended to the right of the NOUN
+# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
+# just "eine Tasse", same for "das Thema Familie"
+cdef class GermanNounChunks(DocIterator):
+    def __init__(self, Doc doc):
+        super(GermanNounChunks, self).__init__(doc)
+        labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
+        self._np_label = self._doc.vocab.strings['NP']
+        self._np_deps = set(self._doc.vocab.strings[label] for label in labels)
+        self._close_app = self._doc.vocab.strings['nk']
+        self.i = 0
+
+    def __next__(self):
+        cdef const TokenC* word
+        cdef int rbracket
+        cdef Token rdep
+        cdef widx
+        while self.i < self._doc.length:
+            widx = self.i
+            self.i += 1
+            word = &self._doc.c[widx]
+            if word.pos == NOUN and word.dep in self._np_deps:
+                rbracket = widx+1
+                # try to extend the span to the right
+                # to capture close apposition/measurement constructions
+                for rdep in self._doc[widx].rights:
+                    if rdep.pos == NOUN and rdep.dep == self._close_app:
+                        rbracket = rdep.i+1
+                return Span(self._doc, word.l_edge, rbracket, label=self._np_label)
+        raise StopIteration
+
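The new classes can also be driven by hand, although the parser normally attaches them in _finalize. A short sketch, assuming a parsed doc from an English pipeline:

    from spacy.syntax.iterators import EnglishNounChunks

    chunks = EnglishNounChunks(doc)   # bind the iterator to a parsed Doc
    for np in chunks:                 # yields Span objects labelled 'NP'
        print(np.text)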
spacy/syntax/parser.pyx

@@ -47,6 +47,8 @@ from ._parse_features cimport fill_context
 from .stateclass cimport StateClass
 from ._state cimport StateC
 
+from spacy.syntax.iterators cimport DocIterator, EnglishNounChunks, GermanNounChunks
+CHUNKERS = {'en': EnglishNounChunks, 'de': GermanNounChunks}
 
 
 DEBUG = False
@@ -113,12 +115,9 @@ cdef class Parser:
         cdef int nr_feat = self.model.nr_feat
         with nogil:
             self.parseC(tokens.c, tokens.length, nr_feat, nr_class)
-        tokens.is_parsed = True
         # Check for KeyboardInterrupt etc. Untested
         PyErr_CheckSignals()
-        # projectivize output
-        if self._projectivize:
-            PseudoProjectivity.deprojectivize(tokens)
+        self._finalize(tokens)
 
     def pipe(self, stream, int batch_size=1000, int n_threads=2):
         cdef Pool mem = Pool()
@@ -144,7 +143,7 @@ cdef class Parser:
                     raise ValueError("Error parsing doc: %s" % sent_str)
                 PyErr_CheckSignals()
                 for doc in queue:
-                    doc.is_parsed = True
+                    self._finalize(doc)
                     yield doc
                 queue = []
         batch_size = len(queue)
@@ -155,10 +154,19 @@ cdef class Parser:
                 with gil:
                     sent_str = queue[i].text
                 raise ValueError("Error parsing doc: %s" % sent_str)
-        for doc in queue:
-            doc.is_parsed = True
-            yield doc
         PyErr_CheckSignals()
+        for doc in queue:
+            self._finalize(doc)
+            yield doc
+
+    def _finalize(self, Doc doc):
+        # deprojectivize output
+        if self._projectivize:
+            PseudoProjectivity.deprojectivize(doc)
+        # set annotation-specific iterators
+        doc.noun_chunks = CHUNKERS.get(doc.vocab.lang, DocIterator)
+        # mark doc as parsed
+        doc.is_parsed = True
 
     cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil:
         cdef ExampleC eg
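Note the changed failure mode: the old doc.noun_chunks warned and skipped sentences in unsupported languages, while CHUNKERS.get(doc.vocab.lang, DocIterator) now falls back to the base class, whose __next__ raises NotImplementedError. A sketch of the resulting behavior (the French pipeline here is hypothetical):

    doc = nlp_fr(u'Une phrase analysée.')   # hypothetical unsupported language
    try:
        list(doc.noun_chunks)
    except NotImplementedError:
        print('no noun chunk iterator registered for %s' % doc.vocab.lang)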
spacy/tokens/doc.pxd

@@ -7,6 +7,8 @@ from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport attr_t
 from ..attrs cimport attr_id_t
 
+from spacy.syntax.iterators cimport DocIterator
+
 
 cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
 
@@ -42,6 +44,8 @@ cdef class Doc:
     cdef int length
     cdef int max_length
 
+    cdef DocIterator noun_chunks_iterator
+
     cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1
 
     cpdef np.ndarray to_array(self, object features)
spacy/tokens/doc.pyx

@@ -81,6 +81,7 @@ cdef class Doc:
         self.is_parsed = False
         self._py_tokens = []
        self._vector = None
+        self.noun_chunks_iterator = DocIterator(self)
 
     def __getitem__(self, object i):
         """Get a Token or a Span from the Doc.
@@ -231,35 +232,21 @@
         # Set start as B
         self.c[start].ent_iob = 3
 
-    @property
-    def noun_chunks(self):
-        """Yield spans for base noun phrases."""
-        if not self.is_parsed:
-            raise ValueError(
-                "noun_chunks requires the dependency parse, which "
-                "requires data to be installed. If you haven't done so, run: "
-                "\npython -m spacy.en.download all\n"
-                "to install the data")
-
-        from spacy.en.iterators import noun_chunks as en_noun_chunks
-        from spacy.de.iterators import noun_chunks as de_noun_chunks
-
-        chunk_rules = {'en': en_noun_chunks,
-                       'de': de_noun_chunks,
-                      }
-
-        for sent in self.sents:
-            print(sent)
-            lang = sent.root.lang_
-            chunker = chunk_rules.get(lang, None)
-            if chunker == None:
-                warnings.warn("noun_chunks is not available for language %s." % lang)
-                print(sent.root.orth_)
-                continue
-
-            for chunk in chunker(sent):
-                yield chunk
-
-
+    property noun_chunks:
+        def __get__(self):
+            """Yield spans for base noun phrases."""
+            if not self.is_parsed:
+                raise ValueError(
+                    "noun_chunks requires the dependency parse, which "
+                    "requires data to be installed. If you haven't done so, run: "
+                    "\npython -m spacy.en.download all\n"
+                    "to install the data")
+
+            yield from self.noun_chunks_iterator
+
+        def __set__(self, DocIterator):
+            self.noun_chunks_iterator = DocIterator(self)
+
+
     @property
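Since __set__ receives an iterator class and instantiates it against the doc, a chunker for a new annotation scheme can be swapped in after parsing. SpanishNounChunks below is a hypothetical subclass, not part of the commit:

    from spacy.syntax.iterators import DocIterator

    class SpanishNounChunks(DocIterator):
        def __next__(self):
            raise StopIteration           # real chunk rules would go here

    doc.noun_chunks = SpanishNounChunks   # stored internally as SpanishNounChunks(doc)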
spacy/vocab.pyx

@@ -27,7 +27,7 @@ from . import symbols
 from cymem.cymem cimport Address
 from . import util
 from .serialize.packer cimport Packer
-from .attrs cimport PROB
+from .attrs cimport PROB, LANG
 
 try:
     import copy_reg
@@ -105,6 +105,13 @@ cdef class Vocab:
             self._serializer = Packer(self, self.serializer_freqs)
         return self._serializer
 
+    property lang:
+        def __get__(self):
+            langfunc = None
+            if self.get_lex_attr:
+                langfunc = self.get_lex_attr.get(LANG, None)
+            return langfunc('_') if langfunc else ''
+
     def __len__(self):
         """The current number of lexemes stored."""
         return self.length
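The new Vocab.lang property reads the language code off the LANG lexical-attribute getter, falling back to an empty string. A usage sketch, assuming an English pipeline:

    nlp = English()
    print(nlp.vocab.lang)    # 'en'; '' if no LANG getter is configured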