From 5e2e8e951a75348d069d68cade7972c6cff55ee9 Mon Sep 17 00:00:00 2001
From: Wolfgang Seeker
Date: Wed, 16 Mar 2016 15:53:35 +0100
Subject: [PATCH] add base class DocIterator for iterators over documents

Add classes for English and German noun chunks. The respective iterators
are set on the document when it is created by the parser, since they
depend on the annotation scheme of the parsing model.
---
 setup.py                   |  3 +-
 spacy/de/iterators.pxd     |  0
 spacy/de/iterators.pyx     | 28 -------------
 spacy/en/iterators.pxd     |  0
 spacy/en/iterators.pyx     | 24 -----------
 spacy/syntax/iterators.pxd | 17 ++++++++
 spacy/syntax/iterators.pyx | 82 ++++++++++++++++++++++++++++++++++++++
 spacy/syntax/parser.pyx    | 24 +++++++----
 spacy/tokens/doc.pxd       |  4 ++
 spacy/tokens/doc.pyx       | 39 ++++++------------
 spacy/vocab.pyx            |  9 ++++-
 11 files changed, 141 insertions(+), 89 deletions(-)
 delete mode 100644 spacy/de/iterators.pxd
 delete mode 100644 spacy/de/iterators.pyx
 delete mode 100644 spacy/en/iterators.pxd
 delete mode 100644 spacy/en/iterators.pyx
 create mode 100644 spacy/syntax/iterators.pxd
 create mode 100644 spacy/syntax/iterators.pyx

diff --git a/setup.py b/setup.py
index 7449212b9..91a118227 100644
--- a/setup.py
+++ b/setup.py
@@ -63,8 +63,7 @@ MOD_NAMES = [
     'spacy.matcher',
     'spacy.syntax.ner',
     'spacy.symbols',
-    'spacy.en.iterators',
-    'spacy.de.iterators']
+    'spacy.syntax.iterators']
 
 
 # By subclassing build_extensions we have the actual compiler that will be used
diff --git a/spacy/de/iterators.pxd b/spacy/de/iterators.pxd
deleted file mode 100644
index e69de29bb..000000000
diff --git a/spacy/de/iterators.pyx b/spacy/de/iterators.pyx
deleted file mode 100644
index a6321bd57..000000000
--- a/spacy/de/iterators.pyx
+++ /dev/null
@@ -1,28 +0,0 @@
-from spacy.structs cimport TokenC
-from spacy.tokens.span cimport Span
-
-from spacy.parts_of_speech cimport NOUN
-
-def noun_chunks(Span sent):
-    # this function extracts spans headed by NOUNs starting from the left-most
-    # syntactic dependent until the NOUN itself
-    # for close apposition and measurement construction, the span is sometimes
-    # extended to the right of the NOUN
-    # example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
-    # just "eine Tasse", same for "das Thema Familie"
-    cdef const TokenC* word
-    strings = sent.doc.vocab.strings
-    labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
-    close_app = strings['nk']
-    np_deps = [strings[label] for label in labels]
-    np_label = strings['NP']
-    for i in range(sent.start, sent.end):
-        word = &sent.doc.c[i]
-        if word.pos == NOUN and word.dep in np_deps:
-            rbracket = i+1
-            # try to extend the span to the right
-            # to capture close apposition/measurement constructions
-            for rdep in sent.doc[i].rights:
-                if rdep.pos == NOUN and rdep.dep == close_app:
-                    rbracket = rdep.i+1
-            yield Span(sent.doc, word.l_edge, rbracket, label=np_label)
diff --git a/spacy/en/iterators.pxd b/spacy/en/iterators.pxd
deleted file mode 100644
index e69de29bb..000000000
diff --git a/spacy/en/iterators.pyx b/spacy/en/iterators.pyx
deleted file mode 100644
index e4f0fe2a4..000000000
--- a/spacy/en/iterators.pyx
+++ /dev/null
@@ -1,24 +0,0 @@
-from spacy.structs cimport TokenC
-from spacy.tokens.span cimport Span
-
-from spacy.parts_of_speech cimport NOUN
-
-def noun_chunks(Span sent):
-    cdef const TokenC* word
-    strings = sent.doc.vocab.strings
-    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
-    np_deps = [strings[label] for label in labels]
-    conj = strings['conj']
-    np_label = strings['NP']
-    for i in range(sent.start, sent.end):
-        word = &sent.doc.c[i]
-        if word.pos == NOUN and word.dep in np_deps:
-            yield Span(sent.doc, word.l_edge, i+1, label=np_label)
-        elif word.pos == NOUN and word.dep == conj:
-            head = word+word.head
-            while head.dep == conj and head.head < 0:
-                head += head.head
-            # If the head is an NP, and we're coordinated to it, we're an NP
-            if head.dep in np_deps:
-                yield Span(sent.doc, word.l_edge, i+1, label=np_label)
-
diff --git a/spacy/syntax/iterators.pxd b/spacy/syntax/iterators.pxd
new file mode 100644
index 000000000..662f851c8
--- /dev/null
+++ b/spacy/syntax/iterators.pxd
@@ -0,0 +1,17 @@
+
+from spacy.tokens.doc cimport Doc
+
+cdef class DocIterator:
+    cdef Doc _doc
+
+cdef class EnglishNounChunks(DocIterator):
+    cdef int i
+    cdef int _np_label
+    cdef set _np_deps
+    cdef int _conjunct
+
+cdef class GermanNounChunks(DocIterator):
+    cdef int i
+    cdef int _np_label
+    cdef set _np_deps
+    cdef int _close_app
diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx
new file mode 100644
index 000000000..78679b8ce
--- /dev/null
+++ b/spacy/syntax/iterators.pyx
@@ -0,0 +1,82 @@
+from spacy.structs cimport TokenC
+from spacy.tokens.span cimport Span
+from spacy.tokens.doc cimport Doc
+from spacy.tokens.token cimport Token
+
+from spacy.parts_of_speech cimport NOUN
+
+# base class for document iterators
+cdef class DocIterator:
+    def __init__(self, Doc doc):
+        self._doc = doc
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        raise NotImplementedError
+
+
+cdef class EnglishNounChunks(DocIterator):
+    def __init__(self, Doc doc):
+        super(EnglishNounChunks,self).__init__(doc)
+        labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
+        self._np_label = self._doc.vocab.strings['NP']
+        self._np_deps = set( self._doc.vocab.strings[label] for label in labels )
+        self._conjunct = self._doc.vocab.strings['conj']
+        self.i = 0
+
+    def __next__(self):
+        cdef const TokenC* word
+        cdef int widx
+        while self.i < self._doc.length:
+            widx = self.i
+            self.i += 1
+            word = &self._doc.c[widx]
+            if word.pos == NOUN:
+                if word.dep in self._np_deps:
+                    return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
+                elif word.dep == self._conjunct:
+                    head = word+word.head
+                    while head.dep == self._conjunct and head.head < 0:
+                        head += head.head
+                    # If the head is an NP, and we're coordinated to it, we're an NP
+                    if head.dep in self._np_deps:
+                        return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
+        raise StopIteration
+
+
+# this iterator extracts spans headed by NOUNs, starting from the left-most
+# syntactic dependent and running up to the NOUN itself
+# for close apposition and measurement constructions, the span is sometimes
+# extended to the right of the NOUN
+# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
+# just "eine Tasse", same for "das Thema Familie"
+cdef class GermanNounChunks(DocIterator):
+    def __init__(self, Doc doc):
+        super(GermanNounChunks,self).__init__(doc)
+        labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
+        self._np_label = self._doc.vocab.strings['NP']
+        self._np_deps = set( self._doc.vocab.strings[label] for label in labels )
+        self._close_app = self._doc.vocab.strings['nk']
+        self.i = 0
+
+    def __next__(self):
+        cdef const TokenC* word
+        cdef int rbracket
+        cdef Token rdep
+        cdef int widx
+        while self.i < self._doc.length:
+            widx = self.i
+            self.i += 1
+            word = &self._doc.c[widx]
+            if word.pos == NOUN and word.dep in self._np_deps:
+                rbracket = widx+1
+                # try to extend the span to the right
+                # to capture close apposition/measurement constructions
+                for rdep in self._doc[widx].rights:
+                    if rdep.pos == NOUN and rdep.dep == self._close_app:
+                        rbracket = rdep.i+1
+                return Span(self._doc, word.l_edge, rbracket, label=self._np_label)
+        raise StopIteration
+
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index a83c397dc..c7b88d5b8 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -47,6 +47,8 @@ from ._parse_features cimport fill_context
 from .stateclass cimport StateClass
 from ._state cimport StateC
 
+from spacy.syntax.iterators cimport DocIterator, EnglishNounChunks, GermanNounChunks
+CHUNKERS = {'en':EnglishNounChunks, 'de':GermanNounChunks}
 
 DEBUG = False
@@ -113,12 +115,9 @@ cdef class Parser:
         cdef int nr_feat = self.model.nr_feat
         with nogil:
             self.parseC(tokens.c, tokens.length, nr_feat, nr_class)
-        tokens.is_parsed = True
         # Check for KeyboardInterrupt etc. Untested
        PyErr_CheckSignals()
-        # projectivize output
-        if self._projectivize:
-            PseudoProjectivity.deprojectivize(tokens)
+        self._finalize(tokens)
 
     def pipe(self, stream, int batch_size=1000, int n_threads=2):
         cdef Pool mem = Pool()
@@ -144,7 +143,7 @@ cdef class Parser:
                                 raise ValueError("Error parsing doc: %s" % sent_str)
                 PyErr_CheckSignals()
                 for doc in queue:
-                    doc.is_parsed = True
+                    self._finalize(doc)
                     yield doc
                 queue = []
         batch_size = len(queue)
@@ -155,10 +154,19 @@ cdef class Parser:
                 with gil:
                     sent_str = queue[i].text
                     raise ValueError("Error parsing doc: %s" % sent_str)
-        for doc in queue:
-            doc.is_parsed = True
-            yield doc
         PyErr_CheckSignals()
+        for doc in queue:
+            self._finalize(doc)
+            yield doc
+
+    def _finalize(self, Doc doc):
+        # deprojectivize output
+        if self._projectivize:
+            PseudoProjectivity.deprojectivize(doc)
+        # set annotation-specific iterators
+        doc.noun_chunks = CHUNKERS.get(doc.vocab.lang,DocIterator)
+        # mark doc as parsed
+        doc.is_parsed = True
 
     cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil:
         cdef ExampleC eg
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index aa2cf6b54..02b6f29a5 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -7,6 +7,8 @@ from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport attr_t
 from ..attrs cimport attr_id_t
 
+from spacy.syntax.iterators cimport DocIterator
+
 
 cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
 
@@ -42,6 +44,8 @@ cdef class Doc:
     cdef int length
     cdef int max_length
 
+    cdef DocIterator noun_chunks_iterator
+
    cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1
 
     cpdef np.ndarray to_array(self, object features)
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 887b1085f..faed51e23 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -81,6 +81,7 @@ cdef class Doc:
         self.is_parsed = False
         self._py_tokens = []
         self._vector = None
+        self.noun_chunks_iterator = DocIterator(self)
 
     def __getitem__(self, object i):
         """Get a Token or a Span from the Doc.
@@ -231,36 +232,22 @@ cdef class Doc:
         # Set start as B
         self.c[start].ent_iob = 3
 
-    @property
-    def noun_chunks(self):
-        """Yield spans for base noun phrases."""
-        if not self.is_parsed:
-            raise ValueError(
-                "noun_chunks requires the dependency parse, which "
-                "requires data to be installed. If you haven't done so, run: "
-                "\npython -m spacy.en.download all\n"
-                "to install the data")
-        from spacy.en.iterators import noun_chunks as en_noun_chunks
-        from spacy.de.iterators import noun_chunks as de_noun_chunks
+    property noun_chunks:
+        def __get__(self):
+            """Yield spans for base noun phrases."""
+            if not self.is_parsed:
+                raise ValueError(
+                    "noun_chunks requires the dependency parse, which "
+                    "requires data to be installed. If you haven't done so, run: "
+                    "\npython -m spacy.en.download all\n"
+                    "to install the data")
 
-        chunk_rules = {'en':en_noun_chunks,
-                       'de':de_noun_chunks,
-                       }
+            yield from self.noun_chunks_iterator
 
-        for sent in self.sents:
-            print(sent)
-            lang = sent.root.lang_
-            chunker = chunk_rules.get(lang,None)
-            if chunker == None:
-                warnings.warn("noun_chunks is not available for language %s." % lang)
-                print(sent.root.orth_)
-                continue
+        def __set__(self, iterator_type):
+            self.noun_chunks_iterator = iterator_type(self)
 
-            for chunk in chunker(sent):
-                yield chunk
-
-
     @property
     def sents(self):
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index df8a4bbd5..3494d2e40 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -27,7 +27,7 @@ from . import symbols
 from cymem.cymem cimport Address
 from . import util
 from .serialize.packer cimport Packer
-from .attrs cimport PROB
+from .attrs cimport PROB, LANG
 
 try:
     import copy_reg
@@ -105,6 +105,13 @@ cdef class Vocab:
             self._serializer = Packer(self, self.serializer_freqs)
         return self._serializer
 
+    property lang:
+        def __get__(self):
+            langfunc = None
+            if self.get_lex_attr:
+                langfunc = self.get_lex_attr.get(LANG,None)
+            return langfunc('_') if langfunc else ''
+
     def __len__(self):
         """The current number of lexemes stored."""
         return self.length
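
Usage sketch (illustrative only, not part of the patch): after this change the
parser attaches a language-specific iterator in Parser._finalize(), and the
doc.noun_chunks property both yields from that iterator and accepts a
replacement iterator class. In the sketch below, English is the usual entry
point at this point in spaCy's history and assumes the parser data is
installed (see the download message above); QuietNounChunks is a hypothetical
DocIterator subclass invented here to demonstrate the setter protocol.

    from spacy.en import English
    from spacy.syntax.iterators import DocIterator

    nlp = English()
    doc = nlp(u'The quick brown fox jumps over the lazy dog.')

    # Parser._finalize() has set an EnglishNounChunks instance on the doc,
    # so the property yields Span objects labelled 'NP'.
    for chunk in doc.noun_chunks:
        print(chunk)

    # The property setter takes an iterator class and instantiates it with
    # the doc. QuietNounChunks is hypothetical: it yields no chunks at all.
    class QuietNounChunks(DocIterator):
        def __next__(self):
            raise StopIteration

    doc.noun_chunks = QuietNounChunks
    print(list(doc.noun_chunks))    # -> []

Note that each iterator instance is stateful: EnglishNounChunks keeps its
position in self.i, so a second pass over doc.noun_chunks yields nothing
unless the property is assigned a class again.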