diff --git a/setup.py b/setup.py
index 7449212b9..91a118227 100644
--- a/setup.py
+++ b/setup.py
@@ -63,8 +63,7 @@ MOD_NAMES = [
     'spacy.matcher',
     'spacy.syntax.ner',
     'spacy.symbols',
-    'spacy.en.iterators',
-    'spacy.de.iterators']
+    'spacy.syntax.iterators']
 
 
 # By subclassing build_extensions we have the actual compiler that will be used
diff --git a/spacy/de/iterators.pxd b/spacy/de/iterators.pxd
deleted file mode 100644
index e69de29bb..000000000
diff --git a/spacy/de/iterators.pyx b/spacy/de/iterators.pyx
deleted file mode 100644
index a6321bd57..000000000
--- a/spacy/de/iterators.pyx
+++ /dev/null
@@ -1,28 +0,0 @@
-from spacy.structs cimport TokenC
-from spacy.tokens.span cimport Span
-
-from spacy.parts_of_speech cimport NOUN
-
-def noun_chunks(Span sent):
-    # this function extracts spans headed by NOUNs starting from the left-most
-    # syntactic dependent until the NOUN itself
-    # for close apposition and measurement construction, the span is sometimes
-    # extended to the right of the NOUN
-    # example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
-    # just "eine Tasse", same for "das Thema Familie"
-    cdef const TokenC* word
-    strings = sent.doc.vocab.strings
-    labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
-    close_app = strings['nk']
-    np_deps = [strings[label] for label in labels]
-    np_label = strings['NP']
-    for i in range(sent.start, sent.end):
-        word = &sent.doc.c[i]
-        if word.pos == NOUN and word.dep in np_deps:
-            rbracket = i+1
-            # try to extend the span to the right
-            # to capture close apposition/measurement constructions
-            for rdep in sent.doc[i].rights:
-                if rdep.pos == NOUN and rdep.dep == close_app:
-                    rbracket = rdep.i+1
-            yield Span(sent.doc, word.l_edge, rbracket, label=np_label)
diff --git a/spacy/en/iterators.pxd b/spacy/en/iterators.pxd
deleted file mode 100644
index e69de29bb..000000000
diff --git a/spacy/en/iterators.pyx b/spacy/en/iterators.pyx
deleted file mode 100644
index e4f0fe2a4..000000000
--- a/spacy/en/iterators.pyx
+++ /dev/null
@@ -1,24 +0,0 @@
-from spacy.structs cimport TokenC
-from spacy.tokens.span cimport Span
-
-from spacy.parts_of_speech cimport NOUN
-
-def noun_chunks(Span sent):
-    cdef const TokenC* word
-    strings = sent.doc.vocab.strings
-    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
-    np_deps = [strings[label] for label in labels]
-    conj = strings['conj']
-    np_label = strings['NP']
-    for i in range(sent.start, sent.end):
-        word = &sent.doc.c[i]
-        if word.pos == NOUN and word.dep in np_deps:
-            yield Span(sent.doc, word.l_edge, i+1, label=np_label)
-        elif word.pos == NOUN and word.dep == conj:
-            head = word+word.head
-            while head.dep == conj and head.head < 0:
-                head += head.head
-            # If the head is an NP, and we're coordinated to it, we're an NP
-            if head.dep in np_deps:
-                yield Span(sent.doc, word.l_edge, i+1, label=np_label)
-
diff --git a/spacy/syntax/iterators.pxd b/spacy/syntax/iterators.pxd
new file mode 100644
index 000000000..662f851c8
--- /dev/null
+++ b/spacy/syntax/iterators.pxd
@@ -0,0 +1,17 @@
+
+from spacy.tokens.doc cimport Doc
+
+cdef class DocIterator:
+    cdef Doc _doc
+
+cdef class EnglishNounChunks(DocIterator):
+    cdef int i
+    cdef int _np_label
+    cdef set _np_deps
+    cdef int _conjunct
+
+cdef class GermanNounChunks(DocIterator):
+    cdef int i
+    cdef int _np_label
+    cdef set _np_deps
+    cdef int _close_app
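
The chunkers declared in iterators.pxd follow the standard Python iterator
protocol, which the .pyx below wires up: __iter__ returns the object itself,
and __next__ either returns the next chunk or raises StopIteration. A rough
pure-Python sketch of that pattern, not part of the patch and with
illustrative names only:

    # Illustrative only: the DocIterator pattern in plain Python.
    class PrefixIterator:
        def __init__(self, words):
            self._words = words    # stands in for the Doc
            self.i = 0             # scan position, like EnglishNounChunks.i

        def __iter__(self):
            return self            # the iterator is its own iterable

        def __next__(self):
            # resume scanning where the previous call left off
            while self.i < len(self._words):
                word = self._words[self.i]
                self.i += 1
                if word.startswith('n'):
                    return word
            raise StopIteration    # exhausted: terminates a for-loop

    print(list(PrefixIterator(['noun', 'verb', 'name'])))  # ['noun', 'name']

Because the iterator carries its own scan position, a single instance can
only be consumed once; each pass over a document needs a fresh instance.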
diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx
new file mode 100644
index 000000000..78679b8ce
--- /dev/null
+++ b/spacy/syntax/iterators.pyx
@@ -0,0 +1,82 @@
+from spacy.structs cimport TokenC
+from spacy.tokens.span cimport Span
+from spacy.tokens.doc cimport Doc
+from spacy.tokens.token cimport Token
+
+from spacy.parts_of_speech cimport NOUN
+
+# base class for document iterators
+cdef class DocIterator:
+    def __init__(self, Doc doc):
+        self._doc = doc
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        raise NotImplementedError
+
+
+cdef class EnglishNounChunks(DocIterator):
+    def __init__(self, Doc doc):
+        super(EnglishNounChunks, self).__init__(doc)
+        labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
+        self._np_label = self._doc.vocab.strings['NP']
+        self._np_deps = set(self._doc.vocab.strings[label] for label in labels)
+        self._conjunct = self._doc.vocab.strings['conj']
+        self.i = 0
+
+    def __next__(self):
+        cdef const TokenC* word
+        cdef int widx
+        while self.i < self._doc.length:
+            widx = self.i
+            self.i += 1
+            word = &self._doc.c[widx]
+            if word.pos == NOUN:
+                if word.dep in self._np_deps:
+                    return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
+                elif word.dep == self._conjunct:
+                    head = word + word.head
+                    while head.dep == self._conjunct and head.head < 0:
+                        head += head.head
+                    # If the head is an NP, and we're coordinated to it, we're an NP
+                    if head.dep in self._np_deps:
+                        return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
+        raise StopIteration
+
+
+# this iterator extracts spans headed by NOUNs starting from the left-most
+# syntactic dependent until the NOUN itself
+# for close apposition and measurement construction, the span is sometimes
+# extended to the right of the NOUN
+# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
+# just "eine Tasse", same for "das Thema Familie"
+cdef class GermanNounChunks(DocIterator):
+    def __init__(self, Doc doc):
+        super(GermanNounChunks, self).__init__(doc)
+        labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
+        self._np_label = self._doc.vocab.strings['NP']
+        self._np_deps = set(self._doc.vocab.strings[label] for label in labels)
+        self._close_app = self._doc.vocab.strings['nk']
+        self.i = 0
+
+    def __next__(self):
+        cdef const TokenC* word
+        cdef int rbracket
+        cdef Token rdep
+        cdef int widx
+        while self.i < self._doc.length:
+            widx = self.i
+            self.i += 1
+            word = &self._doc.c[widx]
+            if word.pos == NOUN and word.dep in self._np_deps:
+                rbracket = widx+1
+                # try to extend the span to the right
+                # to capture close apposition/measurement constructions
+                for rdep in self._doc[widx].rights:
+                    if rdep.pos == NOUN and rdep.dep == self._close_app:
+                        rbracket = rdep.i+1
+                return Span(self._doc, word.l_edge, rbracket, label=self._np_label)
+        raise StopIteration
+
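
A note on the conjunct handling in EnglishNounChunks.__next__: TokenC.head
stores the head as a relative offset, so "head = word + word.head" is pointer
arithmetic that lands on the head token, and the while-loop then walks left
up a chain of conj dependents to the first conjunct; if that conjunct heads a
noun phrase, the current token is yielded as one too. The same walk over
plain (dep, head_offset) tuples, illustrative only and not part of the patch:

    # Illustrative only: "dogs and cats and mice" as (dep, head_offset) pairs.
    tokens = [('nsubj', 2),    # dogs -> some verb to the right
              ('cc',   -1),    # and  -> dogs
              ('conj', -2),    # cats -> dogs
              ('cc',   -1),    # and  -> cats
              ('conj', -2)]    # mice -> cats

    i = 4                          # start from "mice"
    head = i + tokens[i][1]        # follow the relative offset
    while tokens[head][0] == 'conj' and tokens[head][1] < 0:
        head += tokens[head][1]    # step further left up the conj chain
    print(head)                    # 0 -- "dogs", the first conjunct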
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index a83c397dc..c7b88d5b8 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -47,6 +47,8 @@ from ._parse_features cimport fill_context
 from .stateclass cimport StateClass
 from ._state cimport StateC
 
+from spacy.syntax.iterators cimport DocIterator, EnglishNounChunks, GermanNounChunks
+CHUNKERS = {'en': EnglishNounChunks, 'de': GermanNounChunks}
 
 DEBUG = False
 
@@ -113,12 +115,9 @@ cdef class Parser:
         cdef int nr_feat = self.model.nr_feat
         with nogil:
             self.parseC(tokens.c, tokens.length, nr_feat, nr_class)
-        tokens.is_parsed = True
         # Check for KeyboardInterrupt etc. Untested
         PyErr_CheckSignals()
-        # projectivize output
-        if self._projectivize:
-            PseudoProjectivity.deprojectivize(tokens)
+        self._finalize(tokens)
 
     def pipe(self, stream, int batch_size=1000, int n_threads=2):
         cdef Pool mem = Pool()
@@ -144,7 +143,7 @@ cdef class Parser:
                     raise ValueError("Error parsing doc: %s" % sent_str)
             PyErr_CheckSignals()
             for doc in queue:
-                doc.is_parsed = True
+                self._finalize(doc)
                 yield doc
             queue = []
         batch_size = len(queue)
@@ -155,10 +154,19 @@ cdef class Parser:
                 with gil:
                     sent_str = queue[i].text
                 raise ValueError("Error parsing doc: %s" % sent_str)
-        for doc in queue:
-            doc.is_parsed = True
-            yield doc
         PyErr_CheckSignals()
+        for doc in queue:
+            self._finalize(doc)
+            yield doc
+
+    def _finalize(self, Doc doc):
+        # deprojectivize output
+        if self._projectivize:
+            PseudoProjectivity.deprojectivize(doc)
+        # set annotation-specific iterators
+        doc.noun_chunks = CHUNKERS.get(doc.vocab.lang, DocIterator)
+        # mark doc as parsed
+        doc.is_parsed = True
 
     cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil:
         cdef ExampleC eg
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index aa2cf6b54..02b6f29a5 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -7,6 +7,8 @@ from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport attr_t
 from ..attrs cimport attr_id_t
 
+from spacy.syntax.iterators cimport DocIterator
+
 
 cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
 
@@ -42,6 +44,8 @@ cdef class Doc:
     cdef int length
     cdef int max_length
 
+    cdef DocIterator noun_chunks_iterator
+
     cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1
 
     cpdef np.ndarray to_array(self, object features)
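
Two consequences of the new _finalize hook are easy to miss. First, the value
assigned to doc.noun_chunks is a class, not an instance; the new
Doc.noun_chunks.__set__ in doc.pyx below instantiates it against the doc.
Second, languages without an entry in CHUNKERS fall back to the base
DocIterator, whose __next__ raises NotImplementedError, replacing the
warnings.warn path of the old property. A minimal sketch of that dispatch,
with illustrative stand-in classes:

    # Illustrative only: class-valued dispatch with a base-class fallback.
    class DocIterator:
        def __next__(self):
            raise NotImplementedError   # unsupported language

    class EnglishNounChunks(DocIterator):
        pass                            # real logic lives in iterators.pyx

    class GermanNounChunks(DocIterator):
        pass

    CHUNKERS = {'en': EnglishNounChunks, 'de': GermanNounChunks}

    for lang in ('en', 'de', 'fr'):
        cls = CHUNKERS.get(lang, DocIterator)   # 'fr' falls back to the base
        print(lang, cls.__name__)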
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 887b1085f..faed51e23 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -81,6 +81,7 @@ cdef class Doc:
         self.is_parsed = False
         self._py_tokens = []
         self._vector = None
+        self.noun_chunks_iterator = DocIterator(self)
 
     def __getitem__(self, object i):
         """Get a Token or a Span from the Doc.
@@ -231,36 +232,22 @@ cdef class Doc:
         # Set start as B
         self.c[start].ent_iob = 3
 
-    @property
-    def noun_chunks(self):
-        """Yield spans for base noun phrases."""
-        if not self.is_parsed:
-            raise ValueError(
-                "noun_chunks requires the dependency parse, which "
-                "requires data to be installed. If you haven't done so, run: "
-                "\npython -m spacy.en.download all\n"
-                "to install the data")
-        from spacy.en.iterators import noun_chunks as en_noun_chunks
-        from spacy.de.iterators import noun_chunks as de_noun_chunks
+    property noun_chunks:
+        def __get__(self):
+            """Yield spans for base noun phrases."""
+            if not self.is_parsed:
+                raise ValueError(
+                    "noun_chunks requires the dependency parse, which "
+                    "requires data to be installed. If you haven't done so, run: "
+                    "\npython -m spacy.en.download all\n"
+                    "to install the data")
 
-        chunk_rules = {'en':en_noun_chunks,
-                       'de':de_noun_chunks,
-                      }
+            yield from self.noun_chunks_iterator
 
-        for sent in self.sents:
-            print(sent)
-            lang = sent.root.lang_
-            chunker = chunk_rules.get(lang,None)
-            if chunker == None:
-                warnings.warn("noun_chunks is not available for language %s."
-                              % lang)
-                print(sent.root.orth_)
-                continue
+        def __set__(self, DocIterator):
+            self.noun_chunks_iterator = DocIterator(self)
 
-
-            for chunk in chunker(sent):
-                yield chunk
-
 
     @property
     def sents(self):
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index df8a4bbd5..3494d2e40 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -27,7 +27,7 @@ from . import symbols
 from cymem.cymem cimport Address
 from . import util
 from .serialize.packer cimport Packer
-from .attrs cimport PROB
+from .attrs cimport PROB, LANG
 
 try:
     import copy_reg
@@ -105,6 +105,13 @@ cdef class Vocab:
             self._serializer = Packer(self, self.serializer_freqs)
         return self._serializer
 
+    property lang:
+        def __get__(self):
+            langfunc = None
+            if self.get_lex_attr:
+                langfunc = self.get_lex_attr.get(LANG, None)
+            return langfunc('_') if langfunc else ''
+
     def __len__(self):
         """The current number of lexemes stored."""
         return self.length
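
With all the pieces in place, the expected end-to-end behaviour looks like
this -- a sketch assuming the English model data is installed, using the
spaCy API of this era:

    # Sketch, not part of the patch: exercising the new code paths.
    from spacy.en import English

    nlp = English()
    doc = nlp(u'The quick brown fox jumped over the lazy dog.')

    print(nlp.vocab.lang)                # 'en', via the new Vocab.lang property
    for chunk in doc.noun_chunks:        # served by EnglishNounChunks
        print(chunk.label_, chunk.text)  # e.g. NP 'The quick brown fox'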