From 508fd1f6dc7f645fcfc39578f7cec4f294425942 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 2 May 2016 14:25:10 +0200 Subject: [PATCH] * Refactor noun chunk iterators, so that they're simple functions. Install the iterator when the Doc is created, but allow users to write to the noun_chunk_iterator attribute. The iterator functions accept an object and yield (int start, int end, int label) triples. --- spacy/syntax/arc_eager.pyx | 3 + spacy/syntax/iterators.pxd | 19 ----- spacy/syntax/iterators.pyx | 110 +++++++------------------ spacy/syntax/parser.pyx | 18 +--- spacy/syntax/transition_system.pyx | 3 + spacy/tests/tokens/test_noun_chunks.py | 6 +- spacy/tokens/doc.pyx | 29 +++---- 7 files changed, 57 insertions(+), 131 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 3f2bc7c65..4c6bf4742 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -383,6 +383,9 @@ cdef class ArcEager(TransitionSystem): if st._sent[i].head == 0 and st._sent[i].dep == 0: st._sent[i].dep = self.root_label + def finalize_doc(self, doc): + doc.is_parsed = True + cdef int set_valid(self, int* output, const StateC* st) nogil: cdef bint[N_MOVES] is_valid is_valid[SHIFT] = Shift.is_valid(st, -1) diff --git a/spacy/syntax/iterators.pxd b/spacy/syntax/iterators.pxd index f5ea7e632..e69de29bb 100644 --- a/spacy/syntax/iterators.pxd +++ b/spacy/syntax/iterators.pxd @@ -1,19 +0,0 @@ - -from spacy.tokens.doc cimport Doc - -cdef dict CHUNKERS - -cdef class DocIterator: - cdef Doc _doc - -cdef class EnglishNounChunks(DocIterator): - cdef int i - cdef int _np_label - cdef set _np_deps - cdef int _conjunct - -cdef class GermanNounChunks(DocIterator): - cdef int i - cdef int _np_label - cdef set _np_deps - cdef int _close_app diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx index 4ecfb4529..b8b810d36 100644 --- a/spacy/syntax/iterators.pyx +++ b/spacy/syntax/iterators.pyx @@ -1,55 +1,23 @@ -from spacy.structs cimport TokenC -from spacy.tokens.span cimport Span -from spacy.tokens.doc cimport Doc -from spacy.tokens.token cimport Token - from spacy.parts_of_speech cimport NOUN -CHUNKERS = {'en':EnglishNounChunks, 'de':GermanNounChunks} -# base class for document iterators -cdef class DocIterator: - def __init__(self, Doc doc): - self._doc = doc - - def __iter__(self): - return self - - def __next__(self): - raise NotImplementedError - - -cdef class EnglishNounChunks(DocIterator): - def __init__(self, Doc doc): - super(EnglishNounChunks,self).__init__(doc) - labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root'] - self._np_label = self._doc.vocab.strings['NP'] - self._np_deps = set( self._doc.vocab.strings[label] for label in labels ) - self._conjunct = self._doc.vocab.strings['conj'] - self.i = 0 - - def __iter__(self): - self.i = 0 - return super(EnglishNounChunks,self).__iter__() - - def __next__(self): - cdef const TokenC* word - cdef widx - while self.i < self._doc.length: - widx = self.i - self.i += 1 - word = &self._doc.c[widx] - if word.pos == NOUN: - if word.dep in self._np_deps: - return Span(self._doc, word.l_edge, widx+1, label=self._np_label) - elif word.dep == self._conjunct: - head = word+word.head - while head.dep == self._conjunct and head.head < 0: - head += head.head - # If the head is an NP, and we're coordinated to it, we're an NP - if head.dep in self._np_deps: - return Span(self._doc, word.l_edge, widx+1, label=self._np_label) - raise StopIteration +def english_noun_chunks(doc): + labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', + 'attr', 'root'] + np_deps = [doc.vocab.strings[label] for label in labels] + conj = doc.vocab.strings['conj'] + np_label = doc.vocab.strings['NP'] + for i in range(len(doc)): + word = doc[i] + if word.pos == NOUN and word.dep in np_deps: + yield word.left_edge.i, word.i+1, np_label + elif word.pos == NOUN and word.dep == conj: + head = word.head + while head.dep == conj and head.head.i < head.i: + head = head.head + # If the head is an NP, and we're coordinated to it, we're an NP + if head.dep in np_deps: + yield word.left_edge.i, word.i+1, np_label # this iterator extracts spans headed by NOUNs starting from the left-most @@ -58,35 +26,21 @@ cdef class EnglishNounChunks(DocIterator): # extended to the right of the NOUN # example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not # just "eine Tasse", same for "das Thema Familie" -cdef class GermanNounChunks(DocIterator): - def __init__(self, Doc doc): - super(GermanNounChunks,self).__init__(doc) - labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app'] - self._np_label = self._doc.vocab.strings['NP'] - self._np_deps = set( self._doc.vocab.strings[label] for label in labels ) - self._close_app = self._doc.vocab.strings['nk'] - self.i = 0 +def german_noun_chunks(doc): + labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app'] + np_label = doc.vocab.strings['NP'] + np_deps = set(doc.vocab.strings[label] for label in labels) + close_app = doc.vocab.strings['nk'] - def __iter__(self): - self.i = 0 - return super(GermanNounChunks,self).__iter__() + for word in doc: + if word.pos == NOUN and word.dep in np_deps: + rbracket = word.i+1 + # try to extend the span to the right + # to capture close apposition/measurement constructions + for rdep in doc[word.i].rights: + if rdep.pos == NOUN and rdep.dep == close_app: + rbracket = rdep.i+1 + yield word.l_edge, rbracket, np_label - def __next__(self): - cdef const TokenC* word - cdef int rbracket - cdef Token rdep - cdef widx - while self.i < self._doc.length: - widx = self.i - self.i += 1 - word = &self._doc.c[widx] - if word.pos == NOUN and word.dep in self._np_deps: - rbracket = widx+1 - # try to extend the span to the right - # to capture close apposition/measurement constructions - for rdep in self._doc[widx].rights: - if rdep.pos == NOUN and rdep.dep == self._close_app: - rbracket = rdep.i+1 - return Span(self._doc, word.l_edge, rbracket, label=self._np_label) - raise StopIteration +CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks} diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 20cce7bb6..04f9d5f22 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -47,8 +47,6 @@ from ._parse_features cimport fill_context from .stateclass cimport StateClass from ._state cimport StateC -from spacy.syntax.iterators cimport CHUNKERS, DocIterator, EnglishNounChunks, GermanNounChunks - DEBUG = False def set_debug(val): @@ -116,7 +114,7 @@ cdef class Parser: self.parseC(tokens.c, tokens.length, nr_feat, nr_class) # Check for KeyboardInterrupt etc. Untested PyErr_CheckSignals() - self._finalize(tokens) + self.moves.finalize_doc(tokens) def pipe(self, stream, int batch_size=1000, int n_threads=2): cdef Pool mem = Pool() @@ -142,7 +140,7 @@ cdef class Parser: raise ValueError("Error parsing doc: %s" % sent_str) PyErr_CheckSignals() for doc in queue: - self._finalize(doc) + self.moves.finalize_doc(doc) yield doc queue = [] batch_size = len(queue) @@ -155,18 +153,9 @@ cdef class Parser: raise ValueError("Error parsing doc: %s" % sent_str) PyErr_CheckSignals() for doc in queue: - self._finalize(doc) + self.moves.finalize_doc(doc) yield doc - def _finalize(self, Doc doc): - # deprojectivize output - if self._projectivize: - PseudoProjectivity.deprojectivize(doc) - # set annotation-specific iterators - doc.noun_chunks = CHUNKERS.get(doc.vocab.lang,DocIterator) - # mark doc as parsed - doc.is_parsed = True - cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil: cdef ExampleC eg eg.nr_feat = nr_feat @@ -313,6 +302,7 @@ cdef class StepwiseState: if self.stcls.is_final(): self.parser.moves.finalize_state(self.stcls.c) self.doc.set_parse(self.stcls.c._sent) + self.parser.moves.finalize_doc(self.doc) cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions, diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 2a7ac9523..9e624bd58 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -53,6 +53,9 @@ cdef class TransitionSystem: cdef int finalize_state(self, StateC* state) nogil: pass + def finalize_doc(self, doc): + pass + cdef int preprocess_gold(self, GoldParse gold) except -1: raise NotImplementedError diff --git a/spacy/tests/tokens/test_noun_chunks.py b/spacy/tests/tokens/test_noun_chunks.py index cf72e9ce1..642ce715b 100644 --- a/spacy/tests/tokens/test_noun_chunks.py +++ b/spacy/tests/tokens/test_noun_chunks.py @@ -3,7 +3,7 @@ import numpy as np from spacy.attrs import HEAD, DEP from spacy.symbols import nsubj, dobj, punct, amod, nmod, conj, cc, root from spacy.en import English -from spacy.syntax.iterators import EnglishNounChunks +from spacy.syntax.iterators import english_noun_chunks def test_not_nested(): @@ -22,9 +22,7 @@ def test_not_nested(): [-2, conj], [-5, dobj] ], dtype='int32')) - tokens.noun_chunks = EnglishNounChunks - for chunk in tokens.noun_chunks: - print(chunk.text) + tokens.noun_chunks_iterator = english_noun_chunks word_occurred = {} for chunk in tokens.noun_chunks: for word in chunk: diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 68232ae8c..c1d38f16f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -23,6 +23,7 @@ from .span cimport Span from .token cimport Token from ..serialize.bits cimport BitArray from ..util import normalize_slice +from ..syntax.iterators import CHUNKERS DEF PADDING = 5 @@ -81,7 +82,7 @@ cdef class Doc: self.is_parsed = False self._py_tokens = [] self._vector = None - self.noun_chunks_iterator = DocIterator(self) + self.noun_chunks_iterator = CHUNKERS.get(self.vocab.lang) def __getitem__(self, object i): """Get a Token or a Span from the Doc. @@ -233,21 +234,17 @@ cdef class Doc: self.c[start].ent_iob = 3 - property noun_chunks: - def __get__(self): - """Yield spans for base noun phrases.""" - if not self.is_parsed: - raise ValueError( - "noun_chunks requires the dependency parse, which " - "requires data to be installed. If you haven't done so, run: " - "\npython -m spacy.%s.download all\n" - "to install the data" % self.vocab.lang) - - yield from self.noun_chunks_iterator - - def __set__(self, DocIterator): - self.noun_chunks_iterator = DocIterator(self) - + @property + def noun_chunks(self): + """Yield spans for base noun phrases.""" + if not self.is_parsed: + raise ValueError( + "noun_chunks requires the dependency parse, which " + "requires data to be installed. If you haven't done so, run: " + "\npython -m spacy.%s.download all\n" + "to install the data" % self.vocab.lang) + for start, end, label in self.noun_chunks_iterator(self): + yield Span(self, start, end, label=label) @property def sents(self):