* Refactor noun chunk iterators, so that they're simple functions. Install the iterator when the Doc is created, but allow users to write to the noun_chunks_iterator attribute. The iterator functions accept an object and yield (int start, int end, int label) triples.

This commit is contained in:
Matthew Honnibal 2016-05-02 14:25:10 +02:00
parent e526be5602
commit 508fd1f6dc
7 changed files with 57 additions and 131 deletions

View File

@ -383,6 +383,9 @@ cdef class ArcEager(TransitionSystem):
if st._sent[i].head == 0 and st._sent[i].dep == 0:
st._sent[i].dep = self.root_label
def finalize_doc(self, doc):
    """Mark `doc` as parsed by setting its `is_parsed` flag to True.

    The parser calls this as `self.moves.finalize_doc(doc)` once it has
    finished with a document.
    """
    doc.is_parsed = True
cdef int set_valid(self, int* output, const StateC* st) nogil:
cdef bint[N_MOVES] is_valid
is_valid[SHIFT] = Shift.is_valid(st, -1)

View File

@ -1,19 +0,0 @@
from spacy.tokens.doc cimport Doc
cdef dict CHUNKERS
cdef class DocIterator:
cdef Doc _doc
cdef class EnglishNounChunks(DocIterator):
cdef int i
cdef int _np_label
cdef set _np_deps
cdef int _conjunct
cdef class GermanNounChunks(DocIterator):
cdef int i
cdef int _np_label
cdef set _np_deps
cdef int _close_app

View File

@ -1,55 +1,23 @@
from spacy.structs cimport TokenC
from spacy.tokens.span cimport Span
from spacy.tokens.doc cimport Doc
from spacy.tokens.token cimport Token
from spacy.parts_of_speech cimport NOUN
CHUNKERS = {'en':EnglishNounChunks, 'de':GermanNounChunks}
# base class for document iterators
cdef class DocIterator:
def __init__(self, Doc doc):
self._doc = doc
def __iter__(self):
return self
def __next__(self):
raise NotImplementedError
cdef class EnglishNounChunks(DocIterator):
def __init__(self, Doc doc):
super(EnglishNounChunks,self).__init__(doc)
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
self._np_label = self._doc.vocab.strings['NP']
self._np_deps = set( self._doc.vocab.strings[label] for label in labels )
self._conjunct = self._doc.vocab.strings['conj']
self.i = 0
def __iter__(self):
self.i = 0
return super(EnglishNounChunks,self).__iter__()
def __next__(self):
cdef const TokenC* word
cdef widx
while self.i < self._doc.length:
widx = self.i
self.i += 1
word = &self._doc.c[widx]
if word.pos == NOUN:
if word.dep in self._np_deps:
return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
elif word.dep == self._conjunct:
head = word+word.head
while head.dep == self._conjunct and head.head < 0:
head += head.head
def english_noun_chunks(doc):
    """Yield the base noun phrases of an English document.

    A token is treated as the head of a noun chunk when it is a NOUN and
    either carries one of the noun-phrase dependency labels directly, or is
    a conjunct whose coordination chain leads back (leftwards) to a head
    carrying one of those labels.

    Args:
        doc: The parsed document. It must be iterable over its tokens and
            expose `vocab.strings` for mapping label names to ids.

    Yields:
        (start, end, label) triples, where `start` is the index of the
        token's left edge, `end` is one past the token's own index, and
        `label` is the id of the string 'NP'.
    """
    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
              'attr', 'root']
    # A set, not a list: membership is tested for every noun in the doc.
    np_deps = set(doc.vocab.strings[label] for label in labels)
    conj = doc.vocab.strings['conj']
    np_label = doc.vocab.strings['NP']
    for word in doc:
        if word.pos != NOUN:
            continue
        if word.dep in np_deps:
            yield word.left_edge.i, word.i + 1, np_label
        elif word.dep == conj:
            # Climb the coordination chain while the head is itself a
            # conjunct attached to something on its left.
            head = word.head
            while head.dep == conj and head.head.i < head.i:
                head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            if head.dep in np_deps:
                yield word.left_edge.i, word.i + 1, np_label
# this iterator extracts spans headed by NOUNs starting from the left-most
@ -58,35 +26,21 @@ cdef class EnglishNounChunks(DocIterator):
# extended to the right of the NOUN
# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
# just "eine Tasse", same for "das Thema Familie"
cdef class GermanNounChunks(DocIterator):
def __init__(self, Doc doc):
super(GermanNounChunks,self).__init__(doc)
def german_noun_chunks(doc):
    """Yield the base noun phrases of a German document.

    A chunk is headed by a NOUN carrying one of the noun-phrase dependency
    labels. It starts at the head's left edge and is extended to the right
    over any NOUN dependents attached with the 'nk' label, so that close
    apposition / measurement constructions such as "eine Tasse Tee" come
    out as a single chunk.

    Args:
        doc: The parsed document. It must be iterable over its tokens and
            expose `vocab.strings` for mapping label names to ids.

    Yields:
        (start, end, label) triples, where `start` is the index of the
        head's left edge, `end` is one past the last token of the chunk,
        and `label` is the id of the string 'NP'.
    """
    labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
    np_label = doc.vocab.strings['NP']
    np_deps = set(doc.vocab.strings[label] for label in labels)
    close_app = doc.vocab.strings['nk']
    for word in doc:
        if word.pos == NOUN and word.dep in np_deps:
            rbracket = word.i + 1
            # try to extend the span to the right
            # to capture close apposition/measurement constructions
            for rdep in word.rights:
                if rdep.pos == NOUN and rdep.dep == close_app:
                    rbracket = rdep.i + 1
            # The start must be an integer token index, as in the English
            # chunker, hence `left_edge.i` rather than `l_edge`.
            yield word.left_edge.i, rbracket, np_label
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks}

View File

@ -47,8 +47,6 @@ from ._parse_features cimport fill_context
from .stateclass cimport StateClass
from ._state cimport StateC
from spacy.syntax.iterators cimport CHUNKERS, DocIterator, EnglishNounChunks, GermanNounChunks
DEBUG = False
def set_debug(val):
@ -116,7 +114,7 @@ cdef class Parser:
self.parseC(tokens.c, tokens.length, nr_feat, nr_class)
# Check for KeyboardInterrupt etc. Untested
PyErr_CheckSignals()
self._finalize(tokens)
self.moves.finalize_doc(tokens)
def pipe(self, stream, int batch_size=1000, int n_threads=2):
cdef Pool mem = Pool()
@ -142,7 +140,7 @@ cdef class Parser:
raise ValueError("Error parsing doc: %s" % sent_str)
PyErr_CheckSignals()
for doc in queue:
self._finalize(doc)
self.moves.finalize_doc(doc)
yield doc
queue = []
batch_size = len(queue)
@ -155,18 +153,9 @@ cdef class Parser:
raise ValueError("Error parsing doc: %s" % sent_str)
PyErr_CheckSignals()
for doc in queue:
self._finalize(doc)
self.moves.finalize_doc(doc)
yield doc
def _finalize(self, Doc doc):
# deprojectivize output
if self._projectivize:
PseudoProjectivity.deprojectivize(doc)
# set annotation-specific iterators
doc.noun_chunks = CHUNKERS.get(doc.vocab.lang,DocIterator)
# mark doc as parsed
doc.is_parsed = True
cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil:
cdef ExampleC eg
eg.nr_feat = nr_feat
@ -313,6 +302,7 @@ cdef class StepwiseState:
if self.stcls.is_final():
self.parser.moves.finalize_state(self.stcls.c)
self.doc.set_parse(self.stcls.c._sent)
self.parser.moves.finalize_doc(self.doc)
cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions,

View File

@ -53,6 +53,9 @@ cdef class TransitionSystem:
cdef int finalize_state(self, StateC* state) nogil:
pass
def finalize_doc(self, doc):
    """Hook for post-processing `doc`; the base implementation does nothing.

    Subclasses may override it: ArcEager's version sets `doc.is_parsed`.
    """
    pass
cdef int preprocess_gold(self, GoldParse gold) except -1:
raise NotImplementedError

View File

@ -3,7 +3,7 @@ import numpy as np
from spacy.attrs import HEAD, DEP
from spacy.symbols import nsubj, dobj, punct, amod, nmod, conj, cc, root
from spacy.en import English
from spacy.syntax.iterators import EnglishNounChunks
from spacy.syntax.iterators import english_noun_chunks
def test_not_nested():
@ -22,9 +22,7 @@ def test_not_nested():
[-2, conj],
[-5, dobj]
], dtype='int32'))
tokens.noun_chunks = EnglishNounChunks
for chunk in tokens.noun_chunks:
print(chunk.text)
tokens.noun_chunks_iterator = english_noun_chunks
word_occurred = {}
for chunk in tokens.noun_chunks:
for word in chunk:

View File

@ -23,6 +23,7 @@ from .span cimport Span
from .token cimport Token
from ..serialize.bits cimport BitArray
from ..util import normalize_slice
from ..syntax.iterators import CHUNKERS
DEF PADDING = 5
@ -81,7 +82,7 @@ cdef class Doc:
self.is_parsed = False
self._py_tokens = []
self._vector = None
self.noun_chunks_iterator = DocIterator(self)
self.noun_chunks_iterator = CHUNKERS.get(self.vocab.lang)
def __getitem__(self, object i):
"""Get a Token or a Span from the Doc.
@ -233,8 +234,8 @@ cdef class Doc:
self.c[start].ent_iob = 3
property noun_chunks:
def __get__(self):
@property
def noun_chunks(self):
"""Yield spans for base noun phrases."""
if not self.is_parsed:
raise ValueError(
@ -242,12 +243,8 @@ cdef class Doc:
"requires data to be installed. If you haven't done so, run: "
"\npython -m spacy.%s.download all\n"
"to install the data" % self.vocab.lang)
yield from self.noun_chunks_iterator
def __set__(self, DocIterator):
self.noun_chunks_iterator = DocIterator(self)
for start, end, label in self.noun_chunks_iterator(self):
yield Span(self, start, end, label=label)
@property
def sents(self):