* Refactor noun chunk iterators so that they're simple functions. Install the iterator when the Doc is created, but allow users to write to the noun_chunks_iterator attribute. The iterator functions accept an object and yield (int start, int end, int label) triples.

This commit is contained in:
Matthew Honnibal 2016-05-02 14:25:10 +02:00
parent e526be5602
commit 508fd1f6dc
7 changed files with 57 additions and 131 deletions

View File

@ -383,6 +383,9 @@ cdef class ArcEager(TransitionSystem):
if st._sent[i].head == 0 and st._sent[i].dep == 0: if st._sent[i].head == 0 and st._sent[i].dep == 0:
st._sent[i].dep = self.root_label st._sent[i].dep = self.root_label
# Per-document hook invoked by the parser once parsing is done
# (`self.moves.finalize_doc(...)`); flags the document as parsed.
def finalize_doc(self, doc):
doc.is_parsed = True
cdef int set_valid(self, int* output, const StateC* st) nogil: cdef int set_valid(self, int* output, const StateC* st) nogil:
cdef bint[N_MOVES] is_valid cdef bint[N_MOVES] is_valid
is_valid[SHIFT] = Shift.is_valid(st, -1) is_valid[SHIFT] = Shift.is_valid(st, -1)

View File

@ -1,19 +0,0 @@
from spacy.tokens.doc cimport Doc
cdef dict CHUNKERS
# Declaration of the base type for document iterators.
cdef class DocIterator:
# the document being iterated over (assigned in __init__)
cdef Doc _doc
# Declares the C-level fields of the English noun-chunk iterator.
cdef class EnglishNounChunks(DocIterator):
# index of the next token to examine
cdef int i
# string id of the 'NP' label attached to the produced spans
cdef int _np_label
# string ids of the dependency labels that let a noun head a chunk
cdef set _np_deps
# string id of the 'conj' dependency label
cdef int _conjunct
# Declares the C-level fields of the German noun-chunk iterator.
cdef class GermanNounChunks(DocIterator):
# index of the next token to examine
cdef int i
# string id of the 'NP' label attached to the produced spans
cdef int _np_label
# string ids of the dependency labels that let a noun head a chunk
cdef set _np_deps
# string id of the 'nk' dependency label (close apposition)
cdef int _close_app

View File

@ -1,55 +1,23 @@
from spacy.structs cimport TokenC
from spacy.tokens.span cimport Span
from spacy.tokens.doc cimport Doc
from spacy.tokens.token cimport Token
from spacy.parts_of_speech cimport NOUN from spacy.parts_of_speech cimport NOUN
CHUNKERS = {'en':EnglishNounChunks, 'de':GermanNounChunks}
def english_noun_chunks(doc):
    """Yield the base noun phrases of a parsed English document.

    Args:
        doc: The document to scan. It must provide ``vocab.strings`` for
            looking up label ids, and iterate over tokens exposing ``pos``,
            ``dep``, ``head``, ``i`` and ``left_edge``.

    Yields:
        ``(start, end, label)`` triples of ints. ``start`` is the index of
        the noun's left edge, ``end`` is one past the noun itself, and
        ``label`` is the string id of ``'NP'``.
    """
    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
              'attr', 'root']
    # A set keeps the per-token membership test constant-time and matches
    # how german_noun_chunks builds its label ids.
    np_deps = set(doc.vocab.strings[label] for label in labels)
    conj = doc.vocab.strings['conj']
    np_label = doc.vocab.strings['NP']
    for word in doc:
        if word.pos != NOUN:
            continue
        if word.dep in np_deps:
            yield word.left_edge.i, word.i + 1, np_label
        elif word.dep == conj:
            # Walk up the chain of conjuncts, but only through heads that
            # are themselves attached to something on their left.
            head = word.head
            while head.dep == conj and head.head.i < head.i:
                head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            if head.dep in np_deps:
                yield word.left_edge.i, word.i + 1, np_label
# this iterator extracts spans headed by NOUNs starting from the left-most # this iterator extracts spans headed by NOUNs starting from the left-most
@ -58,35 +26,21 @@ cdef class EnglishNounChunks(DocIterator):
# extended to the right of the NOUN # extended to the right of the NOUN
# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not # example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
# just "eine Tasse", same for "das Thema Familie" # just "eine Tasse", same for "das Thema Familie"
def german_noun_chunks(doc):
    """Yield the base noun phrases of a parsed German document.

    Args:
        doc: The document to scan. It must provide ``vocab.strings`` for
            looking up label ids, and iterate over tokens exposing ``pos``,
            ``dep``, ``i``, ``rights`` and ``left_edge``.

    Yields:
        ``(start, end, label)`` triples of ints. ``start`` is the index of
        the noun's left edge, ``end`` is one past the noun or past its last
        close-apposition noun to the right, and ``label`` is the string id
        of ``'NP'``.
    """
    labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
    np_label = doc.vocab.strings['NP']
    np_deps = set(doc.vocab.strings[label] for label in labels)
    close_app = doc.vocab.strings['nk']

    for word in doc:
        if word.pos == NOUN and word.dep in np_deps:
            rbracket = word.i + 1
            # try to extend the span to the right
            # to capture close apposition/measurement constructions
            for rdep in word.rights:
                if rdep.pos == NOUN and rdep.dep == close_app:
                    rbracket = rdep.i + 1
            # `l_edge` was a field of the C token struct used by the old
            # iterator class; on a token object the start index is reached
            # through `left_edge.i`, as english_noun_chunks does.
            yield word.left_edge.i, rbracket, np_label
# Registry mapping a language code to the function that yields that
# language's noun chunks; looked up via `CHUNKERS.get(vocab.lang)`.
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks}

View File

@ -47,8 +47,6 @@ from ._parse_features cimport fill_context
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ._state cimport StateC from ._state cimport StateC
from spacy.syntax.iterators cimport CHUNKERS, DocIterator, EnglishNounChunks, GermanNounChunks
DEBUG = False DEBUG = False
def set_debug(val): def set_debug(val):
@ -116,7 +114,7 @@ cdef class Parser:
self.parseC(tokens.c, tokens.length, nr_feat, nr_class) self.parseC(tokens.c, tokens.length, nr_feat, nr_class)
# Check for KeyboardInterrupt etc. Untested # Check for KeyboardInterrupt etc. Untested
PyErr_CheckSignals() PyErr_CheckSignals()
self._finalize(tokens) self.moves.finalize_doc(tokens)
def pipe(self, stream, int batch_size=1000, int n_threads=2): def pipe(self, stream, int batch_size=1000, int n_threads=2):
cdef Pool mem = Pool() cdef Pool mem = Pool()
@ -142,7 +140,7 @@ cdef class Parser:
raise ValueError("Error parsing doc: %s" % sent_str) raise ValueError("Error parsing doc: %s" % sent_str)
PyErr_CheckSignals() PyErr_CheckSignals()
for doc in queue: for doc in queue:
self._finalize(doc) self.moves.finalize_doc(doc)
yield doc yield doc
queue = [] queue = []
batch_size = len(queue) batch_size = len(queue)
@ -155,18 +153,9 @@ cdef class Parser:
raise ValueError("Error parsing doc: %s" % sent_str) raise ValueError("Error parsing doc: %s" % sent_str)
PyErr_CheckSignals() PyErr_CheckSignals()
for doc in queue: for doc in queue:
self._finalize(doc) self.moves.finalize_doc(doc)
yield doc yield doc
# Post-parse hook applied to each document: optionally deprojectivizes it,
# installs the noun-chunk iterator registered for the vocab's language
# (falling back to DocIterator when none is registered), and flags the
# document as parsed.
def _finalize(self, Doc doc):
# deprojectivize output
if self._projectivize:
PseudoProjectivity.deprojectivize(doc)
# set annotation-specific iterators
doc.noun_chunks = CHUNKERS.get(doc.vocab.lang,DocIterator)
# mark doc as parsed
doc.is_parsed = True
cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil: cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil:
cdef ExampleC eg cdef ExampleC eg
eg.nr_feat = nr_feat eg.nr_feat = nr_feat
@ -313,6 +302,7 @@ cdef class StepwiseState:
if self.stcls.is_final(): if self.stcls.is_final():
self.parser.moves.finalize_state(self.stcls.c) self.parser.moves.finalize_state(self.stcls.c)
self.doc.set_parse(self.stcls.c._sent) self.doc.set_parse(self.stcls.c._sent)
self.parser.moves.finalize_doc(self.doc)
cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions, cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions,

View File

@ -53,6 +53,9 @@ cdef class TransitionSystem:
# Per-state finalisation hook; this base implementation does nothing.
cdef int finalize_state(self, StateC* state) nogil: cdef int finalize_state(self, StateC* state) nogil:
pass pass
# Per-document finalisation hook called by the parser after parsing;
# this base implementation does nothing.
def finalize_doc(self, doc):
pass
cdef int preprocess_gold(self, GoldParse gold) except -1: cdef int preprocess_gold(self, GoldParse gold) except -1:
raise NotImplementedError raise NotImplementedError

View File

@ -3,7 +3,7 @@ import numpy as np
from spacy.attrs import HEAD, DEP from spacy.attrs import HEAD, DEP
from spacy.symbols import nsubj, dobj, punct, amod, nmod, conj, cc, root from spacy.symbols import nsubj, dobj, punct, amod, nmod, conj, cc, root
from spacy.en import English from spacy.en import English
from spacy.syntax.iterators import EnglishNounChunks from spacy.syntax.iterators import english_noun_chunks
def test_not_nested(): def test_not_nested():
@ -22,9 +22,7 @@ def test_not_nested():
[-2, conj], [-2, conj],
[-5, dobj] [-5, dobj]
], dtype='int32')) ], dtype='int32'))
tokens.noun_chunks = EnglishNounChunks tokens.noun_chunks_iterator = english_noun_chunks
for chunk in tokens.noun_chunks:
print(chunk.text)
word_occurred = {} word_occurred = {}
for chunk in tokens.noun_chunks: for chunk in tokens.noun_chunks:
for word in chunk: for word in chunk:

View File

@ -23,6 +23,7 @@ from .span cimport Span
from .token cimport Token from .token cimport Token
from ..serialize.bits cimport BitArray from ..serialize.bits cimport BitArray
from ..util import normalize_slice from ..util import normalize_slice
from ..syntax.iterators import CHUNKERS
DEF PADDING = 5 DEF PADDING = 5
@ -81,7 +82,7 @@ cdef class Doc:
self.is_parsed = False self.is_parsed = False
self._py_tokens = [] self._py_tokens = []
self._vector = None self._vector = None
self.noun_chunks_iterator = DocIterator(self) self.noun_chunks_iterator = CHUNKERS.get(self.vocab.lang)
def __getitem__(self, object i): def __getitem__(self, object i):
"""Get a Token or a Span from the Doc. """Get a Token or a Span from the Doc.
@ -233,8 +234,8 @@ cdef class Doc:
self.c[start].ent_iob = 3 self.c[start].ent_iob = 3
property noun_chunks: @property
def __get__(self): def noun_chunks(self):
"""Yield spans for base noun phrases.""" """Yield spans for base noun phrases."""
if not self.is_parsed: if not self.is_parsed:
raise ValueError( raise ValueError(
@ -242,12 +243,8 @@ cdef class Doc:
"requires data to be installed. If you haven't done so, run: " "requires data to be installed. If you haven't done so, run: "
"\npython -m spacy.%s.download all\n" "\npython -m spacy.%s.download all\n"
"to install the data" % self.vocab.lang) "to install the data" % self.vocab.lang)
for start, end, label in self.noun_chunks_iterator(self):
yield from self.noun_chunks_iterator yield Span(self, start, end, label=label)
def __set__(self, DocIterator):
self.noun_chunks_iterator = DocIterator(self)
@property @property
def sents(self): def sents(self):