add base class DocIterator for iterators over documents

add classes for English and German noun chunks

the respective iterator is set on the document by the parser when the document
is parsed, since the choice of iterator depends on the annotation scheme of the
parsing model
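
In practice this moves noun chunk extraction from a per-sentence lookup inside
Doc.noun_chunks to an iterator object attached to each parsed document. A
minimal usage sketch, assuming the 2016-era spacy.en entry point (model loading
details are assumptions, not part of this commit):

    from spacy.en import English

    nlp = English()
    doc = nlp(u'The quick brown fox jumped over the lazy dog.')
    # the parser has installed EnglishNounChunks on the doc in _finalize(),
    # so noun_chunks now yields Span objects for that annotation scheme
    for np in doc.noun_chunks:
        print(np)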
Wolfgang Seeker 2016-03-16 15:53:35 +01:00
parent 03fb498dbe
commit 5e2e8e951a
11 changed files with 140 additions and 89 deletions

View File: setup.py

@@ -63,8 +63,7 @@ MOD_NAMES = [
     'spacy.matcher',
     'spacy.syntax.ner',
     'spacy.symbols',
-    'spacy.en.iterators',
-    'spacy.de.iterators']
+    'spacy.syntax.iterators']

 # By subclassing build_extensions we have the actual compiler that will be used

View File: spacy/de/iterators.pxd (deleted, empty)

View File: spacy/de/iterators.pyx (deleted)

@@ -1,28 +0,0 @@
-from spacy.structs cimport TokenC
-from spacy.tokens.span cimport Span
-
-from spacy.parts_of_speech cimport NOUN
-
-def noun_chunks(Span sent):
-    # this function extracts spans headed by NOUNs starting from the left-most
-    # syntactic dependent until the NOUN itself
-    # for close apposition and measurement construction, the span is sometimes
-    # extended to the right of the NOUN
-    # example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
-    # just "eine Tasse", same for "das Thema Familie"
-    cdef const TokenC* word
-    strings = sent.doc.vocab.strings
-    labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
-    close_app = strings['nk']
-    np_deps = [strings[label] for label in labels]
-    np_label = strings['NP']
-    for i in range(sent.start, sent.end):
-        word = &sent.doc.c[i]
-        if word.pos == NOUN and word.dep in np_deps:
-            rbracket = i+1
-            # try to extend the span to the right
-            # to capture close apposition/measurement constructions
-            for rdep in sent.doc[i].rights:
-                if rdep.pos == NOUN and rdep.dep == close_app:
-                    rbracket = rdep.i+1
-            yield Span(sent.doc, word.l_edge, rbracket, label=np_label)

View File: spacy/en/iterators.pxd (deleted, empty)

View File: spacy/en/iterators.pyx (deleted)

@@ -1,24 +0,0 @@
-from spacy.structs cimport TokenC
-from spacy.tokens.span cimport Span
-
-from spacy.parts_of_speech cimport NOUN
-
-def noun_chunks(Span sent):
-    cdef const TokenC* word
-    strings = sent.doc.vocab.strings
-    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
-    np_deps = [strings[label] for label in labels]
-    conj = strings['conj']
-    np_label = strings['NP']
-
-    for i in range(sent.start, sent.end):
-        word = &sent.doc.c[i]
-        if word.pos == NOUN and word.dep in np_deps:
-            yield Span(sent.doc, word.l_edge, i+1, label=np_label)
-        elif word.pos == NOUN and word.dep == conj:
-            head = word+word.head
-            while head.dep == conj and head.head < 0:
-                head += head.head
-            # If the head is an NP, and we're coordinated to it, we're an NP
-            if head.dep in np_deps:
-                yield Span(sent.doc, word.l_edge, i+1, label=np_label)

View File: spacy/syntax/iterators.pxd (new)

@@ -0,0 +1,16 @@
+from spacy.tokens.doc cimport Doc
+
+cdef class DocIterator:
+    cdef Doc _doc
+
+cdef class EnglishNounChunks(DocIterator):
+    cdef int i
+    cdef int _np_label
+    cdef set _np_deps
+    cdef int _conjunct
+
+cdef class GermanNounChunks(DocIterator):
+    cdef int i
+    cdef int _np_label
+    cdef set _np_deps
+    cdef int _close_app

View File: spacy/syntax/iterators.pyx (new)

@@ -0,0 +1,82 @@
+from spacy.structs cimport TokenC
+from spacy.tokens.span cimport Span
+from spacy.tokens.doc cimport Doc
+from spacy.tokens.token cimport Token
+
+from spacy.parts_of_speech cimport NOUN
+
+# base class for document iterators
+cdef class DocIterator:
+    def __init__(self, Doc doc):
+        self._doc = doc
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        raise NotImplementedError
+
+
+cdef class EnglishNounChunks(DocIterator):
+    def __init__(self, Doc doc):
+        super(EnglishNounChunks, self).__init__(doc)
+        labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
+        self._np_label = self._doc.vocab.strings['NP']
+        self._np_deps = set(self._doc.vocab.strings[label] for label in labels)
+        self._conjunct = self._doc.vocab.strings['conj']
+        self.i = 0
+
+    def __next__(self):
+        cdef const TokenC* word
+        cdef int widx
+        while self.i < self._doc.length:
+            widx = self.i
+            self.i += 1
+            word = &self._doc.c[widx]
+            if word.pos == NOUN:
+                if word.dep in self._np_deps:
+                    return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
+                elif word.dep == self._conjunct:
+                    head = word + word.head
+                    while head.dep == self._conjunct and head.head < 0:
+                        head += head.head
+                    # if the head is an NP, and we're coordinated to it, we're an NP
+                    if head.dep in self._np_deps:
+                        return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
+        raise StopIteration
+
+
+# this iterator extracts spans headed by NOUNs starting from the left-most
+# syntactic dependent until the NOUN itself
+# for close apposition and measurement constructions, the span is sometimes
+# extended to the right of the NOUN
+# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
+# just "eine Tasse", same for "das Thema Familie"
+cdef class GermanNounChunks(DocIterator):
+    def __init__(self, Doc doc):
+        super(GermanNounChunks, self).__init__(doc)
+        labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
+        self._np_label = self._doc.vocab.strings['NP']
+        self._np_deps = set(self._doc.vocab.strings[label] for label in labels)
+        self._close_app = self._doc.vocab.strings['nk']
+        self.i = 0
+
+    def __next__(self):
+        cdef const TokenC* word
+        cdef int rbracket
+        cdef Token rdep
+        cdef int widx
+        while self.i < self._doc.length:
+            widx = self.i
+            self.i += 1
+            word = &self._doc.c[widx]
+            if word.pos == NOUN and word.dep in self._np_deps:
+                rbracket = widx+1
+                # try to extend the span to the right
+                # to capture close apposition/measurement constructions
+                for rdep in self._doc[widx].rights:
+                    if rdep.pos == NOUN and rdep.dep == self._close_app:
+                        rbracket = rdep.i+1
+                return Span(self._doc, word.l_edge, rbracket, label=self._np_label)
        raise StopIteration
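
The relative-offset arithmetic in EnglishNounChunks.__next__ (head = word + word.head,
then head += head.head) is easy to misread. Below is a pure-Python sketch of the
same conjunct-chain walk, with (pos, dep, head_offset) tuples standing in for
TokenC structs; the toy sentence and label set are illustrative only:

    NOUN = 'NOUN'

    def is_coordinated_np(tokens, i, np_deps=('nsubj', 'dobj')):
        # tokens[i] = (pos, dep, head_offset); offsets are relative, like
        # TokenC.head, so the head of token i lives at index i + offset
        pos, dep, offset = tokens[i]
        if pos != NOUN or dep != 'conj':
            return False
        j = i + offset
        # walk left through the chain of conjuncts, as in the Cython loop
        while tokens[j][1] == 'conj' and tokens[j][2] < 0:
            j += tokens[j][2]
        # if the chain bottoms out at an NP dependency, token i is an NP too
        return tokens[j][1] in np_deps

    # "dogs and cats chase mice": 'cats' is a conjunct of 'dogs' (nsubj)
    toks = [('NOUN', 'nsubj', 3),   # dogs  -> chase
            ('CCONJ', 'cc', -1),    # and   -> dogs
            ('NOUN', 'conj', -2),   # cats  -> dogs
            ('VERB', 'root', 0),    # chase
            ('NOUN', 'dobj', -1)]   # mice  -> chase
    assert is_coordinated_np(toks, 2)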

View File: spacy/syntax/parser.pyx

@@ -47,6 +47,8 @@ from ._parse_features cimport fill_context
 from .stateclass cimport StateClass
 from ._state cimport StateC
+from spacy.syntax.iterators cimport DocIterator, EnglishNounChunks, GermanNounChunks
+CHUNKERS = {'en': EnglishNounChunks, 'de': GermanNounChunks}

 DEBUG = False
@@ -113,12 +115,9 @@ cdef class Parser:
         cdef int nr_feat = self.model.nr_feat
         with nogil:
             self.parseC(tokens.c, tokens.length, nr_feat, nr_class)
-        tokens.is_parsed = True
         # Check for KeyboardInterrupt etc. Untested
         PyErr_CheckSignals()
-        # projectivize output
-        if self._projectivize:
-            PseudoProjectivity.deprojectivize(tokens)
+        self._finalize(tokens)

     def pipe(self, stream, int batch_size=1000, int n_threads=2):
         cdef Pool mem = Pool()
@@ -144,7 +143,7 @@ cdef class Parser:
                 raise ValueError("Error parsing doc: %s" % sent_str)
             PyErr_CheckSignals()
             for doc in queue:
-                doc.is_parsed = True
+                self._finalize(doc)
                 yield doc
             queue = []
         batch_size = len(queue)
@@ -155,10 +154,19 @@ cdef class Parser:
                 with gil:
                     sent_str = queue[i].text
                 raise ValueError("Error parsing doc: %s" % sent_str)
-        for doc in queue:
-            doc.is_parsed = True
-            yield doc
         PyErr_CheckSignals()
+        for doc in queue:
+            self._finalize(doc)
+            yield doc
+
+    def _finalize(self, Doc doc):
+        # deprojectivize output
+        if self._projectivize:
+            PseudoProjectivity.deprojectivize(doc)
+        # set annotation-specific iterators
+        doc.noun_chunks = CHUNKERS.get(doc.vocab.lang, DocIterator)
+        # mark doc as parsed
+        doc.is_parsed = True

     cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil:
         cdef ExampleC eg
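
One consequence of the dispatch in _finalize is worth noting: a language without
an entry in CHUNKERS falls back to the bare DocIterator, whose __next__ raises
NotImplementedError, so iterating noun_chunks on such a document fails loudly
rather than silently yielding nothing. A self-contained toy of the
lookup/fallback pattern (the classes here are stand-ins for the Cython ones):

    class DocIterator(object):
        def __init__(self, doc):
            self._doc = doc
        def __iter__(self):
            return self
        def __next__(self):
            raise NotImplementedError

    class EnglishNounChunks(DocIterator):
        pass   # stand-in; the real class walks the dependency parse

    CHUNKERS = {'en': EnglishNounChunks}

    for lang in ('en', 'fr'):
        cls = CHUNKERS.get(lang, DocIterator)
        print(lang, '->', cls.__name__)   # en -> EnglishNounChunks, fr -> DocIterator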

View File: spacy/tokens/doc.pxd

@@ -7,6 +7,8 @@ from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport attr_t
 from ..attrs cimport attr_id_t
+from spacy.syntax.iterators cimport DocIterator
+

 cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
@@ -42,6 +44,8 @@ cdef class Doc:
     cdef int length
     cdef int max_length
+
+    cdef DocIterator noun_chunks_iterator

     cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1

     cpdef np.ndarray to_array(self, object features)

View File: spacy/tokens/doc.pyx

@@ -81,6 +81,7 @@ cdef class Doc:
         self.is_parsed = False
         self._py_tokens = []
         self._vector = None
+        self.noun_chunks_iterator = DocIterator(self)

     def __getitem__(self, object i):
         """Get a Token or a Span from the Doc.
@@ -231,8 +232,9 @@ cdef class Doc:
         # Set start as B
         self.c[start].ent_iob = 3

-    @property
-    def noun_chunks(self):
+    property noun_chunks:
+        def __get__(self):
             """Yield spans for base noun phrases."""
             if not self.is_parsed:
                 raise ValueError(
@@ -241,25 +243,10 @@ cdef class Doc:
                     "\npython -m spacy.en.download all\n"
                     "to install the data")
-        from spacy.en.iterators import noun_chunks as en_noun_chunks
-        from spacy.de.iterators import noun_chunks as de_noun_chunks
-        chunk_rules = {'en': en_noun_chunks,
-                       'de': de_noun_chunks,
-                      }
-        for sent in self.sents:
-            print(sent)
-            lang = sent.root.lang_
-            chunker = chunk_rules.get(lang, None)
-            if chunker == None:
-                warnings.warn("noun_chunks is not available for language %s." % lang)
-                print(sent.root.orth_)
-                continue
-            for chunk in chunker(sent):
-                yield chunk
+            yield from self.noun_chunks_iterator
+
+        def __set__(self, DocIterator):
+            self.noun_chunks_iterator = DocIterator(self)

     @property
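
Note the calling convention this creates: doc.noun_chunks is assigned an
iterator class, not an instance, and the setter binds it to the document by
instantiating it. A minimal pure-Python analogue of the pattern (names are
schematic, not the spaCy API):

    class Doc(object):
        def __init__(self):
            self.noun_chunks_iterator = None

        @property
        def noun_chunks(self):
            # delegate iteration to whichever iterator the parser installed
            for chunk in self.noun_chunks_iterator:
                yield chunk

        @noun_chunks.setter
        def noun_chunks(self, iterator_cls):
            # the parser passes a class; it is bound to this document here
            self.noun_chunks_iterator = iterator_cls(self)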

View File: spacy/vocab.pyx

@@ -27,7 +27,7 @@ from . import symbols
 from cymem.cymem cimport Address
 from . import util
 from .serialize.packer cimport Packer
-from .attrs cimport PROB
+from .attrs cimport PROB, LANG

 try:
     import copy_reg
@@ -105,6 +105,13 @@ cdef class Vocab:
             self._serializer = Packer(self, self.serializer_freqs)
         return self._serializer

+    property lang:
+        def __get__(self):
+            langfunc = None
+            if self.get_lex_attr:
+                langfunc = self.get_lex_attr.get(LANG, None)
+            return langfunc('_') if langfunc else ''
+
     def __len__(self):
         """The current number of lexemes stored."""
         return self.length
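
The new Vocab.lang property recovers the language code by calling the LANG
lexical-attribute getter on a dummy string, since that getter returns the same
code for any input. A hedged pure-Python sketch of the mechanism (the table
below stands in for get_lex_attr):

    LANG = 'lang'                                # stands in for spacy.attrs.LANG
    get_lex_attr = {LANG: lambda string: 'en'}   # each language supplies its getter

    langfunc = get_lex_attr.get(LANG, None) if get_lex_attr else None
    print(langfunc('_') if langfunc else '')     # -> 'en'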