spaCy/spacy/syntax/iterators.pyx
Wolfgang Seeker 5e2e8e951a add baseclass DocIterator for iterators over documents
add classes for English and German noun chunks

the respective iterators are set for the document when created by the parser
as they depend on the annotation scheme of the parsing model
2016-03-16 15:53:35 +01:00

83 lines
3.1 KiB
Cython

from spacy.structs cimport TokenC
from spacy.tokens.span cimport Span
from spacy.tokens.doc cimport Doc
from spacy.tokens.token cimport Token
from spacy.parts_of_speech cimport NOUN
# base class for document iterators
cdef class DocIterator:
def __init__(self, Doc doc):
self._doc = doc
def __iter__(self):
return self
def __next__(self):
raise NotImplementedError
cdef class EnglishNounChunks(DocIterator):
def __init__(self, Doc doc):
super(EnglishNounChunks,self).__init__(doc)
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
self._np_label = self._doc.vocab.strings['NP']
self._np_deps = set( self._doc.vocab.strings[label] for label in labels )
self._conjunct = self._doc.vocab.strings['conj']
self.i = 0
def __next__(self):
cdef const TokenC* word
cdef widx
while self.i < self._doc.length:
widx = self.i
self.i += 1
word = &self._doc.c[widx]
if word.pos == NOUN:
if word.dep in self._np_deps:
return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
elif word.dep == self._conjunct:
head = word+word.head
while head.dep == self._conjunct and head.head < 0:
head += head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in self._np_deps:
return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
raise StopIteration
# this iterator extracts spans headed by NOUNs starting from the left-most
# syntactic dependent until the NOUN itself
# for close apposition and measurement construction, the span is sometimes
# extended to the right of the NOUN
# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
# just "eine Tasse", same for "das Thema Familie"
cdef class GermanNounChunks(DocIterator):
def __init__(self, Doc doc):
super(GermanNounChunks,self).__init__(doc)
labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
self._np_label = self._doc.vocab.strings['NP']
self._np_deps = set( self._doc.vocab.strings[label] for label in labels )
self._close_app = self._doc.vocab.strings['nk']
self.i = 0
def __next__(self):
cdef const TokenC* word
cdef int rbracket
cdef Token rdep
cdef widx
while self.i < self._doc.length:
widx = self.i
self.i += 1
word = &self._doc.c[widx]
if word.pos == NOUN and word.dep in self._np_deps:
rbracket = widx+1
# try to extend the span to the right
# to capture close apposition/measurement constructions
for rdep in self._doc[widx].rights:
if rdep.pos == NOUN and rdep.dep == self._close_app:
rbracket = rdep.i+1
return Span(self._doc, word.l_edge, rbracket, label=self._np_label)
raise StopIteration