# 2016-03-16 17:53:35 +03:00  (VCS timestamp artifact — commented out so the module compiles)
|
|
|
from spacy.structs cimport TokenC
|
|
|
|
from spacy.tokens.span cimport Span
|
|
|
|
from spacy.tokens.doc cimport Doc
|
|
|
|
from spacy.tokens.token cimport Token
|
|
|
|
|
|
|
|
from spacy.parts_of_speech cimport NOUN
|
|
|
|
|
# 2016-04-08 17:45:27 +03:00  (VCS timestamp artifact — commented out so the module compiles)
|
|
|
# Registry mapping ISO 639-1 language codes to their noun-chunk iterator class.
# NOTE(review): this assignment executes at import time, but EnglishNounChunks
# and GermanNounChunks are defined further down in this file — as positioned,
# this line raises NameError. It should be moved below both class definitions.
CHUNKERS = {'en':EnglishNounChunks, 'de':GermanNounChunks}
|
|
|
|
|
# 2016-03-16 17:53:35 +03:00  (VCS timestamp artifact — commented out so the module compiles)
|
|
|
# base class for document iterators
|
|
|
|
cdef class DocIterator:
    """Abstract base class for iterators over a Doc.

    Stores the Doc and implements the iterator protocol; subclasses
    supply __next__.  (The _doc attribute is presumably declared in the
    accompanying .pxd file — not visible in this chunk.)
    """

    def __init__(self, Doc doc):
        # Keep a reference to the document being iterated.
        self._doc = doc

    def __iter__(self):
        # An iterator is its own iterable.
        return self

    def __next__(self):
        # Concrete span-producing logic lives in subclasses.
        raise NotImplementedError
|
|
|
|
|
|
|
|
|
|
|
|
cdef class EnglishNounChunks(DocIterator):
    """Iterate over the base noun phrases ("noun chunks") of an English Doc.

    Each call to __next__ returns a Span labelled 'NP' that runs from the
    left edge of a NOUN token's syntactic subtree up to and including the
    noun itself.
    """

    def __init__(self, Doc doc):
        super(EnglishNounChunks,self).__init__(doc)
        # Dependency labels whose NOUN bearers count as chunk roots.
        labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
        # Intern label strings once so the hot loop compares integer IDs.
        self._np_label = self._doc.vocab.strings['NP']
        self._np_deps = set( self._doc.vocab.strings[label] for label in labels )
        self._conjunct = self._doc.vocab.strings['conj']
        # Index of the next token to examine.
        self.i = 0

    # 2016-04-15 18:49:16 +03:00  (VCS timestamp artifact — kept as comment)
    def __iter__(self):
        # Restart from the first token on every iter() call, so the same
        # object can be iterated more than once.
        self.i = 0
        return super(EnglishNounChunks,self).__iter__()

    # 2016-03-16 17:53:35 +03:00  (VCS timestamp artifact — kept as comment)
    def __next__(self):
        """Return the next noun-chunk Span, or raise StopIteration."""
        cdef const TokenC* word
        # NOTE(review): 'cdef widx' carries no C type — verify this is
        # intentional; 'cdef int widx' (as GermanNounChunks also lacks)
        # would avoid Python-object arithmetic in this loop.
        cdef widx
        while self.i < self._doc.length:
            widx = self.i
            self.i += 1
            word = &self._doc.c[widx]
            if word.pos == NOUN:
                # Case 1: the noun directly bears a chunk-root dependency.
                if word.dep in self._np_deps:
                    return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
                # Case 2: the noun is a conjunct — walk the 'conj' chain to
                # its ultimate head (TokenC.head is a *relative* offset, so
                # pointer addition lands on the head token).
                elif word.dep == self._conjunct:
                    head = word+word.head
                    while head.dep == self._conjunct and head.head < 0:
                        head += head.head
                    # If the head is an NP, and we're coordinated to it, we're an NP
                    if head.dep in self._np_deps:
                        return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
        raise StopIteration
|
|
|
|
|
|
|
|
|
|
|
|
# this iterator extracts spans headed by NOUNs starting from the left-most
|
|
|
|
# syntactic dependent until the NOUN itself
|
|
|
|
# for close apposition and measurement construction, the span is sometimes
|
|
|
|
# extended to the right of the NOUN
|
|
|
|
# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
|
|
|
|
# just "eine Tasse", same for "das Thema Familie"
|
|
|
|
cdef class GermanNounChunks(DocIterator):
    """Iterate over the base noun phrases ("noun chunks") of a German Doc.

    Spans run from the left edge of a NOUN's syntactic subtree to the noun
    itself, optionally extended rightwards over close-apposition /
    measurement constructions (e.g. "eine Tasse Tee" yields the full span,
    not just "eine Tasse").
    """

    def __init__(self, Doc doc):
        super(GermanNounChunks,self).__init__(doc)
        # TIGER-style dependency labels whose NOUN bearers count as chunk roots.
        labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
        # Intern label strings once so the hot loop compares integer IDs.
        self._np_label = self._doc.vocab.strings['NP']
        self._np_deps = set( self._doc.vocab.strings[label] for label in labels )
        # 'nk' (noun kernel) marks the close-apposition attachment.
        self._close_app = self._doc.vocab.strings['nk']
        # Index of the next token to examine.
        self.i = 0

    # 2016-04-15 18:49:16 +03:00  (VCS timestamp artifact — kept as comment)
    def __iter__(self):
        # Restart from the first token on every iter() call, so the same
        # object can be iterated more than once.
        self.i = 0
        return super(GermanNounChunks,self).__iter__()

    # 2016-03-16 17:53:35 +03:00  (VCS timestamp artifact — kept as comment)
    def __next__(self):
        """Return the next noun-chunk Span, or raise StopIteration."""
        cdef const TokenC* word
        cdef int rbracket
        cdef Token rdep
        # NOTE(review): 'cdef widx' carries no C type — verify this is
        # intentional; 'cdef int widx' would avoid Python-object arithmetic.
        cdef widx
        while self.i < self._doc.length:
            widx = self.i
            self.i += 1
            word = &self._doc.c[widx]
            if word.pos == NOUN and word.dep in self._np_deps:
                # Default right bracket: just past the noun itself.
                rbracket = widx+1
                # try to extend the span to the right
                # to capture close apposition/measurement constructions
                for rdep in self._doc[widx].rights:
                    if rdep.pos == NOUN and rdep.dep == self._close_app:
                        rbracket = rdep.i+1
                return Span(self._doc, word.l_edge, rbracket, label=self._np_label)
        raise StopIteration
|
|
|
|
|