# Classes for English and German noun chunks. The respective iterator is set
# on the Doc when it is created by the parser, since noun chunking depends on
# the annotation scheme of the parsing model.

from spacy.structs cimport TokenC
from spacy.tokens.span cimport Span
from spacy.tokens.doc cimport Doc
from spacy.tokens.token cimport Token

from spacy.parts_of_speech cimport NOUN


# base class for document iterators
cdef class DocIterator:
    def __init__(self, Doc doc):
        self._doc = doc

    def __iter__(self):
        return self

    def __next__(self):
        # subclasses return the next noun chunk as a Span
        raise NotImplementedError


|
cdef class EnglishNounChunks(DocIterator):
|
|
def __init__(self, Doc doc):
|
|
super(EnglishNounChunks,self).__init__(doc)
|
|
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
|
|
self._np_label = self._doc.vocab.strings['NP']
|
|
self._np_deps = set( self._doc.vocab.strings[label] for label in labels )
|
|
self._conjunct = self._doc.vocab.strings['conj']
|
|
self.i = 0
|
|
|
|
def __next__(self):
|
|
cdef const TokenC* word
|
|
cdef widx
|
|
while self.i < self._doc.length:
|
|
widx = self.i
|
|
self.i += 1
|
|
word = &self._doc.c[widx]
|
|
if word.pos == NOUN:
|
|
if word.dep in self._np_deps:
|
|
return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
|
|
elif word.dep == self._conjunct:
|
|
head = word+word.head
|
|
while head.dep == self._conjunct and head.head < 0:
|
|
head += head.head
|
|
# If the head is an NP, and we're coordinated to it, we're an NP
|
|
if head.dep in self._np_deps:
|
|
return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
|
|
raise StopIteration
|
|
|
|
|
|
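# Usage sketch (an illustrative addition, not part of the original module):
# given an already parsed English Doc, the iterator protocol defined above can
# be driven to completion to collect all noun chunk Spans.
def english_noun_chunks(Doc doc):
    # list() calls __iter__/__next__ until StopIteration is raised
    return list(EnglishNounChunks(doc))

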
# This iterator extracts spans headed by NOUNs, starting from the left-most
# syntactic dependent up to the NOUN itself. For close apposition and
# measurement constructions, the span is sometimes extended to the right of
# the NOUN. Example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee"
# and not just "eine Tasse"; the same holds for "das Thema Familie".
cdef class GermanNounChunks(DocIterator):
    def __init__(self, Doc doc):
        super(GermanNounChunks, self).__init__(doc)
        # dependency labels under which a NOUN heads a noun chunk
        labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
        self._np_label = self._doc.vocab.strings['NP']
        self._np_deps = set(self._doc.vocab.strings[label] for label in labels)
        self._close_app = self._doc.vocab.strings['nk']
        self.i = 0

    def __next__(self):
        cdef const TokenC* word
        cdef int rbracket
        cdef Token rdep
        cdef int widx
        while self.i < self._doc.length:
            widx = self.i
            self.i += 1
            word = &self._doc.c[widx]
            if word.pos == NOUN and word.dep in self._np_deps:
                rbracket = widx+1
                # try to extend the span to the right
                # to capture close apposition/measurement constructions
                for rdep in self._doc[widx].rights:
                    if rdep.pos == NOUN and rdep.dep == self._close_app:
                        rbracket = rdep.i+1
                return Span(self._doc, word.l_edge, rbracket, label=self._np_label)
        raise StopIteration


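# Usage sketch (an illustrative addition, not part of the original module):
# exhausting the iterator over a parsed German Doc yields one Span per chunk;
# for a Doc containing "eine Tasse Tee", the span is expected to cover
# "eine Tasse Tee" rather than just "eine Tasse", as described above.
def german_noun_chunks(Doc doc):
    # list() calls __iter__/__next__ until StopIteration is raised
    return list(GermanNounChunks(doc))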