mirror of https://github.com/explosion/spaCy.git, synced 2024-12-25 09:26:27 +03:00
add baseclass DocIterator for iterators over documents

add classes for English and German noun chunks; the respective iterators are set on the document when it is created by the parser, as they depend on the annotation scheme of the parsing model
This commit is contained in:
parent 03fb498dbe
commit 5e2e8e951a
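In practice, a parsed Doc now carries its own noun-chunk iterator, attached by the parser in _finalize, and doc.noun_chunks simply walks it. A usage sketch, assuming the English pipeline constructor of this spaCy era (not part of the diff):

    from spacy.en import English

    nlp = English()
    doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers.')
    for np in doc.noun_chunks:    # driven by the EnglishNounChunks iterator
        print(np.text)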
setup.py | 3
@@ -63,8 +63,7 @@ MOD_NAMES = [
     'spacy.matcher',
     'spacy.syntax.ner',
     'spacy.symbols',
-    'spacy.en.iterators',
-    'spacy.de.iterators']
+    'spacy.syntax.iterators']
 
 
 # By subclassing build_extensions we have the actual compiler that will be used
spacy/de/iterators.pyx | 28 (deleted)

@@ -1,28 +0,0 @@
-from spacy.structs cimport TokenC
-from spacy.tokens.span cimport Span
-
-from spacy.parts_of_speech cimport NOUN
-
-def noun_chunks(Span sent):
-    # this function extracts spans headed by NOUNs starting from the left-most
-    # syntactic dependent until the NOUN itself
-    # for close apposition and measurement construction, the span is sometimes
-    # extended to the right of the NOUN
-    # example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
-    # just "eine Tasse", same for "das Thema Familie"
-    cdef const TokenC* word
-    strings = sent.doc.vocab.strings
-    labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
-    close_app = strings['nk']
-    np_deps = [strings[label] for label in labels]
-    np_label = strings['NP']
-    for i in range(sent.start, sent.end):
-        word = &sent.doc.c[i]
-        if word.pos == NOUN and word.dep in np_deps:
-            rbracket = i+1
-            # try to extend the span to the right
-            # to capture close apposition/measurement constructions
-            for rdep in sent.doc[i].rights:
-                if rdep.pos == NOUN and rdep.dep == close_app:
-                    rbracket = rdep.i+1
-            yield Span(sent.doc, word.l_edge, rbracket, label=np_label)
spacy/en/iterators.pyx | 24 (deleted)

@@ -1,24 +0,0 @@
-from spacy.structs cimport TokenC
-from spacy.tokens.span cimport Span
-
-from spacy.parts_of_speech cimport NOUN
-
-def noun_chunks(Span sent):
-    cdef const TokenC* word
-    strings = sent.doc.vocab.strings
-    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
-    np_deps = [strings[label] for label in labels]
-    conj = strings['conj']
-    np_label = strings['NP']
-    for i in range(sent.start, sent.end):
-        word = &sent.doc.c[i]
-        if word.pos == NOUN and word.dep in np_deps:
-            yield Span(sent.doc, word.l_edge, i+1, label=np_label)
-        elif word.pos == NOUN and word.dep == conj:
-            head = word+word.head
-            while head.dep == conj and head.head < 0:
-                head += head.head
-            # If the head is an NP, and we're coordinated to it, we're an NP
-            if head.dep in np_deps:
-                yield Span(sent.doc, word.l_edge, i+1, label=np_label)
-
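The deleted modules implemented noun chunking as per-sentence generator functions. For reference, a rough pure-Python rendering of the English rule above, written against spaCy's public Token attributes (pos_, dep_, left_edge — names assumed by this sketch, not taken from the diff):

    NP_DEPS = {'nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root'}

    def english_noun_chunks(doc):
        for word in doc:
            if word.pos_ == 'NOUN' and word.dep_ in NP_DEPS:
                # span runs from the left-most syntactic dependent to the noun
                yield doc[word.left_edge.i : word.i + 1]
            elif word.pos_ == 'NOUN' and word.dep_ == 'conj':
                # climb the conjunct chain leftwards to find the phrase head
                head = word.head
                while head.dep_ == 'conj' and head.head.i < head.i:
                    head = head.head
                # if the head is an NP and we're coordinated to it, we're an NP too
                if head.dep_ in NP_DEPS:
                    yield doc[word.left_edge.i : word.i + 1]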
spacy/syntax/iterators.pxd | 17 (new file)
@@ -0,0 +1,17 @@
+from spacy.tokens.doc cimport Doc
+
+cdef class DocIterator:
+    cdef Doc _doc
+
+cdef class EnglishNounChunks(DocIterator):
+    cdef int i
+    cdef int _np_label
+    cdef set _np_deps
+    cdef int _conjunct    # string id for the 'conj' label, assigned in the .pyx
+
+cdef class GermanNounChunks(DocIterator):
+    cdef int i
+    cdef int _np_label
+    cdef set _np_deps
+    cdef int _close_app
+
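A .pxd like this is what makes the instance attributes legal at all: a cdef class stores its attributes in a fixed C struct, so every attribute the .pyx assigns (including EnglishNounChunks._conjunct) must be declared here, or compilation fails. A minimal illustration, not part of the commit:

    cdef class Example:
        cdef int declared            # reserves a slot in the C struct

        def __init__(self):
            self.declared = 1        # fine
            # self.undeclared = 2    # would not compile: not a member of Example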
spacy/syntax/iterators.pyx | 82 (new file)
@@ -0,0 +1,82 @@
+from spacy.structs cimport TokenC
+from spacy.tokens.span cimport Span
+from spacy.tokens.doc cimport Doc
+from spacy.tokens.token cimport Token
+
+from spacy.parts_of_speech cimport NOUN
+
+# base class for document iterators
+cdef class DocIterator:
+    def __init__(self, Doc doc):
+        self._doc = doc
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        raise NotImplementedError
+
+
+cdef class EnglishNounChunks(DocIterator):
+    def __init__(self, Doc doc):
+        super(EnglishNounChunks, self).__init__(doc)
+        labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
+        self._np_label = self._doc.vocab.strings['NP']
+        self._np_deps = set(self._doc.vocab.strings[label] for label in labels)
+        self._conjunct = self._doc.vocab.strings['conj']
+        self.i = 0
+
+    def __next__(self):
+        cdef const TokenC* word
+        cdef widx
+        while self.i < self._doc.length:
+            widx = self.i
+            self.i += 1
+            word = &self._doc.c[widx]
+            if word.pos == NOUN:
+                if word.dep in self._np_deps:
+                    return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
+                elif word.dep == self._conjunct:
+                    head = word + word.head
+                    while head.dep == self._conjunct and head.head < 0:
+                        head += head.head
+                    # If the head is an NP, and we're coordinated to it, we're an NP
+                    if head.dep in self._np_deps:
+                        return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
+        raise StopIteration
+
+
+# this iterator extracts spans headed by NOUNs starting from the left-most
+# syntactic dependent until the NOUN itself
+# for close apposition and measurement construction, the span is sometimes
+# extended to the right of the NOUN
+# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
+# just "eine Tasse", same for "das Thema Familie"
+cdef class GermanNounChunks(DocIterator):
+    def __init__(self, Doc doc):
+        super(GermanNounChunks, self).__init__(doc)
+        labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
+        self._np_label = self._doc.vocab.strings['NP']
+        self._np_deps = set(self._doc.vocab.strings[label] for label in labels)
+        self._close_app = self._doc.vocab.strings['nk']
+        self.i = 0
+
+    def __next__(self):
+        cdef const TokenC* word
+        cdef int rbracket
+        cdef Token rdep
+        cdef widx
+        while self.i < self._doc.length:
+            widx = self.i
+            self.i += 1
+            word = &self._doc.c[widx]
+            if word.pos == NOUN and word.dep in self._np_deps:
+                rbracket = widx+1
+                # try to extend the span to the right
+                # to capture close apposition/measurement constructions
+                for rdep in self._doc[widx].rights:
+                    if rdep.pos == NOUN and rdep.dep == self._close_app:
+                        rbracket = rdep.i+1
+                return Span(self._doc, word.l_edge, rbracket, label=self._np_label)
+        raise StopIteration
+
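The new classes can also be driven by hand, although the parser normally attaches them in _finalize. A short sketch, assuming a parsed doc from an English pipeline:

    from spacy.syntax.iterators import EnglishNounChunks

    chunks = EnglishNounChunks(doc)   # bind the iterator to a parsed Doc
    for np in chunks:                 # yields Span objects labelled 'NP'
        print(np.text)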
spacy/syntax/parser.pyx

@@ -47,6 +47,8 @@ from ._parse_features cimport fill_context
 from .stateclass cimport StateClass
 from ._state cimport StateC
 
+from spacy.syntax.iterators cimport DocIterator, EnglishNounChunks, GermanNounChunks
+CHUNKERS = {'en': EnglishNounChunks, 'de': GermanNounChunks}
 
 
 DEBUG = False
@@ -113,12 +115,9 @@ cdef class Parser:
         cdef int nr_feat = self.model.nr_feat
         with nogil:
             self.parseC(tokens.c, tokens.length, nr_feat, nr_class)
-        tokens.is_parsed = True
         # Check for KeyboardInterrupt etc. Untested
         PyErr_CheckSignals()
-        # projectivize output
-        if self._projectivize:
-            PseudoProjectivity.deprojectivize(tokens)
+        self._finalize(tokens)
 
     def pipe(self, stream, int batch_size=1000, int n_threads=2):
         cdef Pool mem = Pool()
@@ -144,7 +143,7 @@ cdef class Parser:
                     raise ValueError("Error parsing doc: %s" % sent_str)
                 PyErr_CheckSignals()
                 for doc in queue:
-                    doc.is_parsed = True
+                    self._finalize(doc)
                     yield doc
                 queue = []
         batch_size = len(queue)
@@ -155,10 +154,19 @@ cdef class Parser:
                 with gil:
                     sent_str = queue[i].text
                 raise ValueError("Error parsing doc: %s" % sent_str)
-        for doc in queue:
-            doc.is_parsed = True
-            yield doc
         PyErr_CheckSignals()
+        for doc in queue:
+            self._finalize(doc)
+            yield doc
+
+    def _finalize(self, Doc doc):
+        # deprojectivize output
+        if self._projectivize:
+            PseudoProjectivity.deprojectivize(doc)
+        # set annotation-specific iterators
+        doc.noun_chunks = CHUNKERS.get(doc.vocab.lang, DocIterator)
+        # mark doc as parsed
+        doc.is_parsed = True
 
     cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil:
         cdef ExampleC eg
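Note the changed failure mode: the old doc.noun_chunks warned and skipped sentences in unsupported languages, while CHUNKERS.get(doc.vocab.lang, DocIterator) now falls back to the base class, whose __next__ raises NotImplementedError. A sketch of the resulting behavior (the French pipeline here is hypothetical):

    doc = nlp_fr(u'Une phrase analysée.')   # hypothetical unsupported language
    try:
        list(doc.noun_chunks)
    except NotImplementedError:
        print('no noun chunk iterator registered for %s' % doc.vocab.lang)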
spacy/tokens/doc.pxd

@@ -7,6 +7,8 @@ from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport attr_t
 from ..attrs cimport attr_id_t
 
+from spacy.syntax.iterators cimport DocIterator
+
 
 cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
 
@@ -42,6 +44,8 @@ cdef class Doc:
     cdef int length
     cdef int max_length
 
+    cdef DocIterator noun_chunks_iterator
+
     cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1
 
     cpdef np.ndarray to_array(self, object features)
spacy/tokens/doc.pyx

@@ -81,6 +81,7 @@ cdef class Doc:
         self.is_parsed = False
         self._py_tokens = []
        self._vector = None
+        self.noun_chunks_iterator = DocIterator(self)
 
     def __getitem__(self, object i):
         """Get a Token or a Span from the Doc.
@@ -231,35 +232,21 @@
         # Set start as B
         self.c[start].ent_iob = 3
 
-    @property
-    def noun_chunks(self):
-        """Yield spans for base noun phrases."""
-        if not self.is_parsed:
-            raise ValueError(
-                "noun_chunks requires the dependency parse, which "
-                "requires data to be installed. If you haven't done so, run: "
-                "\npython -m spacy.en.download all\n"
-                "to install the data")
-
-        from spacy.en.iterators import noun_chunks as en_noun_chunks
-        from spacy.de.iterators import noun_chunks as de_noun_chunks
-
-        chunk_rules = {'en': en_noun_chunks,
-                       'de': de_noun_chunks,
-                      }
-
-        for sent in self.sents:
-            print(sent)
-            lang = sent.root.lang_
-            chunker = chunk_rules.get(lang, None)
-            if chunker == None:
-                warnings.warn("noun_chunks is not available for language %s." % lang)
-                print(sent.root.orth_)
-                continue
-
-            for chunk in chunker(sent):
-                yield chunk
-
-
+    property noun_chunks:
+        def __get__(self):
+            """Yield spans for base noun phrases."""
+            if not self.is_parsed:
+                raise ValueError(
+                    "noun_chunks requires the dependency parse, which "
+                    "requires data to be installed. If you haven't done so, run: "
+                    "\npython -m spacy.en.download all\n"
+                    "to install the data")
+
+            yield from self.noun_chunks_iterator
+
+        def __set__(self, DocIterator):
+            self.noun_chunks_iterator = DocIterator(self)
+
+
     @property
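Since __set__ receives an iterator class and instantiates it against the doc, a chunker for a new annotation scheme can be swapped in after parsing. SpanishNounChunks below is a hypothetical subclass, not part of the commit:

    from spacy.syntax.iterators import DocIterator

    class SpanishNounChunks(DocIterator):
        def __next__(self):
            raise StopIteration           # real chunk rules would go here

    doc.noun_chunks = SpanishNounChunks   # stored internally as SpanishNounChunks(doc)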
spacy/vocab.pyx

@@ -27,7 +27,7 @@ from . import symbols
 from cymem.cymem cimport Address
 from . import util
 from .serialize.packer cimport Packer
-from .attrs cimport PROB
+from .attrs cimport PROB, LANG
 
 try:
     import copy_reg
@@ -105,6 +105,13 @@ cdef class Vocab:
             self._serializer = Packer(self, self.serializer_freqs)
         return self._serializer
 
+    property lang:
+        def __get__(self):
+            langfunc = None
+            if self.get_lex_attr:
+                langfunc = self.get_lex_attr.get(LANG, None)
+            return langfunc('_') if langfunc else ''
+
     def __len__(self):
         """The current number of lexemes stored."""
         return self.length
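The new Vocab.lang property reads the language code off the LANG lexical-attribute getter, falling back to an empty string. A usage sketch, assuming an English pipeline:

    nlp = English()
    print(nlp.vocab.lang)    # 'en'; '' if no LANG getter is configured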