mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
* Refactor noun chunk iterators, so that they're simple functions. Install the iterator when the Doc is created, but allow users to write to the noun_chunk_iterator attribute. The iterator functions accept an object and yield (int start, int end, int label) triples.
This commit is contained in:
parent
e526be5602
commit
508fd1f6dc
|
@ -383,6 +383,9 @@ cdef class ArcEager(TransitionSystem):
|
||||||
if st._sent[i].head == 0 and st._sent[i].dep == 0:
|
if st._sent[i].head == 0 and st._sent[i].dep == 0:
|
||||||
st._sent[i].dep = self.root_label
|
st._sent[i].dep = self.root_label
|
||||||
|
|
||||||
|
def finalize_doc(self, doc):
|
||||||
|
doc.is_parsed = True
|
||||||
|
|
||||||
cdef int set_valid(self, int* output, const StateC* st) nogil:
|
cdef int set_valid(self, int* output, const StateC* st) nogil:
|
||||||
cdef bint[N_MOVES] is_valid
|
cdef bint[N_MOVES] is_valid
|
||||||
is_valid[SHIFT] = Shift.is_valid(st, -1)
|
is_valid[SHIFT] = Shift.is_valid(st, -1)
|
||||||
|
|
|
@ -1,19 +0,0 @@
|
||||||
|
|
||||||
from spacy.tokens.doc cimport Doc
|
|
||||||
|
|
||||||
cdef dict CHUNKERS
|
|
||||||
|
|
||||||
cdef class DocIterator:
|
|
||||||
cdef Doc _doc
|
|
||||||
|
|
||||||
cdef class EnglishNounChunks(DocIterator):
|
|
||||||
cdef int i
|
|
||||||
cdef int _np_label
|
|
||||||
cdef set _np_deps
|
|
||||||
cdef int _conjunct
|
|
||||||
|
|
||||||
cdef class GermanNounChunks(DocIterator):
|
|
||||||
cdef int i
|
|
||||||
cdef int _np_label
|
|
||||||
cdef set _np_deps
|
|
||||||
cdef int _close_app
|
|
|
@ -1,55 +1,23 @@
|
||||||
from spacy.structs cimport TokenC
|
|
||||||
from spacy.tokens.span cimport Span
|
|
||||||
from spacy.tokens.doc cimport Doc
|
|
||||||
from spacy.tokens.token cimport Token
|
|
||||||
|
|
||||||
from spacy.parts_of_speech cimport NOUN
|
from spacy.parts_of_speech cimport NOUN
|
||||||
|
|
||||||
CHUNKERS = {'en':EnglishNounChunks, 'de':GermanNounChunks}
|
|
||||||
|
|
||||||
# base class for document iterators
|
def english_noun_chunks(doc):
|
||||||
cdef class DocIterator:
|
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
|
||||||
def __init__(self, Doc doc):
|
'attr', 'root']
|
||||||
self._doc = doc
|
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||||
|
conj = doc.vocab.strings['conj']
|
||||||
def __iter__(self):
|
np_label = doc.vocab.strings['NP']
|
||||||
return self
|
for i in range(len(doc)):
|
||||||
|
word = doc[i]
|
||||||
def __next__(self):
|
if word.pos == NOUN and word.dep in np_deps:
|
||||||
raise NotImplementedError
|
yield word.left_edge.i, word.i+1, np_label
|
||||||
|
elif word.pos == NOUN and word.dep == conj:
|
||||||
|
head = word.head
|
||||||
cdef class EnglishNounChunks(DocIterator):
|
while head.dep == conj and head.head.i < head.i:
|
||||||
def __init__(self, Doc doc):
|
head = head.head
|
||||||
super(EnglishNounChunks,self).__init__(doc)
|
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||||
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
|
if head.dep in np_deps:
|
||||||
self._np_label = self._doc.vocab.strings['NP']
|
yield word.left_edge.i, word.i+1, np_label
|
||||||
self._np_deps = set( self._doc.vocab.strings[label] for label in labels )
|
|
||||||
self._conjunct = self._doc.vocab.strings['conj']
|
|
||||||
self.i = 0
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
self.i = 0
|
|
||||||
return super(EnglishNounChunks,self).__iter__()
|
|
||||||
|
|
||||||
def __next__(self):
|
|
||||||
cdef const TokenC* word
|
|
||||||
cdef widx
|
|
||||||
while self.i < self._doc.length:
|
|
||||||
widx = self.i
|
|
||||||
self.i += 1
|
|
||||||
word = &self._doc.c[widx]
|
|
||||||
if word.pos == NOUN:
|
|
||||||
if word.dep in self._np_deps:
|
|
||||||
return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
|
|
||||||
elif word.dep == self._conjunct:
|
|
||||||
head = word+word.head
|
|
||||||
while head.dep == self._conjunct and head.head < 0:
|
|
||||||
head += head.head
|
|
||||||
# If the head is an NP, and we're coordinated to it, we're an NP
|
|
||||||
if head.dep in self._np_deps:
|
|
||||||
return Span(self._doc, word.l_edge, widx+1, label=self._np_label)
|
|
||||||
raise StopIteration
|
|
||||||
|
|
||||||
|
|
||||||
# this iterator extracts spans headed by NOUNs starting from the left-most
|
# this iterator extracts spans headed by NOUNs starting from the left-most
|
||||||
|
@ -58,35 +26,21 @@ cdef class EnglishNounChunks(DocIterator):
|
||||||
# extended to the right of the NOUN
|
# extended to the right of the NOUN
|
||||||
# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
|
# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
|
||||||
# just "eine Tasse", same for "das Thema Familie"
|
# just "eine Tasse", same for "das Thema Familie"
|
||||||
cdef class GermanNounChunks(DocIterator):
|
def german_noun_chunks(doc):
|
||||||
def __init__(self, Doc doc):
|
labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
|
||||||
super(GermanNounChunks,self).__init__(doc)
|
np_label = doc.vocab.strings['NP']
|
||||||
labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
|
np_deps = set(doc.vocab.strings[label] for label in labels)
|
||||||
self._np_label = self._doc.vocab.strings['NP']
|
close_app = doc.vocab.strings['nk']
|
||||||
self._np_deps = set( self._doc.vocab.strings[label] for label in labels )
|
|
||||||
self._close_app = self._doc.vocab.strings['nk']
|
|
||||||
self.i = 0
|
|
||||||
|
|
||||||
def __iter__(self):
|
for word in doc:
|
||||||
self.i = 0
|
if word.pos == NOUN and word.dep in np_deps:
|
||||||
return super(GermanNounChunks,self).__iter__()
|
rbracket = word.i+1
|
||||||
|
# try to extend the span to the right
|
||||||
|
# to capture close apposition/measurement constructions
|
||||||
|
for rdep in doc[word.i].rights:
|
||||||
|
if rdep.pos == NOUN and rdep.dep == close_app:
|
||||||
|
rbracket = rdep.i+1
|
||||||
|
yield word.l_edge, rbracket, np_label
|
||||||
|
|
||||||
def __next__(self):
|
|
||||||
cdef const TokenC* word
|
|
||||||
cdef int rbracket
|
|
||||||
cdef Token rdep
|
|
||||||
cdef widx
|
|
||||||
while self.i < self._doc.length:
|
|
||||||
widx = self.i
|
|
||||||
self.i += 1
|
|
||||||
word = &self._doc.c[widx]
|
|
||||||
if word.pos == NOUN and word.dep in self._np_deps:
|
|
||||||
rbracket = widx+1
|
|
||||||
# try to extend the span to the right
|
|
||||||
# to capture close apposition/measurement constructions
|
|
||||||
for rdep in self._doc[widx].rights:
|
|
||||||
if rdep.pos == NOUN and rdep.dep == self._close_app:
|
|
||||||
rbracket = rdep.i+1
|
|
||||||
return Span(self._doc, word.l_edge, rbracket, label=self._np_label)
|
|
||||||
raise StopIteration
|
|
||||||
|
|
||||||
|
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks}
|
||||||
|
|
|
@ -47,8 +47,6 @@ from ._parse_features cimport fill_context
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ._state cimport StateC
|
from ._state cimport StateC
|
||||||
|
|
||||||
from spacy.syntax.iterators cimport CHUNKERS, DocIterator, EnglishNounChunks, GermanNounChunks
|
|
||||||
|
|
||||||
|
|
||||||
DEBUG = False
|
DEBUG = False
|
||||||
def set_debug(val):
|
def set_debug(val):
|
||||||
|
@ -116,7 +114,7 @@ cdef class Parser:
|
||||||
self.parseC(tokens.c, tokens.length, nr_feat, nr_class)
|
self.parseC(tokens.c, tokens.length, nr_feat, nr_class)
|
||||||
# Check for KeyboardInterrupt etc. Untested
|
# Check for KeyboardInterrupt etc. Untested
|
||||||
PyErr_CheckSignals()
|
PyErr_CheckSignals()
|
||||||
self._finalize(tokens)
|
self.moves.finalize_doc(tokens)
|
||||||
|
|
||||||
def pipe(self, stream, int batch_size=1000, int n_threads=2):
|
def pipe(self, stream, int batch_size=1000, int n_threads=2):
|
||||||
cdef Pool mem = Pool()
|
cdef Pool mem = Pool()
|
||||||
|
@ -142,7 +140,7 @@ cdef class Parser:
|
||||||
raise ValueError("Error parsing doc: %s" % sent_str)
|
raise ValueError("Error parsing doc: %s" % sent_str)
|
||||||
PyErr_CheckSignals()
|
PyErr_CheckSignals()
|
||||||
for doc in queue:
|
for doc in queue:
|
||||||
self._finalize(doc)
|
self.moves.finalize_doc(doc)
|
||||||
yield doc
|
yield doc
|
||||||
queue = []
|
queue = []
|
||||||
batch_size = len(queue)
|
batch_size = len(queue)
|
||||||
|
@ -155,18 +153,9 @@ cdef class Parser:
|
||||||
raise ValueError("Error parsing doc: %s" % sent_str)
|
raise ValueError("Error parsing doc: %s" % sent_str)
|
||||||
PyErr_CheckSignals()
|
PyErr_CheckSignals()
|
||||||
for doc in queue:
|
for doc in queue:
|
||||||
self._finalize(doc)
|
self.moves.finalize_doc(doc)
|
||||||
yield doc
|
yield doc
|
||||||
|
|
||||||
def _finalize(self, Doc doc):
|
|
||||||
# deprojectivize output
|
|
||||||
if self._projectivize:
|
|
||||||
PseudoProjectivity.deprojectivize(doc)
|
|
||||||
# set annotation-specific iterators
|
|
||||||
doc.noun_chunks = CHUNKERS.get(doc.vocab.lang,DocIterator)
|
|
||||||
# mark doc as parsed
|
|
||||||
doc.is_parsed = True
|
|
||||||
|
|
||||||
cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil:
|
cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil:
|
||||||
cdef ExampleC eg
|
cdef ExampleC eg
|
||||||
eg.nr_feat = nr_feat
|
eg.nr_feat = nr_feat
|
||||||
|
@ -313,6 +302,7 @@ cdef class StepwiseState:
|
||||||
if self.stcls.is_final():
|
if self.stcls.is_final():
|
||||||
self.parser.moves.finalize_state(self.stcls.c)
|
self.parser.moves.finalize_state(self.stcls.c)
|
||||||
self.doc.set_parse(self.stcls.c._sent)
|
self.doc.set_parse(self.stcls.c._sent)
|
||||||
|
self.parser.moves.finalize_doc(self.doc)
|
||||||
|
|
||||||
|
|
||||||
cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions,
|
cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions,
|
||||||
|
|
|
@ -53,6 +53,9 @@ cdef class TransitionSystem:
|
||||||
cdef int finalize_state(self, StateC* state) nogil:
|
cdef int finalize_state(self, StateC* state) nogil:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def finalize_doc(self, doc):
|
||||||
|
pass
|
||||||
|
|
||||||
cdef int preprocess_gold(self, GoldParse gold) except -1:
|
cdef int preprocess_gold(self, GoldParse gold) except -1:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@ import numpy as np
|
||||||
from spacy.attrs import HEAD, DEP
|
from spacy.attrs import HEAD, DEP
|
||||||
from spacy.symbols import nsubj, dobj, punct, amod, nmod, conj, cc, root
|
from spacy.symbols import nsubj, dobj, punct, amod, nmod, conj, cc, root
|
||||||
from spacy.en import English
|
from spacy.en import English
|
||||||
from spacy.syntax.iterators import EnglishNounChunks
|
from spacy.syntax.iterators import english_noun_chunks
|
||||||
|
|
||||||
|
|
||||||
def test_not_nested():
|
def test_not_nested():
|
||||||
|
@ -22,9 +22,7 @@ def test_not_nested():
|
||||||
[-2, conj],
|
[-2, conj],
|
||||||
[-5, dobj]
|
[-5, dobj]
|
||||||
], dtype='int32'))
|
], dtype='int32'))
|
||||||
tokens.noun_chunks = EnglishNounChunks
|
tokens.noun_chunks_iterator = english_noun_chunks
|
||||||
for chunk in tokens.noun_chunks:
|
|
||||||
print(chunk.text)
|
|
||||||
word_occurred = {}
|
word_occurred = {}
|
||||||
for chunk in tokens.noun_chunks:
|
for chunk in tokens.noun_chunks:
|
||||||
for word in chunk:
|
for word in chunk:
|
||||||
|
|
|
@ -23,6 +23,7 @@ from .span cimport Span
|
||||||
from .token cimport Token
|
from .token cimport Token
|
||||||
from ..serialize.bits cimport BitArray
|
from ..serialize.bits cimport BitArray
|
||||||
from ..util import normalize_slice
|
from ..util import normalize_slice
|
||||||
|
from ..syntax.iterators import CHUNKERS
|
||||||
|
|
||||||
|
|
||||||
DEF PADDING = 5
|
DEF PADDING = 5
|
||||||
|
@ -81,7 +82,7 @@ cdef class Doc:
|
||||||
self.is_parsed = False
|
self.is_parsed = False
|
||||||
self._py_tokens = []
|
self._py_tokens = []
|
||||||
self._vector = None
|
self._vector = None
|
||||||
self.noun_chunks_iterator = DocIterator(self)
|
self.noun_chunks_iterator = CHUNKERS.get(self.vocab.lang)
|
||||||
|
|
||||||
def __getitem__(self, object i):
|
def __getitem__(self, object i):
|
||||||
"""Get a Token or a Span from the Doc.
|
"""Get a Token or a Span from the Doc.
|
||||||
|
@ -233,21 +234,17 @@ cdef class Doc:
|
||||||
self.c[start].ent_iob = 3
|
self.c[start].ent_iob = 3
|
||||||
|
|
||||||
|
|
||||||
property noun_chunks:
|
@property
|
||||||
def __get__(self):
|
def noun_chunks(self):
|
||||||
"""Yield spans for base noun phrases."""
|
"""Yield spans for base noun phrases."""
|
||||||
if not self.is_parsed:
|
if not self.is_parsed:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"noun_chunks requires the dependency parse, which "
|
"noun_chunks requires the dependency parse, which "
|
||||||
"requires data to be installed. If you haven't done so, run: "
|
"requires data to be installed. If you haven't done so, run: "
|
||||||
"\npython -m spacy.%s.download all\n"
|
"\npython -m spacy.%s.download all\n"
|
||||||
"to install the data" % self.vocab.lang)
|
"to install the data" % self.vocab.lang)
|
||||||
|
for start, end, label in self.noun_chunks_iterator(self):
|
||||||
yield from self.noun_chunks_iterator
|
yield Span(self, start, end, label=label)
|
||||||
|
|
||||||
def __set__(self, DocIterator):
|
|
||||||
self.noun_chunks_iterator = DocIterator(self)
|
|
||||||
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def sents(self):
|
def sents(self):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user