* Have SBD (sentence boundary detection) return start/end indices

This commit is contained in:
Matthew Honnibal 2015-01-22 22:24:44 +11:00
parent b183dff72d
commit a27b23cc8f

View File

@@ -92,15 +92,18 @@ cdef class Tokens:
cdef attr_t period = self.vocab.strings['.'] cdef attr_t period = self.vocab.strings['.']
cdef attr_t question = self.vocab.strings['?'] cdef attr_t question = self.vocab.strings['?']
cdef attr_t exclamation = self.vocab.strings['!'] cdef attr_t exclamation = self.vocab.strings['!']
spans = []
start = None
for i in range(self.length): for i in range(self.length):
sent.push_back(self.data[i].idx, &self.data[i]) if start is None:
start = i
if self.data[i].lex.sic == period or self.data[i].lex.sic == exclamation or \ if self.data[i].lex.sic == period or self.data[i].lex.sic == exclamation or \
self.data[i].lex.sic == question: self.data[i].lex.sic == question:
sentences.append(sent) spans.append((start, i+1))
sent = Tokens(self.vocab, self._string[self.data[i].idx:]) start = None
if sent.length: if start is not None:
sentences.append(sent) spans.append((start, self.length))
return sentences return spans
def __getitem__(self, i): def __getitem__(self, i):
"""Retrieve a token. """Retrieve a token.