* Bug fixes to sentences method, and improved vector transport for tokens

This commit is contained in:
Matthew Honnibal 2015-01-21 18:56:32 +11:00
parent f2a229136c
commit d6ac60e91c

View File

@ -63,8 +63,10 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
cdef class Tokens: cdef class Tokens:
"""Access and set annotations onto some text. """Access and set annotations onto some text.
""" """
def __init__(self, Vocab vocab, string_length=0): def __init__(self, Vocab vocab, unicode string):
self.vocab = vocab self.vocab = vocab
self._string = string
string_length = len(string)
if string_length >= 3: if string_length >= 3:
size = int(string_length / 3.0) size = int(string_length / 3.0)
else: else:
@ -84,16 +86,18 @@ cdef class Tokens:
def sentences(self): def sentences(self):
cdef int i cdef int i
sentences = [] sentences = []
sent = Tokens(self.vocab) cdef Tokens sent = Tokens(self.vocab, self._string[self.data[0].idx:])
cdef attr_t period = self.vocab.strings['.'] cdef attr_t period = self.vocab.strings['.']
cdef attr_t question = self.vocab.strings['?'] cdef attr_t question = self.vocab.strings['?']
cdef attr_t exclamation = self.vocab.strings['!'] cdef attr_t exclamation = self.vocab.strings['!']
for i in range(self.length): for i in range(self.length):
idx = sent.push_back(idx, &self.data[i]) sent.push_back(self.data[i].idx, &self.data[i])
if self.data[i].lex.sic == period or self.data[i].lex.sic == exclamation or \ if self.data[i].lex.sic == period or self.data[i].lex.sic == exclamation or \
self.data[i].lex.sic == question: self.data[i].lex.sic == question:
sentences.append(sent) sentences.append(sent)
sent = Tokens(self.vocab) sent = Tokens(self.vocab, self._string[self.data[i].idx:])
if sent.length:
sentences.append(sent)
return sentences return sentences
def __getitem__(self, i): def __getitem__(self, i):
@ -119,6 +123,10 @@ cdef class Tokens:
def __len__(self): def __len__(self):
return self.length return self.length
def __unicode__(self):
cdef const TokenC* last = &self.data[self.length - 1]
return self._string[:last.idx + last.lex.length]
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1: cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
if self.length == self.max_length: if self.length == self.max_length:
self._realloc(self.length * 2) self._realloc(self.length * 2)
@ -221,9 +229,10 @@ cdef class Token:
self.tag = t.tag self.tag = t.tag
self.dep = t.dep self.dep = t.dep
self.vec = numpy.ndarray(shape=(300,), dtype=numpy.float32) #self.vec = numpy.ndarray(shape=(300,), dtype=numpy.float32)
for i in range(300): #for i in range(300):
self.vec[i] = t.lex.vec[i] # self.vec[i] = t.lex.vec[i]
self.vec = numpy.asarray(<float[:300,]> t.lex.vec)
def __unicode__(self): def __unicode__(self):
cdef const TokenC* t = &self._seq.data[self.i] cdef const TokenC* t = &self._seq.data[self.i]
@ -247,7 +256,7 @@ cdef class Token:
return False return False
def is_pos(self, univ_tag_t pos): def is_pos(self, univ_tag_t pos):
return False return self.tag == pos
property head: property head:
"""The token predicted by the parser to be the head of the current token.""" """The token predicted by the parser to be the head of the current token."""