From d6ac60e91c0f235406e01689f09d5e550cf3597d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 21 Jan 2015 18:56:32 +1100 Subject: [PATCH] * Bug fixes to sentences method, and improved vector transport for tokens --- spacy/tokens.pyx | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 145131fd8..9389155f8 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -63,8 +63,10 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil: cdef class Tokens: """Access and set annotations onto some text. """ - def __init__(self, Vocab vocab, string_length=0): + def __init__(self, Vocab vocab, unicode string): self.vocab = vocab + self._string = string + string_length = len(string) if string_length >= 3: size = int(string_length / 3.0) else: @@ -84,16 +86,18 @@ cdef class Tokens: def sentences(self): cdef int i sentences = [] - sent = Tokens(self.vocab) + cdef Tokens sent = Tokens(self.vocab, self._string[self.data[0].idx:]) cdef attr_t period = self.vocab.strings['.'] cdef attr_t question = self.vocab.strings['?'] cdef attr_t exclamation = self.vocab.strings['!'] for i in range(self.length): - idx = sent.push_back(idx, &self.data[i]) + sent.push_back(self.data[i].idx, &self.data[i]) if self.data[i].lex.sic == period or self.data[i].lex.sic == exclamation or \ self.data[i].lex.sic == question: sentences.append(sent) - sent = Tokens(self.vocab) + sent = Tokens(self.vocab, self._string[self.data[i].idx:]) + if sent.length: + sentences.append(sent) return sentences def __getitem__(self, i): @@ -119,6 +123,10 @@ cdef class Tokens: def __len__(self): return self.length + def __unicode__(self): + cdef const TokenC* last = &self.data[self.length - 1] + return self._string[:last.idx + last.lex.length] + cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1: if self.length == self.max_length: self._realloc(self.length * 2) @@ -221,9 +229,10 @@ cdef class Token: self.tag = t.tag self.dep = t.dep - self.vec = numpy.ndarray(shape=(300,), dtype=numpy.float32) - for i in range(300): - self.vec[i] = t.lex.vec[i] + #self.vec = numpy.ndarray(shape=(300,), dtype=numpy.float32) + #for i in range(300): + # self.vec[i] = t.lex.vec[i] + self.vec = numpy.asarray( t.lex.vec) def __unicode__(self): cdef const TokenC* t = &self._seq.data[self.i] @@ -247,7 +256,7 @@ cdef class Token: return False def is_pos(self, univ_tag_t pos): - return False + return self.tag == pos property head: """The token predicted by the parser to be the head of the current token."""