mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 18:56:36 +03:00
* Bug fixes to sentences method, and improved vector transport for tokens
This commit is contained in:
parent
f2a229136c
commit
d6ac60e91c
|
@ -63,8 +63,10 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
|
||||||
cdef class Tokens:
|
cdef class Tokens:
|
||||||
"""Access and set annotations onto some text.
|
"""Access and set annotations onto some text.
|
||||||
"""
|
"""
|
||||||
def __init__(self, Vocab vocab, string_length=0):
|
def __init__(self, Vocab vocab, unicode string):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
|
self._string = string
|
||||||
|
string_length = len(string)
|
||||||
if string_length >= 3:
|
if string_length >= 3:
|
||||||
size = int(string_length / 3.0)
|
size = int(string_length / 3.0)
|
||||||
else:
|
else:
|
||||||
|
@ -84,16 +86,18 @@ cdef class Tokens:
|
||||||
def sentences(self):
|
def sentences(self):
|
||||||
cdef int i
|
cdef int i
|
||||||
sentences = []
|
sentences = []
|
||||||
sent = Tokens(self.vocab)
|
cdef Tokens sent = Tokens(self.vocab, self._string[self.data[0].idx:])
|
||||||
cdef attr_t period = self.vocab.strings['.']
|
cdef attr_t period = self.vocab.strings['.']
|
||||||
cdef attr_t question = self.vocab.strings['?']
|
cdef attr_t question = self.vocab.strings['?']
|
||||||
cdef attr_t exclamation = self.vocab.strings['!']
|
cdef attr_t exclamation = self.vocab.strings['!']
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
idx = sent.push_back(idx, &self.data[i])
|
sent.push_back(self.data[i].idx, &self.data[i])
|
||||||
if self.data[i].lex.sic == period or self.data[i].lex.sic == exclamation or \
|
if self.data[i].lex.sic == period or self.data[i].lex.sic == exclamation or \
|
||||||
self.data[i].lex.sic == question:
|
self.data[i].lex.sic == question:
|
||||||
sentences.append(sent)
|
sentences.append(sent)
|
||||||
sent = Tokens(self.vocab)
|
sent = Tokens(self.vocab, self._string[self.data[i].idx:])
|
||||||
|
if sent.length:
|
||||||
|
sentences.append(sent)
|
||||||
return sentences
|
return sentences
|
||||||
|
|
||||||
def __getitem__(self, i):
|
def __getitem__(self, i):
|
||||||
|
@ -119,6 +123,10 @@ cdef class Tokens:
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return self.length
|
return self.length
|
||||||
|
|
||||||
|
def __unicode__(self):
|
||||||
|
cdef const TokenC* last = &self.data[self.length - 1]
|
||||||
|
return self._string[:last.idx + last.lex.length]
|
||||||
|
|
||||||
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
|
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
|
||||||
if self.length == self.max_length:
|
if self.length == self.max_length:
|
||||||
self._realloc(self.length * 2)
|
self._realloc(self.length * 2)
|
||||||
|
@ -221,9 +229,10 @@ cdef class Token:
|
||||||
self.tag = t.tag
|
self.tag = t.tag
|
||||||
self.dep = t.dep
|
self.dep = t.dep
|
||||||
|
|
||||||
self.vec = numpy.ndarray(shape=(300,), dtype=numpy.float32)
|
#self.vec = numpy.ndarray(shape=(300,), dtype=numpy.float32)
|
||||||
for i in range(300):
|
#for i in range(300):
|
||||||
self.vec[i] = t.lex.vec[i]
|
# self.vec[i] = t.lex.vec[i]
|
||||||
|
self.vec = numpy.asarray(<float[:300,]> t.lex.vec)
|
||||||
|
|
||||||
def __unicode__(self):
|
def __unicode__(self):
|
||||||
cdef const TokenC* t = &self._seq.data[self.i]
|
cdef const TokenC* t = &self._seq.data[self.i]
|
||||||
|
@ -247,7 +256,7 @@ cdef class Token:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def is_pos(self, univ_tag_t pos):
|
def is_pos(self, univ_tag_t pos):
|
||||||
return False
|
return self.tag == pos
|
||||||
|
|
||||||
property head:
|
property head:
|
||||||
"""The token predicted by the parser to be the head of the current token."""
|
"""The token predicted by the parser to be the head of the current token."""
|
||||||
|
|
Loading…
Reference in New Issue
Block a user