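* Rename .repvec to .vector in C API (the LexemeC struct field); also remove the Doc.repvec property and look up TAG values via vocab.morphology.reverse_index instead of vocab.strings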

This commit is contained in:
Matthew Honnibal 2015-11-03 23:47:59 +11:00
parent f81389abe0
commit 1e99fcd413
5 changed files with 18 additions and 22 deletions

View File

@ -51,7 +51,7 @@ cdef class Lexeme:
def __get__(self): def __get__(self):
cdef int i cdef int i
for i in range(self.vocab.vectors_length): for i in range(self.vocab.vectors_length):
if self.c.repvec[i] != 0: if self.c.vector[i] != 0:
return True return True
else: else:
return False return False
@ -74,14 +74,14 @@ cdef class Lexeme:
"to install the data." "to install the data."
) )
repvec_view = <float[:length,]>self.c.repvec vector_view = <float[:length,]>self.c.vector
return numpy.asarray(repvec_view) return numpy.asarray(vector_view)
def __set__(self, vector): def __set__(self, vector):
assert len(vector) == self.vocab.vectors_length assert len(vector) == self.vocab.vectors_length
cdef float value cdef float value
for i, value in enumerate(vector): for i, value in enumerate(vector):
self.c.repvec[i] = value self.c.vector[i] = value
property repvec: property repvec:
def __get__(self): def __get__(self):

View File

@ -5,7 +5,7 @@ from .parts_of_speech cimport univ_pos_t
cdef struct LexemeC: cdef struct LexemeC:
float* repvec float* vector
flags_t flags flags_t flags

View File

@ -134,10 +134,6 @@ cdef class Doc:
return 0.0 return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
property repvec:
def __get__(self):
return self.vector
property vector: property vector:
def __get__(self): def __get__(self):
if self._vector is None: if self._vector is None:
@ -399,7 +395,7 @@ cdef class Doc:
elif attr_id == TAG: elif attr_id == TAG:
for i in range(length): for i in range(length):
self.vocab.morphology.assign_tag(&tokens[i], self.vocab.morphology.assign_tag(&tokens[i],
self.vocab.strings[values[i]]) self.vocab.morphology.reverse_index[values[i]])
if not self.is_tagged and tokens[i].tag != 0: if not self.is_tagged and tokens[i].tag != 0:
self.is_tagged = True self.is_tagged = True
elif attr_id == POS: elif attr_id == POS:

View File

@ -143,7 +143,7 @@ cdef class Token:
def __get__(self): def __get__(self):
cdef int i cdef int i
for i in range(self.vocab.vectors_length): for i in range(self.vocab.vectors_length):
if self.c.lex.repvec[i] != 0: if self.c.lex.vector[i] != 0:
return True return True
else: else:
return False return False
@ -158,8 +158,8 @@ cdef class Token:
"\npython -m spacy.en.download all\n" "\npython -m spacy.en.download all\n"
"to install the data." "to install the data."
) )
repvec_view = <float[:length,]>self.c.lex.repvec vector_view = <float[:length,]>self.c.lex.vector
return numpy.asarray(repvec_view) return numpy.asarray(vector_view)
property repvec: property repvec:
def __get__(self): def __get__(self):

View File

@ -40,7 +40,7 @@ DEF MAX_VEC_SIZE = 100000
cdef float[MAX_VEC_SIZE] EMPTY_VEC cdef float[MAX_VEC_SIZE] EMPTY_VEC
memset(EMPTY_VEC, 0, sizeof(EMPTY_VEC)) memset(EMPTY_VEC, 0, sizeof(EMPTY_VEC))
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
EMPTY_LEXEME.repvec = EMPTY_VEC EMPTY_LEXEME.vector = EMPTY_VEC
cdef class Vocab: cdef class Vocab:
@ -162,7 +162,7 @@ cdef class Vocab:
lex.orth = self.strings[string] lex.orth = self.strings[string]
lex.length = len(string) lex.length = len(string)
lex.id = self.length lex.id = self.length
lex.repvec = <float*>mem.alloc(self.vectors_length, sizeof(float)) lex.vector = <float*>mem.alloc(self.vectors_length, sizeof(float))
if self.get_lex_attr is not None: if self.get_lex_attr is not None:
for attr, func in self.get_lex_attr.items(): for attr, func in self.get_lex_attr.items():
value = func(string) value = func(string)
@ -287,7 +287,7 @@ cdef class Vocab:
fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment)) fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment))
fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm)) fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm))
lexeme.repvec = EMPTY_VEC lexeme.vector = EMPTY_VEC
py_str = self.strings[lexeme.orth] py_str = self.strings[lexeme.orth]
key = hash_string(py_str) key = hash_string(py_str)
self._by_hash.set(key, lexeme) self._by_hash.set(key, lexeme)
@ -306,7 +306,7 @@ cdef class Vocab:
cdef CFile out_file = CFile(out_loc, 'wb') cdef CFile out_file = CFile(out_loc, 'wb')
for lexeme in self: for lexeme in self:
word_str = lexeme.orth_.encode('utf8') word_str = lexeme.orth_.encode('utf8')
vec = lexeme.c.repvec vec = lexeme.c.vector
word_len = len(word_str) word_len = len(word_str)
out_file.write_from(&word_len, 1, sizeof(word_len)) out_file.write_from(&word_len, 1, sizeof(word_len))
@ -331,10 +331,10 @@ cdef class Vocab:
vec_len, len(pieces)) vec_len, len(pieces))
orth = self.strings[word_str] orth = self.strings[word_str]
lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth) lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth)
lexeme.repvec = <float*>self.mem.alloc(self.vectors_length, sizeof(float)) lexeme.vector = <float*>self.mem.alloc(self.vectors_length, sizeof(float))
for i, val_str in enumerate(pieces): for i, val_str in enumerate(pieces):
lexeme.repvec[i] = float(val_str) lexeme.vector[i] = float(val_str)
return vec_len return vec_len
def load_vectors_from_bin_loc(self, loc): def load_vectors_from_bin_loc(self, loc):
@ -376,12 +376,12 @@ cdef class Vocab:
for orth, lex_addr in self._by_orth.items(): for orth, lex_addr in self._by_orth.items():
lex = <LexemeC*>lex_addr lex = <LexemeC*>lex_addr
if lex.lower < vectors.size(): if lex.lower < vectors.size():
lex.repvec = vectors[lex.lower] lex.vector = vectors[lex.lower]
for i in range(vec_len): for i in range(vec_len):
lex.l2_norm += (lex.repvec[i] * lex.repvec[i]) lex.l2_norm += (lex.vector[i] * lex.vector[i])
lex.l2_norm = math.sqrt(lex.l2_norm) lex.l2_norm = math.sqrt(lex.l2_norm)
else: else:
lex.repvec = EMPTY_VEC lex.vector = EMPTY_VEC
return vec_len return vec_len