mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
* Rename .repvec to .vector in C API
This commit is contained in:
parent
f81389abe0
commit
1e99fcd413
|
@ -51,7 +51,7 @@ cdef class Lexeme:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(self.vocab.vectors_length):
|
for i in range(self.vocab.vectors_length):
|
||||||
if self.c.repvec[i] != 0:
|
if self.c.vector[i] != 0:
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
@ -74,14 +74,14 @@ cdef class Lexeme:
|
||||||
"to install the data."
|
"to install the data."
|
||||||
)
|
)
|
||||||
|
|
||||||
repvec_view = <float[:length,]>self.c.repvec
|
vector_view = <float[:length,]>self.c.vector
|
||||||
return numpy.asarray(repvec_view)
|
return numpy.asarray(vector_view)
|
||||||
|
|
||||||
def __set__(self, vector):
|
def __set__(self, vector):
|
||||||
assert len(vector) == self.vocab.vectors_length
|
assert len(vector) == self.vocab.vectors_length
|
||||||
cdef float value
|
cdef float value
|
||||||
for i, value in enumerate(vector):
|
for i, value in enumerate(vector):
|
||||||
self.c.repvec[i] = value
|
self.c.vector[i] = value
|
||||||
|
|
||||||
property repvec:
|
property repvec:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
|
|
@ -5,7 +5,7 @@ from .parts_of_speech cimport univ_pos_t
|
||||||
|
|
||||||
|
|
||||||
cdef struct LexemeC:
|
cdef struct LexemeC:
|
||||||
float* repvec
|
float* vector
|
||||||
|
|
||||||
flags_t flags
|
flags_t flags
|
||||||
|
|
||||||
|
|
|
@ -134,10 +134,6 @@ cdef class Doc:
|
||||||
return 0.0
|
return 0.0
|
||||||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||||
|
|
||||||
property repvec:
|
|
||||||
def __get__(self):
|
|
||||||
return self.vector
|
|
||||||
|
|
||||||
property vector:
|
property vector:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if self._vector is None:
|
if self._vector is None:
|
||||||
|
@ -399,7 +395,7 @@ cdef class Doc:
|
||||||
elif attr_id == TAG:
|
elif attr_id == TAG:
|
||||||
for i in range(length):
|
for i in range(length):
|
||||||
self.vocab.morphology.assign_tag(&tokens[i],
|
self.vocab.morphology.assign_tag(&tokens[i],
|
||||||
self.vocab.strings[values[i]])
|
self.vocab.morphology.reverse_index[values[i]])
|
||||||
if not self.is_tagged and tokens[i].tag != 0:
|
if not self.is_tagged and tokens[i].tag != 0:
|
||||||
self.is_tagged = True
|
self.is_tagged = True
|
||||||
elif attr_id == POS:
|
elif attr_id == POS:
|
||||||
|
|
|
@ -143,7 +143,7 @@ cdef class Token:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(self.vocab.vectors_length):
|
for i in range(self.vocab.vectors_length):
|
||||||
if self.c.lex.repvec[i] != 0:
|
if self.c.lex.vector[i] != 0:
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
@ -158,8 +158,8 @@ cdef class Token:
|
||||||
"\npython -m spacy.en.download all\n"
|
"\npython -m spacy.en.download all\n"
|
||||||
"to install the data."
|
"to install the data."
|
||||||
)
|
)
|
||||||
repvec_view = <float[:length,]>self.c.lex.repvec
|
vector_view = <float[:length,]>self.c.lex.vector
|
||||||
return numpy.asarray(repvec_view)
|
return numpy.asarray(vector_view)
|
||||||
|
|
||||||
property repvec:
|
property repvec:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
|
|
@ -40,7 +40,7 @@ DEF MAX_VEC_SIZE = 100000
|
||||||
cdef float[MAX_VEC_SIZE] EMPTY_VEC
|
cdef float[MAX_VEC_SIZE] EMPTY_VEC
|
||||||
memset(EMPTY_VEC, 0, sizeof(EMPTY_VEC))
|
memset(EMPTY_VEC, 0, sizeof(EMPTY_VEC))
|
||||||
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
||||||
EMPTY_LEXEME.repvec = EMPTY_VEC
|
EMPTY_LEXEME.vector = EMPTY_VEC
|
||||||
|
|
||||||
|
|
||||||
cdef class Vocab:
|
cdef class Vocab:
|
||||||
|
@ -162,7 +162,7 @@ cdef class Vocab:
|
||||||
lex.orth = self.strings[string]
|
lex.orth = self.strings[string]
|
||||||
lex.length = len(string)
|
lex.length = len(string)
|
||||||
lex.id = self.length
|
lex.id = self.length
|
||||||
lex.repvec = <float*>mem.alloc(self.vectors_length, sizeof(float))
|
lex.vector = <float*>mem.alloc(self.vectors_length, sizeof(float))
|
||||||
if self.get_lex_attr is not None:
|
if self.get_lex_attr is not None:
|
||||||
for attr, func in self.get_lex_attr.items():
|
for attr, func in self.get_lex_attr.items():
|
||||||
value = func(string)
|
value = func(string)
|
||||||
|
@ -287,7 +287,7 @@ cdef class Vocab:
|
||||||
fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment))
|
fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment))
|
||||||
fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm))
|
fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm))
|
||||||
|
|
||||||
lexeme.repvec = EMPTY_VEC
|
lexeme.vector = EMPTY_VEC
|
||||||
py_str = self.strings[lexeme.orth]
|
py_str = self.strings[lexeme.orth]
|
||||||
key = hash_string(py_str)
|
key = hash_string(py_str)
|
||||||
self._by_hash.set(key, lexeme)
|
self._by_hash.set(key, lexeme)
|
||||||
|
@ -306,7 +306,7 @@ cdef class Vocab:
|
||||||
cdef CFile out_file = CFile(out_loc, 'wb')
|
cdef CFile out_file = CFile(out_loc, 'wb')
|
||||||
for lexeme in self:
|
for lexeme in self:
|
||||||
word_str = lexeme.orth_.encode('utf8')
|
word_str = lexeme.orth_.encode('utf8')
|
||||||
vec = lexeme.c.repvec
|
vec = lexeme.c.vector
|
||||||
word_len = len(word_str)
|
word_len = len(word_str)
|
||||||
|
|
||||||
out_file.write_from(&word_len, 1, sizeof(word_len))
|
out_file.write_from(&word_len, 1, sizeof(word_len))
|
||||||
|
@ -331,10 +331,10 @@ cdef class Vocab:
|
||||||
vec_len, len(pieces))
|
vec_len, len(pieces))
|
||||||
orth = self.strings[word_str]
|
orth = self.strings[word_str]
|
||||||
lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth)
|
lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth)
|
||||||
lexeme.repvec = <float*>self.mem.alloc(self.vectors_length, sizeof(float))
|
lexeme.vector = <float*>self.mem.alloc(self.vectors_length, sizeof(float))
|
||||||
|
|
||||||
for i, val_str in enumerate(pieces):
|
for i, val_str in enumerate(pieces):
|
||||||
lexeme.repvec[i] = float(val_str)
|
lexeme.vector[i] = float(val_str)
|
||||||
return vec_len
|
return vec_len
|
||||||
|
|
||||||
def load_vectors_from_bin_loc(self, loc):
|
def load_vectors_from_bin_loc(self, loc):
|
||||||
|
@ -376,12 +376,12 @@ cdef class Vocab:
|
||||||
for orth, lex_addr in self._by_orth.items():
|
for orth, lex_addr in self._by_orth.items():
|
||||||
lex = <LexemeC*>lex_addr
|
lex = <LexemeC*>lex_addr
|
||||||
if lex.lower < vectors.size():
|
if lex.lower < vectors.size():
|
||||||
lex.repvec = vectors[lex.lower]
|
lex.vector = vectors[lex.lower]
|
||||||
for i in range(vec_len):
|
for i in range(vec_len):
|
||||||
lex.l2_norm += (lex.repvec[i] * lex.repvec[i])
|
lex.l2_norm += (lex.vector[i] * lex.vector[i])
|
||||||
lex.l2_norm = math.sqrt(lex.l2_norm)
|
lex.l2_norm = math.sqrt(lex.l2_norm)
|
||||||
else:
|
else:
|
||||||
lex.repvec = EMPTY_VEC
|
lex.vector = EMPTY_VEC
|
||||||
return vec_len
|
return vec_len
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user