mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 17:54:39 +03:00
* Save and load word vectors during pickling, re Issue #125
This commit is contained in:
parent
a824a98312
commit
a371a1071d
|
@ -109,12 +109,14 @@ cdef class Vocab:
|
||||||
tmp_dir = tempfile.mkdtemp()
|
tmp_dir = tempfile.mkdtemp()
|
||||||
lex_loc = path.join(tmp_dir, 'lexemes.bin')
|
lex_loc = path.join(tmp_dir, 'lexemes.bin')
|
||||||
str_loc = path.join(tmp_dir, 'strings.json')
|
str_loc = path.join(tmp_dir, 'strings.json')
|
||||||
vec_loc = path.join(self.data_dir, 'vec.bin') if self.data_dir is not None else None
|
vec_loc = path.join(tmp_dir, 'vec.bin')
|
||||||
|
|
||||||
self.dump(lex_loc)
|
self.dump(lex_loc)
|
||||||
with io.open(str_loc, 'w', encoding='utf8') as file_:
|
with io.open(str_loc, 'w', encoding='utf8') as file_:
|
||||||
self.strings.dump(file_)
|
self.strings.dump(file_)
|
||||||
|
|
||||||
|
self.dump_vectors(vec_loc)
|
||||||
|
|
||||||
state = (str_loc, lex_loc, vec_loc, self.morphology, self.get_lex_attr,
|
state = (str_loc, lex_loc, vec_loc, self.morphology, self.get_lex_attr,
|
||||||
self.serializer_freqs, self.data_dir)
|
self.serializer_freqs, self.data_dir)
|
||||||
return (unpickle_vocab, state, None, None)
|
return (unpickle_vocab, state, None, None)
|
||||||
|
@ -293,6 +295,27 @@ cdef class Vocab:
|
||||||
i += 1
|
i += 1
|
||||||
fp.close()
|
fp.close()
|
||||||
|
|
||||||
|
def dump_vectors(self, out_loc):
    """Write each lexeme's word string and vector to a binary file.

    One record is written per lexeme yielded by iterating over ``self``.
    The fields of a record are emitted in this order:

    1. ``word_len`` (int32): byte length of the UTF-8 encoded ``orth_``.
    2. ``vec_len`` (int32): ``self.vectors_length``, identical for every
       record.
    3. ``word_len`` chars: the UTF-8 encoded word itself.
    4. ``vec_len`` floats: taken from ``lexeme.c.repvec``.

    NOTE(review): the exact on-disk byte layout depends on how
    ``CFile.write_from`` serialises its buffer -- confirm against CFile.

    out_loc: Location of the output file; it is opened in ``'wb'`` mode.
    """
    cdef int32_t vec_len = self.vectors_length
    cdef int32_t word_len
    cdef bytes word_str
    cdef char* chars

    cdef Lexeme lexeme
    cdef CFile out_file = CFile(out_loc, 'wb')
    # Close the file even when encoding or a write fails part-way through,
    # so an error does not leave the handle open.
    try:
        for lexeme in self:
            word_str = lexeme.orth_.encode('utf8')
            vec = lexeme.c.repvec
            word_len = len(word_str)

            # Both lengths come first, ahead of the variable-sized payload.
            out_file.write_from(&word_len, 1, sizeof(word_len))
            out_file.write_from(&vec_len, 1, sizeof(vec_len))

            # word_str stays referenced for the whole iteration, so the
            # borrowed char* remains valid while it is written.
            chars = <char*>word_str
            out_file.write_from(chars, word_len, sizeof(char))
            out_file.write_from(vec, vec_len, sizeof(float))
    finally:
        out_file.close()
|
||||||
|
|
||||||
def load_vectors(self, file_):
|
def load_vectors(self, file_):
|
||||||
cdef LexemeC* lexeme
|
cdef LexemeC* lexeme
|
||||||
cdef attr_t orth
|
cdef attr_t orth
|
||||||
|
|
Loading…
Reference in New Issue
Block a user