* Save and load word vectors during pickling, re Issue #125

This commit is contained in:
Matthew Honnibal 2015-10-26 12:33:04 +11:00
parent a824a98312
commit a371a1071d

View File

@ -109,12 +109,14 @@ cdef class Vocab:
tmp_dir = tempfile.mkdtemp() tmp_dir = tempfile.mkdtemp()
lex_loc = path.join(tmp_dir, 'lexemes.bin') lex_loc = path.join(tmp_dir, 'lexemes.bin')
str_loc = path.join(tmp_dir, 'strings.json') str_loc = path.join(tmp_dir, 'strings.json')
vec_loc = path.join(self.data_dir, 'vec.bin') if self.data_dir is not None else None vec_loc = path.join(tmp_dir, 'vec.bin')
self.dump(lex_loc) self.dump(lex_loc)
with io.open(str_loc, 'w', encoding='utf8') as file_: with io.open(str_loc, 'w', encoding='utf8') as file_:
self.strings.dump(file_) self.strings.dump(file_)
self.dump_vectors(vec_loc)
state = (str_loc, lex_loc, vec_loc, self.morphology, self.get_lex_attr, state = (str_loc, lex_loc, vec_loc, self.morphology, self.get_lex_attr,
self.serializer_freqs, self.data_dir) self.serializer_freqs, self.data_dir)
return (unpickle_vocab, state, None, None) return (unpickle_vocab, state, None, None)
@ -293,6 +295,27 @@ cdef class Vocab:
i += 1 i += 1
fp.close() fp.close()
def dump_vectors(self, out_loc):
    """Write each lexeme's word string and vector to a binary file.

    One record is written per lexeme obtained by iterating over `self`.
    The fields of a record are written in this order:

        1. the byte length of the UTF-8 encoded word (int32_t)
        2. the vector length, `self.vectors_length` (int32_t); the same
           value is repeated in every record
        3. the UTF-8 encoded bytes of the word (no terminator is written)
        4. `vectors_length` elements of `sizeof(float)` bytes each, read
           from `lexeme.c.repvec`

    out_loc: Location handed to `CFile`, which is opened in 'wb' mode.
    """
    # All cdef declarations must stay at function level: Cython rejects
    # them inside try blocks and loop bodies.
    cdef int32_t vec_len = self.vectors_length
    cdef int32_t word_len
    cdef bytes word_str
    cdef char* chars
    cdef Lexeme lexeme
    cdef CFile out_file = CFile(out_loc, 'wb')
    try:
        for lexeme in self:
            word_str = lexeme.orth_.encode('utf8')
            # NOTE(review): repvec is presumably a float* of length
            # vectors_length -- confirm against the LexemeC declaration.
            vec = lexeme.c.repvec
            word_len = len(word_str)

            # Header: both lengths come first, ahead of the payloads.
            # write_from appears to take (source, element count, element
            # size) -- inferred from the calls here, confirm against CFile.
            out_file.write_from(&word_len, 1, sizeof(word_len))
            out_file.write_from(&vec_len, 1, sizeof(vec_len))

            # Payload: raw word bytes, then the vector values. `chars`
            # borrows word_str's buffer, so word_str must stay referenced
            # until the write has completed.
            chars = <char*>word_str
            out_file.write_from(chars, word_len, sizeof(char))
            out_file.write_from(vec, vec_len, sizeof(float))
    finally:
        # Close the handle even when encoding or a write raises, so a
        # failed dump does not leave the file open.
        out_file.close()
def load_vectors(self, file_): def load_vectors(self, file_):
cdef LexemeC* lexeme cdef LexemeC* lexeme
cdef attr_t orth cdef attr_t orth