* Start trying to pickle Vocab

This commit is contained in:
Matthew Honnibal 2015-10-12 16:41:31 +11:00
parent 5ca57bd859
commit 85e7944572
2 changed files with 17 additions and 2 deletions

View File

@ -25,7 +25,6 @@ cdef struct _Cached:
cdef class Vocab:
cpdef public lexeme_props_getter
cdef Pool mem
cpdef readonly StringStore strings
cpdef readonly Morphology morphology
@ -33,7 +32,6 @@ cdef class Vocab:
cdef public object _serializer
cdef public object data_dir
cdef public object get_lex_attr
cdef public object pos_tags
cdef public object serializer_freqs
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL

View File

@ -10,6 +10,8 @@ from os import path
import io
import math
import json
import tempfile
import copy_reg
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport Lexeme
@ -96,6 +98,18 @@ cdef class Vocab:
"""The current number of lexemes stored."""
return self.length
def __reduce__(self):
    """Pickle support: dump the vocab to a temporary directory.

    Writes the lexemes, the string store and the morphology's tag map to
    files inside a freshly created temporary directory, then tells pickle
    to rebuild the object by calling ``Vocab.from_dir`` on that directory.

    Returns:
        The reduce tuple
        ``(Vocab.from_dir, (tmp_dir, self.get_lex_attr), None, None)``.
    """
    # The stdlib function is mkdtemp; the previous spelling 'mkdtmp' does
    # not exist and raised AttributeError on every pickle attempt.
    tmp_dir = tempfile.mkdtemp()
    lex_loc = path.join(tmp_dir, 'lexemes.bin')
    str_loc = path.join(tmp_dir, 'strings.txt')
    map_loc = path.join(tmp_dir, 'tag_map.json')

    self.dump(lex_loc)
    self.strings.dump(str_loc)
    # Close the handle deterministically so the JSON is fully flushed to
    # disk before the directory path is handed over for unpickling.
    with open(map_loc, 'w') as map_file:
        json.dump(self.morphology.tag_map, map_file)

    # NOTE(review): tmp_dir is never removed here, and the reduce args carry
    # only its path, so the pickle is usable only while that directory
    # still exists and is reachable from the unpickling process.
    return (Vocab.from_dir, (tmp_dir, self.get_lex_attr), None, None)
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
if necessary, using memory acquired from the given pool. If the pool
@ -339,6 +353,9 @@ cdef class Vocab:
return vec_len
# Declare Vocab.from_dir to copy_reg as a constructor: it is the callable
# that Vocab.__reduce__ returns for rebuilding a pickled Vocab.
copy_reg.constructor(Vocab.from_dir)
def write_binary_vectors(in_loc, out_loc):
cdef CFile out_file = CFile(out_loc, 'wb')
cdef Address mem