* Start trying to pickle Vocab

This commit is contained in:
Matthew Honnibal 2015-10-12 16:41:31 +11:00
parent 5ca57bd859
commit 85e7944572
2 changed files with 17 additions and 2 deletions

View File

@ -25,7 +25,6 @@ cdef struct _Cached:
cdef class Vocab: cdef class Vocab:
cpdef public lexeme_props_getter
cdef Pool mem cdef Pool mem
cpdef readonly StringStore strings cpdef readonly StringStore strings
cpdef readonly Morphology morphology cpdef readonly Morphology morphology
@ -33,7 +32,6 @@ cdef class Vocab:
cdef public object _serializer cdef public object _serializer
cdef public object data_dir cdef public object data_dir
cdef public object get_lex_attr cdef public object get_lex_attr
cdef public object pos_tags
cdef public object serializer_freqs cdef public object serializer_freqs
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL cdef const LexemeC* get(self, Pool mem, unicode string) except NULL

View File

@ -10,6 +10,8 @@ from os import path
import io import io
import math import math
import json import json
import tempfile
import copy_reg
from .lexeme cimport EMPTY_LEXEME from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
@ -96,6 +98,18 @@ cdef class Vocab:
"""The current number of lexemes stored.""" """The current number of lexemes stored."""
return self.length return self.length
def __reduce__(self):
    """Support pickling by dumping the vocab into a fresh temporary directory.

    Writes three files into a newly created temporary directory:
    'lexemes.bin' (via self.dump), 'strings.txt' (via self.strings.dump)
    and 'tag_map.json' (the JSON-encoded self.morphology.tag_map).

    Returns:
        A 4-tuple (callable, args, state, listitems) for the pickle
        protocol: the object is rebuilt by calling
        Vocab.from_dir(tmp_dir, self.get_lex_attr); state and listitems
        are both None.
    """
    # The stdlib function is mkdtemp(); the previous spelling 'mkdtmp' does
    # not exist and raised AttributeError on every pickling attempt.
    tmp_dir = tempfile.mkdtemp()
    lex_loc = path.join(tmp_dir, 'lexemes.bin')
    str_loc = path.join(tmp_dir, 'strings.txt')
    map_loc = path.join(tmp_dir, 'tag_map.json')
    self.dump(lex_loc)
    self.strings.dump(str_loc)
    # Close the handle deterministically so the JSON is fully flushed to
    # disk before the directory is read back at unpickling time.
    with open(map_loc, 'w') as file_:
        json.dump(self.morphology.tag_map, file_)
    # NOTE(review): tmp_dir is intentionally not removed here, because the
    # reconstructor has to read it later; nothing visible in this block ever
    # cleans it up, so each pickling leaves a directory behind -- confirm
    # whether the caller or from_dir should be responsible for deletion.
    return (Vocab.from_dir, (tmp_dir, self.get_lex_attr), None, None)
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL: cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
if necessary, using memory acquired from the given pool. If the pool if necessary, using memory acquired from the given pool. If the pool
@ -339,6 +353,9 @@ cdef class Vocab:
return vec_len return vec_len
# Declare Vocab.from_dir to copy_reg as a constructor: it is the callable
# that Vocab.__reduce__ returns for rebuilding a pickled Vocab.
copy_reg.constructor(Vocab.from_dir)
def write_binary_vectors(in_loc, out_loc): def write_binary_vectors(in_loc, out_loc):
cdef CFile out_file = CFile(out_loc, 'wb') cdef CFile out_file = CFile(out_loc, 'wb')
cdef Address mem cdef Address mem