* Work on pickling Vocab instances. The current implementation is not correct, but it may serve to see whether this approach is workable. Pickling is necessary to address Issue #125.

This commit is contained in:
Matthew Honnibal 2015-10-12 17:00:01 +11:00
parent 85e7944572
commit f8de403483
2 changed files with 20 additions and 3 deletions

View File

@ -99,7 +99,7 @@ cdef class Vocab:
return self.length return self.length
def __reduce__(self): def __reduce__(self):
tmp_dir = tempfile.mkdtmp() tmp_dir = tempfile.mkdtemp()
lex_loc = path.join(tmp_dir, 'lexemes.bin') lex_loc = path.join(tmp_dir, 'lexemes.bin')
str_loc = path.join(tmp_dir, 'strings.txt') str_loc = path.join(tmp_dir, 'strings.txt')
map_loc = path.join(tmp_dir, 'tag_map.json') map_loc = path.join(tmp_dir, 'tag_map.json')
@ -108,7 +108,7 @@ cdef class Vocab:
self.strings.dump(str_loc) self.strings.dump(str_loc)
json.dump(self.morphology.tag_map, open(map_loc, 'w')) json.dump(self.morphology.tag_map, open(map_loc, 'w'))
return (Vocab.from_dir, (tmp_dir, self.get_lex_attr), None, None) return (unpickle_vocab, (tmp_dir,), None, None)
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL: cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
@ -353,7 +353,13 @@ cdef class Vocab:
return vec_len return vec_len
def unpickle_vocab(data_dir):
    """Reconstruct a Vocab from a directory of dumped vocabulary files.

    This is the callable that ``Vocab.__reduce__`` hands to pickle, with the
    temporary directory it wrote as the only argument. The result of
    ``Language.default_lex_attrs()`` is passed as the second argument to
    ``Vocab.from_dir``.
    """
    # TODO: This needs fixing --- the trouble is, we can't pickle staticmethods,
    # so we need to fiddle with the design of Language a little bit.
    # NOTE(review): imported at call time rather than module level, presumably
    # to avoid a circular import between the two modules -- confirm.
    from .language import Language

    lex_attr_getters = Language.default_lex_attrs()
    return Vocab.from_dir(data_dir, lex_attr_getters)


# Register the helper with copy_reg as a valid pickle constructor.
copy_reg.constructor(unpickle_vocab)
def write_binary_vectors(in_loc, out_loc): def write_binary_vectors(in_loc, out_loc):

View File

@ -1,10 +1,13 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest import pytest
import StringIO
import pickle
from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA
from spacy.parts_of_speech import NOUN, VERB from spacy.parts_of_speech import NOUN, VERB
def test_neq(en_vocab): def test_neq(en_vocab):
addr = en_vocab['Hello'] addr = en_vocab['Hello']
assert en_vocab['bye'].orth != addr.orth assert en_vocab['bye'].orth != addr.orth
@ -38,3 +41,11 @@ def test_symbols(en_vocab):
assert en_vocab.strings['ORTH'] == ORTH assert en_vocab.strings['ORTH'] == ORTH
assert en_vocab.strings['PROB'] == PROB assert en_vocab.strings['PROB'] == PROB
def test_pickle_vocab(en_vocab):
    """Smoke test: pickle the vocab into a buffer and load it back.

    Only checks that the dump/load round trip runs without raising; nothing
    is asserted about the loaded object.
    """
    buffer_ = StringIO.StringIO()
    pickle.dump(en_vocab, buffer_)
    # Rewind so the load starts at the beginning of what was just written.
    buffer_.seek(0)
    restored = pickle.load(buffer_)