From f8de403483f587e39ddd7148807d305d762b7736 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 12 Oct 2015 17:00:01 +1100 Subject: [PATCH] * Work on pickling Vocab instances. The current implementation is not correct, but it may serve to see whether this approach is workable. Pickling is necessary to address Issue #125 --- spacy/vocab.pyx | 12 +++++++++--- tests/vocab/test_vocab.py | 11 +++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 7f07a64ba..dd6792104 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -99,7 +99,7 @@ cdef class Vocab: return self.length def __reduce__(self): - tmp_dir = tempfile.mkdtmp() + tmp_dir = tempfile.mkdtemp() lex_loc = path.join(tmp_dir, 'lexemes.bin') str_loc = path.join(tmp_dir, 'strings.txt') map_loc = path.join(tmp_dir, 'tag_map.json') @@ -108,7 +108,7 @@ cdef class Vocab: self.strings.dump(str_loc) json.dump(self.morphology.tag_map, open(map_loc, 'w')) - return (Vocab.from_dir, (tmp_dir, self.get_lex_attr), None, None) + return (unpickle_vocab, (tmp_dir,), None, None) cdef const LexemeC* get(self, Pool mem, unicode string) except NULL: '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme @@ -353,7 +353,13 @@ cdef class Vocab: return vec_len -copy_reg.constructor(Vocab.from_dir) +def unpickle_vocab(data_dir): + # TODO: This needs fixing --- the trouble is, we can't pickle staticmethods, + # so we need to fiddle with the design of Language a little bit. + from .language import Language + return Vocab.from_dir(data_dir, Language.default_lex_attrs()) + +copy_reg.constructor(unpickle_vocab) def write_binary_vectors(in_loc, out_loc): diff --git a/tests/vocab/test_vocab.py b/tests/vocab/test_vocab.py index 153e0d546..1ab3746f3 100644 --- a/tests/vocab/test_vocab.py +++ b/tests/vocab/test_vocab.py @@ -1,10 +1,13 @@ from __future__ import unicode_literals import pytest +import StringIO +import pickle from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA from spacy.parts_of_speech import NOUN, VERB + def test_neq(en_vocab): addr = en_vocab['Hello'] assert en_vocab['bye'].orth != addr.orth @@ -38,3 +41,11 @@ def test_symbols(en_vocab): assert en_vocab.strings['ORTH'] == ORTH assert en_vocab.strings['PROB'] == PROB + +def test_pickle_vocab(en_vocab): + file_ = StringIO.StringIO() + pickle.dump(en_vocab, file_) + + file_.seek(0) + + loaded = pickle.load(file_)