* Work on pickling Vocab instances. The current implementation is not correct, but it may serve to show whether this approach is workable. Pickling is necessary to address Issue #125.

Matthew Honnibal 2015-10-12 17:00:01 +11:00
parent 85e7944572
commit f8de403483
2 changed files with 20 additions and 3 deletions
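
Background for the diff below: pickle's __reduce__ protocol lets an object name a module-level reconstructor plus the arguments to call it with; pickle stores the function by reference and invokes it at load time. A minimal sketch of the pattern this commit moves toward (Thing and unpickle_thing are illustrative names, not spaCy's API):

    import pickle

    def unpickle_thing(data_dir):
        # Module-level functions pickle by reference, so this
        # reconstructor can be looked up again at load time.
        return Thing(data_dir)

    class Thing(object):
        def __init__(self, data_dir):
            self.data_dir = data_dir

        def __reduce__(self):
            # Persist state somewhere (the commit dumps lexemes, strings,
            # and the tag map to a temp dir), then tell pickle how to
            # rebuild the object: pickle calls unpickle_thing(self.data_dir).
            return (unpickle_thing, (self.data_dir,))

    thing = Thing('/tmp/thing-data')
    restored = pickle.loads(pickle.dumps(thing))
    assert restored.data_dir == thing.data_dir

In the commit itself, the reconstructor is additionally registered with copy_reg.constructor(), which declares it a valid constructor for unpickling under Python 2.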


@@ -99,7 +99,7 @@ cdef class Vocab:
         return self.length

     def __reduce__(self):
-        tmp_dir = tempfile.mkdtmp()
+        tmp_dir = tempfile.mkdtemp()
         lex_loc = path.join(tmp_dir, 'lexemes.bin')
         str_loc = path.join(tmp_dir, 'strings.txt')
         map_loc = path.join(tmp_dir, 'tag_map.json')
@@ -108,7 +108,7 @@ cdef class Vocab:
         self.strings.dump(str_loc)
         json.dump(self.morphology.tag_map, open(map_loc, 'w'))

-        return (Vocab.from_dir, (tmp_dir, self.get_lex_attr), None, None)
+        return (unpickle_vocab, (tmp_dir,), None, None)

     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
         '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
@@ -353,7 +353,13 @@ cdef class Vocab:
         return vec_len


-copy_reg.constructor(Vocab.from_dir)
+def unpickle_vocab(data_dir):
+    # TODO: This needs fixing --- the trouble is, we can't pickle staticmethods,
+    # so we need to fiddle with the design of Language a little bit.
+    from .language import Language
+    return Vocab.from_dir(data_dir, Language.default_lex_attrs())
+
+copy_reg.constructor(unpickle_vocab)


def write_binary_vectors(in_loc, out_loc):
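
The TODO in the last hunk points at the real constraint here: Python 2's pickle serializes a function by its module-level name, and a staticmethod such as Vocab.from_dir lives on the class rather than the module, so handing it to pickle (as the previous __reduce__ did) fails. A hypothetical demonstration, using a made-up Loader class rather than spaCy code:

    import pickle

    class Loader(object):
        @staticmethod
        def from_dir(path):
            return Loader()

    try:
        pickle.dumps(Loader.from_dir)
        print('pickled by qualified name (Python 3.4+)')
    except pickle.PicklingError as exc:
        # Python 2 looks functions up as <module>.<name>; from_dir lives
        # on the class, so it is "not found as __main__.from_dir" --
        # hence the module-level unpickle_vocab shim above.
        print(exc)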


@@ -1,10 +1,13 @@
 from __future__ import unicode_literals
 import pytest
+import StringIO
+import pickle

 from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA
+from spacy.parts_of_speech import NOUN, VERB


 def test_neq(en_vocab):
     addr = en_vocab['Hello']
     assert en_vocab['bye'].orth != addr.orth
@@ -38,3 +41,11 @@ def test_symbols(en_vocab):
     assert en_vocab.strings['ORTH'] == ORTH
     assert en_vocab.strings['PROB'] == PROB
+
+
+def test_pickle_vocab(en_vocab):
+    file_ = StringIO.StringIO()
+    pickle.dump(en_vocab, file_)
+    file_.seek(0)
+
+    loaded = pickle.load(file_)
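
A portability note on the new test: pickle emits a byte stream, which Python 2's StringIO.StringIO tolerates only because protocol 0 output is an ASCII str; a version of the same round trip that also runs on Python 3 would buffer with io.BytesIO. A small sketch under that assumption (roundtrip is a hypothetical helper, not part of the test suite):

    import io
    import pickle

    def roundtrip(obj):
        # io.BytesIO is the cross-version counterpart of
        # StringIO.StringIO here: pickle writes bytes, not text.
        buf = io.BytesIO()
        pickle.dump(obj, buf)
        buf.seek(0)
        return pickle.load(buf)

    assert roundtrip({'orth': 'Hello'}) == {'orth': 'Hello'}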