Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-26 01:04:34 +03:00
Work on pickling Vocab instances. The current implementation is not correct, but it may serve to see whether this approach is workable. Pickling is necessary to address Issue #125.
parent 85e7944572
commit f8de403483
@@ -99,7 +99,7 @@ cdef class Vocab:
         return self.length
 
     def __reduce__(self):
-        tmp_dir = tempfile.mkdtmp()
+        tmp_dir = tempfile.mkdtemp()
         lex_loc = path.join(tmp_dir, 'lexemes.bin')
         str_loc = path.join(tmp_dir, 'strings.txt')
         map_loc = path.join(tmp_dir, 'tag_map.json')
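The only change in this first hunk is a one-character typo fix: the `tempfile` module has no `mkdtmp` function, so the old line raised `AttributeError` as soon as a `Vocab` was pickled. `tempfile.mkdtemp()` creates a fresh private directory and returns its path; the caller is responsible for removing it, and this draft never does, which is presumably part of what the commit message means by "not correct". A two-line illustration of that behavior:

    import os.path
    import tempfile

    tmp_dir = tempfile.mkdtemp()      # e.g. '/tmp/tmpa1b2c3' (name varies)
    print(os.path.exists(tmp_dir))    # True, and nothing removes it for us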
@@ -108,7 +108,7 @@ cdef class Vocab:
         self.strings.dump(str_loc)
         json.dump(self.morphology.tag_map, open(map_loc, 'w'))
 
-        return (Vocab.from_dir, (tmp_dir, self.get_lex_attr), None, None)
+        return (unpickle_vocab, (tmp_dir,), None, None)
 
     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
         '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
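This hunk and the previous one implement pickle's `__reduce__` protocol: when a class defines `__reduce__`, pickle records the returned `(callable, args, state, listitems)` tuple instead of trying to serialize the object directly, and unpickling simply calls `callable(*args)`. The callable changes here from the staticmethod `Vocab.from_dir`, with the instance-bound `self.get_lex_attr` in its arguments, to the module-level `unpickle_vocab` defined in the next hunk, since neither a staticmethod nor a bound method pickles cleanly under Python 2. A minimal, self-contained sketch of the same dump-to-a-temp-dir pattern on a toy class; `Thing` and `load_thing` are hypothetical stand-ins for `Vocab` and its `from_dir`:

    import os
    import pickle
    import tempfile


    class Thing(object):
        def __init__(self, words):
            self.words = list(words)

        def save(self, loc):
            with open(loc, 'w') as f:
                f.write('\n'.join(self.words))

        def __reduce__(self):
            # Serialize state to disk, then tell pickle how to rebuild the
            # object: unpickling will call load_thing(tmp_dir).
            tmp_dir = tempfile.mkdtemp()
            self.save(os.path.join(tmp_dir, 'words.txt'))
            # Like the diff, this leaks tmp_dir: nothing ever deletes it.
            return (load_thing, (tmp_dir,), None, None)


    def load_thing(tmp_dir):
        # Must live at module level: pickle stores functions by name.
        with open(os.path.join(tmp_dir, 'words.txt')) as f:
            return Thing(f.read().split('\n'))


    restored = pickle.loads(pickle.dumps(Thing(['hello', 'world'])))
    assert restored.words == ['hello', 'world']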
@@ -353,7 +353,13 @@ cdef class Vocab:
         return vec_len
 
 
-copy_reg.constructor(Vocab.from_dir)
+def unpickle_vocab(data_dir):
+    # TODO: This needs fixing --- the trouble is, we can't pickle staticmethods,
+    # so we need to fiddle with the design of Language a little bit.
+    from .language import Language
+    return Vocab.from_dir(data_dir, Language.default_lex_attrs())
+
+copy_reg.constructor(unpickle_vocab)
 
 
 def write_binary_vectors(in_loc, out_loc):
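`copy_reg.constructor` (the module is spelled `copyreg` in Python 3) registers a callable as a legitimate unpickling constructor. The registered callable must itself be picklable, and under Python 2 pickle can only serialize ordinary module-level functions by name, which is what the in-hunk TODO is about: the staticmethod `Vocab.from_dir` cannot be pickled directly, so this draft wraps it in the module-level `unpickle_vocab`, at the cost of hard-coding `Language.default_lex_attrs()` rather than round-tripping the vocab's own `get_lex_attr`. A compact sketch of the registration pattern, written with the Python 3 names (`Point` and `unpickle_point` are hypothetical):

    import copyreg
    import pickle


    class Point(object):
        def __init__(self, x, y):
            self.x, self.y = x, y

        def __reduce__(self):
            # A 2-tuple is enough when the constructor restores all state.
            return (unpickle_point, (self.x, self.y))


    def unpickle_point(x, y):
        return Point(x, y)


    # Declare unpickle_point a safe constructor. Modern pickle no longer
    # enforces this registry, but the call validates callability and
    # documents intent, as the diff does for unpickle_vocab.
    copyreg.constructor(unpickle_point)

    p = pickle.loads(pickle.dumps(Point(1, 2)))
    assert (p.x, p.y) == (1, 2)

The remaining two hunks are in a test module for the vocabulary; the context lines below show the existing `test_neq` and `test_symbols` tests over the `en_vocab` fixture.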
@@ -1,10 +1,13 @@
 from __future__ import unicode_literals
 import pytest
+import StringIO
+import pickle
+
 
 from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA
 from spacy.parts_of_speech import NOUN, VERB
 
 
 def test_neq(en_vocab):
     addr = en_vocab['Hello']
     assert en_vocab['bye'].orth != addr.orth
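`StringIO` is the Python 2 module name (spaCy at this point targeted Python 2). In Python 2, `StringIO.StringIO` holds byte strings, so it can receive `pickle.dump` output directly; a Python 3 version of the same imports would presumably use a bytes buffer instead:

    import io
    import pickle

    file_ = io.BytesIO()    # bytes buffer; works with every pickle protocol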
@@ -38,3 +41,11 @@ def test_symbols(en_vocab):
     assert en_vocab.strings['ORTH'] == ORTH
     assert en_vocab.strings['PROB'] == PROB
 
+
+def test_pickle_vocab(en_vocab):
+    file_ = StringIO.StringIO()
+    pickle.dump(en_vocab, file_)
+
+    file_.seek(0)
+
+    loaded = pickle.load(file_)
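As written, `test_pickle_vocab` only checks that the dump/load round trip completes without raising; `loaded` is never inspected. Given that `unpickle_vocab` hard-codes the lexeme attributes, that may be all this draft can promise. One possible strengthening, hypothetical since `Vocab` defines no `__eq__`, is to compare observable attributes at the end of the test:

    # Appended inside test_pickle_vocab: compare what can be observed.
    assert loaded is not None
    assert len(loaded) == len(en_vocab)
    assert loaded['Hello'].orth == en_vocab['Hello'].orth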