Merge pull request #149 from chrisdubois/pickle-patch

Add __reduce__ to Tokenizer so that English pickles.
This commit is contained in:
Matthew Honnibal 2015-10-25 15:30:31 +11:00
commit 3a6e48e814
4 changed files with 40 additions and 4 deletions

View File

@ -19,6 +19,7 @@ cdef class Tokenizer:
cdef object _prefix_re
cdef object _suffix_re
cdef object _infix_re
cdef object _rules
cpdef Doc tokens_from_list(self, list strings)

View File

@ -29,6 +29,16 @@ cdef class Tokenizer:
self._infix_re = infix_re
self.vocab = vocab
self._load_special_tokenization(rules)
self._rules = rules
def __reduce__(self):
    """Support pickling of the Tokenizer.

    Returns the standard pickle-protocol tuple ``(callable, args, state,
    listitems)``: unpickling calls ``self.__class__(*args)``, rebuilding
    the tokenizer from its vocab, special-case rules, and the three
    compiled regex objects. The trailing ``None, None`` mean no extra
    state and no list items are stored.
    """
    # Argument order must match the Tokenizer constructor's signature.
    args = (self.vocab,
            self._rules,
            self._prefix_re,
            self._suffix_re,
            self._infix_re)
    return (self.__class__, args, None, None)
@classmethod
def from_dir(cls, Vocab vocab, data_dir):

View File

@ -1,8 +1,9 @@
import pytest
import io
import cloudpickle
import io
import os
import pickle
import pytest
import tempfile
@pytest.mark.models
def test_pickle_english(EN):
@ -12,4 +13,15 @@ def test_pickle_english(EN):
    file_.seek(0)
    loaded = pickle.load(file_)
assert loaded is not None
@pytest.mark.models
def test_cloudpickle_to_file(EN):
    """Round-trip the loaded English pipeline through cloudpickle via a
    temporary file, then check the restored pipeline still tokenizes.

    EN is the (session-scoped) English pipeline fixture.
    """
    f = tempfile.NamedTemporaryFile(delete=False)
    p = cloudpickle.CloudPickler(f)
    p.dump(EN)
    f.close()
    # Pickle data is binary: the file must be reopened in 'rb' mode
    # (text mode breaks on Python 3). Use `with` so the handle is
    # closed before the file is unlinked.
    with open(f.name, 'rb') as file_:
        loaded_en = cloudpickle.load(file_)
    os.unlink(f.name)
    # u'' literal instead of unicode(...): `unicode` does not exist on
    # Python 3, while the u-prefix works on both 2 and 3.
    doc = loaded_en(u'test parse')
    assert len(doc) == 2

View File

@ -2,6 +2,19 @@
from __future__ import unicode_literals
import pytest
import io
import pickle
import cloudpickle
import tempfile
@pytest.mark.models
def test_pickle(en_tokenizer):
    """The English tokenizer must survive a cloudpickle dump followed by
    a plain-pickle load from an in-memory buffer."""
    buf = io.BytesIO()
    cloudpickle.dump(en_tokenizer, buf)
    buf.seek(0)
    restored = pickle.load(buf)
    assert restored is not None
def test_no_word(en_tokenizer):
@ -108,7 +121,7 @@ def test_cnts5(en_tokenizer):
# text = """Today is Tuesday.Mr."""
# tokens = en_tokenizer(text)
# assert len(tokens) == 5
# assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.']
def test_cnts6(en_tokenizer):