From dac8fe7bdbdc9bb13730a9da5ced9fac78bb9262 Mon Sep 17 00:00:00 2001
From: Chris DuBois
Date: Fri, 23 Oct 2015 22:18:47 -0700
Subject: [PATCH] Add __reduce__ to Tokenizer so that English pickles.

- Add tests to test_pickle and test_tokenizer that save to tempfiles.
---
Reviewer notes (placed after the three-dash line, so not part of the commit
message):

- What the patch does: Tokenizer keeps the `rules` value in a new `_rules`
  attribute (declared in tokenizer.pxd), and `__reduce__` returns
  `(self.__class__, args, None, None)` where `args` is
  `(vocab, _rules, _prefix_re, _suffix_re, _infix_re)`, so pickle rebuilds
  the tokenizer by calling the class with those arguments again.
- NOTE(review): line breaks and indentation were restored from a
  whitespace-collapsed copy of this patch. Blank context lines were placed
  to match each hunk's line counts; their exact positions, and the
  indentation inside the commented-out test in the last hunk, should be
  confirmed against the repository.
- NOTE(review): the last test_tokenizer.py hunk is a whitespace-only change
  to a commented-out assert. The exact whitespace difference is not
  recoverable, so the removed and added lines appear identical here.
- NOTE(review): test_cloudpickle_to_file reopens the temp file with
  `open(f.name)` in the default text mode and never closes that handle;
  `open(f.name, 'rb')` inside a `with` block would be safer in a follow-up.
- NOTE(review): in the lines shown, tests/tokenizer/test_tokenizer.py
  imports `tempfile`, but the added test only uses `io.BytesIO`.

 spacy/tokenizer.pxd               |  1 +
 spacy/tokenizer.pyx               | 10 ++++++++++
 tests/test_pickle.py              | 18 +++++++++++++++---
 tests/tokenizer/test_tokenizer.py | 15 ++++++++++++++-
 4 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index 9d60d2a6e..c07e87bbc 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -19,6 +19,7 @@ cdef class Tokenizer:
     cdef object _prefix_re
     cdef object _suffix_re
     cdef object _infix_re
+    cdef object _rules
 
     cpdef Doc tokens_from_list(self, list strings)
 
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index ef9c26c01..f0d664c09 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -29,6 +29,16 @@ cdef class Tokenizer:
         self._infix_re = infix_re
         self.vocab = vocab
         self._load_special_tokenization(rules)
+        self._rules = rules
+
+    def __reduce__(self):
+        args = (self.vocab,
+                self._rules,
+                self._prefix_re,
+                self._suffix_re,
+                self._infix_re)
+
+        return (self.__class__, args, None, None)
 
     @classmethod
     def from_dir(cls, Vocab vocab, data_dir):
diff --git a/tests/test_pickle.py b/tests/test_pickle.py
index a3d54c627..540e54486 100644
--- a/tests/test_pickle.py
+++ b/tests/test_pickle.py
@@ -1,8 +1,9 @@
-import pytest
-import io
 import cloudpickle
+import io
+import os
 import pickle
-
+import pytest
+import tempfile
 
 @pytest.mark.models
 def test_pickle_english(EN):
@@ -12,4 +13,15 @@ def test_pickle_english(EN):
     file_.seek(0)
 
     loaded = pickle.load(file_)
+    assert loaded is not None
 
+@pytest.mark.models
+def test_cloudpickle_to_file(EN):
+    f = tempfile.NamedTemporaryFile(delete=False)
+    p = cloudpickle.CloudPickler(f)
+    p.dump(EN)
+    f.close()
+    loaded_en = cloudpickle.load(open(f.name))
+    os.unlink(f.name)
+    doc = loaded_en(unicode('test parse'))
+    assert len(doc) == 2
diff --git a/tests/tokenizer/test_tokenizer.py b/tests/tokenizer/test_tokenizer.py
index abf09dd03..be93b9953 100644
--- a/tests/tokenizer/test_tokenizer.py
+++ b/tests/tokenizer/test_tokenizer.py
@@ -2,6 +2,19 @@
 from __future__ import unicode_literals
 
 import pytest
+import io
+import pickle
+import cloudpickle
+import tempfile
+
+
+@pytest.mark.models
+def test_pickle(en_tokenizer):
+    file_ = io.BytesIO()
+    cloudpickle.dump(en_tokenizer, file_)
+    file_.seek(0)
+    loaded = pickle.load(file_)
+    assert loaded is not None
 
 
 def test_no_word(en_tokenizer):
@@ -108,7 +121,7 @@ def test_cnts5(en_tokenizer):
 #    text = """Today is Tuesday.Mr."""
 #    tokens = en_tokenizer(text)
 #    assert len(tokens) == 5
-#    assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.']
+#    assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.']
 
 
 def test_cnts6(en_tokenizer):