Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-25 00:34:20 +03:00)

Merge pull request #149 from chrisdubois/pickle-patch

Add __reduce__ to Tokenizer so that English pickles.

Commit 3a6e48e814
@@ -19,6 +19,7 @@ cdef class Tokenizer:
     cdef object _prefix_re
     cdef object _suffix_re
     cdef object _infix_re
+    cdef object _rules

     cpdef Doc tokens_from_list(self, list strings)
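The new _rules slot stores the special-case tokenization rules the instance was constructed with. Previously the rules were only consumed by _load_special_tokenization and then discarded; __reduce__ in the next hunk needs them again to rebuild the tokenizer.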
@@ -29,6 +29,16 @@ cdef class Tokenizer:
         self._infix_re = infix_re
         self.vocab = vocab
         self._load_special_tokenization(rules)
+        self._rules = rules
+
+    def __reduce__(self):
+        args = (self.vocab,
+                self._rules,
+                self._prefix_re,
+                self._suffix_re,
+                self._infix_re)
+
+        return (self.__class__, args, None, None)

     @classmethod
     def from_dir(cls, Vocab vocab, data_dir):
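The change follows the standard pickle reduce protocol: __reduce__ returns a (callable, args, state, listitems) tuple, pickling stores it, and unpickling calls callable(*args), so the tokenizer is rebuilt through its own constructor instead of serializing its C-level internals field by field. The two trailing None values mean there is no extra __setstate__ state and no item iterator. A minimal sketch of the same pattern, using a hypothetical pure-Python MiniTokenizer rather than spaCy's actual class:

import pickle


class MiniTokenizer(object):
    # Toy stand-in for an extension type whose internals pickle cannot
    # serialize directly; __reduce__ routes through the constructor.
    def __init__(self, vocab, rules):
        self.vocab = vocab
        self.rules = rules

    def __reduce__(self):
        args = (self.vocab, self.rules)
        # (callable, args, state, listitems): None state and listitems
        # mean reconstruction happens entirely via __init__.
        return (self.__class__, args, None, None)


tok = MiniTokenizer({'the': 0}, {"don't": ["do", "n't"]})
restored = pickle.loads(pickle.dumps(tok))
assert restored.rules == tok.rules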
@@ -1,8 +1,9 @@
-import pytest
-import io
 import cloudpickle
+import io
+import os
 import pickle
+import pytest
+import tempfile


 @pytest.mark.models
 def test_pickle_english(EN):
@@ -12,4 +13,15 @@ def test_pickle_english(EN):
     file_.seek(0)

     loaded = pickle.load(file_)
+    assert loaded is not None
+
+
+@pytest.mark.models
+def test_cloudpickle_to_file(EN):
+    f = tempfile.NamedTemporaryFile(delete=False)
+    p = cloudpickle.CloudPickler(f)
+    p.dump(EN)
+    f.close()
+    loaded_en = cloudpickle.load(open(f.name))
+    os.unlink(f.name)
+    doc = loaded_en(unicode('test parse'))
+    assert len(doc) == 2
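Note that test_cloudpickle_to_file is Python 2 code: it calls the unicode builtin and reopens the temp file in text mode. A rough Python 3 equivalent of the same file round-trip, assuming cloudpickle is installed and nlp is the loaded pipeline, would need str and binary mode, as in this sketch:

import os
import tempfile

import cloudpickle


def roundtrip_via_file(nlp):
    # Dump to a named temp file, reload in binary mode, then clean up.
    f = tempfile.NamedTemporaryFile(delete=False)
    cloudpickle.CloudPickler(f).dump(nlp)
    f.close()
    with open(f.name, 'rb') as in_:
        loaded = cloudpickle.load(in_)
    os.unlink(f.name)
    return loaded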
@@ -2,6 +2,19 @@
 from __future__ import unicode_literals

 import pytest
+import io
+import pickle
+import cloudpickle
+import tempfile
+
+
+@pytest.mark.models
+def test_pickle(en_tokenizer):
+    file_ = io.BytesIO()
+    cloudpickle.dump(en_tokenizer, file_)
+    file_.seek(0)
+    loaded = pickle.load(file_)
+    assert loaded is not None


 def test_no_word(en_tokenizer):
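The in-memory variant works because cloudpickle emits an ordinary pickle stream: cloudpickle.dump handles objects the stdlib pickler may reject, but the bytes it writes can be read back with plain pickle.load. The same round-trip with a simple stand-in object:

import io
import pickle

import cloudpickle

obj = {'tokens': ['test', 'parse']}  # stand-in for the en_tokenizer fixture
file_ = io.BytesIO()
cloudpickle.dump(obj, file_)  # cloudpickle writes a standard pickle stream
file_.seek(0)
assert pickle.load(file_) == obj  # so stdlib pickle can load it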
@@ -108,7 +121,7 @@ def test_cnts5(en_tokenizer):
 # text = """Today is Tuesday.Mr."""
 # tokens = en_tokenizer(text)
 # assert len(tokens) == 5
 # assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.']


 def test_cnts6(en_tokenizer):