Update serialization tests for tokenizer

This commit is contained in:
ines 2017-06-03 17:05:28 +02:00
parent 7c919aeb09
commit 3152ee5ca2

View File

@ -1,17 +1,25 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
from ..util import make_tempdir from ...util import get_lang_class
from ..util import make_tempdir, assert_packed_msg_equal
import pytest import pytest
def load_tokenizer(b):
    """Reconstruct an English tokenizer from serialized bytes.

    Creates a fresh default English tokenizer via the language class
    registry, then restores its state from *b* (the output of
    ``Tokenizer.to_bytes()``).
    """
    tokenizer = get_lang_class('en').Defaults.create_tokenizer()
    tokenizer.from_bytes(b)
    return tokenizer
@pytest.mark.parametrize('text', ["I💜you", "theyre", "“hello”"])
def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text):
    """Serializing and deserializing a tokenizer must not change its output."""
    serialized = en_tokenizer.to_bytes()
    new_tokenizer = load_tokenizer(serialized)
    # Compare the packed msgpack payloads rather than raw bytes: the raw
    # byte strings are not guaranteed to be identical even when the
    # serialized contents are equivalent.
    assert_packed_msg_equal(new_tokenizer.to_bytes(), en_tokenizer.to_bytes())
    tokens_before = [token.text for token in en_tokenizer(text)]
    tokens_after = [token.text for token in new_tokenizer(text)]
    assert tokens_before == tokens_after