mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 04:08:09 +03:00
Update serialization tests for tokenizer
This commit is contained in:
parent
7c919aeb09
commit
3152ee5ca2
|
@@ -1,17 +1,25 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..util import make_tempdir
|
||||
from ...util import get_lang_class
|
||||
from ..util import make_tempdir, assert_packed_msg_equal
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["I can't do this"])
|
||||
def load_tokenizer(b):
|
||||
tok = get_lang_class('en').Defaults.create_tokenizer()
|
||||
tok.from_bytes(b)
|
||||
return tok
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["I💜you", "they’re", "“hello”"])
|
||||
def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text):
|
||||
tokenizer_b = en_tokenizer.to_bytes()
|
||||
new_tokenizer = en_tokenizer.from_bytes(tokenizer_b)
|
||||
assert new_tokenizer.to_bytes() == tokenizer_b
|
||||
doc1 = en_tokenizer(text)
|
||||
tokenizer = en_tokenizer
|
||||
new_tokenizer = load_tokenizer(tokenizer.to_bytes())
|
||||
assert_packed_msg_equal(new_tokenizer.to_bytes(), tokenizer.to_bytes())
|
||||
# assert new_tokenizer.to_bytes() == tokenizer.to_bytes()
|
||||
doc1 = tokenizer(text)
|
||||
doc2 = new_tokenizer(text)
|
||||
assert [token.text for token in doc1] == [token.text for token in doc2]
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user