mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-14 05:37:03 +03:00
db55577c45
* Remove unicode declarations
* Remove Python 3.5 and 2.7 from CI
* Don't require pathlib
* Replace compat helpers
* Remove OrderedDict
* Use f-strings
* Set Cython compiler language level
* Fix typo
* Re-add OrderedDict for Table
* Update setup.cfg
* Revert CONTRIBUTING.md
* Revert lookups.md
* Revert top-level.md
* Small adjustments and docs [ci skip]
42 lines
1.5 KiB
Python
42 lines
1.5 KiB
Python
import pytest
|
||
from spacy.util import get_lang_class
|
||
from spacy.tokenizer import Tokenizer
|
||
|
||
from ..util import make_tempdir, assert_packed_msg_equal
|
||
|
||
|
||
def load_tokenizer(b):
    """Create a new tokenizer for the "en" language class and load bytes into it.

    b: Serialized tokenizer data, handed unchanged to ``from_bytes``.
    RETURNS: The newly created tokenizer after ``from_bytes`` has been called.
    """
    lang_defaults = get_lang_class("en").Defaults
    restored = lang_defaults.create_tokenizer()
    restored.from_bytes(b)
    return restored
|
||
|
||
|
||
def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
    """A tokenizer built with only some of its functions set (here just
    ``suffix_search``) must survive a to_bytes/from_bytes round trip
    without raising (see #2494)."""
    partial_tokenizer = Tokenizer(
        en_vocab,
        suffix_search=en_tokenizer.suffix_search,
    )
    payload = partial_tokenizer.to_bytes()
    # Loading into a bare tokenizer is the check: it only has to succeed.
    blank_tokenizer = Tokenizer(en_vocab)
    blank_tokenizer.from_bytes(payload)
|
||
|
||
|
||
@pytest.mark.skip(reason="Currently unreliable across platforms")
@pytest.mark.parametrize("text", ["I💜you", "they’re", "“hello”"])
def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text):
    """Serialize the tokenizer to bytes, load it into a new tokenizer, and
    check that both the serialized form and the produced tokens match."""
    serialized = en_tokenizer.to_bytes()
    restored = load_tokenizer(serialized)

    # Compare the serialized state twice: once through the packed-message
    # helper and once as a strict byte-for-byte equality check.
    assert_packed_msg_equal(restored.to_bytes(), en_tokenizer.to_bytes())
    assert restored.to_bytes() == en_tokenizer.to_bytes()

    # Both tokenizers must split the sample text into the same token texts.
    original_doc = en_tokenizer(text)
    restored_doc = restored(text)
    expected_texts = [tok.text for tok in original_doc]
    actual_texts = [tok.text for tok in restored_doc]
    assert expected_texts == actual_texts
|
||
|
||
|
||
@pytest.mark.skip(reason="Currently unreliable across platforms")
def test_serialize_tokenizer_roundtrip_disk(en_tokenizer):
    """Write the tokenizer to a temporary directory, load it into a separate
    tokenizer, and check that both serialize to the same bytes.

    The saved state is deliberately loaded into a freshly created tokenizer
    rather than back into ``en_tokenizer``: loading into the object that was
    just saved would not exercise an independent restore.
    """
    tokenizer = en_tokenizer
    with make_tempdir() as d:
        file_path = d / "tokenizer"
        tokenizer.to_disk(file_path)
        # Build the target the same way load_tokenizer() does, and do not
        # rely on the return value of from_disk().
        tokenizer_d = get_lang_class("en").Defaults.create_tokenizer()
        tokenizer_d.from_disk(file_path)
        assert tokenizer.to_bytes() == tokenizer_d.to_bytes()
|