spaCy/spacy/tests/lang/ja/test_serialize.py
adrianeboyd 3bf111585d
Update Japanese tokenizer config and add serialization (#5562)
* Use `config` dict for tokenizer settings
* Add serialization of split mode setting
* Add tests for tokenizer split modes and serialization of split mode
setting

Based on #5561
2020-06-08 16:29:05 +02:00

38 lines
1.1 KiB
Python

# coding: utf-8
from __future__ import unicode_literals
import pytest
from spacy.lang.ja import Japanese
from ...util import make_tempdir
def test_ja_tokenizer_serialize(ja_tokenizer):
tokenizer_bytes = ja_tokenizer.to_bytes()
nlp = Japanese()
nlp.tokenizer.from_bytes(tokenizer_bytes)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
assert nlp.tokenizer.split_mode == None
with make_tempdir() as d:
file_path = d / "tokenizer"
ja_tokenizer.to_disk(file_path)
nlp = Japanese()
nlp.tokenizer.from_disk(file_path)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
assert nlp.tokenizer.split_mode == None
# split mode is (de)serialized correctly
nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}})
nlp_r = Japanese()
nlp_bytes = nlp.to_bytes()
nlp_r.from_bytes(nlp_bytes)
assert nlp_bytes == nlp_r.to_bytes()
assert nlp_r.tokenizer.split_mode == "B"
with make_tempdir() as d:
nlp.to_disk(d)
nlp_r = Japanese()
nlp_r.from_disk(d)
assert nlp_bytes == nlp_r.to_bytes()
assert nlp_r.tokenizer.split_mode == "B"