2021-08-19 15:37:47 +03:00
|
|
|
import pickle
|
|
|
|
|
2020-06-08 17:29:05 +03:00
|
|
|
from spacy.lang.ja import Japanese
|
2023-06-14 18:48:41 +03:00
|
|
|
|
2020-06-08 17:29:05 +03:00
|
|
|
from ...util import make_tempdir
|
|
|
|
|
|
|
|
|
|
|
|
def test_ja_tokenizer_serialize(ja_tokenizer):
    """Round-trip the Japanese tokenizer through bytes and disk.

    Verifies that (1) the tokenizer's serialized form is reproduced exactly
    after ``from_bytes``/``from_disk`` into a fresh pipeline, and (2) a
    non-default ``split_mode`` set via the nlp config survives whole-pipeline
    byte and disk round-trips.
    """
    serialized = ja_tokenizer.to_bytes()

    # Bytes round-trip: a fresh pipeline loaded from those bytes must
    # re-serialize to the identical payload, with the default split mode.
    fresh = Japanese()
    fresh.tokenizer.from_bytes(serialized)
    assert serialized == fresh.tokenizer.to_bytes()
    assert fresh.tokenizer.split_mode is None

    # Disk round-trip for the tokenizer on its own.
    with make_tempdir() as tmp_dir:
        target = tmp_dir / "tokenizer"
        ja_tokenizer.to_disk(target)
        fresh = Japanese()
        fresh.tokenizer.from_disk(target)
        assert serialized == fresh.tokenizer.to_bytes()
        assert fresh.tokenizer.split_mode is None

    # split mode is (de)serialized correctly
    configured = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
    pipeline_bytes = configured.to_bytes()

    restored = Japanese()
    restored.from_bytes(pipeline_bytes)
    assert pipeline_bytes == restored.to_bytes()
    assert restored.tokenizer.split_mode == "B"

    with make_tempdir() as tmp_dir:
        configured.to_disk(tmp_dir)
        restored = Japanese()
        restored.from_disk(tmp_dir)
        assert pipeline_bytes == restored.to_bytes()
        assert restored.tokenizer.split_mode == "B"
|
2021-08-19 15:37:47 +03:00
|
|
|
|
|
|
|
|
|
|
|
def test_ja_tokenizer_pickle(ja_tokenizer):
    """A pickled-and-restored tokenizer must keep its serialized state."""
    payload = pickle.dumps(ja_tokenizer)
    restored = pickle.loads(payload)
    # Compare via to_bytes(): equal payloads mean equivalent tokenizers.
    assert ja_tokenizer.to_bytes() == restored.to_bytes()
|