spaCy/spacy/tests/lang/vi/test_serialize.py

from spacy.lang.vi import Vietnamese
from ...util import make_tempdir


def test_vi_tokenizer_serialize(vi_tokenizer):
    """Check that the Vietnamese tokenizer survives serialization round trips.

    Exercises bytes and on-disk round trips of the tokenizer on its own, then
    of a whole pipeline created with ``use_pyvi`` set to ``False`` in its
    config.

    vi_tokenizer: tokenizer fixture (presumably provided by the test suite's
        conftest -- confirm). Its serialized state is expected to leave
        ``use_pyvi`` set to ``True`` once loaded into a fresh pipeline.
    """
    tokenizer_bytes = vi_tokenizer.to_bytes()

    # Tokenizer only: bytes round trip into a fresh pipeline.
    nlp = Vietnamese()
    nlp.tokenizer.from_bytes(tokenizer_bytes)
    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
    assert nlp.tokenizer.use_pyvi is True

    # Tokenizer only: disk round trip into a fresh pipeline.
    with make_tempdir() as d:
        file_path = d / "tokenizer"
        vi_tokenizer.to_disk(file_path)
        nlp = Vietnamese()
        nlp.tokenizer.from_disk(file_path)
        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
        assert nlp.tokenizer.use_pyvi is True

    # The use_pyvi setting is (de)serialized correctly with the whole pipeline.
    nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
    nlp_bytes = nlp.to_bytes()
    nlp_r = Vietnamese()
    nlp_r.from_bytes(nlp_bytes)
    assert nlp_bytes == nlp_r.to_bytes()
    # Identity check, matching the `is True` assertions above, so that a
    # value such as 0 cannot pass for False.
    assert nlp_r.tokenizer.use_pyvi is False

    with make_tempdir() as d:
        nlp.to_disk(d)
        nlp_r = Vietnamese()
        nlp_r.from_disk(d)
        assert nlp_bytes == nlp_r.to_bytes()
        assert nlp_r.tokenizer.use_pyvi is False
Update Vietnamese tokenizer (#8099)

* Adapt tokenization methods from `pyvi` to preserve text encoding and whitespace
* Add serialization support similar to Chinese and Japanese

Note: as for Chinese and Japanese, some settings are duplicated in `config.cfg` and `tokenizer/cfg`.

2021-05-17 11:16:20 +03:00
			`from spacy.lang.vi import Vietnamese`
			`from ...util import make_tempdir`


			`def test_vi_tokenizer_serialize(vi_tokenizer):`
			`tokenizer_bytes = vi_tokenizer.to_bytes()`
			`nlp = Vietnamese()`
			`nlp.tokenizer.from_bytes(tokenizer_bytes)`
			`assert tokenizer_bytes == nlp.tokenizer.to_bytes()`
			`assert nlp.tokenizer.use_pyvi is True`

			`with make_tempdir() as d:`
			`file_path = d / "tokenizer"`
			`vi_tokenizer.to_disk(file_path)`
			`nlp = Vietnamese()`
			`nlp.tokenizer.from_disk(file_path)`
			`assert tokenizer_bytes == nlp.tokenizer.to_bytes()`
			`assert nlp.tokenizer.use_pyvi is True`

			`# mode is (de)serialized correctly`
			`nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})`
			`nlp_bytes = nlp.to_bytes()`
			`nlp_r = Vietnamese()`
			`nlp_r.from_bytes(nlp_bytes)`
			`assert nlp_bytes == nlp_r.to_bytes()`
			`assert nlp_r.tokenizer.use_pyvi == False`

			`with make_tempdir() as d:`
			`nlp.to_disk(d)`
			`nlp_r = Vietnamese()`
			`nlp_r.from_disk(d)`
			`assert nlp_bytes == nlp_r.to_bytes()`
			`assert nlp_r.tokenizer.use_pyvi == False`