mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-15 12:06:25 +03:00
1d59fdbd39
* Adapt tokenization methods from `pyvi` to preserve text encoding and whitespace
* Add serialization support similar to Chinese and Japanese

Note: as for Chinese and Japanese, some settings are duplicated in `config.cfg` and `tokenizer/cfg`.
34 lines
1.1 KiB
Python
34 lines
1.1 KiB
Python
from spacy.lang.vi import Vietnamese
|
|
from ...util import make_tempdir
|
|
|
|
|
|
def test_vi_tokenizer_serialize(vi_tokenizer):
    """Check that the Vietnamese tokenizer survives serialization round trips.

    Two things are verified, each through both the bytes API and the disk API:

    1. The tokenizer supplied by the ``vi_tokenizer`` fixture can be dumped
       and loaded into the tokenizer of a fresh ``Vietnamese`` pipeline, which
       then re-serializes to the same bytes and reports ``use_pyvi is True``.
    2. A whole pipeline created with ``use_pyvi`` set to ``False`` in its
       config can be dumped and loaded into a fresh ``Vietnamese`` pipeline,
       which then re-serializes to the same bytes and reports
       ``use_pyvi is False``.
    """
    # --- Tokenizer only: bytes round trip -------------------------------
    tokenizer_bytes = vi_tokenizer.to_bytes()
    nlp = Vietnamese()
    nlp.tokenizer.from_bytes(tokenizer_bytes)
    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
    assert nlp.tokenizer.use_pyvi is True

    # --- Tokenizer only: disk round trip --------------------------------
    with make_tempdir() as d:
        file_path = d / "tokenizer"
        vi_tokenizer.to_disk(file_path)
        nlp = Vietnamese()
        nlp.tokenizer.from_disk(file_path)
        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
        assert nlp.tokenizer.use_pyvi is True

    # --- Whole pipeline: the use_pyvi setting is (de)serialized correctly --
    # The source pipeline is built with use_pyvi=False and the receiving
    # pipeline is a plain Vietnamese(), so the final assertion only holds if
    # the setting is actually carried through the serialized data.
    nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
    nlp_bytes = nlp.to_bytes()
    nlp_r = Vietnamese()
    nlp_r.from_bytes(nlp_bytes)
    assert nlp_bytes == nlp_r.to_bytes()
    # Identity check (not ``== False``) so a falsy non-bool such as 0 fails,
    # matching the ``is True`` assertions above.
    assert nlp_r.tokenizer.use_pyvi is False

    with make_tempdir() as d:
        nlp.to_disk(d)
        nlp_r = Vietnamese()
        nlp_r.from_disk(d)
        assert nlp_bytes == nlp_r.to_bytes()
        assert nlp_r.tokenizer.use_pyvi is False
|