From c5de9b463a30dbb1cd016d4919e4348e55416d5c Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Thu, 19 Aug 2021 14:37:47 +0200
Subject: [PATCH] Update custom tokenizer APIs and pickling (#8972)

* Fix incorrect pickling of Japanese and Korean pipelines, which led to the
  entire pipeline being reset if pickled
* Enable pickling of Vietnamese tokenizer
* Update tokenizer APIs for Chinese, Japanese, Korean, Thai, and Vietnamese
  so that only the `Vocab` is required for initialization
---
 spacy/lang/ja/__init__.py             | 17 +++++++----------
 spacy/lang/ko/__init__.py             | 19 ++++++++-----------
 spacy/lang/th/__init__.py             |  7 ++++---
 spacy/lang/vi/__init__.py             | 10 +++++++---
 spacy/lang/zh/__init__.py             |  7 ++++---
 spacy/tests/lang/ja/test_serialize.py |  8 ++++++++
 spacy/tests/lang/ko/test_serialize.py | 24 ++++++++++++++++++++++++
 spacy/tests/lang/th/test_serialize.py | 24 ++++++++++++++++++++++++
 spacy/tests/lang/vi/test_serialize.py |  8 ++++++++
 9 files changed, 94 insertions(+), 30 deletions(-)
 create mode 100644 spacy/tests/lang/ko/test_serialize.py
 create mode 100644 spacy/tests/lang/th/test_serialize.py

diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index 4e6bf9d3c..12e65413a 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -8,7 +8,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from .tag_map import TAG_MAP
 from .tag_orth_map import TAG_ORTH_MAP
 from .tag_bigram_map import TAG_BIGRAM_MAP
-from ...compat import copy_reg
 from ...errors import Errors
 from ...language import Language
 from ...scorer import Scorer
@@ -16,6 +15,7 @@ from ...symbols import POS
 from ...tokens import Doc
 from ...training import validate_examples
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab
 from ... import util


@@ -31,17 +31,20 @@ split_mode = null
 @registry.tokenizers("spacy.ja.JapaneseTokenizer")
 def create_tokenizer(split_mode: Optional[str] = None):
     def japanese_tokenizer_factory(nlp):
-        return JapaneseTokenizer(nlp, split_mode=split_mode)
+        return JapaneseTokenizer(nlp.vocab, split_mode=split_mode)

     return japanese_tokenizer_factory


 class JapaneseTokenizer(DummyTokenizer):
-    def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
-        self.vocab = nlp.vocab
+    def __init__(self, vocab: Vocab, split_mode: Optional[str] = None) -> None:
+        self.vocab = vocab
         self.split_mode = split_mode
         self.tokenizer = try_sudachi_import(self.split_mode)

+    def __reduce__(self):
+        return JapaneseTokenizer, (self.vocab, self.split_mode)
+
     def __call__(self, text: str) -> Doc:
         # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
         sudachipy_tokens = self.tokenizer.tokenize(text)
@@ -293,10 +296,4 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
     return text_dtokens, text_spaces


-def pickle_japanese(instance):
-    return Japanese, tuple()
-
-
-copy_reg.pickle(Japanese, pickle_japanese)
-
 __all__ = ["Japanese"]
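
Illustration (not part of the patch): with `__reduce__` in place, pickling the Japanese tokenizer rebuilds it from `(vocab, split_mode)` instead of falling back to a freshly reset pipeline. A minimal sketch, assuming SudachiPy is installed since the tokenizer loads it on construction:

    import pickle
    from spacy.lang.ja import Japanese

    nlp = Japanese()  # default tokenizer, split_mode=None
    tok = pickle.loads(pickle.dumps(nlp.tokenizer))
    assert tok.split_mode == nlp.tokenizer.split_mode
    assert tok.to_bytes() == nlp.tokenizer.to_bytes()
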
diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py
index 83c9f4962..daa445e09 100644
--- a/spacy/lang/ko/__init__.py
+++ b/spacy/lang/ko/__init__.py
@@ -1,15 +1,15 @@
-from typing import Optional, Any, Dict
+from typing import Any, Dict

 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
-from ...compat import copy_reg
 from ...scorer import Scorer
 from ...symbols import POS
 from ...training import validate_examples
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab


 DEFAULT_CONFIG = """
@@ -23,17 +23,20 @@ DEFAULT_CONFIG = """
 @registry.tokenizers("spacy.ko.KoreanTokenizer")
 def create_tokenizer():
     def korean_tokenizer_factory(nlp):
-        return KoreanTokenizer(nlp)
+        return KoreanTokenizer(nlp.vocab)

     return korean_tokenizer_factory


 class KoreanTokenizer(DummyTokenizer):
-    def __init__(self, nlp: Optional[Language] = None):
-        self.vocab = nlp.vocab
+    def __init__(self, vocab: Vocab):
+        self.vocab = vocab
         MeCab = try_mecab_import()
         self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")

+    def __reduce__(self):
+        return KoreanTokenizer, (self.vocab,)
+
     def __del__(self):
         self.mecab_tokenizer.__del__()

@@ -106,10 +109,4 @@ def check_spaces(text, tokens):
         yield False


-def pickle_korean(instance):
-    return Korean, tuple()
-
-
-copy_reg.pickle(Korean, pickle_korean)
-
 __all__ = ["Korean"]
diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py
index 219c50c1a..a89d4dc77 100644
--- a/spacy/lang/th/__init__.py
+++ b/spacy/lang/th/__init__.py
@@ -3,6 +3,7 @@ from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab


 DEFAULT_CONFIG = """
@@ -16,13 +17,13 @@ DEFAULT_CONFIG = """
 @registry.tokenizers("spacy.th.ThaiTokenizer")
 def create_thai_tokenizer():
     def thai_tokenizer_factory(nlp):
-        return ThaiTokenizer(nlp)
+        return ThaiTokenizer(nlp.vocab)

     return thai_tokenizer_factory


 class ThaiTokenizer(DummyTokenizer):
-    def __init__(self, nlp: Language) -> None:
+    def __init__(self, vocab: Vocab) -> None:
         try:
             from pythainlp.tokenize import word_tokenize
         except ImportError:
@@ -31,7 +32,7 @@ class ThaiTokenizer(DummyTokenizer):
                 "https://github.com/PyThaiNLP/pythainlp"
             ) from None
         self.word_tokenize = word_tokenize
-        self.vocab = nlp.vocab
+        self.vocab = vocab

     def __call__(self, text: str) -> Doc:
         words = list(self.word_tokenize(text))
diff --git a/spacy/lang/vi/__init__.py b/spacy/lang/vi/__init__.py
index b6d873a13..afc715ff3 100644
--- a/spacy/lang/vi/__init__.py
+++ b/spacy/lang/vi/__init__.py
@@ -9,6 +9,7 @@ from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab
 from ... import util


@@ -24,14 +25,14 @@ use_pyvi = true
 @registry.tokenizers("spacy.vi.VietnameseTokenizer")
 def create_vietnamese_tokenizer(use_pyvi: bool = True):
     def vietnamese_tokenizer_factory(nlp):
-        return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
+        return VietnameseTokenizer(nlp.vocab, use_pyvi=use_pyvi)

     return vietnamese_tokenizer_factory


 class VietnameseTokenizer(DummyTokenizer):
-    def __init__(self, nlp: Language, use_pyvi: bool = False):
-        self.vocab = nlp.vocab
+    def __init__(self, vocab: Vocab, use_pyvi: bool = False):
+        self.vocab = vocab
         self.use_pyvi = use_pyvi
         if self.use_pyvi:
             try:
@@ -45,6 +46,9 @@ class VietnameseTokenizer(DummyTokenizer):
                 )
                 raise ImportError(msg) from None

+    def __reduce__(self):
+        return VietnameseTokenizer, (self.vocab, self.use_pyvi)
+
     def __call__(self, text: str) -> Doc:
         if self.use_pyvi:
             words = self.pyvi_tokenize(text)
diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index 9a8a21a63..c6dd7bb85 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -11,6 +11,7 @@ from ...scorer import Scorer
 from ...tokens import Doc
 from ...training import validate_examples, Example
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from ... import util
@@ -48,14 +49,14 @@ class Segmenter(str, Enum):
 @registry.tokenizers("spacy.zh.ChineseTokenizer")
 def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
     def chinese_tokenizer_factory(nlp):
-        return ChineseTokenizer(nlp, segmenter=segmenter)
+        return ChineseTokenizer(nlp.vocab, segmenter=segmenter)

     return chinese_tokenizer_factory


 class ChineseTokenizer(DummyTokenizer):
-    def __init__(self, nlp: Language, segmenter: Segmenter = Segmenter.char):
-        self.vocab = nlp.vocab
+    def __init__(self, vocab: Vocab, segmenter: Segmenter = Segmenter.char):
+        self.vocab = vocab
         if isinstance(segmenter, Segmenter):
             segmenter = segmenter.value
         self.segmenter = segmenter
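
Illustration (not part of the patch): the updated signatures mean any of these tokenizers can be constructed from a bare `Vocab` rather than a full `Language` object. A sketch using the Chinese character segmenter, which needs no external dependencies:

    from spacy.vocab import Vocab
    from spacy.lang.zh import ChineseTokenizer

    tok = ChineseTokenizer(Vocab(), segmenter="char")  # "char" is also the default
    doc = tok("测试")
    print([t.text for t in doc])  # character segmentation: ['测', '试']
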
diff --git a/spacy/tests/lang/ja/test_serialize.py b/spacy/tests/lang/ja/test_serialize.py
index e05a363bf..011eb470f 100644
--- a/spacy/tests/lang/ja/test_serialize.py
+++ b/spacy/tests/lang/ja/test_serialize.py
@@ -1,3 +1,5 @@
+import pickle
+
 from spacy.lang.ja import Japanese
 from ...util import make_tempdir

@@ -31,3 +33,9 @@ def test_ja_tokenizer_serialize(ja_tokenizer):
         nlp_r.from_disk(d)
         assert nlp_bytes == nlp_r.to_bytes()
         assert nlp_r.tokenizer.split_mode == "B"
+
+
+def test_ja_tokenizer_pickle(ja_tokenizer):
+    b = pickle.dumps(ja_tokenizer)
+    ja_tokenizer_re = pickle.loads(b)
+    assert ja_tokenizer.to_bytes() == ja_tokenizer_re.to_bytes()
diff --git a/spacy/tests/lang/ko/test_serialize.py b/spacy/tests/lang/ko/test_serialize.py
new file mode 100644
index 000000000..75288fcc5
--- /dev/null
+++ b/spacy/tests/lang/ko/test_serialize.py
@@ -0,0 +1,24 @@
+import pickle
+
+from spacy.lang.ko import Korean
+from ...util import make_tempdir
+
+
+def test_ko_tokenizer_serialize(ko_tokenizer):
+    tokenizer_bytes = ko_tokenizer.to_bytes()
+    nlp = Korean()
+    nlp.tokenizer.from_bytes(tokenizer_bytes)
+    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+    with make_tempdir() as d:
+        file_path = d / "tokenizer"
+        ko_tokenizer.to_disk(file_path)
+        nlp = Korean()
+        nlp.tokenizer.from_disk(file_path)
+        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+
+def test_ko_tokenizer_pickle(ko_tokenizer):
+    b = pickle.dumps(ko_tokenizer)
+    ko_tokenizer_re = pickle.loads(b)
+    assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()
diff --git a/spacy/tests/lang/th/test_serialize.py b/spacy/tests/lang/th/test_serialize.py
new file mode 100644
index 000000000..a3de4bf54
--- /dev/null
+++ b/spacy/tests/lang/th/test_serialize.py
@@ -0,0 +1,24 @@
+import pickle
+
+from spacy.lang.th import Thai
+from ...util import make_tempdir
+
+
+def test_th_tokenizer_serialize(th_tokenizer):
+    tokenizer_bytes = th_tokenizer.to_bytes()
+    nlp = Thai()
+    nlp.tokenizer.from_bytes(tokenizer_bytes)
+    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+    with make_tempdir() as d:
+        file_path = d / "tokenizer"
+        th_tokenizer.to_disk(file_path)
+        nlp = Thai()
+        nlp.tokenizer.from_disk(file_path)
+        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+
+def test_th_tokenizer_pickle(th_tokenizer):
+    b = pickle.dumps(th_tokenizer)
+    th_tokenizer_re = pickle.loads(b)
+    assert th_tokenizer.to_bytes() == th_tokenizer_re.to_bytes()
diff --git a/spacy/tests/lang/vi/test_serialize.py b/spacy/tests/lang/vi/test_serialize.py
index ed4652df7..55dab799c 100644
--- a/spacy/tests/lang/vi/test_serialize.py
+++ b/spacy/tests/lang/vi/test_serialize.py
@@ -1,3 +1,5 @@
+import pickle
+
 from spacy.lang.vi import Vietnamese
 from ...util import make_tempdir

@@ -31,3 +33,9 @@ def test_vi_tokenizer_serialize(vi_tokenizer):
         nlp_r.from_disk(d)
         assert nlp_bytes == nlp_r.to_bytes()
         assert nlp_r.tokenizer.use_pyvi is False
+
+
+def test_vi_tokenizer_pickle(vi_tokenizer):
+    b = pickle.dumps(vi_tokenizer)
+    vi_tokenizer_re = pickle.loads(b)
+    assert vi_tokenizer.to_bytes() == vi_tokenizer_re.to_bytes()
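
Illustration (not part of the patch): a small end-to-end sketch of the behaviour the new tests lock in. With `use_pyvi` disabled it needs no optional dependencies, and the tokenizer keeps its settings across a pickle round trip instead of resetting to the defaults:

    import pickle
    from spacy.lang.vi import Vietnamese

    nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
    tok_re = pickle.loads(pickle.dumps(nlp.tokenizer))
    assert tok_re.use_pyvi is False
    assert tok_re.to_bytes() == nlp.tokenizer.to_bytes()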