Update custom tokenizer APIs and pickling (#8972)

* Fix incorrect pickling of Japanese and Korean pipelines, which led to
the entire pipeline being reset if pickled

* Enable pickling of Vietnamese tokenizer

* Update tokenizer APIs for Chinese, Japanese, Korean, Thai, and
Vietnamese so that only the `Vocab` is required for initialization
Adriane Boyd, 2021-08-19 14:37:47 +02:00 (committed by GitHub)
parent b278f31ee6
commit c5de9b463a
9 changed files with 94 additions and 30 deletions
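In short, each of these custom tokenizers now takes a `Vocab` directly instead of the full `nlp` object, and defines `__reduce__` so that pickling preserves its settings. A minimal sketch of the updated usage, with Japanese as the example (assumes SudachiPy is installed; the split mode is illustrative):

import pickle

from spacy.lang.ja import JapaneseTokenizer
from spacy.vocab import Vocab

# The tokenizer is constructed from a bare Vocab rather than the nlp object;
# inside a pipeline, the registered factory passes nlp.vocab for you.
tokenizer = JapaneseTokenizer(Vocab(), split_mode="B")

# __reduce__ rebuilds the tokenizer from (vocab, split_mode), so a pickle
# round trip keeps its settings instead of resetting them.
tokenizer_re = pickle.loads(pickle.dumps(tokenizer))
assert tokenizer_re.split_mode == "B"
assert tokenizer.to_bytes() == tokenizer_re.to_bytes()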

spacy/lang/ja/__init__.py

@@ -8,7 +8,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from .tag_map import TAG_MAP
 from .tag_orth_map import TAG_ORTH_MAP
 from .tag_bigram_map import TAG_BIGRAM_MAP
-from ...compat import copy_reg
 from ...errors import Errors
 from ...language import Language
 from ...scorer import Scorer
@@ -16,6 +15,7 @@ from ...symbols import POS
 from ...tokens import Doc
 from ...training import validate_examples
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab
 from ... import util


@@ -31,17 +31,20 @@ split_mode = null
 @registry.tokenizers("spacy.ja.JapaneseTokenizer")
 def create_tokenizer(split_mode: Optional[str] = None):
     def japanese_tokenizer_factory(nlp):
-        return JapaneseTokenizer(nlp, split_mode=split_mode)
+        return JapaneseTokenizer(nlp.vocab, split_mode=split_mode)

     return japanese_tokenizer_factory


 class JapaneseTokenizer(DummyTokenizer):
-    def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
-        self.vocab = nlp.vocab
+    def __init__(self, vocab: Vocab, split_mode: Optional[str] = None) -> None:
+        self.vocab = vocab
         self.split_mode = split_mode
         self.tokenizer = try_sudachi_import(self.split_mode)

+    def __reduce__(self):
+        return JapaneseTokenizer, (self.vocab, self.split_mode)
+
     def __call__(self, text: str) -> Doc:
         # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
         sudachipy_tokens = self.tokenizer.tokenize(text)
@@ -293,10 +296,4 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
     return text_dtokens, text_spaces


-def pickle_japanese(instance):
-    return Japanese, tuple()
-
-
-copy_reg.pickle(Japanese, pickle_japanese)
-
 __all__ = ["Japanese"]
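A note on the removed `copy_reg` registration: its reducer returned the bare `Japanese` class with an empty argument tuple, so unpickling always rebuilt a default pipeline, which is the reset described in the commit message. A self-contained sketch of the difference between the two pickling patterns (hypothetical `Pipeline` and `Tok` classes, not spaCy internals):

import copyreg
import pickle


class Pipeline:
    def __init__(self, components=None):
        self.components = components or []


# Old pattern: a module-level reducer that ignores the instance's state.
copyreg.pickle(Pipeline, lambda instance: (Pipeline, tuple()))

nlp = Pipeline(components=["tagger", "parser"])
restored = pickle.loads(pickle.dumps(nlp))
assert restored.components == []  # state was dropped: the object comes back reset


# New pattern: the object defines __reduce__ and carries its own state,
# which is what the tokenizers above now do.
class Tok:
    def __init__(self, vocab, split_mode=None):
        self.vocab = vocab
        self.split_mode = split_mode

    def __reduce__(self):
        return Tok, (self.vocab, self.split_mode)


tok = Tok(vocab={"tokens": 10}, split_mode="B")
assert pickle.loads(pickle.dumps(tok)).split_mode == "B"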

spacy/lang/ko/__init__.py

@@ -1,15 +1,15 @@
-from typing import Optional, Any, Dict
+from typing import Any, Dict

 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
-from ...compat import copy_reg
 from ...scorer import Scorer
 from ...symbols import POS
 from ...training import validate_examples
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab


 DEFAULT_CONFIG = """
@@ -23,17 +23,20 @@ DEFAULT_CONFIG = """
 @registry.tokenizers("spacy.ko.KoreanTokenizer")
 def create_tokenizer():
     def korean_tokenizer_factory(nlp):
-        return KoreanTokenizer(nlp)
+        return KoreanTokenizer(nlp.vocab)

     return korean_tokenizer_factory


 class KoreanTokenizer(DummyTokenizer):
-    def __init__(self, nlp: Optional[Language] = None):
-        self.vocab = nlp.vocab
+    def __init__(self, vocab: Vocab):
+        self.vocab = vocab
         MeCab = try_mecab_import()
         self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")

+    def __reduce__(self):
+        return KoreanTokenizer, (self.vocab,)
+
     def __del__(self):
         self.mecab_tokenizer.__del__()
@@ -106,10 +109,4 @@ def check_spaces(text, tokens):
         yield False


-def pickle_korean(instance):
-    return Korean, tuple()
-
-
-copy_reg.pickle(Korean, pickle_korean)
-
 __all__ = ["Korean"]

spacy/lang/th/__init__.py

@@ -3,6 +3,7 @@ from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab


 DEFAULT_CONFIG = """
@@ -16,13 +17,13 @@ DEFAULT_CONFIG = """
 @registry.tokenizers("spacy.th.ThaiTokenizer")
 def create_thai_tokenizer():
     def thai_tokenizer_factory(nlp):
-        return ThaiTokenizer(nlp)
+        return ThaiTokenizer(nlp.vocab)

     return thai_tokenizer_factory


 class ThaiTokenizer(DummyTokenizer):
-    def __init__(self, nlp: Language) -> None:
+    def __init__(self, vocab: Vocab) -> None:
         try:
             from pythainlp.tokenize import word_tokenize
         except ImportError:
@@ -31,7 +32,7 @@ class ThaiTokenizer(DummyTokenizer):
                 "https://github.com/PyThaiNLP/pythainlp"
             ) from None
         self.word_tokenize = word_tokenize
-        self.vocab = nlp.vocab
+        self.vocab = vocab

     def __call__(self, text: str) -> Doc:
         words = list(self.word_tokenize(text))

spacy/lang/vi/__init__.py

@@ -9,6 +9,7 @@ from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab
 from ... import util


@@ -24,14 +25,14 @@ use_pyvi = true
 @registry.tokenizers("spacy.vi.VietnameseTokenizer")
 def create_vietnamese_tokenizer(use_pyvi: bool = True):
     def vietnamese_tokenizer_factory(nlp):
-        return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
+        return VietnameseTokenizer(nlp.vocab, use_pyvi=use_pyvi)

     return vietnamese_tokenizer_factory


 class VietnameseTokenizer(DummyTokenizer):
-    def __init__(self, nlp: Language, use_pyvi: bool = False):
-        self.vocab = nlp.vocab
+    def __init__(self, vocab: Vocab, use_pyvi: bool = False):
+        self.vocab = vocab
         self.use_pyvi = use_pyvi
         if self.use_pyvi:
             try:
@@ -45,6 +46,9 @@ class VietnameseTokenizer(DummyTokenizer):
                 )
                 raise ImportError(msg) from None

+    def __reduce__(self):
+        return VietnameseTokenizer, (self.vocab, self.use_pyvi)
+
     def __call__(self, text: str) -> Doc:
         if self.use_pyvi:
             words = self.pyvi_tokenize(text)

spacy/lang/zh/__init__.py

@@ -11,6 +11,7 @@ from ...scorer import Scorer
 from ...tokens import Doc
 from ...training import validate_examples, Example
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from ... import util
@@ -48,14 +49,14 @@ class Segmenter(str, Enum):
 @registry.tokenizers("spacy.zh.ChineseTokenizer")
 def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
     def chinese_tokenizer_factory(nlp):
-        return ChineseTokenizer(nlp, segmenter=segmenter)
+        return ChineseTokenizer(nlp.vocab, segmenter=segmenter)

     return chinese_tokenizer_factory


 class ChineseTokenizer(DummyTokenizer):
-    def __init__(self, nlp: Language, segmenter: Segmenter = Segmenter.char):
-        self.vocab = nlp.vocab
+    def __init__(self, vocab: Vocab, segmenter: Segmenter = Segmenter.char):
+        self.vocab = vocab
         if isinstance(segmenter, Segmenter):
             segmenter = segmenter.value
         self.segmenter = segmenter

spacy/tests/lang/ja/test_serialize.py

@@ -1,3 +1,5 @@
+import pickle
+
 from spacy.lang.ja import Japanese
 from ...util import make_tempdir

@@ -31,3 +33,9 @@ def test_ja_tokenizer_serialize(ja_tokenizer):
         nlp_r.from_disk(d)
         assert nlp_bytes == nlp_r.to_bytes()
         assert nlp_r.tokenizer.split_mode == "B"
+
+
+def test_ja_tokenizer_pickle(ja_tokenizer):
+    b = pickle.dumps(ja_tokenizer)
+    ja_tokenizer_re = pickle.loads(b)
+    assert ja_tokenizer.to_bytes() == ja_tokenizer_re.to_bytes()

spacy/tests/lang/ko/test_serialize.py

@@ -0,0 +1,24 @@
+import pickle
+
+from spacy.lang.ko import Korean
+from ...util import make_tempdir
+
+
+def test_ko_tokenizer_serialize(ko_tokenizer):
+    tokenizer_bytes = ko_tokenizer.to_bytes()
+    nlp = Korean()
+    nlp.tokenizer.from_bytes(tokenizer_bytes)
+    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+    with make_tempdir() as d:
+        file_path = d / "tokenizer"
+        ko_tokenizer.to_disk(file_path)
+        nlp = Korean()
+        nlp.tokenizer.from_disk(file_path)
+        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+
+def test_ko_tokenizer_pickle(ko_tokenizer):
+    b = pickle.dumps(ko_tokenizer)
+    ko_tokenizer_re = pickle.loads(b)
+    assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()

spacy/tests/lang/th/test_serialize.py

@@ -0,0 +1,24 @@
+import pickle
+
+from spacy.lang.th import Thai
+from ...util import make_tempdir
+
+
+def test_th_tokenizer_serialize(th_tokenizer):
+    tokenizer_bytes = th_tokenizer.to_bytes()
+    nlp = Thai()
+    nlp.tokenizer.from_bytes(tokenizer_bytes)
+    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+    with make_tempdir() as d:
+        file_path = d / "tokenizer"
+        th_tokenizer.to_disk(file_path)
+        nlp = Thai()
+        nlp.tokenizer.from_disk(file_path)
+        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+
+def test_th_tokenizer_pickle(th_tokenizer):
+    b = pickle.dumps(th_tokenizer)
+    th_tokenizer_re = pickle.loads(b)
+    assert th_tokenizer.to_bytes() == th_tokenizer_re.to_bytes()

spacy/tests/lang/vi/test_serialize.py

@@ -1,3 +1,5 @@
+import pickle
+
 from spacy.lang.vi import Vietnamese
 from ...util import make_tempdir

@@ -31,3 +33,9 @@ def test_vi_tokenizer_serialize(vi_tokenizer):
         nlp_r.from_disk(d)
         assert nlp_bytes == nlp_r.to_bytes()
         assert nlp_r.tokenizer.use_pyvi is False
+
+
+def test_vi_tokenizer_pickle(vi_tokenizer):
+    b = pickle.dumps(vi_tokenizer)
+    vi_tokenizer_re = pickle.loads(b)
+    assert vi_tokenizer.to_bytes() == vi_tokenizer_re.to_bytes()