Update custom tokenizer APIs and pickling (#8972)

* Fix incorrect pickling of Japanese and Korean pipelines, which led to
the entire pipeline being reset if pickled

* Enable pickling of Vietnamese tokenizer

* Update tokenizer APIs for Chinese, Japanese, Korean, Thai, and
Vietnamese so that only the `Vocab` is required for initialization
Adriane Boyd 2021-08-19 14:37:47 +02:00, committed by GitHub
parent b278f31ee6
commit c5de9b463a
9 changed files with 94 additions and 30 deletions
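
The API change described above means each of these custom tokenizers can be constructed from a Vocab alone, and the registered factories now pass nlp.vocab instead of the nlp object. A minimal sketch of the new usage (not part of the diff; it uses the Chinese tokenizer with its default "char" segmenter only because that needs no external dependencies, but the same pattern applies to the other four languages):

    from spacy.lang.zh import Chinese, ChineseTokenizer
    from spacy.vocab import Vocab

    # New API: only a Vocab is required, no Language object.
    tokenizer = ChineseTokenizer(Vocab())   # previously: ChineseTokenizer(nlp, ...)
    doc = tokenizer("拼音")                 # the char segmenter splits into characters
    assert [t.text for t in doc] == ["拼", "音"]

    # Inside a pipeline, the registered factory now hands over nlp.vocab:
    nlp = Chinese()
    assert nlp.tokenizer.vocab is nlp.vocab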

spacy/lang/ja/__init__.py

@@ -8,7 +8,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from .tag_map import TAG_MAP
 from .tag_orth_map import TAG_ORTH_MAP
 from .tag_bigram_map import TAG_BIGRAM_MAP
-from ...compat import copy_reg
 from ...errors import Errors
 from ...language import Language
 from ...scorer import Scorer
@@ -16,6 +15,7 @@ from ...symbols import POS
 from ...tokens import Doc
 from ...training import validate_examples
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab
 from ... import util
@@ -31,17 +31,20 @@ split_mode = null
 @registry.tokenizers("spacy.ja.JapaneseTokenizer")
 def create_tokenizer(split_mode: Optional[str] = None):
     def japanese_tokenizer_factory(nlp):
-        return JapaneseTokenizer(nlp, split_mode=split_mode)
+        return JapaneseTokenizer(nlp.vocab, split_mode=split_mode)

     return japanese_tokenizer_factory


 class JapaneseTokenizer(DummyTokenizer):
-    def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
-        self.vocab = nlp.vocab
+    def __init__(self, vocab: Vocab, split_mode: Optional[str] = None) -> None:
+        self.vocab = vocab
         self.split_mode = split_mode
         self.tokenizer = try_sudachi_import(self.split_mode)

+    def __reduce__(self):
+        return JapaneseTokenizer, (self.vocab, self.split_mode)
+
     def __call__(self, text: str) -> Doc:
         # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
         sudachipy_tokens = self.tokenizer.tokenize(text)
@@ -293,10 +296,4 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
     return text_dtokens, text_spaces


-def pickle_japanese(instance):
-    return Japanese, tuple()
-
-
-copy_reg.pickle(Japanese, pickle_japanese)
-
 __all__ = ["Japanese"]
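
The removed copy_reg hook reduced a pickled Japanese pipeline to `Japanese, tuple()`, which is why unpickling produced a freshly constructed, reset pipeline. With __reduce__ defined on the tokenizer, pickling rebuilds it from its vocab and split mode. A minimal round-trip sketch (not part of the diff; it assumes SudachiPy and its dictionary are installed):

    import pickle

    from spacy.lang.ja import JapaneseTokenizer
    from spacy.vocab import Vocab

    tok = JapaneseTokenizer(Vocab(), split_mode="B")
    # __reduce__ returns (JapaneseTokenizer, (vocab, "B")), so unpickling
    # reconstructs an equivalent tokenizer instead of a default one.
    tok2 = pickle.loads(pickle.dumps(tok))
    assert tok2.split_mode == "B"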

spacy/lang/ko/__init__.py

@@ -1,15 +1,15 @@
-from typing import Optional, Any, Dict
+from typing import Any, Dict

 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
-from ...compat import copy_reg
 from ...scorer import Scorer
 from ...symbols import POS
 from ...training import validate_examples
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab


 DEFAULT_CONFIG = """
@@ -23,17 +23,20 @@ DEFAULT_CONFIG = """
 @registry.tokenizers("spacy.ko.KoreanTokenizer")
 def create_tokenizer():
     def korean_tokenizer_factory(nlp):
-        return KoreanTokenizer(nlp)
+        return KoreanTokenizer(nlp.vocab)

     return korean_tokenizer_factory


 class KoreanTokenizer(DummyTokenizer):
-    def __init__(self, nlp: Optional[Language] = None):
-        self.vocab = nlp.vocab
+    def __init__(self, vocab: Vocab):
+        self.vocab = vocab
         MeCab = try_mecab_import()
         self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")

+    def __reduce__(self):
+        return KoreanTokenizer, (self.vocab,)
+
     def __del__(self):
         self.mecab_tokenizer.__del__()
@@ -106,10 +109,4 @@ def check_spaces(text, tokens):
         yield False


-def pickle_korean(instance):
-    return Korean, tuple()
-
-
-copy_reg.pickle(Korean, pickle_korean)
-
 __all__ = ["Korean"]
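
The Korean change follows the same pattern, and it is what fixes the pipeline-level bug from the commit message: the old module-level copy_reg registration made any pickled Korean pipeline come back as a bare Korean() with its components dropped. A sketch of the behaviour after this change (not part of the diff; it assumes the mecab-ko dependencies required by the Korean tokenizer are installed):

    import pickle

    from spacy.lang.ko import Korean

    nlp = Korean()
    nlp.add_pipe("sentencizer")  # any component; previously lost on pickling
    nlp2 = pickle.loads(pickle.dumps(nlp))
    assert nlp2.pipe_names == ["sentencizer"]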

spacy/lang/th/__init__.py

@@ -3,6 +3,7 @@ from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab


 DEFAULT_CONFIG = """
@@ -16,13 +17,13 @@ DEFAULT_CONFIG = """
 @registry.tokenizers("spacy.th.ThaiTokenizer")
 def create_thai_tokenizer():
     def thai_tokenizer_factory(nlp):
-        return ThaiTokenizer(nlp)
+        return ThaiTokenizer(nlp.vocab)

     return thai_tokenizer_factory


 class ThaiTokenizer(DummyTokenizer):
-    def __init__(self, nlp: Language) -> None:
+    def __init__(self, vocab: Vocab) -> None:
         try:
             from pythainlp.tokenize import word_tokenize
         except ImportError:
@@ -31,7 +32,7 @@ class ThaiTokenizer(DummyTokenizer):
                 "https://github.com/PyThaiNLP/pythainlp"
             ) from None
         self.word_tokenize = word_tokenize
-        self.vocab = nlp.vocab
+        self.vocab = vocab

     def __call__(self, text: str) -> Doc:
         words = list(self.word_tokenize(text))
spacy/lang/vi/__init__.py

@@ -9,6 +9,7 @@ from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab
 from ... import util
@@ -24,14 +25,14 @@ use_pyvi = true
 @registry.tokenizers("spacy.vi.VietnameseTokenizer")
 def create_vietnamese_tokenizer(use_pyvi: bool = True):
     def vietnamese_tokenizer_factory(nlp):
-        return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
+        return VietnameseTokenizer(nlp.vocab, use_pyvi=use_pyvi)

     return vietnamese_tokenizer_factory


 class VietnameseTokenizer(DummyTokenizer):
-    def __init__(self, nlp: Language, use_pyvi: bool = False):
-        self.vocab = nlp.vocab
+    def __init__(self, vocab: Vocab, use_pyvi: bool = False):
+        self.vocab = vocab
         self.use_pyvi = use_pyvi
         if self.use_pyvi:
             try:
@@ -45,6 +46,9 @@ class VietnameseTokenizer(DummyTokenizer):
                 )
                 raise ImportError(msg) from None

+    def __reduce__(self):
+        return VietnameseTokenizer, (self.vocab, self.use_pyvi)
+
     def __call__(self, text: str) -> Doc:
         if self.use_pyvi:
             words = self.pyvi_tokenize(text)
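
For Vietnamese, the commit enables pickling of the tokenizer in the first place; __reduce__ preserves both the vocab and the use_pyvi flag. A minimal sketch (not part of the diff; it assumes the pyvi package is installed):

    import pickle

    from spacy.lang.vi import VietnameseTokenizer
    from spacy.vocab import Vocab

    tok = VietnameseTokenizer(Vocab(), use_pyvi=True)
    tok2 = pickle.loads(pickle.dumps(tok))
    assert tok2.use_pyvi is True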

spacy/lang/zh/__init__.py

@@ -11,6 +11,7 @@ from ...scorer import Scorer
 from ...tokens import Doc
 from ...training import validate_examples, Example
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from ... import util
@@ -48,14 +49,14 @@ class Segmenter(str, Enum):
 @registry.tokenizers("spacy.zh.ChineseTokenizer")
 def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
     def chinese_tokenizer_factory(nlp):
-        return ChineseTokenizer(nlp, segmenter=segmenter)
+        return ChineseTokenizer(nlp.vocab, segmenter=segmenter)

     return chinese_tokenizer_factory


 class ChineseTokenizer(DummyTokenizer):
-    def __init__(self, nlp: Language, segmenter: Segmenter = Segmenter.char):
-        self.vocab = nlp.vocab
+    def __init__(self, vocab: Vocab, segmenter: Segmenter = Segmenter.char):
+        self.vocab = vocab
         if isinstance(segmenter, Segmenter):
             segmenter = segmenter.value
         self.segmenter = segmenter
spacy/tests/lang/ja/test_serialize.py

@@ -1,3 +1,5 @@
+import pickle
+
 from spacy.lang.ja import Japanese
 from ...util import make_tempdir
@@ -31,3 +33,9 @@ def test_ja_tokenizer_serialize(ja_tokenizer):
         nlp_r.from_disk(d)
         assert nlp_bytes == nlp_r.to_bytes()
         assert nlp_r.tokenizer.split_mode == "B"
+
+
+def test_ja_tokenizer_pickle(ja_tokenizer):
+    b = pickle.dumps(ja_tokenizer)
+    ja_tokenizer_re = pickle.loads(b)
+    assert ja_tokenizer.to_bytes() == ja_tokenizer_re.to_bytes()

spacy/tests/lang/ko/test_serialize.py

@@ -0,0 +1,24 @@
+import pickle
+
+from spacy.lang.ko import Korean
+from ...util import make_tempdir
+
+
+def test_ko_tokenizer_serialize(ko_tokenizer):
+    tokenizer_bytes = ko_tokenizer.to_bytes()
+    nlp = Korean()
+    nlp.tokenizer.from_bytes(tokenizer_bytes)
+    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+    with make_tempdir() as d:
+        file_path = d / "tokenizer"
+        ko_tokenizer.to_disk(file_path)
+        nlp = Korean()
+        nlp.tokenizer.from_disk(file_path)
+        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+
+def test_ko_tokenizer_pickle(ko_tokenizer):
+    b = pickle.dumps(ko_tokenizer)
+    ko_tokenizer_re = pickle.loads(b)
+    assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()

spacy/tests/lang/th/test_serialize.py

@@ -0,0 +1,24 @@
+import pickle
+
+from spacy.lang.th import Thai
+from ...util import make_tempdir
+
+
+def test_th_tokenizer_serialize(th_tokenizer):
+    tokenizer_bytes = th_tokenizer.to_bytes()
+    nlp = Thai()
+    nlp.tokenizer.from_bytes(tokenizer_bytes)
+    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+    with make_tempdir() as d:
+        file_path = d / "tokenizer"
+        th_tokenizer.to_disk(file_path)
+        nlp = Thai()
+        nlp.tokenizer.from_disk(file_path)
+        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+
+def test_th_tokenizer_pickle(th_tokenizer):
+    b = pickle.dumps(th_tokenizer)
+    th_tokenizer_re = pickle.loads(b)
+    assert th_tokenizer.to_bytes() == th_tokenizer_re.to_bytes()

spacy/tests/lang/vi/test_serialize.py

@@ -1,3 +1,5 @@
+import pickle
+
 from spacy.lang.vi import Vietnamese
 from ...util import make_tempdir
@@ -31,3 +33,9 @@ def test_vi_tokenizer_serialize(vi_tokenizer):
         nlp_r.from_disk(d)
         assert nlp_bytes == nlp_r.to_bytes()
         assert nlp_r.tokenizer.use_pyvi is False
+
+
+def test_vi_tokenizer_pickle(vi_tokenizer):
+    b = pickle.dumps(vi_tokenizer)
+    vi_tokenizer_re = pickle.loads(b)
+    assert vi_tokenizer.to_bytes() == vi_tokenizer_re.to_bytes()