Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-11 17:56:30 +03:00
Update custom tokenizer APIs and pickling (#8972)
* Fix incorrect pickling of the Japanese and Korean pipelines, which led to the entire pipeline being reset if pickled
* Enable pickling of the Vietnamese tokenizer
* Update the tokenizer APIs for Chinese, Japanese, Korean, Thai, and Vietnamese so that only the `Vocab` is required for initialization
This commit is contained in:
parent b278f31ee6
commit c5de9b463a
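For context, a minimal sketch (not part of the commit) of the behavior this change targets: with the `copy_reg` hook removed and `__reduce__` added to the custom tokenizers, pickling a pipeline such as Japanese or Korean no longer resets it. Assumes spaCy v3.x with the relevant tokenizer dependency installed (e.g. SudachiPy for Japanese).

import pickle

import spacy

# Build a blank Japanese pipeline (requires sudachipy) and round-trip it
# through pickle, as multiprocessing does internally.
nlp = spacy.blank("ja")
nlp_re = pickle.loads(pickle.dumps(nlp))

# Previously the custom tokenizer came back reset; now its serialized state
# (including settings such as split_mode) survives the round trip.
assert nlp.tokenizer.to_bytes() == nlp_re.tokenizer.to_bytes()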
spacy/lang/ja/__init__.py
@@ -8,7 +8,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from .tag_map import TAG_MAP
 from .tag_orth_map import TAG_ORTH_MAP
 from .tag_bigram_map import TAG_BIGRAM_MAP
-from ...compat import copy_reg
 from ...errors import Errors
 from ...language import Language
 from ...scorer import Scorer
@@ -16,6 +15,7 @@ from ...symbols import POS
 from ...tokens import Doc
 from ...training import validate_examples
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab
 from ... import util


@@ -31,17 +31,20 @@ split_mode = null
 @registry.tokenizers("spacy.ja.JapaneseTokenizer")
 def create_tokenizer(split_mode: Optional[str] = None):
     def japanese_tokenizer_factory(nlp):
-        return JapaneseTokenizer(nlp, split_mode=split_mode)
+        return JapaneseTokenizer(nlp.vocab, split_mode=split_mode)

     return japanese_tokenizer_factory


 class JapaneseTokenizer(DummyTokenizer):
-    def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
-        self.vocab = nlp.vocab
+    def __init__(self, vocab: Vocab, split_mode: Optional[str] = None) -> None:
+        self.vocab = vocab
         self.split_mode = split_mode
         self.tokenizer = try_sudachi_import(self.split_mode)

+    def __reduce__(self):
+        return JapaneseTokenizer, (self.vocab, self.split_mode)
+
     def __call__(self, text: str) -> Doc:
         # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
         sudachipy_tokens = self.tokenizer.tokenize(text)
@@ -293,10 +296,4 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
     return text_dtokens, text_spaces


-def pickle_japanese(instance):
-    return Japanese, tuple()
-
-
-copy_reg.pickle(Japanese, pickle_japanese)
-
 __all__ = ["Japanese"]

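A small sketch (not part of the diff) of how the updated Japanese API reads: the tokenizer is constructed from a `Vocab` alone, and the new `__reduce__` lets pickle rebuild it from `(vocab, split_mode)`. Assumes SudachiPy is installed.

import pickle

from spacy.lang.ja import JapaneseTokenizer
from spacy.vocab import Vocab

# New-style construction: only a Vocab is required.
tokenizer = JapaneseTokenizer(Vocab(), split_mode="B")

# __reduce__ makes the tokenizer picklable on its own; the split_mode
# setting is carried through the reconstruction.
tokenizer_re = pickle.loads(pickle.dumps(tokenizer))
assert tokenizer_re.split_mode == "B"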
spacy/lang/ko/__init__.py
@@ -1,15 +1,15 @@
-from typing import Optional, Any, Dict
+from typing import Any, Dict

 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
-from ...compat import copy_reg
 from ...scorer import Scorer
 from ...symbols import POS
 from ...training import validate_examples
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab


 DEFAULT_CONFIG = """
@@ -23,17 +23,20 @@ DEFAULT_CONFIG = """
 @registry.tokenizers("spacy.ko.KoreanTokenizer")
 def create_tokenizer():
     def korean_tokenizer_factory(nlp):
-        return KoreanTokenizer(nlp)
+        return KoreanTokenizer(nlp.vocab)

     return korean_tokenizer_factory


 class KoreanTokenizer(DummyTokenizer):
-    def __init__(self, nlp: Optional[Language] = None):
-        self.vocab = nlp.vocab
+    def __init__(self, vocab: Vocab):
+        self.vocab = vocab
         MeCab = try_mecab_import()
         self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")

+    def __reduce__(self):
+        return KoreanTokenizer, (self.vocab,)
+
     def __del__(self):
         self.mecab_tokenizer.__del__()

@@ -106,10 +109,4 @@ def check_spaces(text, tokens):
             yield False


-def pickle_korean(instance):
-    return Korean, tuple()
-
-
-copy_reg.pickle(Korean, pickle_korean)
-
 __all__ = ["Korean"]

spacy/lang/th/__init__.py
@@ -3,6 +3,7 @@ from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab


 DEFAULT_CONFIG = """
@@ -16,13 +17,13 @@ DEFAULT_CONFIG = """
 @registry.tokenizers("spacy.th.ThaiTokenizer")
 def create_thai_tokenizer():
     def thai_tokenizer_factory(nlp):
-        return ThaiTokenizer(nlp)
+        return ThaiTokenizer(nlp.vocab)

     return thai_tokenizer_factory


 class ThaiTokenizer(DummyTokenizer):
-    def __init__(self, nlp: Language) -> None:
+    def __init__(self, vocab: Vocab) -> None:
         try:
             from pythainlp.tokenize import word_tokenize
         except ImportError:
@@ -31,7 +32,7 @@ class ThaiTokenizer(DummyTokenizer):
                 "https://github.com/PyThaiNLP/pythainlp"
             ) from None
         self.word_tokenize = word_tokenize
-        self.vocab = nlp.vocab
+        self.vocab = vocab

     def __call__(self, text: str) -> Doc:
         words = list(self.word_tokenize(text))

spacy/lang/vi/__init__.py
@@ -9,6 +9,7 @@ from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab
 from ... import util


@@ -24,14 +25,14 @@ use_pyvi = true
 @registry.tokenizers("spacy.vi.VietnameseTokenizer")
 def create_vietnamese_tokenizer(use_pyvi: bool = True):
     def vietnamese_tokenizer_factory(nlp):
-        return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
+        return VietnameseTokenizer(nlp.vocab, use_pyvi=use_pyvi)

     return vietnamese_tokenizer_factory


 class VietnameseTokenizer(DummyTokenizer):
-    def __init__(self, nlp: Language, use_pyvi: bool = False):
-        self.vocab = nlp.vocab
+    def __init__(self, vocab: Vocab, use_pyvi: bool = False):
+        self.vocab = vocab
         self.use_pyvi = use_pyvi
         if self.use_pyvi:
             try:
@@ -45,6 +46,9 @@ class VietnameseTokenizer(DummyTokenizer):
                 )
                 raise ImportError(msg) from None

+    def __reduce__(self):
+        return VietnameseTokenizer, (self.vocab, self.use_pyvi)
+
     def __call__(self, text: str) -> Doc:
         if self.use_pyvi:
             words = self.pyvi_tokenize(text)

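As a usage sketch (assumptions: spaCy v3.x; the partial `from_config` override mirrors the pattern in the existing vi serialization test), the `use_pyvi` setting still comes from the tokenizer config, and the tokenizer itself can now be pickled:

import pickle

from spacy.lang.vi import Vietnamese

# Override the tokenizer setting through the config; pyvi is not needed
# when use_pyvi is disabled.
nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})

# The new __reduce__ carries the setting across a pickle round trip.
tokenizer_re = pickle.loads(pickle.dumps(nlp.tokenizer))
assert tokenizer_re.use_pyvi is False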
spacy/lang/zh/__init__.py
@@ -11,6 +11,7 @@ from ...scorer import Scorer
 from ...tokens import Doc
 from ...training import validate_examples, Example
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from ... import util
@@ -48,14 +49,14 @@ class Segmenter(str, Enum):
 @registry.tokenizers("spacy.zh.ChineseTokenizer")
 def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
     def chinese_tokenizer_factory(nlp):
-        return ChineseTokenizer(nlp, segmenter=segmenter)
+        return ChineseTokenizer(nlp.vocab, segmenter=segmenter)

     return chinese_tokenizer_factory


 class ChineseTokenizer(DummyTokenizer):
-    def __init__(self, nlp: Language, segmenter: Segmenter = Segmenter.char):
-        self.vocab = nlp.vocab
+    def __init__(self, vocab: Vocab, segmenter: Segmenter = Segmenter.char):
+        self.vocab = vocab
         if isinstance(segmenter, Segmenter):
             segmenter = segmenter.value
         self.segmenter = segmenter

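For reference, a hedged sketch (not in the diff) of how the Chinese tokenizer is configured: the factory still reads `segmenter` from the config and now hands `nlp.vocab` to `ChineseTokenizer`. Assumes spaCy v3.x; the default "char" segmenter needs no extra packages, while "jieba" and "pkuseg" require their respective libraries.

from spacy.lang.zh import Chinese

# Select the segmenter through the tokenizer config (here the
# dependency-free character segmenter).
nlp = Chinese.from_config({"nlp": {"tokenizer": {"segmenter": "char"}}})
doc = nlp("这是一个句子")
print([token.text for token in doc])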
spacy/tests/lang/ja/test_serialize.py
@@ -1,3 +1,5 @@
+import pickle
+
 from spacy.lang.ja import Japanese
 from ...util import make_tempdir

@@ -31,3 +33,9 @@ def test_ja_tokenizer_serialize(ja_tokenizer):
         nlp_r.from_disk(d)
         assert nlp_bytes == nlp_r.to_bytes()
         assert nlp_r.tokenizer.split_mode == "B"
+
+
+def test_ja_tokenizer_pickle(ja_tokenizer):
+    b = pickle.dumps(ja_tokenizer)
+    ja_tokenizer_re = pickle.loads(b)
+    assert ja_tokenizer.to_bytes() == ja_tokenizer_re.to_bytes()

spacy/tests/lang/ko/test_serialize.py (new file, 24 lines)
@@ -0,0 +1,24 @@
+import pickle
+
+from spacy.lang.ko import Korean
+from ...util import make_tempdir
+
+
+def test_ko_tokenizer_serialize(ko_tokenizer):
+    tokenizer_bytes = ko_tokenizer.to_bytes()
+    nlp = Korean()
+    nlp.tokenizer.from_bytes(tokenizer_bytes)
+    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+    with make_tempdir() as d:
+        file_path = d / "tokenizer"
+        ko_tokenizer.to_disk(file_path)
+        nlp = Korean()
+        nlp.tokenizer.from_disk(file_path)
+        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+
+def test_ko_tokenizer_pickle(ko_tokenizer):
+    b = pickle.dumps(ko_tokenizer)
+    ko_tokenizer_re = pickle.loads(b)
+    assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()

spacy/tests/lang/th/test_serialize.py (new file, 24 lines)
@@ -0,0 +1,24 @@
+import pickle
+
+from spacy.lang.th import Thai
+from ...util import make_tempdir
+
+
+def test_th_tokenizer_serialize(th_tokenizer):
+    tokenizer_bytes = th_tokenizer.to_bytes()
+    nlp = Thai()
+    nlp.tokenizer.from_bytes(tokenizer_bytes)
+    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+    with make_tempdir() as d:
+        file_path = d / "tokenizer"
+        th_tokenizer.to_disk(file_path)
+        nlp = Thai()
+        nlp.tokenizer.from_disk(file_path)
+        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+
+def test_th_tokenizer_pickle(th_tokenizer):
+    b = pickle.dumps(th_tokenizer)
+    th_tokenizer_re = pickle.loads(b)
+    assert th_tokenizer.to_bytes() == th_tokenizer_re.to_bytes()

spacy/tests/lang/vi/test_serialize.py
@@ -1,3 +1,5 @@
+import pickle
+
 from spacy.lang.vi import Vietnamese
 from ...util import make_tempdir

@@ -31,3 +33,9 @@ def test_vi_tokenizer_serialize(vi_tokenizer):
         nlp_r.from_disk(d)
         assert nlp_bytes == nlp_r.to_bytes()
         assert nlp_r.tokenizer.use_pyvi is False
+
+
+def test_vi_tokenizer_pickle(vi_tokenizer):
+    b = pickle.dumps(vi_tokenizer)
+    vi_tokenizer_re = pickle.loads(b)
+    assert vi_tokenizer.to_bytes() == vi_tokenizer_re.to_bytes()

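Taken together, the diff establishes a pattern that third-party tokenizers can follow as well. The sketch below is illustrative only (the `WhitespaceTokenizer` class and registry string are hypothetical, not part of spaCy): a `Vocab`-only `__init__`, a registered factory that passes `nlp.vocab`, and a `__reduce__` so the tokenizer pickles cleanly.

from spacy.tokens import Doc
from spacy.util import DummyTokenizer, registry
from spacy.vocab import Vocab


@registry.tokenizers("whitespace_tokenizer.v1")  # hypothetical registry name
def create_whitespace_tokenizer():
    def whitespace_tokenizer_factory(nlp):
        # Only the shared vocab is handed to the tokenizer, matching the
        # updated built-in factories.
        return WhitespaceTokenizer(nlp.vocab)

    return whitespace_tokenizer_factory


class WhitespaceTokenizer(DummyTokenizer):
    def __init__(self, vocab: Vocab) -> None:
        self.vocab = vocab

    def __reduce__(self):
        # Rebuild from the vocab alone, so pickling never resets the pipeline.
        return WhitespaceTokenizer, (self.vocab,)

    def __call__(self, text: str) -> Doc:
        words = text.split()
        return Doc(self.vocab, words=words)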