Update custom tokenizer APIs and pickling (#8972)
* Fix incorrect pickling of Japanese and Korean pipelines, which led to the entire pipeline being reset if pickled
* Enable pickling of Vietnamese tokenizer
* Update tokenizer APIs for Chinese, Japanese, Korean, Thai, and Vietnamese so that only the `Vocab` is required for initialization
This commit is contained in:
parent b278f31ee6
commit c5de9b463a
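Background on the pickling fix: the old code registered module-level reducers with the standard library's copyreg (via spacy.compat.copy_reg) for the Japanese and Korean Language subclasses, returning e.g. `Japanese, tuple()`, so unpickling rebuilt a blank pipeline and any customization was lost. The diffs below delete those reducers and instead give each custom tokenizer a `__reduce__` that rebuilds it from its `Vocab` and settings. A minimal, self-contained sketch of why the copyreg approach loses state while `__reduce__` keeps it (toy class, not spaCy's own code):

import copyreg
import pickle


class ToyTokenizer:
    # Stand-in for a custom tokenizer; split_mode is its only setting.
    def __init__(self, vocab=None, split_mode=None):
        self.vocab = vocab
        self.split_mode = split_mode

    def __reduce__(self):
        # New-style: rebuild from constructor args, so settings survive.
        return ToyTokenizer, (self.vocab, self.split_mode)


tok = ToyTokenizer(vocab={"lang": "ja"}, split_mode="B")
assert pickle.loads(pickle.dumps(tok)).split_mode == "B"  # preserved via __reduce__

# Old-style: a copyreg reducer that ignores the instance's state takes
# precedence over __reduce__ and reconstructs a default object.
copyreg.pickle(ToyTokenizer, lambda instance: (ToyTokenizer, tuple()))
assert pickle.loads(pickle.dumps(tok)).split_mode is None  # state lost: the reported bug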
spacy/lang/ja/__init__.py

@@ -8,7 +8,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from .tag_map import TAG_MAP
 from .tag_orth_map import TAG_ORTH_MAP
 from .tag_bigram_map import TAG_BIGRAM_MAP
-from ...compat import copy_reg
 from ...errors import Errors
 from ...language import Language
 from ...scorer import Scorer
@@ -16,6 +15,7 @@ from ...symbols import POS
 from ...tokens import Doc
 from ...training import validate_examples
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab
 from ... import util


@@ -31,17 +31,20 @@ split_mode = null
 @registry.tokenizers("spacy.ja.JapaneseTokenizer")
 def create_tokenizer(split_mode: Optional[str] = None):
     def japanese_tokenizer_factory(nlp):
-        return JapaneseTokenizer(nlp, split_mode=split_mode)
+        return JapaneseTokenizer(nlp.vocab, split_mode=split_mode)

     return japanese_tokenizer_factory


 class JapaneseTokenizer(DummyTokenizer):
-    def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
-        self.vocab = nlp.vocab
+    def __init__(self, vocab: Vocab, split_mode: Optional[str] = None) -> None:
+        self.vocab = vocab
         self.split_mode = split_mode
         self.tokenizer = try_sudachi_import(self.split_mode)

+    def __reduce__(self):
+        return JapaneseTokenizer, (self.vocab, self.split_mode)
+
     def __call__(self, text: str) -> Doc:
         # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
         sudachipy_tokens = self.tokenizer.tokenize(text)
@@ -293,10 +296,4 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
     return text_dtokens, text_spaces


-def pickle_japanese(instance):
-    return Japanese, tuple()
-
-
-copy_reg.pickle(Japanese, pickle_japanese)
-
 __all__ = ["Japanese"]
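With the signature change above, the tokenizer no longer needs a Language object at all. A usage sketch of the new API and the pickle round-trip it enables (not from the diff; assumes SudachiPy is installed, since the constructor still calls try_sudachi_import):

import pickle

from spacy.lang.ja import JapaneseTokenizer
from spacy.vocab import Vocab

tokenizer = JapaneseTokenizer(Vocab(), split_mode="B")  # a Vocab only, no nlp object
doc = tokenizer("日本語のテキストです。")

# __reduce__ reconstructs the tokenizer from (vocab, split_mode),
# so the setting survives a pickle round-trip.
restored = pickle.loads(pickle.dumps(tokenizer))
assert restored.split_mode == "B"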
spacy/lang/ko/__init__.py

@@ -1,15 +1,15 @@
-from typing import Optional, Any, Dict
+from typing import Any, Dict

 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
-from ...compat import copy_reg
 from ...scorer import Scorer
 from ...symbols import POS
 from ...training import validate_examples
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab


 DEFAULT_CONFIG = """
@@ -23,17 +23,20 @@ DEFAULT_CONFIG = """
 @registry.tokenizers("spacy.ko.KoreanTokenizer")
 def create_tokenizer():
     def korean_tokenizer_factory(nlp):
-        return KoreanTokenizer(nlp)
+        return KoreanTokenizer(nlp.vocab)

     return korean_tokenizer_factory


 class KoreanTokenizer(DummyTokenizer):
-    def __init__(self, nlp: Optional[Language] = None):
-        self.vocab = nlp.vocab
+    def __init__(self, vocab: Vocab):
+        self.vocab = vocab
         MeCab = try_mecab_import()
         self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")

+    def __reduce__(self):
+        return KoreanTokenizer, (self.vocab,)
+
     def __del__(self):
         self.mecab_tokenizer.__del__()

@@ -106,10 +109,4 @@ def check_spaces(text, tokens):
         yield False


-def pickle_korean(instance):
-    return Korean, tuple()
-
-
-copy_reg.pickle(Korean, pickle_korean)
-
 __all__ = ["Korean"]
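Note that KoreanTokenizer's `__reduce__` stores only `(self.vocab,)`: presumably the wrapped MeCab object is not picklable, so it is simply re-created by `try_mecab_import()` when `__init__` runs again during unpickling. A toy sketch of that pattern (stand-in class, not spaCy's code; a threading.Lock plays the unpicklable native handle):

import pickle
import threading


class HandleBackedTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab
        self.handle = threading.Lock()  # stand-in for an unpicklable resource

    def __reduce__(self):
        # Persist only the picklable constructor argument; the handle is
        # rebuilt when __init__ runs on the unpickling side.
        return HandleBackedTokenizer, (self.vocab,)


tok = HandleBackedTokenizer({"lang": "ko"})
restored = pickle.loads(pickle.dumps(tok))
assert restored.vocab == {"lang": "ko"} and restored.handle is not tok.handle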
spacy/lang/th/__init__.py

@@ -3,6 +3,7 @@ from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab


 DEFAULT_CONFIG = """
@@ -16,13 +17,13 @@ DEFAULT_CONFIG = """
 @registry.tokenizers("spacy.th.ThaiTokenizer")
 def create_thai_tokenizer():
     def thai_tokenizer_factory(nlp):
-        return ThaiTokenizer(nlp)
+        return ThaiTokenizer(nlp.vocab)

     return thai_tokenizer_factory


 class ThaiTokenizer(DummyTokenizer):
-    def __init__(self, nlp: Language) -> None:
+    def __init__(self, vocab: Vocab) -> None:
         try:
             from pythainlp.tokenize import word_tokenize
         except ImportError:
@@ -31,7 +32,7 @@ class ThaiTokenizer(DummyTokenizer):
                 "https://github.com/PyThaiNLP/pythainlp"
             ) from None
         self.word_tokenize = word_tokenize
-        self.vocab = nlp.vocab
+        self.vocab = vocab

     def __call__(self, text: str) -> Doc:
         words = list(self.word_tokenize(text))
spacy/lang/vi/__init__.py

@@ -9,6 +9,7 @@ from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab
 from ... import util


@@ -24,14 +25,14 @@ use_pyvi = true
 @registry.tokenizers("spacy.vi.VietnameseTokenizer")
 def create_vietnamese_tokenizer(use_pyvi: bool = True):
     def vietnamese_tokenizer_factory(nlp):
-        return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
+        return VietnameseTokenizer(nlp.vocab, use_pyvi=use_pyvi)

     return vietnamese_tokenizer_factory


 class VietnameseTokenizer(DummyTokenizer):
-    def __init__(self, nlp: Language, use_pyvi: bool = False):
-        self.vocab = nlp.vocab
+    def __init__(self, vocab: Vocab, use_pyvi: bool = False):
+        self.vocab = vocab
         self.use_pyvi = use_pyvi
         if self.use_pyvi:
             try:
@@ -45,6 +46,9 @@ class VietnameseTokenizer(DummyTokenizer):
                 )
                 raise ImportError(msg) from None

+    def __reduce__(self):
+        return VietnameseTokenizer, (self.vocab, self.use_pyvi)
+
     def __call__(self, text: str) -> Doc:
         if self.use_pyvi:
             words = self.pyvi_tokenize(text)
spacy/lang/zh/__init__.py

@@ -11,6 +11,7 @@ from ...scorer import Scorer
 from ...tokens import Doc
 from ...training import validate_examples, Example
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ...vocab import Vocab
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from ... import util
@@ -48,14 +49,14 @@ class Segmenter(str, Enum):
 @registry.tokenizers("spacy.zh.ChineseTokenizer")
 def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
     def chinese_tokenizer_factory(nlp):
-        return ChineseTokenizer(nlp, segmenter=segmenter)
+        return ChineseTokenizer(nlp.vocab, segmenter=segmenter)

     return chinese_tokenizer_factory


 class ChineseTokenizer(DummyTokenizer):
-    def __init__(self, nlp: Language, segmenter: Segmenter = Segmenter.char):
-        self.vocab = nlp.vocab
+    def __init__(self, vocab: Vocab, segmenter: Segmenter = Segmenter.char):
+        self.vocab = vocab
         if isinstance(segmenter, Segmenter):
             segmenter = segmenter.value
         self.segmenter = segmenter
spacy/tests/lang/ja/test_serialize.py

@@ -1,3 +1,5 @@
+import pickle
+
 from spacy.lang.ja import Japanese
 from ...util import make_tempdir

@@ -31,3 +33,9 @@ def test_ja_tokenizer_serialize(ja_tokenizer):
         nlp_r.from_disk(d)
         assert nlp_bytes == nlp_r.to_bytes()
         assert nlp_r.tokenizer.split_mode == "B"
+
+
+def test_ja_tokenizer_pickle(ja_tokenizer):
+    b = pickle.dumps(ja_tokenizer)
+    ja_tokenizer_re = pickle.loads(b)
+    assert ja_tokenizer.to_bytes() == ja_tokenizer_re.to_bytes()
spacy/tests/lang/ko/test_serialize.py (new file, 24 lines)

@@ -0,0 +1,24 @@
+import pickle
+
+from spacy.lang.ko import Korean
+from ...util import make_tempdir
+
+
+def test_ko_tokenizer_serialize(ko_tokenizer):
+    tokenizer_bytes = ko_tokenizer.to_bytes()
+    nlp = Korean()
+    nlp.tokenizer.from_bytes(tokenizer_bytes)
+    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+    with make_tempdir() as d:
+        file_path = d / "tokenizer"
+        ko_tokenizer.to_disk(file_path)
+        nlp = Korean()
+        nlp.tokenizer.from_disk(file_path)
+        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+
+def test_ko_tokenizer_pickle(ko_tokenizer):
+    b = pickle.dumps(ko_tokenizer)
+    ko_tokenizer_re = pickle.loads(b)
+    assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()
spacy/tests/lang/th/test_serialize.py (new file, 24 lines)

@@ -0,0 +1,24 @@
+import pickle
+
+from spacy.lang.th import Thai
+from ...util import make_tempdir
+
+
+def test_th_tokenizer_serialize(th_tokenizer):
+    tokenizer_bytes = th_tokenizer.to_bytes()
+    nlp = Thai()
+    nlp.tokenizer.from_bytes(tokenizer_bytes)
+    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+    with make_tempdir() as d:
+        file_path = d / "tokenizer"
+        th_tokenizer.to_disk(file_path)
+        nlp = Thai()
+        nlp.tokenizer.from_disk(file_path)
+        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+
+def test_th_tokenizer_pickle(th_tokenizer):
+    b = pickle.dumps(th_tokenizer)
+    th_tokenizer_re = pickle.loads(b)
+    assert th_tokenizer.to_bytes() == th_tokenizer_re.to_bytes()
spacy/tests/lang/vi/test_serialize.py

@@ -1,3 +1,5 @@
+import pickle
+
 from spacy.lang.vi import Vietnamese
 from ...util import make_tempdir

@@ -31,3 +33,9 @@ def test_vi_tokenizer_serialize(vi_tokenizer):
         nlp_r.from_disk(d)
         assert nlp_bytes == nlp_r.to_bytes()
         assert nlp_r.tokenizer.use_pyvi is False
+
+
+def test_vi_tokenizer_pickle(vi_tokenizer):
+    b = pickle.dumps(vi_tokenizer)
+    vi_tokenizer_re = pickle.loads(b)
+    assert vi_tokenizer.to_bytes() == vi_tokenizer_re.to_bytes()