mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
Delay loading of mecab in Korean tokenizer (#10295)
* Delay loading of mecab in Korean tokenizer Delay loading of mecab until the tokenizer is called the first time so that it's possible to initialize a blank `ko` pipeline without having mecab installed, e.g. for use with `spacy init vectors`. * Move mecab import back to __init__ Move mecab import back to __init__ to warn users at the same point as before for missing python dependencies.
This commit is contained in:
parent
3854ab901f
commit
da7520a83c
|
@ -31,15 +31,24 @@ def create_tokenizer():
|
|||
class KoreanTokenizer(DummyTokenizer):
|
||||
def __init__(self, vocab: Vocab):
|
||||
self.vocab = vocab
|
||||
MeCab = try_mecab_import() # type: ignore[func-returns-value]
|
||||
self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
|
||||
self._mecab = try_mecab_import() # type: ignore[func-returns-value]
|
||||
self._mecab_tokenizer = None
|
||||
|
||||
@property
|
||||
def mecab_tokenizer(self):
|
||||
# This is a property so that initializing a pipeline with blank:ko is
|
||||
# possible without actually requiring mecab-ko, e.g. to run
|
||||
# `spacy init vectors ko` for a pipeline that will have a different
|
||||
# tokenizer in the end. The languages need to match for the vectors
|
||||
# to be imported and there's no way to pass a custom config to
|
||||
# `init vectors`.
|
||||
if self._mecab_tokenizer is None:
|
||||
self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
|
||||
return self._mecab_tokenizer
|
||||
|
||||
def __reduce__(self):
|
||||
return KoreanTokenizer, (self.vocab,)
|
||||
|
||||
def __del__(self):
|
||||
self.mecab_tokenizer.__del__()
|
||||
|
||||
def __call__(self, text: str) -> Doc:
|
||||
dtokens = list(self.detailed_tokens(text))
|
||||
surfaces = [dt["surface"] for dt in dtokens]
|
||||
|
@ -90,7 +99,8 @@ def try_mecab_import() -> None:
|
|||
return MeCab
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
|
||||
"The Korean tokenizer (\"spacy.ko.KoreanTokenizer\") requires "
|
||||
"[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
|
||||
"[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
|
||||
"and [natto-py](https://github.com/buruzaemon/natto-py)"
|
||||
) from None
|
||||
|
|
Loading…
Reference in New Issue
Block a user