mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Delay loading of mecab in Korean tokenizer (#10295)
* Delay loading of mecab in Korean tokenizer

  Delay loading of mecab until the tokenizer is called for the first time, so that it's possible to initialize a blank `ko` pipeline without having mecab installed, e.g. for use with `spacy init vectors`.

* Move mecab import back to __init__

  Move the mecab import back to `__init__` so that users are warned about missing Python dependencies at the same point as before.
This commit is contained in:
parent
3854ab901f
commit
da7520a83c
|
@@ -31,15 +31,24 @@ def create_tokenizer():
|
||||||
class KoreanTokenizer(DummyTokenizer):
|
class KoreanTokenizer(DummyTokenizer):
|
||||||
def __init__(self, vocab: Vocab):
|
def __init__(self, vocab: Vocab):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
MeCab = try_mecab_import() # type: ignore[func-returns-value]
|
self._mecab = try_mecab_import() # type: ignore[func-returns-value]
|
||||||
self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
|
self._mecab_tokenizer = None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def mecab_tokenizer(self):
|
||||||
|
# This is a property so that initializing a pipeline with blank:ko is
|
||||||
|
# possible without actually requiring mecab-ko, e.g. to run
|
||||||
|
# `spacy init vectors ko` for a pipeline that will have a different
|
||||||
|
# tokenizer in the end. The languages need to match for the vectors
|
||||||
|
# to be imported and there's no way to pass a custom config to
|
||||||
|
# `init vectors`.
|
||||||
|
if self._mecab_tokenizer is None:
|
||||||
|
self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
|
||||||
|
return self._mecab_tokenizer
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return KoreanTokenizer, (self.vocab,)
|
return KoreanTokenizer, (self.vocab,)
|
||||||
|
|
||||||
def __del__(self):
|
|
||||||
self.mecab_tokenizer.__del__()
|
|
||||||
|
|
||||||
def __call__(self, text: str) -> Doc:
|
def __call__(self, text: str) -> Doc:
|
||||||
dtokens = list(self.detailed_tokens(text))
|
dtokens = list(self.detailed_tokens(text))
|
||||||
surfaces = [dt["surface"] for dt in dtokens]
|
surfaces = [dt["surface"] for dt in dtokens]
|
||||||
|
@@ -90,7 +99,8 @@ def try_mecab_import() -> None:
|
||||||
return MeCab
|
return MeCab
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
|
"The Korean tokenizer (\"spacy.ko.KoreanTokenizer\") requires "
|
||||||
|
"[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
|
||||||
"[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
|
"[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
|
||||||
"and [natto-py](https://github.com/buruzaemon/natto-py)"
|
"and [natto-py](https://github.com/buruzaemon/natto-py)"
|
||||||
) from None
|
) from None
|
||||||
|
|
Loading…
Reference in New Issue
Block a user