Delay loading of mecab in Korean tokenizer (#10295)

* Delay loading of mecab in Korean tokenizer

Delay loading of mecab until the tokenizer is called the first time so
that it's possible to initialize a blank `ko` pipeline without having
mecab installed, e.g. for use with `spacy init vectors`.

* Move mecab import back to __init__

Move mecab import back to __init__ to warn users at the same point as
before for missing python dependencies.
This commit is contained in:
Adriane Boyd 2022-02-17 11:35:34 +01:00 committed by GitHub
parent 3854ab901f
commit da7520a83c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -31,15 +31,24 @@ def create_tokenizer():
class KoreanTokenizer(DummyTokenizer): class KoreanTokenizer(DummyTokenizer):
def __init__(self, vocab: Vocab): def __init__(self, vocab: Vocab):
self.vocab = vocab self.vocab = vocab
MeCab = try_mecab_import() # type: ignore[func-returns-value] self._mecab = try_mecab_import() # type: ignore[func-returns-value]
self.mecab_tokenizer = MeCab("-F%f[0],%f[7]") self._mecab_tokenizer = None
@property
def mecab_tokenizer(self):
# Lazily create and cache the tokenizer instance: on first access
# `self._mecab` is called with the "-F%f[0],%f[7]" argument and the
# result is stored in `self._mecab_tokenizer`; later accesses return
# the stored instance.
#
# This is a property so that initializing a pipeline with blank:ko is
# possible without actually requiring mecab-ko, e.g. to run
# `spacy init vectors ko` for a pipeline that will have a different
# tokenizer in the end. The languages need to match for the vectors
# to be imported and there's no way to pass a custom config to
# `init vectors`.
if self._mecab_tokenizer is None:
self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
return self._mecab_tokenizer
def __reduce__(self): def __reduce__(self):
return KoreanTokenizer, (self.vocab,) return KoreanTokenizer, (self.vocab,)
def __del__(self):
self.mecab_tokenizer.__del__()
def __call__(self, text: str) -> Doc: def __call__(self, text: str) -> Doc:
dtokens = list(self.detailed_tokens(text)) dtokens = list(self.detailed_tokens(text))
surfaces = [dt["surface"] for dt in dtokens] surfaces = [dt["surface"] for dt in dtokens]
@ -90,7 +99,8 @@ def try_mecab_import() -> None:
return MeCab return MeCab
except ImportError: except ImportError:
raise ImportError( raise ImportError(
"Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), " "The Korean tokenizer (\"spacy.ko.KoreanTokenizer\") requires "
"[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
"[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), " "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
"and [natto-py](https://github.com/buruzaemon/natto-py)" "and [natto-py](https://github.com/buruzaemon/natto-py)"
) from None ) from None