Try 'context' concept in ChineseTokenizer

This commit is contained in:
Matthew Honnibal 2020-09-26 23:19:48 +02:00
parent f168822857
commit 6f0eeefefd

View File

@ -71,9 +71,9 @@ class ChineseTokenizer(DummyTokenizer):
self.pkuseg_user_dict = pkuseg_user_dict self.pkuseg_user_dict = pkuseg_user_dict
self.pkuseg_seg = None self.pkuseg_seg = None
self.jieba_seg = None self.jieba_seg = None
self.configure_segmenter(segmenter) self.configure_segmenter(segmenter, _context=nlp._context)
def configure_segmenter(self, segmenter: str): def configure_segmenter(self, segmenter: str, *, _context: str=""):
if segmenter not in Segmenter.values(): if segmenter not in Segmenter.values():
warn_msg = Warnings.W103.format( warn_msg = Warnings.W103.format(
lang="Chinese", lang="Chinese",
@ -84,11 +84,14 @@ class ChineseTokenizer(DummyTokenizer):
warnings.warn(warn_msg) warnings.warn(warn_msg)
self.segmenter = Segmenter.char self.segmenter = Segmenter.char
self.jieba_seg = try_jieba_import(self.segmenter) self.jieba_seg = try_jieba_import(self.segmenter)
self.pkuseg_seg = try_pkuseg_import( if _context == "loading":
self.segmenter, self.pkuseg_seg = None
pkuseg_model=self.pkuseg_model, else:
pkuseg_user_dict=self.pkuseg_user_dict, self.pkuseg_seg = try_pkuseg_import(
) self.segmenter,
pkuseg_model=self.pkuseg_model,
pkuseg_user_dict=self.pkuseg_user_dict
)
def __call__(self, text: str) -> Doc: def __call__(self, text: str) -> Doc:
if self.segmenter == Segmenter.jieba: if self.segmenter == Segmenter.jieba: