Merge pull request #6203 from adrianeboyd/feature/zh-spacy-pkuseg

This commit is contained in:
Ines Montani 2020-10-05 21:35:57 +02:00 committed by GitHub
commit 6abfc2911d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 25 additions and 52 deletions

View File

@@ -1,7 +1,7 @@
SHELL := /bin/bash
ifndef SPACY_EXTRAS
override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba spacy-pkuseg==0.0.26 sudachipy sudachidict_core
endif
ifndef PYVER

View File

@@ -17,8 +17,7 @@ from ... import util
# fmt: off
_PKUSEG_INSTALL_MSG = "install pkuseg and pickle5 with `pip install pkuseg==0.0.25 pickle5`"
_PKUSEG_PICKLE_WARNING = "Failed to force pkuseg model to use pickle protocol 4. If you're saving this model with python 3.8, it may not work with python 3.6-3.7."
_PKUSEG_INSTALL_MSG = "install spacy-pkuseg with `pip install spacy-pkuseg==0.0.26`"
# fmt: on
DEFAULT_CONFIG = """
@@ -82,9 +81,11 @@ class ChineseTokenizer(DummyTokenizer):
*,
nlp: Optional[Language] = None,
pkuseg_model: Optional[str] = None,
pkuseg_user_dict: str = "default",
pkuseg_user_dict: Optional[str] = "default",
):
if self.segmenter == Segmenter.pkuseg:
if pkuseg_user_dict is None:
pkuseg_user_dict = pkuseg_model
self.pkuseg_seg = try_pkuseg_import(
pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict,
)
@@ -120,12 +121,12 @@ class ChineseTokenizer(DummyTokenizer):
if self.segmenter == Segmenter.pkuseg:
if reset:
try:
import pkuseg
import spacy_pkuseg
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(None)
except ImportError:
msg = (
"pkuseg not installed: unable to reset pkuseg "
"spacy_pkuseg not installed: unable to reset pkuseg "
"user dict. Please " + _PKUSEG_INSTALL_MSG
)
raise ImportError(msg) from None
@@ -156,23 +157,7 @@ class ChineseTokenizer(DummyTokenizer):
self.pkuseg_seg.feature_extractor.save(tempdir)
self.pkuseg_seg.model.save(tempdir)
tempdir = Path(tempdir)
# pkuseg saves features.pkl with pickle.HIGHEST_PROTOCOL, which
# means that it will be saved with pickle protocol 5 with
# python 3.8, which can't be reloaded with python 3.6-3.7.
# To try to make the model compatible with python 3.6+, reload
# the data with pickle5 and convert it back to protocol 4.
try:
import pickle5
with open(tempdir / "features.pkl", "rb") as fileh:
features = pickle5.load(fileh)
with open(tempdir / "features.pkl", "wb") as fileh:
pickle5.dump(features, fileh, protocol=4)
except ImportError as e:
raise e
except Exception:
warnings.warn(_PKUSEG_PICKLE_WARNING)
with open(tempdir / "features.pkl", "rb") as fileh:
with open(tempdir / "features.msgpack", "rb") as fileh:
pkuseg_features_b = fileh.read()
with open(tempdir / "weights.npz", "rb") as fileh:
pkuseg_weights_b = fileh.read()
@@ -213,22 +198,22 @@ class ChineseTokenizer(DummyTokenizer):
if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
with tempfile.TemporaryDirectory() as tempdir:
tempdir = Path(tempdir)
with open(tempdir / "features.pkl", "wb") as fileh:
with open(tempdir / "features.msgpack", "wb") as fileh:
fileh.write(pkuseg_data["features_b"])
with open(tempdir / "weights.npz", "wb") as fileh:
fileh.write(pkuseg_data["weights_b"])
try:
import pkuseg
import spacy_pkuseg
except ImportError:
raise ImportError(
"pkuseg not installed. To use this model, "
"spacy_pkuseg not installed. To use this model, "
+ _PKUSEG_INSTALL_MSG
) from None
self.pkuseg_seg = pkuseg.pkuseg(str(tempdir))
self.pkuseg_seg = spacy_pkuseg.pkuseg(str(tempdir))
if pkuseg_data["processors_data"]:
processors_data = pkuseg_data["processors_data"]
(user_dict, do_process, common_words, other_words) = processors_data
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict)
self.pkuseg_seg.postprocesser.do_process = do_process
self.pkuseg_seg.postprocesser.common_words = set(common_words)
self.pkuseg_seg.postprocesser.other_words = set(other_words)
@@ -244,18 +229,6 @@ class ChineseTokenizer(DummyTokenizer):
path.mkdir(parents=True)
self.pkuseg_seg.model.save(path)
self.pkuseg_seg.feature_extractor.save(path)
# try to convert features.pkl to pickle protocol 4
try:
import pickle5
with open(path / "features.pkl", "rb") as fileh:
features = pickle5.load(fileh)
with open(path / "features.pkl", "wb") as fileh:
pickle5.dump(features, fileh, protocol=4)
except ImportError as e:
raise e
except Exception:
warnings.warn(_PKUSEG_PICKLE_WARNING)
def save_pkuseg_processors(path):
if self.pkuseg_seg:
@@ -279,26 +252,26 @@ class ChineseTokenizer(DummyTokenizer):
def load_pkuseg_model(path):
try:
import pkuseg
import spacy_pkuseg
except ImportError:
if self.segmenter == Segmenter.pkuseg:
raise ImportError(
"pkuseg not installed. To use this model, "
"spacy_pkuseg not installed. To use this model, "
+ _PKUSEG_INSTALL_MSG
) from None
if path.exists():
self.pkuseg_seg = pkuseg.pkuseg(path)
self.pkuseg_seg = spacy_pkuseg.pkuseg(path)
def load_pkuseg_processors(path):
try:
import pkuseg
import spacy_pkuseg
except ImportError:
if self.segmenter == Segmenter.pkuseg:
raise ImportError(self._pkuseg_install_msg) from None
if self.segmenter == Segmenter.pkuseg:
data = srsly.read_msgpack(path)
(user_dict, do_process, common_words, other_words) = data
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict)
self.pkuseg_seg.postprocesser.do_process = do_process
self.pkuseg_seg.postprocesser.common_words = set(common_words)
self.pkuseg_seg.postprocesser.other_words = set(other_words)
@@ -341,12 +314,13 @@ def try_jieba_import() -> None:
def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None:
try:
import pkuseg
import spacy_pkuseg
return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
except ImportError:
msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
msg = "spacy_pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
raise ImportError(msg) from None
try:
return spacy_pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
except FileNotFoundError:
msg = "Unable to load pkuseg model from: " + pkuseg_model
raise FileNotFoundError(msg) from None

View File

@@ -284,8 +284,7 @@ def zh_tokenizer_jieba():
@pytest.fixture(scope="session")
def zh_tokenizer_pkuseg():
pytest.importorskip("pkuseg")
pytest.importorskip("pickle5")
pytest.importorskip("spacy_pkuseg")
config = {
"nlp": {
"tokenizer": {
@@ -295,7 +294,7 @@ def zh_tokenizer_pkuseg():
},
"initialize": {
"tokenizer": {
"pkuseg_model": "default",
"pkuseg_model": "web",
}
},
}