Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-25 17:36:30 +03:00)

Merge pull request #6203 from adrianeboyd/feature/zh-spacy-pkuseg
Commit: 6abfc2911d

Makefile (2 changes):
@@ -1,7 +1,7 @@
 SHELL := /bin/bash
 
 ifndef SPACY_EXTRAS
-override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
+override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba spacy-pkuseg==0.0.26 sudachipy sudachidict_core
 endif
 
 ifndef PYVER
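The Makefile hunk above swaps the pinned extra from pkuseg==0.0.25 (plus pickle5) to spacy-pkuseg==0.0.26; the remaining hunks below update the Chinese tokenizer (ChineseTokenizer) and its test fixtures accordingly. A minimal, hedged check that the re-pinned extras resolve after installation (the module names are taken from the diff; __version__ may not exist on every package):

import importlib

# Hedged sketch: confirm the packages listed in SPACY_EXTRAS import under the
# expected module names; falls back to "unknown" if __version__ is missing.
for mod in ("spacy_pkuseg", "jieba", "sudachipy"):
    module = importlib.import_module(mod)
    print(mod, getattr(module, "__version__", "unknown"))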
@@ -17,8 +17,7 @@ from ... import util
 
 
 # fmt: off
-_PKUSEG_INSTALL_MSG = "install pkuseg and pickle5 with `pip install pkuseg==0.0.25 pickle5`"
-_PKUSEG_PICKLE_WARNING = "Failed to force pkuseg model to use pickle protocol 4. If you're saving this model with python 3.8, it may not work with python 3.6-3.7."
+_PKUSEG_INSTALL_MSG = "install spacy-pkuseg with `pip install spacy-pkuseg==0.0.26`"
 # fmt: on
 
 DEFAULT_CONFIG = """
@@ -82,9 +81,11 @@ class ChineseTokenizer(DummyTokenizer):
         *,
         nlp: Optional[Language] = None,
         pkuseg_model: Optional[str] = None,
-        pkuseg_user_dict: str = "default",
+        pkuseg_user_dict: Optional[str] = "default",
     ):
         if self.segmenter == Segmenter.pkuseg:
+            if pkuseg_user_dict is None:
+                pkuseg_user_dict = pkuseg_model
             self.pkuseg_seg = try_pkuseg_import(
                 pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict,
             )
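The widened pkuseg_user_dict annotation pairs with the new None fallback: leaving the user dict unset makes the tokenizer reuse the model name, so a custom model directory can supply both. A standalone sketch of that fallback (resolve_user_dict is a hypothetical helper, not part of the diff):

# Hypothetical helper mirroring the branch added above: None means "use the
# dictionary that belongs to the chosen pkuseg model".
def resolve_user_dict(pkuseg_model, pkuseg_user_dict="default"):
    if pkuseg_user_dict is None:
        pkuseg_user_dict = pkuseg_model
    return pkuseg_user_dict

assert resolve_user_dict("/path/to/model", None) == "/path/to/model"
assert resolve_user_dict("/path/to/model") == "default"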
@@ -120,12 +121,12 @@ class ChineseTokenizer(DummyTokenizer):
         if self.segmenter == Segmenter.pkuseg:
             if reset:
                 try:
-                    import pkuseg
+                    import spacy_pkuseg
 
-                    self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
+                    self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(None)
                 except ImportError:
                     msg = (
-                        "pkuseg not installed: unable to reset pkuseg "
+                        "spacy_pkuseg not installed: unable to reset pkuseg "
                         "user dict. Please " + _PKUSEG_INSTALL_MSG
                     )
                     raise ImportError(msg) from None
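This branch runs when the user dictionary is reset through the tokenizer's update method. A hedged usage sketch, assuming `nlp` is a Chinese pipeline already initialized with segmenter="pkuseg" and that pkuseg_update_user_dict is the public wrapper around this code:

# Hedged sketch: add custom entries, then reset, which exercises the
# spacy_pkuseg.Preprocesser(None) path above.
nlp.tokenizer.pkuseg_update_user_dict(["自然语言处理", "依存句法"])
nlp.tokenizer.pkuseg_update_user_dict([], reset=True)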
@@ -156,23 +157,7 @@ class ChineseTokenizer(DummyTokenizer):
                 self.pkuseg_seg.feature_extractor.save(tempdir)
                 self.pkuseg_seg.model.save(tempdir)
                 tempdir = Path(tempdir)
-                # pkuseg saves features.pkl with pickle.HIGHEST_PROTOCOL, which
-                # means that it will be saved with pickle protocol 5 with
-                # python 3.8, which can't be reloaded with python 3.6-3.7.
-                # To try to make the model compatible with python 3.6+, reload
-                # the data with pickle5 and convert it back to protocol 4.
-                try:
-                    import pickle5
-
-                    with open(tempdir / "features.pkl", "rb") as fileh:
-                        features = pickle5.load(fileh)
-                    with open(tempdir / "features.pkl", "wb") as fileh:
-                        pickle5.dump(features, fileh, protocol=4)
-                except ImportError as e:
-                    raise e
-                except Exception:
-                    warnings.warn(_PKUSEG_PICKLE_WARNING)
-                with open(tempdir / "features.pkl", "rb") as fileh:
+                with open(tempdir / "features.msgpack", "rb") as fileh:
                     pkuseg_features_b = fileh.read()
                 with open(tempdir / "weights.npz", "rb") as fileh:
                     pkuseg_weights_b = fileh.read()
@@ -213,22 +198,22 @@ class ChineseTokenizer(DummyTokenizer):
         if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
             with tempfile.TemporaryDirectory() as tempdir:
                 tempdir = Path(tempdir)
-                with open(tempdir / "features.pkl", "wb") as fileh:
+                with open(tempdir / "features.msgpack", "wb") as fileh:
                     fileh.write(pkuseg_data["features_b"])
                 with open(tempdir / "weights.npz", "wb") as fileh:
                     fileh.write(pkuseg_data["weights_b"])
                 try:
-                    import pkuseg
+                    import spacy_pkuseg
                 except ImportError:
                     raise ImportError(
-                        "pkuseg not installed. To use this model, "
+                        "spacy_pkuseg not installed. To use this model, "
                         + _PKUSEG_INSTALL_MSG
                     ) from None
-                self.pkuseg_seg = pkuseg.pkuseg(str(tempdir))
+                self.pkuseg_seg = spacy_pkuseg.pkuseg(str(tempdir))
         if pkuseg_data["processors_data"]:
             processors_data = pkuseg_data["processors_data"]
             (user_dict, do_process, common_words, other_words) = processors_data
-            self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
+            self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict)
             self.pkuseg_seg.postprocesser.do_process = do_process
             self.pkuseg_seg.postprocesser.common_words = set(common_words)
             self.pkuseg_seg.postprocesser.other_words = set(other_words)
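With the pickle5 shim removed, the pkuseg state travels as features.msgpack plus weights.npz inside the tokenizer's byte payload, so no pickle-protocol conversion is needed across Python versions. A hedged round-trip sketch, again assuming an initialized pkuseg-backed `nlp`:

# Hedged sketch of the to_bytes/from_bytes round trip modified above.
tok_bytes = nlp.tokenizer.to_bytes()
nlp.tokenizer.from_bytes(tok_bytes)  # restores the pkuseg model and processors in place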
@@ -244,18 +229,6 @@ class ChineseTokenizer(DummyTokenizer):
                     path.mkdir(parents=True)
                 self.pkuseg_seg.model.save(path)
                 self.pkuseg_seg.feature_extractor.save(path)
-                # try to convert features.pkl to pickle protocol 4
-                try:
-                    import pickle5
-
-                    with open(path / "features.pkl", "rb") as fileh:
-                        features = pickle5.load(fileh)
-                    with open(path / "features.pkl", "wb") as fileh:
-                        pickle5.dump(features, fileh, protocol=4)
-                except ImportError as e:
-                    raise e
-                except Exception:
-                    warnings.warn(_PKUSEG_PICKLE_WARNING)
 
         def save_pkuseg_processors(path):
             if self.pkuseg_seg:
@@ -279,26 +252,26 @@ class ChineseTokenizer(DummyTokenizer):
 
         def load_pkuseg_model(path):
             try:
-                import pkuseg
+                import spacy_pkuseg
             except ImportError:
                 if self.segmenter == Segmenter.pkuseg:
                     raise ImportError(
-                        "pkuseg not installed. To use this model, "
+                        "spacy_pkuseg not installed. To use this model, "
                         + _PKUSEG_INSTALL_MSG
                     ) from None
             if path.exists():
-                self.pkuseg_seg = pkuseg.pkuseg(path)
+                self.pkuseg_seg = spacy_pkuseg.pkuseg(path)
 
         def load_pkuseg_processors(path):
             try:
-                import pkuseg
+                import spacy_pkuseg
             except ImportError:
                 if self.segmenter == Segmenter.pkuseg:
                     raise ImportError(self._pkuseg_install_msg) from None
             if self.segmenter == Segmenter.pkuseg:
                 data = srsly.read_msgpack(path)
                 (user_dict, do_process, common_words, other_words) = data
-                self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
+                self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict)
                 self.pkuseg_seg.postprocesser.do_process = do_process
                 self.pkuseg_seg.postprocesser.common_words = set(common_words)
                 self.pkuseg_seg.postprocesser.other_words = set(other_words)
@@ -341,12 +314,13 @@ def try_jieba_import() -> None:
 
 def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None:
     try:
-        import pkuseg
+        import spacy_pkuseg
 
-        return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
     except ImportError:
-        msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
+        msg = "spacy_pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
         raise ImportError(msg) from None
+    try:
+        return spacy_pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
     except FileNotFoundError:
         msg = "Unable to load pkuseg model from: " + pkuseg_model
         raise FileNotFoundError(msg) from None
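The rewritten helper separates the two failure modes: a missing spacy_pkuseg package raises ImportError with the install hint, while a bad model path now surfaces as FileNotFoundError from the second try block. A small sketch of the second path (the model path is made up):

# Hedged sketch: with spacy_pkuseg installed but a nonexistent model path,
# the helper raises FileNotFoundError carrying the path.
try:
    try_pkuseg_import(pkuseg_model="/no/such/model", pkuseg_user_dict="default")
except FileNotFoundError as err:
    print(err)  # Unable to load pkuseg model from: /no/such/model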
@@ -284,8 +284,7 @@ def zh_tokenizer_jieba():
 
 @pytest.fixture(scope="session")
 def zh_tokenizer_pkuseg():
-    pytest.importorskip("pkuseg")
-    pytest.importorskip("pickle5")
+    pytest.importorskip("spacy_pkuseg")
     config = {
         "nlp": {
             "tokenizer": {
@@ -295,7 +294,7 @@ def zh_tokenizer_pkuseg():
         },
         "initialize": {
             "tokenizer": {
-                "pkuseg_model": "default",
+                "pkuseg_model": "web",
             }
         },
     }
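Outside the test suite, the same setup can be expressed through the Chinese language class. A hedged end-to-end sketch based on the fixture above and the initialize signature earlier in the diff (the "web" model name mirrors the test, spacy-pkuseg==0.0.26 must be installed, and the example sentence is arbitrary):

# Hedged usage sketch: build a Chinese pipeline with the pkuseg segmenter and
# load the "web" model through the tokenizer's initialize hook.
from spacy.lang.zh import Chinese

nlp = Chinese.from_config({"nlp": {"tokenizer": {"segmenter": "pkuseg"}}})
nlp.tokenizer.initialize(pkuseg_model="web")
print([token.text for token in nlp("我喜欢自然语言处理")])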