mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Update Chinese tokenizer for spacy-pkuseg fork
This commit is contained in:
parent
549758f67d
commit
5d19dfc9d3
|
@ -17,8 +17,7 @@ from ... import util
|
|||
|
||||
|
||||
# fmt: off
|
||||
_PKUSEG_INSTALL_MSG = "install pkuseg and pickle5 with `pip install pkuseg==0.0.25 pickle5`"
|
||||
_PKUSEG_PICKLE_WARNING = "Failed to force pkuseg model to use pickle protocol 4. If you're saving this model with python 3.8, it may not work with python 3.6-3.7."
|
||||
_PKUSEG_INSTALL_MSG = "install spacy-pkuseg with `pip install spacy-pkuseg==0.0.26`"
|
||||
# fmt: on
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
|
@ -120,12 +119,12 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
if self.segmenter == Segmenter.pkuseg:
|
||||
if reset:
|
||||
try:
|
||||
import pkuseg
|
||||
import spacy_pkuseg
|
||||
|
||||
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
|
||||
self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(None)
|
||||
except ImportError:
|
||||
msg = (
|
||||
"pkuseg not installed: unable to reset pkuseg "
|
||||
"spacy_pkuseg not installed: unable to reset pkuseg "
|
||||
"user dict. Please " + _PKUSEG_INSTALL_MSG
|
||||
)
|
||||
raise ImportError(msg) from None
|
||||
|
@ -156,22 +155,6 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
self.pkuseg_seg.feature_extractor.save(tempdir)
|
||||
self.pkuseg_seg.model.save(tempdir)
|
||||
tempdir = Path(tempdir)
|
||||
# pkuseg saves features.pkl with pickle.HIGHEST_PROTOCOL, which
|
||||
# means that it will be saved with pickle protocol 5 with
|
||||
# python 3.8, which can't be reloaded with python 3.6-3.7.
|
||||
# To try to make the model compatible with python 3.6+, reload
|
||||
# the data with pickle5 and convert it back to protocol 4.
|
||||
try:
|
||||
import pickle5
|
||||
|
||||
with open(tempdir / "features.pkl", "rb") as fileh:
|
||||
features = pickle5.load(fileh)
|
||||
with open(tempdir / "features.pkl", "wb") as fileh:
|
||||
pickle5.dump(features, fileh, protocol=4)
|
||||
except ImportError as e:
|
||||
raise e
|
||||
except Exception:
|
||||
warnings.warn(_PKUSEG_PICKLE_WARNING)
|
||||
with open(tempdir / "features.pkl", "rb") as fileh:
|
||||
pkuseg_features_b = fileh.read()
|
||||
with open(tempdir / "weights.npz", "rb") as fileh:
|
||||
|
@ -218,17 +201,17 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
with open(tempdir / "weights.npz", "wb") as fileh:
|
||||
fileh.write(pkuseg_data["weights_b"])
|
||||
try:
|
||||
import pkuseg
|
||||
import spacy_pkuseg
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"pkuseg not installed. To use this model, "
|
||||
"spacy_pkuseg not installed. To use this model, "
|
||||
+ _PKUSEG_INSTALL_MSG
|
||||
) from None
|
||||
self.pkuseg_seg = pkuseg.pkuseg(str(tempdir))
|
||||
self.pkuseg_seg = spacy_pkuseg.pkuseg(str(tempdir))
|
||||
if pkuseg_data["processors_data"]:
|
||||
processors_data = pkuseg_data["processors_data"]
|
||||
(user_dict, do_process, common_words, other_words) = processors_data
|
||||
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
|
||||
self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict)
|
||||
self.pkuseg_seg.postprocesser.do_process = do_process
|
||||
self.pkuseg_seg.postprocesser.common_words = set(common_words)
|
||||
self.pkuseg_seg.postprocesser.other_words = set(other_words)
|
||||
|
@ -244,18 +227,6 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
path.mkdir(parents=True)
|
||||
self.pkuseg_seg.model.save(path)
|
||||
self.pkuseg_seg.feature_extractor.save(path)
|
||||
# try to convert features.pkl to pickle protocol 4
|
||||
try:
|
||||
import pickle5
|
||||
|
||||
with open(path / "features.pkl", "rb") as fileh:
|
||||
features = pickle5.load(fileh)
|
||||
with open(path / "features.pkl", "wb") as fileh:
|
||||
pickle5.dump(features, fileh, protocol=4)
|
||||
except ImportError as e:
|
||||
raise e
|
||||
except Exception:
|
||||
warnings.warn(_PKUSEG_PICKLE_WARNING)
|
||||
|
||||
def save_pkuseg_processors(path):
|
||||
if self.pkuseg_seg:
|
||||
|
@ -279,26 +250,26 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
|
||||
def load_pkuseg_model(path):
|
||||
try:
|
||||
import pkuseg
|
||||
import spacy_pkuseg
|
||||
except ImportError:
|
||||
if self.segmenter == Segmenter.pkuseg:
|
||||
raise ImportError(
|
||||
"pkuseg not installed. To use this model, "
|
||||
"spacy_pkuseg not installed. To use this model, "
|
||||
+ _PKUSEG_INSTALL_MSG
|
||||
) from None
|
||||
if path.exists():
|
||||
self.pkuseg_seg = pkuseg.pkuseg(path)
|
||||
self.pkuseg_seg = spacy_pkuseg.pkuseg(path)
|
||||
|
||||
def load_pkuseg_processors(path):
|
||||
try:
|
||||
import pkuseg
|
||||
import spacy_pkuseg
|
||||
except ImportError:
|
||||
if self.segmenter == Segmenter.pkuseg:
|
||||
raise ImportError(self._pkuseg_install_msg) from None
|
||||
if self.segmenter == Segmenter.pkuseg:
|
||||
data = srsly.read_msgpack(path)
|
||||
(user_dict, do_process, common_words, other_words) = data
|
||||
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
|
||||
self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict)
|
||||
self.pkuseg_seg.postprocesser.do_process = do_process
|
||||
self.pkuseg_seg.postprocesser.common_words = set(common_words)
|
||||
self.pkuseg_seg.postprocesser.other_words = set(other_words)
|
||||
|
@ -341,12 +312,13 @@ def try_jieba_import() -> None:
|
|||
|
||||
def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None:
|
||||
try:
|
||||
import pkuseg
|
||||
import spacy_pkuseg
|
||||
|
||||
return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
|
||||
except ImportError:
|
||||
msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
|
||||
msg = "spacy_pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
|
||||
raise ImportError(msg) from None
|
||||
try:
|
||||
return spacy_pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
|
||||
except FileNotFoundError:
|
||||
msg = "Unable to load pkuseg model from: " + pkuseg_model
|
||||
raise FileNotFoundError(msg) from None
|
||||
|
|
|
@ -285,8 +285,7 @@ def zh_tokenizer_jieba():
|
|||
|
||||
@pytest.fixture(scope="session")
|
||||
def zh_tokenizer_pkuseg():
|
||||
pytest.importorskip("pkuseg")
|
||||
pytest.importorskip("pickle5")
|
||||
pytest.importorskip("spacy_pkuseg")
|
||||
config = {
|
||||
"nlp": {
|
||||
"tokenizer": {
|
||||
|
@ -296,7 +295,7 @@ def zh_tokenizer_pkuseg():
|
|||
},
|
||||
"initialize": {
|
||||
"tokenizer": {
|
||||
"pkuseg_model": "default",
|
||||
"pkuseg_model": "web",
|
||||
}
|
||||
},
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user