mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Update Chinese tokenizer for spacy-pkuseg fork
This commit is contained in:
parent
549758f67d
commit
5d19dfc9d3
|
@ -17,8 +17,7 @@ from ... import util
|
||||||
|
|
||||||
|
|
||||||
# fmt: off
|
# fmt: off
|
||||||
_PKUSEG_INSTALL_MSG = "install pkuseg and pickle5 with `pip install pkuseg==0.0.25 pickle5`"
|
_PKUSEG_INSTALL_MSG = "install spacy-pkuseg with `pip install spacy-pkuseg==0.0.26`"
|
||||||
_PKUSEG_PICKLE_WARNING = "Failed to force pkuseg model to use pickle protocol 4. If you're saving this model with python 3.8, it may not work with python 3.6-3.7."
|
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
DEFAULT_CONFIG = """
|
DEFAULT_CONFIG = """
|
||||||
|
@ -120,12 +119,12 @@ class ChineseTokenizer(DummyTokenizer):
|
||||||
if self.segmenter == Segmenter.pkuseg:
|
if self.segmenter == Segmenter.pkuseg:
|
||||||
if reset:
|
if reset:
|
||||||
try:
|
try:
|
||||||
import pkuseg
|
import spacy_pkuseg
|
||||||
|
|
||||||
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
|
self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(None)
|
||||||
except ImportError:
|
except ImportError:
|
||||||
msg = (
|
msg = (
|
||||||
"pkuseg not installed: unable to reset pkuseg "
|
"spacy_pkuseg not installed: unable to reset pkuseg "
|
||||||
"user dict. Please " + _PKUSEG_INSTALL_MSG
|
"user dict. Please " + _PKUSEG_INSTALL_MSG
|
||||||
)
|
)
|
||||||
raise ImportError(msg) from None
|
raise ImportError(msg) from None
|
||||||
|
@ -156,22 +155,6 @@ class ChineseTokenizer(DummyTokenizer):
|
||||||
self.pkuseg_seg.feature_extractor.save(tempdir)
|
self.pkuseg_seg.feature_extractor.save(tempdir)
|
||||||
self.pkuseg_seg.model.save(tempdir)
|
self.pkuseg_seg.model.save(tempdir)
|
||||||
tempdir = Path(tempdir)
|
tempdir = Path(tempdir)
|
||||||
# pkuseg saves features.pkl with pickle.HIGHEST_PROTOCOL, which
|
|
||||||
# means that it will be saved with pickle protocol 5 with
|
|
||||||
# python 3.8, which can't be reloaded with python 3.6-3.7.
|
|
||||||
# To try to make the model compatible with python 3.6+, reload
|
|
||||||
# the data with pickle5 and convert it back to protocol 4.
|
|
||||||
try:
|
|
||||||
import pickle5
|
|
||||||
|
|
||||||
with open(tempdir / "features.pkl", "rb") as fileh:
|
|
||||||
features = pickle5.load(fileh)
|
|
||||||
with open(tempdir / "features.pkl", "wb") as fileh:
|
|
||||||
pickle5.dump(features, fileh, protocol=4)
|
|
||||||
except ImportError as e:
|
|
||||||
raise e
|
|
||||||
except Exception:
|
|
||||||
warnings.warn(_PKUSEG_PICKLE_WARNING)
|
|
||||||
with open(tempdir / "features.pkl", "rb") as fileh:
|
with open(tempdir / "features.pkl", "rb") as fileh:
|
||||||
pkuseg_features_b = fileh.read()
|
pkuseg_features_b = fileh.read()
|
||||||
with open(tempdir / "weights.npz", "rb") as fileh:
|
with open(tempdir / "weights.npz", "rb") as fileh:
|
||||||
|
@ -218,17 +201,17 @@ class ChineseTokenizer(DummyTokenizer):
|
||||||
with open(tempdir / "weights.npz", "wb") as fileh:
|
with open(tempdir / "weights.npz", "wb") as fileh:
|
||||||
fileh.write(pkuseg_data["weights_b"])
|
fileh.write(pkuseg_data["weights_b"])
|
||||||
try:
|
try:
|
||||||
import pkuseg
|
import spacy_pkuseg
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"pkuseg not installed. To use this model, "
|
"spacy_pkuseg not installed. To use this model, "
|
||||||
+ _PKUSEG_INSTALL_MSG
|
+ _PKUSEG_INSTALL_MSG
|
||||||
) from None
|
) from None
|
||||||
self.pkuseg_seg = pkuseg.pkuseg(str(tempdir))
|
self.pkuseg_seg = spacy_pkuseg.pkuseg(str(tempdir))
|
||||||
if pkuseg_data["processors_data"]:
|
if pkuseg_data["processors_data"]:
|
||||||
processors_data = pkuseg_data["processors_data"]
|
processors_data = pkuseg_data["processors_data"]
|
||||||
(user_dict, do_process, common_words, other_words) = processors_data
|
(user_dict, do_process, common_words, other_words) = processors_data
|
||||||
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
|
self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict)
|
||||||
self.pkuseg_seg.postprocesser.do_process = do_process
|
self.pkuseg_seg.postprocesser.do_process = do_process
|
||||||
self.pkuseg_seg.postprocesser.common_words = set(common_words)
|
self.pkuseg_seg.postprocesser.common_words = set(common_words)
|
||||||
self.pkuseg_seg.postprocesser.other_words = set(other_words)
|
self.pkuseg_seg.postprocesser.other_words = set(other_words)
|
||||||
|
@ -244,18 +227,6 @@ class ChineseTokenizer(DummyTokenizer):
|
||||||
path.mkdir(parents=True)
|
path.mkdir(parents=True)
|
||||||
self.pkuseg_seg.model.save(path)
|
self.pkuseg_seg.model.save(path)
|
||||||
self.pkuseg_seg.feature_extractor.save(path)
|
self.pkuseg_seg.feature_extractor.save(path)
|
||||||
# try to convert features.pkl to pickle protocol 4
|
|
||||||
try:
|
|
||||||
import pickle5
|
|
||||||
|
|
||||||
with open(path / "features.pkl", "rb") as fileh:
|
|
||||||
features = pickle5.load(fileh)
|
|
||||||
with open(path / "features.pkl", "wb") as fileh:
|
|
||||||
pickle5.dump(features, fileh, protocol=4)
|
|
||||||
except ImportError as e:
|
|
||||||
raise e
|
|
||||||
except Exception:
|
|
||||||
warnings.warn(_PKUSEG_PICKLE_WARNING)
|
|
||||||
|
|
||||||
def save_pkuseg_processors(path):
|
def save_pkuseg_processors(path):
|
||||||
if self.pkuseg_seg:
|
if self.pkuseg_seg:
|
||||||
|
@ -279,26 +250,26 @@ class ChineseTokenizer(DummyTokenizer):
|
||||||
|
|
||||||
def load_pkuseg_model(path):
|
def load_pkuseg_model(path):
|
||||||
try:
|
try:
|
||||||
import pkuseg
|
import spacy_pkuseg
|
||||||
except ImportError:
|
except ImportError:
|
||||||
if self.segmenter == Segmenter.pkuseg:
|
if self.segmenter == Segmenter.pkuseg:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"pkuseg not installed. To use this model, "
|
"spacy_pkuseg not installed. To use this model, "
|
||||||
+ _PKUSEG_INSTALL_MSG
|
+ _PKUSEG_INSTALL_MSG
|
||||||
) from None
|
) from None
|
||||||
if path.exists():
|
if path.exists():
|
||||||
self.pkuseg_seg = pkuseg.pkuseg(path)
|
self.pkuseg_seg = spacy_pkuseg.pkuseg(path)
|
||||||
|
|
||||||
def load_pkuseg_processors(path):
|
def load_pkuseg_processors(path):
|
||||||
try:
|
try:
|
||||||
import pkuseg
|
import spacy_pkuseg
|
||||||
except ImportError:
|
except ImportError:
|
||||||
if self.segmenter == Segmenter.pkuseg:
|
if self.segmenter == Segmenter.pkuseg:
|
||||||
raise ImportError(self._pkuseg_install_msg) from None
|
raise ImportError(self._pkuseg_install_msg) from None
|
||||||
if self.segmenter == Segmenter.pkuseg:
|
if self.segmenter == Segmenter.pkuseg:
|
||||||
data = srsly.read_msgpack(path)
|
data = srsly.read_msgpack(path)
|
||||||
(user_dict, do_process, common_words, other_words) = data
|
(user_dict, do_process, common_words, other_words) = data
|
||||||
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
|
self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict)
|
||||||
self.pkuseg_seg.postprocesser.do_process = do_process
|
self.pkuseg_seg.postprocesser.do_process = do_process
|
||||||
self.pkuseg_seg.postprocesser.common_words = set(common_words)
|
self.pkuseg_seg.postprocesser.common_words = set(common_words)
|
||||||
self.pkuseg_seg.postprocesser.other_words = set(other_words)
|
self.pkuseg_seg.postprocesser.other_words = set(other_words)
|
||||||
|
@ -341,12 +312,13 @@ def try_jieba_import() -> None:
|
||||||
|
|
||||||
def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None:
|
def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None:
|
||||||
try:
|
try:
|
||||||
import pkuseg
|
import spacy_pkuseg
|
||||||
|
|
||||||
return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
|
msg = "spacy_pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
|
||||||
raise ImportError(msg) from None
|
raise ImportError(msg) from None
|
||||||
|
try:
|
||||||
|
return spacy_pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
msg = "Unable to load pkuseg model from: " + pkuseg_model
|
msg = "Unable to load pkuseg model from: " + pkuseg_model
|
||||||
raise FileNotFoundError(msg) from None
|
raise FileNotFoundError(msg) from None
|
||||||
|
|
|
@ -285,8 +285,7 @@ def zh_tokenizer_jieba():
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def zh_tokenizer_pkuseg():
|
def zh_tokenizer_pkuseg():
|
||||||
pytest.importorskip("pkuseg")
|
pytest.importorskip("spacy_pkuseg")
|
||||||
pytest.importorskip("pickle5")
|
|
||||||
config = {
|
config = {
|
||||||
"nlp": {
|
"nlp": {
|
||||||
"tokenizer": {
|
"tokenizer": {
|
||||||
|
@ -296,7 +295,7 @@ def zh_tokenizer_pkuseg():
|
||||||
},
|
},
|
||||||
"initialize": {
|
"initialize": {
|
||||||
"tokenizer": {
|
"tokenizer": {
|
||||||
"pkuseg_model": "default",
|
"pkuseg_model": "web",
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user