From 5d19dfc9d32c7fd039118d9fe0f8cf713e7af471 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 5 Oct 2020 14:21:53 +0200 Subject: [PATCH 1/4] Update Chinese tokenizer for spacy-pkuseg fork --- spacy/lang/zh/__init__.py | 62 +++++++++++---------------------------- spacy/tests/conftest.py | 5 ++-- 2 files changed, 19 insertions(+), 48 deletions(-) diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 55a77330a..8864ae119 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -17,8 +17,7 @@ from ... import util # fmt: off -_PKUSEG_INSTALL_MSG = "install pkuseg and pickle5 with `pip install pkuseg==0.0.25 pickle5`" -_PKUSEG_PICKLE_WARNING = "Failed to force pkuseg model to use pickle protocol 4. If you're saving this model with python 3.8, it may not work with python 3.6-3.7." +_PKUSEG_INSTALL_MSG = "install spacy-pkuseg with `pip install spacy-pkuseg==0.0.26`" # fmt: on DEFAULT_CONFIG = """ @@ -120,12 +119,12 @@ class ChineseTokenizer(DummyTokenizer): if self.segmenter == Segmenter.pkuseg: if reset: try: - import pkuseg + import spacy_pkuseg - self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None) + self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(None) except ImportError: msg = ( - "pkuseg not installed: unable to reset pkuseg " + "spacy_pkuseg not installed: unable to reset pkuseg " "user dict. Please " + _PKUSEG_INSTALL_MSG ) raise ImportError(msg) from None @@ -156,22 +155,6 @@ class ChineseTokenizer(DummyTokenizer): self.pkuseg_seg.feature_extractor.save(tempdir) self.pkuseg_seg.model.save(tempdir) tempdir = Path(tempdir) - # pkuseg saves features.pkl with pickle.HIGHEST_PROTOCOL, which - # means that it will be saved with pickle protocol 5 with - # python 3.8, which can't be reloaded with python 3.6-3.7. - # To try to make the model compatible with python 3.6+, reload - # the data with pickle5 and convert it back to protocol 4. - try: - import pickle5 - - with open(tempdir / "features.pkl", "rb") as fileh: - features = pickle5.load(fileh) - with open(tempdir / "features.pkl", "wb") as fileh: - pickle5.dump(features, fileh, protocol=4) - except ImportError as e: - raise e - except Exception: - warnings.warn(_PKUSEG_PICKLE_WARNING) with open(tempdir / "features.pkl", "rb") as fileh: pkuseg_features_b = fileh.read() with open(tempdir / "weights.npz", "rb") as fileh: @@ -218,17 +201,17 @@ class ChineseTokenizer(DummyTokenizer): with open(tempdir / "weights.npz", "wb") as fileh: fileh.write(pkuseg_data["weights_b"]) try: - import pkuseg + import spacy_pkuseg except ImportError: raise ImportError( - "pkuseg not installed. To use this model, " + "spacy_pkuseg not installed. To use this model, " + _PKUSEG_INSTALL_MSG ) from None - self.pkuseg_seg = pkuseg.pkuseg(str(tempdir)) + self.pkuseg_seg = spacy_pkuseg.pkuseg(str(tempdir)) if pkuseg_data["processors_data"]: processors_data = pkuseg_data["processors_data"] (user_dict, do_process, common_words, other_words) = processors_data - self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict) + self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict) self.pkuseg_seg.postprocesser.do_process = do_process self.pkuseg_seg.postprocesser.common_words = set(common_words) self.pkuseg_seg.postprocesser.other_words = set(other_words) @@ -244,18 +227,6 @@ class ChineseTokenizer(DummyTokenizer): path.mkdir(parents=True) self.pkuseg_seg.model.save(path) self.pkuseg_seg.feature_extractor.save(path) - # try to convert features.pkl to pickle protocol 4 - try: - import pickle5 - - with open(path / "features.pkl", "rb") as fileh: - features = pickle5.load(fileh) - with open(path / "features.pkl", "wb") as fileh: - pickle5.dump(features, fileh, protocol=4) - except ImportError as e: - raise e - except Exception: - warnings.warn(_PKUSEG_PICKLE_WARNING) def save_pkuseg_processors(path): if self.pkuseg_seg: @@ -279,26 +250,26 @@ class ChineseTokenizer(DummyTokenizer): def load_pkuseg_model(path): try: - import pkuseg + import spacy_pkuseg except ImportError: if self.segmenter == Segmenter.pkuseg: raise ImportError( - "pkuseg not installed. To use this model, " + "spacy_pkuseg not installed. To use this model, " + _PKUSEG_INSTALL_MSG ) from None if path.exists(): - self.pkuseg_seg = pkuseg.pkuseg(path) + self.pkuseg_seg = spacy_pkuseg.pkuseg(path) def load_pkuseg_processors(path): try: - import pkuseg + import spacy_pkuseg except ImportError: if self.segmenter == Segmenter.pkuseg: raise ImportError(self._pkuseg_install_msg) from None if self.segmenter == Segmenter.pkuseg: data = srsly.read_msgpack(path) (user_dict, do_process, common_words, other_words) = data - self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict) + self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict) self.pkuseg_seg.postprocesser.do_process = do_process self.pkuseg_seg.postprocesser.common_words = set(common_words) self.pkuseg_seg.postprocesser.other_words = set(other_words) @@ -341,12 +312,13 @@ def try_jieba_import() -> None: def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None: try: - import pkuseg + import spacy_pkuseg - return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict) except ImportError: - msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG + msg = "spacy_pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG raise ImportError(msg) from None + try: + return spacy_pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict) except FileNotFoundError: msg = "Unable to load pkuseg model from: " + pkuseg_model raise FileNotFoundError(msg) from None diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 4a3d126d7..bb9f770bc 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -285,8 +285,7 @@ def zh_tokenizer_jieba(): @pytest.fixture(scope="session") def zh_tokenizer_pkuseg(): - pytest.importorskip("pkuseg") - pytest.importorskip("pickle5") + pytest.importorskip("spacy_pkuseg") config = { "nlp": { "tokenizer": { @@ -296,7 +295,7 @@ def zh_tokenizer_pkuseg(): }, "initialize": { "tokenizer": { - "pkuseg_model": "default", + "pkuseg_model": "web", } }, } From 187234648cfb20974cdbf79b0d8a477c0aaf36b3 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 5 Oct 2020 16:24:28 +0200 Subject: [PATCH 2/4] Revert back to "default" as default for pkuseg_user_dict --- spacy/lang/zh/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 8864ae119..5d4d55aed 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -81,9 +81,11 @@ class ChineseTokenizer(DummyTokenizer): *, nlp: Optional[Language] = None, pkuseg_model: Optional[str] = None, - pkuseg_user_dict: str = "default", + pkuseg_user_dict: Optional[str] = "default", ): if self.segmenter == Segmenter.pkuseg: + if pkuseg_user_dict is None: + pkuseg_user_dict = pkuseg_model self.pkuseg_seg = try_pkuseg_import( pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict, ) From f102ef6b54bbc0ddaf7c093dee7fcacaf667c2ed Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 5 Oct 2020 17:47:39 +0200 Subject: [PATCH 3/4] Read features.msgpack instead of features.pkl --- spacy/lang/zh/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 5d4d55aed..f9065f92c 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -157,7 +157,7 @@ class ChineseTokenizer(DummyTokenizer): self.pkuseg_seg.feature_extractor.save(tempdir) self.pkuseg_seg.model.save(tempdir) tempdir = Path(tempdir) - with open(tempdir / "features.pkl", "rb") as fileh: + with open(tempdir / "features.msgpack", "rb") as fileh: pkuseg_features_b = fileh.read() with open(tempdir / "weights.npz", "rb") as fileh: pkuseg_weights_b = fileh.read() @@ -198,7 +198,7 @@ class ChineseTokenizer(DummyTokenizer): if pkuseg_data["features_b"] and pkuseg_data["weights_b"]: with tempfile.TemporaryDirectory() as tempdir: tempdir = Path(tempdir) - with open(tempdir / "features.pkl", "wb") as fileh: + with open(tempdir / "features.msgpack", "wb") as fileh: fileh.write(pkuseg_data["features_b"]) with open(tempdir / "weights.npz", "wb") as fileh: fileh.write(pkuseg_data["weights_b"]) From d2806f11f2ad87b97a6571b6b71d5fe33f544ae0 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 5 Oct 2020 18:08:32 +0200 Subject: [PATCH 4/4] Update to spacy-pkuseg==0.0.26 in Makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index a4df0f8c8..3f10e79cc 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ SHELL := /bin/bash ifndef SPACY_EXTRAS -override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core +override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba spacy-pkuseg==0.0.26 sudachipy sudachidict_core endif ifndef PYVER