From 11e195d3ed1b138a882d90385210c78d9575febc Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sun, 27 Sep 2020 14:00:18 +0200 Subject: [PATCH 1/4] Update ChineseTokenizer * Allow `pkuseg_model` to be set to `None` on initialization * Don't save config within tokenizer * Force convert pkuseg_model to use pickle protocol 4 by reencoding with `pickle5` on serialization * Update pkuseg serialization test --- spacy/errors.py | 13 ++++-- spacy/lang/zh/__init__.py | 60 ++++++++++++++------------- spacy/tests/lang/zh/test_serialize.py | 11 ++--- 3 files changed, 46 insertions(+), 38 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 640419182..aad49e1ad 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -670,10 +670,15 @@ class Errors: "'{token_attrs}'.") E999 = ("Unable to merge the `Doc` objects because they do not all share " "the same `Vocab`.") - E1000 = ("No pkuseg model available. Provide a pkuseg model when " - "initializing the pipeline:\n" - 'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\n' - 'nlp = Chinese(config=cfg)') + E1000 = ("The Chinese word segmenter is pkuseg but no pkuseg model was " + "specified. Provide the name of a pretrained model or the path to " + "a model when initializing the pipeline:\n" + 'config = {\n' + ' "@tokenizers": "spacy.zh.ChineseTokenizer",\n' + ' "segmenter": "pkuseg",\n' + ' "pkuseg_model": "default", # or "/path/to/pkuseg_model" \n' + '}\n' + 'nlp = Chinese.from_config({"nlp": {"tokenizer": config}})') E1001 = ("Target token outside of matched span for match with tokens " "'{span}' and offset '{index}' matched by patterns '{patterns}'.") E1002 = ("Span index out of range.") diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 5d3bd2a96..d222e78f2 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -15,7 +15,8 @@ from .stop_words import STOP_WORDS from ... import util -_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python" +_PKUSEG_INSTALL_MSG = "install pkuseg and pickle5 with `pip install pkuseg==0.0.25 pickle5`" +_PKUSEG_PICKLE_WARNING = "Failed to force pkuseg model to use pickle protocol 4. If you're saving this model with python 3.8, it may not work with python 3.6-3.7. DEFAULT_CONFIG = """ [nlp] @@ -64,7 +65,7 @@ class ChineseTokenizer(DummyTokenizer): pkuseg_user_dict: Optional[str] = None, ): self.vocab = nlp.vocab - if isinstance(segmenter, Segmenter): # we might have the Enum here + if isinstance(segmenter, Segmenter): segmenter = segmenter.value self.segmenter = segmenter self.pkuseg_model = pkuseg_model @@ -136,18 +137,6 @@ class ChineseTokenizer(DummyTokenizer): warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter) warnings.warn(warn_msg) - def _get_config(self) -> Dict[str, Any]: - return { - "segmenter": self.segmenter, - "pkuseg_model": self.pkuseg_model, - "pkuseg_user_dict": self.pkuseg_user_dict, - } - - def _set_config(self, config: Dict[str, Any] = {}) -> None: - self.segmenter = config.get("segmenter", Segmenter.char) - self.pkuseg_model = config.get("pkuseg_model", None) - self.pkuseg_user_dict = config.get("pkuseg_user_dict", "default") - def to_bytes(self, **kwargs): pkuseg_features_b = b"" pkuseg_weights_b = b"" @@ -157,6 +146,20 @@ class ChineseTokenizer(DummyTokenizer): self.pkuseg_seg.feature_extractor.save(tempdir) self.pkuseg_seg.model.save(tempdir) tempdir = Path(tempdir) + # pkuseg saves features.pkl with pickle.HIGHEST_PROTOCOL, which + # means that it will be saved with pickle protocol 5 with + # python 3.8, which can't be reloaded with python 3.6-3.7. + # To try to make the model compatible with python 3.6+, reload + # the data with pickle5 and convert it back to protocol 4. + try: + import pickle5 + + with open(tempdir / "features.pkl", "rb") as fileh: + features = pickle5.load(fileh) + with open(tempdir / "features.pkl", "wb") as fileh: + pickle5.dump(features, fileh, protocol=4) + except: + warnings.warn(_PKUSEG_PICKLE_WARNING) with open(tempdir / "features.pkl", "rb") as fileh: pkuseg_features_b = fileh.read() with open(tempdir / "weights.npz", "rb") as fileh: @@ -168,7 +171,6 @@ class ChineseTokenizer(DummyTokenizer): sorted(list(self.pkuseg_seg.postprocesser.other_words)), ) serializers = { - "cfg": lambda: srsly.json_dumps(self._get_config()), "pkuseg_features": lambda: pkuseg_features_b, "pkuseg_weights": lambda: pkuseg_weights_b, "pkuseg_processors": lambda: srsly.msgpack_dumps(pkuseg_processors_data), @@ -188,7 +190,6 @@ class ChineseTokenizer(DummyTokenizer): pkuseg_data["processors_data"] = srsly.msgpack_loads(b) deserializers = { - "cfg": lambda b: self._set_config(srsly.json_loads(b)), "pkuseg_features": deserialize_pkuseg_features, "pkuseg_weights": deserialize_pkuseg_weights, "pkuseg_processors": deserialize_pkuseg_processors, @@ -229,6 +230,16 @@ class ChineseTokenizer(DummyTokenizer): path.mkdir(parents=True) self.pkuseg_seg.model.save(path) self.pkuseg_seg.feature_extractor.save(path) + # try to convert features.pkl to pickle protocol 4 + try: + import pickle5 + + with open(path / "features.pkl", "rb") as fileh: + features = pickle5.load(fileh) + with open(path / "features.pkl", "wb") as fileh: + pickle5.dump(features, fileh, protocol=4) + except: + warnings.warn(_PKUSEG_PICKLE_WARNING) def save_pkuseg_processors(path): if self.pkuseg_seg: @@ -241,7 +252,6 @@ class ChineseTokenizer(DummyTokenizer): srsly.write_msgpack(path, data) serializers = { - "cfg": lambda p: srsly.write_json(p, self._get_config()), "pkuseg_model": lambda p: save_pkuseg_model(p), "pkuseg_processors": lambda p: save_pkuseg_processors(p), } @@ -277,7 +287,6 @@ class ChineseTokenizer(DummyTokenizer): self.pkuseg_seg.postprocesser.other_words = set(other_words) serializers = { - "cfg": lambda p: self._set_config(srsly.read_json(p)), "pkuseg_model": lambda p: load_pkuseg_model(p), "pkuseg_processors": lambda p: load_pkuseg_processors(p), } @@ -314,21 +323,14 @@ def try_jieba_import(segmenter: str) -> None: raise ImportError(msg) from None -def try_pkuseg_import(segmenter: str, pkuseg_model: str, pkuseg_user_dict: str) -> None: +def try_pkuseg_import(segmenter: str, pkuseg_model: Optional[str], pkuseg_user_dict: str) -> None: try: import pkuseg - if pkuseg_model: + if pkuseg_model is None: + return None + else: return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict) - elif segmenter == Segmenter.pkuseg: - msg = ( - "The Chinese word segmenter is 'pkuseg' but no pkuseg model " - "was specified. Please provide the name of a pretrained model " - "or the path to a model with:\n" - 'cfg = {"nlp": {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path }}\n' - "nlp = Chinese.from_config(cfg)" - ) - raise ValueError(msg) except ImportError: if segmenter == Segmenter.pkuseg: msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG diff --git a/spacy/tests/lang/zh/test_serialize.py b/spacy/tests/lang/zh/test_serialize.py index 1c6fdf419..5491314e2 100644 --- a/spacy/tests/lang/zh/test_serialize.py +++ b/spacy/tests/lang/zh/test_serialize.py @@ -27,9 +27,10 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba): @pytest.mark.slow def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg): - nlp = Chinese( - meta={ - "tokenizer": {"config": {"segmenter": "pkuseg", "pkuseg_model": "medicine"}} - } - ) + config = { + "@tokenizers": "spacy.zh.ChineseTokenizer", + "segmenter": "pkuseg", + "pkuseg_model": "medicine", + } + nlp = Chinese.from_config({"nlp": {"tokenizer": config}}) zh_tokenizer_serialize(nlp.tokenizer) From 54fe8719355534ec1dd51b20252bf154c25a8be3 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sun, 27 Sep 2020 14:37:28 +0200 Subject: [PATCH 2/4] Fix formatting, refactor pickle5 exceptions --- spacy/lang/zh/__init__.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index d222e78f2..f9887a4df 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional, List, Dict, Any +from typing import Optional, List from enum import Enum import tempfile import srsly @@ -16,7 +16,7 @@ from ... import util _PKUSEG_INSTALL_MSG = "install pkuseg and pickle5 with `pip install pkuseg==0.0.25 pickle5`" -_PKUSEG_PICKLE_WARNING = "Failed to force pkuseg model to use pickle protocol 4. If you're saving this model with python 3.8, it may not work with python 3.6-3.7. +_PKUSEG_PICKLE_WARNING = "Failed to force pkuseg model to use pickle protocol 4. If you're saving this model with python 3.8, it may not work with python 3.6-3.7." DEFAULT_CONFIG = """ [nlp] @@ -158,7 +158,9 @@ class ChineseTokenizer(DummyTokenizer): features = pickle5.load(fileh) with open(tempdir / "features.pkl", "wb") as fileh: pickle5.dump(features, fileh, protocol=4) - except: + except ImportError as e: + raise(e) + except Exception: warnings.warn(_PKUSEG_PICKLE_WARNING) with open(tempdir / "features.pkl", "rb") as fileh: pkuseg_features_b = fileh.read() @@ -238,7 +240,9 @@ class ChineseTokenizer(DummyTokenizer): features = pickle5.load(fileh) with open(path / "features.pkl", "wb") as fileh: pickle5.dump(features, fileh, protocol=4) - except: + except ImportError as e: + raise(e) + except Exception: warnings.warn(_PKUSEG_PICKLE_WARNING) def save_pkuseg_processors(path): From 8393dbedad3e122638fc996719e8d29611dd9a24 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sun, 27 Sep 2020 15:15:53 +0200 Subject: [PATCH 3/4] Minor fixes * Put `cfg` back in serialization * Add `pickle5` to pytest conf --- spacy/lang/zh/__init__.py | 18 +++++++++++++++++- spacy/tests/conftest.py | 1 + 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index f9887a4df..69c7b644d 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional, List +from typing import Optional, List, Dict, Any from enum import Enum import tempfile import srsly @@ -137,6 +137,18 @@ class ChineseTokenizer(DummyTokenizer): warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter) warnings.warn(warn_msg) + def _get_config(self) -> Dict[str, Any]: + return { + "segmenter": self.segmenter, + "pkuseg_model": self.pkuseg_model, + "pkuseg_user_dict": self.pkuseg_user_dict, + } + + def _set_config(self, config: Dict[str, Any] = {}) -> None: + self.segmenter = config.get("segmenter", Segmenter.char) + self.pkuseg_model = config.get("pkuseg_model", None) + self.pkuseg_user_dict = config.get("pkuseg_user_dict", "default") + def to_bytes(self, **kwargs): pkuseg_features_b = b"" pkuseg_weights_b = b"" @@ -173,6 +185,7 @@ class ChineseTokenizer(DummyTokenizer): sorted(list(self.pkuseg_seg.postprocesser.other_words)), ) serializers = { + "cfg": lambda: srsly.json_dumps(self._get_config()), "pkuseg_features": lambda: pkuseg_features_b, "pkuseg_weights": lambda: pkuseg_weights_b, "pkuseg_processors": lambda: srsly.msgpack_dumps(pkuseg_processors_data), @@ -192,6 +205,7 @@ class ChineseTokenizer(DummyTokenizer): pkuseg_data["processors_data"] = srsly.msgpack_loads(b) deserializers = { + "cfg": lambda b: self._set_config(srsly.json_loads(b)), "pkuseg_features": deserialize_pkuseg_features, "pkuseg_weights": deserialize_pkuseg_weights, "pkuseg_processors": deserialize_pkuseg_processors, @@ -256,6 +270,7 @@ class ChineseTokenizer(DummyTokenizer): srsly.write_msgpack(path, data) serializers = { + "cfg": lambda p: srsly.write_json(p, self._get_config()), "pkuseg_model": lambda p: save_pkuseg_model(p), "pkuseg_processors": lambda p: save_pkuseg_processors(p), } @@ -291,6 +306,7 @@ class ChineseTokenizer(DummyTokenizer): self.pkuseg_seg.postprocesser.other_words = set(other_words) serializers = { + "cfg": lambda p: self._set_config(srsly.read_json(p)), "pkuseg_model": lambda p: load_pkuseg_model(p), "pkuseg_processors": lambda p: load_pkuseg_processors(p), } diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 3a9a1f26b..23fc5e98f 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -282,6 +282,7 @@ def zh_tokenizer_jieba(): @pytest.fixture(scope="session") def zh_tokenizer_pkuseg(): pytest.importorskip("pkuseg") + pytest.importorskip("pickle5") config = { "@tokenizers": "spacy.zh.ChineseTokenizer", "segmenter": "pkuseg", From 09d42d4bf0e4fd08229b372a1e81bc486ee1a699 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 28 Sep 2020 09:49:59 +0200 Subject: [PATCH 4/4] Add pickle5 to Makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index d44063f83..a180063b9 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ SHELL := /bin/bash ifndef SPACY_EXTRAS -override SPACY_EXTRAS = spacy-lookups-data==0.4.0.dev0 jieba pkuseg==0.0.25 sudachipy sudachidict_core +override SPACY_EXTRAS = spacy-lookups-data==0.4.0.dev0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core endif ifndef PYVER