Mirror of https://github.com/explosion/spaCy.git
Synced 2025-11-04 09:57:26 +03:00
	Merge pull request #6154 from adrianeboyd/bugfix/chinese-tokenizer-pickle
commit c3f8c09d7d

Makefile | 2 +-

--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 SHELL := /bin/bash
 
 ifndef SPACY_EXTRAS
-override SPACY_EXTRAS = spacy-lookups-data==0.4.0.dev0 jieba pkuseg==0.0.25 sudachipy sudachidict_core
+override SPACY_EXTRAS = spacy-lookups-data==0.4.0.dev0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
 endif
 
 ifndef PYVER
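
Note: pickle5 is added to SPACY_EXTRAS because of a pickle protocol
mismatch across Python versions. A quick stdlib-only check of the
underlying issue (nothing spaCy-specific assumed here):

    import pickle
    import sys

    # pickle.HIGHEST_PROTOCOL is 5 on Python 3.8+ but 4 on 3.6-3.7, so
    # data dumped with HIGHEST_PROTOCOL on 3.8 cannot be loaded by the
    # stdlib pickle on 3.6-3.7.
    print(sys.version_info[:2], pickle.HIGHEST_PROTOCOL)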

--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -670,10 +670,15 @@ class Errors:
             "'{token_attrs}'.")
     E999 = ("Unable to merge the `Doc` objects because they do not all share "
             "the same `Vocab`.")
-    E1000 = ("No pkuseg model available. Provide a pkuseg model when "
-             "initializing the pipeline:\n"
-             'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\n'
-             'nlp = Chinese(config=cfg)')
+    E1000 = ("The Chinese word segmenter is pkuseg but no pkuseg model was "
+             "specified. Provide the name of a pretrained model or the path to "
+             "a model when initializing the pipeline:\n"
+             'config = {\n'
+             '   "@tokenizers": "spacy.zh.ChineseTokenizer",\n'
+             '   "segmenter": "pkuseg",\n'
+             '   "pkuseg_model": "default", # or "/path/to/pkuseg_model" \n'
+             '}\n'
+             'nlp = Chinese.from_config({"nlp": {"tokenizer": config}})')
     E1001 = ("Target token outside of matched span for match with tokens "
              "'{span}' and offset '{index}' matched by patterns '{patterns}'.")
     E1002 = ("Span index out of range.")
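
Note: the snippet embedded in the new E1000 message is itself runnable.
A minimal sketch, assuming pkuseg and its default model data are
installed:

    from spacy.lang.zh import Chinese

    config = {
        "@tokenizers": "spacy.zh.ChineseTokenizer",
        "segmenter": "pkuseg",
        "pkuseg_model": "default",  # or "/path/to/pkuseg_model"
    }
    nlp = Chinese.from_config({"nlp": {"tokenizer": config}})
    print([t.text for t in nlp("西门子将努力参与中国的三峡工程建设。")])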

--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -17,7 +17,8 @@ from .stop_words import STOP_WORDS
 from ... import util
 
 
-_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python"
+_PKUSEG_INSTALL_MSG = "install pkuseg and pickle5 with `pip install pkuseg==0.0.25 pickle5`"
+_PKUSEG_PICKLE_WARNING = "Failed to force pkuseg model to use pickle protocol 4. If you're saving this model with python 3.8, it may not work with python 3.6-3.7."
 
 DEFAULT_CONFIG = """
 [nlp]
@@ -66,7 +67,7 @@ class ChineseTokenizer(DummyTokenizer):
         pkuseg_user_dict: Optional[str] = None,
     ):
         self.vocab = nlp.vocab
-        if isinstance(segmenter, Segmenter):  # we might have the Enum here
+        if isinstance(segmenter, Segmenter):
             segmenter = segmenter.value
         self.segmenter = segmenter
         self.pkuseg_model = pkuseg_model
@@ -163,6 +164,22 @@ class ChineseTokenizer(DummyTokenizer):
                 self.pkuseg_seg.feature_extractor.save(tempdir)
                 self.pkuseg_seg.model.save(tempdir)
                 tempdir = Path(tempdir)
+                # pkuseg saves features.pkl with pickle.HIGHEST_PROTOCOL, which
+                # means that it will be saved with pickle protocol 5 with
+                # python 3.8, which can't be reloaded with python 3.6-3.7.
+                # To try to make the model compatible with python 3.6+, reload
+                # the data with pickle5 and convert it back to protocol 4.
+                try:
+                    import pickle5
+
+                    with open(tempdir / "features.pkl", "rb") as fileh:
+                        features = pickle5.load(fileh)
+                    with open(tempdir / "features.pkl", "wb") as fileh:
+                        pickle5.dump(features, fileh, protocol=4)
+                except ImportError as e:
+                    raise(e)
+                except Exception:
+                    warnings.warn(_PKUSEG_PICKLE_WARNING)
                 with open(tempdir / "features.pkl", "rb") as fileh:
                     pkuseg_features_b = fileh.read()
                 with open(tempdir / "weights.npz", "rb") as fileh:
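
Note: the same downgrade works on any pickle file. A standalone sketch,
assuming the pickle5 backport is installed ("features.pkl" here is just
an example path):

    import pickle5

    # pickle5 reads every protocol, including 5; rewriting at protocol 4
    # keeps the file loadable on Python 3.6-3.7.
    with open("features.pkl", "rb") as fileh:
        data = pickle5.load(fileh)
    with open("features.pkl", "wb") as fileh:
        pickle5.dump(data, fileh, protocol=4)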
@@ -235,6 +252,18 @@ class ChineseTokenizer(DummyTokenizer):
                     path.mkdir(parents=True)
                 self.pkuseg_seg.model.save(path)
                 self.pkuseg_seg.feature_extractor.save(path)
+                # try to convert features.pkl to pickle protocol 4
+                try:
+                    import pickle5
+
+                    with open(path / "features.pkl", "rb") as fileh:
+                        features = pickle5.load(fileh)
+                    with open(path / "features.pkl", "wb") as fileh:
+                        pickle5.dump(features, fileh, protocol=4)
+                except ImportError as e:
+                    raise(e)
+                except Exception:
+                    warnings.warn(_PKUSEG_PICKLE_WARNING)
 
         def save_pkuseg_processors(path):
             if self.pkuseg_seg:
@@ -320,21 +349,14 @@ def try_jieba_import(segmenter: str) -> None:
             raise ImportError(msg) from None
 
 
-def try_pkuseg_import(segmenter: str, pkuseg_model: str, pkuseg_user_dict: str) -> None:
+def try_pkuseg_import(segmenter: str, pkuseg_model: Optional[str], pkuseg_user_dict: str) -> None:
     try:
         import pkuseg
 
-        if pkuseg_model:
+        if pkuseg_model is None:
+            return None
+        else:
             return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
-        elif segmenter == Segmenter.pkuseg:
-            msg = (
-                "The Chinese word segmenter is 'pkuseg' but no pkuseg model "
-                "was specified. Please provide the name of a pretrained model "
-                "or the path to a model with:\n"
-                'cfg = {"nlp": {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path }}\n'
-                "nlp = Chinese.from_config(cfg)"
-            )
-            raise ValueError(msg)
     except ImportError:
         if segmenter == Segmenter.pkuseg:
             msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
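
Note: with no model name, try_pkuseg_import now returns None instead of
raising, so the missing-model message lives in one place (the new E1000
above) rather than being duplicated here. A condensed, hypothetical
version of the new control flow with the spaCy plumbing stripped out:

    from typing import Optional

    def load_pkuseg(pkuseg_model: Optional[str], pkuseg_user_dict: str):
        import pkuseg  # raises ImportError if pkuseg is missing
        if pkuseg_model is None:
            # the caller decides whether a missing model is an error
            return None
        return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)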

--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -282,6 +282,7 @@ def zh_tokenizer_jieba():
 @pytest.fixture(scope="session")
 def zh_tokenizer_pkuseg():
     pytest.importorskip("pkuseg")
+    pytest.importorskip("pickle5")
     config = {
         "@tokenizers": "spacy.zh.ChineseTokenizer",
         "segmenter": "pkuseg",

--- a/spacy/tests/lang/zh/test_serialize.py
+++ b/spacy/tests/lang/zh/test_serialize.py
@@ -27,9 +27,10 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
 
 @pytest.mark.slow
 def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
-    nlp = Chinese(
-        meta={
-            "tokenizer": {"config": {"segmenter": "pkuseg", "pkuseg_model": "medicine"}}
-        }
-    )
+    config = {
+        "@tokenizers": "spacy.zh.ChineseTokenizer",
+        "segmenter": "pkuseg",
+        "pkuseg_model": "medicine",
+    }
+    nlp = Chinese.from_config({"nlp": {"tokenizer": config}})
     zh_tokenizer_serialize(nlp.tokenizer)
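
Note: zh_tokenizer_serialize is the shared helper defined earlier in
this test module; presumably it round-trips the tokenizer through
to_bytes/from_bytes, along these lines:

    def zh_tokenizer_serialize(zh_tokenizer):
        tokenizer_bytes = zh_tokenizer.to_bytes()
        nlp = Chinese()  # Chinese is already imported by the test module
        nlp.tokenizer.from_bytes(tokenizer_bytes)
        assert tokenizer_bytes == nlp.tokenizer.to_bytes()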