2020-09-27 15:37:28 +03:00
from typing import Optional , List
2020-07-22 14:42:59 +03:00
from enum import Enum
2020-04-18 18:01:53 +03:00
import tempfile
import srsly
2020-07-19 14:34:37 +03:00
import warnings
2020-04-18 18:01:53 +03:00
from pathlib import Path
2020-07-22 14:42:59 +03:00
from thinc . api import Config
2020-07-19 14:34:37 +03:00
from . . . errors import Warnings , Errors
2017-05-08 23:29:04 +03:00
from . . . language import Language
from . . . tokens import Doc
2020-07-22 14:42:59 +03:00
from . . . util import DummyTokenizer , registry
2019-11-11 16:23:21 +03:00
from . lex_attrs import LEX_ATTRS
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
from . stop_words import STOP_WORDS
2020-04-18 18:01:53 +03:00
from . . . import util
2020-09-27 15:00:18 +03:00
_PKUSEG_INSTALL_MSG = " install pkuseg and pickle5 with `pip install pkuseg==0.0.25 pickle5` "
2020-09-27 15:37:28 +03:00
_PKUSEG_PICKLE_WARNING = " Failed to force pkuseg model to use pickle protocol 4. If you ' re saving this model with python 3.8, it may not work with python 3.6-3.7. "
2016-04-24 19:44:24 +03:00
2020-07-22 14:42:59 +03:00
DEFAULT_CONFIG = """
[ nlp ]
2019-08-18 16:09:16 +03:00
2020-07-22 14:42:59 +03:00
[ nlp . tokenizer ]
2020-07-24 15:50:26 +03:00
@tokenizers = " spacy.zh.ChineseTokenizer "
2020-07-22 14:42:59 +03:00
segmenter = " char "
pkuseg_model = null
pkuseg_user_dict = " default "
"""
2020-04-18 18:01:53 +03:00
2019-11-11 16:23:21 +03:00
2020-07-22 14:42:59 +03:00
class Segmenter(str, Enum):
    """The word-segmentation backends supported for Chinese."""

    char = "char"
    jieba = "jieba"
    pkuseg = "pkuseg"

    @classmethod
    def values(cls):
        """Return all member names as a plain list of strings."""
        return [name for name in cls.__members__]
2020-04-18 18:01:53 +03:00
2020-07-22 14:42:59 +03:00
2020-07-24 15:50:26 +03:00
@registry.tokenizers("spacy.zh.ChineseTokenizer")
def create_chinese_tokenizer(
    segmenter: Segmenter = Segmenter.char,
    pkuseg_model: Optional[str] = None,
    pkuseg_user_dict: Optional[str] = "default",
):
    """Registered factory producing a ChineseTokenizer for a pipeline.

    The returned closure defers construction until the `nlp` object exists.
    """

    def tokenizer_factory(nlp):
        return ChineseTokenizer(
            nlp,
            segmenter=segmenter,
            pkuseg_model=pkuseg_model,
            pkuseg_user_dict=pkuseg_user_dict,
        )

    return tokenizer_factory
2020-04-18 18:01:53 +03:00
2019-11-11 16:23:21 +03:00
class ChineseTokenizer(DummyTokenizer):
    """Tokenizer for Chinese with three segmentation backends.

    segmenter:
        "char"   - split into individual characters (default, no extra deps)
        "jieba"  - word segmentation with the jieba library
        "pkuseg" - word segmentation with a trained pkuseg model
    """

    def __init__(
        self,
        nlp: Language,
        segmenter: Segmenter = Segmenter.char,
        pkuseg_model: Optional[str] = None,
        pkuseg_user_dict: Optional[str] = None,
    ):
        self.vocab = nlp.vocab
        # Store the plain string value so comparisons and serialization are
        # uniform regardless of whether an enum member or a string was passed.
        if isinstance(segmenter, Segmenter):
            segmenter = segmenter.value
        self.segmenter = segmenter
        self.pkuseg_model = pkuseg_model
        self.pkuseg_user_dict = pkuseg_user_dict
        self.pkuseg_seg = None
        self.jieba_seg = None
        self.configure_segmenter(segmenter)

    def configure_segmenter(self, segmenter: str):
        """Validate the segmenter choice and (re)initialize the backends.

        Falls back to character segmentation (with warning W103) when the
        requested segmenter is not one of the supported values.
        """
        if segmenter not in Segmenter.values():
            warn_msg = Warnings.W103.format(
                lang="Chinese",
                segmenter=segmenter,
                supported=", ".join(Segmenter.values()),
                default="'char' (character segmentation)",
            )
            warnings.warn(warn_msg)
            self.segmenter = Segmenter.char
        # Both helpers return None when the backend is unavailable and only
        # raise if that backend was explicitly requested.
        self.jieba_seg = try_jieba_import(self.segmenter)
        self.pkuseg_seg = try_pkuseg_import(
            self.segmenter,
            pkuseg_model=self.pkuseg_model,
            pkuseg_user_dict=self.pkuseg_user_dict,
        )

    def __call__(self, text: str) -> Doc:
        """Segment `text` and return a Doc aligned with the original spacing."""
        if self.segmenter == Segmenter.jieba:
            # Drop empty segments; jieba can emit them for whitespace runs.
            words = [x for x in self.jieba_seg.cut(text, cut_all=False) if x]
            (words, spaces) = util.get_words_and_spaces(words, text)
            return Doc(self.vocab, words=words, spaces=spaces)
        elif self.segmenter == Segmenter.pkuseg:
            if self.pkuseg_seg is None:
                raise ValueError(Errors.E1000)
            words = self.pkuseg_seg.cut(text)
            (words, spaces) = util.get_words_and_spaces(words, text)
            return Doc(self.vocab, words=words, spaces=spaces)

        # warn if segmenter setting is not the only remaining option "char"
        if self.segmenter != Segmenter.char:
            warn_msg = Warnings.W103.format(
                lang="Chinese",
                segmenter=self.segmenter,
                supported=", ".join(Segmenter.values()),
                default="'char' (character segmentation)",
            )
            warnings.warn(warn_msg)

        # split into individual characters
        words = list(text)
        (words, spaces) = util.get_words_and_spaces(words, text)
        return Doc(self.vocab, words=words, spaces=spaces)

    def pkuseg_update_user_dict(self, words: List[str], reset: bool = False):
        """Add `words` to the pkuseg user dictionary.

        If `reset` is True, the existing user dictionary is cleared first.
        Warns (W104) and does nothing when the segmenter is not pkuseg.
        """
        if self.segmenter == Segmenter.pkuseg:
            if reset:
                try:
                    import pkuseg

                    self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
                except ImportError:
                    msg = (
                        "pkuseg not installed: unable to reset pkuseg "
                        "user dict. Please " + _PKUSEG_INSTALL_MSG
                    )
                    raise ImportError(msg) from None
            for word in words:
                self.pkuseg_seg.preprocesser.insert(word.strip(), "")
        else:
            warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
            warnings.warn(warn_msg)

    def to_bytes(self, **kwargs):
        """Serialize the pkuseg model data (if any) to bytes."""
        pkuseg_features_b = b""
        pkuseg_weights_b = b""
        pkuseg_processors_data = None
        if self.pkuseg_seg:
            with tempfile.TemporaryDirectory() as tempdir:
                self.pkuseg_seg.feature_extractor.save(tempdir)
                self.pkuseg_seg.model.save(tempdir)
                tempdir = Path(tempdir)
                # pkuseg saves features.pkl with pickle.HIGHEST_PROTOCOL, which
                # means that it will be saved with pickle protocol 5 with
                # python 3.8, which can't be reloaded with python 3.6-3.7.
                # To try to make the model compatible with python 3.6+, reload
                # the data with pickle5 and convert it back to protocol 4.
                try:
                    import pickle5

                    with open(tempdir / "features.pkl", "rb") as fileh:
                        features = pickle5.load(fileh)
                    with open(tempdir / "features.pkl", "wb") as fileh:
                        pickle5.dump(features, fileh, protocol=4)
                except ImportError:
                    # pickle5 is required for the compatibility guarantee;
                    # re-raise so the caller knows it is missing.
                    raise
                except Exception:
                    warnings.warn(_PKUSEG_PICKLE_WARNING)
                with open(tempdir / "features.pkl", "rb") as fileh:
                    pkuseg_features_b = fileh.read()
                with open(tempdir / "weights.npz", "rb") as fileh:
                    pkuseg_weights_b = fileh.read()
                pkuseg_processors_data = (
                    _get_pkuseg_trie_data(self.pkuseg_seg.preprocesser.trie),
                    self.pkuseg_seg.postprocesser.do_process,
                    sorted(self.pkuseg_seg.postprocesser.common_words),
                    sorted(self.pkuseg_seg.postprocesser.other_words),
                )
        serializers = {
            "pkuseg_features": lambda: pkuseg_features_b,
            "pkuseg_weights": lambda: pkuseg_weights_b,
            "pkuseg_processors": lambda: srsly.msgpack_dumps(pkuseg_processors_data),
        }
        return util.to_bytes(serializers, [])

    def from_bytes(self, data, **kwargs):
        """Load pkuseg model data from bytes produced by `to_bytes`."""
        pkuseg_data = {"features_b": b"", "weights_b": b"", "processors_data": None}

        def deserialize_pkuseg_features(b):
            pkuseg_data["features_b"] = b

        def deserialize_pkuseg_weights(b):
            pkuseg_data["weights_b"] = b

        def deserialize_pkuseg_processors(b):
            pkuseg_data["processors_data"] = srsly.msgpack_loads(b)

        deserializers = {
            "pkuseg_features": deserialize_pkuseg_features,
            "pkuseg_weights": deserialize_pkuseg_weights,
            "pkuseg_processors": deserialize_pkuseg_processors,
        }
        util.from_bytes(data, deserializers, [])

        if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
            # pkuseg can only load from disk, so round-trip through a tempdir.
            with tempfile.TemporaryDirectory() as tempdir:
                tempdir = Path(tempdir)
                with open(tempdir / "features.pkl", "wb") as fileh:
                    fileh.write(pkuseg_data["features_b"])
                with open(tempdir / "weights.npz", "wb") as fileh:
                    fileh.write(pkuseg_data["weights_b"])
                try:
                    import pkuseg
                except ImportError:
                    raise ImportError(
                        "pkuseg not installed. To use this model, "
                        + _PKUSEG_INSTALL_MSG
                    ) from None
                self.pkuseg_seg = pkuseg.pkuseg(str(tempdir))
            if pkuseg_data["processors_data"]:
                processors_data = pkuseg_data["processors_data"]
                (user_dict, do_process, common_words, other_words) = processors_data
                self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
                self.pkuseg_seg.postprocesser.do_process = do_process
                self.pkuseg_seg.postprocesser.common_words = set(common_words)
                self.pkuseg_seg.postprocesser.other_words = set(other_words)
        return self

    def to_disk(self, path, **kwargs):
        """Save the pkuseg model and processor data (if any) under `path`."""
        path = util.ensure_path(path)

        def save_pkuseg_model(path):
            if self.pkuseg_seg:
                if not path.exists():
                    path.mkdir(parents=True)
                self.pkuseg_seg.model.save(path)
                self.pkuseg_seg.feature_extractor.save(path)
                # try to convert features.pkl to pickle protocol 4
                try:
                    import pickle5

                    with open(path / "features.pkl", "rb") as fileh:
                        features = pickle5.load(fileh)
                    with open(path / "features.pkl", "wb") as fileh:
                        pickle5.dump(features, fileh, protocol=4)
                except ImportError:
                    # pickle5 is required for the compatibility guarantee;
                    # re-raise so the caller knows it is missing.
                    raise
                except Exception:
                    warnings.warn(_PKUSEG_PICKLE_WARNING)

        def save_pkuseg_processors(path):
            if self.pkuseg_seg:
                data = (
                    _get_pkuseg_trie_data(self.pkuseg_seg.preprocesser.trie),
                    self.pkuseg_seg.postprocesser.do_process,
                    sorted(self.pkuseg_seg.postprocesser.common_words),
                    sorted(self.pkuseg_seg.postprocesser.other_words),
                )
                srsly.write_msgpack(path, data)

        serializers = {
            "pkuseg_model": lambda p: save_pkuseg_model(p),
            "pkuseg_processors": lambda p: save_pkuseg_processors(p),
        }
        return util.to_disk(path, serializers, [])

    def from_disk(self, path, **kwargs):
        """Load the pkuseg model and processor data (if present) from `path`."""
        path = util.ensure_path(path)

        def load_pkuseg_model(path):
            try:
                import pkuseg
            except ImportError:
                if self.segmenter == Segmenter.pkuseg:
                    raise ImportError(
                        "pkuseg not installed. To use this model, "
                        + _PKUSEG_INSTALL_MSG
                    ) from None
            if path.exists():
                self.pkuseg_seg = pkuseg.pkuseg(path)

        def load_pkuseg_processors(path):
            try:
                import pkuseg
            except ImportError:
                if self.segmenter == Segmenter.pkuseg:
                    # Bug fix: previously raised with self._pkuseg_install_msg,
                    # an attribute that is never defined on this class (it
                    # would raise AttributeError instead of ImportError). Use
                    # the module-level message, matching load_pkuseg_model.
                    raise ImportError(
                        "pkuseg not installed. To use this model, "
                        + _PKUSEG_INSTALL_MSG
                    ) from None
            if self.segmenter == Segmenter.pkuseg:
                data = srsly.read_msgpack(path)
                (user_dict, do_process, common_words, other_words) = data
                self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
                self.pkuseg_seg.postprocesser.do_process = do_process
                self.pkuseg_seg.postprocesser.common_words = set(common_words)
                self.pkuseg_seg.postprocesser.other_words = set(other_words)

        serializers = {
            "pkuseg_model": lambda p: load_pkuseg_model(p),
            "pkuseg_processors": lambda p: load_pkuseg_processors(p),
        }
        util.from_disk(path, serializers, [])
2019-11-11 16:23:21 +03:00
2017-12-28 12:13:58 +03:00
class ChineseDefaults(Language.Defaults):
    """Language defaults for Chinese: tokenizer config, lexical attributes,
    stop words and writing-system metadata."""

    config = Config().from_str(DEFAULT_CONFIG)
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS
    # Written left-to-right, no case distinction, no alphabetic letters.
    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
2017-12-28 12:13:58 +03:00
2019-03-11 19:10:50 +03:00
2016-04-24 19:44:24 +03:00
class Chinese(Language):
    """The Chinese language pipeline (ISO code "zh")."""

    lang = "zh"
    Defaults = ChineseDefaults
2016-05-05 12:39:12 +03:00
2020-07-22 14:42:59 +03:00
def try_jieba_import(segmenter: str):
    """Import jieba if available and return the module, else None.

    Note: the original `-> None` annotation was wrong — on success this
    returns the imported jieba module. Raises ImportError only when jieba
    is actually required, i.e. when `segmenter` is "jieba".
    """
    try:
        import jieba

        if segmenter == Segmenter.jieba:
            # segment a short text to have jieba initialize its cache in advance
            list(jieba.cut("作为", cut_all=False))

        return jieba
    except ImportError:
        if segmenter == Segmenter.jieba:
            msg = (
                "Jieba not installed. To use jieba, install it with `pip "
                "install jieba` or from https://github.com/fxsjy/jieba"
            )
            raise ImportError(msg) from None
        # jieba is optional for other segmenters: return None explicitly.
        return None
2020-07-22 14:42:59 +03:00
2020-09-27 15:00:18 +03:00
def try_pkuseg_import(segmenter: str, pkuseg_model: Optional[str], pkuseg_user_dict: str):
    """Import pkuseg and return a loaded pkuseg model, or None.

    Note: the original `-> None` annotation was wrong — on success this
    returns a `pkuseg.pkuseg` instance. Returns None when no model is
    configured or pkuseg is unavailable; errors are raised only when
    `segmenter` is "pkuseg".
    """
    try:
        import pkuseg

        if pkuseg_model is None:
            return None
        else:
            return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
    except ImportError:
        if segmenter == Segmenter.pkuseg:
            msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
            raise ImportError(msg) from None
    except FileNotFoundError:
        if segmenter == Segmenter.pkuseg:
            msg = "Unable to load pkuseg model from: " + pkuseg_model
            raise FileNotFoundError(msg) from None
2017-05-03 12:01:42 +03:00
2020-04-18 18:01:53 +03:00
def _get_pkuseg_trie_data ( node , path = " " ) :
data = [ ]
for c , child_node in sorted ( node . children . items ( ) ) :
data . extend ( _get_pkuseg_trie_data ( child_node , path + c ) )
if node . isword :
data . append ( ( path , node . usertag ) )
return data
2019-08-18 16:09:16 +03:00
__all__ = [ " Chinese " ]