2017-05-08 16:54:36 +03:00
|
|
|
# coding: utf8
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
2020-04-18 18:01:53 +03:00
|
|
|
import tempfile
|
|
|
|
import srsly
|
|
|
|
from pathlib import Path
|
|
|
|
from collections import OrderedDict
|
2017-12-28 12:13:58 +03:00
|
|
|
from ...attrs import LANG
|
2017-05-08 23:29:04 +03:00
|
|
|
from ...language import Language
|
|
|
|
from ...tokens import Doc
|
2019-11-11 16:23:21 +03:00
|
|
|
from ...util import DummyTokenizer
|
2018-08-07 12:26:31 +03:00
|
|
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
2019-11-11 16:23:21 +03:00
|
|
|
from .lex_attrs import LEX_ATTRS
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
from .stop_words import STOP_WORDS
|
2019-08-12 11:37:48 +03:00
|
|
|
from .tag_map import TAG_MAP
|
2020-04-18 18:01:53 +03:00
|
|
|
from ... import util
|
|
|
|
|
|
|
|
|
2020-11-14 11:20:42 +03:00
|
|
|
# Shared installation hint that is appended to the "pkuseg not installed"
# ImportError messages raised in this module.
_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python"
|
2016-04-24 19:44:24 +03:00
|
|
|
|
2019-08-18 16:09:16 +03:00
|
|
|
|
2019-11-11 16:23:21 +03:00
|
|
|
def try_jieba_import(use_jieba):
    """Import jieba and warm it up, if it is available.

    use_jieba (bool): Whether jieba segmentation was requested. Controls
        whether a missing package is an error.
    RETURNS: The imported ``jieba`` module, or None if the import failed and
        jieba was not requested.
    RAISES: ImportError if the import failed and ``use_jieba`` is truthy.
    """
    try:
        import jieba as jieba_module

        # segment a short text to have jieba initialize its cache in advance
        for _ in jieba_module.cut("作为", cut_all=False):
            pass
        return jieba_module
    except ImportError:
        if not use_jieba:
            # jieba is optional unless explicitly requested
            return None
        raise ImportError(
            "Jieba not installed. Either set the default to False with "
            "`from spacy.lang.zh import ChineseDefaults; ChineseDefaults.use_jieba = False`, "
            "or install it with `pip install jieba` or from "
            "https://github.com/fxsjy/jieba"
        )
|
|
|
|
|
|
|
|
|
2020-04-18 18:01:53 +03:00
|
|
|
def try_pkuseg_import(use_pkuseg, pkuseg_model, pkuseg_user_dict):
    """Import pkuseg and build a segmenter when a model is given.

    use_pkuseg (bool): Whether pkuseg segmentation was requested. Controls
        whether import/model-loading problems are raised or ignored.
    pkuseg_model: Model name or path handed to ``pkuseg.pkuseg``; when falsy
        no segmenter is created.
    pkuseg_user_dict: User dictionary argument handed to ``pkuseg.pkuseg``.
    RETURNS: The object returned by ``pkuseg.pkuseg(...)``, or None when no
        segmenter was created.
    RAISES: ValueError if pkuseg is requested without a model; ImportError or
        FileNotFoundError if pkuseg is requested but the package or the model
        files cannot be loaded.
    """
    try:
        import pkuseg as pkuseg_module

        if pkuseg_model:
            return pkuseg_module.pkuseg(pkuseg_model, pkuseg_user_dict)
        if use_pkuseg:
            raise ValueError(
                "Chinese.use_pkuseg is True but no pkuseg model was specified. "
                "Please provide the name of a pretrained model "
                "or the path to a model with "
                '`Chinese(meta={"tokenizer": {"config": {"pkuseg_model": name_or_path}}}).'
            )
    except ImportError:
        if not use_pkuseg:
            # pkuseg is optional unless explicitly requested
            return None
        raise ImportError(
            "pkuseg not installed. Either set Chinese.use_pkuseg = False, "
            "or " + _PKUSEG_INSTALL_MSG
        )
    except FileNotFoundError:
        if not use_pkuseg:
            return None
        raise FileNotFoundError("Unable to load pkuseg model from: " + pkuseg_model)
    return None
|
|
|
|
|
|
|
|
|
2019-11-11 16:23:21 +03:00
|
|
|
class ChineseTokenizer(DummyTokenizer):
    """Tokenizer for Chinese text.

    Depending on its settings, a text is segmented with jieba
    (``use_jieba``), with pkuseg (``use_pkuseg``, or forced through
    ``require_pkuseg``), or split into individual characters.
    """

    def __init__(self, cls, nlp=None, config=None):
        """Set up the segmenters.

        cls: Defaults class providing ``use_jieba``, ``use_pkuseg`` and
            ``create_vocab``.
        nlp: Optional language object; its ``vocab`` is reused when given.
        config (dict): Optional tokenizer settings. Recognised keys are
            "use_jieba", "use_pkuseg", "require_pkuseg", "pkuseg_model" and
            "pkuseg_user_dict". NOTE: the first four keys are removed from
            the dict that is passed in.
        """
        # A fresh dict per call instead of a shared mutable default.
        if config is None:
            config = {}
        self.use_jieba = config.get("use_jieba", cls.use_jieba)
        self.use_pkuseg = config.get("use_pkuseg", cls.use_pkuseg)
        self.require_pkuseg = config.get("require_pkuseg", False)
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        self.jieba_seg = try_jieba_import(self.use_jieba)
        self.pkuseg_seg = try_pkuseg_import(
            self.use_pkuseg,
            pkuseg_model=config.get("pkuseg_model", None),
            pkuseg_user_dict=config.get("pkuseg_user_dict", "default"),
        )
        # remove relevant settings from config so they're not also saved in
        # Language.meta
        for key in ["use_jieba", "use_pkuseg", "require_pkuseg", "pkuseg_model"]:
            config.pop(key, None)
        self.tokenizer = Language.Defaults().create_tokenizer(nlp)

    def __call__(self, text):
        """Segment a text and return it as a Doc.

        text: The string to tokenize.
        RETURNS (Doc): Doc built from the segmented words and the spaces
            computed by ``util.get_words_and_spaces``.
        """
        use_jieba = self.use_jieba
        use_pkuseg = self.use_pkuseg
        if self.require_pkuseg:
            # require_pkuseg overrides both per-instance switches
            use_jieba = False
            use_pkuseg = True
        if use_jieba:
            # drop empty segments
            words = [x for x in self.jieba_seg.cut(text, cut_all=False) if x]
        elif use_pkuseg:
            words = self.pkuseg_seg.cut(text)
        else:
            # split into individual characters
            words = list(text)
        (words, spaces) = util.get_words_and_spaces(words, text)
        return Doc(self.vocab, words=words, spaces=spaces)

    def pkuseg_update_user_dict(self, words, reset=False):
        """Add words to the pkuseg user dictionary.

        Does nothing when no pkuseg segmenter is loaded.

        words: Iterable of strings; each is stripped before insertion.
        reset (bool): If True, replace the current preprocesser with an empty
            one before inserting.
        RAISES: ImportError if ``reset`` is requested, pkuseg cannot be
            imported and ``use_pkuseg`` is set.
        """
        if not self.pkuseg_seg:
            return
        if reset:
            try:
                import pkuseg

                self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
            except ImportError:
                if self.use_pkuseg:
                    msg = (
                        "pkuseg not installed: unable to reset pkuseg "
                        "user dict. Please " + _PKUSEG_INSTALL_MSG
                    )
                    raise ImportError(msg)
        for word in words:
            self.pkuseg_seg.preprocesser.insert(word.strip(), "")

    def _get_config(self):
        """Return the serializable tokenizer settings as an OrderedDict."""
        config = OrderedDict(
            (
                ("use_jieba", self.use_jieba),
                ("use_pkuseg", self.use_pkuseg),
                ("require_pkuseg", self.require_pkuseg),
            )
        )
        return config

    def _set_config(self, config=None):
        """Restore tokenizer settings; keys that are missing become False."""
        if config is None:
            config = {}
        self.use_jieba = config.get("use_jieba", False)
        self.use_pkuseg = config.get("use_pkuseg", False)
        self.require_pkuseg = config.get("require_pkuseg", False)

    def _get_pkuseg_processors_data(self):
        """Collect the pre-/postprocesser state of the loaded pkuseg segmenter.

        RETURNS (tuple): (user dict entries, do_process, sorted common words,
            sorted other words).
        """
        seg = self.pkuseg_seg
        return (
            _get_pkuseg_trie_data(seg.preprocesser.trie),
            seg.postprocesser.do_process,
            sorted(seg.postprocesser.common_words),
            sorted(seg.postprocesser.other_words),
        )

    def _set_pkuseg_processors_data(self, pkuseg, data):
        """Apply state produced by _get_pkuseg_processors_data to the segmenter.

        pkuseg: The imported pkuseg module (provides ``Preprocesser``).
        data: 4-item sequence as returned by _get_pkuseg_processors_data.
        """
        (user_dict, do_process, common_words, other_words) = data
        self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
        self.pkuseg_seg.postprocesser.do_process = do_process
        self.pkuseg_seg.postprocesser.common_words = set(common_words)
        self.pkuseg_seg.postprocesser.other_words = set(other_words)

    def to_bytes(self, **kwargs):
        """Serialize the settings and, if loaded, the pkuseg model to bytes.

        The pkuseg feature extractor and model are saved into a temporary
        directory and the resulting "features.pkl" and "weights.npz" files
        are read back as bytes.

        RETURNS: The result of ``util.to_bytes`` for the collected parts.
        """
        pkuseg_features_b = b""
        pkuseg_weights_b = b""
        pkuseg_processors_data = None
        if self.pkuseg_seg:
            with tempfile.TemporaryDirectory() as tempdir:
                self.pkuseg_seg.feature_extractor.save(tempdir)
                self.pkuseg_seg.model.save(tempdir)
                tempdir = Path(tempdir)
                with open(tempdir / "features.pkl", "rb") as fileh:
                    pkuseg_features_b = fileh.read()
                with open(tempdir / "weights.npz", "rb") as fileh:
                    pkuseg_weights_b = fileh.read()
            pkuseg_processors_data = self._get_pkuseg_processors_data()
        serializers = OrderedDict(
            (
                ("cfg", lambda: srsly.json_dumps(self._get_config())),
                ("pkuseg_features", lambda: pkuseg_features_b),
                ("pkuseg_weights", lambda: pkuseg_weights_b),
                (
                    "pkuseg_processors",
                    lambda: srsly.msgpack_dumps(pkuseg_processors_data),
                ),
            )
        )
        return util.to_bytes(serializers, [])

    def from_bytes(self, data, **kwargs):
        """Restore the tokenizer from bytes produced by to_bytes.

        data: The serialized bytes.
        RETURNS (ChineseTokenizer): self.
        RAISES: ImportError if the data contains a pkuseg model but pkuseg is
            not installed.
        """
        pkuseg_data = {"features_b": b"", "weights_b": b"", "processors_data": None}

        def deserialize_pkuseg_features(b):
            pkuseg_data["features_b"] = b

        def deserialize_pkuseg_weights(b):
            pkuseg_data["weights_b"] = b

        def deserialize_pkuseg_processors(b):
            pkuseg_data["processors_data"] = srsly.msgpack_loads(b)

        deserializers = OrderedDict(
            (
                ("cfg", lambda b: self._set_config(srsly.json_loads(b))),
                ("pkuseg_features", deserialize_pkuseg_features),
                ("pkuseg_weights", deserialize_pkuseg_weights),
                ("pkuseg_processors", deserialize_pkuseg_processors),
            )
        )
        util.from_bytes(data, deserializers, [])

        if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
            # pkuseg loads a model from a directory, so write the two files
            # back to a temporary directory first.
            with tempfile.TemporaryDirectory() as tempdir:
                tempdir = Path(tempdir)
                with open(tempdir / "features.pkl", "wb") as fileh:
                    fileh.write(pkuseg_data["features_b"])
                with open(tempdir / "weights.npz", "wb") as fileh:
                    fileh.write(pkuseg_data["weights_b"])
                try:
                    import pkuseg
                except ImportError:
                    raise ImportError(
                        "pkuseg not installed. To use this model, "
                        + _PKUSEG_INSTALL_MSG
                    )
                self.pkuseg_seg = pkuseg.pkuseg(str(tempdir))
            # Processor settings are attached to the segmenter, so they are
            # only applied when a model was restored above.
            if pkuseg_data["processors_data"]:
                self._set_pkuseg_processors_data(
                    pkuseg, pkuseg_data["processors_data"]
                )
        return self

    def to_disk(self, path, **kwargs):
        """Save the settings and, if loaded, the pkuseg model to a directory.

        path: Target location; converted with ``util.ensure_path``.
        RETURNS: The result of ``util.to_disk``.
        """
        path = util.ensure_path(path)

        def save_pkuseg_model(path):
            if self.pkuseg_seg:
                if not path.exists():
                    path.mkdir(parents=True)
                self.pkuseg_seg.model.save(path)
                self.pkuseg_seg.feature_extractor.save(path)

        def save_pkuseg_processors(path):
            if self.pkuseg_seg:
                srsly.write_msgpack(path, self._get_pkuseg_processors_data())

        serializers = OrderedDict(
            (
                ("cfg", lambda p: srsly.write_json(p, self._get_config())),
                ("pkuseg_model", save_pkuseg_model),
                ("pkuseg_processors", save_pkuseg_processors),
            )
        )
        return util.to_disk(path, serializers, [])

    def from_disk(self, path, **kwargs):
        """Load the settings and, if present, the pkuseg model from a directory.

        path: Source location; converted with ``util.ensure_path``.
        RETURNS (ChineseTokenizer): self.
        RAISES: ImportError if ``use_pkuseg`` is set but pkuseg is not
            installed.
        """
        path = util.ensure_path(path)

        def import_pkuseg():
            # Returns the pkuseg module, or None when it is missing and the
            # loaded settings do not ask for it.
            try:
                import pkuseg
            except ImportError:
                if self.use_pkuseg:
                    raise ImportError(
                        "pkuseg not installed. To use this model, "
                        + _PKUSEG_INSTALL_MSG
                    )
                return None
            return pkuseg

        def load_pkuseg_model(path):
            pkuseg = import_pkuseg()
            if pkuseg is None:
                # Nothing can be loaded without the package; previously this
                # fell through to an unbound `pkuseg` name.
                return
            if path.exists():
                self.pkuseg_seg = pkuseg.pkuseg(path)

        def load_pkuseg_processors(path):
            pkuseg = import_pkuseg()
            if pkuseg is None:
                return
            if self.pkuseg_seg:
                self._set_pkuseg_processors_data(pkuseg, srsly.read_msgpack(path))

        # "cfg" is listed first so that use_pkuseg is restored before the
        # pkuseg loaders consult it.
        deserializers = OrderedDict(
            (
                ("cfg", lambda p: self._set_config(srsly.read_json(p))),
                ("pkuseg_model", load_pkuseg_model),
                ("pkuseg_processors", load_pkuseg_processors),
            )
        )
        util.from_disk(path, deserializers, [])
        return self
|
2019-11-11 16:23:21 +03:00
|
|
|
|
|
|
|
|
2017-12-28 12:13:58 +03:00
|
|
|
class ChineseDefaults(Language.Defaults):
    """Language defaults for Chinese.

    ``use_jieba`` and ``use_pkuseg`` are the class-level defaults that
    ChineseTokenizer falls back to when its config does not set them.
    """

    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "zh"
    tokenizer_exceptions = BASE_EXCEPTIONS
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
    use_jieba = True
    use_pkuseg = False

    @classmethod
    def create_tokenizer(cls, nlp=None, config=None):
        """Create a ChineseTokenizer.

        nlp: Optional language object passed through to the tokenizer.
        config (dict): Optional tokenizer settings, passed through to
            ChineseTokenizer.
        RETURNS (ChineseTokenizer): The new tokenizer.
        """
        # Use a fresh dict per call: the tokenizer removes keys from the
        # config it receives, so a shared default dict must not be handed out.
        if config is None:
            config = {}
        return ChineseTokenizer(cls, nlp, config=config)
|
2017-12-28 12:13:58 +03:00
|
|
|
|
2019-03-11 19:10:50 +03:00
|
|
|
|
2016-04-24 19:44:24 +03:00
|
|
|
class Chinese(Language):
    """Language subclass for Chinese ("zh")."""

    lang = "zh"
    Defaults = ChineseDefaults  # override defaults

    def make_doc(self, text):
        """Turn a text into a Doc by delegating to this object's tokenizer."""
        doc = self.tokenizer(text)
        return doc
|
2017-05-03 12:01:42 +03:00
|
|
|
|
|
|
|
|
2020-04-18 18:01:53 +03:00
|
|
|
def _get_pkuseg_trie_data(node, path=""):
|
|
|
|
data = []
|
|
|
|
for c, child_node in sorted(node.children.items()):
|
|
|
|
data.extend(_get_pkuseg_trie_data(child_node, path + c))
|
|
|
|
if node.isword:
|
|
|
|
data.append((path, node.usertag))
|
|
|
|
return data
|
|
|
|
|
|
|
|
|
2019-08-18 16:09:16 +03:00
|
|
|
# Public API of this module: only the Chinese language class is exported.
__all__ = ["Chinese"]
|