From 39ebcd9ec9768b03f49893de45793015829f80cf Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sun, 19 Jul 2020 13:34:37 +0200 Subject: [PATCH] Refactor Chinese tokenizer configuration (#5736) * Refactor Chinese tokenizer configuration Refactor `ChineseTokenizer` configuration so that it uses a single `segmenter` setting to choose between character segmentation, jieba, and pkuseg. * replace `use_jieba`, `use_pkuseg`, `require_pkuseg` with the setting `segmenter` with the supported values: `char`, `jieba`, `pkuseg` * make the default segmenter plain character segmentation `char` (no additional libraries required) * Fix Chinese serialization test to use char default * Warn if attempting to customize other segmenter Add a warning if `Chinese.pkuseg_update_user_dict` is called when another segmenter is selected. --- spacy/errors.py | 8 ++ spacy/lang/zh/__init__.py | 128 ++++++++++++++------------ spacy/tests/conftest.py | 10 +- spacy/tests/lang/zh/test_serialize.py | 11 +-- spacy/tests/lang/zh/test_tokenizer.py | 22 ++++- website/docs/usage/models.md | 53 ++++++----- 6 files changed, 134 insertions(+), 98 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 45de5ed45..4f234a494 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -115,6 +115,10 @@ class Warnings: "string \"Field1=Value1,Value2|Field2=Value3\".") W101 = ("Skipping `Doc` custom extension '{name}' while merging docs.") W102 = ("Skipping unsupported user data '{key}: {value}' while merging docs.") + W103 = ("Unknown {lang} word segmenter '{segmenter}'. Supported " + "word segmenters: {supported}. Defaulting to {default}.") + W104 = ("Skipping modifications for '{target}' segmenter. The current " + "segmenter is '{current}'.") @add_codes @@ -535,6 +539,10 @@ class Errors: "'{token_attrs}'.") E999 = ("Unable to merge the `Doc` objects because they do not all share " "the same `Vocab`.") + E1000 = ("No pkuseg model available. Provide a pkuseg model when " + "initializing the pipeline: " + '`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; ' + 'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`') @add_codes diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index fc7573f8d..99a84edfc 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -1,8 +1,10 @@ import tempfile import srsly +import warnings from pathlib import Path from collections import OrderedDict from ...attrs import LANG +from ...errors import Warnings, Errors from ...language import Language from ...tokens import Doc from ...util import DummyTokenizer @@ -16,100 +18,117 @@ from ... import util _PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.22` or from https://github.com/lancopku/pkuseg-python" -def try_jieba_import(use_jieba): +def try_jieba_import(segmenter): try: import jieba - # segment a short text to have jieba initialize its cache in advance - list(jieba.cut("作为", cut_all=False)) + if segmenter == "jieba": + # segment a short text to have jieba initialize its cache in advance + list(jieba.cut("作为", cut_all=False)) return jieba except ImportError: - if use_jieba: + if segmenter == "jieba": msg = ( - "Jieba not installed. Either set the default to False with " - "`from spacy.lang.zh import ChineseDefaults; ChineseDefaults.use_jieba = False`, " - "or install it with `pip install jieba` or from " - "https://github.com/fxsjy/jieba" + "Jieba not installed. To use jieba, install it with `pip " + " install jieba` or from https://github.com/fxsjy/jieba" ) raise ImportError(msg) -def try_pkuseg_import(use_pkuseg, pkuseg_model, pkuseg_user_dict): +def try_pkuseg_import(segmenter, pkuseg_model, pkuseg_user_dict): try: import pkuseg if pkuseg_model: return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict) - elif use_pkuseg: + elif segmenter == "pkuseg": msg = ( - "Chinese.use_pkuseg is True but no pkuseg model was specified. " - "Please provide the name of a pretrained model " + "The Chinese word segmenter is 'pkuseg' but no pkuseg model " + "was specified. Please provide the name of a pretrained model " "or the path to a model with " - '`Chinese(meta={"tokenizer": {"config": {"pkuseg_model": name_or_path}}}).' + '`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; ' + 'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`' ) raise ValueError(msg) except ImportError: - if use_pkuseg: - msg = ( - "pkuseg not installed. Either set Chinese.use_pkuseg = False, " - "or " + _PKUSEG_INSTALL_MSG - ) + if segmenter == "pkuseg": + msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG raise ImportError(msg) except FileNotFoundError: - if use_pkuseg: + if segmenter == "pkuseg": msg = "Unable to load pkuseg model from: " + pkuseg_model raise FileNotFoundError(msg) class ChineseTokenizer(DummyTokenizer): def __init__(self, cls, nlp=None, config={}): - self.use_jieba = config.get("use_jieba", cls.use_jieba) - self.use_pkuseg = config.get("use_pkuseg", cls.use_pkuseg) - self.require_pkuseg = config.get("require_pkuseg", False) + self.supported_segmenters = ("char", "jieba", "pkuseg") + self.configure_segmenter(config) self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) - self.jieba_seg = try_jieba_import(self.use_jieba) - self.pkuseg_seg = try_pkuseg_import( - self.use_pkuseg, - pkuseg_model=config.get("pkuseg_model", None), - pkuseg_user_dict=config.get("pkuseg_user_dict", "default"), - ) # remove relevant settings from config so they're not also saved in # Language.meta - for key in ["use_jieba", "use_pkuseg", "require_pkuseg", "pkuseg_model"]: + for key in ["segmenter", "pkuseg_model", "pkuseg_user_dict"]: if key in config: del config[key] self.tokenizer = Language.Defaults().create_tokenizer(nlp) + def configure_segmenter(self, config): + self.segmenter = "char" + if "segmenter" in config: + if config["segmenter"] in self.supported_segmenters: + self.segmenter = config["segmenter"] + else: + warn_msg = Warnings.W103.format( + lang="Chinese", + segmenter=config["segmenter"], + supported=", ".join([repr(s) for s in self.supported_segmenters]), + default="'char' (character segmentation)", + ) + warnings.warn(warn_msg) + self.jieba_seg = try_jieba_import(self.segmenter) + self.pkuseg_seg = try_pkuseg_import( + self.segmenter, + pkuseg_model=config.get("pkuseg_model", None), + pkuseg_user_dict=config.get("pkuseg_user_dict", "default"), + ) + def __call__(self, text): - use_jieba = self.use_jieba - use_pkuseg = self.use_pkuseg - if self.require_pkuseg: - use_jieba = False - use_pkuseg = True - if use_jieba: + if self.segmenter == "jieba": words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x]) (words, spaces) = util.get_words_and_spaces(words, text) return Doc(self.vocab, words=words, spaces=spaces) - elif use_pkuseg: + elif self.segmenter == "pkuseg": + if self.pkuseg_seg is None: + raise ValueError(Errors.E1000) words = self.pkuseg_seg.cut(text) (words, spaces) = util.get_words_and_spaces(words, text) return Doc(self.vocab, words=words, spaces=spaces) - else: - # split into individual characters - words = list(text) - (words, spaces) = util.get_words_and_spaces(words, text) - return Doc(self.vocab, words=words, spaces=spaces) + + # warn if segmenter setting is not the only remaining option "char" + if self.segmenter != "char": + warn_msg = Warnings.W103.format( + lang="Chinese", + segmenter=self.segmenter, + supported=", ".join([repr(s) for s in self.supported_segmenters]), + default="'char' (character segmentation)", + ) + warnings.warn(warn_msg) + + # split into individual characters + words = list(text) + (words, spaces) = util.get_words_and_spaces(words, text) + return Doc(self.vocab, words=words, spaces=spaces) def pkuseg_update_user_dict(self, words, reset=False): - if self.pkuseg_seg: + if self.segmenter == "pkuseg": if reset: try: import pkuseg self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None) except ImportError: - if self.use_pkuseg: + if self.segmenter == "pkuseg": msg = ( "pkuseg not installed: unable to reset pkuseg " "user dict. Please " + _PKUSEG_INSTALL_MSG @@ -117,21 +136,16 @@ class ChineseTokenizer(DummyTokenizer): raise ImportError(msg) for word in words: self.pkuseg_seg.preprocesser.insert(word.strip(), "") + else: + warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter) + warnings.warn(warn_msg) def _get_config(self): - config = OrderedDict( - ( - ("use_jieba", self.use_jieba), - ("use_pkuseg", self.use_pkuseg), - ("require_pkuseg", self.require_pkuseg), - ) - ) + config = OrderedDict((("segmenter", self.segmenter),)) return config def _set_config(self, config={}): - self.use_jieba = config.get("use_jieba", False) - self.use_pkuseg = config.get("use_pkuseg", False) - self.require_pkuseg = config.get("require_pkuseg", False) + self.configure_segmenter(config) def to_bytes(self, **kwargs): pkuseg_features_b = b"" @@ -248,7 +262,7 @@ class ChineseTokenizer(DummyTokenizer): try: import pkuseg except ImportError: - if self.use_pkuseg: + if self.segmenter == "pkuseg": raise ImportError( "pkuseg not installed. To use this model, " + _PKUSEG_INSTALL_MSG @@ -260,9 +274,9 @@ class ChineseTokenizer(DummyTokenizer): try: import pkuseg except ImportError: - if self.use_pkuseg: + if self.segmenter == "pkuseg": raise ImportError(self._pkuseg_install_msg) - if self.pkuseg_seg: + if self.segmenter == "pkuseg": data = srsly.read_msgpack(path) (user_dict, do_process, common_words, other_words) = data self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict) @@ -288,8 +302,6 @@ class ChineseDefaults(Language.Defaults): stop_words = STOP_WORDS tag_map = TAG_MAP writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} - use_jieba = True - use_pkuseg = False @classmethod def create_tokenizer(cls, nlp=None, config={}): diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 22ff2ce26..bc8d088c5 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -244,22 +244,22 @@ def yo_tokenizer(): @pytest.fixture(scope="session") def zh_tokenizer_char(): - return get_lang_class("zh").Defaults.create_tokenizer( - config={"use_jieba": False, "use_pkuseg": False} - ) + return get_lang_class("zh").Defaults.create_tokenizer() @pytest.fixture(scope="session") def zh_tokenizer_jieba(): pytest.importorskip("jieba") - return get_lang_class("zh").Defaults.create_tokenizer() + return get_lang_class("zh").Defaults.create_tokenizer( + config={"segmenter": "jieba"} + ) @pytest.fixture(scope="session") def zh_tokenizer_pkuseg(): pytest.importorskip("pkuseg") return get_lang_class("zh").Defaults.create_tokenizer( - config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True} + config={"pkuseg_model": "default", "segmenter": "pkuseg"} ) diff --git a/spacy/tests/lang/zh/test_serialize.py b/spacy/tests/lang/zh/test_serialize.py index d84920c3e..544c4a7bc 100644 --- a/spacy/tests/lang/zh/test_serialize.py +++ b/spacy/tests/lang/zh/test_serialize.py @@ -5,14 +5,14 @@ from ...util import make_tempdir def zh_tokenizer_serialize(zh_tokenizer): tokenizer_bytes = zh_tokenizer.to_bytes() - nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}}) + nlp = Chinese() nlp.tokenizer.from_bytes(tokenizer_bytes) assert tokenizer_bytes == nlp.tokenizer.to_bytes() with make_tempdir() as d: file_path = d / "tokenizer" zh_tokenizer.to_disk(file_path) - nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}}) + nlp = Chinese() nlp.tokenizer.from_disk(file_path) assert tokenizer_bytes == nlp.tokenizer.to_bytes() @@ -25,18 +25,13 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba): zh_tokenizer_serialize(zh_tokenizer_jieba) -def test_zh_tokenizer_serialize_pkuseg(zh_tokenizer_pkuseg): - zh_tokenizer_serialize(zh_tokenizer_pkuseg) - - @pytest.mark.slow def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg): nlp = Chinese( meta={ "tokenizer": { "config": { - "use_jieba": False, - "use_pkuseg": True, + "segmenter": "pkuseg", "pkuseg_model": "medicine", } } diff --git a/spacy/tests/lang/zh/test_tokenizer.py b/spacy/tests/lang/zh/test_tokenizer.py index 7af8a7604..1ebb1e7b7 100644 --- a/spacy/tests/lang/zh/test_tokenizer.py +++ b/spacy/tests/lang/zh/test_tokenizer.py @@ -1,5 +1,5 @@ import pytest -from spacy.lang.zh import _get_pkuseg_trie_data +from spacy.lang.zh import Chinese, _get_pkuseg_trie_data # fmt: off @@ -37,7 +37,7 @@ def test_zh_tokenizer_pkuseg(zh_tokenizer_pkuseg, text, expected_tokens): assert tokens == expected_tokens -def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg): +def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg, zh_tokenizer_char): user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie) zh_tokenizer_pkuseg.pkuseg_update_user_dict(["nonsense_asdf"]) updated_user_dict = _get_pkuseg_trie_data( @@ -52,8 +52,24 @@ def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg): ) assert len(reset_user_dict) == 0 + # warn if not relevant + with pytest.warns(UserWarning): + zh_tokenizer_char.pkuseg_update_user_dict(["nonsense_asdf"]) -def test_extra_spaces(zh_tokenizer_char): + +def test_zh_extra_spaces(zh_tokenizer_char): # note: three spaces after "I" tokens = zh_tokenizer_char("I like cheese.") assert tokens[1].orth_ == " " + + +def test_zh_unsupported_segmenter(): + with pytest.warns(UserWarning): + nlp = Chinese(meta={"tokenizer": {"config": {"segmenter": "unk"}}}) + + +def test_zh_uninitialized_pkuseg(): + nlp = Chinese(meta={"tokenizer": {"config": {"segmenter": "char"}}}) + nlp.tokenizer.segmenter = "pkuseg" + with pytest.raises(ValueError): + doc = nlp("test") diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index 4c8bc1664..ce313c2ad 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -92,30 +92,35 @@ The Chinese language class supports three word segmentation options: > ```python > from spacy.lang.zh import Chinese > -> # Disable jieba to use character segmentation -> Chinese.Defaults.use_jieba = False +> # Character segmentation (default) > nlp = Chinese() > -> # Disable jieba through tokenizer config options -> cfg = {"use_jieba": False} +> # Jieba +> cfg = {"segmenter": "jieba"} > nlp = Chinese(meta={"tokenizer": {"config": cfg}}) > -> # Load with "default" model provided by pkuseg -> cfg = {"pkuseg_model": "default", "require_pkuseg": True} +> # PKUSeg with "default" model provided by pkuseg +> cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"} > nlp = Chinese(meta={"tokenizer": {"config": cfg}}) > ``` -1. **Jieba:** `Chinese` uses [Jieba](https://github.com/fxsjy/jieba) for word - segmentation by default. It's enabled when you create a new `Chinese` +1. **Character segmentation:** Character segmentation is the default + segmentation option. It's enabled when you create a new `Chinese` language class or call `spacy.blank("zh")`. -2. **Character segmentation:** Character segmentation is supported by disabling - `jieba` and setting `Chinese.Defaults.use_jieba = False` _before_ - initializing the language class. As of spaCy v2.3.0, the `meta` tokenizer - config options can be used to configure `use_jieba`. -3. **PKUSeg**: In spaCy v2.3.0, support for +2. **Jieba:** `Chinese` uses [Jieba](https://github.com/fxsjy/jieba) for word + segmentation with the tokenizer option `{"segmenter": "jieba"}`. +3. **PKUSeg**: As of spaCy v2.3.0, support for [PKUSeg](https://github.com/lancopku/PKUSeg-python) has been added to support - better segmentation for Chinese OntoNotes and the new - [Chinese models](/models/zh). + better segmentation for Chinese OntoNotes and the provided + [Chinese models](/models/zh). Enable PKUSeg with the tokenizer option + `{"segmenter": "pkuseg"}`. + + + +In spaCy v3, the default Chinese word segmenter has switched from Jieba to +character segmentation. + + @@ -129,29 +134,29 @@ $ pip install https://github.com/honnibal/pkuseg-python/archive/master.zip - + The `meta` argument of the `Chinese` language class supports the following following tokenizer config settings: -| Name | Type | Description | -| ------------------ | ---- | ---------------------------------------------------------------------------------------------------- | -| `pkuseg_model` | str | **Required:** Name of a model provided by `pkuseg` or the path to a local model directory. | -| `pkuseg_user_dict` | str | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. | -| `require_pkuseg` | bool | Overrides all `jieba` settings (optional but strongly recommended). | +| Name | Type | Description | +| ------------------ | ---- | ------------------------------------------------------------------------------------------------------- | +| `segmenter` | str | Word segmenter: `char`, `jieba` or `pkuseg`. Defaults to `char`. | +| `pkuseg_model` | str | **Required for `pkuseg`:** Name of a model provided by `pkuseg` or the path to a local model directory. | +| `pkuseg_user_dict` | str | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. | ```python ### Examples # Load "default" model -cfg = {"pkuseg_model": "default", "require_pkuseg": True} +cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"} nlp = Chinese(meta={"tokenizer": {"config": cfg}}) # Load local model -cfg = {"pkuseg_model": "/path/to/pkuseg_model", "require_pkuseg": True} +cfg = {"segmenter": "pkuseg", "pkuseg_model": "/path/to/pkuseg_model"} nlp = Chinese(meta={"tokenizer": {"config": cfg}}) # Override the user directory -cfg = {"pkuseg_model": "default", "require_pkuseg": True, "pkuseg_user_dict": "/path"} +cfg = {"segmenter": "pkuseg", "pkuseg_model": "default", "pkuseg_user_dict": "/path"} nlp = Chinese(meta={"tokenizer": {"config": cfg}}) ```