diff --git a/spacy/errors.py b/spacy/errors.py
index 45de5ed45..4f234a494 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -115,6 +115,10 @@ class Warnings:
"string \"Field1=Value1,Value2|Field2=Value3\".")
W101 = ("Skipping `Doc` custom extension '{name}' while merging docs.")
W102 = ("Skipping unsupported user data '{key}: {value}' while merging docs.")
+ W103 = ("Unknown {lang} word segmenter '{segmenter}'. Supported "
+ "word segmenters: {supported}. Defaulting to {default}.")
+ W104 = ("Skipping modifications for '{target}' segmenter. The current "
+ "segmenter is '{current}'.")
@add_codes
@@ -535,6 +539,10 @@ class Errors:
"'{token_attrs}'.")
E999 = ("Unable to merge the `Doc` objects because they do not all share "
"the same `Vocab`.")
+ E1000 = ("No pkuseg model available. Provide a pkuseg model when "
+ "initializing the pipeline: "
+ '`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
+ 'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`')
@add_codes
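
For context, the initialization pattern that the new E1000 message points users to looks roughly like the sketch below. The `"default"` model name is only an illustration; any pretrained pkuseg model name or local model path works, and pkuseg itself must be installed.

```python
from spacy.lang.zh import Chinese

# Configure the pkuseg segmenter at init time
# ("default" is used for illustration; any model name or path works)
cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"}
nlp = Chinese(meta={"tokenizer": {"config": cfg}})
doc = nlp("这是一个句子")
print([token.text for token in doc])
```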
diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index fc7573f8d..99a84edfc 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -1,8 +1,10 @@
import tempfile
import srsly
+import warnings
from pathlib import Path
from collections import OrderedDict
from ...attrs import LANG
+from ...errors import Warnings, Errors
from ...language import Language
from ...tokens import Doc
from ...util import DummyTokenizer
@@ -16,100 +18,117 @@ from ... import util
_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.22` or from https://github.com/lancopku/pkuseg-python"
-def try_jieba_import(use_jieba):
+def try_jieba_import(segmenter):
try:
import jieba
- # segment a short text to have jieba initialize its cache in advance
- list(jieba.cut("作为", cut_all=False))
+ if segmenter == "jieba":
+ # segment a short text to have jieba initialize its cache in advance
+ list(jieba.cut("作为", cut_all=False))
return jieba
except ImportError:
- if use_jieba:
+ if segmenter == "jieba":
msg = (
- "Jieba not installed. Either set the default to False with "
- "`from spacy.lang.zh import ChineseDefaults; ChineseDefaults.use_jieba = False`, "
- "or install it with `pip install jieba` or from "
- "https://github.com/fxsjy/jieba"
+ "Jieba not installed. To use jieba, install it with `pip "
+ " install jieba` or from https://github.com/fxsjy/jieba"
)
raise ImportError(msg)
-def try_pkuseg_import(use_pkuseg, pkuseg_model, pkuseg_user_dict):
+def try_pkuseg_import(segmenter, pkuseg_model, pkuseg_user_dict):
try:
import pkuseg
if pkuseg_model:
return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
- elif use_pkuseg:
+ elif segmenter == "pkuseg":
msg = (
- "Chinese.use_pkuseg is True but no pkuseg model was specified. "
- "Please provide the name of a pretrained model "
+ "The Chinese word segmenter is 'pkuseg' but no pkuseg model "
+ "was specified. Please provide the name of a pretrained model "
"or the path to a model with "
- '`Chinese(meta={"tokenizer": {"config": {"pkuseg_model": name_or_path}}}).'
+ '`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
+ 'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`'
)
raise ValueError(msg)
except ImportError:
- if use_pkuseg:
- msg = (
- "pkuseg not installed. Either set Chinese.use_pkuseg = False, "
- "or " + _PKUSEG_INSTALL_MSG
- )
+ if segmenter == "pkuseg":
+ msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
raise ImportError(msg)
except FileNotFoundError:
- if use_pkuseg:
+ if segmenter == "pkuseg":
msg = "Unable to load pkuseg model from: " + pkuseg_model
raise FileNotFoundError(msg)
class ChineseTokenizer(DummyTokenizer):
def __init__(self, cls, nlp=None, config={}):
- self.use_jieba = config.get("use_jieba", cls.use_jieba)
- self.use_pkuseg = config.get("use_pkuseg", cls.use_pkuseg)
- self.require_pkuseg = config.get("require_pkuseg", False)
+ self.supported_segmenters = ("char", "jieba", "pkuseg")
+ self.configure_segmenter(config)
self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
- self.jieba_seg = try_jieba_import(self.use_jieba)
- self.pkuseg_seg = try_pkuseg_import(
- self.use_pkuseg,
- pkuseg_model=config.get("pkuseg_model", None),
- pkuseg_user_dict=config.get("pkuseg_user_dict", "default"),
- )
# remove relevant settings from config so they're not also saved in
# Language.meta
- for key in ["use_jieba", "use_pkuseg", "require_pkuseg", "pkuseg_model"]:
+ for key in ["segmenter", "pkuseg_model", "pkuseg_user_dict"]:
if key in config:
del config[key]
self.tokenizer = Language.Defaults().create_tokenizer(nlp)
+ def configure_segmenter(self, config):
+ self.segmenter = "char"
+ if "segmenter" in config:
+ if config["segmenter"] in self.supported_segmenters:
+ self.segmenter = config["segmenter"]
+ else:
+ warn_msg = Warnings.W103.format(
+ lang="Chinese",
+ segmenter=config["segmenter"],
+ supported=", ".join([repr(s) for s in self.supported_segmenters]),
+ default="'char' (character segmentation)",
+ )
+ warnings.warn(warn_msg)
+ self.jieba_seg = try_jieba_import(self.segmenter)
+ self.pkuseg_seg = try_pkuseg_import(
+ self.segmenter,
+ pkuseg_model=config.get("pkuseg_model", None),
+ pkuseg_user_dict=config.get("pkuseg_user_dict", "default"),
+ )
+
def __call__(self, text):
- use_jieba = self.use_jieba
- use_pkuseg = self.use_pkuseg
- if self.require_pkuseg:
- use_jieba = False
- use_pkuseg = True
- if use_jieba:
+ if self.segmenter == "jieba":
words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x])
(words, spaces) = util.get_words_and_spaces(words, text)
return Doc(self.vocab, words=words, spaces=spaces)
- elif use_pkuseg:
+ elif self.segmenter == "pkuseg":
+ if self.pkuseg_seg is None:
+ raise ValueError(Errors.E1000)
words = self.pkuseg_seg.cut(text)
(words, spaces) = util.get_words_and_spaces(words, text)
return Doc(self.vocab, words=words, spaces=spaces)
- else:
- # split into individual characters
- words = list(text)
- (words, spaces) = util.get_words_and_spaces(words, text)
- return Doc(self.vocab, words=words, spaces=spaces)
+
+ # warn if the segmenter is set to anything other than the only remaining supported option, "char"
+ if self.segmenter != "char":
+ warn_msg = Warnings.W103.format(
+ lang="Chinese",
+ segmenter=self.segmenter,
+ supported=", ".join([repr(s) for s in self.supported_segmenters]),
+ default="'char' (character segmentation)",
+ )
+ warnings.warn(warn_msg)
+
+ # split into individual characters
+ words = list(text)
+ (words, spaces) = util.get_words_and_spaces(words, text)
+ return Doc(self.vocab, words=words, spaces=spaces)
def pkuseg_update_user_dict(self, words, reset=False):
- if self.pkuseg_seg:
+ if self.segmenter == "pkuseg":
if reset:
try:
import pkuseg
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
except ImportError:
- if self.use_pkuseg:
+ if self.segmenter == "pkuseg":
msg = (
"pkuseg not installed: unable to reset pkuseg "
"user dict. Please " + _PKUSEG_INSTALL_MSG
@@ -117,21 +136,16 @@ class ChineseTokenizer(DummyTokenizer):
raise ImportError(msg)
for word in words:
self.pkuseg_seg.preprocesser.insert(word.strip(), "")
+ else:
+ warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
+ warnings.warn(warn_msg)
def _get_config(self):
- config = OrderedDict(
- (
- ("use_jieba", self.use_jieba),
- ("use_pkuseg", self.use_pkuseg),
- ("require_pkuseg", self.require_pkuseg),
- )
- )
+ config = OrderedDict((("segmenter", self.segmenter),))
return config
def _set_config(self, config={}):
- self.use_jieba = config.get("use_jieba", False)
- self.use_pkuseg = config.get("use_pkuseg", False)
- self.require_pkuseg = config.get("require_pkuseg", False)
+ self.configure_segmenter(config)
def to_bytes(self, **kwargs):
pkuseg_features_b = b""
@@ -248,7 +262,7 @@ class ChineseTokenizer(DummyTokenizer):
try:
import pkuseg
except ImportError:
- if self.use_pkuseg:
+ if self.segmenter == "pkuseg":
raise ImportError(
"pkuseg not installed. To use this model, "
+ _PKUSEG_INSTALL_MSG
@@ -260,9 +274,9 @@ class ChineseTokenizer(DummyTokenizer):
try:
import pkuseg
except ImportError:
- if self.use_pkuseg:
+ if self.segmenter == "pkuseg":
raise ImportError(self._pkuseg_install_msg)
- if self.pkuseg_seg:
+ if self.segmenter == "pkuseg":
data = srsly.read_msgpack(path)
(user_dict, do_process, common_words, other_words) = data
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
@@ -288,8 +302,6 @@ class ChineseDefaults(Language.Defaults):
stop_words = STOP_WORDS
tag_map = TAG_MAP
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
- use_jieba = True
- use_pkuseg = False
@classmethod
def create_tokenizer(cls, nlp=None, config={}):
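
Taken together, the refactor reduces the tokenizer options to a single `segmenter` setting. A minimal sketch of the intended usage follows; character segmentation needs no extra packages, while the jieba and pkuseg variants assume those packages (and, for pkuseg, a model) are available.

```python
from spacy.lang.zh import Chinese

# Default: character segmentation, no third-party packages required
nlp = Chinese()
assert nlp.tokenizer.segmenter == "char"
assert [t.text for t in nlp("英文")] == ["英", "文"]

# Jieba word segmentation (assumes `pip install jieba`)
nlp_jieba = Chinese(meta={"tokenizer": {"config": {"segmenter": "jieba"}}})

# pkuseg word segmentation (assumes pkuseg and a model are available)
cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"}
nlp_pkuseg = Chinese(meta={"tokenizer": {"config": cfg}})

# An unsupported value emits W103 and falls back to "char"
nlp_unk = Chinese(meta={"tokenizer": {"config": {"segmenter": "unk"}}})
assert nlp_unk.tokenizer.segmenter == "char"
```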
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 22ff2ce26..bc8d088c5 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -244,22 +244,22 @@ def yo_tokenizer():
@pytest.fixture(scope="session")
def zh_tokenizer_char():
- return get_lang_class("zh").Defaults.create_tokenizer(
- config={"use_jieba": False, "use_pkuseg": False}
- )
+ return get_lang_class("zh").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def zh_tokenizer_jieba():
pytest.importorskip("jieba")
- return get_lang_class("zh").Defaults.create_tokenizer()
+ return get_lang_class("zh").Defaults.create_tokenizer(
+ config={"segmenter": "jieba"}
+ )
@pytest.fixture(scope="session")
def zh_tokenizer_pkuseg():
pytest.importorskip("pkuseg")
return get_lang_class("zh").Defaults.create_tokenizer(
- config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True}
+ config={"pkuseg_model": "default", "segmenter": "pkuseg"}
)
diff --git a/spacy/tests/lang/zh/test_serialize.py b/spacy/tests/lang/zh/test_serialize.py
index d84920c3e..544c4a7bc 100644
--- a/spacy/tests/lang/zh/test_serialize.py
+++ b/spacy/tests/lang/zh/test_serialize.py
@@ -5,14 +5,14 @@ from ...util import make_tempdir
def zh_tokenizer_serialize(zh_tokenizer):
tokenizer_bytes = zh_tokenizer.to_bytes()
- nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}})
+ nlp = Chinese()
nlp.tokenizer.from_bytes(tokenizer_bytes)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
with make_tempdir() as d:
file_path = d / "tokenizer"
zh_tokenizer.to_disk(file_path)
- nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}})
+ nlp = Chinese()
nlp.tokenizer.from_disk(file_path)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
@@ -25,18 +25,13 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
zh_tokenizer_serialize(zh_tokenizer_jieba)
-def test_zh_tokenizer_serialize_pkuseg(zh_tokenizer_pkuseg):
- zh_tokenizer_serialize(zh_tokenizer_pkuseg)
-
-
@pytest.mark.slow
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
nlp = Chinese(
meta={
"tokenizer": {
"config": {
- "use_jieba": False,
- "use_pkuseg": True,
+ "segmenter": "pkuseg",
"pkuseg_model": "medicine",
}
}
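
Because the segmenter choice is now part of `_get_config`, it survives a serialization roundtrip. A rough sketch of what the `zh_tokenizer_serialize` helper exercises (the jieba line assumes jieba is installed):

```python
from spacy.lang.zh import Chinese

nlp = Chinese(meta={"tokenizer": {"config": {"segmenter": "jieba"}}})
tokenizer_bytes = nlp.tokenizer.to_bytes()

# A freshly created tokenizer defaults to "char" until the bytes are loaded
nlp2 = Chinese()
nlp2.tokenizer.from_bytes(tokenizer_bytes)
assert nlp2.tokenizer.segmenter == "jieba"
```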
diff --git a/spacy/tests/lang/zh/test_tokenizer.py b/spacy/tests/lang/zh/test_tokenizer.py
index 7af8a7604..1ebb1e7b7 100644
--- a/spacy/tests/lang/zh/test_tokenizer.py
+++ b/spacy/tests/lang/zh/test_tokenizer.py
@@ -1,5 +1,5 @@
import pytest
-from spacy.lang.zh import _get_pkuseg_trie_data
+from spacy.lang.zh import Chinese, _get_pkuseg_trie_data
# fmt: off
@@ -37,7 +37,7 @@ def test_zh_tokenizer_pkuseg(zh_tokenizer_pkuseg, text, expected_tokens):
assert tokens == expected_tokens
-def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg):
+def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg, zh_tokenizer_char):
user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
zh_tokenizer_pkuseg.pkuseg_update_user_dict(["nonsense_asdf"])
updated_user_dict = _get_pkuseg_trie_data(
@@ -52,8 +52,24 @@ def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg):
)
assert len(reset_user_dict) == 0
+ # warn if not relevant
+ with pytest.warns(UserWarning):
+ zh_tokenizer_char.pkuseg_update_user_dict(["nonsense_asdf"])
-def test_extra_spaces(zh_tokenizer_char):
+
+def test_zh_extra_spaces(zh_tokenizer_char):
# note: three spaces after "I"
tokens = zh_tokenizer_char("I like cheese.")
assert tokens[1].orth_ == " "
+
+
+def test_zh_unsupported_segmenter():
+ with pytest.warns(UserWarning):
+ nlp = Chinese(meta={"tokenizer": {"config": {"segmenter": "unk"}}})
+
+
+def test_zh_uninitialized_pkuseg():
+ nlp = Chinese(meta={"tokenizer": {"config": {"segmenter": "char"}}})
+ nlp.tokenizer.segmenter = "pkuseg"
+ with pytest.raises(ValueError):
+ doc = nlp("test")
diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md
index 4c8bc1664..ce313c2ad 100644
--- a/website/docs/usage/models.md
+++ b/website/docs/usage/models.md
@@ -92,30 +92,35 @@ The Chinese language class supports three word segmentation options:
> ```python
> from spacy.lang.zh import Chinese
>
-> # Disable jieba to use character segmentation
-> Chinese.Defaults.use_jieba = False
+> # Character segmentation (default)
> nlp = Chinese()
>
-> # Disable jieba through tokenizer config options
-> cfg = {"use_jieba": False}
+> # Jieba
+> cfg = {"segmenter": "jieba"}
> nlp = Chinese(meta={"tokenizer": {"config": cfg}})
>
-> # Load with "default" model provided by pkuseg
-> cfg = {"pkuseg_model": "default", "require_pkuseg": True}
+> # PKUSeg with "default" model provided by pkuseg
+> cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"}
> nlp = Chinese(meta={"tokenizer": {"config": cfg}})
> ```
-1. **Jieba:** `Chinese` uses [Jieba](https://github.com/fxsjy/jieba) for word
- segmentation by default. It's enabled when you create a new `Chinese`
+1. **Character segmentation:** Character segmentation is used by default. It's
+ enabled when you create a new `Chinese`
language class or call `spacy.blank("zh")`.
-2. **Character segmentation:** Character segmentation is supported by disabling
- `jieba` and setting `Chinese.Defaults.use_jieba = False` _before_
- initializing the language class. As of spaCy v2.3.0, the `meta` tokenizer
- config options can be used to configure `use_jieba`.
-3. **PKUSeg**: In spaCy v2.3.0, support for
+2. **Jieba:** `Chinese` uses [Jieba](https://github.com/fxsjy/jieba) for word
+ segmentation with the tokenizer option `{"segmenter": "jieba"}`.
+3. **PKUSeg**: As of spaCy v2.3.0, support for
[PKUSeg](https://github.com/lancopku/PKUSeg-python) has been added to support
- better segmentation for Chinese OntoNotes and the new
- [Chinese models](/models/zh).
+ better segmentation for Chinese OntoNotes and the provided
+ [Chinese models](/models/zh). Enable PKUSeg with the tokenizer option
+ `{"segmenter": "pkuseg"}`.
+
+
+
+In spaCy v3, the default Chinese word segmenter has switched from Jieba to
+character segmentation.
+
+
@@ -129,29 +134,29 @@ $ pip install https://github.com/honnibal/pkuseg-python/archive/master.zip
-
+
The `meta` argument of the `Chinese` language class supports the following
tokenizer config settings:
-| Name | Type | Description |
-| ------------------ | ---- | ---------------------------------------------------------------------------------------------------- |
-| `pkuseg_model` | str | **Required:** Name of a model provided by `pkuseg` or the path to a local model directory. |
-| `pkuseg_user_dict` | str | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. |
-| `require_pkuseg` | bool | Overrides all `jieba` settings (optional but strongly recommended). |
+| Name | Type | Description |
+| ------------------ | ---- | ------------------------------------------------------------------------------------------------------- |
+| `segmenter` | str | Word segmenter: `char`, `jieba` or `pkuseg`. Defaults to `char`. |
+| `pkuseg_model` | str | **Required for `pkuseg`:** Name of a model provided by `pkuseg` or the path to a local model directory. |
+| `pkuseg_user_dict` | str | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. |
```python
### Examples
# Load "default" model
-cfg = {"pkuseg_model": "default", "require_pkuseg": True}
+cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"}
nlp = Chinese(meta={"tokenizer": {"config": cfg}})
# Load local model
-cfg = {"pkuseg_model": "/path/to/pkuseg_model", "require_pkuseg": True}
+cfg = {"segmenter": "pkuseg", "pkuseg_model": "/path/to/pkuseg_model"}
nlp = Chinese(meta={"tokenizer": {"config": cfg}})
# Override the user directory
-cfg = {"pkuseg_model": "default", "require_pkuseg": True, "pkuseg_user_dict": "/path"}
+cfg = {"segmenter": "pkuseg", "pkuseg_model": "default", "pkuseg_user_dict": "/path"}
nlp = Chinese(meta={"tokenizer": {"config": cfg}})
```
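
The pkuseg user dictionary can also be modified at runtime through the `pkuseg_update_user_dict` method added in this diff; calling it while a non-pkuseg segmenter is active only emits the W104 warning. A brief sketch, reusing the `nlp` object from the examples above:

```python
# Append words to the pkuseg user dictionary
nlp.tokenizer.pkuseg_update_user_dict(["中国", "ABC"])

# Reset the user dictionary before adding new entries
nlp.tokenizer.pkuseg_update_user_dict(["中国"], reset=True)
```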