Refactor Chinese tokenizer configuration (#5736)

* Refactor Chinese tokenizer configuration

Refactor `ChineseTokenizer` configuration so that it uses a single
`segmenter` setting to choose between character segmentation, jieba, and
pkuseg.

* replace `use_jieba`, `use_pkuseg`, and `require_pkuseg` with a single
`segmenter` setting that accepts the values `char`, `jieba`, and `pkuseg`
* make plain character segmentation (`char`) the default segmenter, so no
additional libraries are required out of the box

* Fix Chinese serialization test to use char default

* Warn when attempting to customize a segmenter other than the active one

Add a warning if `ChineseTokenizer.pkuseg_update_user_dict` is called while
another segmenter is selected.
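
In practice the new setting is passed through the tokenizer config in `meta`. A minimal sketch of the three options (mirroring the examples added to the docs below; jieba and pkuseg only need to be installed if those segmenters are selected):

```python
from spacy.lang.zh import Chinese

# default: plain character segmentation, no extra dependencies
nlp_char = Chinese()

# jieba word segmentation (requires `pip install jieba`)
cfg = {"segmenter": "jieba"}
nlp_jieba = Chinese(meta={"tokenizer": {"config": cfg}})

# pkuseg word segmentation with the pretrained "default" model
cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"}
nlp_pkuseg = Chinese(meta={"tokenizer": {"config": cfg}})
```
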
Adriane Boyd, 2020-07-19 13:34:37 +02:00, committed by GitHub
parent 9ee1c54f40
commit 39ebcd9ec9
6 changed files with 134 additions and 98 deletions


@@ -115,6 +115,10 @@ class Warnings:
"string \"Field1=Value1,Value2|Field2=Value3\".")
W101 = ("Skipping `Doc` custom extension '{name}' while merging docs.")
W102 = ("Skipping unsupported user data '{key}: {value}' while merging docs.")
W103 = ("Unknown {lang} word segmenter '{segmenter}'. Supported "
"word segmenters: {supported}. Defaulting to {default}.")
W104 = ("Skipping modifications for '{target}' segmenter. The current "
"segmenter is '{current}'.")
@add_codes
@@ -535,6 +539,10 @@ class Errors:
"'{token_attrs}'.")
E999 = ("Unable to merge the `Doc` objects because they do not all share "
"the same `Vocab`.")
E1000 = ("No pkuseg model available. Provide a pkuseg model when "
"initializing the pipeline: "
'`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`')
@add_codes
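
The conditions that trigger the new codes can be sketched briefly (this follows the tokenizer changes and tests added later in this commit):

```python
import warnings

from spacy.lang.zh import Chinese

# W103: an unknown segmenter value falls back to "char" with a warning
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    nlp = Chinese(meta={"tokenizer": {"config": {"segmenter": "unk"}}})
assert nlp.tokenizer.segmenter == "char"
assert len(caught) >= 1  # W103 was emitted

# E1000: the "pkuseg" segmenter is selected but no pkuseg model is loaded
nlp = Chinese()
nlp.tokenizer.segmenter = "pkuseg"
try:
    nlp("测试")
except ValueError as err:
    print(err)  # [E1000] No pkuseg model available. ...
```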


@@ -1,8 +1,10 @@
import tempfile
import srsly
import warnings
from pathlib import Path
from collections import OrderedDict
from ...attrs import LANG
from ...errors import Warnings, Errors
from ...language import Language
from ...tokens import Doc
from ...util import DummyTokenizer
@@ -16,100 +18,117 @@ from ... import util
_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.22` or from https://github.com/lancopku/pkuseg-python"
def try_jieba_import(use_jieba):
def try_jieba_import(segmenter):
try:
import jieba
if segmenter == "jieba":
# segment a short text to have jieba initialize its cache in advance
list(jieba.cut("作为", cut_all=False))
return jieba
except ImportError:
if use_jieba:
if segmenter == "jieba":
msg = (
"Jieba not installed. Either set the default to False with "
"`from spacy.lang.zh import ChineseDefaults; ChineseDefaults.use_jieba = False`, "
"or install it with `pip install jieba` or from "
"https://github.com/fxsjy/jieba"
"Jieba not installed. To use jieba, install it with `pip "
" install jieba` or from https://github.com/fxsjy/jieba"
)
raise ImportError(msg)
def try_pkuseg_import(use_pkuseg, pkuseg_model, pkuseg_user_dict):
def try_pkuseg_import(segmenter, pkuseg_model, pkuseg_user_dict):
try:
import pkuseg
if pkuseg_model:
return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
elif use_pkuseg:
elif segmenter == "pkuseg":
msg = (
"Chinese.use_pkuseg is True but no pkuseg model was specified. "
"Please provide the name of a pretrained model "
"The Chinese word segmenter is 'pkuseg' but no pkuseg model "
"was specified. Please provide the name of a pretrained model "
"or the path to a model with "
'`Chinese(meta={"tokenizer": {"config": {"pkuseg_model": name_or_path}}}).'
'`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`'
)
raise ValueError(msg)
except ImportError:
if use_pkuseg:
msg = (
"pkuseg not installed. Either set Chinese.use_pkuseg = False, "
"or " + _PKUSEG_INSTALL_MSG
)
if segmenter == "pkuseg":
msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
raise ImportError(msg)
except FileNotFoundError:
if use_pkuseg:
if segmenter == "pkuseg":
msg = "Unable to load pkuseg model from: " + pkuseg_model
raise FileNotFoundError(msg)
class ChineseTokenizer(DummyTokenizer):
def __init__(self, cls, nlp=None, config={}):
self.use_jieba = config.get("use_jieba", cls.use_jieba)
self.use_pkuseg = config.get("use_pkuseg", cls.use_pkuseg)
self.require_pkuseg = config.get("require_pkuseg", False)
self.supported_segmenters = ("char", "jieba", "pkuseg")
self.configure_segmenter(config)
self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
self.jieba_seg = try_jieba_import(self.use_jieba)
self.pkuseg_seg = try_pkuseg_import(
self.use_pkuseg,
pkuseg_model=config.get("pkuseg_model", None),
pkuseg_user_dict=config.get("pkuseg_user_dict", "default"),
)
# remove relevant settings from config so they're not also saved in
# Language.meta
for key in ["use_jieba", "use_pkuseg", "require_pkuseg", "pkuseg_model"]:
for key in ["segmenter", "pkuseg_model", "pkuseg_user_dict"]:
if key in config:
del config[key]
self.tokenizer = Language.Defaults().create_tokenizer(nlp)
def configure_segmenter(self, config):
self.segmenter = "char"
if "segmenter" in config:
if config["segmenter"] in self.supported_segmenters:
self.segmenter = config["segmenter"]
else:
warn_msg = Warnings.W103.format(
lang="Chinese",
segmenter=config["segmenter"],
supported=", ".join([repr(s) for s in self.supported_segmenters]),
default="'char' (character segmentation)",
)
warnings.warn(warn_msg)
self.jieba_seg = try_jieba_import(self.segmenter)
self.pkuseg_seg = try_pkuseg_import(
self.segmenter,
pkuseg_model=config.get("pkuseg_model", None),
pkuseg_user_dict=config.get("pkuseg_user_dict", "default"),
)
def __call__(self, text):
use_jieba = self.use_jieba
use_pkuseg = self.use_pkuseg
if self.require_pkuseg:
use_jieba = False
use_pkuseg = True
if use_jieba:
if self.segmenter == "jieba":
words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x])
(words, spaces) = util.get_words_and_spaces(words, text)
return Doc(self.vocab, words=words, spaces=spaces)
elif use_pkuseg:
elif self.segmenter == "pkuseg":
if self.pkuseg_seg is None:
raise ValueError(Errors.E1000)
words = self.pkuseg_seg.cut(text)
(words, spaces) = util.get_words_and_spaces(words, text)
return Doc(self.vocab, words=words, spaces=spaces)
else:
# warn if segmenter setting is not the only remaining option "char"
if self.segmenter != "char":
warn_msg = Warnings.W103.format(
lang="Chinese",
segmenter=self.segmenter,
supported=", ".join([repr(s) for s in self.supported_segmenters]),
default="'char' (character segmentation)",
)
warnings.warn(warn_msg)
# split into individual characters
words = list(text)
(words, spaces) = util.get_words_and_spaces(words, text)
return Doc(self.vocab, words=words, spaces=spaces)
def pkuseg_update_user_dict(self, words, reset=False):
if self.pkuseg_seg:
if self.segmenter == "pkuseg":
if reset:
try:
import pkuseg
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
except ImportError:
if self.use_pkuseg:
if self.segmenter == "pkuseg":
msg = (
"pkuseg not installed: unable to reset pkuseg "
"user dict. Please " + _PKUSEG_INSTALL_MSG
@@ -117,21 +136,16 @@ class ChineseTokenizer(DummyTokenizer):
raise ImportError(msg)
for word in words:
self.pkuseg_seg.preprocesser.insert(word.strip(), "")
else:
warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
warnings.warn(warn_msg)
def _get_config(self):
config = OrderedDict(
(
("use_jieba", self.use_jieba),
("use_pkuseg", self.use_pkuseg),
("require_pkuseg", self.require_pkuseg),
)
)
config = OrderedDict((("segmenter", self.segmenter),))
return config
def _set_config(self, config={}):
self.use_jieba = config.get("use_jieba", False)
self.use_pkuseg = config.get("use_pkuseg", False)
self.require_pkuseg = config.get("require_pkuseg", False)
self.configure_segmenter(config)
def to_bytes(self, **kwargs):
pkuseg_features_b = b""
@@ -248,7 +262,7 @@ class ChineseTokenizer(DummyTokenizer):
try:
import pkuseg
except ImportError:
if self.use_pkuseg:
if self.segmenter == "pkuseg":
raise ImportError(
"pkuseg not installed. To use this model, "
+ _PKUSEG_INSTALL_MSG
@@ -260,9 +274,9 @@
try:
import pkuseg
except ImportError:
if self.use_pkuseg:
if self.segmenter == "pkuseg":
raise ImportError(self._pkuseg_install_msg)
if self.pkuseg_seg:
if self.segmenter == "pkuseg":
data = srsly.read_msgpack(path)
(user_dict, do_process, common_words, other_words) = data
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
@@ -288,8 +302,6 @@ class ChineseDefaults(Language.Defaults):
stop_words = STOP_WORDS
tag_map = TAG_MAP
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
use_jieba = True
use_pkuseg = False
@classmethod
def create_tokenizer(cls, nlp=None, config={}):
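
For context, a usage sketch of the user-dict API after this change (it assumes pkuseg and its pretrained "default" model are available); with any segmenter other than `pkuseg`, the call now emits W104 and changes nothing:

```python
from spacy.lang.zh import Chinese

cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"}
nlp = Chinese(meta={"tokenizer": {"config": cfg}})

# add entries to the pkuseg user dictionary
nlp.tokenizer.pkuseg_update_user_dict(["中文", "词典"])

# clear the user dictionary, then add a fresh set of entries
nlp.tokenizer.pkuseg_update_user_dict(["自然语言处理"], reset=True)

# with the default "char" segmenter this is a no-op and warns (W104)
Chinese().tokenizer.pkuseg_update_user_dict(["中文"])
```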


@@ -244,22 +244,22 @@ def yo_tokenizer():
@pytest.fixture(scope="session")
def zh_tokenizer_char():
return get_lang_class("zh").Defaults.create_tokenizer(
config={"use_jieba": False, "use_pkuseg": False}
)
return get_lang_class("zh").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def zh_tokenizer_jieba():
pytest.importorskip("jieba")
return get_lang_class("zh").Defaults.create_tokenizer()
return get_lang_class("zh").Defaults.create_tokenizer(
config={"segmenter": "jieba"}
)
@pytest.fixture(scope="session")
def zh_tokenizer_pkuseg():
pytest.importorskip("pkuseg")
return get_lang_class("zh").Defaults.create_tokenizer(
config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True}
config={"pkuseg_model": "default", "segmenter": "pkuseg"}
)


@@ -5,14 +5,14 @@ from ...util import make_tempdir
def zh_tokenizer_serialize(zh_tokenizer):
tokenizer_bytes = zh_tokenizer.to_bytes()
nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}})
nlp = Chinese()
nlp.tokenizer.from_bytes(tokenizer_bytes)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
with make_tempdir() as d:
file_path = d / "tokenizer"
zh_tokenizer.to_disk(file_path)
nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}})
nlp = Chinese()
nlp.tokenizer.from_disk(file_path)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
@@ -25,18 +25,13 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
zh_tokenizer_serialize(zh_tokenizer_jieba)
def test_zh_tokenizer_serialize_pkuseg(zh_tokenizer_pkuseg):
zh_tokenizer_serialize(zh_tokenizer_pkuseg)
@pytest.mark.slow
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
nlp = Chinese(
meta={
"tokenizer": {
"config": {
"use_jieba": False,
"use_pkuseg": True,
"segmenter": "pkuseg",
"pkuseg_model": "medicine",
}
}
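
Outside of pytest, the same round trip can be sketched as follows; the tokenizer config (now just `segmenter`) survives `to_bytes()`/`from_bytes()` (jieba is assumed to be installed for this example):

```python
from spacy.lang.zh import Chinese

nlp = Chinese(meta={"tokenizer": {"config": {"segmenter": "jieba"}}})
tokenizer_bytes = nlp.tokenizer.to_bytes()

# load the serialized tokenizer into a fresh pipeline (char default)
nlp2 = Chinese()
nlp2.tokenizer.from_bytes(tokenizer_bytes)
assert nlp2.tokenizer.segmenter == "jieba"
assert nlp2.tokenizer.to_bytes() == tokenizer_bytes
```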


@@ -1,5 +1,5 @@
import pytest
from spacy.lang.zh import _get_pkuseg_trie_data
from spacy.lang.zh import Chinese, _get_pkuseg_trie_data
# fmt: off
@@ -37,7 +37,7 @@ def test_zh_tokenizer_pkuseg(zh_tokenizer_pkuseg, text, expected_tokens):
assert tokens == expected_tokens
def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg):
def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg, zh_tokenizer_char):
user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
zh_tokenizer_pkuseg.pkuseg_update_user_dict(["nonsense_asdf"])
updated_user_dict = _get_pkuseg_trie_data(
@@ -52,8 +52,24 @@ def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg):
)
assert len(reset_user_dict) == 0
# warn if not relevant
with pytest.warns(UserWarning):
zh_tokenizer_char.pkuseg_update_user_dict(["nonsense_asdf"])
def test_extra_spaces(zh_tokenizer_char):
def test_zh_extra_spaces(zh_tokenizer_char):
# note: three spaces after "I"
tokens = zh_tokenizer_char("I like cheese.")
assert tokens[1].orth_ == " "
def test_zh_unsupported_segmenter():
with pytest.warns(UserWarning):
nlp = Chinese(meta={"tokenizer": {"config": {"segmenter": "unk"}}})
def test_zh_uninitialized_pkuseg():
nlp = Chinese(meta={"tokenizer": {"config": {"segmenter": "char"}}})
nlp.tokenizer.segmenter = "pkuseg"
with pytest.raises(ValueError):
doc = nlp("test")


@@ -92,30 +92,35 @@ The Chinese language class supports three word segmentation options:
> ```python
> from spacy.lang.zh import Chinese
>
> # Disable jieba to use character segmentation
> Chinese.Defaults.use_jieba = False
> # Character segmentation (default)
> nlp = Chinese()
>
> # Disable jieba through tokenizer config options
> cfg = {"use_jieba": False}
> # Jieba
> cfg = {"segmenter": "jieba"}
> nlp = Chinese(meta={"tokenizer": {"config": cfg}})
>
> # Load with "default" model provided by pkuseg
> cfg = {"pkuseg_model": "default", "require_pkuseg": True}
> # PKUSeg with "default" model provided by pkuseg
> cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"}
> nlp = Chinese(meta={"tokenizer": {"config": cfg}})
> ```
1. **Jieba:** `Chinese` uses [Jieba](https://github.com/fxsjy/jieba) for word
segmentation by default. It's enabled when you create a new `Chinese`
1. **Character segmentation:** Character segmentation is the default
segmentation option. It's enabled when you create a new `Chinese`
language class or call `spacy.blank("zh")`.
2. **Character segmentation:** Character segmentation is supported by disabling
`jieba` and setting `Chinese.Defaults.use_jieba = False` _before_
initializing the language class. As of spaCy v2.3.0, the `meta` tokenizer
config options can be used to configure `use_jieba`.
3. **PKUSeg**: In spaCy v2.3.0, support for
2. **Jieba:** `Chinese` uses [Jieba](https://github.com/fxsjy/jieba) for word
segmentation with the tokenizer option `{"segmenter": "jieba"}`.
3. **PKUSeg**: As of spaCy v2.3.0, support for
[PKUSeg](https://github.com/lancopku/PKUSeg-python) has been added to support
better segmentation for Chinese OntoNotes and the new
[Chinese models](/models/zh).
better segmentation for Chinese OntoNotes and the provided
[Chinese models](/models/zh). Enable PKUSeg with the tokenizer option
`{"segmenter": "pkuseg"}`.
<Infobox variant="warning">
In spaCy v3, the default Chinese word segmenter has switched from Jieba to
character segmentation.
</Infobox>
<Infobox variant="warning">
@@ -129,29 +134,29 @@ $ pip install https://github.com/honnibal/pkuseg-python/archive/master.zip
</Infobox>
<Accordion title="Details on spaCy's PKUSeg API">
<Accordion title="Details on spaCy's Chinese API">
The `meta` argument of the `Chinese` language class supports the following
tokenizer config settings:
| Name | Type | Description |
| ------------------ | ---- | ---------------------------------------------------------------------------------------------------- |
| `pkuseg_model` | str | **Required:** Name of a model provided by `pkuseg` or the path to a local model directory. |
| ------------------ | ---- | ------------------------------------------------------------------------------------------------------- |
| `segmenter` | str | Word segmenter: `char`, `jieba` or `pkuseg`. Defaults to `char`. |
| `pkuseg_model` | str | **Required for `pkuseg`:** Name of a model provided by `pkuseg` or the path to a local model directory. |
| `pkuseg_user_dict` | str | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. |
| `require_pkuseg` | bool | Overrides all `jieba` settings (optional but strongly recommended). |
```python
### Examples
# Load "default" model
cfg = {"pkuseg_model": "default", "require_pkuseg": True}
cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"}
nlp = Chinese(meta={"tokenizer": {"config": cfg}})
# Load local model
cfg = {"pkuseg_model": "/path/to/pkuseg_model", "require_pkuseg": True}
cfg = {"segmenter": "pkuseg", "pkuseg_model": "/path/to/pkuseg_model"}
nlp = Chinese(meta={"tokenizer": {"config": cfg}})
# Override the user directory
cfg = {"pkuseg_model": "default", "require_pkuseg": True, "pkuseg_user_dict": "/path"}
cfg = {"segmenter": "pkuseg", "pkuseg_model": "default", "pkuseg_user_dict": "/path"}
nlp = Chinese(meta={"tokenizer": {"config": cfg}})
```
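
For existing v2.3 configurations, the replacements above imply the following mapping onto the new `segmenter` setting (a sketch, not an exhaustive migration guide):

```python
from spacy.lang.zh import Chinese

# before: cfg = {"use_jieba": False, "use_pkuseg": False}  -> now the default
nlp = Chinese()

# before: use_jieba=True (the old default)  -> select jieba explicitly
nlp = Chinese(meta={"tokenizer": {"config": {"segmenter": "jieba"}}})

# before: cfg = {"pkuseg_model": "default", "require_pkuseg": True}
cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"}
nlp = Chinese(meta={"tokenizer": {"config": cfg}})
```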