Refactor Chinese tokenizer configuration (#5736)

* Refactor Chinese tokenizer configuration

Refactor `ChineseTokenizer` configuration so that it uses a single
`segmenter` setting to choose between character segmentation, jieba, and
pkuseg.

* replace `use_jieba`, `use_pkuseg`, and `require_pkuseg` with a single
  `segmenter` setting that supports the values `char`, `jieba`, and `pkuseg`
* make plain character segmentation (`char`) the default segmenter, so no
  additional libraries are required by default

* Fix Chinese serialization test to use char default

* Warn if attempting to customize another segmenter

Add a warning if `Chinese.pkuseg_update_user_dict` is called when
another segmenter is selected.
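For reference, a minimal sketch of the new configuration, mirroring the examples in the updated docs further down (the pkuseg case assumes the `pkuseg` package and a model are installed):

```python
from spacy.lang.zh import Chinese

# Character segmentation (new default, no extra dependencies)
nlp = Chinese()

# Jieba word segmentation
cfg = {"segmenter": "jieba"}
nlp = Chinese(meta={"tokenizer": {"config": cfg}})

# PKUSeg with the "default" model provided by pkuseg
cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"}
nlp = Chinese(meta={"tokenizer": {"config": cfg}})
```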
Author: Adriane Boyd, 2020-07-19 13:34:37 +02:00 (committed by GitHub)
Commit: 39ebcd9ec9 (parent 9ee1c54f40)
6 changed files with 134 additions and 98 deletions

View File

@@ -115,6 +115,10 @@ class Warnings:
             "string \"Field1=Value1,Value2|Field2=Value3\".")
     W101 = ("Skipping `Doc` custom extension '{name}' while merging docs.")
     W102 = ("Skipping unsupported user data '{key}: {value}' while merging docs.")
+    W103 = ("Unknown {lang} word segmenter '{segmenter}'. Supported "
+            "word segmenters: {supported}. Defaulting to {default}.")
+    W104 = ("Skipping modifications for '{target}' segmenter. The current "
+            "segmenter is '{current}'.")


 @add_codes
@@ -535,6 +539,10 @@ class Errors:
             "'{token_attrs}'.")
     E999 = ("Unable to merge the `Doc` objects because they do not all share "
             "the same `Vocab`.")
+    E1000 = ("No pkuseg model available. Provide a pkuseg model when "
+             "initializing the pipeline: "
+             '`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
+             'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`')


 @add_codes
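Taken together, the new codes surface roughly like this — a sketch distilled from the tests added later in this commit (no pkuseg model is loaded, and `"unk"` is just an arbitrary unsupported value):

```python
import pytest
from spacy.lang.zh import Chinese

# W103: an unknown segmenter value falls back to "char" with a warning
with pytest.warns(UserWarning):
    nlp = Chinese(meta={"tokenizer": {"config": {"segmenter": "unk"}}})

# W104: customizing the pkuseg user dict while another segmenter is active
with pytest.warns(UserWarning):
    nlp.tokenizer.pkuseg_update_user_dict(["example"])

# E1000: selecting pkuseg without providing a model raises at call time
nlp.tokenizer.segmenter = "pkuseg"
with pytest.raises(ValueError):
    nlp("测试")
```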

View File

@@ -1,8 +1,10 @@
 import tempfile
 import srsly
+import warnings
 from pathlib import Path
 from collections import OrderedDict
 from ...attrs import LANG
+from ...errors import Warnings, Errors
 from ...language import Language
 from ...tokens import Doc
 from ...util import DummyTokenizer
@@ -16,100 +18,117 @@ from ... import util
 _PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.22` or from https://github.com/lancopku/pkuseg-python"


-def try_jieba_import(use_jieba):
+def try_jieba_import(segmenter):
     try:
         import jieba

-        # segment a short text to have jieba initialize its cache in advance
-        list(jieba.cut("作为", cut_all=False))
+        if segmenter == "jieba":
+            # segment a short text to have jieba initialize its cache in advance
+            list(jieba.cut("作为", cut_all=False))

         return jieba
     except ImportError:
-        if use_jieba:
+        if segmenter == "jieba":
             msg = (
-                "Jieba not installed. Either set the default to False with "
-                "`from spacy.lang.zh import ChineseDefaults; ChineseDefaults.use_jieba = False`, "
-                "or install it with `pip install jieba` or from "
-                "https://github.com/fxsjy/jieba"
+                "Jieba not installed. To use jieba, install it with `pip "
+                " install jieba` or from https://github.com/fxsjy/jieba"
             )
             raise ImportError(msg)


-def try_pkuseg_import(use_pkuseg, pkuseg_model, pkuseg_user_dict):
+def try_pkuseg_import(segmenter, pkuseg_model, pkuseg_user_dict):
     try:
         import pkuseg

         if pkuseg_model:
             return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
-        elif use_pkuseg:
+        elif segmenter == "pkuseg":
             msg = (
-                "Chinese.use_pkuseg is True but no pkuseg model was specified. "
-                "Please provide the name of a pretrained model "
+                "The Chinese word segmenter is 'pkuseg' but no pkuseg model "
+                "was specified. Please provide the name of a pretrained model "
                 "or the path to a model with "
-                '`Chinese(meta={"tokenizer": {"config": {"pkuseg_model": name_or_path}}}).'
+                '`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
+                'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`'
             )
             raise ValueError(msg)
     except ImportError:
-        if use_pkuseg:
-            msg = (
-                "pkuseg not installed. Either set Chinese.use_pkuseg = False, "
-                "or " + _PKUSEG_INSTALL_MSG
-            )
+        if segmenter == "pkuseg":
+            msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
             raise ImportError(msg)
     except FileNotFoundError:
-        if use_pkuseg:
+        if segmenter == "pkuseg":
             msg = "Unable to load pkuseg model from: " + pkuseg_model
             raise FileNotFoundError(msg)


 class ChineseTokenizer(DummyTokenizer):
     def __init__(self, cls, nlp=None, config={}):
-        self.use_jieba = config.get("use_jieba", cls.use_jieba)
-        self.use_pkuseg = config.get("use_pkuseg", cls.use_pkuseg)
-        self.require_pkuseg = config.get("require_pkuseg", False)
+        self.supported_segmenters = ("char", "jieba", "pkuseg")
+        self.configure_segmenter(config)
         self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
-        self.jieba_seg = try_jieba_import(self.use_jieba)
-        self.pkuseg_seg = try_pkuseg_import(
-            self.use_pkuseg,
-            pkuseg_model=config.get("pkuseg_model", None),
-            pkuseg_user_dict=config.get("pkuseg_user_dict", "default"),
-        )
         # remove relevant settings from config so they're not also saved in
         # Language.meta
-        for key in ["use_jieba", "use_pkuseg", "require_pkuseg", "pkuseg_model"]:
+        for key in ["segmenter", "pkuseg_model", "pkuseg_user_dict"]:
             if key in config:
                 del config[key]
         self.tokenizer = Language.Defaults().create_tokenizer(nlp)

+    def configure_segmenter(self, config):
+        self.segmenter = "char"
+        if "segmenter" in config:
+            if config["segmenter"] in self.supported_segmenters:
+                self.segmenter = config["segmenter"]
+            else:
+                warn_msg = Warnings.W103.format(
+                    lang="Chinese",
+                    segmenter=config["segmenter"],
+                    supported=", ".join([repr(s) for s in self.supported_segmenters]),
+                    default="'char' (character segmentation)",
+                )
+                warnings.warn(warn_msg)
+        self.jieba_seg = try_jieba_import(self.segmenter)
+        self.pkuseg_seg = try_pkuseg_import(
+            self.segmenter,
+            pkuseg_model=config.get("pkuseg_model", None),
+            pkuseg_user_dict=config.get("pkuseg_user_dict", "default"),
+        )
+
     def __call__(self, text):
-        use_jieba = self.use_jieba
-        use_pkuseg = self.use_pkuseg
-        if self.require_pkuseg:
-            use_jieba = False
-            use_pkuseg = True
-        if use_jieba:
+        if self.segmenter == "jieba":
             words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x])
             (words, spaces) = util.get_words_and_spaces(words, text)
             return Doc(self.vocab, words=words, spaces=spaces)
-        elif use_pkuseg:
+        elif self.segmenter == "pkuseg":
+            if self.pkuseg_seg is None:
+                raise ValueError(Errors.E1000)
             words = self.pkuseg_seg.cut(text)
             (words, spaces) = util.get_words_and_spaces(words, text)
             return Doc(self.vocab, words=words, spaces=spaces)
-        else:
-            # split into individual characters
-            words = list(text)
-            (words, spaces) = util.get_words_and_spaces(words, text)
-            return Doc(self.vocab, words=words, spaces=spaces)
+
+        # warn if segmenter setting is not the only remaining option "char"
+        if self.segmenter != "char":
+            warn_msg = Warnings.W103.format(
+                lang="Chinese",
+                segmenter=self.segmenter,
+                supported=", ".join([repr(s) for s in self.supported_segmenters]),
+                default="'char' (character segmentation)",
+            )
+            warnings.warn(warn_msg)
+
+        # split into individual characters
+        words = list(text)
+        (words, spaces) = util.get_words_and_spaces(words, text)
+        return Doc(self.vocab, words=words, spaces=spaces)

     def pkuseg_update_user_dict(self, words, reset=False):
-        if self.pkuseg_seg:
+        if self.segmenter == "pkuseg":
             if reset:
                 try:
                     import pkuseg

                     self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
                 except ImportError:
-                    if self.use_pkuseg:
+                    if self.segmenter == "pkuseg":
                         msg = (
                             "pkuseg not installed: unable to reset pkuseg "
                             "user dict. Please " + _PKUSEG_INSTALL_MSG
@@ -117,21 +136,16 @@ class ChineseTokenizer(DummyTokenizer):
                         raise ImportError(msg)
             for word in words:
                 self.pkuseg_seg.preprocesser.insert(word.strip(), "")
+        else:
+            warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
+            warnings.warn(warn_msg)

     def _get_config(self):
-        config = OrderedDict(
-            (
-                ("use_jieba", self.use_jieba),
-                ("use_pkuseg", self.use_pkuseg),
-                ("require_pkuseg", self.require_pkuseg),
-            )
-        )
+        config = OrderedDict((("segmenter", self.segmenter),))
         return config

     def _set_config(self, config={}):
-        self.use_jieba = config.get("use_jieba", False)
-        self.use_pkuseg = config.get("use_pkuseg", False)
-        self.require_pkuseg = config.get("require_pkuseg", False)
+        self.configure_segmenter(config)

     def to_bytes(self, **kwargs):
         pkuseg_features_b = b""
@@ -248,7 +262,7 @@ class ChineseTokenizer(DummyTokenizer):
         try:
             import pkuseg
         except ImportError:
-            if self.use_pkuseg:
+            if self.segmenter == "pkuseg":
                 raise ImportError(
                     "pkuseg not installed. To use this model, "
                     + _PKUSEG_INSTALL_MSG
@@ -260,9 +274,9 @@ class ChineseTokenizer(DummyTokenizer):
         try:
             import pkuseg
         except ImportError:
-            if self.use_pkuseg:
+            if self.segmenter == "pkuseg":
                 raise ImportError(self._pkuseg_install_msg)
-        if self.pkuseg_seg:
+        if self.segmenter == "pkuseg":
             data = srsly.read_msgpack(path)
             (user_dict, do_process, common_words, other_words) = data
             self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
@@ -288,8 +302,6 @@ class ChineseDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
-    use_jieba = True
-    use_pkuseg = False

     @classmethod
     def create_tokenizer(cls, nlp=None, config={}):
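For the pkuseg path specifically, a short usage sketch under the assumption that the `pkuseg` package and its `"default"` model are installed (the user-dict calls mirror the `pkuseg_update_user_dict` method above; the example words are arbitrary):

```python
from spacy.lang.zh import Chinese

# pkuseg segmentation with the "default" model provided by pkuseg
cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"}
nlp = Chinese(meta={"tokenizer": {"config": cfg}})

# add entries to the pkuseg user dictionary, then reset it again
nlp.tokenizer.pkuseg_update_user_dict(["中文分词"])
doc = nlp("中文分词很重要")
nlp.tokenizer.pkuseg_update_user_dict([], reset=True)
```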

View File

@@ -244,22 +244,22 @@ def yo_tokenizer():

 @pytest.fixture(scope="session")
 def zh_tokenizer_char():
-    return get_lang_class("zh").Defaults.create_tokenizer(
-        config={"use_jieba": False, "use_pkuseg": False}
-    )
+    return get_lang_class("zh").Defaults.create_tokenizer()


 @pytest.fixture(scope="session")
 def zh_tokenizer_jieba():
     pytest.importorskip("jieba")
-    return get_lang_class("zh").Defaults.create_tokenizer()
+    return get_lang_class("zh").Defaults.create_tokenizer(
+        config={"segmenter": "jieba"}
+    )


 @pytest.fixture(scope="session")
 def zh_tokenizer_pkuseg():
     pytest.importorskip("pkuseg")
     return get_lang_class("zh").Defaults.create_tokenizer(
-        config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True}
+        config={"pkuseg_model": "default", "segmenter": "pkuseg"}
     )

View File

@@ -5,14 +5,14 @@ from ...util import make_tempdir

 def zh_tokenizer_serialize(zh_tokenizer):
     tokenizer_bytes = zh_tokenizer.to_bytes()
-    nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}})
+    nlp = Chinese()
     nlp.tokenizer.from_bytes(tokenizer_bytes)
     assert tokenizer_bytes == nlp.tokenizer.to_bytes()

     with make_tempdir() as d:
         file_path = d / "tokenizer"
         zh_tokenizer.to_disk(file_path)
-        nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}})
+        nlp = Chinese()
         nlp.tokenizer.from_disk(file_path)
         assert tokenizer_bytes == nlp.tokenizer.to_bytes()
@@ -25,18 +25,13 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
     zh_tokenizer_serialize(zh_tokenizer_jieba)


-def test_zh_tokenizer_serialize_pkuseg(zh_tokenizer_pkuseg):
-    zh_tokenizer_serialize(zh_tokenizer_pkuseg)
-
-
 @pytest.mark.slow
 def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
     nlp = Chinese(
         meta={
             "tokenizer": {
                 "config": {
-                    "use_jieba": False,
-                    "use_pkuseg": True,
+                    "segmenter": "pkuseg",
                     "pkuseg_model": "medicine",
                 }
             }

View File

@@ -1,5 +1,5 @@
 import pytest
-from spacy.lang.zh import _get_pkuseg_trie_data
+from spacy.lang.zh import Chinese, _get_pkuseg_trie_data


 # fmt: off
@@ -37,7 +37,7 @@ def test_zh_tokenizer_pkuseg(zh_tokenizer_pkuseg, text, expected_tokens):
     assert tokens == expected_tokens


-def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg):
+def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg, zh_tokenizer_char):
     user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
     zh_tokenizer_pkuseg.pkuseg_update_user_dict(["nonsense_asdf"])
     updated_user_dict = _get_pkuseg_trie_data(
@@ -52,8 +52,24 @@ def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg):
     )
     assert len(reset_user_dict) == 0

+    # warn if not relevant
+    with pytest.warns(UserWarning):
+        zh_tokenizer_char.pkuseg_update_user_dict(["nonsense_asdf"])


-def test_extra_spaces(zh_tokenizer_char):
+def test_zh_extra_spaces(zh_tokenizer_char):
     # note: three spaces after "I"
     tokens = zh_tokenizer_char("I   like cheese.")
     assert tokens[1].orth_ == "  "
+
+
+def test_zh_unsupported_segmenter():
+    with pytest.warns(UserWarning):
+        nlp = Chinese(meta={"tokenizer": {"config": {"segmenter": "unk"}}})
+
+
+def test_zh_uninitialized_pkuseg():
+    nlp = Chinese(meta={"tokenizer": {"config": {"segmenter": "char"}}})
+    nlp.tokenizer.segmenter = "pkuseg"
+    with pytest.raises(ValueError):
+        doc = nlp("test")

View File

@@ -92,30 +92,35 @@ The Chinese language class supports three word segmentation options:

 > ```python
 > from spacy.lang.zh import Chinese
 >
-> # Disable jieba to use character segmentation
-> Chinese.Defaults.use_jieba = False
+> # Character segmentation (default)
 > nlp = Chinese()
 >
-> # Disable jieba through tokenizer config options
-> cfg = {"use_jieba": False}
+> # Jieba
+> cfg = {"segmenter": "jieba"}
 > nlp = Chinese(meta={"tokenizer": {"config": cfg}})
 >
-> # Load with "default" model provided by pkuseg
-> cfg = {"pkuseg_model": "default", "require_pkuseg": True}
+> # PKUSeg with "default" model provided by pkuseg
+> cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"}
 > nlp = Chinese(meta={"tokenizer": {"config": cfg}})
 > ```

-1. **Jieba:** `Chinese` uses [Jieba](https://github.com/fxsjy/jieba) for word
-   segmentation by default. It's enabled when you create a new `Chinese`
+1. **Character segmentation:** Character segmentation is the default
+   segmentation option. It's enabled when you create a new `Chinese`
    language class or call `spacy.blank("zh")`.
-2. **Character segmentation:** Character segmentation is supported by disabling
-   `jieba` and setting `Chinese.Defaults.use_jieba = False` _before_
-   initializing the language class. As of spaCy v2.3.0, the `meta` tokenizer
-   config options can be used to configure `use_jieba`.
-3. **PKUSeg**: In spaCy v2.3.0, support for
+2. **Jieba:** `Chinese` uses [Jieba](https://github.com/fxsjy/jieba) for word
+   segmentation with the tokenizer option `{"segmenter": "jieba"}`.
+3. **PKUSeg**: As of spaCy v2.3.0, support for
    [PKUSeg](https://github.com/lancopku/PKUSeg-python) has been added to support
-   better segmentation for Chinese OntoNotes and the new
-   [Chinese models](/models/zh).
+   better segmentation for Chinese OntoNotes and the provided
+   [Chinese models](/models/zh). Enable PKUSeg with the tokenizer option
+   `{"segmenter": "pkuseg"}`.
+
+<Infobox variant="warning">
+
+In spaCy v3, the default Chinese word segmenter has switched from Jieba to
+character segmentation.
+
+</Infobox>

 <Infobox variant="warning">
@@ -129,29 +134,29 @@

 </Infobox>

-<Accordion title="Details on spaCy's PKUSeg API">
+<Accordion title="Details on spaCy's Chinese API">

 The `meta` argument of the `Chinese` language class supports the following
 tokenizer config settings:

 | Name               | Type | Description                                                                                           |
-| ------------------ | ---- | ----------------------------------------------------------------------------------------------------- |
-| `pkuseg_model`     | str  | **Required:** Name of a model provided by `pkuseg` or the path to a local model directory.           |
-| `pkuseg_user_dict` | str  | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. |
-| `require_pkuseg`   | bool | Overrides all `jieba` settings (optional but strongly recommended).                                   |
+| ------------------ | ---- | --------------------------------------------------------------------------------------------------------- |
+| `segmenter`        | str  | Word segmenter: `char`, `jieba` or `pkuseg`. Defaults to `char`.                                          |
+| `pkuseg_model`     | str  | **Required for `pkuseg`:** Name of a model provided by `pkuseg` or the path to a local model directory.  |
+| `pkuseg_user_dict` | str  | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary.     |

 ```python
 ### Examples
 # Load "default" model
-cfg = {"pkuseg_model": "default", "require_pkuseg": True}
+cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"}
 nlp = Chinese(meta={"tokenizer": {"config": cfg}})

 # Load local model
-cfg = {"pkuseg_model": "/path/to/pkuseg_model", "require_pkuseg": True}
+cfg = {"segmenter": "pkuseg", "pkuseg_model": "/path/to/pkuseg_model"}
 nlp = Chinese(meta={"tokenizer": {"config": cfg}})

 # Override the user directory
-cfg = {"pkuseg_model": "default", "require_pkuseg": True, "pkuseg_user_dict": "/path"}
+cfg = {"segmenter": "pkuseg", "pkuseg_model": "default", "pkuseg_user_dict": "/path"}
 nlp = Chinese(meta={"tokenizer": {"config": cfg}})
 ```