mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Refactor Chinese tokenizer configuration (#5736)
* Refactor Chinese tokenizer configuration

  Refactor `ChineseTokenizer` configuration so that it uses a single `segmenter` setting to choose between character segmentation, jieba, and pkuseg.

  * replace `use_jieba`, `use_pkuseg`, `require_pkuseg` with the setting `segmenter` with the supported values: `char`, `jieba`, `pkuseg`
  * make the default segmenter plain character segmentation `char` (no additional libraries required)

* Fix Chinese serialization test to use char default

* Warn if attempting to customize other segmenter

  Add a warning if `Chinese.pkuseg_update_user_dict` is called when another segmenter is selected.
This commit is contained in:
parent
9ee1c54f40
commit
39ebcd9ec9
|
@ -115,6 +115,10 @@ class Warnings:
|
|||
"string \"Field1=Value1,Value2|Field2=Value3\".")
|
||||
W101 = ("Skipping `Doc` custom extension '{name}' while merging docs.")
|
||||
W102 = ("Skipping unsupported user data '{key}: {value}' while merging docs.")
|
||||
W103 = ("Unknown {lang} word segmenter '{segmenter}'. Supported "
|
||||
"word segmenters: {supported}. Defaulting to {default}.")
|
||||
W104 = ("Skipping modifications for '{target}' segmenter. The current "
|
||||
"segmenter is '{current}'.")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
@ -535,6 +539,10 @@ class Errors:
|
|||
"'{token_attrs}'.")
|
||||
E999 = ("Unable to merge the `Doc` objects because they do not all share "
|
||||
"the same `Vocab`.")
|
||||
E1000 = ("No pkuseg model available. Provide a pkuseg model when "
|
||||
"initializing the pipeline: "
|
||||
'`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
|
||||
'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`')
|
||||
|
||||
|
||||
@add_codes
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
import tempfile
|
||||
import srsly
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from collections import OrderedDict
|
||||
from ...attrs import LANG
|
||||
from ...errors import Warnings, Errors
|
||||
from ...language import Language
|
||||
from ...tokens import Doc
|
||||
from ...util import DummyTokenizer
|
||||
|
@ -16,100 +18,117 @@ from ... import util
|
|||
_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.22` or from https://github.com/lancopku/pkuseg-python"
|
||||
|
||||
|
||||
def try_jieba_import(use_jieba):
|
||||
def try_jieba_import(segmenter):
|
||||
try:
|
||||
import jieba
|
||||
|
||||
# segment a short text to have jieba initialize its cache in advance
|
||||
list(jieba.cut("作为", cut_all=False))
|
||||
if segmenter == "jieba":
|
||||
# segment a short text to have jieba initialize its cache in advance
|
||||
list(jieba.cut("作为", cut_all=False))
|
||||
|
||||
return jieba
|
||||
except ImportError:
|
||||
if use_jieba:
|
||||
if segmenter == "jieba":
|
||||
msg = (
|
||||
"Jieba not installed. Either set the default to False with "
|
||||
"`from spacy.lang.zh import ChineseDefaults; ChineseDefaults.use_jieba = False`, "
|
||||
"or install it with `pip install jieba` or from "
|
||||
"https://github.com/fxsjy/jieba"
|
||||
"Jieba not installed. To use jieba, install it with `pip "
|
||||
"install jieba` or from https://github.com/fxsjy/jieba"
|
||||
)
|
||||
raise ImportError(msg)
|
||||
|
||||
|
||||
def try_pkuseg_import(use_pkuseg, pkuseg_model, pkuseg_user_dict):
|
||||
def try_pkuseg_import(segmenter, pkuseg_model, pkuseg_user_dict):
|
||||
try:
|
||||
import pkuseg
|
||||
|
||||
if pkuseg_model:
|
||||
return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
|
||||
elif use_pkuseg:
|
||||
elif segmenter == "pkuseg":
|
||||
msg = (
|
||||
"Chinese.use_pkuseg is True but no pkuseg model was specified. "
|
||||
"Please provide the name of a pretrained model "
|
||||
"The Chinese word segmenter is 'pkuseg' but no pkuseg model "
|
||||
"was specified. Please provide the name of a pretrained model "
|
||||
"or the path to a model with "
|
||||
'`Chinese(meta={"tokenizer": {"config": {"pkuseg_model": name_or_path}}}).'
|
||||
'`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
|
||||
'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`'
|
||||
)
|
||||
raise ValueError(msg)
|
||||
except ImportError:
|
||||
if use_pkuseg:
|
||||
msg = (
|
||||
"pkuseg not installed. Either set Chinese.use_pkuseg = False, "
|
||||
"or " + _PKUSEG_INSTALL_MSG
|
||||
)
|
||||
if segmenter == "pkuseg":
|
||||
msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
|
||||
raise ImportError(msg)
|
||||
except FileNotFoundError:
|
||||
if use_pkuseg:
|
||||
if segmenter == "pkuseg":
|
||||
msg = "Unable to load pkuseg model from: " + pkuseg_model
|
||||
raise FileNotFoundError(msg)
|
||||
|
||||
|
||||
class ChineseTokenizer(DummyTokenizer):
|
||||
def __init__(self, cls, nlp=None, config={}):
|
||||
self.use_jieba = config.get("use_jieba", cls.use_jieba)
|
||||
self.use_pkuseg = config.get("use_pkuseg", cls.use_pkuseg)
|
||||
self.require_pkuseg = config.get("require_pkuseg", False)
|
||||
self.supported_segmenters = ("char", "jieba", "pkuseg")
|
||||
self.configure_segmenter(config)
|
||||
self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
|
||||
self.jieba_seg = try_jieba_import(self.use_jieba)
|
||||
self.pkuseg_seg = try_pkuseg_import(
|
||||
self.use_pkuseg,
|
||||
pkuseg_model=config.get("pkuseg_model", None),
|
||||
pkuseg_user_dict=config.get("pkuseg_user_dict", "default"),
|
||||
)
|
||||
# remove relevant settings from config so they're not also saved in
|
||||
# Language.meta
|
||||
for key in ["use_jieba", "use_pkuseg", "require_pkuseg", "pkuseg_model"]:
|
||||
for key in ["segmenter", "pkuseg_model", "pkuseg_user_dict"]:
|
||||
if key in config:
|
||||
del config[key]
|
||||
self.tokenizer = Language.Defaults().create_tokenizer(nlp)
|
||||
|
||||
def configure_segmenter(self, config):
|
||||
self.segmenter = "char"
|
||||
if "segmenter" in config:
|
||||
if config["segmenter"] in self.supported_segmenters:
|
||||
self.segmenter = config["segmenter"]
|
||||
else:
|
||||
warn_msg = Warnings.W103.format(
|
||||
lang="Chinese",
|
||||
segmenter=config["segmenter"],
|
||||
supported=", ".join([repr(s) for s in self.supported_segmenters]),
|
||||
default="'char' (character segmentation)",
|
||||
)
|
||||
warnings.warn(warn_msg)
|
||||
self.jieba_seg = try_jieba_import(self.segmenter)
|
||||
self.pkuseg_seg = try_pkuseg_import(
|
||||
self.segmenter,
|
||||
pkuseg_model=config.get("pkuseg_model", None),
|
||||
pkuseg_user_dict=config.get("pkuseg_user_dict", "default"),
|
||||
)
|
||||
|
||||
def __call__(self, text):
|
||||
use_jieba = self.use_jieba
|
||||
use_pkuseg = self.use_pkuseg
|
||||
if self.require_pkuseg:
|
||||
use_jieba = False
|
||||
use_pkuseg = True
|
||||
if use_jieba:
|
||||
if self.segmenter == "jieba":
|
||||
words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x])
|
||||
(words, spaces) = util.get_words_and_spaces(words, text)
|
||||
return Doc(self.vocab, words=words, spaces=spaces)
|
||||
elif use_pkuseg:
|
||||
elif self.segmenter == "pkuseg":
|
||||
if self.pkuseg_seg is None:
|
||||
raise ValueError(Errors.E1000)
|
||||
words = self.pkuseg_seg.cut(text)
|
||||
(words, spaces) = util.get_words_and_spaces(words, text)
|
||||
return Doc(self.vocab, words=words, spaces=spaces)
|
||||
else:
|
||||
# split into individual characters
|
||||
words = list(text)
|
||||
(words, spaces) = util.get_words_and_spaces(words, text)
|
||||
return Doc(self.vocab, words=words, spaces=spaces)
|
||||
|
||||
# warn if segmenter setting is not the only remaining option "char"
|
||||
if self.segmenter != "char":
|
||||
warn_msg = Warnings.W103.format(
|
||||
lang="Chinese",
|
||||
segmenter=self.segmenter,
|
||||
supported=", ".join([repr(s) for s in self.supported_segmenters]),
|
||||
default="'char' (character segmentation)",
|
||||
)
|
||||
warnings.warn(warn_msg)
|
||||
|
||||
# split into individual characters
|
||||
words = list(text)
|
||||
(words, spaces) = util.get_words_and_spaces(words, text)
|
||||
return Doc(self.vocab, words=words, spaces=spaces)
|
||||
|
||||
def pkuseg_update_user_dict(self, words, reset=False):
|
||||
if self.pkuseg_seg:
|
||||
if self.segmenter == "pkuseg":
|
||||
if reset:
|
||||
try:
|
||||
import pkuseg
|
||||
|
||||
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
|
||||
except ImportError:
|
||||
if self.use_pkuseg:
|
||||
if self.segmenter == "pkuseg":
|
||||
msg = (
|
||||
"pkuseg not installed: unable to reset pkuseg "
|
||||
"user dict. Please " + _PKUSEG_INSTALL_MSG
|
||||
|
@ -117,21 +136,16 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
raise ImportError(msg)
|
||||
for word in words:
|
||||
self.pkuseg_seg.preprocesser.insert(word.strip(), "")
|
||||
else:
|
||||
warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
|
||||
warnings.warn(warn_msg)
|
||||
|
||||
def _get_config(self):
|
||||
config = OrderedDict(
|
||||
(
|
||||
("use_jieba", self.use_jieba),
|
||||
("use_pkuseg", self.use_pkuseg),
|
||||
("require_pkuseg", self.require_pkuseg),
|
||||
)
|
||||
)
|
||||
config = OrderedDict((("segmenter", self.segmenter),))
|
||||
return config
|
||||
|
||||
def _set_config(self, config={}):
|
||||
self.use_jieba = config.get("use_jieba", False)
|
||||
self.use_pkuseg = config.get("use_pkuseg", False)
|
||||
self.require_pkuseg = config.get("require_pkuseg", False)
|
||||
self.configure_segmenter(config)
|
||||
|
||||
def to_bytes(self, **kwargs):
|
||||
pkuseg_features_b = b""
|
||||
|
@ -248,7 +262,7 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
try:
|
||||
import pkuseg
|
||||
except ImportError:
|
||||
if self.use_pkuseg:
|
||||
if self.segmenter == "pkuseg":
|
||||
raise ImportError(
|
||||
"pkuseg not installed. To use this model, "
|
||||
+ _PKUSEG_INSTALL_MSG
|
||||
|
@ -260,9 +274,9 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
try:
|
||||
import pkuseg
|
||||
except ImportError:
|
||||
if self.use_pkuseg:
|
||||
if self.segmenter == "pkuseg":
|
||||
raise ImportError(self._pkuseg_install_msg)
|
||||
if self.pkuseg_seg:
|
||||
if self.segmenter == "pkuseg":
|
||||
data = srsly.read_msgpack(path)
|
||||
(user_dict, do_process, common_words, other_words) = data
|
||||
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
|
||||
|
@ -288,8 +302,6 @@ class ChineseDefaults(Language.Defaults):
|
|||
stop_words = STOP_WORDS
|
||||
tag_map = TAG_MAP
|
||||
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
||||
use_jieba = True
|
||||
use_pkuseg = False
|
||||
|
||||
@classmethod
|
||||
def create_tokenizer(cls, nlp=None, config={}):
|
||||
|
|
|
@ -244,22 +244,22 @@ def yo_tokenizer():
|
|||
|
||||
@pytest.fixture(scope="session")
|
||||
def zh_tokenizer_char():
|
||||
return get_lang_class("zh").Defaults.create_tokenizer(
|
||||
config={"use_jieba": False, "use_pkuseg": False}
|
||||
)
|
||||
return get_lang_class("zh").Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def zh_tokenizer_jieba():
|
||||
pytest.importorskip("jieba")
|
||||
return get_lang_class("zh").Defaults.create_tokenizer()
|
||||
return get_lang_class("zh").Defaults.create_tokenizer(
|
||||
config={"segmenter": "jieba"}
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def zh_tokenizer_pkuseg():
|
||||
pytest.importorskip("pkuseg")
|
||||
return get_lang_class("zh").Defaults.create_tokenizer(
|
||||
config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True}
|
||||
config={"pkuseg_model": "default", "segmenter": "pkuseg"}
|
||||
)
|
||||
|
||||
|
||||
|
|
|
@ -5,14 +5,14 @@ from ...util import make_tempdir
|
|||
|
||||
def zh_tokenizer_serialize(zh_tokenizer):
|
||||
tokenizer_bytes = zh_tokenizer.to_bytes()
|
||||
nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}})
|
||||
nlp = Chinese()
|
||||
nlp.tokenizer.from_bytes(tokenizer_bytes)
|
||||
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
|
||||
|
||||
with make_tempdir() as d:
|
||||
file_path = d / "tokenizer"
|
||||
zh_tokenizer.to_disk(file_path)
|
||||
nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}})
|
||||
nlp = Chinese()
|
||||
nlp.tokenizer.from_disk(file_path)
|
||||
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
|
||||
|
||||
|
@ -25,18 +25,13 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
|
|||
zh_tokenizer_serialize(zh_tokenizer_jieba)
|
||||
|
||||
|
||||
def test_zh_tokenizer_serialize_pkuseg(zh_tokenizer_pkuseg):
|
||||
zh_tokenizer_serialize(zh_tokenizer_pkuseg)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
|
||||
nlp = Chinese(
|
||||
meta={
|
||||
"tokenizer": {
|
||||
"config": {
|
||||
"use_jieba": False,
|
||||
"use_pkuseg": True,
|
||||
"segmenter": "pkuseg",
|
||||
"pkuseg_model": "medicine",
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
import pytest
|
||||
from spacy.lang.zh import _get_pkuseg_trie_data
|
||||
from spacy.lang.zh import Chinese, _get_pkuseg_trie_data
|
||||
|
||||
|
||||
# fmt: off
|
||||
|
@ -37,7 +37,7 @@ def test_zh_tokenizer_pkuseg(zh_tokenizer_pkuseg, text, expected_tokens):
|
|||
assert tokens == expected_tokens
|
||||
|
||||
|
||||
def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg):
|
||||
def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg, zh_tokenizer_char):
|
||||
user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
|
||||
zh_tokenizer_pkuseg.pkuseg_update_user_dict(["nonsense_asdf"])
|
||||
updated_user_dict = _get_pkuseg_trie_data(
|
||||
|
@ -52,8 +52,24 @@ def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg):
|
|||
)
|
||||
assert len(reset_user_dict) == 0
|
||||
|
||||
# warn if not relevant
|
||||
with pytest.warns(UserWarning):
|
||||
zh_tokenizer_char.pkuseg_update_user_dict(["nonsense_asdf"])
|
||||
|
||||
def test_extra_spaces(zh_tokenizer_char):
|
||||
|
||||
def test_zh_extra_spaces(zh_tokenizer_char):
|
||||
# note: three spaces after "I"
|
||||
tokens = zh_tokenizer_char("I like cheese.")
|
||||
assert tokens[1].orth_ == " "
|
||||
|
||||
|
||||
def test_zh_unsupported_segmenter():
|
||||
with pytest.warns(UserWarning):
|
||||
nlp = Chinese(meta={"tokenizer": {"config": {"segmenter": "unk"}}})
|
||||
|
||||
|
||||
def test_zh_uninitialized_pkuseg():
|
||||
nlp = Chinese(meta={"tokenizer": {"config": {"segmenter": "char"}}})
|
||||
nlp.tokenizer.segmenter = "pkuseg"
|
||||
with pytest.raises(ValueError):
|
||||
doc = nlp("test")
|
||||
|
|
|
@ -92,30 +92,35 @@ The Chinese language class supports three word segmentation options:
|
|||
> ```python
|
||||
> from spacy.lang.zh import Chinese
|
||||
>
|
||||
> # Disable jieba to use character segmentation
|
||||
> Chinese.Defaults.use_jieba = False
|
||||
> # Character segmentation (default)
|
||||
> nlp = Chinese()
|
||||
>
|
||||
> # Disable jieba through tokenizer config options
|
||||
> cfg = {"use_jieba": False}
|
||||
> # Jieba
|
||||
> cfg = {"segmenter": "jieba"}
|
||||
> nlp = Chinese(meta={"tokenizer": {"config": cfg}})
|
||||
>
|
||||
> # Load with "default" model provided by pkuseg
|
||||
> cfg = {"pkuseg_model": "default", "require_pkuseg": True}
|
||||
> # PKUSeg with "default" model provided by pkuseg
|
||||
> cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"}
|
||||
> nlp = Chinese(meta={"tokenizer": {"config": cfg}})
|
||||
> ```
|
||||
|
||||
1. **Jieba:** `Chinese` uses [Jieba](https://github.com/fxsjy/jieba) for word
|
||||
segmentation by default. It's enabled when you create a new `Chinese`
|
||||
1. **Character segmentation:** Character segmentation is the default
|
||||
segmentation option. It's enabled when you create a new `Chinese`
|
||||
language class or call `spacy.blank("zh")`.
|
||||
2. **Character segmentation:** Character segmentation is supported by disabling
|
||||
`jieba` and setting `Chinese.Defaults.use_jieba = False` _before_
|
||||
initializing the language class. As of spaCy v2.3.0, the `meta` tokenizer
|
||||
config options can be used to configure `use_jieba`.
|
||||
3. **PKUSeg**: In spaCy v2.3.0, support for
|
||||
2. **Jieba:** `Chinese` uses [Jieba](https://github.com/fxsjy/jieba) for word
|
||||
segmentation with the tokenizer option `{"segmenter": "jieba"}`.
|
||||
3. **PKUSeg**: As of spaCy v2.3.0, support for
|
||||
[PKUSeg](https://github.com/lancopku/PKUSeg-python) has been added to support
|
||||
better segmentation for Chinese OntoNotes and the new
|
||||
[Chinese models](/models/zh).
|
||||
better segmentation for Chinese OntoNotes and the provided
|
||||
[Chinese models](/models/zh). Enable PKUSeg with the tokenizer option
|
||||
`{"segmenter": "pkuseg"}`.
|
||||
|
||||
<Infobox variant="warning">
|
||||
|
||||
In spaCy v3, the default Chinese word segmenter has switched from Jieba to
|
||||
character segmentation.
|
||||
|
||||
</Infobox>
|
||||
|
||||
<Infobox variant="warning">
|
||||
|
||||
|
@ -129,29 +134,29 @@ $ pip install https://github.com/honnibal/pkuseg-python/archive/master.zip
|
|||
|
||||
</Infobox>
|
||||
|
||||
<Accordion title="Details on spaCy's PKUSeg API">
|
||||
<Accordion title="Details on spaCy's Chinese API">
|
||||
|
||||
The `meta` argument of the `Chinese` language class supports the following
|
||||
tokenizer config settings:
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------ | ---- | ---------------------------------------------------------------------------------------------------- |
|
||||
| `pkuseg_model` | str | **Required:** Name of a model provided by `pkuseg` or the path to a local model directory. |
|
||||
| `pkuseg_user_dict` | str | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. |
|
||||
| `require_pkuseg` | bool | Overrides all `jieba` settings (optional but strongly recommended). |
|
||||
| Name | Type | Description |
|
||||
| ------------------ | ---- | ------------------------------------------------------------------------------------------------------- |
|
||||
| `segmenter` | str | Word segmenter: `char`, `jieba` or `pkuseg`. Defaults to `char`. |
|
||||
| `pkuseg_model` | str | **Required for `pkuseg`:** Name of a model provided by `pkuseg` or the path to a local model directory. |
|
||||
| `pkuseg_user_dict` | str | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. |
|
||||
|
||||
```python
|
||||
### Examples
|
||||
# Load "default" model
|
||||
cfg = {"pkuseg_model": "default", "require_pkuseg": True}
|
||||
cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"}
|
||||
nlp = Chinese(meta={"tokenizer": {"config": cfg}})
|
||||
|
||||
# Load local model
|
||||
cfg = {"pkuseg_model": "/path/to/pkuseg_model", "require_pkuseg": True}
|
||||
cfg = {"segmenter": "pkuseg", "pkuseg_model": "/path/to/pkuseg_model"}
|
||||
nlp = Chinese(meta={"tokenizer": {"config": cfg}})
|
||||
|
||||
# Override the user dictionary
|
||||
cfg = {"pkuseg_model": "default", "require_pkuseg": True, "pkuseg_user_dict": "/path"}
|
||||
cfg = {"segmenter": "pkuseg", "pkuseg_model": "default", "pkuseg_user_dict": "/path"}
|
||||
nlp = Chinese(meta={"tokenizer": {"config": cfg}})
|
||||
```
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user