Refactor Chinese tokenizer configuration (#5736)
* Refactor Chinese tokenizer configuration

  Refactor `ChineseTokenizer` configuration so that it uses a single
  `segmenter` setting to choose between character segmentation, jieba, and
  pkuseg:

  * replace `use_jieba`, `use_pkuseg`, `require_pkuseg` with the setting
    `segmenter` and the supported values `char`, `jieba`, `pkuseg`
  * make the default segmenter plain character segmentation (`char`), which
    requires no additional libraries

* Fix Chinese serialization test to use the `char` default

* Warn if attempting to customize another segmenter

  Add a warning if `Chinese.pkuseg_update_user_dict` is called when another
  segmenter is selected.
parent 9ee1c54f40
commit 39ebcd9ec9
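Taken together, the refactor boils down to one `segmenter` setting. A minimal usage sketch (based on the config keys and the `Chinese(meta=...)` pattern shown in this diff; the jieba and pkuseg variants assume those packages, and a pkuseg model, are installed):

```python
from spacy.lang.zh import Chinese

# Default: plain character segmentation, no extra libraries required
nlp_char = Chinese()

# Jieba word segmentation
cfg = {"segmenter": "jieba"}
nlp_jieba = Chinese(meta={"tokenizer": {"config": cfg}})

# pkuseg word segmentation with the "default" pretrained model
cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"}
nlp_pkuseg = Chinese(meta={"tokenizer": {"config": cfg}})
```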
@@ -115,6 +115,10 @@ class Warnings:
             "string \"Field1=Value1,Value2|Field2=Value3\".")
     W101 = ("Skipping `Doc` custom extension '{name}' while merging docs.")
     W102 = ("Skipping unsupported user data '{key}: {value}' while merging docs.")
+    W103 = ("Unknown {lang} word segmenter '{segmenter}'. Supported "
+            "word segmenters: {supported}. Defaulting to {default}.")
+    W104 = ("Skipping modifications for '{target}' segmenter. The current "
+            "segmenter is '{current}'.")


 @add_codes
@@ -535,6 +539,10 @@ class Errors:
             "'{token_attrs}'.")
     E999 = ("Unable to merge the `Doc` objects because they do not all share "
             "the same `Vocab`.")
+    E1000 = ("No pkuseg model available. Provide a pkuseg model when "
+             "initializing the pipeline: "
+             '`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
+             'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`')


 @add_codes
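A short sketch of when the new messages fire, mirroring the tests added further down in this diff (not part of the commit itself):

```python
import pytest
from spacy.lang.zh import Chinese

# W103: an unknown segmenter name warns and falls back to "char"
with pytest.warns(UserWarning):
    nlp = Chinese(meta={"tokenizer": {"config": {"segmenter": "unk"}}})

# E1000: selecting "pkuseg" without a loaded model raises at call time
nlp = Chinese(meta={"tokenizer": {"config": {"segmenter": "char"}}})
nlp.tokenizer.segmenter = "pkuseg"
with pytest.raises(ValueError):
    doc = nlp("test")
```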
@@ -1,8 +1,10 @@
 import tempfile
 import srsly
+import warnings
 from pathlib import Path
 from collections import OrderedDict
 from ...attrs import LANG
+from ...errors import Warnings, Errors
 from ...language import Language
 from ...tokens import Doc
 from ...util import DummyTokenizer
@@ -16,100 +18,117 @@ from ... import util
 _PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.22` or from https://github.com/lancopku/pkuseg-python"


-def try_jieba_import(use_jieba):
+def try_jieba_import(segmenter):
     try:
         import jieba

-        # segment a short text to have jieba initialize its cache in advance
-        list(jieba.cut("作为", cut_all=False))
+        if segmenter == "jieba":
+            # segment a short text to have jieba initialize its cache in advance
+            list(jieba.cut("作为", cut_all=False))

         return jieba
     except ImportError:
-        if use_jieba:
+        if segmenter == "jieba":
             msg = (
-                "Jieba not installed. Either set the default to False with "
-                "`from spacy.lang.zh import ChineseDefaults; ChineseDefaults.use_jieba = False`, "
-                "or install it with `pip install jieba` or from "
-                "https://github.com/fxsjy/jieba"
+                "Jieba not installed. To use jieba, install it with `pip "
+                " install jieba` or from https://github.com/fxsjy/jieba"
             )
             raise ImportError(msg)


-def try_pkuseg_import(use_pkuseg, pkuseg_model, pkuseg_user_dict):
+def try_pkuseg_import(segmenter, pkuseg_model, pkuseg_user_dict):
     try:
         import pkuseg

         if pkuseg_model:
             return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
-        elif use_pkuseg:
+        elif segmenter == "pkuseg":
             msg = (
-                "Chinese.use_pkuseg is True but no pkuseg model was specified. "
-                "Please provide the name of a pretrained model "
+                "The Chinese word segmenter is 'pkuseg' but no pkuseg model "
+                "was specified. Please provide the name of a pretrained model "
                 "or the path to a model with "
-                '`Chinese(meta={"tokenizer": {"config": {"pkuseg_model": name_or_path}}}).'
+                '`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
+                'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`'
             )
             raise ValueError(msg)
     except ImportError:
-        if use_pkuseg:
-            msg = (
-                "pkuseg not installed. Either set Chinese.use_pkuseg = False, "
-                "or " + _PKUSEG_INSTALL_MSG
-            )
+        if segmenter == "pkuseg":
+            msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
             raise ImportError(msg)
     except FileNotFoundError:
-        if use_pkuseg:
+        if segmenter == "pkuseg":
             msg = "Unable to load pkuseg model from: " + pkuseg_model
             raise FileNotFoundError(msg)


 class ChineseTokenizer(DummyTokenizer):
     def __init__(self, cls, nlp=None, config={}):
-        self.use_jieba = config.get("use_jieba", cls.use_jieba)
-        self.use_pkuseg = config.get("use_pkuseg", cls.use_pkuseg)
-        self.require_pkuseg = config.get("require_pkuseg", False)
+        self.supported_segmenters = ("char", "jieba", "pkuseg")
+        self.configure_segmenter(config)
         self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
-        self.jieba_seg = try_jieba_import(self.use_jieba)
-        self.pkuseg_seg = try_pkuseg_import(
-            self.use_pkuseg,
-            pkuseg_model=config.get("pkuseg_model", None),
-            pkuseg_user_dict=config.get("pkuseg_user_dict", "default"),
-        )
         # remove relevant settings from config so they're not also saved in
         # Language.meta
-        for key in ["use_jieba", "use_pkuseg", "require_pkuseg", "pkuseg_model"]:
+        for key in ["segmenter", "pkuseg_model", "pkuseg_user_dict"]:
             if key in config:
                 del config[key]
         self.tokenizer = Language.Defaults().create_tokenizer(nlp)

+    def configure_segmenter(self, config):
+        self.segmenter = "char"
+        if "segmenter" in config:
+            if config["segmenter"] in self.supported_segmenters:
+                self.segmenter = config["segmenter"]
+            else:
+                warn_msg = Warnings.W103.format(
+                    lang="Chinese",
+                    segmenter=config["segmenter"],
+                    supported=", ".join([repr(s) for s in self.supported_segmenters]),
+                    default="'char' (character segmentation)",
+                )
+                warnings.warn(warn_msg)
+        self.jieba_seg = try_jieba_import(self.segmenter)
+        self.pkuseg_seg = try_pkuseg_import(
+            self.segmenter,
+            pkuseg_model=config.get("pkuseg_model", None),
+            pkuseg_user_dict=config.get("pkuseg_user_dict", "default"),
+        )
+
     def __call__(self, text):
-        use_jieba = self.use_jieba
-        use_pkuseg = self.use_pkuseg
-        if self.require_pkuseg:
-            use_jieba = False
-            use_pkuseg = True
-        if use_jieba:
+        if self.segmenter == "jieba":
             words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x])
             (words, spaces) = util.get_words_and_spaces(words, text)
             return Doc(self.vocab, words=words, spaces=spaces)
-        elif use_pkuseg:
+        elif self.segmenter == "pkuseg":
+            if self.pkuseg_seg is None:
+                raise ValueError(Errors.E1000)
             words = self.pkuseg_seg.cut(text)
             (words, spaces) = util.get_words_and_spaces(words, text)
             return Doc(self.vocab, words=words, spaces=spaces)
-        else:
-            # split into individual characters
-            words = list(text)
-            (words, spaces) = util.get_words_and_spaces(words, text)
-            return Doc(self.vocab, words=words, spaces=spaces)
+
+        # warn if segmenter setting is not the only remaining option "char"
+        if self.segmenter != "char":
+            warn_msg = Warnings.W103.format(
+                lang="Chinese",
+                segmenter=self.segmenter,
+                supported=", ".join([repr(s) for s in self.supported_segmenters]),
+                default="'char' (character segmentation)",
+            )
+            warnings.warn(warn_msg)
+
+        # split into individual characters
+        words = list(text)
+        (words, spaces) = util.get_words_and_spaces(words, text)
+        return Doc(self.vocab, words=words, spaces=spaces)

     def pkuseg_update_user_dict(self, words, reset=False):
-        if self.pkuseg_seg:
+        if self.segmenter == "pkuseg":
             if reset:
                 try:
                     import pkuseg

                     self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
                 except ImportError:
-                    if self.use_pkuseg:
+                    if self.segmenter == "pkuseg":
                         msg = (
                             "pkuseg not installed: unable to reset pkuseg "
                             "user dict. Please " + _PKUSEG_INSTALL_MSG
@@ -117,21 +136,16 @@ class ChineseTokenizer(DummyTokenizer):
                         raise ImportError(msg)
             for word in words:
                 self.pkuseg_seg.preprocesser.insert(word.strip(), "")
+        else:
+            warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
+            warnings.warn(warn_msg)

     def _get_config(self):
-        config = OrderedDict(
-            (
-                ("use_jieba", self.use_jieba),
-                ("use_pkuseg", self.use_pkuseg),
-                ("require_pkuseg", self.require_pkuseg),
-            )
-        )
+        config = OrderedDict((("segmenter", self.segmenter),))
         return config

     def _set_config(self, config={}):
-        self.use_jieba = config.get("use_jieba", False)
-        self.use_pkuseg = config.get("use_pkuseg", False)
-        self.require_pkuseg = config.get("require_pkuseg", False)
+        self.configure_segmenter(config)

     def to_bytes(self, **kwargs):
         pkuseg_features_b = b""
@@ -248,7 +262,7 @@ class ChineseTokenizer(DummyTokenizer):
             try:
                 import pkuseg
             except ImportError:
-                if self.use_pkuseg:
+                if self.segmenter == "pkuseg":
                     raise ImportError(
                         "pkuseg not installed. To use this model, "
                         + _PKUSEG_INSTALL_MSG
@@ -260,9 +274,9 @@ class ChineseTokenizer(DummyTokenizer):
             try:
                 import pkuseg
             except ImportError:
-                if self.use_pkuseg:
+                if self.segmenter == "pkuseg":
                     raise ImportError(self._pkuseg_install_msg)
-            if self.pkuseg_seg:
+            if self.segmenter == "pkuseg":
                 data = srsly.read_msgpack(path)
                 (user_dict, do_process, common_words, other_words) = data
                 self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
@@ -288,8 +302,6 @@ class ChineseDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
-    use_jieba = True
-    use_pkuseg = False

     @classmethod
     def create_tokenizer(cls, nlp=None, config={}):
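With the implementation above, `pkuseg_update_user_dict` only touches the dictionary when the pkuseg segmenter is active and emits W104 otherwise. A hedged sketch mirroring the updated tests (the first part assumes `pkuseg` and its "default" model are installed):

```python
import pytest
from spacy.lang.zh import Chinese

# Active pkuseg segmenter: the user dictionary can be extended and reset
cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"}
nlp = Chinese(meta={"tokenizer": {"config": cfg}})
nlp.tokenizer.pkuseg_update_user_dict(["nonsense_asdf"])  # add a word
nlp.tokenizer.pkuseg_update_user_dict([], reset=True)     # reset the user dict

# Any other segmenter: the call is skipped and warning W104 is emitted
nlp_char = Chinese()
with pytest.warns(UserWarning):
    nlp_char.tokenizer.pkuseg_update_user_dict(["nonsense_asdf"])
```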
@@ -244,22 +244,22 @@ def yo_tokenizer():

 @pytest.fixture(scope="session")
 def zh_tokenizer_char():
-    return get_lang_class("zh").Defaults.create_tokenizer(
-        config={"use_jieba": False, "use_pkuseg": False}
-    )
+    return get_lang_class("zh").Defaults.create_tokenizer()


 @pytest.fixture(scope="session")
 def zh_tokenizer_jieba():
     pytest.importorskip("jieba")
-    return get_lang_class("zh").Defaults.create_tokenizer()
+    return get_lang_class("zh").Defaults.create_tokenizer(
+        config={"segmenter": "jieba"}
+    )


 @pytest.fixture(scope="session")
 def zh_tokenizer_pkuseg():
     pytest.importorskip("pkuseg")
     return get_lang_class("zh").Defaults.create_tokenizer(
-        config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True}
+        config={"pkuseg_model": "default", "segmenter": "pkuseg"}
     )

@@ -5,14 +5,14 @@ from ...util import make_tempdir

 def zh_tokenizer_serialize(zh_tokenizer):
     tokenizer_bytes = zh_tokenizer.to_bytes()
-    nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}})
+    nlp = Chinese()
     nlp.tokenizer.from_bytes(tokenizer_bytes)
     assert tokenizer_bytes == nlp.tokenizer.to_bytes()

     with make_tempdir() as d:
         file_path = d / "tokenizer"
         zh_tokenizer.to_disk(file_path)
-        nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}})
+        nlp = Chinese()
         nlp.tokenizer.from_disk(file_path)
         assert tokenizer_bytes == nlp.tokenizer.to_bytes()

@@ -25,18 +25,13 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
     zh_tokenizer_serialize(zh_tokenizer_jieba)


-def test_zh_tokenizer_serialize_pkuseg(zh_tokenizer_pkuseg):
-    zh_tokenizer_serialize(zh_tokenizer_pkuseg)
-
-
 @pytest.mark.slow
 def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
     nlp = Chinese(
         meta={
             "tokenizer": {
                 "config": {
-                    "use_jieba": False,
-                    "use_pkuseg": True,
+                    "segmenter": "pkuseg",
                     "pkuseg_model": "medicine",
                 }
             }
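For reference, a sketch of the round trip the simplified helper now exercises: the `segmenter` value travels with the serialized config, so the fresh `Chinese()` pipeline no longer needs a `use_jieba` override (assumes jieba is installed for the jieba-configured tokenizer):

```python
from spacy.lang.zh import Chinese

# Serialize a jieba-configured tokenizer and restore it into a default pipeline
nlp = Chinese(meta={"tokenizer": {"config": {"segmenter": "jieba"}}})
tokenizer_bytes = nlp.tokenizer.to_bytes()

nlp2 = Chinese()  # defaults to the "char" segmenter
nlp2.tokenizer.from_bytes(tokenizer_bytes)
assert tokenizer_bytes == nlp2.tokenizer.to_bytes()
```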
@@ -1,5 +1,5 @@
 import pytest
-from spacy.lang.zh import _get_pkuseg_trie_data
+from spacy.lang.zh import Chinese, _get_pkuseg_trie_data


 # fmt: off
@@ -37,7 +37,7 @@ def test_zh_tokenizer_pkuseg(zh_tokenizer_pkuseg, text, expected_tokens):
     assert tokens == expected_tokens


-def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg):
+def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg, zh_tokenizer_char):
     user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
     zh_tokenizer_pkuseg.pkuseg_update_user_dict(["nonsense_asdf"])
     updated_user_dict = _get_pkuseg_trie_data(
@@ -52,8 +52,24 @@ def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg):
     )
     assert len(reset_user_dict) == 0

+    # warn if not relevant
+    with pytest.warns(UserWarning):
+        zh_tokenizer_char.pkuseg_update_user_dict(["nonsense_asdf"])
+

-def test_extra_spaces(zh_tokenizer_char):
+def test_zh_extra_spaces(zh_tokenizer_char):
     # note: three spaces after "I"
     tokens = zh_tokenizer_char("I   like cheese.")
     assert tokens[1].orth_ == "   "
+
+
+def test_zh_unsupported_segmenter():
+    with pytest.warns(UserWarning):
+        nlp = Chinese(meta={"tokenizer": {"config": {"segmenter": "unk"}}})
+
+
+def test_zh_uninitialized_pkuseg():
+    nlp = Chinese(meta={"tokenizer": {"config": {"segmenter": "char"}}})
+    nlp.tokenizer.segmenter = "pkuseg"
+    with pytest.raises(ValueError):
+        doc = nlp("test")
@@ -92,30 +92,35 @@ The Chinese language class supports three word segmentation options:
 > ```python
 > from spacy.lang.zh import Chinese
 >
-> # Disable jieba to use character segmentation
-> Chinese.Defaults.use_jieba = False
+> # Character segmentation (default)
 > nlp = Chinese()
 >
-> # Disable jieba through tokenizer config options
-> cfg = {"use_jieba": False}
+> # Jieba
+> cfg = {"segmenter": "jieba"}
 > nlp = Chinese(meta={"tokenizer": {"config": cfg}})
 >
-> # Load with "default" model provided by pkuseg
-> cfg = {"pkuseg_model": "default", "require_pkuseg": True}
+> # PKUSeg with "default" model provided by pkuseg
+> cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"}
 > nlp = Chinese(meta={"tokenizer": {"config": cfg}})
 > ```

-1. **Jieba:** `Chinese` uses [Jieba](https://github.com/fxsjy/jieba) for word
-   segmentation by default. It's enabled when you create a new `Chinese`
+1. **Character segmentation:** Character segmentation is the default
+   segmentation option. It's enabled when you create a new `Chinese`
    language class or call `spacy.blank("zh")`.
-2. **Character segmentation:** Character segmentation is supported by disabling
-   `jieba` and setting `Chinese.Defaults.use_jieba = False` _before_
-   initializing the language class. As of spaCy v2.3.0, the `meta` tokenizer
-   config options can be used to configure `use_jieba`.
-3. **PKUSeg**: In spaCy v2.3.0, support for
+2. **Jieba:** `Chinese` uses [Jieba](https://github.com/fxsjy/jieba) for word
+   segmentation with the tokenizer option `{"segmenter": "jieba"}`.
+3. **PKUSeg**: As of spaCy v2.3.0, support for
    [PKUSeg](https://github.com/lancopku/PKUSeg-python) has been added to support
-   better segmentation for Chinese OntoNotes and the new
-   [Chinese models](/models/zh).
+   better segmentation for Chinese OntoNotes and the provided
+   [Chinese models](/models/zh). Enable PKUSeg with the tokenizer option
+   `{"segmenter": "pkuseg"}`.
+
+<Infobox variant="warning">
+
+In spaCy v3, the default Chinese word segmenter has switched from Jieba to
+character segmentation.
+
+</Infobox>

 <Infobox variant="warning">

@@ -129,29 +134,29 @@ $ pip install https://github.com/honnibal/pkuseg-python/archive/master.zip

 </Infobox>

-<Accordion title="Details on spaCy's PKUSeg API">
+<Accordion title="Details on spaCy's Chinese API">

 The `meta` argument of the `Chinese` language class supports the following
 following tokenizer config settings:

 | Name               | Type | Description                                                                                              |
-| ------------------ | ---- | -------------------------------------------------------------------------------------------------------- |
-| `pkuseg_model`     | str  | **Required:** Name of a model provided by `pkuseg` or the path to a local model directory.                |
-| `pkuseg_user_dict` | str  | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary.      |
-| `require_pkuseg`   | bool | Overrides all `jieba` settings (optional but strongly recommended).                                        |
+| ------------------ | ---- | --------------------------------------------------------------------------------------------------------- |
+| `segmenter`        | str  | Word segmenter: `char`, `jieba` or `pkuseg`. Defaults to `char`.                                            |
+| `pkuseg_model`     | str  | **Required for `pkuseg`:** Name of a model provided by `pkuseg` or the path to a local model directory.    |
+| `pkuseg_user_dict` | str  | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary.       |

 ```python
 ### Examples
 # Load "default" model
-cfg = {"pkuseg_model": "default", "require_pkuseg": True}
+cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"}
 nlp = Chinese(meta={"tokenizer": {"config": cfg}})

 # Load local model
-cfg = {"pkuseg_model": "/path/to/pkuseg_model", "require_pkuseg": True}
+cfg = {"segmenter": "pkuseg", "pkuseg_model": "/path/to/pkuseg_model"}
 nlp = Chinese(meta={"tokenizer": {"config": cfg}})

 # Override the user directory
-cfg = {"pkuseg_model": "default", "require_pkuseg": True, "pkuseg_user_dict": "/path"}
+cfg = {"segmenter": "pkuseg", "pkuseg_model": "default", "pkuseg_user_dict": "/path"}
 nlp = Chinese(meta={"tokenizer": {"config": cfg}})
 ```