mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Add pkuseg and serialization support for Chinese (#5308)
* Add pkuseg and serialization support for Chinese

  Add support for pkuseg alongside jieba

* Specify model through `Language` meta:

  * split on characters (if no word segmentation packages are installed)

    ```
    Chinese(meta={"tokenizer": {"config": {"use_jieba": False, "use_pkuseg": False}}})
    ```

  * jieba (remains the default tokenizer if installed)

    ```
    Chinese()
    Chinese(meta={"tokenizer": {"config": {"use_jieba": True}}}) # explicit
    ```

  * pkuseg

    ```
    Chinese(meta={"tokenizer": {"config": {"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True}}})
    ```

* The new tokenizer setting `require_pkuseg` is used to override the `use_jieba` default, which is intended for models that provide a pkuseg model:

  ```
  nlp_pkuseg = Chinese(meta={"tokenizer": {"config": {"pkuseg_model": "default", "require_pkuseg": True}}})
  nlp = Chinese() # has `use_jieba` as `True` by default
  nlp.from_bytes(nlp_pkuseg.to_bytes()) # `require_pkuseg` overrides `use_jieba` when calling the tokenizer
  ```

  Add support for serialization of tokenizer settings and pkuseg model, if loaded

* Add sorting for `Language.to_bytes()` serialization of `Language.meta` so that the (emptied, but still present) tokenizer metadata is in a consistent position in the serialized data

  Extend tests to cover all three tokenizer configurations and serialization

* Fix from_disk and tests without jieba or pkuseg

* Load cfg first and only show error if `use_pkuseg`

* Fix blank/default initialization in serialization tests

* Explicitly initialize jieba's cache on init

* Add serialization for pkuseg pre/postprocessors

* Reformat pkuseg install message
This commit is contained in:
parent
fb73d4943a
commit
f7471abd82
|
@ -1,6 +1,10 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import tempfile
|
||||||
|
import srsly
|
||||||
|
from pathlib import Path
|
||||||
|
from collections import OrderedDict
|
||||||
from ...attrs import LANG
|
from ...attrs import LANG
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...tokens import Doc
|
from ...tokens import Doc
|
||||||
|
@ -9,12 +13,19 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .tag_map import TAG_MAP
|
from .tag_map import TAG_MAP
|
||||||
|
from ... import util
|
||||||
|
|
||||||
|
|
||||||
|
_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.22` or from https://github.com/lancopku/pkuseg-python"
|
||||||
|
|
||||||
|
|
||||||
def try_jieba_import(use_jieba):
|
def try_jieba_import(use_jieba):
|
||||||
try:
|
try:
|
||||||
import jieba
|
import jieba
|
||||||
|
|
||||||
|
# segment a short text to have jieba initialize its cache in advance
|
||||||
|
list(jieba.cut("作为", cut_all=False))
|
||||||
|
|
||||||
return jieba
|
return jieba
|
||||||
except ImportError:
|
except ImportError:
|
||||||
if use_jieba:
|
if use_jieba:
|
||||||
|
@ -25,59 +36,241 @@ def try_jieba_import(use_jieba):
|
||||||
raise ImportError(msg)
|
raise ImportError(msg)
|
||||||
|
|
||||||
|
|
||||||
|
def try_pkuseg_import(use_pkuseg, pkuseg_model, pkuseg_user_dict):
    """Try to import pkuseg and construct a segmenter for the given model.

    Problems are only reported as exceptions when pkuseg was explicitly
    requested (``use_pkuseg`` is truthy); otherwise they are swallowed and
    None is returned so that other segmentation strategies can be used.

    use_pkuseg: Whether pkuseg was explicitly requested.
    pkuseg_model: Model name or path handed to ``pkuseg.pkuseg`` unchanged.
        A falsy value means that no model was specified.
    pkuseg_user_dict: User dictionary setting handed to ``pkuseg.pkuseg``.
    RETURNS: The ``pkuseg.pkuseg`` instance, or None if nothing was loaded.
    RAISES: ValueError if pkuseg was requested without a model; ImportError
        if pkuseg was requested but cannot be imported; FileNotFoundError if
        pkuseg was requested and loading the model raised FileNotFoundError.
    """
    try:
        import pkuseg

        if pkuseg_model:
            return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
        elif use_pkuseg:
            msg = (
                "Chinese.use_pkuseg is True but no pkuseg model was specified. "
                "Please provide the name of a pretrained model "
                "or the path to a model with "
                '`Chinese(meta={"tokenizer": {"config": {"pkuseg_model": name_or_path}}}).'
            )
            raise ValueError(msg)
    except ImportError:
        if use_pkuseg:
            msg = (
                "pkuseg not installed. Either set Chinese.use_pkuseg = False, "
                "or " + _PKUSEG_INSTALL_MSG
            )
            raise ImportError(msg)
    except FileNotFoundError:
        if use_pkuseg:
            # str() keeps the message buildable when the model was given as a
            # path object; plain concatenation would raise TypeError and hide
            # the real problem.
            msg = "Unable to load pkuseg model from: " + str(pkuseg_model)
            raise FileNotFoundError(msg)
    # Nothing was loaded and pkuseg was not explicitly required.
    return None
|
||||||
|
|
||||||
|
|
||||||
class ChineseTokenizer(DummyTokenizer):
|
class ChineseTokenizer(DummyTokenizer):
|
||||||
def __init__(self, cls, nlp=None):
|
def __init__(self, cls, nlp=None, config={}):
|
||||||
|
self.use_jieba = config.get("use_jieba", cls.use_jieba)
|
||||||
|
self.use_pkuseg = config.get("use_pkuseg", cls.use_pkuseg)
|
||||||
|
self.require_pkuseg = config.get("require_pkuseg", False)
|
||||||
self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
|
self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
|
||||||
self.use_jieba = cls.use_jieba
|
|
||||||
self.jieba_seg = try_jieba_import(self.use_jieba)
|
self.jieba_seg = try_jieba_import(self.use_jieba)
|
||||||
|
self.pkuseg_seg = try_pkuseg_import(
|
||||||
|
self.use_pkuseg,
|
||||||
|
pkuseg_model=config.get("pkuseg_model", None),
|
||||||
|
pkuseg_user_dict=config.get("pkuseg_user_dict", "default"),
|
||||||
|
)
|
||||||
|
# remove relevant settings from config so they're not also saved in
|
||||||
|
# Language.meta
|
||||||
|
for key in ["use_jieba", "use_pkuseg", "require_pkuseg", "pkuseg_model"]:
|
||||||
|
if key in config:
|
||||||
|
del config[key]
|
||||||
self.tokenizer = Language.Defaults().create_tokenizer(nlp)
|
self.tokenizer = Language.Defaults().create_tokenizer(nlp)
|
||||||
|
|
||||||
def __call__(self, text):
|
def __call__(self, text):
|
||||||
# use jieba
|
use_jieba = self.use_jieba
|
||||||
if self.use_jieba:
|
use_pkuseg = self.use_pkuseg
|
||||||
jieba_words = list(
|
if self.require_pkuseg:
|
||||||
[x for x in self.jieba_seg.cut(text, cut_all=False) if x]
|
use_jieba = False
|
||||||
)
|
use_pkuseg = True
|
||||||
words = [jieba_words[0]]
|
if use_jieba:
|
||||||
spaces = [False]
|
words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x])
|
||||||
for i in range(1, len(jieba_words)):
|
(words, spaces) = util.get_words_and_spaces(words, text)
|
||||||
word = jieba_words[i]
|
return Doc(self.vocab, words=words, spaces=spaces)
|
||||||
if word.isspace():
|
elif use_pkuseg:
|
||||||
# second token in adjacent whitespace following a
|
words = self.pkuseg_seg.cut(text)
|
||||||
# non-space token
|
(words, spaces) = util.get_words_and_spaces(words, text)
|
||||||
if spaces[-1]:
|
return Doc(self.vocab, words=words, spaces=spaces)
|
||||||
words.append(word)
|
else:
|
||||||
spaces.append(False)
|
# split into individual characters
|
||||||
# first space token following non-space token
|
words = list(text)
|
||||||
elif word == " " and not words[-1].isspace():
|
(words, spaces) = util.get_words_and_spaces(words, text)
|
||||||
spaces[-1] = True
|
|
||||||
# token is non-space whitespace or any whitespace following
|
|
||||||
# a whitespace token
|
|
||||||
else:
|
|
||||||
# extend previous whitespace token with more whitespace
|
|
||||||
if words[-1].isspace():
|
|
||||||
words[-1] += word
|
|
||||||
# otherwise it's a new whitespace token
|
|
||||||
else:
|
|
||||||
words.append(word)
|
|
||||||
spaces.append(False)
|
|
||||||
else:
|
|
||||||
words.append(word)
|
|
||||||
spaces.append(False)
|
|
||||||
return Doc(self.vocab, words=words, spaces=spaces)
|
return Doc(self.vocab, words=words, spaces=spaces)
|
||||||
|
|
||||||
# split into individual characters
|
def _get_config(self):
|
||||||
words = []
|
config = OrderedDict(
|
||||||
spaces = []
|
(
|
||||||
for token in self.tokenizer(text):
|
("use_jieba", self.use_jieba),
|
||||||
if token.text.isspace():
|
("use_pkuseg", self.use_pkuseg),
|
||||||
words.append(token.text)
|
("require_pkuseg", self.require_pkuseg),
|
||||||
spaces.append(False)
|
)
|
||||||
else:
|
)
|
||||||
words.extend(list(token.text))
|
return config
|
||||||
spaces.extend([False] * len(token.text))
|
|
||||||
spaces[-1] = bool(token.whitespace_)
|
def _set_config(self, config={}):
|
||||||
return Doc(self.vocab, words=words, spaces=spaces)
|
self.use_jieba = config.get("use_jieba", False)
|
||||||
|
self.use_pkuseg = config.get("use_pkuseg", False)
|
||||||
|
self.require_pkuseg = config.get("require_pkuseg", False)
|
||||||
|
|
||||||
|
def to_bytes(self, **kwargs):
    """Serialize the tokenizer settings and, if loaded, the pkuseg model.

    The pkuseg feature extractor and model are saved into a temporary
    directory and the resulting "features.pkl" and "weights.npz" files are
    read back as bytes. The pre/postprocessor state is packed separately
    with msgpack. ``**kwargs`` is accepted but not used.

    RETURNS: The result of ``util.to_bytes`` over the "cfg",
        "pkuseg_features", "pkuseg_weights" and "pkuseg_processors" entries.
    """
    features_bytes = b""
    weights_bytes = b""
    processors = None
    segmenter = self.pkuseg_seg
    if segmenter:
        with tempfile.TemporaryDirectory() as tmp_name:
            segmenter.feature_extractor.save(tmp_name)
            segmenter.model.save(tmp_name)
            tmp_path = Path(tmp_name)
            with open(tmp_path / "features.pkl", "rb") as features_file:
                features_bytes = features_file.read()
            with open(tmp_path / "weights.npz", "rb") as weights_file:
                weights_bytes = weights_file.read()
        postprocesser = segmenter.postprocesser
        # Word sets are sorted so the packed data has a stable order.
        processors = (
            _get_pkuseg_trie_data(segmenter.preprocesser.trie),
            postprocesser.do_process,
            sorted(postprocesser.common_words),
            sorted(postprocesser.other_words),
        )

    def dump_cfg():
        return srsly.json_dumps(self._get_config())

    def dump_features():
        return features_bytes

    def dump_weights():
        return weights_bytes

    def dump_processors():
        return srsly.msgpack_dumps(processors)

    serializers = OrderedDict()
    serializers["cfg"] = dump_cfg
    serializers["pkuseg_features"] = dump_features
    serializers["pkuseg_weights"] = dump_weights
    serializers["pkuseg_processors"] = dump_processors
    return util.to_bytes(serializers, [])
|
||||||
|
|
||||||
|
def from_bytes(self, data, **kwargs):
    """Load the tokenizer settings and, if present, a pkuseg model from bytes.

    data: Bytes in the layout written by ``to_bytes``.
    **kwargs: Accepted for interface compatibility; not used here.
    RETURNS: The tokenizer itself.
    RAISES: ImportError if ``data`` contains a pkuseg model but pkuseg
        cannot be imported.
    """
    # The nested deserializers write into a mutable dict rather than using
    # `nonlocal`: this module starts with `from __future__ import
    # unicode_literals`, and `nonlocal` is a syntax error on Python 2 that
    # would prevent the whole module from being imported there.
    pkuseg_data = {"features_b": b"", "weights_b": b"", "processors_data": None}

    def deserialize_pkuseg_features(b):
        pkuseg_data["features_b"] = b

    def deserialize_pkuseg_weights(b):
        pkuseg_data["weights_b"] = b

    def deserialize_pkuseg_processors(b):
        pkuseg_data["processors_data"] = srsly.msgpack_loads(b)

    deserializers = OrderedDict(
        (
            ("cfg", lambda b: self._set_config(srsly.json_loads(b))),
            ("pkuseg_features", deserialize_pkuseg_features),
            ("pkuseg_weights", deserialize_pkuseg_weights),
            ("pkuseg_processors", deserialize_pkuseg_processors),
        )
    )
    util.from_bytes(data, deserializers, [])

    if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
        # Check for pkuseg before doing any temporary file I/O.
        try:
            import pkuseg
        except ImportError:
            raise ImportError(
                "pkuseg not installed. To use this model, "
                + _PKUSEG_INSTALL_MSG
            )
        # pkuseg is given a directory to load from, so the two blobs are
        # written back out under the file names that to_bytes read them from.
        with tempfile.TemporaryDirectory() as tempdir:
            tempdir = Path(tempdir)
            with open(tempdir / "features.pkl", "wb") as fileh:
                fileh.write(pkuseg_data["features_b"])
            with open(tempdir / "weights.npz", "wb") as fileh:
                fileh.write(pkuseg_data["weights_b"])
            self.pkuseg_seg = pkuseg.pkuseg(str(tempdir))
        if pkuseg_data["processors_data"]:
            (
                user_dict,
                do_process,
                common_words,
                other_words,
            ) = pkuseg_data["processors_data"]
            self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
            self.pkuseg_seg.postprocesser.do_process = do_process
            # to_bytes stores the word sets as sorted lists; restore sets.
            self.pkuseg_seg.postprocesser.common_words = set(common_words)
            self.pkuseg_seg.postprocesser.other_words = set(other_words)

    return self
|
||||||
|
|
||||||
|
def to_disk(self, path, **kwargs):
    """Write the tokenizer settings and, if loaded, the pkuseg model to disk.

    path: Target location; passed through ``util.ensure_path``.
    **kwargs: Accepted but not used here.
    RETURNS: The result of ``util.to_disk`` over the "cfg", "pkuseg_model"
        and "pkuseg_processors" entries.
    """
    path = util.ensure_path(path)

    def write_cfg(target):
        return srsly.write_json(target, self._get_config())

    def write_model(target):
        # Nothing is written when no pkuseg segmenter is loaded.
        if not self.pkuseg_seg:
            return
        if not target.exists():
            target.mkdir(parents=True)
        self.pkuseg_seg.model.save(target)
        self.pkuseg_seg.feature_extractor.save(target)

    def write_processors(target):
        if not self.pkuseg_seg:
            return
        postprocesser = self.pkuseg_seg.postprocesser
        # Word sets are sorted so the packed data has a stable order.
        payload = (
            _get_pkuseg_trie_data(self.pkuseg_seg.preprocesser.trie),
            postprocesser.do_process,
            sorted(postprocesser.common_words),
            sorted(postprocesser.other_words),
        )
        srsly.write_msgpack(target, payload)

    serializers = OrderedDict()
    serializers["cfg"] = write_cfg
    serializers["pkuseg_model"] = write_model
    serializers["pkuseg_processors"] = write_processors
    return util.to_disk(path, serializers, [])
|
||||||
|
|
||||||
|
def from_disk(self, path, **kwargs):
    """Restore the tokenizer settings and any saved pkuseg model from disk.

    path: Location to load from; passed through ``util.ensure_path``.
    **kwargs: Accepted for interface compatibility; not used here.
    RETURNS: The tokenizer itself, matching ``from_bytes``.
    RAISES: ImportError if pkuseg cannot be imported although the loaded
        settings have ``use_pkuseg`` enabled or a saved pkuseg model exists
        at the expected location.
    """
    path = util.ensure_path(path)
    install_msg = "pkuseg not installed. To use this model, " + _PKUSEG_INSTALL_MSG

    def load_pkuseg_model(path):
        try:
            import pkuseg
        except ImportError:
            # A saved model cannot be restored without pkuseg. Raise a
            # helpful ImportError (as from_bytes does) instead of falling
            # through to an unbound `pkuseg` name below.
            if self.use_pkuseg or path.exists():
                raise ImportError(install_msg)
            return
        if path.exists():
            self.pkuseg_seg = pkuseg.pkuseg(path)

    def load_pkuseg_processors(path):
        try:
            import pkuseg
        except ImportError:
            # Previously referenced a non-existent `self._pkuseg_install_msg`
            # attribute, which raised AttributeError instead.
            if self.use_pkuseg:
                raise ImportError(install_msg)
            return
        if self.pkuseg_seg:
            data = srsly.read_msgpack(path)
            (user_dict, do_process, common_words, other_words) = data
            self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
            self.pkuseg_seg.postprocesser.do_process = do_process
            # to_disk stores the word sets as sorted lists; restore sets.
            self.pkuseg_seg.postprocesser.common_words = set(common_words)
            self.pkuseg_seg.postprocesser.other_words = set(other_words)

    # "cfg" is listed first so that `self.use_pkuseg` is already set from the
    # saved settings when the pkuseg loaders decide whether to raise
    # (assumes util.from_disk applies the readers in this order).
    serializers = OrderedDict(
        (
            ("cfg", lambda p: self._set_config(srsly.read_json(p))),
            ("pkuseg_model", lambda p: load_pkuseg_model(p)),
            ("pkuseg_processors", lambda p: load_pkuseg_processors(p)),
        )
    )
    util.from_disk(path, serializers, [])
    return self
|
||||||
|
|
||||||
|
|
||||||
class ChineseDefaults(Language.Defaults):
|
class ChineseDefaults(Language.Defaults):
|
||||||
|
@ -89,10 +282,11 @@ class ChineseDefaults(Language.Defaults):
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
||||||
use_jieba = True
|
use_jieba = True
|
||||||
|
use_pkuseg = False
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def create_tokenizer(cls, nlp=None):
|
def create_tokenizer(cls, nlp=None, config={}):
|
||||||
return ChineseTokenizer(cls, nlp)
|
return ChineseTokenizer(cls, nlp, config=config)
|
||||||
|
|
||||||
|
|
||||||
class Chinese(Language):
|
class Chinese(Language):
|
||||||
|
@ -103,4 +297,13 @@ class Chinese(Language):
|
||||||
return self.tokenizer(text)
|
return self.tokenizer(text)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_pkuseg_trie_data(node, path=""):
|
||||||
|
data = []
|
||||||
|
for c, child_node in sorted(node.children.items()):
|
||||||
|
data.extend(_get_pkuseg_trie_data(child_node, path + c))
|
||||||
|
if node.isword:
|
||||||
|
data.append((path, node.usertag))
|
||||||
|
return data
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Chinese"]
|
__all__ = ["Chinese"]
|
||||||
|
|
|
@ -969,7 +969,7 @@ class Language(object):
|
||||||
serializers = OrderedDict()
|
serializers = OrderedDict()
|
||||||
serializers["vocab"] = lambda: self.vocab.to_bytes()
|
serializers["vocab"] = lambda: self.vocab.to_bytes()
|
||||||
serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
|
serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
|
||||||
serializers["meta.json"] = lambda: srsly.json_dumps(self.meta)
|
serializers["meta.json"] = lambda: srsly.json_dumps(OrderedDict(sorted(self.meta.items())))
|
||||||
for name, proc in self.pipeline:
|
for name, proc in self.pipeline:
|
||||||
if name in exclude:
|
if name in exclude:
|
||||||
continue
|
continue
|
||||||
|
|
|
@ -231,10 +231,22 @@ def yo_tokenizer():
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def zh_tokenizer():
|
def zh_tokenizer_char():
|
||||||
|
return get_lang_class("zh").Defaults.create_tokenizer(config={"use_jieba": False, "use_pkuseg": False})
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def zh_tokenizer_jieba():
|
||||||
pytest.importorskip("jieba")
|
pytest.importorskip("jieba")
|
||||||
return get_lang_class("zh").Defaults.create_tokenizer()
|
return get_lang_class("zh").Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def zh_tokenizer_pkuseg():
|
||||||
|
pytest.importorskip("pkuseg")
|
||||||
|
return get_lang_class("zh").Defaults.create_tokenizer(config={"pkuseg_model": "default", "use_jieba": False, "use_pkuseg": True})
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def hy_tokenizer():
|
def hy_tokenizer():
|
||||||
return get_lang_class("hy").Defaults.create_tokenizer()
|
return get_lang_class("hy").Defaults.create_tokenizer()
|
||||||
|
|
38
spacy/tests/lang/zh/test_serialize.py
Normal file
38
spacy/tests/lang/zh/test_serialize.py
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from spacy.lang.zh import Chinese
|
||||||
|
from ...util import make_tempdir
|
||||||
|
|
||||||
|
|
||||||
|
def zh_tokenizer_serialize(zh_tokenizer):
    """Check that a Chinese tokenizer round-trips through bytes and disk.

    The tokenizer is serialized once with ``to_bytes``. It is then loaded
    into a fresh ``Chinese`` pipeline, first via ``from_bytes`` and then via
    ``to_disk`` / ``from_disk``, and each time the reloaded tokenizer must
    serialize to the same bytes.
    """
    blank_meta = {"tokenizer": {"config": {"use_jieba": False}}}
    expected_bytes = zh_tokenizer.to_bytes()

    # Round trip through bytes.
    reloaded = Chinese(meta=blank_meta)
    reloaded.tokenizer.from_bytes(expected_bytes)
    assert expected_bytes == reloaded.tokenizer.to_bytes()

    # Round trip through a temporary directory.
    with make_tempdir() as tmp_dir:
        tokenizer_path = tmp_dir / "tokenizer"
        zh_tokenizer.to_disk(tokenizer_path)
        reloaded = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}})
        reloaded.tokenizer.from_disk(tokenizer_path)
        assert expected_bytes == reloaded.tokenizer.to_bytes()
|
||||||
|
|
||||||
|
|
||||||
|
def test_zh_tokenizer_serialize_char(zh_tokenizer_char):
|
||||||
|
zh_tokenizer_serialize(zh_tokenizer_char)
|
||||||
|
|
||||||
|
|
||||||
|
def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
|
||||||
|
zh_tokenizer_serialize(zh_tokenizer_jieba)
|
||||||
|
|
||||||
|
|
||||||
|
def test_zh_tokenizer_serialize_pkuseg(zh_tokenizer_pkuseg):
|
||||||
|
zh_tokenizer_serialize(zh_tokenizer_pkuseg)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.slow
|
||||||
|
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
|
||||||
|
nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False, "use_pkuseg": True, "pkuseg_model": "medicine"}}})
|
||||||
|
zh_tokenizer_serialize(nlp.tokenizer)
|
|
@ -19,7 +19,7 @@ import pytest
|
||||||
(",", False),
|
(",", False),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_lex_attrs_like_number(zh_tokenizer, text, match):
|
def test_lex_attrs_like_number(zh_tokenizer_jieba, text, match):
|
||||||
tokens = zh_tokenizer(text)
|
tokens = zh_tokenizer_jieba(text)
|
||||||
assert len(tokens) == 1
|
assert len(tokens) == 1
|
||||||
assert tokens[0].like_num == match
|
assert tokens[0].like_num == match
|
||||||
|
|
|
@ -5,27 +5,41 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
# fmt: off
|
# fmt: off
|
||||||
TOKENIZER_TESTS = [
|
TEXTS = ("作为语言而言,为世界使用人数最多的语言,目前世界有五分之一人口做为母语。",)
|
||||||
("作为语言而言,为世界使用人数最多的语言,目前世界有五分之一人口做为母语。",
|
JIEBA_TOKENIZER_TESTS = [
|
||||||
|
(TEXTS[0],
|
||||||
['作为', '语言', '而言', ',', '为', '世界', '使用', '人', '数最多',
|
['作为', '语言', '而言', ',', '为', '世界', '使用', '人', '数最多',
|
||||||
'的', '语言', ',', '目前', '世界', '有', '五分之一', '人口', '做',
|
'的', '语言', ',', '目前', '世界', '有', '五分之一', '人口', '做',
|
||||||
'为', '母语', '。']),
|
'为', '母语', '。']),
|
||||||
]
|
]
|
||||||
|
PKUSEG_TOKENIZER_TESTS = [
|
||||||
|
(TEXTS[0],
|
||||||
|
['作为', '语言', '而言', ',', '为', '世界', '使用', '人数', '最多',
|
||||||
|
'的', '语言', ',', '目前', '世界', '有', '五分之一', '人口', '做为',
|
||||||
|
'母语', '。']),
|
||||||
|
]
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
|
@pytest.mark.parametrize("text", TEXTS)
|
||||||
def test_zh_tokenizer(zh_tokenizer, text, expected_tokens):
|
def test_zh_tokenizer_char(zh_tokenizer_char, text):
|
||||||
zh_tokenizer.use_jieba = False
|
tokens = [token.text for token in zh_tokenizer_char(text)]
|
||||||
tokens = [token.text for token in zh_tokenizer(text)]
|
|
||||||
assert tokens == list(text)
|
assert tokens == list(text)
|
||||||
|
|
||||||
zh_tokenizer.use_jieba = True
|
|
||||||
tokens = [token.text for token in zh_tokenizer(text)]
|
@pytest.mark.parametrize("text,expected_tokens", JIEBA_TOKENIZER_TESTS)
|
||||||
|
def test_zh_tokenizer_jieba(zh_tokenizer_jieba, text, expected_tokens):
|
||||||
|
tokens = [token.text for token in zh_tokenizer_jieba(text)]
|
||||||
assert tokens == expected_tokens
|
assert tokens == expected_tokens
|
||||||
|
|
||||||
|
|
||||||
def test_extra_spaces(zh_tokenizer):
|
@pytest.mark.parametrize("text,expected_tokens", PKUSEG_TOKENIZER_TESTS)
|
||||||
|
def test_zh_tokenizer_pkuseg(zh_tokenizer_pkuseg, text, expected_tokens):
|
||||||
|
tokens = [token.text for token in zh_tokenizer_pkuseg(text)]
|
||||||
|
assert tokens == expected_tokens
|
||||||
|
|
||||||
|
|
||||||
|
def test_extra_spaces(zh_tokenizer_char):
|
||||||
# note: three spaces after "I"
|
# note: three spaces after "I"
|
||||||
tokens = zh_tokenizer("I like cheese.")
|
tokens = zh_tokenizer_char("I like cheese.")
|
||||||
assert tokens[1].orth_ == " "
|
assert tokens[1].orth_ == " "
|
||||||
|
|
Loading…
Reference in New Issue
Block a user