Mirror of https://github.com/explosion/spaCy.git
Rework Chinese language initialization and tokenization (#4619)
* Rework Chinese language initialization
* Create a `ChineseTokenizer` class
* Modify jieba post-processing to handle whitespace correctly
* Modify non-jieba character tokenization to handle whitespace correctly
* Add a `create_tokenizer()` method to `ChineseDefaults`
* Load lexical attributes
* Update Chinese tag_map for UD v2
* Add very basic Chinese tests
* Test tokenization with and without jieba
* Test `like_num` attribute
* Fix try_jieba_import()
* Fix zh code formatting
This commit is contained in:
parent 4d85f67eee
commit 0b9a5f4074
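For context, a minimal usage sketch of the reworked pipeline (not part of the commit; it assumes a spaCy install that includes these changes plus the optional jieba package, and the jieba segmentation shown in the first comment is illustrative):

from spacy.lang.zh import Chinese

# Jieba-based word segmentation is the default (ChineseDefaults.use_jieba = True).
nlp = Chinese()
doc = nlp("我喜欢奶酪")
print([t.text for t in doc])   # e.g. ['我', '喜欢', '奶酪']

# Switch the tokenizer instance to character-based segmentation.
nlp.tokenizer.use_jieba = False
doc = nlp("我喜欢奶酪")
print([t.text for t in doc])   # ['我', '喜', '欢', '奶', '酪']

If jieba is not installed and `use_jieba` is left at True, constructing the tokenizer raises the ImportError produced by `try_jieba_import()` in the diff below.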
spacy/lang/zh/__init__.py

@@ -4,19 +4,92 @@ from __future__ import unicode_literals
 from ...attrs import LANG
 from ...language import Language
 from ...tokens import Doc
+from ...util import DummyTokenizer
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 
 
+def try_jieba_import(use_jieba):
+    try:
+        import jieba
+        return jieba
+    except ImportError:
+        if use_jieba:
+            msg = (
+                "Jieba not installed. Either set Chinese.use_jieba = False, "
+                "or install it https://github.com/fxsjy/jieba"
+            )
+            raise ImportError(msg)
+
+
+class ChineseTokenizer(DummyTokenizer):
+    def __init__(self, cls, nlp=None):
+        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
+        self.use_jieba = cls.use_jieba
+        self.jieba_seg = try_jieba_import(self.use_jieba)
+        self.tokenizer = Language.Defaults().create_tokenizer(nlp)
+
+    def __call__(self, text):
+        # use jieba
+        if self.use_jieba:
+            jieba_words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x])
+            words = [jieba_words[0]]
+            spaces = [False]
+            for i in range(1, len(jieba_words)):
+                word = jieba_words[i]
+                if word.isspace():
+                    # second token in adjacent whitespace following a
+                    # non-space token
+                    if spaces[-1]:
+                        words.append(word)
+                        spaces.append(False)
+                    # first space token following non-space token
+                    elif word == " " and not words[-1].isspace():
+                        spaces[-1] = True
+                    # token is non-space whitespace or any whitespace following
+                    # a whitespace token
+                    else:
+                        # extend previous whitespace token with more whitespace
+                        if words[-1].isspace():
+                            words[-1] += word
+                        # otherwise it's a new whitespace token
+                        else:
+                            words.append(word)
+                            spaces.append(False)
+                else:
+                    words.append(word)
+                    spaces.append(False)
+            return Doc(self.vocab, words=words, spaces=spaces)
+
+        # split into individual characters
+        words = []
+        spaces = []
+        for token in self.tokenizer(text):
+            if token.text.isspace():
+                words.append(token.text)
+                spaces.append(False)
+            else:
+                words.extend(list(token.text))
+                spaces.extend([False] * len(token.text))
+                spaces[-1] = bool(token.whitespace_)
+        return Doc(self.vocab, words=words, spaces=spaces)
+
+
 class ChineseDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: "zh"
-    use_jieba = True
     tokenizer_exceptions = BASE_EXCEPTIONS
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
+    use_jieba = True
+
+    @classmethod
+    def create_tokenizer(cls, nlp=None):
+        return ChineseTokenizer(cls, nlp)
 
 
 class Chinese(Language):
@@ -24,26 +97,7 @@ class Chinese(Language):
     Defaults = ChineseDefaults  # override defaults
 
     def make_doc(self, text):
-        if self.Defaults.use_jieba:
-            try:
-                import jieba
-            except ImportError:
-                msg = (
-                    "Jieba not installed. Either set Chinese.use_jieba = False, "
-                    "or install it https://github.com/fxsjy/jieba"
-                )
-                raise ImportError(msg)
-            words = list(jieba.cut(text, cut_all=False))
-            words = [x for x in words if x]
-            return Doc(self.vocab, words=words, spaces=[False] * len(words))
-        else:
-            words = []
-            spaces = []
-            for token in self.tokenizer(text):
-                words.extend(list(token.text))
-                spaces.extend([False] * len(token.text))
-                spaces[-1] = bool(token.whitespace_)
-            return Doc(self.vocab, words=words, spaces=spaces)
+        return self.tokenizer(text)
 
 
 __all__ = ["Chinese"]
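The subtle part of the new `ChineseTokenizer` is how the jieba branch folds whitespace back into the `Doc`: a single space directly after a word becomes that word's trailing space (`spaces[-1] = True`), while any further or non-space whitespace is kept, or extended, as a standalone whitespace token. Below is a standalone sketch of that loop on plain lists; the helper name and the example segment lists are made up for illustration:

def merge_whitespace(segments):
    # Mirrors the jieba branch of ChineseTokenizer.__call__ above:
    # one space after a word becomes that word's trailing space,
    # everything else becomes (or extends) a whitespace token.
    words = [segments[0]]
    spaces = [False]
    for word in segments[1:]:
        if word.isspace():
            if spaces[-1]:
                words.append(word)
                spaces.append(False)
            elif word == " " and not words[-1].isspace():
                spaces[-1] = True
            elif words[-1].isspace():
                words[-1] += word
            else:
                words.append(word)
                spaces.append(False)
        else:
            words.append(word)
            spaces.append(False)
    return words, spaces


# Hypothetical segmenter outputs for "I   like cheese." (three spaces after "I"):
print(merge_whitespace(["I", "   ", "like", " ", "cheese", "."]))
# -> (['I', '   ', 'like', 'cheese', '.'], [False, False, True, False, False])
print(merge_whitespace(["I", " ", " ", " ", "like", " ", "cheese", "."]))
# -> (['I', '  ', 'like', 'cheese', '.'], [True, False, True, False, False])

In both cases `Doc(vocab, words=words, spaces=spaces)` reconstructs the original string exactly, which is what the whitespace bookkeeping is for.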
spacy/lang/zh/tag_map.py

@@ -1,11 +1,12 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ...symbols import POS, PUNCT, ADJ, CONJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
-from ...symbols import NOUN, PART, INTJ, PRON
+from ...symbols import POS, PUNCT, ADJ, SCONJ, CCONJ, NUM, DET, ADV, ADP, X
+from ...symbols import NOUN, PART, INTJ, PRON, VERB, SPACE
 
-# The Chinese part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank tag set.
-# We also map the tags to the simpler Google Universal POS tag set.
+# The Chinese part-of-speech tagger uses the OntoNotes 5 version of the Penn
+# Treebank tag set. We also map the tags to the simpler Universal Dependencies
+# v2 tag set.
 
 TAG_MAP = {
     "AS": {POS: PART},

@@ -38,10 +39,11 @@
     "OD": {POS: NUM},
     "DT": {POS: DET},
     "CC": {POS: CCONJ},
-    "CS": {POS: CONJ},
+    "CS": {POS: SCONJ},
     "AD": {POS: ADV},
     "JJ": {POS: ADJ},
     "P": {POS: ADP},
     "PN": {POS: PRON},
     "PU": {POS: PUNCT},
+    "_SP": {POS: SPACE},
 }
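The tag map update aligns the Chinese mapping with Universal Dependencies v2: the OntoNotes `CS` tag (subordinating conjunction) now maps to `SCONJ` instead of `CONJ`, and the internal `_SP` whitespace tag gets its own `SPACE` entry. A quick sanity-check sketch, assuming a spaCy 2.x install that includes this change:

from spacy.symbols import POS, SCONJ, SPACE
from spacy.lang.zh.tag_map import TAG_MAP

# Fine-grained OntoNotes/Penn-style tags map to coarse UD v2 POS values.
assert TAG_MAP["CS"][POS] == SCONJ   # subordinating conjunction, previously CONJ
assert TAG_MAP["_SP"][POS] == SPACE  # whitespace tokens now have their own entry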
spacy/tests/conftest.py

@@ -218,3 +218,9 @@ def uk_tokenizer():
 @pytest.fixture(scope="session")
 def ur_tokenizer():
     return get_lang_class("ur").Defaults.create_tokenizer()
+
+
+@pytest.fixture(scope="session")
+def zh_tokenizer():
+    pytest.importorskip("jieba")
+    return get_lang_class("zh").Defaults.create_tokenizer()
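The new session-scoped `zh_tokenizer` fixture calls `pytest.importorskip("jieba")`, so the Chinese tests are skipped rather than failing when the optional jieba dependency is missing. A hypothetical test using the fixture (illustrative only, not part of this commit):

def test_zh_doc_not_empty(zh_tokenizer):
    # zh_tokenizer is the ChineseTokenizer created by the fixture above;
    # calling it returns a spaCy Doc.
    doc = zh_tokenizer("蜘蛛")
    assert len(doc) > 0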
spacy/tests/lang/zh/__init__.py (new file, 0 lines)
spacy/tests/lang/zh/test_text.py (new file, 25 lines)

@@ -0,0 +1,25 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,match",
+    [
+        ("10", True),
+        ("1", True),
+        ("999.0", True),
+        ("一", True),
+        ("二", True),
+        ("〇", True),
+        ("十一", True),
+        ("狗", False),
+        (",", False),
+    ],
+)
+def test_lex_attrs_like_number(zh_tokenizer, text, match):
+    tokens = zh_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].like_num == match
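test_lex_attrs_like_number exercises the `like_num` lexical attribute that the reworked `ChineseDefaults` now loads via `LEX_ATTRS`. For illustration only, a simplified stand-in for such an attribute getter might look like the sketch below; this is not the actual implementation in spacy/lang/zh/lex_attrs.py:

from spacy.attrs import LIKE_NUM

_zh_num_chars = "〇一二三四五六七八九十百千万亿两"

def like_num(text):
    # Arabic numerals, optionally with a decimal point or comma separators.
    stripped = text.replace(",", "").replace(",", "")
    if stripped.replace(".", "", 1).isdigit():
        return True
    # Strings made up entirely of Chinese numeral characters, e.g. "十一".
    if stripped and all(char in _zh_num_chars for char in stripped):
        return True
    return False

LEX_ATTRS = {LIKE_NUM: like_num}

The sketch only aims to cover the cases in the parametrized test above.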
spacy/tests/lang/zh/test_tokenizer.py (new file, 31 lines)

@@ -0,0 +1,31 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+# fmt: off
+TOKENIZER_TESTS = [
+    ("作为语言而言,为世界使用人数最多的语言,目前世界有五分之一人口做为母语。",
+     ['作为', '语言', '而言', ',', '为', '世界', '使用', '人', '数最多',
+      '的', '语言', ',', '目前', '世界', '有', '五分之一', '人口', '做',
+      '为', '母语', '。']),
+]
+# fmt: on
+
+
+@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
+def test_zh_tokenizer(zh_tokenizer, text, expected_tokens):
+    zh_tokenizer.use_jieba = False
+    tokens = [token.text for token in zh_tokenizer(text)]
+    assert tokens == list(text)
+
+    zh_tokenizer.use_jieba = True
+    tokens = [token.text for token in zh_tokenizer(text)]
+    assert tokens == expected_tokens
+
+
+def test_extra_spaces(zh_tokenizer):
+    # note: three spaces after "I"
+    tokens = zh_tokenizer("I   like cheese.")
+    assert tokens[1].orth_ == "   "