Rework Chinese language initialization and tokenization (#4619)

* Rework Chinese language initialization

* Create a `ChineseTokenizer` class
  * Modify jieba post-processing to handle whitespace correctly
  * Modify non-jieba character tokenization to handle whitespace correctly

* Add a `create_tokenizer()` method to `ChineseDefaults`

* Load lexical attributes

* Update Chinese tag_map for UD v2

* Add very basic Chinese tests

* Test tokenization with and without jieba

* Test `like_num` attribute

* Fix try_jieba_import()

* Fix zh code formatting
adrianeboyd authored 2019-11-11 14:23:21 +01:00; committed by Ines Montani
parent 4d85f67eee
commit 0b9a5f4074
6 changed files with 144 additions and 26 deletions
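
To illustrate how the reworked defaults are intended to be used, here is a minimal usage sketch (not part of the diff; it assumes a spaCy install with this change applied and the jieba package available):

# Minimal usage sketch, not part of the commit.
from spacy.lang.zh import Chinese

nlp = Chinese()  # ChineseDefaults.create_tokenizer() plugs in ChineseTokenizer
doc = nlp("作为语言而言,为世界使用人数最多的语言")
print([t.text for t in doc])    # jieba word segmentation

nlp.tokenizer.use_jieba = False  # fall back to per-character segmentation
doc = nlp("作为语言而言")
print([t.text for t in doc])    # ['作', '为', '语', '言', '而', '言']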


@@ -4,19 +4,92 @@ from __future__ import unicode_literals
 from ...attrs import LANG
 from ...language import Language
 from ...tokens import Doc
+from ...util import DummyTokenizer
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP


+def try_jieba_import(use_jieba):
+    try:
+        import jieba
+
+        return jieba
+    except ImportError:
+        if use_jieba:
+            msg = (
+                "Jieba not installed. Either set Chinese.use_jieba = False, "
+                "or install it https://github.com/fxsjy/jieba"
+            )
+            raise ImportError(msg)
+
+
+class ChineseTokenizer(DummyTokenizer):
+    def __init__(self, cls, nlp=None):
+        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
+        self.use_jieba = cls.use_jieba
+        self.jieba_seg = try_jieba_import(self.use_jieba)
+        self.tokenizer = Language.Defaults().create_tokenizer(nlp)
+
+    def __call__(self, text):
+        # use jieba
+        if self.use_jieba:
+            jieba_words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x])
+            words = [jieba_words[0]]
+            spaces = [False]
+            for i in range(1, len(jieba_words)):
+                word = jieba_words[i]
+                if word.isspace():
+                    # second token in adjacent whitespace following a
+                    # non-space token
+                    if spaces[-1]:
+                        words.append(word)
+                        spaces.append(False)
+                    # first space token following non-space token
+                    elif word == " " and not words[-1].isspace():
+                        spaces[-1] = True
+                    # token is non-space whitespace or any whitespace following
+                    # a whitespace token
+                    else:
+                        # extend previous whitespace token with more whitespace
+                        if words[-1].isspace():
+                            words[-1] += word
+                        # otherwise it's a new whitespace token
+                        else:
+                            words.append(word)
+                            spaces.append(False)
+                else:
+                    words.append(word)
+                    spaces.append(False)
+            return Doc(self.vocab, words=words, spaces=spaces)
+
+        # split into individual characters
+        words = []
+        spaces = []
+        for token in self.tokenizer(text):
+            if token.text.isspace():
+                words.append(token.text)
+                spaces.append(False)
+            else:
+                words.extend(list(token.text))
+                spaces.extend([False] * len(token.text))
+                spaces[-1] = bool(token.whitespace_)
+        return Doc(self.vocab, words=words, spaces=spaces)
+
+
 class ChineseDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: "zh"
+    use_jieba = True
     tokenizer_exceptions = BASE_EXCEPTIONS
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
-    use_jieba = True
+
+    @classmethod
+    def create_tokenizer(cls, nlp=None):
+        return ChineseTokenizer(cls, nlp)


 class Chinese(Language):
@@ -24,26 +97,7 @@ class Chinese(Language):
     Defaults = ChineseDefaults  # override defaults

     def make_doc(self, text):
-        if self.Defaults.use_jieba:
-            try:
-                import jieba
-            except ImportError:
-                msg = (
-                    "Jieba not installed. Either set Chinese.use_jieba = False, "
-                    "or install it https://github.com/fxsjy/jieba"
-                )
-                raise ImportError(msg)
-            words = list(jieba.cut(text, cut_all=False))
-            words = [x for x in words if x]
-            return Doc(self.vocab, words=words, spaces=[False] * len(words))
-        else:
-            words = []
-            spaces = []
-            for token in self.tokenizer(text):
-                words.extend(list(token.text))
-                spaces.extend([False] * len(token.text))
-                spaces[-1] = bool(token.whitespace_)
-            return Doc(self.vocab, words=words, spaces=spaces)
+        return self.tokenizer(text)


 __all__ = ["Chinese"]


@@ -1,11 +1,12 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...symbols import POS, PUNCT, ADJ, CONJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
-from ...symbols import NOUN, PART, INTJ, PRON
+from ...symbols import POS, PUNCT, ADJ, SCONJ, CCONJ, NUM, DET, ADV, ADP, X
+from ...symbols import NOUN, PART, INTJ, PRON, VERB, SPACE

-# The Chinese part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank tag set.
-# We also map the tags to the simpler Google Universal POS tag set.
+# The Chinese part-of-speech tagger uses the OntoNotes 5 version of the Penn
+# Treebank tag set. We also map the tags to the simpler Universal Dependencies
+# v2 tag set.

 TAG_MAP = {
     "AS": {POS: PART},
@@ -38,10 +39,11 @@ TAG_MAP = {
     "OD": {POS: NUM},
     "DT": {POS: DET},
     "CC": {POS: CCONJ},
-    "CS": {POS: CONJ},
+    "CS": {POS: SCONJ},
     "AD": {POS: ADV},
     "JJ": {POS: ADJ},
     "P": {POS: ADP},
     "PN": {POS: PRON},
     "PU": {POS: PUNCT},
+    "_SP": {POS: SPACE},
 }
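
As a quick sanity check of the UD v2 mapping above (an illustrative snippet, not part of the commit's tests):

# Illustrative check of the updated tag map (not part of the commit).
from spacy.lang.zh.tag_map import TAG_MAP
from spacy.symbols import POS, SCONJ, SPACE

assert TAG_MAP["CS"][POS] == SCONJ   # subordinating conjunctions, previously CONJ
assert TAG_MAP["_SP"][POS] == SPACE  # whitespace pseudo-tag added for UD v2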


@@ -218,3 +218,9 @@ def uk_tokenizer():
 @pytest.fixture(scope="session")
 def ur_tokenizer():
     return get_lang_class("ur").Defaults.create_tokenizer()
+
+
+@pytest.fixture(scope="session")
+def zh_tokenizer():
+    pytest.importorskip("jieba")
+    return get_lang_class("zh").Defaults.create_tokenizer()


@@ -0,0 +1,25 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,match",
+    [
+        ("10", True),
+        ("1", True),
+        ("999.0", True),
+        ("", True),
+        ("", True),
+        ("", True),
+        ("十一", True),
+        ("", False),
+        (",", False),
+    ],
+)
+def test_lex_attrs_like_number(zh_tokenizer, text, match):
+    tokens = zh_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].like_num == match


@@ -0,0 +1,31 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+# fmt: off
+TOKENIZER_TESTS = [
+    ("作为语言而言,为世界使用人数最多的语言,目前世界有五分之一人口做为母语。",
+     ['作为', '语言', '而言', ',', '为', '世界', '使用', '人', '数最多',
+      '的', '语言', ',', '目前', '世界', '有', '五分之一', '人口', '做',
+      '为', '母语', '。']),
+]
+# fmt: on
+
+
+@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
+def test_zh_tokenizer(zh_tokenizer, text, expected_tokens):
+    zh_tokenizer.use_jieba = False
+    tokens = [token.text for token in zh_tokenizer(text)]
+    assert tokens == list(text)
+
+    zh_tokenizer.use_jieba = True
+    tokens = [token.text for token in zh_tokenizer(text)]
+    assert tokens == expected_tokens
+
+
+def test_extra_spaces(zh_tokenizer):
+    # note: three spaces after "I"
+    tokens = zh_tokenizer("I   like cheese.")
+    assert tokens[1].orth_ == "  "
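
test_extra_spaces exercises the whitespace handling of the jieba branch; a hypothetical companion test for the per-character branch (not part of this commit, and assuming the session-scoped zh_tokenizer fixture above) might look like this:

# Hypothetical extra test, not in the commit: the per-character branch keeps
# whitespace alignment via the `spaces` list.
def test_zh_char_segmentation_with_space(zh_tokenizer):
    zh_tokenizer.use_jieba = False
    doc = zh_tokenizer("中文 字")
    assert [t.text for t in doc] == ["中", "文", "字"]
    assert doc.text == "中文 字"  # the space survives as token 1's trailing space
    zh_tokenizer.use_jieba = True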