Rework Chinese language initialization and tokenization (#4619)

* Rework Chinese language initialization

* Create a `ChineseTokenizer` class
  * Modify jieba post-processing to handle whitespace correctly
  * Modify non-jieba character tokenization to handle whitespace correctly

* Add a `create_tokenizer()` method to `ChineseDefaults`

* Load lexical attributes

* Update Chinese tag_map for UD v2

* Add very basic Chinese tests

* Test tokenization with and without jieba

* Test `like_num` attribute

* Fix `try_jieba_import()`

* Fix zh code formatting
Authored by adrianeboyd on 2019-11-11 14:23:21 +01:00, committed by Ines Montani
parent 4d85f67eee
commit 0b9a5f4074
6 changed files with 144 additions and 26 deletions
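As a quick orientation before the per-file diffs, here is a minimal usage sketch of the reworked pipeline (not part of the commit; it assumes spaCy with this change installed and, for the jieba path, the jieba package):

    from spacy.lang.zh import Chinese

    nlp = Chinese()  # ChineseDefaults.create_tokenizer() now builds a ChineseTokenizer
    doc = nlp("作为语言而言，为世界使用人数最多的语言，目前世界有五分之一人口做为母语。")
    print([t.text for t in doc])  # jieba word segmentation, e.g. ['作为', '语言', '而言', '，', ...]

    nlp.tokenizer.use_jieba = False  # switch to the character-based fallback
    doc = nlp("作为语言而言，为世界使用人数最多的语言，目前世界有五分之一人口做为母语。")
    print([t.text for t in doc])  # one token per character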

spacy/lang/zh/__init__.py

@@ -4,19 +4,92 @@ from __future__ import unicode_literals
 from ...attrs import LANG
 from ...language import Language
 from ...tokens import Doc
+from ...util import DummyTokenizer
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP


+def try_jieba_import(use_jieba):
+    try:
+        import jieba
+        return jieba
+    except ImportError:
+        if use_jieba:
+            msg = (
+                "Jieba not installed. Either set Chinese.use_jieba = False, "
+                "or install it https://github.com/fxsjy/jieba"
+            )
+            raise ImportError(msg)
+
+
+class ChineseTokenizer(DummyTokenizer):
+    def __init__(self, cls, nlp=None):
+        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
+        self.use_jieba = cls.use_jieba
+        self.jieba_seg = try_jieba_import(self.use_jieba)
+        self.tokenizer = Language.Defaults().create_tokenizer(nlp)
+
+    def __call__(self, text):
+        # use jieba
+        if self.use_jieba:
+            jieba_words = list(
+                [x for x in self.jieba_seg.cut(text, cut_all=False) if x]
+            )
+            words = [jieba_words[0]]
+            spaces = [False]
+            for i in range(1, len(jieba_words)):
+                word = jieba_words[i]
+                if word.isspace():
+                    # second token in adjacent whitespace following a
+                    # non-space token
+                    if spaces[-1]:
+                        words.append(word)
+                        spaces.append(False)
+                    # first space token following non-space token
+                    elif word == " " and not words[-1].isspace():
+                        spaces[-1] = True
+                    # token is non-space whitespace or any whitespace following
+                    # a whitespace token
+                    else:
+                        # extend previous whitespace token with more whitespace
+                        if words[-1].isspace():
+                            words[-1] += word
+                        # otherwise it's a new whitespace token
+                        else:
+                            words.append(word)
+                            spaces.append(False)
+                else:
+                    words.append(word)
+                    spaces.append(False)
+            return Doc(self.vocab, words=words, spaces=spaces)
+
+        # split into individual characters
+        words = []
+        spaces = []
+        for token in self.tokenizer(text):
+            if token.text.isspace():
+                words.append(token.text)
+                spaces.append(False)
+            else:
+                words.extend(list(token.text))
+                spaces.extend([False] * len(token.text))
+                spaces[-1] = bool(token.whitespace_)
+        return Doc(self.vocab, words=words, spaces=spaces)
+
+
 class ChineseDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: "zh"
-    use_jieba = True
     tokenizer_exceptions = BASE_EXCEPTIONS
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
+    use_jieba = True
+
+    @classmethod
+    def create_tokenizer(cls, nlp=None):
+        return ChineseTokenizer(cls, nlp)


 class Chinese(Language):
@@ -24,26 +97,7 @@ class Chinese(Language):
     Defaults = ChineseDefaults  # override defaults

     def make_doc(self, text):
-        if self.Defaults.use_jieba:
-            try:
-                import jieba
-            except ImportError:
-                msg = (
-                    "Jieba not installed. Either set Chinese.use_jieba = False, "
-                    "or install it https://github.com/fxsjy/jieba"
-                )
-                raise ImportError(msg)
-            words = list(jieba.cut(text, cut_all=False))
-            words = [x for x in words if x]
-            return Doc(self.vocab, words=words, spaces=[False] * len(words))
-        else:
-            words = []
-            spaces = []
-            for token in self.tokenizer(text):
-                words.extend(list(token.text))
-                spaces.extend([False] * len(token.text))
-                spaces[-1] = bool(token.whitespace_)
-            return Doc(self.vocab, words=words, spaces=spaces)
+        return self.tokenizer(text)


 __all__ = ["Chinese"]
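The whitespace post-processing in `ChineseTokenizer.__call__` is easiest to see on mixed text: a single space directly after a word is folded into that word's trailing-space flag, while any further whitespace is kept as its own token, so the `Doc` always round-trips the input. A small sketch, assuming jieba is installed (the input mirrors `test_extra_spaces` below):

    from spacy.lang.zh import Chinese

    nlp = Chinese()
    doc = nlp("I   like cheese.")  # three spaces after "I"

    # "I" carries one space via its whitespace_ flag; the remaining two
    # spaces become a single whitespace token (doc[1]).
    print([(t.text, t.whitespace_) for t in doc])
    assert "".join(t.text_with_ws for t in doc) == "I   like cheese."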

spacy/lang/zh/tag_map.py

@@ -1,11 +1,12 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...symbols import POS, PUNCT, ADJ, CONJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
-from ...symbols import NOUN, PART, INTJ, PRON
+from ...symbols import POS, PUNCT, ADJ, SCONJ, CCONJ, NUM, DET, ADV, ADP, X
+from ...symbols import NOUN, PART, INTJ, PRON, VERB, SPACE

-# The Chinese part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank tag set.
-# We also map the tags to the simpler Google Universal POS tag set.
+# The Chinese part-of-speech tagger uses the OntoNotes 5 version of the Penn
+# Treebank tag set. We also map the tags to the simpler Universal Dependencies
+# v2 tag set.

 TAG_MAP = {
     "AS": {POS: PART},
@@ -38,10 +39,11 @@ TAG_MAP = {
     "OD": {POS: NUM},
     "DT": {POS: DET},
     "CC": {POS: CCONJ},
-    "CS": {POS: CONJ},
+    "CS": {POS: SCONJ},
     "AD": {POS: ADV},
     "JJ": {POS: ADJ},
     "P": {POS: ADP},
     "PN": {POS: PRON},
     "PU": {POS: PUNCT},
+    "_SP": {POS: SPACE},
 }
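The effect of the tag map update can be checked directly against the imported symbols; a brief sketch (illustrative only, not part of the commit):

    from spacy.lang.zh.tag_map import TAG_MAP
    from spacy.symbols import POS, SCONJ, SPACE

    assert TAG_MAP["CS"][POS] == SCONJ   # subordinating conjunction, was CONJ
    assert TAG_MAP["_SP"][POS] == SPACE  # whitespace pseudo-tag added for UD v2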

spacy/tests/conftest.py

@@ -218,3 +218,9 @@ def uk_tokenizer():
 @pytest.fixture(scope="session")
 def ur_tokenizer():
     return get_lang_class("ur").Defaults.create_tokenizer()
+
+
+@pytest.fixture(scope="session")
+def zh_tokenizer():
+    pytest.importorskip("jieba")
+    return get_lang_class("zh").Defaults.create_tokenizer()
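The fixture above skips the whole zh suite when jieba is missing. The character-based fallback can still be constructed without jieba by flipping the class-level flag before the tokenizer is created, since `try_jieba_import()` only raises when `use_jieba` is True; a hypothetical sketch, not part of the commit:

    from spacy.lang.zh import Chinese

    Chinese.Defaults.use_jieba = False  # must be set before create_tokenizer()
    tokenizer = Chinese.Defaults.create_tokenizer()
    assert [t.text for t in tokenizer("母语")] == ["母", "语"]

Note that this toggles the class-wide default; the tests below instead flip `use_jieba` on an existing tokenizer instance.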

spacy/tests/lang/zh/__init__.py (new file, empty)

spacy/tests/lang/zh/test_text.py

@@ -0,0 +1,25 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,match",
+    [
+        ("10", True),
+        ("1", True),
+        ("999.0", True),
+        ("一", True),
+        ("两", True),
+        ("亿", True),
+        ("十一", True),
+        ("狗", False),
+        (",", False),
+    ],
+)
+def test_lex_attrs_like_number(zh_tokenizer, text, match):
+    tokens = zh_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].like_num == match
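With `LEX_ATTRS` now loaded in `ChineseDefaults`, the `like_num` attribute is also available outside the fixture-based test; a minimal sketch, assuming jieba is installed:

    from spacy.lang.zh import Chinese

    nlp = Chinese()
    assert nlp("十一")[0].like_num   # Chinese numeral, as in the test above
    assert not nlp(",")[0].like_num  # punctuation is not number-like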

spacy/tests/lang/zh/test_tokenizer.py

@@ -0,0 +1,31 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+# fmt: off
+TOKENIZER_TESTS = [
+    ("作为语言而言，为世界使用人数最多的语言，目前世界有五分之一人口做为母语。",
+     ['作为', '语言', '而言', '，', '为', '世界', '使用', '人', '数最多',
+      '的', '语言', '，', '目前', '世界', '有', '五分之一', '人口', '做',
+      '为', '母语', '。']),
+]
+# fmt: on
+
+
+@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
+def test_zh_tokenizer(zh_tokenizer, text, expected_tokens):
+    zh_tokenizer.use_jieba = False
+    tokens = [token.text for token in zh_tokenizer(text)]
+    assert tokens == list(text)
+
+    zh_tokenizer.use_jieba = True
+    tokens = [token.text for token in zh_tokenizer(text)]
+    assert tokens == expected_tokens
+
+
+def test_extra_spaces(zh_tokenizer):
+    # note: three spaces after "I"
+    tokens = zh_tokenizer("I   like cheese.")
+    assert tokens[1].orth_ == "  "