Mirror of https://github.com/explosion/spaCy.git, synced 2025-11-04 01:48:04 +03:00
	Rework Chinese language initialization and tokenization (#4619)
* Rework Chinese language initialization
* Create a `ChineseTokenizer` class
* Modify jieba post-processing to handle whitespace correctly
* Modify non-jieba character tokenization to handle whitespace correctly
* Add a `create_tokenizer()` method to `ChineseDefaults`
* Load lexical attributes
* Update Chinese tag_map for UD v2
* Add very basic Chinese tests
* Test tokenization with and without jieba
* Test `like_num` attribute
* Fix try_jieba_import()
* Fix zh code formatting
This commit is contained in:
parent 4d85f67eee
commit 0b9a5f4074
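For orientation, a minimal usage sketch of the reworked API (not part of the diff; it assumes spaCy 2.x with this commit applied and the jieba package installed). `Chinese()` now builds a `ChineseTokenizer` via `ChineseDefaults.create_tokenizer()`, and `make_doc()` simply delegates to it:

# Sketch only: assumes this commit is applied and jieba is installed.
from spacy.lang.zh import Chinese

nlp = Chinese()  # ChineseDefaults.create_tokenizer() builds a ChineseTokenizer
doc = nlp.make_doc("作为语言而言,为世界使用人数最多的语言")
print([t.text for t in doc])  # jieba word segmentation (use_jieba defaults to True)

# Flipping the flag on the tokenizer instance falls back to one token per
# character, as the new test_zh_tokenizer test does.
nlp.tokenizer.use_jieba = False
doc = nlp.make_doc("作为语言而言")
print([t.text for t in doc])  # ['作', '为', '语', '言', '而', '言']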
spacy/lang/zh/__init__.py

@@ -4,19 +4,92 @@ from __future__ import unicode_literals
 from ...attrs import LANG
 from ...language import Language
 from ...tokens import Doc
+from ...util import DummyTokenizer
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
+
+
+def try_jieba_import(use_jieba):
+    try:
+        import jieba
+        return jieba
+    except ImportError:
+        if use_jieba:
+            msg = (
+                "Jieba not installed. Either set Chinese.use_jieba = False, "
+                "or install it https://github.com/fxsjy/jieba"
+            )
+            raise ImportError(msg)
+
+
+class ChineseTokenizer(DummyTokenizer):
+    def __init__(self, cls, nlp=None):
+        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
+        self.use_jieba = cls.use_jieba
+        self.jieba_seg = try_jieba_import(self.use_jieba)
+        self.tokenizer = Language.Defaults().create_tokenizer(nlp)
+
+    def __call__(self, text):
+        # use jieba
+        if self.use_jieba:
+            jieba_words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x])
+            words = [jieba_words[0]]
+            spaces = [False]
+            for i in range(1, len(jieba_words)):
+                word = jieba_words[i]
+                if word.isspace():
+                    # second token in adjacent whitespace following a
+                    # non-space token
+                    if spaces[-1]:
+                        words.append(word)
+                        spaces.append(False)
+                    # first space token following non-space token
+                    elif word == " " and not words[-1].isspace():
+                        spaces[-1] = True
+                    # token is non-space whitespace or any whitespace following
+                    # a whitespace token
+                    else:
+                        # extend previous whitespace token with more whitespace
+                        if words[-1].isspace():
+                            words[-1] += word
+                        # otherwise it's a new whitespace token
+                        else:
+                            words.append(word)
+                            spaces.append(False)
+                else:
+                    words.append(word)
+                    spaces.append(False)
+            return Doc(self.vocab, words=words, spaces=spaces)
+
+        # split into individual characters
+        words = []
+        spaces = []
+        for token in self.tokenizer(text):
+            if token.text.isspace():
+                words.append(token.text)
+                spaces.append(False)
+            else:
+                words.extend(list(token.text))
+                spaces.extend([False] * len(token.text))
+                spaces[-1] = bool(token.whitespace_)
+        return Doc(self.vocab, words=words, spaces=spaces)
+
+
 class ChineseDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: "zh"
-    use_jieba = True
     tokenizer_exceptions = BASE_EXCEPTIONS
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
+    use_jieba = True
+
+    @classmethod
+    def create_tokenizer(cls, nlp=None):
+        return ChineseTokenizer(cls, nlp)
+
+
 class Chinese(Language):

@@ -24,26 +97,7 @@ class Chinese(Language):
     Defaults = ChineseDefaults  # override defaults

     def make_doc(self, text):
-        if self.Defaults.use_jieba:
-            try:
-                import jieba
-            except ImportError:
-                msg = (
-                    "Jieba not installed. Either set Chinese.use_jieba = False, "
-                    "or install it https://github.com/fxsjy/jieba"
-                )
-                raise ImportError(msg)
-            words = list(jieba.cut(text, cut_all=False))
-            words = [x for x in words if x]
-            return Doc(self.vocab, words=words, spaces=[False] * len(words))
-        else:
-            words = []
-            spaces = []
-            for token in self.tokenizer(text):
-                words.extend(list(token.text))
-                spaces.extend([False] * len(token.text))
-                spaces[-1] = bool(token.whitespace_)
-            return Doc(self.vocab, words=words, spaces=spaces)
+        return self.tokenizer(text)


 __all__ = ["Chinese"]
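The whitespace bookkeeping above exists so that `Doc(vocab, words=..., spaces=...)` round-trips the input text: a single space after a token is recorded in `spaces`, while any additional whitespace becomes a token of its own. A small illustration of that invariant (not part of the commit; it only uses the public `Doc` and `Vocab` constructors, and the word/space lists are a hand-worked example of what the jieba branch roughly produces):

# Illustration only: why the words/spaces alignment matters.
from spacy.tokens import Doc
from spacy.vocab import Vocab

# For "I   like cheese." the single space after "I" is folded into spaces[0],
# and the two *extra* spaces are kept as their own token (this is what the new
# test_extra_spaces test checks).
words = ["I", "  ", "like", "cheese", "."]
spaces = [True, False, True, False, False]
doc = Doc(Vocab(), words=words, spaces=spaces)

assert doc.text == "I   like cheese."
assert doc[1].orth_ == "  "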
spacy/lang/zh/tag_map.py

@@ -1,11 +1,12 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...symbols import POS, PUNCT, ADJ, CONJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
-from ...symbols import NOUN, PART, INTJ, PRON
+from ...symbols import POS, PUNCT, ADJ, SCONJ, CCONJ, NUM, DET, ADV, ADP, X
+from ...symbols import NOUN, PART, INTJ, PRON, VERB, SPACE

-# The Chinese part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank tag set.
-# We also map the tags to the simpler Google Universal POS tag set.
+# The Chinese part-of-speech tagger uses the OntoNotes 5 version of the Penn
+# Treebank tag set. We also map the tags to the simpler Universal Dependencies
+# v2 tag set.

 TAG_MAP = {
     "AS": {POS: PART},

@@ -38,10 +39,11 @@ TAG_MAP = {
     "OD": {POS: NUM},
     "DT": {POS: DET},
     "CC": {POS: CCONJ},
-    "CS": {POS: CONJ},
+    "CS": {POS: SCONJ},
     "AD": {POS: ADV},
     "JJ": {POS: ADJ},
     "P": {POS: ADP},
     "PN": {POS: PRON},
     "PU": {POS: PUNCT},
+    "_SP": {POS: SPACE},
 }
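As the updated comment says, the fine-grained tags are OntoNotes 5 / Penn Chinese Treebank tags, and `TAG_MAP` assigns each one a coarse Universal Dependencies v2 part of speech. A rough sketch of the lookup this table encodes (illustrative only; plain strings stand in for the `spacy.symbols` constants used in the real file, and the fallback is an assumption of the sketch, not behaviour taken from the diff):

# Illustrative sketch of the TAG_MAP lookup, using strings instead of the
# spacy.symbols constants from the real module.
TAG_MAP_SKETCH = {
    "CC": "CCONJ",   # coordinating conjunction
    "CS": "SCONJ",   # subordinating conjunction (mapped to CONJ before this commit)
    "PU": "PUNCT",   # punctuation
    "_SP": "SPACE",  # whitespace tokens, new in this commit
}

def coarse_pos(fine_tag):
    # Tags not covered by this sketch fall back to "X" (sketch assumption).
    return TAG_MAP_SKETCH.get(fine_tag, "X")

assert coarse_pos("CS") == "SCONJ"
assert coarse_pos("_SP") == "SPACE"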
spacy/tests/conftest.py

@@ -218,3 +218,9 @@ def uk_tokenizer():
 @pytest.fixture(scope="session")
 def ur_tokenizer():
     return get_lang_class("ur").Defaults.create_tokenizer()
+
+
+@pytest.fixture(scope="session")
+def zh_tokenizer():
+    pytest.importorskip("jieba")
+    return get_lang_class("zh").Defaults.create_tokenizer()
spacy/tests/lang/zh/__init__.py (new file, 0 lines)

spacy/tests/lang/zh/test_text.py (new file, 25 lines)
@@ -0,0 +1,25 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,match",
+    [
+        ("10", True),
+        ("1", True),
+        ("999.0", True),
+        ("一", True),
+        ("二", True),
+        ("〇", True),
+        ("十一", True),
+        ("狗", False),
+        (",", False),
+    ],
+)
+def test_lex_attrs_like_number(zh_tokenizer, text, match):
+    tokens = zh_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].like_num == match
spacy/tests/lang/zh/test_tokenizer.py (new file, 31 lines)
@@ -0,0 +1,31 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+# fmt: off
+TOKENIZER_TESTS = [
+    ("作为语言而言,为世界使用人数最多的语言,目前世界有五分之一人口做为母语。",
+        ['作为', '语言', '而言', ',', '为', '世界', '使用', '人', '数最多',
+         '的', '语言', ',', '目前', '世界', '有', '五分之一', '人口', '做',
+         '为', '母语', '。']),
+]
+# fmt: on
+
+
+@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
+def test_zh_tokenizer(zh_tokenizer, text, expected_tokens):
+    zh_tokenizer.use_jieba = False
+    tokens = [token.text for token in zh_tokenizer(text)]
+    assert tokens == list(text)
+
+    zh_tokenizer.use_jieba = True
+    tokens = [token.text for token in zh_tokenizer(text)]
+    assert tokens == expected_tokens
+
+
+def test_extra_spaces(zh_tokenizer):
+    # note: three spaces after "I"
+    tokens = zh_tokenizer("I   like cheese.")
+    assert tokens[1].orth_ == "  "