Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-25 01:16:28 +03:00)
0b9a5f4074
Rework Chinese language initialization

* Create a `ChineseTokenizer` class
* Modify jieba post-processing to handle whitespace correctly
* Modify non-jieba character tokenization to handle whitespace correctly
* Add a `create_tokenizer()` method to `ChineseDefaults`
* Load lexical attributes
* Update Chinese tag_map for UD v2
* Add very basic Chinese tests
* Test tokenization with and without jieba
* Test `like_num` attribute
* Fix `try_jieba_import()`
* Fix zh code formatting
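A minimal usage sketch of the reworked initialization, assuming spaCy 2.x with this commit applied. Per the commit message, the tokenizer uses jieba for word segmentation when it can be imported and falls back to per-character tokenization otherwise; the example sentence is illustrative:

    from spacy.lang.zh import Chinese

    # ChineseDefaults.create_tokenizer() builds the ChineseTokenizer;
    # jieba is used for segmentation if try_jieba_import() succeeds.
    nlp = Chinese()
    doc = nlp("蜘蛛侠是一个英雄")
    print([token.text for token in doc])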
26 lines · 500 B · Python
# coding: utf-8
from __future__ import unicode_literals

import pytest


# Check the like_num lexical attribute against Arabic numerals (including
# a decimal), Chinese numerals, and non-numeric text.
@pytest.mark.parametrize(
    "text,match",
    [
        ("10", True),
        ("1", True),
        ("999.0", True),
        ("一", True),
        ("二", True),
        ("〇", True),
        ("十一", True),
        ("狗", False),
        (",", False),
    ],
)
def test_lex_attrs_like_number(zh_tokenizer, text, match):
    tokens = zh_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].like_num == match
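The `like_num` cases above exercise the lexical attributes this commit loads. A hedged sketch of how such an attribute could be implemented for Chinese follows; `_num_words` and the normalization steps are illustrative assumptions, not the actual spaCy code (which lives in spacy/lang/zh/lex_attrs.py and may differ):

    # Hypothetical like_num for Chinese -- a sketch, not the real code.
    _num_words = set("零〇一二三四五六七八九十百千万亿")

    def like_num(text):
        # Drop thousands separators, e.g. "1,000" -> "1000" (assumption);
        # a bare "," becomes "" and fails every check below.
        text = text.replace(",", "").replace("，", "")
        if text.isdigit():
            return True  # "10", "1"
        # One decimal point with digits on both sides, e.g. "999.0".
        if text.count(".") == 1:
            left, right = text.split(".")
            if left.isdigit() and right.isdigit():
                return True
        # Strings composed entirely of Chinese numeral characters,
        # e.g. "一", "〇", "十一"; "狗" is rejected here.
        return bool(text) and all(ch in _num_words for ch in text)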