mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 17:54:39 +03:00
Parametrize and extend Japanese tokenizer tests
This commit is contained in:
parent
30a34ebb6e
commit
c336193392
|
@ -3,6 +3,15 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
# (input text, expected token texts) pairs driving the parametrized
# Japanese tokenizer test below. Reconstructed from the new side of the
# diff; the old hard-coded single-case test interleaved here was
# superseded by this table.
TOKENIZER_TESTS = [
    ("日本語だよ", ['日本語', 'だ', 'よ']),
    ("東京タワーの近くに住んでいます。", ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']),
    ("吾輩は猫である。", ['吾輩', 'は', '猫', 'で', 'ある', '。']),
    ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お仕置き', 'よ', '!']),
    ("すもももももももものうち", ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち'])
]
||||||
|
@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS)
def test_japanese_tokenizer(ja_tokenizer, text, expected_tokens):
    """Check that the Japanese tokenizer splits ``text`` into ``expected_tokens``.

    ``ja_tokenizer`` is a pytest fixture (provided by the test suite's
    conftest — not visible in this file) that returns the spaCy Japanese
    tokenizer. Each case comes from the module-level ``TOKENIZER_TESTS``
    table. Reconstructed from the new side of the garbled diff view.
    """
    # Compare surface forms only; other token attributes are not asserted.
    tokens = [token.text for token in ja_tokenizer(text)]
    assert tokens == expected_tokens
Loading…
Reference in New Issue
Block a user