Parametrize and extend Japanese tokenizer tests

This commit is contained in:
Paul O'Leary McCann 2017-06-29 00:09:40 +09:00
parent 30a34ebb6e
commit c336193392

View File

@@ -3,6 +3,15 @@ from __future__ import unicode_literals
import pytest
def test_japanese_tokenizer(ja_tokenizer):
    """Check that the sample sentence is split into exactly three tokens.

    ``ja_tokenizer`` is supplied by a pytest fixture defined outside this
    file; it is called with the text and its result must support ``len()``.
    """
    doc = ja_tokenizer("日本語だよ")
    token_count = len(doc)
    assert token_count == 3
# (input text, expected token texts) pairs for the Japanese tokenizer.
# The inputs contain no whitespace, so every expected list concatenates back
# to its input text exactly; no expected token is an empty string.
TOKENIZER_TESTS = [
    ("日本語だよ", ['日本語', 'だ', 'よ']),
    (
        "東京タワーの近くに住んでいます。",
        ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。'],
    ),
    ("吾輩は猫である。", ['吾輩', 'は', '猫', 'で', 'ある', '。']),
    (
        "月に代わって、お仕置きよ!",
        ['月', 'に', '代わっ', 'て', '、', 'お仕置き', 'よ', '!'],
    ),
    (
        "すもももももももものうち",
        ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち'],
    ),
]
@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS)
def test_japanese_tokenizer(ja_tokenizer, text, expected_tokens):
    """Compare the tokenizer's token texts for ``text`` with ``expected_tokens``.

    Runs once per ``(text, expected_tokens)`` pair in ``TOKENIZER_TESTS``.
    ``ja_tokenizer`` is supplied by a pytest fixture defined outside this
    file; iterating its result must yield objects with a ``.text`` attribute.
    """
    doc = ja_tokenizer(text)
    actual_tokens = [tok.text for tok in doc]
    assert actual_tokens == expected_tokens