Add Thai (th) tokenizer test

This commit is contained in:
Wannaphong Phatthiyaphaibun 2017-09-21 12:56:58 +07:00
parent 39bb5690f0
commit 1abf472068
2 changed files with 19 additions and 0 deletions

View File

@ -15,6 +15,7 @@ from ..fi import Finnish
from ..bn import Bengali
from ..he import Hebrew
from ..nb import Norwegian
from ..th import Thai
from ..tokens import Doc
@ -101,6 +102,11 @@ def he_tokenizer():
def nb_tokenizer():
    """Return a tokenizer built from the Norwegian language defaults."""
    defaults = Norwegian.Defaults
    return defaults.create_tokenizer()
@pytest.fixture
def th_tokenizer():
    """Return a tokenizer built from the Thai language defaults.

    Tests that request this fixture are skipped when the optional
    ``pythainlp`` package cannot be imported.
    """
    # Called only for its skip-on-ImportError side effect; the module
    # object it returns is not needed in this fixture.
    pytest.importorskip("pythainlp")
    return Thai.Defaults.create_tokenizer()
@pytest.fixture
def stringstore():
    """Return a newly constructed ``StringStore`` instance."""
    store = StringStore()
    return store

View File

@ -0,0 +1,13 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
# Each case is (input text, list of token texts expected from the tokenizer).
TOKENIZER_TESTS = [
    (
        "คุณรักผมไหม",
        ['คุณ', 'รัก', 'ผม', 'ไหม'],
    ),
]
@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS)
def test_thai_tokenizer(th_tokenizer, text, expected_tokens):
    """Check that tokenizing ``text`` yields exactly ``expected_tokens``."""
    doc = th_tokenizer(text)
    observed = [tok.text for tok in doc]
    assert observed == expected_tokens