Add Thai (th) tokenizer test

This commit is contained in:
Wannaphong Phatthiyaphaibun 2017-09-21 12:56:58 +07:00
parent 39bb5690f0
commit 1abf472068
2 changed files with 19 additions and 0 deletions

View File

@ -15,6 +15,7 @@ from ..fi import Finnish
from ..bn import Bengali from ..bn import Bengali
from ..he import Hebrew from ..he import Hebrew
from ..nb import Norwegian from ..nb import Norwegian
from ..th import Thai
from ..tokens import Doc from ..tokens import Doc
@ -101,6 +102,11 @@ def he_tokenizer():
def nb_tokenizer(): def nb_tokenizer():
return Norwegian.Defaults.create_tokenizer() return Norwegian.Defaults.create_tokenizer()
@pytest.fixture
def th_tokenizer():
    """Provide the tokenizer created by ``Thai.Defaults.create_tokenizer()``.

    Any test requesting this fixture is skipped when the optional
    ``pythainlp`` package cannot be imported.
    """
    # Called only for its skip-on-ImportError effect; the returned module is
    # not needed here (presumably the Thai tokenizer imports it itself --
    # confirm against the ``..th`` package).
    pytest.importorskip("pythainlp")
    return Thai.Defaults.create_tokenizer()
@pytest.fixture @pytest.fixture
def stringstore(): def stringstore():
return StringStore() return StringStore()

View File

@ -0,0 +1,13 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
# Each entry is (input text, list of expected token texts); consumed by the
# parametrized tokenizer test in this file.
TOKENIZER_TESTS = [
    ("คุณรักผมไหม", ["คุณ", "รัก", "ผม", "ไหม"]),
]
@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS)
def test_thai_tokenizer(th_tokenizer, text, expected_tokens):
    """Check that tokenizing ``text`` yields exactly ``expected_tokens``.

    The comparison is made on each produced token's ``text`` attribute, in
    order.
    """
    doc = th_tokenizer(text)
    actual_tokens = [tok.text for tok in doc]
    assert actual_tokens == expected_tokens