diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 6e00b1513..c9652b08d 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -15,6 +15,7 @@ from ..fi import Finnish from ..bn import Bengali from ..he import Hebrew from ..nb import Norwegian +from ..th import Thai from ..tokens import Doc @@ -101,6 +102,11 @@ def he_tokenizer(): def nb_tokenizer(): return Norwegian.Defaults.create_tokenizer() +@pytest.fixture +def th_tokenizer(): + pytest.importorskip("pythainlp") + return Thai.Defaults.create_tokenizer() + @pytest.fixture def stringstore(): return StringStore() diff --git a/spacy/tests/th/test_tokenizer.py b/spacy/tests/th/test_tokenizer.py new file mode 100644 index 000000000..851c6f067 --- /dev/null +++ b/spacy/tests/th/test_tokenizer.py @@ -0,0 +1,13 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +TOKENIZER_TESTS = [ + ("คุณรักผมไหม", ['คุณ', 'รัก', 'ผม', 'ไหม']) +] + +@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS) +def test_thai_tokenizer(th_tokenizer, text, expected_tokens): + tokens = [token.text for token in th_tokenizer(text)] + assert tokens == expected_tokens