[finnish] Add initial tests for tokenizer

Michael Wallin 2017-02-04 13:47:29 +02:00
parent f9bb25d1cf
commit 1a1952afa5
3 changed files with 25 additions and 1 deletion

@@ -10,6 +10,7 @@ from ..pt import Portuguese
 from ..nl import Dutch
 from ..sv import Swedish
 from ..hu import Hungarian
+from ..fi import Finnish
 from ..tokens import Doc
 from ..strings import StringStore
 from ..lemmatizer import Lemmatizer
@@ -23,7 +24,7 @@ import pytest
 LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch,
-             Swedish, Hungarian]
+             Swedish, Hungarian, Finnish]


 @pytest.fixture(params=LANGUAGES)
@@ -62,6 +63,11 @@ def hu_tokenizer():
     return Hungarian.Defaults.create_tokenizer()


+@pytest.fixture
+def fi_tokenizer():
+    return Finnish.Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def stringstore():
     return StringStore()
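
For orientation (not part of the diff): the new fixture is a thin wrapper over the language defaults, so the Finnish tokenizer can also be exercised directly. A minimal sketch, assuming the spaCy 1.x layout used above, where the relative import ..fi resolves to the spacy.fi module:

    # Standalone sketch, not from this commit; assumes spaCy 1.x,
    # where spacy.fi.Finnish is the language class imported in conftest.py.
    from spacy.fi import Finnish

    tokenizer = Finnish.Defaults.create_tokenizer()
    doc = tokenizer('Paino on n. 2.2 kg')
    print([token.text for token in doc])
    # expected, per the tests below: ['Paino', 'on', 'n.', '2.2', 'kg']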

@@ -0,0 +1,18 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+
+ABBREVIATION_TESTS = [
+    ('Hyvää uutta vuotta t. siht. Niemelä!', ['Hyvää', 'uutta', 'vuotta', 't.', 'siht.', 'Niemelä', '!']),
+    ('Paino on n. 2.2 kg', ['Paino', 'on', 'n.', '2.2', 'kg'])
+]
+
+TESTCASES = ABBREVIATION_TESTS
+
+@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
+def test_tokenizer_handles_testcases(fi_tokenizer, text, expected_tokens):
+    tokens = fi_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
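
Each tuple in TESTCASES becomes one parametrized test case, with whitespace tokens filtered out before the comparison. To run just this module, a sketch using pytest's own entry point; the file path here is hypothetical, since the commit view does not show file names:

    # Standalone sketch, not from this commit; the path below is a guess.
    import pytest

    pytest.main(['spacy/tests/fi/test_tokenizer.py', '-v'])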