Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 01:48:04 +03:00)

	[finnish] Add initial tests for tokenizer
This commit is contained in:
    parent f9bb25d1cf
    commit 1a1952afa5

@@ -10,6 +10,7 @@ from ..pt import Portuguese
 from ..nl import Dutch
 from ..sv import Swedish
 from ..hu import Hungarian
+from ..fi import Finnish
 from ..tokens import Doc
 from ..strings import StringStore
 from ..lemmatizer import Lemmatizer
@@ -23,7 +24,7 @@ import pytest
 
 
 LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch,
-             Swedish, Hungarian]
+             Swedish, Hungarian, Finnish]
 
 
 @pytest.fixture(params=LANGUAGES)
@@ -62,6 +63,11 @@ def hu_tokenizer():
     return Hungarian.Defaults.create_tokenizer()
 
 
+@pytest.fixture
+def fi_tokenizer():
+    return Finnish.Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def stringstore():
     return StringStore()
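
The new fi_tokenizer fixture is resolved by name: any test function that lists fi_tokenizer as a parameter receives a tokenizer built via Finnish.Defaults.create_tokenizer(). A minimal sketch of a consuming test (the test name and sample sentence are illustrative, not part of this commit):

    def test_fi_tokenizer_produces_tokens(fi_tokenizer):
        # fi_tokenizer is injected by pytest from the conftest fixture added above
        # (hypothetical test, for illustration only).
        tokens = fi_tokenizer('Hyvää huomenta!')
        # Each entry is a Token exposing .text and .is_space, as used by the tests below.
        assert all(hasattr(token, 'text') for token in tokens)
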
New files:
    spacy/tests/fi/__init__.py        (0 lines)
    spacy/tests/fi/test_tokenizer.py  (18 lines)

spacy/tests/fi/test_tokenizer.py (new file)
@@ -0,0 +1,18 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+ABBREVIATION_TESTS = [
+    ('Hyvää uutta vuotta t. siht. Niemelä!', ['Hyvää', 'uutta', 'vuotta', 't.', 'siht.', 'Niemelä', '!']),
+    ('Paino on n. 2.2 kg', ['Paino', 'on', 'n.', '2.2', 'kg'])
+]
+
+TESTCASES = ABBREVIATION_TESTS
+
+
+@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
+def test_tokenizer_handles_testcases(fi_tokenizer, text, expected_tokens):
+    tokens = fi_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
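
For a quick manual check outside pytest, the tokenizer that the fi_tokenizer fixture builds can be constructed directly. A minimal sketch, assuming the Finnish language data from the parent commit is importable as spacy.fi (illustrative, not part of this commit):

    from spacy.fi import Finnish

    # Build the Finnish tokenizer the same way the fi_tokenizer fixture does.
    tokenizer = Finnish.Defaults.create_tokenizer()
    doc = tokenizer('Paino on n. 2.2 kg')
    # Per the ABBREVIATION_TESTS case above, this should print
    # ['Paino', 'on', 'n.', '2.2', 'kg'].
    print([token.text for token in doc])
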