From 1a1952afa53c55302288ac3417af3629d98c668d Mon Sep 17 00:00:00 2001
From: Michael Wallin
Date: Sat, 4 Feb 2017 13:47:29 +0200
Subject: [PATCH] [finnish] Add initial tests for tokenizer

---
 spacy/tests/conftest.py          |  8 +++++++-
 spacy/tests/fi/__init__.py       |  0
 spacy/tests/fi/test_tokenizer.py | 18 ++++++++++++++++++
 3 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 spacy/tests/fi/__init__.py
 create mode 100644 spacy/tests/fi/test_tokenizer.py

diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index de7ecae9b..3d9e0adcc 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -10,6 +10,7 @@ from ..pt import Portuguese
 from ..nl import Dutch
 from ..sv import Swedish
 from ..hu import Hungarian
+from ..fi import Finnish
 from ..tokens import Doc
 from ..strings import StringStore
 from ..lemmatizer import Lemmatizer
@@ -23,7 +24,7 @@ import pytest
 
 
 LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch,
-             Swedish, Hungarian]
+             Swedish, Hungarian, Finnish]
 
 
 @pytest.fixture(params=LANGUAGES)
@@ -62,6 +63,11 @@ def hu_tokenizer():
     return Hungarian.Defaults.create_tokenizer()
 
 
+@pytest.fixture
+def fi_tokenizer():
+    return Finnish.Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def stringstore():
     return StringStore()
diff --git a/spacy/tests/fi/__init__.py b/spacy/tests/fi/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/fi/test_tokenizer.py b/spacy/tests/fi/test_tokenizer.py
new file mode 100644
index 000000000..baae9b9a4
--- /dev/null
+++ b/spacy/tests/fi/test_tokenizer.py
@@ -0,0 +1,18 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+ABBREVIATION_TESTS = [
+    ('Hyvää uutta vuotta t. siht. Niemelä!', ['Hyvää', 'uutta', 'vuotta', 't.', 'siht.', 'Niemelä', '!']),
+    ('Paino on n. 2.2 kg', ['Paino', 'on', 'n.', '2.2', 'kg'])
+]
+
+TESTCASES = ABBREVIATION_TESTS
+
+
+@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
+def test_tokenizer_handles_testcases(fi_tokenizer, text, expected_tokens):
+    tokens = fi_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
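
For reference, a minimal standalone sketch of what the new fixture and test exercise, assuming the Finnish language class is importable as spacy.fi.Finnish (as the `from ..fi import Finnish` hunk above implies). It is illustrative only and not part of the patch:

    # encoding: utf8
    # Illustrative sketch, not part of the patch. Assumes spacy.fi.Finnish is
    # importable, as implied by the conftest.py hunk above.
    from __future__ import unicode_literals

    from spacy.fi import Finnish

    # Build the default Finnish tokenizer, the same object the fi_tokenizer
    # fixture returns, and run it on one of the abbreviation test sentences.
    tokenizer = Finnish.Defaults.create_tokenizer()
    doc = tokenizer('Hyvää uutta vuotta t. siht. Niemelä!')
    print([token.text for token in doc if not token.is_space])
    # expected: ['Hyvää', 'uutta', 'vuotta', 't.', 'siht.', 'Niemelä', '!']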