mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
[finnish] Add initial tests for tokenizer
This commit is contained in:
parent
f9bb25d1cf
commit
1a1952afa5
|
@ -10,6 +10,7 @@ from ..pt import Portuguese
|
||||||
from ..nl import Dutch
|
from ..nl import Dutch
|
||||||
from ..sv import Swedish
|
from ..sv import Swedish
|
||||||
from ..hu import Hungarian
|
from ..hu import Hungarian
|
||||||
|
from ..fi import Finnish
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
from ..strings import StringStore
|
from ..strings import StringStore
|
||||||
from ..lemmatizer import Lemmatizer
|
from ..lemmatizer import Lemmatizer
|
||||||
|
@ -23,7 +24,7 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch,
|
LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch,
|
||||||
Swedish, Hungarian]
|
Swedish, Hungarian, Finnish]
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(params=LANGUAGES)
|
@pytest.fixture(params=LANGUAGES)
|
||||||
|
@ -62,6 +63,11 @@ def hu_tokenizer():
|
||||||
return Hungarian.Defaults.create_tokenizer()
|
return Hungarian.Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def fi_tokenizer():
|
||||||
|
return Finnish.Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def stringstore():
|
def stringstore():
|
||||||
return StringStore()
|
return StringStore()
|
||||||
|
|
0
spacy/tests/fi/__init__.py
Normal file
0
spacy/tests/fi/__init__.py
Normal file
18
spacy/tests/fi/test_tokenizer.py
Normal file
18
spacy/tests/fi/test_tokenizer.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
# encoding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
ABBREVIATION_TESTS = [
|
||||||
|
('Hyvää uutta vuotta t. siht. Niemelä!', ['Hyvää', 'uutta', 'vuotta', 't.', 'siht.', 'Niemelä', '!']),
|
||||||
|
('Paino on n. 2.2 kg', ['Paino', 'on', 'n.', '2.2', 'kg'])
|
||||||
|
]
|
||||||
|
|
||||||
|
TESTCASES = ABBREVIATION_TESTS
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
|
||||||
|
def test_tokenizer_handles_testcases(fi_tokenizer, text, expected_tokens):
|
||||||
|
tokens = fi_tokenizer(text)
|
||||||
|
token_list = [token.text for token in tokens if not token.is_space]
|
||||||
|
assert expected_tokens == token_list
|
Loading…
Reference in New Issue
Block a user