# coding: utf-8
from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize("text", ["(under)"])
def test_tokenizer_splits_no_special(sv_tokenizer, text):
    tokens = sv_tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.parametrize("text", ["gitta'r", "Björn's", "Lars'"])
def test_tokenizer_handles_no_punct(sv_tokenizer, text):
    tokens = sv_tokenizer(text)
    assert len(tokens) == 1


@pytest.mark.parametrize("text", ["svart.Gul", "Hej.Världen"])
def test_tokenizer_splits_period_infix(sv_tokenizer, text):
    tokens = sv_tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.parametrize("text", ["Hej,Världen", "en,två"])
def test_tokenizer_splits_comma_infix(sv_tokenizer, text):
    tokens = sv_tokenizer(text)
    assert len(tokens) == 3
    assert tokens[0].text == text.split(",")[0]
    assert tokens[1].text == ","
    assert tokens[2].text == text.split(",")[1]


@pytest.mark.parametrize("text", ["svart...Gul", "svart...gul"])
def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text):
    tokens = sv_tokenizer(text)
    assert len(tokens) == 3