import pytest

# Parametrization data for the basic tokenizer test: each entry is a
# (raw sentence, expected token texts) pair. In the case below the
# abbreviation "resp." is expected to stay one token, while the comma and
# the sentence-final period are expected as separate tokens.
HSB_BASIC_TOKENIZATION_TESTS = [
    (
        "Hornjoserbšćina wobsteji resp. wobsteješe z wjacorych dialektow, "
        "kotrež so zdźěla chětro wot so rozeznawachu.",
        [
            "Hornjoserbšćina", "wobsteji", "resp.", "wobsteješe",
            "z", "wjacorych", "dialektow", ",",
            "kotrež", "so", "zdźěla", "chětro",
            "wot", "so", "rozeznawachu", ".",
        ],
    ),
]


@pytest.mark.parametrize("text,expected_tokens", HSB_BASIC_TOKENIZATION_TESTS)
def test_hsb_tokenizer_basic(hsb_tokenizer, text, expected_tokens):
    """Check that tokenizing ``text`` yields exactly ``expected_tokens``.

    The ``hsb_tokenizer`` fixture is called on the raw text; tokens flagged
    as whitespace (``is_space``) are dropped before the comparison, so only
    the texts of the remaining tokens are compared, in order.
    """
    doc = hsb_tokenizer(text)
    observed = []
    for tok in doc:
        if tok.is_space:
            # Whitespace tokens are not part of the expected output.
            continue
        observed.append(tok.text)
    assert expected_tokens == observed