spaCy/spacy/tests/lang/ht/test_exceptions.py
2025-11-04 15:19:43 +01:00

45 lines
1.0 KiB
Python

import pytest
def test_ht_tokenizer_handles_basic_contraction(ht_tokenizer):
    """Check that apostrophe contractions are split into separate tokens.

    Two inputs are exercised:

    * ``"m'ap ri"`` -- the apostrophe is expected to stay attached to the
      leading piece (``"m'"`` followed by ``"ap"``).
    * ``"mwen di'w non!"`` -- the apostrophe is expected to stay attached to
      the trailing piece (``"di"`` followed by ``"'w"``), and the final
      ``"!"`` is expected to be its own token.
    """
    # Compare the complete list of token texts rather than a length plus one
    # assert per index: the pass/fail condition is the same, but a failure
    # then reports the whole tokenization instead of only the first mismatch.
    tokens = ht_tokenizer("m'ap ri")
    assert [t.text for t in tokens] == ["m'", "ap", "ri"]

    tokens = ht_tokenizer("mwen di'w non!")
    assert [t.text for t in tokens] == ["mwen", "di", "'w", "non", "!"]
@pytest.mark.parametrize("text", ["Dr."])
def test_ht_tokenizer_handles_basic_abbreviation(ht_tokenizer, text):
    """Check that an abbreviation with a trailing period stays one token.

    The tokenizer output must consist of exactly one token whose text is
    the unmodified input string.
    """
    doc = ht_tokenizer(text)
    token_texts = [token.text for token in doc]
    assert token_texts == [text]
def test_ht_tokenizer_full_sentence(ht_tokenizer):
    """Check the tokenization of a full sentence mixing both contraction kinds.

    The sentence contains a trailing-apostrophe piece (``"'m"``), a
    leading-apostrophe piece (``"m'"``), a comma and a final period, each of
    which is expected to come out as a separate token.
    """
    sentence = "Si'm ka vini, m'ap pale ak li."
    expected = ["Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", "."]
    observed = [token.text for token in ht_tokenizer(sentence)]
    assert observed == expected