spaCy/spacy/tests/lang/ht/test_text.py
Jeff Adolphe 41e07772dc
Added Haitian Creole (ht) Language Support to spaCy (#13807)
This PR adds official support for Haitian Creole (ht) to spaCy's spacy/lang module.
It includes:

    Added all core language data files for spacy/lang/ht:
        tokenizer_exceptions.py
        punctuation.py
        lex_attrs.py
        syntax_iterators.py
        lemmatizer.py
        stop_words.py
        tag_map.py

    Unit tests for the tokenizer and noun chunking (test_tokenizer.py, test_noun_chunking.py, etc.); all 58 tests under spacy/tests/lang/ht pass.

    Basic tokenizer rules adapted for Haitian Creole orthography and informal contractions.

    Custom like_num attribute supporting Haitian number formats (e.g., "3yèm"); see the sketch after this summary.

    Support for common informal apostrophe usage (e.g., "m'ap", "n'ap", "di'm").

    Verified no regressions in other language modules.

    Followed spaCy coding style (PEP8, Black).

This provides a foundation for Haitian Creole NLP development using spaCy.
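
The lex_attrs implementation itself isn't shown on this page, but as a rough sketch of the like_num logic the tests below exercise (the number-word list and exact checks here are assumptions; the real code lives in spacy/lang/ht/lex_attrs.py and may differ):

    # Illustrative subset of Haitian Creole number words; the real list may differ.
    _num_words = ["en", "de", "twa", "kat", "senk", "sis", "dis", "onz", "san", "mil", "milyon", "milya"]

    def like_num(text: str) -> bool:
        # Normalize digit grouping and decimal marks ("10,000", "999.0")
        text = text.replace(",", "").replace(".", "")
        if text.isdigit():
            return True
        # Fractions such as "1/2"
        if text.count("/") == 1:
            num, denom = text.split("/")
            if num.isdigit() and denom.isdigit():
                return True
        # Ordinals such as "3yèm", "ventyèm", "Santyèm"
        if text.lower().endswith("yèm"):
            return True
        return text.lower() in _num_words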
2025-05-28 17:23:38 +02:00


import pytest

from spacy.lang.ht.lex_attrs import like_num, norm_custom


def test_ht_tokenizer_handles_long_text(ht_tokenizer):
    text = """Onè ap fèt pou ansyen lidè Pati Travayè Britanik
Moun atravè lemond ap voye onè pou ansyen lidè
Pati Travayè a, John Smith, ki mouri pi bonè jodi a apre li te fè yon gwo kriz kadyak a laj 55 an.
Nan Washington, Depatman Deta Etazini pibliye yon deklarasyon ki eksprime "regre lanmò twò bonè" avoka ak palmantè eskoze a.
"Misye Smith, pandan tout karyè li ki te make ak distenksyon"""
    tokens = ht_tokenizer(text)
    assert len(tokens) == 84
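
# Note: the ht_tokenizer fixture is expected to come from the shared test
# conftest (spacy/tests/conftest.py), presumably along these lines:
#
#     from spacy.util import get_lang_class
#
#     @pytest.fixture(scope="session")
#     def ht_tokenizer():
#         return get_lang_class("ht")().tokenizer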


@pytest.mark.parametrize(
    "text,length",
    [
        ("Map manje gato a pandan map gade televizyon lem lakay mwen.", 15),
        ("M'ap vini, eske wap la avek lajan'm? Si oui, di'l non pou fre'w.", 22),
        ("M ap teste sa (pou kounye a).", 10),
    ],
)
def test_ht_tokenizer_handles_cnts(ht_tokenizer, text, length):
    tokens = ht_tokenizer(text)
    assert len(tokens) == length
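
# The counts above depend on contractions such as "m'ap" and "di'l" being
# split into separate tokens. A minimal sketch of how such splits are
# typically declared as spaCy tokenizer exceptions (the actual entries live
# in spacy/lang/ht/tokenizer_exceptions.py and may differ):
#
#     from spacy.symbols import ORTH, NORM
#
#     TOKENIZER_EXCEPTIONS = {
#         "m'ap": [{ORTH: "m'", NORM: "mwen"}, {ORTH: "ap"}],
#         "di'l": [{ORTH: "di"}, {ORTH: "'l", NORM: "li"}],
#     }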


@pytest.mark.parametrize(
    "text,match",
    [
        ("10", True),
        ("1", True),
        ("10,000", True),
        ("10,00", True),
        ("999.0", True),
        ("en", True),
        ("de", True),
        ("milya", True),
        ("dog", False),
        (",", False),
        ("1/2", True),
    ],
)
def test_lex_attrs_like_number(ht_tokenizer, text, match):
    tokens = ht_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].like_num == match


@pytest.mark.parametrize(
    "word", ["ventyèm", "Milyonnyèm", "3yèm", "Santyèm", "25yèm", "52yèm"]
)
def test_ht_lex_attrs_like_number_for_ordinal(word):
    assert like_num(word)


@pytest.mark.parametrize("word", ["onz"])
def test_ht_lex_attrs_capitals(word):
    assert like_num(word)
    assert like_num(word.upper())


@pytest.mark.parametrize(
    "word, expected",
    [
        ("'m", "mwen"),
        ("'n", "nou"),
        ("'l", "li"),
        ("'y", "yo"),
        ("'w", "ou"),
    ],
)
def test_ht_lex_attrs_norm_custom(word, expected):
    assert norm_custom(word) == expected
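
# A minimal sketch of the mapping these assertions imply (the actual
# norm_custom in spacy/lang/ht/lex_attrs.py may differ):
#
#     _CONTRACTION_NORMS = {"'m": "mwen", "'n": "nou", "'l": "li", "'y": "yo", "'w": "ou"}
#
#     def norm_custom(text: str) -> str:
#         return _CONTRACTION_NORMS.get(text, text)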