mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-13 17:52:31 +03:00
This PR adds official support for Haitian Creole (ht) to spaCy's spacy/lang module. It includes: all core language data files for spacy/lang/ht (tokenizer_exceptions.py, punctuation.py, lex_attrs.py, syntax_iterators.py, lemmatizer.py, stop_words.py, tag_map.py); unit tests for the tokenizer and noun chunking (test_tokenizer.py, test_noun_chunking.py, etc.), with all 58 tests I created under spacy/tests/lang/ht passing; basic tokenizer rules adapted for Haitian Creole orthography and informal contractions; a custom like_num attribute supporting Haitian number formats (e.g., "3yèm"); and support for common informal apostrophe usage (e.g., "m'ap", "n'ap", "di'm"). No other language modules were broken, and the code follows spaCy's coding style (PEP8, Black). This provides a foundation for Haitian Creole NLP development using spaCy.
79 lines
1.8 KiB
Python
79 lines
1.8 KiB
Python
from ...attrs import LIKE_NUM, NORM
|
||
|
||
# Cardinal number words in Creole, spelled out in full. `like_num` lowercases
# its input before looking it up here, so every entry is lowercase.
_num_words = set(
    (
        "zewo youn en de twa kat senk sis sèt uit nèf dis "
        "onz douz trèz katoz kenz sèz disèt dizwit diznèf "
        "vent trant karant sinkant swasant swasann-dis "
        "san mil milyon milya"
    ).split()
)
|
||
|
||
# Ordinal number words in Creole (some are French-influenced, some
# simplified). `like_num` lowercases its input before looking it up here.
# NOTE(review): a few spellings (e.g. "sètvyèm", "uitvyèm") look unusual --
# confirm against a Creole reference before relying on them.
_ordinal_words = set(
    (
        "premye dezyèm twazyèm katryèm senkyèm sizyèm sètvyèm uitvyèm nèvyèm dizyèm "
        "onzèm douzyèm trèzyèm katozyèm kenzèm sèzyèm disetyèm dizwityèm diznèvyèm "
        "ventyèm trantyèm karantyèm sinkantyèm swasantyèm "
        "swasann-disyèm santyèm milyèm milyonnyèm milyadyèm"
    ).split()
)
|
||
|
||
# Short forms that appear after an apostrophe, paired with the full word each
# one normalizes to. Both the straight (') and the curly (’) apostrophe
# spellings are generated from this table.
_apostrophe_forms = (
    ("m", "mwen"),
    ("w", "ou"),
    ("l", "li"),
    ("n", "nou"),
    ("y", "yo"),
)

# Bare one-letter forms and their full words. The uppercase variants are
# derived from this table, with the full word capitalized to match.
_bare_forms = (
    ("m", "mwen"),
    ("n", "nou"),
    ("l", "li"),
    ("y", "yo"),
    ("w", "ou"),
    ("t", "te"),
    ("k", "ki"),
    ("p", "pa"),
)

# Exact-match lookup from a contracted token text to its norm. Consulted by
# `norm_custom` before it falls back to plain lowercasing.
NORM_MAP = {
    **{
        apostrophe + short: full
        for apostrophe in ("'", "’")
        for short, full in _apostrophe_forms
    },
    **dict(_bare_forms),
    **{short.upper(): full.capitalize() for short, full in _bare_forms},
}
|
||
|
||
def like_num(text):
    """Return True if *text* looks like a number, otherwise False.

    The text is stripped of surrounding whitespace and lowercased, one
    leading sign character ("+", "-", "±" or "~") is dropped, and every
    "," and "." is removed. What remains counts as number-like when it is:

    * all digits,
    * a fraction of two digit runs around a single "/",
    * a word in `_num_words` or `_ordinal_words`, or
    * digits followed by the ordinal suffix "yèm" (e.g. "3yèm").
    """
    token = text.strip().lower()
    # token[:1] is "" for an empty token, so this is safe on empty input.
    if token[:1] in ("+", "-", "±", "~"):
        token = token[1:]
    token = token.replace(",", "").replace(".", "")

    if token.isdigit():
        return True

    # A second "/" would end up in `denominator` and fail its isdigit() check,
    # so only tokens with exactly one slash can pass.
    numerator, slash, denominator = token.partition("/")
    if slash and numerator.isdigit() and denominator.isdigit():
        return True

    if token in _num_words or token in _ordinal_words:
        return True

    # Digit-prefixed ordinals such as "3yèm", "10yèm", "25yèm".
    return token.endswith("yèm") and token[:-3].isdigit()
|
||
|
||
def norm_custom(text):
    """Return the norm for *text*.

    A text listed in `NORM_MAP` (matched exactly, case included) yields its
    mapped expansion; any other text yields its lowercased form.
    """
    if text in NORM_MAP:
        return NORM_MAP[text]
    return text.lower()
|
||
|
||
# Maps each lexical attribute ID imported from `...attrs` to the function in
# this module that computes it from a token's text.
LEX_ATTRS = {LIKE_NUM: like_num, NORM: norm_custom}
|