mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-13 17:52:31 +03:00
This PR adds official support for Haitian Creole (ht) to spaCy's spacy/lang module. It includes: Added all core language data files for spacy/lang/ht: tokenizer_exceptions.py punctuation.py lex_attrs.py syntax_iterators.py lemmatizer.py stop_words.py tag_map.py Unit tests for tokenizer and noun chunking (test_tokenizer.py, test_noun_chunking.py, etc.). Passed all 58 pytest spacy/tests/lang/ht tests that I've created. Basic tokenizer rules adapted for Haitian Creole orthography and informal contractions. Custom like_num attribute supporting Haitian number formats (e.g., "3yèm"). Support for common informal apostrophe usage (e.g., "m'ap", "n'ap", "di'm"). Ensured no breakages in other language modules. Followed spaCy coding style (PEP8, Black). This provides a foundation for Haitian Creole NLP development using spaCy.
122 lines
3.2 KiB
Python
122 lines
3.2 KiB
Python
from spacy.symbols import ORTH, NORM
|
|
|
|
def make_variants(base, first_norm, second_orth, second_norm):
    """Build tokenizer-exception entries for an apostrophe contraction.

    *base* is a two-part contraction such as ``"m'ap"``. It is split into
    a first token consisting of everything up to and including the
    apostrophe (normalized to *first_norm*) and a second token
    (*second_orth*, normalized to *second_norm*). Entries are returned
    for both the lowercase and the capitalized surface form.
    """
    prefix = base.split("'")[0] + "'"
    lowercase_tokens = [
        {ORTH: prefix, NORM: first_norm},
        {ORTH: second_orth, NORM: second_norm},
    ]
    # Capitalized variant: only the first token's ORTH/NORM are title-cased.
    capitalized_tokens = [
        {ORTH: prefix.capitalize(), NORM: first_norm.capitalize()},
        {ORTH: second_orth, NORM: second_norm},
    ]
    return {
        base: lowercase_tokens,
        base.capitalize(): capitalized_tokens,
    }
# Base exceptions: forms that must never be split (abbreviations etc.).
TOKENIZER_EXCEPTIONS = {"Dr.": [{ORTH: "Dr."}]}
# Apostrophe forms: contracted pronoun + verb-marker pairs, e.g.
# "m'ap" -> "m'" (mwen) + "ap". Each tuple is
# (base form, first-token NORM, second-token ORTH, second-token NORM).
for _base, _first_norm, _second_orth, _second_norm in [
    ("m'ap", "mwen", "ap", "ap"),
    ("n'ap", "nou", "ap", "ap"),
    ("l'ap", "li", "ap", "ap"),
    ("y'ap", "yo", "ap", "ap"),
    ("m'te", "mwen", "te", "te"),
    ("m'pral", "mwen", "pral", "pral"),
    ("w'ap", "ou", "ap", "ap"),
    ("k'ap", "ki", "ap", "ap"),
    ("p'ap", "pa", "ap", "ap"),
    ("t'ap", "te", "ap", "ap"),
]:
    TOKENIZER_EXCEPTIONS.update(
        make_variants(_base, _first_norm, _second_orth, _second_norm)
    )
# Non-apostrophe contractions (with capitalized variants).
# The original hand-written dict repeated the same two-token pattern 22
# times; generating it from a table removes that duplication while
# producing the identical mapping in the identical insertion order.
# Each tuple is (first ORTH, first NORM, second ORTH, second NORM); the
# exception key is the concatenation of the two ORTH values, and the
# capitalized variant title-cases only the first token's ORTH and NORM
# (e.g. "map" -> "m" + "ap", "Map" -> "M" + "ap").
_NON_APOSTROPHE_CONTRACTIONS = [
    ("m", "mwen", "ap", "ap"),
    ("le", "le", "m", "mwen"),
    ("le", "le", "w", "ou"),
    ("n", "nou", "ap", "ap"),
    ("l", "li", "ap", "ap"),
    ("y", "yo", "ap", "ap"),
    ("m", "mwen", "te", "te"),
    ("m", "mwen", "pral", "pral"),
    ("w", "ou", "ap", "ap"),
    ("k", "ki", "ap", "ap"),
    ("t", "te", "ap", "ap"),
]

for _first_orth, _first_norm, _second_orth, _second_norm in (
    _NON_APOSTROPHE_CONTRACTIONS
):
    TOKENIZER_EXCEPTIONS[_first_orth + _second_orth] = [
        {ORTH: _first_orth, NORM: _first_norm},
        {ORTH: _second_orth, NORM: _second_norm},
    ]
    TOKENIZER_EXCEPTIONS[_first_orth.capitalize() + _second_orth] = [
        {ORTH: _first_orth.capitalize(), NORM: _first_norm.capitalize()},
        {ORTH: _second_orth, NORM: _second_norm},
    ]