mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-13 17:52:31 +03:00
This PR adds official support for Haitian Creole (ht) to spaCy's spacy/lang module. It includes: All core language data files for spacy/lang/ht: tokenizer_exceptions.py, punctuation.py, lex_attrs.py, syntax_iterators.py, lemmatizer.py, stop_words.py, tag_map.py. Unit tests for the tokenizer and noun chunking (test_tokenizer.py, test_noun_chunking.py, etc.); all 58 tests that I created pass under pytest spacy/tests/lang/ht. Basic tokenizer rules adapted for Haitian Creole orthography and informal contractions. A custom like_num attribute supporting Haitian number formats (e.g., "3yèm"). Support for common informal apostrophe usage (e.g., "m'ap", "n'ap", "di'm"). Ensured no breakages in other language modules. Followed spaCy coding style (PEP8, Black). This provides a foundation for Haitian Creole NLP development using spaCy.
53 lines
1.4 KiB
Python
53 lines
1.4 KiB
Python
from typing import Callable, Optional
|
|
|
|
from thinc.api import Model
|
|
|
|
from ...language import BaseDefaults, Language
|
|
from .lemmatizer import HaitianCreoleLemmatizer
|
|
from .lex_attrs import LEX_ATTRS
|
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
|
from .stop_words import STOP_WORDS
|
|
from .syntax_iterators import SYNTAX_ITERATORS
|
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|
from .tag_map import TAG_MAP
|
|
|
|
|
|
class HaitianCreoleDefaults(BaseDefaults):
    """Defaults for Haitian Creole, bound to this package's data modules.

    Each attribute is assigned the constant of the matching name imported
    from a sibling module (tokenizer_exceptions, punctuation, lex_attrs,
    syntax_iterators, stop_words, tag_map).
    """

    # Tokenizer rules: special cases plus prefix/suffix/infix patterns.
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES

    # Lexical resources.
    stop_words = STOP_WORDS
    lex_attr_getters = LEX_ATTRS
    tag_map = TAG_MAP

    # Doc-level iterators supplied by syntax_iterators.py.
    syntax_iterators = SYNTAX_ITERATORS
|
|
|
|
class HaitianCreole(Language):
    """Language subclass for Haitian Creole, identified by the code "ht"."""

    Defaults = HaitianCreoleDefaults
    lang = "ht"
|
|
|
|
@HaitianCreole.factory(
    "lemmatizer",
    default_score_weights={"lemma_acc": 1.0},
    default_config={
        "model": None,
        "mode": "rule",
        "overwrite": False,
        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
    },
    assigns=["token.lemma"],
)
def make_lemmatizer(
    nlp: Language,
    model: Optional[Model],
    name: str,
    mode: str,
    overwrite: bool,
    scorer: Optional[Callable],
):
    """Construct the Haitian Creole lemmatizer for the "lemmatizer" factory.

    nlp: Language object whose vocab is handed to the lemmatizer.
    model: Optional model, forwarded positionally to the lemmatizer.
    name: Component name, forwarded positionally to the lemmatizer.
    mode: Lemmatizer mode (the factory's default config uses "rule").
    overwrite: Forwarded as the lemmatizer's `overwrite` option.
    scorer: Optional scoring callable, forwarded as `scorer`.
    RETURNS: A HaitianCreoleLemmatizer instance.
    """
    # Keyword-only settings are gathered first, then passed through unchanged.
    options = {"mode": mode, "overwrite": overwrite, "scorer": scorer}
    return HaitianCreoleLemmatizer(nlp.vocab, model, name, **options)
|
|
|
|
# Names exported by `from <this module> import *`: only the Language subclass.
__all__ = ["HaitianCreole"]
|