mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-13 17:52:31 +03:00
This PR adds official support for Haitian Creole (ht) to spaCy's spacy/lang module. It includes: all core language data files for spacy/lang/ht (tokenizer_exceptions.py, punctuation.py, lex_attrs.py, syntax_iterators.py, lemmatizer.py, stop_words.py, tag_map.py); unit tests for the tokenizer and noun chunking (test_tokenizer.py, test_noun_chunking.py, etc.), with all 58 tests that I created under spacy/tests/lang/ht passing with pytest; basic tokenizer rules adapted for Haitian Creole orthography and informal contractions; a custom like_num attribute supporting Haitian number formats (e.g., "3yèm"); and support for common informal apostrophe usage (e.g., "m'ap", "n'ap", "di'm"). I ensured there are no breakages in other language modules and followed spaCy coding style (PEP8, Black). This provides a foundation for Haitian Creole NLP development using spaCy.
52 lines
1.6 KiB
Python
52 lines
1.6 KiB
Python
from typing import List, Tuple
|
|
|
|
from ...pipeline import Lemmatizer
|
|
from ...tokens import Token
|
|
from ...lookups import Lookups
|
|
|
|
|
|
class HaitianCreoleLemmatizer(Lemmatizer):
    """Minimal Haitian Creole lemmatizer.

    The rule mode currently consists of a single fallback rule: the lemma
    of a token is its lowercased text. ``is_base_form`` reports whether a
    token already looks like a base form, judged by its coarse-grained
    part of speech and its morphological features.
    """

    def is_base_form(self, token: Token) -> bool:
        """Return whether ``token`` is considered to be in its base form.

        Only tokens whose lowercased ``pos_`` is ``noun``, ``verb``, ``adj``
        or ``adv`` can qualify. Such a token is a base form when it has no
        morphological features at all, or when it is a singular noun, an
        infinitive verb or a positive-degree adjective.

        token (Token): The token to inspect.
        RETURNS (bool): True if the token counts as a base form.
        """
        # For each POS, the single feature/value pair that marks a base form.
        # Adverbs have no entry: they only qualify when unmarked.
        base_feature_by_pos = {
            "noun": ("Number", "Sing"),
            "verb": ("VerbForm", "Inf"),
            "adj": ("Degree", "Pos"),
        }
        upos = token.pos_.lower()
        if upos not in {"noun", "verb", "adj", "adv"}:
            return False

        morph = token.morph.to_dict()
        # Consider unmarked forms to be base
        if not morph:
            return True
        if upos not in base_feature_by_pos:
            return False
        feature, base_value = base_feature_by_pos[upos]
        return morph.get(feature) == base_value

    def rule_lemmatize(self, token: Token) -> List[str]:
        """Return the candidate lemmas for ``token`` in rule mode.

        Results are memoized in ``self.cache`` under the key
        ``(token.orth, token.pos)``; the cached list object itself is
        returned on later calls.

        token (Token): The token to lemmatize.
        RETURNS (List[str]): A one-element list holding the lowercased
            token text.
        """
        cache_key = (token.orth, token.pos)
        if cache_key not in self.cache:
            # fallback rule (the only rule so far): just use the lowercased form
            self.cache[cache_key] = [token.text.lower()]
        return self.cache[cache_key]

    @classmethod
    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
        """Return the lookup table names to use for the given mode.

        For ``"rule"`` mode the first list names four tables and the second
        list is empty; every other mode is delegated to the parent class.

        mode (str): The lemmatizer mode.
        RETURNS (Tuple[List[str], List[str]]): Two lists of table names;
            presumably (required, optional) as in the parent class --
            confirm against the spaCy Lemmatizer API.
        """
        if mode != "rule":
            return super().get_lookups_config(mode)
        # NOTE(review): rule_lemmatize in this class does not read any of
        # these tables -- confirm whether they really need to be required.
        required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
        return (required, [])
|