mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 17:06:29 +03:00)
luganda language extension (#10847)
* luganda language extension
* __init__.py changes
* New enhancements
* Lexical attribute changed
* punctuation and sentence additions
* Remove comment header
* Fix typos, reformat
* reformatted version
* Add tokenizer test
* Remove contractions from stop words
* Format
* Add Luganda to website

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
parent 5afa98aabf
commit c09d2fa25b
spacy/lang/lg/__init__.py (new file, 18 lines)
@@ -0,0 +1,18 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from ...language import Language, BaseDefaults


class LugandaDefaults(BaseDefaults):
    lex_attr_getters = LEX_ATTRS
    infixes = TOKENIZER_INFIXES
    stop_words = STOP_WORDS


class Luganda(Language):
    lang = "lg"
    Defaults = LugandaDefaults


__all__ = ["Luganda"]
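For context, a minimal usage sketch (not part of this diff): with this module in place, spacy.blank resolves the "lg" code to the Luganda class defined above.

import spacy

nlp = spacy.blank("lg")  # looks up spacy.lang.lg.Luganda
doc = nlp("Abooluganda ab’emmamba ababiri")
print([token.text for token in doc])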
spacy/lang/lg/examples.py (new file, 17 lines)
@@ -0,0 +1,17 @@
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.lg.examples import sentences
>>> docs = nlp.pipe(sentences)
"""

sentences = [
    "Mpa ebyafaayo ku byalo Nakatu ne Nkajja",
    "Okuyita Ttembo kitegeeza kugwa ddalu",
    "Ekifumu kino kyali kya mulimu ki?",
    "Ekkovu we liyise wayitibwa mukululo",
    "Akola mulimu ki oguvaamu ssente?",
    "Emisumaali egikomerera embaawo giyitibwa nninga",
    "Abooluganda ab’emmamba ababiri",
    "Ekisaawe ky'ebyenjigiriza kya mugaso nnyo",
]
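As the docstring suggests, these sentences are meant to be fed through nlp.pipe; a short sketch, with a blank "lg" pipeline standing in for nlp:

import spacy
from spacy.lang.lg.examples import sentences

nlp = spacy.blank("lg")
for doc in nlp.pipe(sentences):
    print(doc.text, "->", len(doc), "tokens")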
spacy/lang/lg/lex_attrs.py (new file, 95 lines)
@@ -0,0 +1,95 @@
from ...attrs import LIKE_NUM


_num_words = [
    "nnooti",  # zero
    "zeero",  # zero
    "emu",  # one
    "bbiri",  # two
    "ssatu",  # three
    "nnya",  # four
    "ttaano",  # five
    "mukaaga",  # six
    "musanvu",  # seven
    "munaana",  # eight
    "mwenda",  # nine
    "kkumi",  # ten
    "kkumi n'emu",  # eleven
    "kkumi na bbiri",  # twelve
    "kkumi na ssatu",  # thirteen
    "kkumi na nnya",  # fourteen
    "kkumi na ttaano",  # fifteen
    "kkumi na mukaaga",  # sixteen
    "kkumi na musanvu",  # seventeen
    "kkumi na munaana",  # eighteen
    "kkumi na mwenda",  # nineteen
    "amakumi abiri",  # twenty
    "amakumi asatu",  # thirty
    "amakumi ana",  # forty
    "amakumi ataano",  # fifty
    "nkaaga",  # sixty
    "nsanvu",  # seventy
    "kinaana",  # eighty
    "kyenda",  # ninety
    "kikumi",  # hundred
    "lukumi",  # thousand
    "kakadde",  # million
    "kawumbi",  # billion
    "kase",  # trillion
    "katabalika",  # quadrillion
    "keesedde",  # gajillion
    "kafukunya",  # bazillion
    "ekisooka",  # first
    "ekyokubiri",  # second
    "ekyokusatu",  # third
    "ekyokuna",  # fourth
    "ekyokutaano",  # fifth
    "ekyomukaaga",  # sixth
    "ekyomusanvu",  # seventh
    "eky'omunaana",  # eighth
    "ekyomwenda",  # ninth
    "ekyekkumi",  # tenth
    "ekyekkumi n'ekimu",  # eleventh
    "ekyekkumi n'ebibiri",  # twelfth
    "ekyekkumi n'ebisatu",  # thirteenth
    "ekyekkumi n'ebina",  # fourteenth
    "ekyekkumi n'ebitaano",  # fifteenth
    "ekyekkumi n'omukaaga",  # sixteenth
    "ekyekkumi n'omusanvu",  # seventeenth
    "ekyekkumi n'omunaana",  # eighteenth
    "ekyekkumi n'omwenda",  # nineteenth
    "ekyamakumi abiri",  # twentieth
    "ekyamakumi asatu",  # thirtieth
    "ekyamakumi ana",  # fortieth
    "ekyamakumi ataano",  # fiftieth
    "ekyenkaaga",  # sixtieth
    "ekyensanvu",  # seventieth
    "ekyekinaana",  # eightieth
    "ekyekyenda",  # ninetieth
    "ekyekikumi",  # hundredth
    "ekyolukumi",  # thousandth
    "ekyakakadde",  # millionth
    "ekyakawumbi",  # billionth
    "ekyakase",  # trillionth
    "ekyakatabalika",  # quadrillionth
    "ekyakeesedde",  # gajillionth
    "ekyakafukunya",  # bazillionth
]


def like_num(text):
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    text_lower = text.lower()
    if text_lower in _num_words:
        return True
    return False


LEX_ATTRS = {LIKE_NUM: like_num}
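A quick sanity check of the like_num logic above (a sketch, not part of the diff): plain digits, signed or comma-formatted numbers, simple fractions, and the listed Luganda number words all count as number-like.

from spacy.lang.lg.lex_attrs import like_num

assert like_num("42")            # plain digits
assert like_num("-3,000.5")      # leading sign stripped; "," and "." removed
assert like_num("3/4")           # numerator and denominator are both digits
assert like_num("KKUMI")         # case-insensitive lookup of "kkumi" (ten)
assert not like_num("ekisaawe")  # ordinary word, not in _num_words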
spacy/lang/lg/punctuation.py (new file, 19 lines)
@@ -0,0 +1,19 @@
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA

_infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
    ]
)


TOKENIZER_INFIXES = _infixes
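To illustrate the infix rules (a sketch, assuming a blank "lg" pipeline picks up TOKENIZER_INFIXES via LugandaDefaults): hyphens and commas between alphabetic characters become separate tokens.

import spacy

nlp = spacy.blank("lg")
print([t.text for t in nlp("emu-bbiri")])  # expected: ['emu', '-', 'bbiri']
print([t.text for t in nlp("emu,bbiri")])  # expected: ['emu', ',', 'bbiri']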
spacy/lang/lg/stop_words.py (new file, 19 lines)
@@ -0,0 +1,19 @@
STOP_WORDS = set(
    """
abadde abalala abamu abangi abava ajja ali alina ani anti ateekeddwa atewamu
atya awamu aweebwa ayinza ba baali babadde babalina bajja
bajjanewankubade bali balina bandi bangi bano bateekeddwa baweebwa bayina bebombi beera bibye
bimu bingi bino bo bokka bonna buli bulijjo bulungi bwabwe bwaffe bwayo bwe bwonna bya byabwe
byaffe byebimu byonna ddaa ddala ddi e ebimu ebiri ebweruobulungi ebyo edda ejja ekirala ekyo
endala engeri ennyo era erimu erina ffe ffenna ga gujja gumu gunno guno gwa gwe kaseera kati
kennyini ki kiki kikino kikye kikyo kino kirungi kki ku kubangabyombi kubangaolwokuba kudda
kuva kuwa kwegamba kyaffe kye kyekimuoyo kyekyo kyonna leero liryo lwa lwaki lyabwezaabwe
lyaffe lyange mbadde mingi mpozzi mu mulinaoyina munda mwegyabwe nolwekyo nabadde nabo nandiyagadde
nandiye nanti naye ne nedda neera nga nnyingi nnyini nnyinza nnyo nti nyinza nze oba ojja okudda
okugenda okuggyako okutuusa okuva okuwa oli olina oluvannyuma olwekyobuva omuli ono osobola otya
oyina oyo seetaaga si sinakindi singa talina tayina tebaali tebaalina tebayina terina tetulina
tetuteekeddwa tewali teyalina teyayina tolina tu tuyina tulina tuyina twafuna twetaaga wa wabula
wabweru wadde waggulunnina wakati waliwobangi waliyo wandi wange wano wansi weebwa yabadde yaffe
ye yenna yennyini yina yonna ziba zijja zonna
""".split()
)
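These entries surface as token.is_stop once the defaults are loaded; a minimal sketch:

import spacy

nlp = spacy.blank("lg")
doc = nlp("Mpa ebyafaayo ku byalo")
print([(t.text, t.is_stop) for t in doc])  # "ku" is in STOP_WORDS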
@@ -261,6 +261,11 @@ def lb_tokenizer():
     return get_lang_class("lb")().tokenizer


+@pytest.fixture(scope="session")
+def lg_tokenizer():
+    return get_lang_class("lg")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def lt_tokenizer():
     return get_lang_class("lt")().tokenizer
spacy/tests/lang/lg/__init__.py (new file, empty)
spacy/tests/lang/lg/test_tokenizer.py (new file, 15 lines)
@@ -0,0 +1,15 @@
import pytest

LG_BASIC_TOKENIZATION_TESTS = [
    (
        "Abooluganda ab’emmamba ababiri",
        ["Abooluganda", "ab’emmamba", "ababiri"],
    ),
]


@pytest.mark.parametrize("text,expected_tokens", LG_BASIC_TOKENIZATION_TESTS)
def test_lg_tokenizer_basic(lg_tokenizer, text, expected_tokens):
    tokens = lg_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list
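The same check can be reproduced outside pytest; a standalone sketch using the get_lang_class helper the fixture relies on:

from spacy.util import get_lang_class

tokenizer = get_lang_class("lg")().tokenizer
tokens = [t.text for t in tokenizer("Abooluganda ab’emmamba ababiri") if not t.is_space]
assert tokens == ["Abooluganda", "ab’emmamba", "ababiri"]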
@@ -265,6 +265,11 @@
         "name": "Luxembourgish",
         "has_examples": true
     },
+    {
+        "code": "lg",
+        "name": "Luganda",
+        "has_examples": true
+    },
     {
         "code": "lij",
         "name": "Ligurian",