mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	* luganda language extension * __init__.py changes * New enhancements * Lexical attribute changed * punctuation and sentence additions * Remove comment header * Fix typos, reformat * reformatted version * Add tokenizer test * Remove contractions from stop words * Format * Add Luganda to website Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
		
			
				
	
	
		
			96 lines
		
	
	
		
			2.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			96 lines
		
	
	
		
			2.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
from ...attrs import LIKE_NUM
 | 
						|
 | 
						|
# Spelled-out cardinal numerals, each glossed in English.
_cardinal_words = [
    "nnooti",  # zero
    "zeero",  # zero
    "emu",  # one
    "bbiri",  # two
    "ssatu",  # three
    "nnya",  # four
    "ttaano",  # five
    "mukaaga",  # six
    "musanvu",  # seven
    "munaana",  # eight
    "mwenda",  # nine
    "kkumi",  # ten
    "kkumi n'emu",  # eleven
    "kkumi na bbiri",  # twelve
    "kkumi na ssatu",  # thirteen
    "kkumi na nnya",  # fourteen
    "kkumi na ttaano",  # fifteen
    "kkumi na mukaaga",  # sixteen
    "kkumi na musanvu",  # seventeen
    "kkumi na munaana",  # eighteen
    "kkumi na mwenda",  # nineteen
    "amakumi abiri",  # twenty
    "amakumi asatu",  # thirty
    "amakumi ana",  # forty
    "amakumi ataano",  # fifty
    "nkaaga",  # sixty
    "nsanvu",  # seventy
    "kinaana",  # eighty
    "kyenda",  # ninety
    "kikumi",  # hundred
    "lukumi",  # thousand
    "kakadde",  # million
    "kawumbi",  # billion
    "kase",  # trillion
    "katabalika",  # quadrillion
    "keesedde",  # gajillion
    "kafukunya",  # bazillion
]

# Spelled-out ordinal numerals, each glossed in English.
_ordinal_words = [
    "ekisooka",  # first
    "ekyokubiri",  # second
    "ekyokusatu",  # third
    "ekyokuna",  # fourth
    "ekyokutaano",  # fifth
    "ekyomukaaga",  # sixth
    "ekyomusanvu",  # seventh
    "eky'omunaana",  # eighth
    "ekyomwenda",  # ninth
    "ekyekkumi",  # tenth
    "ekyekkumi n'ekimu",  # eleventh
    "ekyekkumi n'ebibiri",  # twelfth
    "ekyekkumi n'ebisatu",  # thirteenth
    "ekyekkumi n'ebina",  # fourteenth
    "ekyekkumi n'ebitaano",  # fifteenth
    "ekyekkumi n'omukaaga",  # sixteenth
    "ekyekkumi n'omusanvu",  # seventeenth
    "ekyekkumi n'omunaana",  # eighteenth
    "ekyekkumi n'omwenda",  # nineteenth
    "ekyamakumi abiri",  # twentieth
    "ekyamakumi asatu",  # thirtieth
    "ekyamakumi ana",  # fortieth
    "ekyamakumi ataano",  # fiftieth
    "ekyenkaaga",  # sixtieth
    "ekyensanvu",  # seventieth
    "ekyekinaana",  # eightieth
    "ekyekyenda",  # ninetieth
    "ekyekikumi",  # hundredth
    "ekyolukumi",  # thousandth
    "ekyakakadde",  # millionth
    "ekyakawumbi",  # billionth
    "ekyakase",  # trillionth
    "ekyakatabalika",  # quadrillionth
    "ekyakeesedde",  # gajillionth
    "ekyakafukunya",  # bazillionth
]

# All number words, lower-case: the cardinals first, then the ordinals.
_num_words = _cardinal_words + _ordinal_words
 | 
						|
 | 
						|
 | 
						|
def like_num(text):
    """Return True if `text` looks like a number, otherwise False.

    A string counts as number-like when, after dropping one leading sign
    character ("+", "-", "±" or "~") and every "," and ".", it is either
    all digits, a fraction of the form digits/digits, or (compared in
    lower case) one of the entries of `_num_words`.
    """
    # Strip at most one leading sign / approximation marker.
    candidate = text[1:] if text.startswith(("+", "-", "±", "~")) else text
    # Separators such as "1,000" or "3.14" should not prevent a match.
    for separator in (",", "."):
        candidate = candidate.replace(separator, "")
    if candidate.isdigit():
        return True
    # Exactly one "/" yields exactly two parts: numerator and denominator.
    parts = candidate.split("/")
    if len(parts) == 2 and all(part.isdigit() for part in parts):
        return True
    # Fall back to the spelled-out numerals, matched case-insensitively.
    return candidate.lower() in _num_words
 | 
						|
 | 
						|
 | 
						|
# Lexical attribute getters defined by this module: maps the LIKE_NUM
# attribute to the like_num() predicate above.
LEX_ATTRS = {LIKE_NUM: like_num}
 |