mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	Added files to Setswana Language
Add South African Setswana Language
This commit is contained in:
		
							parent
							
								
									24046fef17
								
							
						
					
					
						commit
						f6be28cfb2
					
				
							
								
								
									
										19
									
								
								spacy/lang/tn/examples.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								spacy/lang/tn/examples.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,19 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| 
 | ||||
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.tn.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


# Example sentences for this language package; the usage snippet above feeds
# this list to ``nlp.pipe``.
sentences = [
    "Apple e nyaka go reka JSE ka tlhwatlhwa ta R1 billion",
    "Johannesburg ke toropo e kgolo mo Afrika Borwa.",
    "O ko kae?",
    "ke mang presidente ya Afrika Borwa?",
    "ke eng toropo kgolo ya Afrika Borwa?",
    "Nelson Mandela o belegwe leng?",
]
							
								
								
									
										110
									
								
								spacy/lang/tn/lex_attrs.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										110
									
								
								spacy/lang/tn/lex_attrs.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,110 @@ | |||
| # coding: utf8 | ||||
| 
 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from ...attrs import LIKE_NUM | ||||
| 
 | ||||
| _num_words = [ | ||||
|     "lefela", | ||||
|     "nngwe", | ||||
|     "pedi", | ||||
|     "tharo", | ||||
|     "nne", | ||||
|     "tlhano", | ||||
|     "thataro", | ||||
|     "supa", | ||||
|     "robedi", | ||||
|     "robongwe", | ||||
|     "lesome", | ||||
|     "lesomenngwe", | ||||
|     "lesomepedi", | ||||
|     "sometharo", | ||||
|     "somenne", | ||||
|     "sometlhano", | ||||
|     "somethataro", | ||||
|     "somesupa", | ||||
|     "somerobedi", | ||||
|     "somerobongwe", | ||||
|     "someamabedi", | ||||
|     "someamararo", | ||||
|     "someamane", | ||||
|     "someamatlhano", | ||||
|     "someamarataro", | ||||
|     "someamasupa", | ||||
|     "someamarobedi", | ||||
|     "someamarobongwe", | ||||
|     "lekgolo", | ||||
|     "sekete", | ||||
|     "milione", | ||||
|     "bilione", | ||||
|     "terilione", | ||||
|     "kwatirilione", | ||||
|     "gajillione", | ||||
|     "bazillione", | ||||
| ] | ||||
| 
 | ||||
| 
 | ||||
| _ordinal_words = [ | ||||
|     "ntlha", | ||||
|     "bobedi", | ||||
|     "boraro", | ||||
|     "bone", | ||||
|     "botlhano", | ||||
|     "borataro", | ||||
|     "bosupa", | ||||
|     "borobedi ", | ||||
|     "borobongwe", | ||||
|     "bolesome", | ||||
|     "bolesomengwe", | ||||
|     "bolesomepedi", | ||||
|     "bolesometharo", | ||||
|     "bolesomenne", | ||||
|     "bolesometlhano", | ||||
|     "bolesomethataro", | ||||
|     "bolesomesupa", | ||||
|     "bolesomerobedi", | ||||
|     "bolesomerobongwe", | ||||
|     "somamabedi", | ||||
|     "someamararo", | ||||
|     "someamane", | ||||
|     "someamatlhano", | ||||
|     "someamarataro", | ||||
|     "someamasupa", | ||||
|     "someamarobedi", | ||||
|     "someamarobongwe", | ||||
|     "lekgolo", | ||||
|     "sekete", | ||||
|     "milione", | ||||
|     "bilione", | ||||
|     "terilione", | ||||
|     "kwatirilione", | ||||
|     "gajillione", | ||||
|     "bazillione", | ||||
| ] | ||||
| 
 | ||||
def like_num(text):
    """Return True if ``text`` looks like a number.

    Checks run in this order:
      1. plain digits, after dropping one leading "+", "-", "±" or "~"
         and removing every "," and ".";
      2. a simple fraction "<digits>/<digits>";
      3. a lower-cased match in ``_num_words`` or ``_ordinal_words``;
      4. digits followed by the suffix "th".

    Anything else yields False.
    """
    # Strip a single leading sign / approximation marker, if present.
    unsigned = text[1:] if text.startswith(("+", "-", "±", "~")) else text
    # Thousands and decimal separators are ignored for the digit checks.
    bare = unsigned.replace(",", "").replace(".", "")
    if bare.isdigit():
        return True

    # Exactly one "/" with digits on both sides counts as a fraction.
    pieces = bare.split("/")
    if len(pieces) == 2 and pieces[0].isdigit() and pieces[1].isdigit():
        return True

    lowered = bare.lower()
    # Check cardinal number words first, then ordinal number words.
    if lowered in _num_words or lowered in _ordinal_words:
        return True

    # Numeric ordinals written as digits plus "th".
    return lowered.endswith("th") and lowered[:-2].isdigit()
| 
 | ||||
| 
 | ||||
# Lexical attribute getters defined by this module, keyed by attribute ID.
LEX_ATTRS = {
    LIKE_NUM: like_num,
}
							
								
								
									
										19
									
								
								spacy/lang/tn/punctuation.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								spacy/lang/tn/punctuation.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,19 @@ | |||
| from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS | ||||
| from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA | ||||
| 
 | ||||
# Infix patterns: the shared ellipsis and icon patterns imported from
# ``char_classes``, followed by the rules listed below.
_infixes = LIST_ELLIPSES + LIST_ICONS
_infixes = _infixes + [
    # One of + - * ^ preceded by a digit and followed by a digit or hyphen.
    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
    # A period between an ALPHA_LOWER/CONCAT_QUOTES character and an
    # ALPHA_UPPER/CONCAT_QUOTES character.
    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
        al=ALPHA_LOWER,
        au=ALPHA_UPPER,
        q=CONCAT_QUOTES,
    ),
    # A comma with an ALPHA character on each side.
    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
    # A HYPHENS alternative preceded by ALPHA or a digit, followed by ALPHA.
    r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
    # One of : < > = / preceded by ALPHA or a digit, followed by ALPHA.
    r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]


# Public name under which the infix patterns are exported.
TOKENIZER_INFIXES = _infixes
							
								
								
									
										24
									
								
								spacy/lang/tn/stop_words.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								spacy/lang/tn/stop_words.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,24 @@ | |||
| # coding: utf8 | ||||
| 
 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| 
 | ||||
# Stop words, written as whitespace-separated text and split into a set
# (repeated words collapse into a single entry).
# Importing this module must stay silent, so nothing is printed here.
STOP_WORDS = set(
    """
ke gareng ga selekanyo tlhwatlhwa yo mongwe se
sengwe fa go le jalo gongwe ba na mo tikologong
jaaka kwa morago nna gonne ka sa pele nako teng
tlase fela ntle magareng tsona feta bobedi kgabaganya
moo gape kgatlhanong botlhe tsotlhe bokana e esi
setseng mororo dinako golo kgolo nnye wena gago
o ntse ntle tla goreng gangwe mang yotlhe gore
eo yona tseraganyo eng ne sentle re rona thata
godimo fitlha pedi masomamabedi lesomepedi mmogo
tharo tseo boraro tseno yone jaanong bobona bona
lesome tsaya tsamaiso nngwe masomethataro thataro
tsa mmatota tota sale thoko supa dira tshwanetse di mmalwa masisi
bonala e tshwanang bogolo tsenya tsweetswee karolo
sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa
tlhano lesometlhano botlalo lekgolo
""".split()
)
							
								
								
									
										22
									
								
								spacy/lang/tn/tag_map.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										22
									
								
								spacy/lang/tn/tag_map.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,22 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB | ||||
| from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON | ||||
| 
 | ||||
| 
 | ||||
# Maps each tag string to a fresh ``{POS: <symbol>}`` dict.
TAG_MAP = {
    tag: {POS: pos}
    for tag, pos in (
        ("INT", INTJ),
        ("JUNC", CCONJ),
        ("$", PUNCT),
        ("PROPOSS", PRON),
        ("PROQUANT", PRON),
        ("PROEMP", PRON),
        ("NUM", NUM),
        ("N", NOUN),
        ("AUX", VERB),
        ("ADV", ADV),
        ("ADJ", ADJ),
        ("V", VERB),
        ("VCOP", VERB),
    )
}
		Loading…
	
		Reference in New Issue
	
	Block a user