mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	This includes the main kana, or phonetic characters, used in Japanese. There are some supplemental kana blocks in Unicode outside the BMP that could also be included; I omitted them for now because their actual use is rare, but maybe they should be added. The omitted blocks are:
	- Kana Supplement
	- Kana Extended (A and B)
	- Small Kana Extension
This commit is contained in:
		
							parent
							
								
									58e29776bd
								
							
						
					
					
						commit
						b4d526c357
					
				|  | @ -45,6 +45,10 @@ _hangul_syllables = r"\uAC00-\uD7AF" | |||
| _hangul_jamo = r"\u1100-\u11FF" | ||||
| _hangul = _hangul_syllables + _hangul_jamo | ||||
| 
 | ||||
| _hiragana = r"\u3040-\u309F" | ||||
| _katakana = r"\u30A0-\u30FFー" | ||||
| _kana = _hiragana + _katakana | ||||
| 
 | ||||
| # letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh | ||||
| _latin_u_extendedA = ( | ||||
|     r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C" | ||||
|  | @ -244,6 +248,7 @@ _uncased = ( | |||
|     + _tamil | ||||
|     + _telugu | ||||
|     + _hangul | ||||
|     + _kana | ||||
|     + _cjk | ||||
| ) | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user