mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 21:21:10 +03:00 
			
		
		
		
	* Use isort with Black profile * isort all the things * Fix import cycles as a result of import sorting * Add DOCBIN_ALL_ATTRS type definition * Add isort to requirements * Remove isort from build dependencies check * Typo
		
			
				
	
	
		
			95 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			95 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from ...attrs import LIKE_NUM, NORM
 | ||
| from ..norm_exceptions import BASE_NORMS
 | ||
| 
 | ||
| # fmt: off
 | ||
| _stem_suffixes = [
 | ||
|     ["ा", "ि", "ी", "ु", "ू", "ृ", "े", "ै", "ो", "ौ"],
 | ||
|     ["ँ", "ं", "्", "ः"],
 | ||
|     ["लाई", "ले", "बाट", "को", "मा", "हरू"],
 | ||
|     ["हरूलाई", "हरूले", "हरूबाट", "हरूको", "हरूमा"],
 | ||
|     ["इलो", "िलो", "नु", "ाउनु", "ई", "इन", "इन्", "इनन्"],
 | ||
|     ["एँ", "इँन्", "इस्", "इनस्", "यो", "एन", "यौं", "एनौं", "ए", "एनन्"],
 | ||
|     ["छु", "छौँ", "छस्", "छौ", "छ", "छन्", "छेस्", "छे", "छ्यौ", "छिन्", "हुन्छ"],
 | ||
|     ["दै", "दिन", "दिँन", "दैनस्", "दैन", "दैनौँ", "दैनौं", "दैनन्"],
 | ||
|     ["हुन्न", "न्न", "न्न्स्", "न्नौं", "न्नौ", "न्न्न्", "िई"],
 | ||
|     ["अ", "ओ", "ऊ", "अरी", "साथ", "वित्तिकै", "पूर्वक"],
 | ||
|     ["याइ", "ाइ", "बार", "वार", "चाँहि"],
 | ||
|     ["ने", "ेको", "ेकी", "ेका", "ेर", "दै", "तै", "िकन", "उ", "न", "नन्"]
 | ||
| ]
 | ||
| # fmt: on
 | ||
| 
 | ||
| # reference 1: https://en.wikipedia.org/wiki/Numbers_in_Nepali_language
 | ||
| # reference 2: https://www.imnepal.com/nepali-numbers/
 | ||
| _num_words = [
 | ||
|     "शुन्य",
 | ||
|     "एक",
 | ||
|     "दुई",
 | ||
|     "तीन",
 | ||
|     "चार",
 | ||
|     "पाँच",
 | ||
|     "छ",
 | ||
|     "सात",
 | ||
|     "आठ",
 | ||
|     "नौ",
 | ||
|     "दश",
 | ||
|     "एघार",
 | ||
|     "बाह्र",
 | ||
|     "तेह्र",
 | ||
|     "चौध",
 | ||
|     "पन्ध्र",
 | ||
|     "सोह्र",
 | ||
|     "सोह्र",
 | ||
|     "सत्र",
 | ||
|     "अठार",
 | ||
|     "उन्नाइस",
 | ||
|     "बीस",
 | ||
|     "तीस",
 | ||
|     "चालीस",
 | ||
|     "पचास",
 | ||
|     "साठी",
 | ||
|     "सत्तरी",
 | ||
|     "असी",
 | ||
|     "नब्बे",
 | ||
|     "सय",
 | ||
|     "हजार",
 | ||
|     "लाख",
 | ||
|     "करोड",
 | ||
|     "अर्ब",
 | ||
|     "खर्ब",
 | ||
| ]
 | ||
| 
 | ||
| 
 | ||
def norm(string):
    """Return the norm for a Nepali token.

    Tokens found in ``BASE_NORMS`` (e.g. punctuation or currency symbols)
    are mapped to their base norm. Otherwise a light suffix-stripping
    stemmer is applied: the groups in ``_stem_suffixes`` are tried from the
    last group to the first, and the longest suffix of the first group that
    matches is removed. For example, "घरले" becomes "घर".

    string (str): The token text.
    RETURNS (str): The normalised text, or ``string`` unchanged when no
        rule applies.
    """
    # normalise base exceptions, e.g. punctuation or currency symbols
    if string in BASE_NORMS:
        return BASE_NORMS[string]
    # set stem word as norm, if available, adapted from:
    # https://github.com/explosion/spaCy/blob/master/spacy/lang/hi/lex_attrs.py
    # https://www.researchgate.net/publication/237261579_Structure_of_Nepali_Grammar
    for suffix_group in reversed(_stem_suffixes):
        # Groups mix suffixes of different lengths, so the length check and
        # the number of characters stripped must come from the suffix that
        # actually matched, not from the first entry of the group. A suffix
        # may never consume the whole token (at least one character of stem
        # must remain).
        matches = [
            suffix
            for suffix in suffix_group
            if len(string) > len(suffix) and string.endswith(suffix)
        ]
        if matches:
            # Prefer the longest match so that e.g. "हुन्छ" wins over "छ".
            return string[: -len(max(matches, key=len))]
    return string
 | ||
| 
 | ||
| 
 | ||
def like_num(text):
    """Check whether a token looks like a number.

    A token is number-like when, after dropping one leading sign character
    ("+", "-", "±" or "~") and all "," and "." separators, it is either
    made up of digits only, a simple fraction of two digit strings
    ("1/2"), or one of the number words in ``_num_words``.

    text (str): The token text.
    RETURNS (bool): True if the token resembles a number.
    """
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    # Strip thousands separators and decimal points. The separator is a bare
    # comma: a token such as "1,000" contains no space after the comma, so
    # replacing ", " would leave it untouched and the digit check would fail.
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    return text.lower() in _num_words
 | ||
| 
 | ||
| 
 | ||
# Maps each lexical attribute key to the function in this module that
# computes it.
LEX_ATTRS = {
    NORM: norm,
    LIKE_NUM: like_num,
}
 |