mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	* Remove unicode declarations * Remove Python 3.5 and 2.7 from CI * Don't require pathlib * Replace compat helpers * Remove OrderedDict * Use f-strings * Set Cython compiler language level * Fix typo * Re-add OrderedDict for Table * Update setup.cfg * Revert CONTRIBUTING.md * Revert lookups.md * Revert top-level.md * Small adjustments and docs [ci skip]
		
			
				
	
	
		
			82 lines
		
	
	
		
			2.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			82 lines
		
	
	
		
			2.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
from ...attrs import LIKE_NUM
 | 
						|
 | 
						|
 | 
						|
_numeral_suffixes = {"பத்து": "பது", "ற்று": "று", "ரத்து": "ரம்", "சத்து": "சம்"}
 | 
						|
_num_words = [
 | 
						|
    "பூச்சியம்",
 | 
						|
    "ஒரு",
 | 
						|
    "ஒன்று",
 | 
						|
    "இரண்டு",
 | 
						|
    "மூன்று",
 | 
						|
    "நான்கு",
 | 
						|
    "ஐந்து",
 | 
						|
    "ஆறு",
 | 
						|
    "ஏழு",
 | 
						|
    "எட்டு",
 | 
						|
    "ஒன்பது",
 | 
						|
    "பத்து",
 | 
						|
    "பதினொன்று",
 | 
						|
    "பன்னிரண்டு",
 | 
						|
    "பதின்மூன்று",
 | 
						|
    "பதினான்கு",
 | 
						|
    "பதினைந்து",
 | 
						|
    "பதினாறு",
 | 
						|
    "பதினேழு",
 | 
						|
    "பதினெட்டு",
 | 
						|
    "பத்தொன்பது",
 | 
						|
    "இருபது",
 | 
						|
    "முப்பது",
 | 
						|
    "நாற்பது",
 | 
						|
    "ஐம்பது",
 | 
						|
    "அறுபது",
 | 
						|
    "எழுபது",
 | 
						|
    "எண்பது",
 | 
						|
    "தொண்ணூறு",
 | 
						|
    "நூறு",
 | 
						|
    "இருநூறு",
 | 
						|
    "முன்னூறு",
 | 
						|
    "நாநூறு",
 | 
						|
    "ஐநூறு",
 | 
						|
    "அறுநூறு",
 | 
						|
    "எழுநூறு",
 | 
						|
    "எண்ணூறு",
 | 
						|
    "தொள்ளாயிரம்",
 | 
						|
    "ஆயிரம்",
 | 
						|
    "ஒராயிரம்",
 | 
						|
    "லட்சம்",
 | 
						|
    "மில்லியன்",
 | 
						|
    "கோடி",
 | 
						|
    "பில்லியன்",
 | 
						|
    "டிரில்லியன்",
 | 
						|
]
 | 
						|
 | 
						|
 | 
						|
# Numbers in the ranges 20-89, 90-899, and 900-99999 and above take
# different suffixes.
def suffix_filter(text):
    """Return *text* with a trailing numeral suffix swapped for its base ending.

    The keys of ``_numeral_suffixes`` are tried in insertion order. On the
    first key that ``text`` ends with, that ending is replaced by the mapped
    value and the result is returned. When no key matches, ``text`` is
    returned unchanged.
    """
    # No length guard is needed: str.endswith is already False when the
    # suffix is longer than the text. Skipping (rather than aborting) on a
    # too-long suffix keeps the outcome independent of the dict's ordering,
    # so a shorter suffix listed after a longer one is still tried.
    for num_suffix, replacement in _numeral_suffixes.items():
        if text.endswith(num_suffix):
            return text[: -len(num_suffix)] + replacement
    return text
 | 
						|
 | 
						|
 | 
						|
def like_num(text):
    """Return True if *text* looks like a number, otherwise False.

    Commas and periods are removed first. What remains counts as a number
    when it is made up of digits only, is a ``digits/digits`` fraction, or is
    found in ``_num_words`` either as-is (lower-cased) or after
    ``suffix_filter`` has rewritten a trailing numeral suffix.
    """
    stripped = text.replace(",", "").replace(".", "")
    if stripped.isdigit():
        return True
    # partition leaves the tail empty when there is no "/" and keeps any
    # further "/" inside the tail, so only a single-slash fraction with
    # digits on both sides can pass this check.
    numerator, _, denominator = stripped.partition("/")
    if numerator.isdigit() and denominator.isdigit():
        return True
    if stripped.lower() in _num_words:
        return True
    return suffix_filter(stripped) in _num_words
 | 
						|
 | 
						|
 | 
						|
# Maps the LIKE_NUM attribute ID to this module's like_num function.
# NOTE(review): presumably picked up by the language defaults as a lexical
# attribute getter — confirm against the importing module.
LEX_ATTRS = {LIKE_NUM: like_num}
 |