mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	* Update stop_words.py Hebrew STOP WORDS * Update stop_words.py * contributor * contributor * add some common domain extensions support human number 1K/1M.... * support human number 1K/1M.... * hebrew number tokenize 1K/1M implement in EN * test human tokenize fix * test * heb like num revert human number change * heb like num
		
			
				
	
	
		
			110 lines
		
	
	
		
			1.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			110 lines
		
	
	
		
			1.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
# coding: utf8
 | 
						|
from __future__ import unicode_literals
 | 
						|
 | 
						|
from ...attrs import LIKE_NUM
 | 
						|
 | 
						|
# English cardinal number words. like_num() compares the lower-cased token
# text against this list, so every entry is lower-case.
_num_words = [
    # 0-9
    "zero", "one", "two", "three", "four",
    "five", "six", "seven", "eight", "nine",
    # 10-19
    "ten", "eleven", "twelve", "thirteen", "fourteen",
    "fifteen", "sixteen", "seventeen", "eighteen", "nineteen",
    # tens
    "twenty", "thirty", "forty", "fifty",
    "sixty", "seventy", "eighty", "ninety",
    # powers of ten
    "hundred", "thousand", "million", "billion", "trillion", "quadrillion",
    # informal "very large number" words
    "gajillion", "bazillion",
]
 | 
						|
 | 
						|
 | 
						|
# English ordinal number words. like_num() compares the lower-cased token
# text against this list, so every entry is lower-case.
_ordinal_words = [
    # 1st-9th
    "first", "second", "third", "fourth", "fifth",
    "sixth", "seventh", "eighth", "ninth",
    # 10th-19th
    "tenth", "eleventh", "twelfth", "thirteenth", "fourteenth",
    "fifteenth", "sixteenth", "seventeenth", "eighteenth", "nineteenth",
    # tens
    "twentieth", "thirtieth", "fortieth", "fiftieth",
    "sixtieth", "seventieth", "eightieth", "ninetieth",
    # powers of ten
    "hundredth", "thousandth", "millionth", "billionth", "trillionth",
    "quadrillionth",
    # informal "very large number" words
    "gajillionth", "bazillionth",
]
 | 
						|
 | 
						|
def like_num(text):
    """Return True if ``text`` looks like a number.

    One leading sign character (``+``, ``-``, ``±`` or ``~``) is dropped and
    every ``,`` and ``.`` is removed before checking. The remaining text is
    number-like when it is any of:

    * a run of digits, e.g. "10", "1,000", "3.14";
    * a simple fraction of two digit runs, e.g. "1/2";
    * digits followed by an English ordinal suffix, e.g. "1st", "22nd",
      "3rd", "100th" (case-insensitive; the suffix is not checked for
      agreement with the digits, so "1th" is accepted as well);
    * a word listed in ``_num_words`` or ``_ordinal_words``
      (case-insensitive).

    Args:
        text: The token text to inspect.

    Returns:
        True if the text resembles a number, False otherwise.
    """
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    # Thousands separators and decimal points do not affect "number-likeness".
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True

    text_lower = text.lower()

    # Digit-based ordinals. Previously only the "th" suffix was accepted,
    # so "100th" matched but "1st", "22nd" and "23rd" did not.
    if text_lower.endswith(("st", "nd", "rd", "th")):
        if text_lower[:-2].isdigit():
            return True

    # Spelled-out cardinals and ordinals.
    if text_lower in _num_words:
        return True
    if text_lower in _ordinal_words:
        return True

    return False
 | 
						|
 | 
						|
 | 
						|
# Maps each lexical attribute ID to the function that computes it from a
# token's text; this module only provides the LIKE_NUM getter.
LEX_ATTRS = {
    LIKE_NUM: like_num,
}
 |