mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
e07500369c
Co-authored-by: explosion-bot <explosion-bot@users.noreply.github.com>
47 lines
1.4 KiB
Python
47 lines
1.4 KiB
Python
from ...attrs import LIKE_NUM
|
|
|
|
|
|
# Vietnamese number words recognised by like_num. Lookup is an exact string
# match against the lower-cased token, so regional, old-fashioned and
# position-dependent spellings each need their own entry.
_num_words = [
    "không",  # Zero
    "một",  # One
    "mốt",  # Also one, irreplaceable in niche cases for unit digit such as "51"="năm mươi mốt"
    "hai",  # Two
    "ba",  # Three
    "bốn",  # Four
    "tư",  # Also four, used in certain cases for unit digit such as "54"="năm mươi tư"
    "năm",  # Five
    "lăm",  # Also five, irreplaceable in niche cases for unit digit such as "55"="năm mươi lăm"
    "sáu",  # Six
    "bảy",  # Seven
    "bẩy",  # Also seven, old fashioned
    "tám",  # Eight
    "chín",  # Nine
    "mười",  # Ten
    "mươi",  # Tens multiplier used from twenty upwards, such as "20"="hai mươi"
    "chục",  # Also ten, used for counting in tens such as "20 eggs"="hai chục trứng"
    "trăm",  # Hundred
    "nghìn",  # Thousand
    "ngàn",  # Also thousand, used in the south
    "vạn",  # Ten thousand
    "triệu",  # Million
    "tỷ",  # Billion
    "tỉ",  # Also billion, used in compounds such as "tỉ_phú"="billionaire"
]
|
|
|
|
|
|
def like_num(text):
    """Return True if *text* looks like a number.

    A token counts as number-like when, after removing one optional leading
    "+", "-", "±" or "~" and every "," and "." character, it is either:

    * made up only of digits,
    * a simple fraction ``digits/digits`` with exactly one "/", or
    * (case-insensitively) one of the words in ``_num_words``.

    text: the token string to classify.
    Returns: bool.
    """
    # Only a single leading sign / approximation marker is stripped.
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    # Both "," and "." are dropped unconditionally, so either one may act as
    # a thousands or a decimal separator ("1.000.000", "3,14").
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    # Exactly one slash guarantees the split yields two parts.
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    return text.lower() in _num_words
|
|
|
|
|
|
# Maps the LIKE_NUM attribute ID (imported from ...attrs) to the like_num
# predicate defined in this module. Presumably consumed by the language
# defaults as a lexical-attribute getter table -- confirm against the caller.
LEX_ATTRS = {LIKE_NUM: like_num}
|