spaCy/spacy/lang/ti/lex_attrs.py
Yosi cf52510631
Add Amharic አማርኛ Language support (#6583)
* Add Amharic to space

* clean up

* Add some PRON_LEMMA

* add Tigrinya support

* remove text_noun_chunks

* Tigrinya Support

* added some more details for ti

* fix unit test

* add amharic char range

* changes from review

* amharic and tigrinya share same unicode block

* get rid of _amharic/_tigrinya in char_classes

Co-authored-by: Josiah Solomon <jsolomon@meteorcomm.com>
2020-12-22 16:50:34 +01:00

105 lines
2.3 KiB
Python

# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM
_num_words = [
"ዜሮ",
"ሐደ",
"ክልተ",
"ሰለስተ",
"ኣርባዕተ",
"ሓሙሽተ",
"ሽድሽተ",
"ሸውዓተ",
"ሽሞንተ",
"ትሽዓተ",
"ኣሰርተ",
"ኣሰርተ ሐደ",
"ኣሰርተ ክልተ",
"ኣሰርተ ሰለስተ",
"ኣሰርተ ኣርባዕተ",
"ኣሰርተ ሓሙሽተ",
"ኣሰርተ ሽድሽተ",
"ኣሰርተ ሸውዓተ",
"ኣሰርተ ሽሞንተ",
"ኣሰርተ ትሽዓተ",
"ዕስራ",
"ሰላሳ",
"ኣርብዓ",
"ሃምሳ",
"ስልሳ",
"ሰብዓ",
"ሰማንያ",
"ተስዓ",
"ሚእቲ",
"ሺሕ",
"ሚልዮን",
"ቢልዮን",
"ትሪልዮን",
"ኳድሪልዮን",
"ገጅልዮን",
"ባዝልዮን"
]
_ordinal_words = [
"ቀዳማይ",
"ካልኣይ",
"ሳልሳይ",
"ራብኣይ",
"ሓምሻይ",
"ሻድሻይ",
"ሻውዓይ",
"ሻምናይ",
"ዘጠነኛ",
"አስረኛ",
"ኣሰርተ አንደኛ",
"ኣሰርተ ሁለተኛ",
"ኣሰርተ ሶስተኛ",
"ኣሰርተ አራተኛ",
"ኣሰርተ አምስተኛ",
"ኣሰርተ ስድስተኛ",
"ኣሰርተ ሰባተኛ",
"ኣሰርተ ስምንተኛ",
"ኣሰርተ ዘጠነኛ",
"ሃያኛ",
"ሰላሳኛ"
"አርባኛ",
"አምሳኛ",
"ስድሳኛ",
"ሰባኛ",
"ሰማንያኛ",
"ዘጠናኛ",
"መቶኛ",
"ሺኛ",
"ሚሊዮንኛ",
"ቢሊዮንኛ",
"ትሪሊዮንኛ"
]
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
text_lower = text.lower()
if text_lower in _num_words:
return True
# Check ordinal number
if text_lower in _ordinal_words:
return True
if text_lower.endswith(""):
if text_lower[:-2].isdigit():
return True
return False
LEX_ATTRS = {LIKE_NUM: like_num}