spaCy/spacy/lang/ti/lex_attrs.py
fgaim ee011ca963
Update Tigrinya ትግርኛ language support (#8900)
* Add missing punctuation for Tigrinya and Amharic

* Fix numeral and ordinal numbers for Tigrinya

 - Amharic was used in many cases
 - Also fixed some typos

* Update Tigrinya stop-words

* Contributor agreement for fgaim

* Fix typo in "ti" lang test

* Remove multi-word entries from numbers and ordinals
2021-08-10 13:55:08 +02:00

74 lines
1.5 KiB
Python

from ...attrs import LIKE_NUM
_num_words = [
"ዜሮ",
"ሓደ",
"ክልተ",
"ሰለስተ",
"ኣርባዕተ",
"ሓሙሽተ",
"ሽድሽተ",
"ሸውዓተ",
"ሽሞንተ",
"ትሽዓተ",
"ዓሰርተ",
"ዕስራ",
"ሰላሳ",
"ኣርብዓ",
"ሓምሳ",
"ሱሳ",
"ሰብዓ",
"ሰማንያ",
"ቴስዓ",
"ሚእቲ",
"ሺሕ",
"ሚልዮን",
"ቢልዮን",
"ትሪልዮን",
"ኳድሪልዮን",
"ጋዚልዮን",
"ባዚልዮን"
]
# Tigrinya ordinals above 10 are the same as _num_words but start with "መበል "
_ordinal_words = [
"ቀዳማይ",
"ካልኣይ",
"ሳልሳይ",
"ራብዓይ",
"ሓምሻይ",
"ሻድሻይ",
"ሻውዓይ",
"ሻምናይ",
"ታሽዓይ",
"ዓስራይ"
]
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
text_lower = text.lower()
if text_lower in _num_words:
return True
# Check ordinal number
if text_lower in _ordinal_words:
return True
if text_lower.endswith(""):
if text_lower[:-2].isdigit():
return True
return False
LEX_ATTRS = {LIKE_NUM: like_num}