spaCy/spacy/lang/ms/lex_attrs.py

import unicodedata

from ...attrs import IS_CURRENCY, LIKE_NUM
from .punctuation import LIST_CURRENCY

# Number words recognised by ``like_num``. All entries are lowercase because
# ``like_num`` lowercases its input before looking it up here.
_num_words = [
    "kosong",
    "satu",
    "dua",
    "tiga",
    "empat",
    "lima",
    "enam",
    "tujuh",
    "lapan",
    "sembilan",
    "sepuluh",
    "sebelas",
    "belas",
    "puluh",
    "ratus",
    "ribu",
    "juta",
    # Single-"l" spellings, consistent with "kuadrilion" and the other
    # "-ilion" entries further down.
    "bilion",
    "trilion",
    # Double-"l" spellings kept so tokens that matched before still match.
    "billion",
    "trillion",
    "kuadrilion",
    "kuintilion",
    "sekstilion",
    "septilion",
    "oktilion",
    "nonilion",
    "desilion",
]


def like_num(text):
    """Return True if ``text`` looks like a number.

    A single leading "+", "-", "±" or "~" is dropped and every "," and "."
    is removed before the checks below. The text is then accepted when it is:

    * made only of digits (``str.isdigit``),
    * a fraction with exactly one "/" and digits on both sides,
    * a word listed in ``_num_words`` (case-insensitive), or
    * a token with exactly one "-" whose part after the hyphen is digits or
      a word listed in ``_num_words`` (case-insensitive), e.g. "ke-5" or
      "ke-dua". The part before the hyphen is not inspected.

    text (str): The token text to test.
    RETURNS (bool): Whether the text resembles a number.
    """
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    # Lowercase once so the plain-word and hyphenated lookups agree on case.
    lowered = text.lower()
    if lowered in _num_words:
        return True
    if lowered.count("-") == 1:
        _, num = lowered.split("-")
        if num.isdigit() or num in _num_words:
            return True
    return False


def is_currency(text):
    """Return True if ``text`` is a currency token.

    The text qualifies when it is an entry of ``LIST_CURRENCY`` or when every
    character in it has the Unicode general category "Sc". An empty string
    that is not in ``LIST_CURRENCY`` has no disqualifying character and
    therefore also yields True.

    text (str): The token text to test.
    RETURNS (bool): Whether the text is treated as a currency token.
    """
    return text in LIST_CURRENCY or all(
        unicodedata.category(symbol) == "Sc" for symbol in text
    )


# Maps each lexical attribute ID to the function in this module that
# computes it.
LEX_ATTRS = {
    IS_CURRENCY: is_currency,
    LIKE_NUM: like_num,
}
Malay language support (#12602) * add malay lang * fix token len * black format * reformat conftest malay * remove exceptions not exist in dbp * format code 2023-05-17 13:45:21 +03:00			`import unicodedata`

			`from ...attrs import IS_CURRENCY, LIKE_NUM`
Configure isort to use the Black profile, recursively isort the `spacy` module (#12721) * Use isort with Black profile * isort all the things * Fix import cycles as a result of import sorting * Add DOCBIN_ALL_ATTRS type definition * Add isort to requirements * Remove isort from build dependencies check * Typo 2023-06-14 18:48:41 +03:00			`from .punctuation import LIST_CURRENCY`
Malay language support (#12602) * add malay lang * fix token len * black format * reformat conftest malay * remove exceptions not exist in dbp * format code 2023-05-17 13:45:21 +03:00
			`_num_words = [`
			`"kosong",`
			`"satu",`
			`"dua",`
			`"tiga",`
			`"empat",`
			`"lima",`
			`"enam",`
			`"tujuh",`
			`"lapan",`
			`"sembilan",`
			`"sepuluh",`
			`"sebelas",`
			`"belas",`
			`"puluh",`
			`"ratus",`
			`"ribu",`
			`"juta",`
			`"billion",`
			`"trillion",`
			`"kuadrilion",`
			`"kuintilion",`
			`"sekstilion",`
			`"septilion",`
			`"oktilion",`
			`"nonilion",`
			`"desilion",`
			`]`


			`def like_num(text):`
			`if text.startswith(("+", "-", "±", "~")):`
			`text = text[1:]`
			`text = text.replace(",", "").replace(".", "")`
			`if text.isdigit():`
			`return True`
			`if text.count("/") == 1:`
			`num, denom = text.split("/")`
			`if num.isdigit() and denom.isdigit():`
			`return True`
			`if text.lower() in _num_words:`
			`return True`
			`if text.count("-") == 1:`
			`_, num = text.split("-")`
			`if num.isdigit() or num in _num_words:`
			`return True`
			`return False`


			`def is_currency(text):`
			`if text in LIST_CURRENCY:`
			`return True`

			`for char in text:`
			`if unicodedata.category(char) != "Sc":`
			`return False`
			`return True`


			`LEX_ATTRS = {IS_CURRENCY: is_currency, LIKE_NUM: like_num}`