spaCy/spacy/lang/th/lex_attrs.py
Dobita21 f95ecedd83 Add Thai lex_attrs (#3655)
* test spaCy commit to git fri 04052019 10:54

* change Data format from my format to master format

* ทัทั้งนี้ ---> ทั้งนี้

* delete stop_word translate from Eng

* Adjust formatting and readability

* add Thai norm_exception

* Add Dobita21 SCA

* edit รึ : หรือ,

* Update Dobita21.md

* Auto-format

* Integrate norms into language defaults

* add acronym and some norm exception words

* add lex_attrs

* Add lexical attribute getters into the language defaults

* fix LEX_ATTRS


Co-authored-by: Donut <dobita21@gmail.com>
Co-authored-by: Ines Montani <ines@ines.io>
2019-05-01 12:03:14 +02:00

63 lines
1.5 KiB
Python

# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM
# Thai number words recognised by like_num(). Matching is by exact string
# equality, so only the forms listed here are treated as numbers.
_num_words = [
    # 0-10
    "ศูนย์",
    "หนึ่ง",
    "สอง",
    "สาม",
    "สี่",
    "ห้า",
    "หก",
    "เจ็ด",
    "แปด",
    "เก้า",
    "สิบ",
    "สิบเอ็ด",
    # Tens and their "-one" forms (20, 21, 30, 31, ...)
    "ยี่สิบ",
    "ยี่สิบเอ็ด",
    "สามสิบ",
    "สามสิบเอ็ด",
    "สี่สิบ",
    "สี่สิบเอ็ด",
    "ห้าสิบ",
    "ห้าสิบเอ็ด",
    "หกสิบ",  # 60 -- previously missing although 61 was listed
    "หกสิบเอ็ด",
    "เจ็ดสิบ",
    "เจ็ดสิบเอ็ด",
    "แปดสิบ",
    "แปดสิบเอ็ด",
    "เก้าสิบ",
    "เก้าสิบเอ็ด",
    # Magnitudes
    "ร้อย",
    "พัน",
    "หมื่น",  # 10,000 -- previously only present as part of "หมื่นล้าน"
    "แสน",  # 100,000 -- previously only present as part of "แสนล้าน"
    "ล้าน",
    "สิบล้าน",
    "ร้อยล้าน",
    "พันล้าน",
    "หมื่นล้าน",
    "แสนล้าน",
    "ล้านล้าน",
    "ล้านล้านล้าน",
    "ล้านล้านล้านล้าน",
]
def like_num(text):
    """Return True if *text* looks like a number.

    A token is number-like when, after removing one leading sign or
    approximation marker ("+", "-", "±", "~") and all "," and "."
    characters, it is either:

    * made up entirely of digits (as judged by ``str.isdigit``),
    * a simple fraction ``<digits>/<digits>``, or
    * one of the words listed in ``_num_words``.

    text (unicode): The token text to check.
    RETURNS (bool): Whether the text resembles a number.
    """
    # Only a single leading marker is stripped; "--5" is not number-like.
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    # Drop thousands separators and decimal points so that strings such as
    # "1,000.50" reduce to plain digits.
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    # Exactly one slash: accept only when both sides are digit strings.
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    # Fall back to an exact match against the known number words.
    return text in _num_words
# Lexical attribute getters exported by this module: maps the LIKE_NUM
# attribute ID to the like_num() function defined above.
LEX_ATTRS = {LIKE_NUM: like_num}