Add Thai lex_attrs (#3655)

* test spaCy commit to git fri 04052019 10:54 * change Data format from my format to master format * ทัทั้งนี้ ---> ทั้งนี้ * delete stop_word translate from Eng * Adjust formatting and readability * add Thai norm_exception * Add Dobita21 SCA * edit รึ : หรือ, * Update Dobita21.md * Auto-format * Integrate norms into language defaults * add acronym and some norm exception words * add lex_attrs * Add lexical attribute getters into the language defaults * fix LEX_ATTRS Co-authored-by: Donut <dobita21@gmail.com> Co-authored-by: Ines Montani <ines@ines.io>
2025-12-22 09:34:23 +03:00 · 2019-05-01 17:03:14 +07:00 · 2019-05-01 17:03:14 +07:00 · f95ecedd83
commit f95ecedd83
parent ba1ff00370
2 changed files with 64 additions and 0 deletions
--- a/spacy/lang/th/__init__.py
+++ b/spacy/lang/th/__init__.py
@ -5,6 +5,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .norm_exceptions import NORM_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from ..norm_exceptions import BASE_NORMS
 from ...attrs import LANG, NORM
@ -34,6 +35,7 @@ class ThaiTokenizer(DummyTokenizer):
 class ThaiDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda _text: "th"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
--- a/spacy/lang/th/lex_attrs.py
+++ b/spacy/lang/th/lex_attrs.py
@ -0,0 +1,62 @@
 # coding: utf8
 from __future__ import unicode_literals
 from ...attrs import LIKE_NUM
 _num_words = [
    "ศูนย์",
    "หนึ่ง",
    "สอง",
    "สาม",
    "สี่",
    "ห้า",
    "หก",
    "เจ็ด",
    "แปด",
    "เก้า",
    "สิบ",
    "สิบเอ็ด",
    "ยี่สิบ",
    "ยี่สิบเอ็ด",
    "สามสิบ",
    "สามสิบเอ็ด",
    "สี่สิบ",
    "สี่สิบเอ็ด",
    "ห้าสิบ",
    "ห้าสิบเอ็ด",
    "หกสิบเอ็ด",
    "เจ็ดสิบ",
    "เจ็ดสิบเอ็ด",
    "แปดสิบ",
    "แปดสิบเอ็ด",
    "เก้าสิบ",
    "เก้าสิบเอ็ด",
    "ร้อย",
    "พัน",
    "ล้าน",
    "พันล้าน",
    "หมื่นล้าน",
    "แสนล้าน",
    "ล้านล้าน",
    "ล้านล้านล้าน",
    "ล้านล้านล้านล้าน",
 ]
 def like_num(text):
    """Return True if ``text`` looks like a number, otherwise False.

    A token is number-like when, after dropping one leading "+", "-", "±"
    or "~" and removing every "," and ".", it is either
    all digits, a single-slash fraction whose two sides are both all
    digits, or one of the words listed in ``_num_words``.
    """
    # Drop at most one leading sign / approximation marker.
    candidate = text[1:] if text.startswith(("+", "-", "±", "~")) else text
    # Thousands separators and decimal points do not affect the verdict.
    candidate = candidate.replace(",", "")
    candidate = candidate.replace(".", "")
    if candidate.isdigit():
        return True
    # Exactly one "/" yields exactly two pieces: numerator and denominator.
    pieces = candidate.split("/")
    if len(pieces) == 2 and pieces[0].isdigit() and pieces[1].isdigit():
        return True
    # Fall back to the spelled-out number words.
    return candidate in _num_words
 # Lexical attribute getters exported by this module: maps the LIKE_NUM
 # attribute ID to the like_num() function above.
 LEX_ATTRS = {LIKE_NUM: like_num}