Merge pull request #7011 from Shumie82/master

2025-11-04 09:57:26 +03:00 · 2021-02-13 12:30:42 +11:00 · 2021-02-13 12:30:42 +11:00 · 34ee0fbd70
commit 34ee0fbd70
parent e583050547 6c450decfc
6 changed files with 178 additions and 1 deletions
--- a/spacy/lang/tn/__init__.py
+++ b/spacy/lang/tn/__init__.py
@ -0,0 +1,18 @@
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES
 from ...language import Language
 class SetswanaDefaults(Language.Defaults):
    """Defaults for Setswana: infix patterns, stop words and lexical attribute getters."""

    # Lexical attribute getters imported from .lex_attrs.
    lex_attr_getters = LEX_ATTRS
    # Stop-word collection imported from .stop_words.
    stop_words = STOP_WORDS
    # Infix patterns imported from .punctuation.
    infixes = TOKENIZER_INFIXES
 class Setswana(Language):
    """Language subclass for Setswana; `lang` is "tn" and the defaults are SetswanaDefaults."""

    Defaults = SetswanaDefaults
    lang = "tn"
 # Public API of this package: only the Setswana language class is exported.
 __all__ = ["Setswana"]
--- a/spacy/lang/tn/examples.py
+++ b/spacy/lang/tn/examples.py
@ -0,0 +1,15 @@
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.tn.examples import sentences
 >>> docs = nlp.pipe(sentences)
 """
 # Sample Setswana sentences; the module docstring shows them being passed
 # to `nlp.pipe` as a quick smoke test.
 sentences = [
    "Apple e nyaka go reka JSE ka tlhwatlhwa ta R1 billion",
    "Johannesburg ke toropo e kgolo mo Afrika Borwa.",
    "O ko kae?",
    "ke mang presidente ya Afrika Borwa?",
    "ke eng toropo kgolo ya Afrika Borwa?",
    "Nelson Mandela o belegwe leng?",
 ]
--- a/spacy/lang/tn/lex_attrs.py
+++ b/spacy/lang/tn/lex_attrs.py
@ -0,0 +1,106 @@
 from ...attrs import LIKE_NUM
 # Cardinal number words; `like_num` lowercases its input and checks it
 # against this list.
 # NOTE(review): "lesomenngwe"/"lesomepedi" carry a "le" prefix while
 # "sometharo".."somerobongwe" do not — confirm the spellings with a
 # Setswana speaker.
 # NOTE(review): "gajillione"/"bazillione" look like informal placeholders
 # rather than real numerals — confirm they are intended.
 _num_words = [
    "lefela",
    "nngwe",
    "pedi",
    "tharo",
    "nne",
    "tlhano",
    "thataro",
    "supa",
    "robedi",
    "robongwe",
    "lesome",
    "lesomenngwe",
    "lesomepedi",
    "sometharo",
    "somenne",
    "sometlhano",
    "somethataro",
    "somesupa",
    "somerobedi",
    "somerobongwe",
    "someamabedi",
    "someamararo",
    "someamane",
    "someamatlhano",
    "someamarataro",
    "someamasupa",
    "someamarobedi",
    "someamarobongwe",
    "lekgolo",
    "sekete",
    "milione",
    "bilione",
    "terilione",
    "kwatirilione",
    "gajillione",
    "bazillione",
 ]
 _ordinal_words = [
    "ntlha",
    "bobedi",
    "boraro",
    "bone",
    "botlhano",
    "borataro",
    "bosupa",
    "borobedi ",
    "borobongwe",
    "bolesome",
    "bolesomengwe",
    "bolesomepedi",
    "bolesometharo",
    "bolesomenne",
    "bolesometlhano",
    "bolesomethataro",
    "bolesomesupa",
    "bolesomerobedi",
    "bolesomerobongwe",
    "somamabedi",
    "someamararo",
    "someamane",
    "someamatlhano",
    "someamarataro",
    "someamasupa",
    "someamarobedi",
    "someamarobongwe",
    "lekgolo",
    "sekete",
    "milione",
    "bilione",
    "terilione",
    "kwatirilione",
    "gajillione",
    "bazillione",
 ]
 def like_num(text):
    """Return True when `text` looks like a number, otherwise False.

    One leading "+", "-", "±" or "~" is dropped, then every "," and "." is
    removed. The remainder counts as number-like when it is all digits, a
    digit/digit fraction with a single "/", a word (compared in lower case)
    listed in `_num_words` or `_ordinal_words`, or digits followed by "th".
    """
    # Drop a single leading sign / approximation marker.
    if text[:1] in ("+", "-", "±", "~"):
        text = text[1:]
    # Thousands and decimal separators do not affect the decision.
    cleaned = text.replace(",", "").replace(".", "")
    if cleaned.isdigit():
        return True
    # Exactly one "/" yields exactly two parts: numerator and denominator.
    parts = cleaned.split("/")
    if len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
        return True
    lowered = cleaned.lower()
    # Cardinal words first, then ordinal words.
    if lowered in _num_words or lowered in _ordinal_words:
        return True
    # Digit ordinals written with a "th" suffix, e.g. "4th".
    return lowered.endswith("th") and lowered[:-2].isdigit()
 # Maps the LIKE_NUM attribute ID to the `like_num` getter defined above.
 LEX_ATTRS = {LIKE_NUM: like_num}
--- a/spacy/lang/tn/punctuation.py
+++ b/spacy/lang/tn/punctuation.py
@ -0,0 +1,19 @@
 from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
 from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
 # Infix patterns for Setswana: the shared ellipsis and icon lists followed
 # by the regular expressions below.
 _infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        # One of + - * ^ preceded by a digit and followed by a digit or "-".
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        # A period preceded by an ALPHA_LOWER or CONCAT_QUOTES character and
        # followed by an ALPHA_UPPER or CONCAT_QUOTES character.
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        # A comma directly between two ALPHA characters.
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        # Any HYPHENS alternative preceded by an ALPHA character or digit
        # and followed by an ALPHA character.
        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
        # One of : < > = / preceded by an ALPHA character or digit and
        # followed by an ALPHA character.
        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
    ]
 )
 # Public alias for the pattern list built above.
 TOKENIZER_INFIXES = _infixes
--- a/spacy/lang/tn/stop_words.py
+++ b/spacy/lang/tn/stop_words.py
@ -0,0 +1,18 @@
 # Setswana stop words. The block below is split on whitespace and collected
 # into a set, so line breaks, spacing and repeated words have no effect on
 # the result.
 STOP_WORDS = set("""
 ke gareng ga selekanyo tlhwatlhwa yo mongwe se 
 sengwe fa go le jalo gongwe ba na mo tikologong
 jaaka kwa morago nna gonne ka sa pele nako teng 
 tlase fela ntle magareng tsona feta bobedi kgabaganya
 moo gape kgatlhanong botlhe tsotlhe bokana e esi
 setseng mororo dinako golo kgolo nnye wena gago 
 o ntse ntle tla goreng gangwe mang yotlhe gore 
 eo yona tseraganyo eng ne sentle re rona thata 
 godimo fitlha pedi masomamabedi lesomepedi mmogo 
 tharo tseo boraro tseno yone jaanong bobona bona 
 lesome tsaya tsamaiso nngwe masomethataro thataro 
 tsa mmatota tota sale thoko supa dira tshwanetse di mmalwa masisi
 bonala e tshwanang bogolo tsenya tsweetswee karolo 
 sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa 
 tlhano lesometlhano botlalo lekgolo           
 """.split())
--- a/spacy/tests/lang/test_initialize.py
+++ b/spacy/tests/lang/test_initialize.py
@ -8,7 +8,8 @@ from spacy.util import get_lang_class
 LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
             "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is",
             "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk",
-             "sl", "sq", "sr", "sv", "ta", "te", "tl", "tr", "tt", "ur", 'yo']
+             "sl", "sq", "sr", "sv", "ta", "te", "tl", "tn", "tr", "tt", "ur",
             "yo"]
 # fmt: on