Merge pull request #7011 from Shumie82/master

2025-07-31 10:29:46 +03:00 · 2021-02-13 12:30:42 +11:00 · 2021-02-13 12:30:42 +11:00 · 34ee0fbd70
commit 34ee0fbd70
parent e583050547 6c450decfc
6 changed files with 178 additions and 1 deletions
--- a/spacy/lang/tn/__init__.py
+++ b/spacy/lang/tn/__init__.py
@ -0,0 +1,18 @@
+from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES
+from ...language import Language
+
+
+class SetswanaDefaults(Language.Defaults):
+    """Defaults for Setswana: lexical attribute getters, stop words and tokenizer infixes."""
+
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
+    infixes = TOKENIZER_INFIXES
+
+
+class Setswana(Language):
+    """Language subclass for Setswana; uses SetswanaDefaults and the language code "tn"."""
+
+    Defaults = SetswanaDefaults
+    lang = "tn"
+
+
+__all__ = ["Setswana"]
--- a/spacy/lang/tn/examples.py
+++ b/spacy/lang/tn/examples.py
@ -0,0 +1,15 @@
+"""
+Example sentences to test spaCy and its language models.
+>>> from spacy.lang.tn.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+# Sample Setswana sentences; the module docstring above shows them being
+# passed to nlp.pipe.
+sentences = [
+    "Apple e nyaka go reka JSE ka tlhwatlhwa ta R1 billion",
+    "Johannesburg ke toropo e kgolo mo Afrika Borwa.",
+    "O ko kae?",
+    "ke mang presidente ya Afrika Borwa?",
+    "ke eng toropo kgolo ya Afrika Borwa?",
+    "Nelson Mandela o belegwe leng?",
+]
--- a/spacy/lang/tn/lex_attrs.py
+++ b/spacy/lang/tn/lex_attrs.py
@ -0,0 +1,106 @@
+from ...attrs import LIKE_NUM
+
+# Cardinal number words accepted by like_num; the lookup there is done on the
+# lowercased token text, so entries are kept lowercase.
+_num_words = [
+    "lefela",
+    "nngwe",
+    "pedi",
+    "tharo",
+    "nne",
+    "tlhano",
+    "thataro",
+    "supa",
+    "robedi",
+    "robongwe",
+    "lesome",
+    "lesomenngwe",
+    "lesomepedi",
+    "sometharo",
+    "somenne",
+    "sometlhano",
+    "somethataro",
+    "somesupa",
+    "somerobedi",
+    "somerobongwe",
+    "someamabedi",
+    "someamararo",
+    "someamane",
+    "someamatlhano",
+    "someamarataro",
+    "someamasupa",
+    "someamarobedi",
+    "someamarobongwe",
+    "lekgolo",
+    "sekete",
+    "milione",
+    "bilione",
+    "terilione",
+    "kwatirilione",
+    "gajillione",
+    "bazillione",
+]
+
+
+# Ordinal number words accepted by like_num. Entries are matched exactly
+# against the lowercased token text, so they must not carry surrounding
+# whitespace ("borobedi" previously had a trailing space and never matched).
+_ordinal_words = [
+    "ntlha",
+    "bobedi",
+    "boraro",
+    "bone",
+    "botlhano",
+    "borataro",
+    "bosupa",
+    "borobedi",
+    "borobongwe",
+    "bolesome",
+    "bolesomengwe",
+    "bolesomepedi",
+    "bolesometharo",
+    "bolesomenne",
+    "bolesometlhano",
+    "bolesomethataro",
+    "bolesomesupa",
+    "bolesomerobedi",
+    "bolesomerobongwe",
+    "somamabedi",
+    "someamararo",
+    "someamane",
+    "someamatlhano",
+    "someamarataro",
+    "someamasupa",
+    "someamarobedi",
+    "someamarobongwe",
+    "lekgolo",
+    "sekete",
+    "milione",
+    "bilione",
+    "terilione",
+    "kwatirilione",
+    "gajillione",
+    "bazillione",
+]
+
+def like_num(text):
+    """Return True if ``text`` looks like a number.
+
+    A single leading "+", "-", "±" or "~" is dropped and every "," and "."
+    is removed before checking. The remainder counts as number-like when it
+    is all digits, a digit/digit fraction such as "1/2", an entry of
+    ``_num_words`` or ``_ordinal_words`` (compared in lowercase), or digits
+    followed by "th".
+    """
+    unsigned = text[1:] if text.startswith(("+", "-", "±", "~")) else text
+    cleaned = unsigned.replace(",", "").replace(".", "")
+    if cleaned.isdigit():
+        return True
+
+    # A fraction needs digits on both sides of exactly one "/": a second "/"
+    # would end up in the denominator and fail its isdigit() check.
+    numerator, slash, denominator = cleaned.partition("/")
+    if slash and numerator.isdigit() and denominator.isdigit():
+        return True
+
+    lowered = cleaned.lower()
+    if lowered in _num_words or lowered in _ordinal_words:
+        return True
+
+    # Digits carrying a "th" suffix, e.g. "10th".
+    return lowered.endswith("th") and lowered[:-2].isdigit()
+
+
+# Maps the LIKE_NUM attribute to the like_num function defined above.
+LEX_ATTRS = {LIKE_NUM: like_num}
--- a/spacy/lang/tn/punctuation.py
+++ b/spacy/lang/tn/punctuation.py
@ -0,0 +1,19 @@
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
+from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+
+# Infix patterns: the imported LIST_ELLIPSES and LIST_ICONS entries followed
+# by the regexes below. Each regex uses a lookbehind/lookahead pair so only
+# the character(s) between the two contexts are matched.
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        # One of + - * ^ preceded by a digit and followed by a digit or "-".
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        # "." preceded by an ALPHA_LOWER or CONCAT_QUOTES character and
+        # followed by an ALPHA_UPPER or CONCAT_QUOTES character.
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        # "," with an ALPHA character on both sides.
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        # Any HYPHENS alternative preceded by an ALPHA character or digit and
+        # followed by an ALPHA character.
+        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
+        # One of : < > = / preceded by an ALPHA character or digit and
+        # followed by an ALPHA character.
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+    ]
+)
+
+
+# Public alias for the infix pattern list built above.
+TOKENIZER_INFIXES = _infixes
--- a/spacy/lang/tn/stop_words.py
+++ b/spacy/lang/tn/stop_words.py
@ -0,0 +1,18 @@
+# Setswana stop words. The literal is whitespace-separated text that is split
+# into a set, so line breaks, extra spaces and repeated words do not matter.
+STOP_WORDS = set("""
+ke gareng ga selekanyo tlhwatlhwa yo mongwe se 
+sengwe fa go le jalo gongwe ba na mo tikologong
+jaaka kwa morago nna gonne ka sa pele nako teng 
+tlase fela ntle magareng tsona feta bobedi kgabaganya
+moo gape kgatlhanong botlhe tsotlhe bokana e esi
+setseng mororo dinako golo kgolo nnye wena gago 
+o ntse ntle tla goreng gangwe mang yotlhe gore 
+eo yona tseraganyo eng ne sentle re rona thata 
+godimo fitlha pedi masomamabedi lesomepedi mmogo 
+tharo tseo boraro tseno yone jaanong bobona bona 
+lesome tsaya tsamaiso nngwe masomethataro thataro 
+tsa mmatota tota sale thoko supa dira tshwanetse di mmalwa masisi
+bonala e tshwanang bogolo tsenya tsweetswee karolo 
+sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa 
+tlhano lesometlhano botlalo lekgolo           
+""".split())
--- a/spacy/tests/lang/test_initialize.py
+++ b/spacy/tests/lang/test_initialize.py
@ -8,7 +8,8 @@ from spacy.util import get_lang_class
 LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
             "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is",
             "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk",
-             "sl", "sq", "sr", "sv", "ta", "te", "tl", "tr", "tt", "ur", 'yo']
+             "sl", "sq", "sr", "sv", "ta", "te", "tl", "tn", "tr", "tt", "ur",
+             "yo"]
 # fmt: on