Merge pull request #7011 from Shumie82/master

This commit is contained in:
Ines Montani 2021-02-13 12:30:42 +11:00 committed by GitHub
commit 34ee0fbd70
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 178 additions and 1 deletions

18
spacy/lang/tn/__init__.py Normal file
View File

@ -0,0 +1,18 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from ...language import Language
class SetswanaDefaults(Language.Defaults):
    """Default language data for Setswana.

    Wires the package's stop-word set, lexical attribute getters and
    tokenizer infix patterns into the defaults inherited from
    ``Language.Defaults``.
    """

    stop_words = STOP_WORDS
    lex_attr_getters = LEX_ATTRS
    infixes = TOKENIZER_INFIXES
class Setswana(Language):
    """Language subclass registered under the code ``"tn"``.

    Uses ``SetswanaDefaults`` for its language data.
    """

    Defaults = SetswanaDefaults
    lang = "tn"
# Public API of this package: only the language class is exported.
__all__ = ["Setswana"]

15
spacy/lang/tn/examples.py Normal file
View File

@ -0,0 +1,15 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.tn.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
# Sample sentences, exposed as a plain list of strings so they can be passed
# directly to ``nlp.pipe`` as shown in the module docstring.
sentences = [
    "Apple e nyaka go reka JSE ka tlhwatlhwa ta R1 billion",
    "Johannesburg ke toropo e kgolo mo Afrika Borwa.",
    "O ko kae?",
    "ke mang presidente ya Afrika Borwa?",
    "ke eng toropo kgolo ya Afrika Borwa?",
    "Nelson Mandela o belegwe leng?",
]

106
spacy/lang/tn/lex_attrs.py Normal file
View File

@ -0,0 +1,106 @@
from ...attrs import LIKE_NUM
_num_words = [
"lefela",
"nngwe",
"pedi",
"tharo",
"nne",
"tlhano",
"thataro",
"supa",
"robedi",
"robongwe",
"lesome",
"lesomenngwe",
"lesomepedi",
"sometharo",
"somenne",
"sometlhano",
"somethataro",
"somesupa",
"somerobedi",
"somerobongwe",
"someamabedi",
"someamararo",
"someamane",
"someamatlhano",
"someamarataro",
"someamasupa",
"someamarobedi",
"someamarobongwe",
"lekgolo",
"sekete",
"milione",
"bilione",
"terilione",
"kwatirilione",
"gajillione",
"bazillione",
]
_ordinal_words = [
"ntlha",
"bobedi",
"boraro",
"bone",
"botlhano",
"borataro",
"bosupa",
"borobedi ",
"borobongwe",
"bolesome",
"bolesomengwe",
"bolesomepedi",
"bolesometharo",
"bolesomenne",
"bolesometlhano",
"bolesomethataro",
"bolesomesupa",
"bolesomerobedi",
"bolesomerobongwe",
"somamabedi",
"someamararo",
"someamane",
"someamatlhano",
"someamarataro",
"someamasupa",
"someamarobedi",
"someamarobongwe",
"lekgolo",
"sekete",
"milione",
"bilione",
"terilione",
"kwatirilione",
"gajillione",
"bazillione",
]
def like_num(text):
    """Return True if ``text`` looks like a number, otherwise False.

    Accepted forms, checked in this order:
      * digits, optionally preceded by one of ``+ - ± ~`` and containing
        ``,`` or ``.`` separators (which are removed before testing);
      * a simple fraction ``<digits>/<digits>``;
      * a word listed in ``_num_words`` or ``_ordinal_words``
        (case-insensitive);
      * digits followed by the suffix ``th``.
    """
    # Drop a single leading sign/approximation marker.
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    # Remove thousands/decimal separators before the digit tests.
    cleaned = text.replace(",", "").replace(".", "")
    if cleaned.isdigit():
        return True

    # Fractions: a second "/" ends up in the denominator and fails isdigit().
    numerator, _, denominator = cleaned.partition("/")
    if numerator.isdigit() and denominator.isdigit():
        return True

    lowered = cleaned.lower()
    # Check cardinal, then ordinal number words.
    if lowered in _num_words or lowered in _ordinal_words:
        return True

    # Digit ordinals written with a "th" suffix.
    return lowered.endswith("th") and lowered[:-2].isdigit()
# Lexical attribute getters exported by this module: LIKE_NUM -> like_num.
LEX_ATTRS = {LIKE_NUM: like_num}

View File

@ -0,0 +1,19 @@
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
# Infix patterns: the shared ellipsis and icon lists followed by regexes
# built from the character-class constants imported above.
_infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        # One of + - * ^ preceded by a digit and followed by a digit or "-".
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        # "." between an ALPHA_LOWER/quote char and an ALPHA_UPPER/quote char.
        rf"(?<=[{ALPHA_LOWER}{CONCAT_QUOTES}])\.(?=[{ALPHA_UPPER}{CONCAT_QUOTES}])",
        # "," between two ALPHA characters.
        rf"(?<=[{ALPHA}]),(?=[{ALPHA}])",
        # Any HYPHENS alternative after an ALPHA char or digit, before ALPHA.
        rf"(?<=[{ALPHA}0-9])(?:{HYPHENS})(?=[{ALPHA}])",
        # One of : < > = / after an ALPHA char or digit, before ALPHA.
        rf"(?<=[{ALPHA}0-9])[:<>=/](?=[{ALPHA}])",
    ]
)

# Public name imported by the language package.
TOKENIZER_INFIXES = _infixes

View File

@ -0,0 +1,18 @@
# Stop words: whitespace-separated entries collected into a set, so repeated
# entries in the text below collapse to a single member.
STOP_WORDS = set(
    """
ke gareng ga selekanyo tlhwatlhwa yo mongwe se
sengwe fa go le jalo gongwe ba na mo tikologong
jaaka kwa morago nna gonne ka sa pele nako teng
tlase fela ntle magareng tsona feta bobedi kgabaganya
moo gape kgatlhanong botlhe tsotlhe bokana e esi
setseng mororo dinako golo kgolo nnye wena gago
o ntse ntle tla goreng gangwe mang yotlhe gore
eo yona tseraganyo eng ne sentle re rona thata
godimo fitlha pedi masomamabedi lesomepedi mmogo
tharo tseo boraro tseno yone jaanong bobona bona
lesome tsaya tsamaiso nngwe masomethataro thataro
tsa mmatota tota sale thoko supa dira tshwanetse di mmalwa masisi
bonala e tshwanang bogolo tsenya tsweetswee karolo
sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa
tlhano lesometlhano botlalo lekgolo
""".split()
)

View File

@ -8,7 +8,8 @@ from spacy.util import get_lang_class
LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
"et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is",
"it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk",
"sl", "sq", "sr", "sv", "ta", "te", "tl", "tr", "tt", "ur", 'yo']
"sl", "sq", "sr", "sv", "ta", "te", "tl", "tn", "tr", "tt", "ur",
"yo"]
# fmt: on