mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Merge pull request #7011 from Shumie82/master
This commit is contained in:
commit
34ee0fbd70
18
spacy/lang/tn/__init__.py
Normal file
18
spacy/lang/tn/__init__.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_INFIXES
|
||||
from ...language import Language
|
||||
|
||||
|
||||
class SetswanaDefaults(Language.Defaults):
    # Language-specific defaults for Setswana; spaCy reads these when the
    # Setswana Language subclass is instantiated.
    infixes = TOKENIZER_INFIXES  # infix tokenization patterns (punctuation.py)
    stop_words = STOP_WORDS  # stop-word set (stop_words.py)
    lex_attr_getters = LEX_ATTRS  # lexical attribute getters, e.g. LIKE_NUM (lex_attrs.py)
|
||||
|
||||
|
||||
class Setswana(Language):
    # Setswana (Tswana); ISO 639-1 code "tn".
    lang = "tn"
    Defaults = SetswanaDefaults


# Public API of this module.
__all__ = ["Setswana"]
|
15
spacy/lang/tn/examples.py
Normal file
15
spacy/lang/tn/examples.py
Normal file
|
@ -0,0 +1,15 @@
|
|||
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.tn.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


# A small set of Setswana example sentences used for smoke-testing the
# language class (tokenization, lexical attributes, etc.).
sentences = [
    "Apple e nyaka go reka JSE ka tlhwatlhwa ta R1 billion",
    "Johannesburg ke toropo e kgolo mo Afrika Borwa.",
    "O ko kae?",
    "ke mang presidente ya Afrika Borwa?",
    "ke eng toropo kgolo ya Afrika Borwa?",
    "Nelson Mandela o belegwe leng?",
]
|
106
spacy/lang/tn/lex_attrs.py
Normal file
106
spacy/lang/tn/lex_attrs.py
Normal file
|
@ -0,0 +1,106 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
|
||||
_num_words = [
|
||||
"lefela",
|
||||
"nngwe",
|
||||
"pedi",
|
||||
"tharo",
|
||||
"nne",
|
||||
"tlhano",
|
||||
"thataro",
|
||||
"supa",
|
||||
"robedi",
|
||||
"robongwe",
|
||||
"lesome",
|
||||
"lesomenngwe",
|
||||
"lesomepedi",
|
||||
"sometharo",
|
||||
"somenne",
|
||||
"sometlhano",
|
||||
"somethataro",
|
||||
"somesupa",
|
||||
"somerobedi",
|
||||
"somerobongwe",
|
||||
"someamabedi",
|
||||
"someamararo",
|
||||
"someamane",
|
||||
"someamatlhano",
|
||||
"someamarataro",
|
||||
"someamasupa",
|
||||
"someamarobedi",
|
||||
"someamarobongwe",
|
||||
"lekgolo",
|
||||
"sekete",
|
||||
"milione",
|
||||
"bilione",
|
||||
"terilione",
|
||||
"kwatirilione",
|
||||
"gajillione",
|
||||
"bazillione",
|
||||
]
|
||||
|
||||
|
||||
_ordinal_words = [
|
||||
"ntlha",
|
||||
"bobedi",
|
||||
"boraro",
|
||||
"bone",
|
||||
"botlhano",
|
||||
"borataro",
|
||||
"bosupa",
|
||||
"borobedi ",
|
||||
"borobongwe",
|
||||
"bolesome",
|
||||
"bolesomengwe",
|
||||
"bolesomepedi",
|
||||
"bolesometharo",
|
||||
"bolesomenne",
|
||||
"bolesometlhano",
|
||||
"bolesomethataro",
|
||||
"bolesomesupa",
|
||||
"bolesomerobedi",
|
||||
"bolesomerobongwe",
|
||||
"somamabedi",
|
||||
"someamararo",
|
||||
"someamane",
|
||||
"someamatlhano",
|
||||
"someamarataro",
|
||||
"someamasupa",
|
||||
"someamarobedi",
|
||||
"someamarobongwe",
|
||||
"lekgolo",
|
||||
"sekete",
|
||||
"milione",
|
||||
"bilione",
|
||||
"terilione",
|
||||
"kwatirilione",
|
||||
"gajillione",
|
||||
"bazillione",
|
||||
]
|
||||
|
||||
def like_num(text):
    """Return True if *text* looks numeric: plain digits, a simple fraction,
    a Setswana cardinal/ordinal word, or a digit-based "<n>th" ordinal."""
    # Drop a single leading sign, then remove thousands/decimal separators.
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    stripped = text.replace(",", "").replace(".", "")
    if stripped.isdigit():
        return True
    # Simple fraction such as "3/4".
    if stripped.count("/") == 1:
        numerator, denominator = stripped.split("/")
        if numerator.isdigit() and denominator.isdigit():
            return True

    # Cardinal or ordinal number word (matched case-insensitively).
    lowered = stripped.lower()
    if lowered in _num_words or lowered in _ordinal_words:
        return True

    # Digit-based ordinal such as "5th".
    return lowered.endswith("th") and lowered[:-2].isdigit()
|
||||
|
||||
|
||||
# Exported lexical-attribute getters; spaCy calls like_num() to set the
# LIKE_NUM flag on each token.
LEX_ATTRS = {LIKE_NUM: like_num}
|
19
spacy/lang/tn/punctuation.py
Normal file
19
spacy/lang/tn/punctuation.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
|
||||
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
|
||||
|
||||
# Setswana-specific infix patterns: arithmetic operators between digits,
# sentence-internal periods between lower/upper-case letters, commas,
# hyphens, and comparison/slash symbols between alphanumeric characters.
_custom_infixes = [
    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
    ),
    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
    r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
    r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]

# Shared ellipsis/icon patterns first, then the custom ones.
_infixes = LIST_ELLIPSES + LIST_ICONS + _custom_infixes


TOKENIZER_INFIXES = _infixes
|
18
spacy/lang/tn/stop_words.py
Normal file
18
spacy/lang/tn/stop_words.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
# Stop words for Setswana: high-frequency function words that carry little
# content meaning, split from a whitespace-separated block.
STOP_WORDS = set("""
ke gareng ga selekanyo tlhwatlhwa yo mongwe se
sengwe fa go le jalo gongwe ba na mo tikologong
jaaka kwa morago nna gonne ka sa pele nako teng
tlase fela ntle magareng tsona feta bobedi kgabaganya
moo gape kgatlhanong botlhe tsotlhe bokana e esi
setseng mororo dinako golo kgolo nnye wena gago
o ntse ntle tla goreng gangwe mang yotlhe gore
eo yona tseraganyo eng ne sentle re rona thata
godimo fitlha pedi masomamabedi lesomepedi mmogo
tharo tseo boraro tseno yone jaanong bobona bona
lesome tsaya tsamaiso nngwe masomethataro thataro
tsa mmatota tota sale thoko supa dira tshwanetse di mmalwa masisi
bonala e tshwanang bogolo tsenya tsweetswee karolo
sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa
tlhano lesometlhano botlalo lekgolo
""".split())
|
|
@ -8,7 +8,8 @@ from spacy.util import get_lang_class
|
|||
LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
|
||||
"et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is",
|
||||
"it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk",
|
||||
"sl", "sq", "sr", "sv", "ta", "te", "tl", "tr", "tt", "ur", 'yo']
|
||||
"sl", "sq", "sr", "sv", "ta", "te", "tl", "tn", "tr", "tt", "ur",
|
||||
"yo"]
|
||||
# fmt: on
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user