Added gd language folder (#13570)

Implemented a foundational Scottish Gaelic (gd) language option with tokenizer_exceptions and stop_words files.
This commit is contained in:
Mark Liberko 2024-09-09 04:14:09 -05:00 committed by GitHub
parent 319e02545c
commit 55db9c2e87
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 2386 additions and 0 deletions

18
spacy/lang/gd/__init__.py Normal file
View File

@ -0,0 +1,18 @@
from typing import Optional
from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class ScottishDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS
class Scottish(Language):
lang = "gd"
Defaults = ScottishDefaults
__all__ = ["Scottish"]

386
spacy/lang/gd/stop_words.py Normal file
View File

@ -0,0 +1,386 @@
STOP_WORDS = set(
"""
'ad
'ar
'd # iad
'g # ag
'ga
'gam
'gan
'gar
'gur
'm # am
'n # an
'n seo
'na
'nad
'nam
'nan
'nar
'nuair
'nur
's
'sa
'san
'sann
'se
'sna
a
a'
a'd # agad
a'm # agam
a-chèile
a-seo
a-sin
a-siud
a chionn
a chionn 's
a chèile
a chéile
a dh'
a h-uile
a seo
ac' # aca
aca
aca-san
acasan
ach
ag
agad
agad-sa
agads'
agadsa
agaibh
agaibhse
againn
againne
agam
agam-sa
agams'
agamsa
agus
aice
aice-se
aicese
aig
aig' # aige
aige
aige-san
aigesan
air
air-san
air neo
airsan
am
an
an seo
an sin
an siud
an uair
ann
ann a
ann a'
ann a shin
ann am
ann an
annad
annam
annam-s'
annamsa
anns
anns an
annta
aon
ar
as
asad
asda
asta
b'
bho
bhon
bhuaidhe # bhuaithe
bhuainn
bhuaipe
bhuaithe
bhuapa
bhur
brì
bu
c'à
car son
carson
cha
chan
chionn
choir
chon
chun
chèile
chéile
chòir
cia mheud
ciamar
co-dhiubh
cuide
cuin
cuin'
cuine
'
càil
càit
càit'
càite
mheud
d'
da
de
dh'
dha
dhaibh
dhaibh-san
dhaibhsan
dhan
dhasan
dhe
dhen
dheth
dhi
dhiom
dhiot
dhith
dhiubh
dhomh
dhomh-s'
dhomhsa
dhu'sa # dhut-sa
dhuibh
dhuibhse
dhuinn
dhuinne
dhuit
dhut
dhutsa
dhut-sa
dhà
dhà-san
dhàsan
dhòmhsa
diubh
do
docha
don
mar
mar
dòch'
dòcha
e
eadar
eatarra
eatorra
eile
esan
fa
far
feud
fhad
fheudar
fhearr
fhein
fheudar
fheàrr
fhèin
fhéin
fhìn
fo
fodha
fodhainn
foipe
fon
fèin
ga
gach
gam
gan
ge brith
ged
gu
gu
gu ruige
gun
gur
gus
i
iad
iadsan
innte
is
ise
le
leam
leam-sa
leamsa
leat
leat-sa
leatha
leatsa
leibh
leis
leis-san
leoth'
leotha
leotha-san
linn
m'
m'a
ma
mac
man
mar
mas
mathaid
mi
mis'
mise
mo
mu
mu 'n
mun
mur
mura
mus
na
na b'
na bu
na iad
nach
nad
nam
nan
nar
nas
neo
no
nuair
o
o'n
oir
oirbh
oirbh-se
oirnn
oirnne
oirre
on
orm
orm-sa
ormsa
orra
orra-san
orrasan
ort
os
r'
ri
ribh
rinn
ris
rithe
rithe-se
rium
rium-sa
riums'
riumsa
riut
riuth'
riutha
riuthasan
ro
ro'n
roimh
roimhe
romhainn
romham
romhpa
ron
ruibh
ruinn
ruinne
sa
san
sann
se
seach
seo
seothach
shin
sibh
sibh-se
sibhse
sin
sineach
sinn
sinne
siod
siodach
siud
siudach
sna # ann an
t'
tarsaing
tarsainn
tarsuinn
thar
thoigh
thro
thu
thuc'
thuca
thugad
thugaibh
thugainn
thugam
thugamsa
thuice
thuige
thus'
thusa
timcheall
toigh
toil
tro
tro' # troimh
troimh
troimhe
tron
tu
tusa
uair
ud
ugaibh
ugam-s'
ugam-sa
uice
uige
uige-san
umad
unnta # ann an
ur
urrainn
à
às
àsan
á
ás
è
ì
ò
ó
""".split("\n")
)

File diff suppressed because it is too large Load Diff