From 2e31921d0af8fe8bbcf608d18e1f50b2d64b661a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 14 Feb 2019 15:31:19 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=92=AB=20Add=20base=20Language=20classes?= =?UTF-8?q?=20for=20more=20languages=20(#3276)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add base classes for more languages * Add test for language class initialization Make sure languages can be initialized – otherwise, it's difficult to catch serious errors in the test suite, because languages are lazy-loaded --- spacy/lang/af/__init__.py | 20 ++ spacy/lang/af/stop_words.py | 61 ++++ spacy/lang/bg/__init__.py | 20 ++ spacy/lang/bg/stop_words.py | 269 ++++++++++++++++ spacy/lang/cs/__init__.py | 20 ++ spacy/lang/cs/stop_words.py | 266 +++++++++++++++ spacy/lang/et/__init__.py | 20 ++ spacy/lang/et/stop_words.py | 45 +++ spacy/lang/is/__init__.py | 20 ++ spacy/lang/is/stop_words.py | 162 ++++++++++ spacy/lang/lt/__init__.py | 20 ++ spacy/lang/lt/stop_words.py | 484 ++++++++++++++++++++++++++++ spacy/lang/lv/__init__.py | 20 ++ spacy/lang/lv/stop_words.py | 171 ++++++++++ spacy/lang/sk/__init__.py | 20 ++ spacy/lang/sk/stop_words.py | 231 +++++++++++++ spacy/lang/sl/__init__.py | 20 ++ spacy/lang/sl/stop_words.py | 458 ++++++++++++++++++++++++++ spacy/lang/sq/__init__.py | 20 ++ spacy/lang/sq/stop_words.py | 233 +++++++++++++ spacy/tests/lang/test_initialize.py | 21 ++ 21 files changed, 2601 insertions(+) create mode 100644 spacy/lang/af/__init__.py create mode 100644 spacy/lang/af/stop_words.py create mode 100644 spacy/lang/bg/__init__.py create mode 100644 spacy/lang/bg/stop_words.py create mode 100644 spacy/lang/cs/__init__.py create mode 100644 spacy/lang/cs/stop_words.py create mode 100644 spacy/lang/et/__init__.py create mode 100644 spacy/lang/et/stop_words.py create mode 100644 spacy/lang/is/__init__.py create mode 100644 spacy/lang/is/stop_words.py create mode 100644 spacy/lang/lt/__init__.py create mode 
100644 spacy/lang/lt/stop_words.py create mode 100644 spacy/lang/lv/__init__.py create mode 100644 spacy/lang/lv/stop_words.py create mode 100644 spacy/lang/sk/__init__.py create mode 100644 spacy/lang/sk/stop_words.py create mode 100644 spacy/lang/sl/__init__.py create mode 100644 spacy/lang/sl/stop_words.py create mode 100644 spacy/lang/sq/__init__.py create mode 100644 spacy/lang/sq/stop_words.py create mode 100644 spacy/tests/lang/test_initialize.py diff --git a/spacy/lang/af/__init__.py b/spacy/lang/af/__init__.py new file mode 100644 index 000000000..90ea324f0 --- /dev/null +++ b/spacy/lang/af/__init__.py @@ -0,0 +1,20 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from ...language import Language +from ...attrs import LANG + + +class AfrikaansDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "af" + stop_words = STOP_WORDS + + +class Afrikaans(Language): + lang = "af" + Defaults = AfrikaansDefaults + + +__all__ = ["Afrikaans"] diff --git a/spacy/lang/af/stop_words.py b/spacy/lang/af/stop_words.py new file mode 100644 index 000000000..2b3bcc019 --- /dev/null +++ b/spacy/lang/af/stop_words.py @@ -0,0 +1,61 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/stopwords-iso/stopwords-af + +STOP_WORDS = set( + """ +'n +aan +af +al +as +baie +by +daar +dag +dat +die +dit +een +ek +en +gaan +gesê +haar +het +hom +hulle +hy +in +is +jou +jy +kan +kom +ma +maar +met +my +na +nie +om +ons +op +saam +sal +se +sien +so +sy +te +toe +uit +van +vir +was +wat +ʼn +""".split() +) diff --git a/spacy/lang/bg/__init__.py b/spacy/lang/bg/__init__.py new file mode 100644 index 000000000..9b4c647e3 --- /dev/null +++ b/spacy/lang/bg/__init__.py @@ -0,0 +1,20 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from ...language import Language +from ...attrs import 
LANG + + +class BulgarianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "bg" + stop_words = STOP_WORDS + + +class Bulgarian(Language): + lang = "bg" + Defaults = BulgarianDefaults + + +__all__ = ["Bulgarian"] diff --git a/spacy/lang/bg/stop_words.py b/spacy/lang/bg/stop_words.py new file mode 100644 index 000000000..e7c65cbc2 --- /dev/null +++ b/spacy/lang/bg/stop_words.py @@ -0,0 +1,269 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/Alir3z4/stop-words + +STOP_WORDS = set( + """ +а +автентичен +аз +ако +ала +бе +без +беше +би +бивш +бивша +бившо +бил +била +били +било +благодаря +близо +бъдат +бъде +бяха +в +вас +ваш +ваша +вероятно +вече +взема +ви +вие +винаги +внимава +време +все +всеки +всички +всичко +всяка +във +въпреки +върху +г +ги +главен +главна +главно +глас +го +година +години +годишен +д +да +дали +два +двама +двамата +две +двете +ден +днес +дни +до +добра +добре +добро +добър +докато +докога +дори +досега +доста +друг +друга +други +е +евтин +едва +един +една +еднаква +еднакви +еднакъв +едно +екип +ето +живот +за +забавям +зад +заедно +заради +засега +заспал +затова +защо +защото +и +из +или +им +има +имат +иска +й +каза +как +каква +какво +както +какъв +като +кога +когато +което +които +кой +който +колко +която +къде +където +към +лесен +лесно +ли +лош +м +май +малко +ме +между +мек +мен +месец +ми +много +мнозина +мога +могат +може +мокър +моля +момента +му +н +на +над +назад +най +направи +напред +например +нас +не +него +нещо +нея +ни +ние +никой +нито +нищо +но +нов +нова +нови +новина +някои +някой +няколко +няма +обаче +около +освен +особено +от +отгоре +отново +още +пак +по +повече +повечето +под +поне +поради +после +почти +прави +пред +преди +през +при +пък +първата +първи +първо +пъти +равен +равна +с +са +сам +само +се +сега +си +син +скоро +след +следващ +сме +смях +според +сред +срещу +сте +съм +със +също +т 
+тази +така +такива +такъв +там +твой +те +тези +ти +т.н. +то +това +тогава +този +той +толкова +точно +три +трябва +тук +тъй +тя +тях +у +утре +харесва +хиляди +ч +часа +че +често +чрез +ще +щом +юмрук +я +як +""".split() +) diff --git a/spacy/lang/cs/__init__.py b/spacy/lang/cs/__init__.py new file mode 100644 index 000000000..5b1397ba2 --- /dev/null +++ b/spacy/lang/cs/__init__.py @@ -0,0 +1,20 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from ...language import Language +from ...attrs import LANG + + +class CzechDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "cs" + stop_words = STOP_WORDS + + +class Czech(Language): + lang = "cs" + Defaults = CzechDefaults + + +__all__ = ["Czech"] diff --git a/spacy/lang/cs/stop_words.py b/spacy/lang/cs/stop_words.py new file mode 100644 index 000000000..59d3c102e --- /dev/null +++ b/spacy/lang/cs/stop_words.py @@ -0,0 +1,266 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/Alir3z4/stop-words + +STOP_WORDS = set( + """ +ačkoli +ahoj +ale +anebo +ano +asi +aspoň +během +bez +beze +blízko +bohužel +brzo +bude +budeme +budeš +budete +budou +budu +byl +byla +byli +bylo +byly +bys +čau +chce +chceme +chceš +chcete +chci +chtějí +chtít +chut' +chuti +co +čtrnáct +čtyři +dál +dále +daleko +děkovat +děkujeme +děkuji +den +deset +devatenáct +devět +do +dobrý +docela +dva +dvacet +dvanáct +dvě +hodně +já +jak +jde +je +jeden +jedenáct +jedna +jedno +jednou +jedou +jeho +její +jejich +jemu +jen +jenom +ještě +jestli +jestliže +jí +jich +jím +jimi +jinak +jsem +jsi +jsme +jsou +jste +kam +kde +kdo +kdy +když +ke +kolik +kromě +která +které +kteří +který +kvůli +má +mají +málo +mám +máme +máš +máte +mé +mě +mezi +mí +mít +mně +mnou +moc +mohl +mohou +moje +moji +možná +můj +musí +může +my +na +nad +nade +nám +námi +naproti +nás +náš +naše +naši +ne +ně +nebo 
+nebyl +nebyla +nebyli +nebyly +něco +nedělá +nedělají +nedělám +neděláme +neděláš +neděláte +nějak +nejsi +někde +někdo +nemají +nemáme +nemáte +neměl +němu +není +nestačí +nevadí +než +nic +nich +ním +nimi +nula +od +ode +on +ona +oni +ono +ony +osm +osmnáct +pak +patnáct +pět +po +pořád +potom +pozdě +před +přes +přese +pro +proč +prosím +prostě +proti +protože +rovně +se +sedm +sedmnáct +šest +šestnáct +skoro +smějí +smí +snad +spolu +sta +sté +sto +ta +tady +tak +takhle +taky +tam +tamhle +tamhleto +tamto +tě +tebe +tebou +ted' +tedy +ten +ti +tisíc +tisíce +to +tobě +tohle +toto +třeba +tři +třináct +trošku +tvá +tvé +tvoje +tvůj +ty +určitě +už +vám +vámi +vás +váš +vaše +vaši +ve +večer +vedle +vlastně +všechno +všichni +vůbec +vy +vždy +za +zač +zatímco +ze +že +""".split() +) diff --git a/spacy/lang/et/__init__.py b/spacy/lang/et/__init__.py new file mode 100644 index 000000000..d84c081ef --- /dev/null +++ b/spacy/lang/et/__init__.py @@ -0,0 +1,20 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from ...language import Language +from ...attrs import LANG + + +class EstonianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "et" + stop_words = STOP_WORDS + + +class Estonian(Language): + lang = "et" + Defaults = EstonianDefaults + + +__all__ = ["Estonian"] diff --git a/spacy/lang/et/stop_words.py b/spacy/lang/et/stop_words.py new file mode 100644 index 000000000..15070db5f --- /dev/null +++ b/spacy/lang/et/stop_words.py @@ -0,0 +1,45 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/stopwords-iso/stopwords-et + +STOP_WORDS = set( + """ +aga +ei +et +ja +jah +kas +kui +kõik +ma +me +mida +midagi +mind +minu +mis +mu +mul +mulle +nad +nii +oled +olen +oli +oma +on +pole +sa +seda +see +selle +siin +siis +ta +te +ära +""".split() +) diff --git a/spacy/lang/is/__init__.py 
b/spacy/lang/is/__init__.py new file mode 100644 index 000000000..18e41432d --- /dev/null +++ b/spacy/lang/is/__init__.py @@ -0,0 +1,20 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from ...language import Language +from ...attrs import LANG + + +class IcelandicDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "is" + stop_words = STOP_WORDS + + +class Icelandic(Language): + lang = "is" + Defaults = IcelandicDefaults + + +__all__ = ["Icelandic"] diff --git a/spacy/lang/is/stop_words.py b/spacy/lang/is/stop_words.py new file mode 100644 index 000000000..e4ae0498b --- /dev/null +++ b/spacy/lang/is/stop_words.py @@ -0,0 +1,162 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/Xangis/extra-stopwords + +STOP_WORDS = set( + """ +afhverju +aftan +aftur +afþví +aldrei +allir +allt +alveg +annað +annars +bara +dag +eða +eftir +eiga +einhver +einhverjir +einhvers +eins +einu +eitthvað +ekkert +ekki +ennþá +eru +fara +fer +finna +fjöldi +fólk +framan +frá +frekar +fyrir +gegnum +geta +getur +gmg +gott +hann +hafa +hef +hefur +heyra +hér +hérna +hjá +hún +hvað +hvar +hver +hverjir +hverjum +hvernig +hvor +hvort +hægt +img +inn +kannski +koma +líka +lol +maður +mátt +mér +með +mega +meira +mig +mikið +minna +minni +missa +mjög +nei +niður +núna +oft +okkar +okkur +póst +póstur +rofl +saman +sem +sér +sig +sinni +síðan +sjá +smá +smátt +spurja +spyrja +staðar +stórt +svo +svona +sælir +sæll +taka +takk +til +tilvitnun +titlar +upp +var +vel +velkomin +velkominn +vera +verður +verið +vel +við +vil +vilja +vill +vita +væri +yfir +ykkar +það +þakka +þakkir +þannig +það +þar +þarf +þau +þeim +þeir +þeirra +þeirra +þegar +þess +þessa +þessi +þessu +þessum +þetta +þér +þið +þinn +þitt +þín +þráð +þráður +því +þær +ætti +""".split() +) diff --git a/spacy/lang/lt/__init__.py b/spacy/lang/lt/__init__.py new file 
mode 100644 index 000000000..882eb0611 --- /dev/null +++ b/spacy/lang/lt/__init__.py @@ -0,0 +1,20 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from ...language import Language +from ...attrs import LANG + + +class LithuanianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "lt" + stop_words = STOP_WORDS + + +class Lithuanian(Language): + lang = "lt" + Defaults = LithuanianDefaults + + +__all__ = ["Lithuanian"] diff --git a/spacy/lang/lt/stop_words.py b/spacy/lang/lt/stop_words.py new file mode 100644 index 000000000..19554b594 --- /dev/null +++ b/spacy/lang/lt/stop_words.py @@ -0,0 +1,484 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/stopwords-iso/stopwords-lt + +STOP_WORDS = set( + """ +abi +abidvi +abiejose +abiejuose +abiejø +abiem +abigaliai +abipus +abu +abudu +ai +ana +anaiptol +anaisiais +anajai +anajam +anajame +anapus +anas +anasai +anasis +anei +aniedvi +anieji +aniesiems +anoji +anojo +anojoje +anokia +anoks +anosiomis +anosioms +anosios +anosiose +anot +ant +antai +anuodu +anuoju +anuosiuose +anuosius +anàja +anàjà +anàjá +anàsias +anøjø +apie +aplink +ar +arba +argi +arti +aukðèiau +að +be +bei +beje +bemaþ +bent +bet +betgi +beveik +dar +dargi +daugmaþ +deja +dëka +dël +dëlei +dëlto +ech +et +gal +galbût +galgi +gan +gana +gi +greta +idant +iki +ir +irgi +it +itin +ið +iðilgai +iðvis +jaisiais +jajai +jajam +jajame +jei +jeigu +ji +jiedu +jiedvi +jieji +jiesiems +jinai +jis +jisai +jog +joji +jojo +jojoje +jokia +joks +josiomis +josioms +josios +josiose +judu +judvi +juk +jumis +jums +jumyse +juodu +juoju +juosiuose +juosius +jus +jàja +jàjà +jàsias +jájá +jøjø +jûs +jûsiðkis +jûsiðkë +jûsø +kad +kada +kadangi +kai +kaip +kaipgi +kas +katra +katras +katriedvi +katruodu +kaþin +kaþkas +kaþkatra +kaþkatras +kaþkokia +kaþkoks +kaþkuri +kaþkuris +kiaurai +kiek +kiekvienas 
+kieno +kita +kitas +kitokia +kitoks +kodël +kokia +koks +kol +kolei +kone +kuomet +kur +kurgi +kuri +kuriedvi +kuris +kuriuodu +lai +lig +ligi +link +lyg +man +manaisiais +manajai +manajam +manajame +manas +manasai +manasis +mane +manieji +maniesiems +manim +manimi +maniðkis +maniðkë +mano +manoji +manojo +manojoje +manosiomis +manosioms +manosios +manosiose +manuoju +manuosiuose +manuosius +manyje +manàja +manàjà +manàjá +manàsias +manæs +manøjø +mat +maþdaug +maþne +mes +mudu +mudvi +mumis +mums +mumyse +mus +mûsiðkis +mûsiðkë +mûsø +na +nagi +ne +nebe +nebent +negi +negu +nei +nejau +nejaugi +nekaip +nelyginant +nes +net +netgi +netoli +neva +nors +nuo +në +o +ogi +oi +paeiliui +pagal +pakeliui +palaipsniui +palei +pas +pasak +paskos +paskui +paskum +pat +pati +patiems +paties +pats +patys +patá +paèiais +paèiam +paèiame +paèiu +paèiuose +paèius +paèiø +per +pernelyg +pirm +pirma +pirmiau +po +prie +prieð +prieðais +pro +pusiau +rasi +rodos +sau +savaisiais +savajai +savajam +savajame +savas +savasai +savasis +save +savieji +saviesiems +savimi +saviðkis +saviðkë +savo +savoji +savojo +savojoje +savosiomis +savosioms +savosios +savosiose +savuoju +savuosiuose +savuosius +savyje +savàja +savàjà +savàjá +savàsias +savæs +savøjø +skersai +skradþiai +staèiai +su +sulig +ta +tad +tai +taigi +taip +taipogi +taisiais +tajai +tajam +tajame +tamsta +tarp +tarsi +tartum +tarytum +tas +tasai +tau +tavaisiais +tavajai +tavajam +tavajame +tavas +tavasai +tavasis +tave +tavieji +taviesiems +tavimi +taviðkis +taviðkë +tavo +tavoji +tavojo +tavojoje +tavosiomis +tavosioms +tavosios +tavosiose +tavuoju +tavuosiuose +tavuosius +tavyje +tavàja +tavàjà +tavàjá +tavàsias +tavæs +tavøjø +taèiau +te +tegu +tegul +tiedvi +tieji +ties +tiesiems +tiesiog +tik +tikriausiai +tiktai +toji +tojo +tojoje +tokia +toks +tol +tolei +toliau +tosiomis +tosioms +tosios +tosiose +tu +tuodu +tuoju +tuosiuose +tuosius +turbût +tàja +tàjà +tàjá +tàsias +tøjø +tûlas +uþ +uþtat +uþvis +va +vai +viduj 
+vidury +vien +vienas +vienokia +vienoks +vietoj +virð +virðuj +virðum +vis +vis dëlto +visa +visas +visgi +visokia +visoks +vos +vël +vëlgi +ypaè +á +ákypai +ástriþai +ðalia +ðe +ði +ðiaisiais +ðiajai +ðiajam +ðiajame +ðiapus +ðiedvi +ðieji +ðiesiems +ðioji +ðiojo +ðiojoje +ðiokia +ðioks +ðiosiomis +ðiosioms +ðiosios +ðiosiose +ðis +ðisai +ðit +ðita +ðitas +ðitiedvi +ðitokia +ðitoks +ðituodu +ðiuodu +ðiuoju +ðiuosiuose +ðiuosius +ðiàja +ðiàjà +ðiàsias +ðiøjø +ðtai +ðájá +þemiau +""".split() +) diff --git a/spacy/lang/lv/__init__.py b/spacy/lang/lv/__init__.py new file mode 100644 index 000000000..bb8c0763b --- /dev/null +++ b/spacy/lang/lv/__init__.py @@ -0,0 +1,20 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from ...language import Language +from ...attrs import LANG + + +class LatvianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "lv" + stop_words = STOP_WORDS + + +class Latvian(Language): + lang = "lv" + Defaults = LatvianDefaults + + +__all__ = ["Latvian"] diff --git a/spacy/lang/lv/stop_words.py b/spacy/lang/lv/stop_words.py new file mode 100644 index 000000000..075ad6347 --- /dev/null +++ b/spacy/lang/lv/stop_words.py @@ -0,0 +1,171 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/stopwords-iso/stopwords-lv + +STOP_WORDS = set( + """ +aiz +ap +apakš +apakšpus +ar +arī +augšpus +bet +bez +bija +biji +biju +bijām +bijāt +būs +būsi +būsiet +būsim +būt +būšu +caur +diemžēl +diezin +droši +dēļ +esam +esat +esi +esmu +gan +gar +iekam +iekams +iekām +iekāms +iekš +iekšpus +ik +ir +it +itin +iz +ja +jau +jeb +jebšu +jel +jo +jā +ka +kamēr +kaut +kolīdz +kopš +kā +kļuva +kļuvi +kļuvu +kļuvām +kļuvāt +kļūs +kļūsi +kļūsiet +kļūsim +kļūst +kļūstam +kļūstat +kļūsti +kļūstu +kļūt +kļūšu +labad +lai +lejpus +līdz +līdzko +ne +nebūt +nedz +nekā +nevis +nezin +no +nu +nē +otrpus +pa +par +pat 
+pie +pirms +pret +priekš +pār +pēc +starp +tad +tak +tapi +taps +tapsi +tapsiet +tapsim +tapt +tapāt +tapšu +taču +te +tiec +tiek +tiekam +tiekat +tieku +tik +tika +tikai +tiki +tikko +tiklab +tiklīdz +tiks +tiksiet +tiksim +tikt +tiku +tikvien +tikām +tikāt +tikšu +tomēr +topat +turpretim +turpretī +tā +tādēļ +tālab +tāpēc +un +uz +vai +var +varat +varēja +varēji +varēju +varējām +varējāt +varēs +varēsi +varēsiet +varēsim +varēt +varēšu +vien +virs +virspus +vis +viņpus +zem +ārpus +šaipus +""".split() +) diff --git a/spacy/lang/sk/__init__.py b/spacy/lang/sk/__init__.py new file mode 100644 index 000000000..e7704196a --- /dev/null +++ b/spacy/lang/sk/__init__.py @@ -0,0 +1,20 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from ...language import Language +from ...attrs import LANG + + +class SlovakDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "sk" + stop_words = STOP_WORDS + + +class Slovak(Language): + lang = "sk" + Defaults = SlovakDefaults + + +__all__ = ["Slovak"] diff --git a/spacy/lang/sk/stop_words.py b/spacy/lang/sk/stop_words.py new file mode 100644 index 000000000..f6994d33f --- /dev/null +++ b/spacy/lang/sk/stop_words.py @@ -0,0 +1,231 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/stopwords-iso/stopwords-sk + +STOP_WORDS = set( + """ +a +aby +aj +ak +ako +aký +ale +alebo +and +ani +asi +avšak +až +ba +bez +bol +bola +boli +bolo +bude +budem +budeme +budete +budeš +budú +buï +buď +by +byť +cez +dnes +do +ešte +for +ho +hoci +i +iba +ich +im +iné +iný +ja +je +jeho +jej +jemu +ju +k +kam +každá +každé +každí +každý +kde +kedže +keï +keď +kto +ktorou +ktorá +ktoré +ktorí +ktorý +ku +lebo +len +ma +mať +medzi +menej +mi +mna +mne +mnou +moja +moje +mu +musieť +my +má +máte +mòa +môcť +môj +môže +na +nad +nami +naši +nech +neho +nej +nemu +než +nich +nie +niektorý +nielen 
+nim +nič +no +nová +nové +noví +nový +nám +nás +náš +ním +o +od +odo +of +on +ona +oni +ono +ony +po +pod +podľa +pokiaľ +potom +pre +pred +predo +preto +pretože +prečo +pri +prvá +prvé +prví +prvý +práve +pýta +s +sa +seba +sem +si +sme +so +som +späť +ste +svoj +svoje +svojich +svojím +svojími +sú +ta +tak +taký +takže +tam +te +teba +tebe +tebou +teda +tej +ten +tento +the +ti +tie +tieto +tiež +to +toho +tohoto +tom +tomto +tomu +tomuto +toto +tou +tu +tvoj +tvojími +ty +tá +táto +tú +túto +tým +týmto +tě +už +v +vami +vaše +veï +viac +vo +vy +vám +vás +váš +však +všetok +z +za +zo +a +áno +èi +èo +èí +òom +òou +òu +či +čo +ďalšia +ďalšie +ďalší +že +""".split() +) diff --git a/spacy/lang/sl/__init__.py b/spacy/lang/sl/__init__.py new file mode 100644 index 000000000..2d4977bdf --- /dev/null +++ b/spacy/lang/sl/__init__.py @@ -0,0 +1,20 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from ...language import Language +from ...attrs import LANG + + +class SlovenianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "sl" + stop_words = STOP_WORDS + + +class Slovenian(Language): + lang = "sl" + Defaults = SlovenianDefaults + + +__all__ = ["Slovenian"] diff --git a/spacy/lang/sl/stop_words.py b/spacy/lang/sl/stop_words.py new file mode 100644 index 000000000..187e95876 --- /dev/null +++ b/spacy/lang/sl/stop_words.py @@ -0,0 +1,458 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/stopwords-iso/stopwords-sl +# TODO: probably needs to be tidied up – the list seems to have month names in +# it, which shouldn't be considered stop words. 
+ +STOP_WORDS = set( + """ +a +ali +april +avgust +b +bi +bil +bila +bile +bili +bilo +biti +blizu +bo +bodo +bojo +bolj +bom +bomo +boste +bova +boš +brez +c +cel +cela +celi +celo +d +da +daleč +dan +danes +datum +december +deset +deseta +deseti +deseto +devet +deveta +deveti +deveto +do +dober +dobra +dobri +dobro +dokler +dol +dolg +dolga +dolgi +dovolj +drug +druga +drugi +drugo +dva +dve +e +eden +en +ena +ene +eni +enkrat +eno +etc. +f +februar +g +g. +ga +ga. +gor +gospa +gospod +h +halo +i +idr. +ii +iii +in +iv +ix +iz +j +januar +jaz +je +ji +jih +jim +jo +julij +junij +jutri +k +kadarkoli +kaj +kajti +kako +kakor +kamor +kamorkoli +kar +karkoli +katerikoli +kdaj +kdo +kdorkoli +ker +ki +kje +kjer +kjerkoli +ko +koder +koderkoli +koga +komu +kot +kratek +kratka +kratke +kratki +l +lahka +lahke +lahki +lahko +le +lep +lepa +lepe +lepi +lepo +leto +m +maj +majhen +majhna +majhni +malce +malo +manj +marec +me +med +medtem +mene +mesec +mi +midva +midve +mnogo +moj +moja +moje +mora +morajo +moram +moramo +morate +moraš +morem +mu +n +na +nad +naj +najina +najino +najmanj +naju +največ +nam +narobe +nas +nato +nazaj +naš +naša +naše +ne +nedavno +nedelja +nek +neka +nekaj +nekatere +nekateri +nekatero +nekdo +neke +nekega +neki +nekje +neko +nekoga +nekoč +ni +nikamor +nikdar +nikjer +nikoli +nič +nje +njega +njegov +njegova +njegovo +njej +njemu +njen +njena +njeno +nji +njih +njihov +njihova +njihovo +njiju +njim +njo +njun +njuna +njuno +no +nocoj +november +npr. +o +ob +oba +obe +oboje +od +odprt +odprta +odprti +okoli +oktober +on +onadva +one +oni +onidve +osem +osma +osmi +osmo +oz. +p +pa +pet +peta +petek +peti +peto +po +pod +pogosto +poleg +poln +polna +polni +polno +ponavadi +ponedeljek +ponovno +potem +povsod +pozdravljen +pozdravljeni +prav +prava +prave +pravi +pravo +prazen +prazna +prazno +prbl. +precej +pred +prej +preko +pri +pribl. 
+približno +primer +pripravljen +pripravljena +pripravljeni +proti +prva +prvi +prvo +r +ravno +redko +res +reč +s +saj +sam +sama +same +sami +samo +se +sebe +sebi +sedaj +sedem +sedma +sedmi +sedmo +sem +september +seveda +si +sicer +skoraj +skozi +slab +smo +so +sobota +spet +sreda +srednja +srednji +sta +ste +stran +stvar +sva +t +ta +tak +taka +take +taki +tako +takoj +tam +te +tebe +tebi +tega +težak +težka +težki +težko +ti +tista +tiste +tisti +tisto +tj. +tja +to +toda +torek +tretja +tretje +tretji +tri +tu +tudi +tukaj +tvoj +tvoja +tvoje +u +v +vaju +vam +vas +vaš +vaša +vaše +ve +vedno +velik +velika +veliki +veliko +vendar +ves +več +vi +vidva +vii +viii +visok +visoka +visoke +visoki +vsa +vsaj +vsak +vsaka +vsakdo +vsake +vsaki +vsakomur +vse +vsega +vsi +vso +včasih +včeraj +x +z +za +zadaj +zadnji +zakaj +zaprta +zaprti +zaprto +zdaj +zelo +zunaj +č +če +često +četrta +četrtek +četrti +četrto +čez +čigav +š +šest +šesta +šesti +šesto +štiri +ž +že +""".split() +) diff --git a/spacy/lang/sq/__init__.py b/spacy/lang/sq/__init__.py new file mode 100644 index 000000000..6f33b37c2 --- /dev/null +++ b/spacy/lang/sq/__init__.py @@ -0,0 +1,20 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from ...language import Language +from ...attrs import LANG + + +class AlbanianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "sq" + stop_words = STOP_WORDS + + +class Albanian(Language): + lang = "sq" + Defaults = AlbanianDefaults + + +__all__ = ["Albanian"] diff --git a/spacy/lang/sq/stop_words.py b/spacy/lang/sq/stop_words.py new file mode 100644 index 000000000..f91861ca1 --- /dev/null +++ b/spacy/lang/sq/stop_words.py @@ -0,0 +1,233 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/andrixh/index-albanian + +STOP_WORDS = set( + """ +a +afert +ai +ajo +andej +anes +aq +as +asaj +ashtu +ata 
+ate +atij +atje +ato +aty +atyre +b +be +behem +behet +bej +beje +bejne +ben +bene +bere +beri +bie +c +ca +cdo +cfare +cila +cilat +cilave +cilen +ciles +cilet +cili +cilin +cilit +deri +dhe +dic +dicka +dickaje +dike +dikujt +dikush +disa +do +dot +drejt +duke +dy +e +edhe +ende +eshte +etj +fare +gjate +gje +gjitha +gjithcka +gjithe +gjithnje +here +i +ia +ishin +ishte +iu +ja +jam +jane +jap +je +jemi +jo +ju +k +ka +kam +kane +kem +kemi +keq +kesaj +keshtu +kete +ketej +ketij +keto +ketu +ketyre +kishin +kishte +kjo +krejt +kryer +kryesisht +kryhet +ku +kudo +kundrejt +kur +kurre +kush +ky +la +le +lloj +m +ma +madhe +marr +marre +mban +mbi +me +menjehere +merr +merret +mes +mi +midis +mire +mjaft +mori +mos +mua +mund +na +ndaj +nder +ndermjet +ndersa +ndonje +ndryshe +ne +nen +neper +nepermjet +nese +nga +nje +njera +nuk +ose +pa +pak +papritur +para +pas +pasi +pasur +per +perbashket +perpara +po +por +prane +prapa +prej +pse +qe +qene +qenet +rralle +rreth +rri +s +sa +saj +sapo +se +secila +sepse +sh +shih +shume +si +sic +sikur +sipas +siper +sone +t +ta +tani +te +tej +tek +teper +tere +ti +tij +tilla +tille +tjera +tjeret +tjeter +tjetren +to +tone +ty +tyre +u +ua +une +vazhdimisht +vend +vet +veta +vete +vetem +veten +vetes +vjen +yne +zakonisht +""".split() +) diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py new file mode 100644 index 000000000..587d15dd7 --- /dev/null +++ b/spacy/tests/lang/test_initialize.py @@ -0,0 +1,21 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest +from spacy.util import get_lang_class + + +# fmt: off +# Only include languages with no external dependencies +# excluded: ja, ru, th, uk, vi, zh +LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", + "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is", + "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk", + "sl", "sq", "sv", "ta", "te", "tl", "tr", "tt", "ur"] 
+# fmt: on + + +@pytest.mark.parametrize("lang", LANGUAGES) +def test_lang_initialize(lang): + """Test that languages can be initialized.""" + lang_cls = get_lang_class(lang)()