diff --git a/spacy/errors.py b/spacy/errors.py index 567e29cd0..f689963dd 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -177,7 +177,7 @@ class Errors(object): "you forget to call the `set_extension` method?") E047 = ("Can't assign a value to unregistered extension attribute " "'{name}'. Did you forget to call the `set_extension` method?") - E048 = ("Can't import language {lang} from spacy.lang.") + E048 = ("Can't import language {lang} from spacy.lang: {err}") E049 = ("Can't find spaCy data directory: '{path}'. Check your " "installation and permissions, or use spacy.util.set_data_path " "to customise the location if necessary.") @@ -308,6 +308,16 @@ class Errors(object): "would always have to include its Doc and Vocab, which has " "practically no advantage over pickling the parent Doc directly. " "So instead of pickling the token, pickle the Doc it belongs to.") + E112 = ("Pickling a span is not supported, because spans are only views " + "of the parent Doc and can't exist on their own. A pickled span " + "would always have to include its Doc and Vocab, which has " + "practically no advantage over pickling the parent Doc directly. " + "So instead of pickling the span, pickle the Doc it belongs to or " + "use Span.as_doc to convert the span to a standalone Doc object.") + E113 = ("The newly split token can only have one root (head = 0).") + E114 = ("The newly split token needs to have a root (head = 0)") + E115 = ("All subtokens must have associated heads") + @add_codes class TempErrors(object): diff --git a/spacy/lang/af/__init__.py b/spacy/lang/af/__init__.py new file mode 100644 index 000000000..90ea324f0 --- /dev/null +++ b/spacy/lang/af/__init__.py @@ -0,0 +1,20 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from ...language import Language +from ...attrs import LANG + + +class AfrikaansDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "af" + stop_words = STOP_WORDS + + +class Afrikaans(Language): + lang = "af" + Defaults = AfrikaansDefaults + + +__all__ = ["Afrikaans"] diff --git a/spacy/lang/af/stop_words.py b/spacy/lang/af/stop_words.py new file mode 100644 index 000000000..2b3bcc019 --- /dev/null +++ b/spacy/lang/af/stop_words.py @@ -0,0 +1,61 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/stopwords-iso/stopwords-af + +STOP_WORDS = set( + """ +'n +aan +af +al +as +baie +by +daar +dag +dat +die +dit +een +ek +en +gaan +gesê +haar +het +hom +hulle +hy +in +is +jou +jy +kan +kom +ma +maar +met +my +na +nie +om +ons +op +saam +sal +se +sien +so +sy +te +toe +uit +van +vir +was +wat +ʼn +""".split() +) diff --git a/spacy/lang/bg/__init__.py b/spacy/lang/bg/__init__.py new file mode 100644 index 000000000..9b4c647e3 --- /dev/null +++ b/spacy/lang/bg/__init__.py @@ -0,0 +1,20 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from ...language import Language +from ...attrs import LANG + + +class BulgarianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "bg" + stop_words = STOP_WORDS + + +class Bulgarian(Language): + lang = "bg" + Defaults = BulgarianDefaults + + +__all__ = ["Bulgarian"] diff --git a/spacy/lang/bg/stop_words.py b/spacy/lang/bg/stop_words.py new file mode 100644 index 000000000..e7c65cbc2 --- /dev/null +++ b/spacy/lang/bg/stop_words.py @@ -0,0 +1,269 @@ +# 
coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/Alir3z4/stop-words + +STOP_WORDS = set( + """ +а +автентичен +аз +ако +ала +бе +без +беше +би +бивш +бивша +бившо +бил +била +били +било +благодаря +близо +бъдат +бъде +бяха +в +вас +ваш +ваша +вероятно +вече +взема +ви +вие +винаги +внимава +време +все +всеки +всички +всичко +всяка +във +въпреки +върху +г +ги +главен +главна +главно +глас +го +година +години +годишен +д +да +дали +два +двама +двамата +две +двете +ден +днес +дни +до +добра +добре +добро +добър +докато +докога +дори +досега +доста +друг +друга +други +е +евтин +едва +един +една +еднаква +еднакви +еднакъв +едно +екип +ето +живот +за +забавям +зад +заедно +заради +засега +заспал +затова +защо +защото +и +из +или +им +има +имат +иска +й +каза +как +каква +какво +както +какъв +като +кога +когато +което +които +кой +който +колко +която +къде +където +към +лесен +лесно +ли +лош +м +май +малко +ме +между +мек +мен +месец +ми +много +мнозина +мога +могат +може +мокър +моля +момента +му +н +на +над +назад +най +направи +напред +например +нас +не +него +нещо +нея +ни +ние +никой +нито +нищо +но +нов +нова +нови +новина +някои +някой +няколко +няма +обаче +около +освен +особено +от +отгоре +отново +още +пак +по +повече +повечето +под +поне +поради +после +почти +прави +пред +преди +през +при +пък +първата +първи +първо +пъти +равен +равна +с +са +сам +само +се +сега +си +син +скоро +след +следващ +сме +смях +според +сред +срещу +сте +съм +със +също +т +тази +така +такива +такъв +там +твой +те +тези +ти +т.н. +то +това +тогава +този +той +толкова +точно +три +трябва +тук +тъй +тя +тях +у +утре +харесва +хиляди +ч +часа +че +често +чрез +ще +щом +юмрук +я +як +""".split() +) diff --git a/spacy/lang/cs/__init__.py b/spacy/lang/cs/__init__.py new file mode 100644 index 000000000..5b1397ba2 --- /dev/null +++ b/spacy/lang/cs/__init__.py @@ -0,0 +1,20 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from ...language import Language +from ...attrs import LANG + + +class CzechDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "cs" + stop_words = STOP_WORDS + + +class Czech(Language): + lang = "cs" + Defaults = CzechDefaults + + +__all__ = ["Czech"] diff --git a/spacy/lang/cs/stop_words.py b/spacy/lang/cs/stop_words.py new file mode 100644 index 000000000..59d3c102e --- /dev/null +++ b/spacy/lang/cs/stop_words.py @@ -0,0 +1,266 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/Alir3z4/stop-words + +STOP_WORDS = set( + """ +ačkoli +ahoj +ale +anebo +ano +asi +aspoň +během +bez +beze +blízko +bohužel +brzo +bude +budeme +budeš +budete +budou +budu +byl +byla +byli +bylo +byly +bys +čau +chce +chceme +chceš +chcete +chci +chtějí +chtít +chut' +chuti +co +čtrnáct +čtyři +dál +dále +daleko +děkovat +děkujeme +děkuji +den +deset +devatenáct +devět +do +dobrý +docela +dva +dvacet +dvanáct +dvě +hodně +já +jak +jde +je +jeden +jedenáct +jedna +jedno +jednou +jedou +jeho +její +jejich +jemu +jen +jenom +ještě +jestli +jestliže +jí +jich +jím +jimi +jinak +jsem +jsi +jsme +jsou +jste +kam +kde +kdo +kdy +když +ke +kolik +kromě +která +které +kteří +který +kvůli +má +mají +málo +mám +máme +máš +máte +mé +mě +mezi +mí +mít +mně +mnou +moc +mohl +mohou +moje +moji +možná +můj +musí +může +my +na +nad +nade +nám +námi +naproti +nás +náš +naše +naši +ne +ně +nebo +nebyl +nebyla +nebyli +nebyly +něco 
+nedělá +nedělají +nedělám +neděláme +neděláš +neděláte +nějak +nejsi +někde +někdo +nemají +nemáme +nemáte +neměl +němu +není +nestačí +nevadí +než +nic +nich +ním +nimi +nula +od +ode +on +ona +oni +ono +ony +osm +osmnáct +pak +patnáct +pět +po +pořád +potom +pozdě +před +přes +přese +pro +proč +prosím +prostě +proti +protože +rovně +se +sedm +sedmnáct +šest +šestnáct +skoro +smějí +smí +snad +spolu +sta +sté +sto +ta +tady +tak +takhle +taky +tam +tamhle +tamhleto +tamto +tě +tebe +tebou +ted' +tedy +ten +ti +tisíc +tisíce +to +tobě +tohle +toto +třeba +tři +třináct +trošku +tvá +tvé +tvoje +tvůj +ty +určitě +už +vám +vámi +vás +váš +vaše +vaši +ve +večer +vedle +vlastně +všechno +všichni +vůbec +vy +vždy +za +zač +zatímco +ze +že +""".split() +) diff --git a/spacy/lang/et/__init__.py b/spacy/lang/et/__init__.py new file mode 100644 index 000000000..d84c081ef --- /dev/null +++ b/spacy/lang/et/__init__.py @@ -0,0 +1,20 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from ...language import Language +from ...attrs import LANG + + +class EstonianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "et" + stop_words = STOP_WORDS + + +class Estonian(Language): + lang = "et" + Defaults = EstonianDefaults + + +__all__ = ["Estonian"] diff --git a/spacy/lang/et/stop_words.py b/spacy/lang/et/stop_words.py new file mode 100644 index 000000000..15070db5f --- /dev/null +++ b/spacy/lang/et/stop_words.py @@ -0,0 +1,45 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/stopwords-iso/stopwords-et + +STOP_WORDS = set( + """ +aga +ei +et +ja +jah +kas +kui +kõik +ma +me +mida +midagi +mind +minu +mis +mu +mul +mulle +nad +nii +oled +olen +oli +oma +on +pole +sa +seda +see +selle +siin +siis +ta +te +ära +""".split() +) diff --git a/spacy/lang/is/__init__.py b/spacy/lang/is/__init__.py new file mode 100644 index 000000000..18e41432d --- /dev/null +++ b/spacy/lang/is/__init__.py @@ -0,0 +1,20 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from ...language import Language +from ...attrs import LANG + + +class IcelandicDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "is" + stop_words = STOP_WORDS + + +class Icelandic(Language): + lang = "is" + Defaults = IcelandicDefaults + + +__all__ = ["Icelandic"] diff --git a/spacy/lang/is/stop_words.py b/spacy/lang/is/stop_words.py new file mode 100644 index 000000000..e4ae0498b --- /dev/null +++ b/spacy/lang/is/stop_words.py @@ -0,0 +1,162 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/Xangis/extra-stopwords + +STOP_WORDS = set( + """ +afhverju +aftan +aftur +afþví +aldrei +allir +allt +alveg +annað +annars +bara +dag +eða +eftir +eiga +einhver +einhverjir +einhvers +eins +einu +eitthvað +ekkert +ekki +ennþá +eru +fara +fer +finna +fjöldi +fólk +framan +frá +frekar +fyrir +gegnum +geta +getur +gmg +gott +hann +hafa +hef +hefur +heyra +hér +hérna +hjá +hún +hvað +hvar +hver +hverjir +hverjum +hvernig +hvor +hvort +hægt +img +inn +kannski +koma +líka +lol +maður +mátt +mér +með +mega +meira +mig +mikið +minna +minni +missa +mjög +nei +niður +núna +oft +okkar +okkur +póst +póstur +rofl +saman +sem +sér +sig +sinni +síðan +sjá +smá +smátt +spurja +spyrja +staðar +stórt +svo +svona +sælir +sæll +taka +takk +til +tilvitnun 
+titlar +upp +var +vel +velkomin +velkominn +vera +verður +verið +vel +við +vil +vilja +vill +vita +væri +yfir +ykkar +það +þakka +þakkir +þannig +það +þar +þarf +þau +þeim +þeir +þeirra +þeirra +þegar +þess +þessa +þessi +þessu +þessum +þetta +þér +þið +þinn +þitt +þín +þráð +þráður +því +þær +ætti +""".split() +) diff --git a/spacy/lang/kn/__init__.py b/spacy/lang/kn/__init__.py index 5cc33737e..c86354248 100644 --- a/spacy/lang/kn/__init__.py +++ b/spacy/lang/kn/__init__.py @@ -2,14 +2,13 @@ from __future__ import unicode_literals from .stop_words import STOP_WORDS -from .lex_attrs import LEX_ATTRS - -from ..norm_exceptions import BASE_NORMS from ...language import Language from ...attrs import LANG class KannadaDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "kn" stop_words = STOP_WORDS diff --git a/spacy/lang/kn/stop_words.py b/spacy/lang/kn/stop_words.py index 2a8ca27bd..583a42cc1 100644 --- a/spacy/lang/kn/stop_words.py +++ b/spacy/lang/kn/stop_words.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals -STOP_WORD = set( +STOP_WORDS = set( """ ಈ ಮತ್ತು diff --git a/spacy/lang/lt/__init__.py b/spacy/lang/lt/__init__.py new file mode 100644 index 000000000..882eb0611 --- /dev/null +++ b/spacy/lang/lt/__init__.py @@ -0,0 +1,20 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from ...language import Language +from ...attrs import LANG + + +class LithuanianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "lt" + stop_words = STOP_WORDS + + +class Lithuanian(Language): + lang = "lt" + Defaults = LithuanianDefaults + + +__all__ = ["Lithuanian"] diff --git a/spacy/lang/lt/stop_words.py b/spacy/lang/lt/stop_words.py new file mode 100644 index 000000000..19554b594 --- /dev/null +++ b/spacy/lang/lt/stop_words.py @@ -0,0 +1,484 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/stopwords-iso/stopwords-lt + +STOP_WORDS = set( + """ +abi +abidvi +abiejose +abiejuose +abiejø +abiem +abigaliai +abipus +abu +abudu +ai +ana +anaiptol +anaisiais +anajai +anajam +anajame +anapus +anas +anasai +anasis +anei +aniedvi +anieji +aniesiems +anoji +anojo +anojoje +anokia +anoks +anosiomis +anosioms +anosios +anosiose +anot +ant +antai +anuodu +anuoju +anuosiuose +anuosius +anàja +anàjà +anàjá +anàsias +anøjø +apie +aplink +ar +arba +argi +arti +aukðèiau +að +be +bei +beje +bemaþ +bent +bet +betgi +beveik +dar +dargi +daugmaþ +deja +dëka +dël +dëlei +dëlto +ech +et +gal +galbût +galgi +gan +gana +gi +greta +idant +iki +ir +irgi +it +itin +ið +iðilgai +iðvis +jaisiais +jajai +jajam +jajame +jei +jeigu +ji +jiedu +jiedvi +jieji +jiesiems +jinai +jis +jisai +jog +joji +jojo +jojoje +jokia +joks +josiomis +josioms +josios +josiose +judu +judvi +juk +jumis +jums +jumyse +juodu +juoju +juosiuose +juosius +jus +jàja +jàjà +jàsias +jájá +jøjø +jûs +jûsiðkis +jûsiðkë +jûsø +kad +kada +kadangi +kai +kaip +kaipgi +kas +katra +katras +katriedvi +katruodu +kaþin +kaþkas +kaþkatra +kaþkatras +kaþkokia +kaþkoks +kaþkuri +kaþkuris +kiaurai +kiek +kiekvienas +kieno +kita +kitas +kitokia +kitoks +kodël +kokia +koks +kol +kolei +kone +kuomet +kur +kurgi +kuri +kuriedvi +kuris +kuriuodu +lai +lig +ligi +link +lyg +man +manaisiais +manajai +manajam +manajame +manas +manasai +manasis +mane +manieji +maniesiems +manim +manimi +maniðkis +maniðkë +mano +manoji +manojo +manojoje 
+manosiomis +manosioms +manosios +manosiose +manuoju +manuosiuose +manuosius +manyje +manàja +manàjà +manàjá +manàsias +manæs +manøjø +mat +maþdaug +maþne +mes +mudu +mudvi +mumis +mums +mumyse +mus +mûsiðkis +mûsiðkë +mûsø +na +nagi +ne +nebe +nebent +negi +negu +nei +nejau +nejaugi +nekaip +nelyginant +nes +net +netgi +netoli +neva +nors +nuo +në +o +ogi +oi +paeiliui +pagal +pakeliui +palaipsniui +palei +pas +pasak +paskos +paskui +paskum +pat +pati +patiems +paties +pats +patys +patá +paèiais +paèiam +paèiame +paèiu +paèiuose +paèius +paèiø +per +pernelyg +pirm +pirma +pirmiau +po +prie +prieð +prieðais +pro +pusiau +rasi +rodos +sau +savaisiais +savajai +savajam +savajame +savas +savasai +savasis +save +savieji +saviesiems +savimi +saviðkis +saviðkë +savo +savoji +savojo +savojoje +savosiomis +savosioms +savosios +savosiose +savuoju +savuosiuose +savuosius +savyje +savàja +savàjà +savàjá +savàsias +savæs +savøjø +skersai +skradþiai +staèiai +su +sulig +ta +tad +tai +taigi +taip +taipogi +taisiais +tajai +tajam +tajame +tamsta +tarp +tarsi +tartum +tarytum +tas +tasai +tau +tavaisiais +tavajai +tavajam +tavajame +tavas +tavasai +tavasis +tave +tavieji +taviesiems +tavimi +taviðkis +taviðkë +tavo +tavoji +tavojo +tavojoje +tavosiomis +tavosioms +tavosios +tavosiose +tavuoju +tavuosiuose +tavuosius +tavyje +tavàja +tavàjà +tavàjá +tavàsias +tavæs +tavøjø +taèiau +te +tegu +tegul +tiedvi +tieji +ties +tiesiems +tiesiog +tik +tikriausiai +tiktai +toji +tojo +tojoje +tokia +toks +tol +tolei +toliau +tosiomis +tosioms +tosios +tosiose +tu +tuodu +tuoju +tuosiuose +tuosius +turbût +tàja +tàjà +tàjá +tàsias +tøjø +tûlas +uþ +uþtat +uþvis +va +vai +viduj +vidury +vien +vienas +vienokia +vienoks +vietoj +virð +virðuj +virðum +vis +vis dëlto +visa +visas +visgi +visokia +visoks +vos +vël +vëlgi +ypaè +á +ákypai +ástriþai +ðalia +ðe +ði +ðiaisiais +ðiajai +ðiajam +ðiajame +ðiapus +ðiedvi +ðieji +ðiesiems +ðioji +ðiojo +ðiojoje +ðiokia +ðioks +ðiosiomis +ðiosioms +ðiosios +ðiosiose +ðis +ðisai +ðit +ðita +ðitas +ðitiedvi +ðitokia +ðitoks +ðituodu +ðiuodu +ðiuoju +ðiuosiuose +ðiuosius +ðiàja +ðiàjà +ðiàsias +ðiøjø +ðtai +ðájá +þemiau +""".split() +) diff --git a/spacy/lang/lv/__init__.py b/spacy/lang/lv/__init__.py new file mode 100644 index 000000000..bb8c0763b --- /dev/null +++ b/spacy/lang/lv/__init__.py @@ -0,0 +1,20 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from ...language import Language +from ...attrs import LANG + + +class LatvianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "lv" + stop_words = STOP_WORDS + + +class Latvian(Language): + lang = "lv" + Defaults = LatvianDefaults + + +__all__ = ["Latvian"] diff --git a/spacy/lang/lv/stop_words.py b/spacy/lang/lv/stop_words.py new file mode 100644 index 000000000..075ad6347 --- /dev/null +++ b/spacy/lang/lv/stop_words.py @@ -0,0 +1,171 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/stopwords-iso/stopwords-lv + +STOP_WORDS = set( + """ +aiz +ap +apakš +apakšpus +ar +arī +augšpus +bet +bez +bija +biji +biju +bijām +bijāt +būs +būsi +būsiet +būsim +būt +būšu +caur +diemžēl +diezin +droši +dēļ +esam +esat +esi +esmu +gan +gar +iekam +iekams +iekām +iekāms +iekš +iekšpus +ik +ir +it +itin +iz +ja +jau +jeb +jebšu +jel +jo +jā +ka +kamēr +kaut +kolīdz +kopš +kā +kļuva +kļuvi +kļuvu +kļuvām +kļuvāt +kļūs +kļūsi +kļūsiet +kļūsim +kļūst +kļūstam +kļūstat +kļūsti 
+kļūstu +kļūt +kļūšu +labad +lai +lejpus +līdz +līdzko +ne +nebūt +nedz +nekā +nevis +nezin +no +nu +nē +otrpus +pa +par +pat +pie +pirms +pret +priekš +pār +pēc +starp +tad +tak +tapi +taps +tapsi +tapsiet +tapsim +tapt +tapāt +tapšu +taču +te +tiec +tiek +tiekam +tiekat +tieku +tik +tika +tikai +tiki +tikko +tiklab +tiklīdz +tiks +tiksiet +tiksim +tikt +tiku +tikvien +tikām +tikāt +tikšu +tomēr +topat +turpretim +turpretī +tā +tādēļ +tālab +tāpēc +un +uz +vai +var +varat +varēja +varēji +varēju +varējām +varējāt +varēs +varēsi +varēsiet +varēsim +varēt +varēšu +vien +virs +virspus +vis +viņpus +zem +ārpus +šaipus +""".split() +) diff --git a/spacy/lang/sk/__init__.py b/spacy/lang/sk/__init__.py new file mode 100644 index 000000000..e7704196a --- /dev/null +++ b/spacy/lang/sk/__init__.py @@ -0,0 +1,20 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from ...language import Language +from ...attrs import LANG + + +class SlovakDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "sk" + stop_words = STOP_WORDS + + +class Slovak(Language): + lang = "sk" + Defaults = SlovakDefaults + + +__all__ = ["Slovak"] diff --git a/spacy/lang/sk/stop_words.py b/spacy/lang/sk/stop_words.py new file mode 100644 index 000000000..f6994d33f --- /dev/null +++ b/spacy/lang/sk/stop_words.py @@ -0,0 +1,231 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/stopwords-iso/stopwords-sk + +STOP_WORDS = set( + """ +a +aby +aj +ak +ako +aký +ale +alebo +and +ani +asi +avšak +až +ba +bez +bol +bola +boli +bolo +bude +budem +budeme +budete +budeš +budú +buï +buď +by +byť +cez +dnes +do +ešte +for +ho +hoci +i +iba +ich +im +iné +iný +ja +je +jeho +jej +jemu +ju +k +kam +každá +každé +každí +každý +kde +kedže +keï +keď +kto +ktorou +ktorá +ktoré +ktorí +ktorý +ku +lebo +len +ma +mať +medzi +menej +mi +mna +mne +mnou +moja +moje +mu +musieť +my +má +máte +mòa +môcť +môj +môže +na +nad +nami +naši +nech +neho +nej +nemu +než +nich +nie +niektorý +nielen +nim +nič +no +nová +nové +noví +nový +nám +nás +náš +ním +o +od +odo +of +on +ona +oni +ono +ony +po +pod +podľa +pokiaľ +potom +pre +pred +predo +preto +pretože +prečo +pri +prvá +prvé +prví +prvý +práve +pýta +s +sa +seba +sem +si +sme +so +som +späť +ste +svoj +svoje +svojich +svojím +svojími +sú +ta +tak +taký +takže +tam +te +teba +tebe +tebou +teda +tej +ten +tento +the +ti +tie +tieto +tiež +to +toho +tohoto +tom +tomto +tomu +tomuto +toto +tou +tu +tvoj +tvojími +ty +tá +táto +tú +túto +tým +týmto +tě +už +v +vami +vaše +veï +viac +vo +vy +vám +vás +váš +však +všetok +z +za +zo +a +áno +èi +èo +èí +òom +òou +òu +či +čo +ďalšia +ďalšie +ďalší +že +""".split() +) diff --git a/spacy/lang/sl/__init__.py b/spacy/lang/sl/__init__.py new file mode 100644 index 000000000..2d4977bdf --- /dev/null +++ b/spacy/lang/sl/__init__.py @@ -0,0 +1,20 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from ...language import Language +from ...attrs import LANG + + +class SlovenianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "sl" + stop_words = STOP_WORDS + + +class Slovenian(Language): + lang = "sl" + Defaults = SlovenianDefaults + + +__all__ = ["Slovenian"] diff --git a/spacy/lang/sl/stop_words.py b/spacy/lang/sl/stop_words.py new file mode 100644 index 000000000..187e95876 --- 
/dev/null +++ b/spacy/lang/sl/stop_words.py @@ -0,0 +1,458 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/stopwords-iso/stopwords-sl +# TODO: probably needs to be tidied up – the list seems to have month names in +# it, which shouldn't be considered stop words. + +STOP_WORDS = set( + """ +a +ali +april +avgust +b +bi +bil +bila +bile +bili +bilo +biti +blizu +bo +bodo +bojo +bolj +bom +bomo +boste +bova +boš +brez +c +cel +cela +celi +celo +d +da +daleč +dan +danes +datum +december +deset +deseta +deseti +deseto +devet +deveta +deveti +deveto +do +dober +dobra +dobri +dobro +dokler +dol +dolg +dolga +dolgi +dovolj +drug +druga +drugi +drugo +dva +dve +e +eden +en +ena +ene +eni +enkrat +eno +etc. +f +februar +g +g. +ga +ga. +gor +gospa +gospod +h +halo +i +idr. +ii +iii +in +iv +ix +iz +j +januar +jaz +je +ji +jih +jim +jo +julij +junij +jutri +k +kadarkoli +kaj +kajti +kako +kakor +kamor +kamorkoli +kar +karkoli +katerikoli +kdaj +kdo +kdorkoli +ker +ki +kje +kjer +kjerkoli +ko +koder +koderkoli +koga +komu +kot +kratek +kratka +kratke +kratki +l +lahka +lahke +lahki +lahko +le +lep +lepa +lepe +lepi +lepo +leto +m +maj +majhen +majhna +majhni +malce +malo +manj +marec +me +med +medtem +mene +mesec +mi +midva +midve +mnogo +moj +moja +moje +mora +morajo +moram +moramo +morate +moraš +morem +mu +n +na +nad +naj +najina +najino +najmanj +naju +največ +nam +narobe +nas +nato +nazaj +naš +naša +naše +ne +nedavno +nedelja +nek +neka +nekaj +nekatere +nekateri +nekatero +nekdo +neke +nekega +neki +nekje +neko +nekoga +nekoč +ni +nikamor +nikdar +nikjer +nikoli +nič +nje +njega +njegov +njegova +njegovo +njej +njemu +njen +njena +njeno +nji +njih +njihov +njihova +njihovo +njiju +njim +njo +njun +njuna +njuno +no +nocoj +november +npr. +o +ob +oba +obe +oboje +od +odprt +odprta +odprti +okoli +oktober +on +onadva +one +oni +onidve +osem +osma +osmi +osmo +oz. +p +pa +pet +peta +petek +peti +peto +po +pod +pogosto +poleg +poln +polna +polni +polno +ponavadi +ponedeljek +ponovno +potem +povsod +pozdravljen +pozdravljeni +prav +prava +prave +pravi +pravo +prazen +prazna +prazno +prbl. +precej +pred +prej +preko +pri +pribl. +približno +primer +pripravljen +pripravljena +pripravljeni +proti +prva +prvi +prvo +r +ravno +redko +res +reč +s +saj +sam +sama +same +sami +samo +se +sebe +sebi +sedaj +sedem +sedma +sedmi +sedmo +sem +september +seveda +si +sicer +skoraj +skozi +slab +smo +so +sobota +spet +sreda +srednja +srednji +sta +ste +stran +stvar +sva +t +ta +tak +taka +take +taki +tako +takoj +tam +te +tebe +tebi +tega +težak +težka +težki +težko +ti +tista +tiste +tisti +tisto +tj. 
+tja +to +toda +torek +tretja +tretje +tretji +tri +tu +tudi +tukaj +tvoj +tvoja +tvoje +u +v +vaju +vam +vas +vaš +vaša +vaše +ve +vedno +velik +velika +veliki +veliko +vendar +ves +več +vi +vidva +vii +viii +visok +visoka +visoke +visoki +vsa +vsaj +vsak +vsaka +vsakdo +vsake +vsaki +vsakomur +vse +vsega +vsi +vso +včasih +včeraj +x +z +za +zadaj +zadnji +zakaj +zaprta +zaprti +zaprto +zdaj +zelo +zunaj +č +če +često +četrta +četrtek +četrti +četrto +čez +čigav +š +šest +šesta +šesti +šesto +štiri +ž +že +""".split() +) diff --git a/spacy/lang/sq/__init__.py b/spacy/lang/sq/__init__.py new file mode 100644 index 000000000..6f33b37c2 --- /dev/null +++ b/spacy/lang/sq/__init__.py @@ -0,0 +1,20 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from ...language import Language +from ...attrs import LANG + + +class AlbanianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "sq" + stop_words = STOP_WORDS + + +class Albanian(Language): + lang = "sq" + Defaults = AlbanianDefaults + + +__all__ = ["Albanian"] diff --git a/spacy/lang/sq/stop_words.py b/spacy/lang/sq/stop_words.py new file mode 100644 index 000000000..f91861ca1 --- /dev/null +++ b/spacy/lang/sq/stop_words.py @@ -0,0 +1,233 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/andrixh/index-albanian + +STOP_WORDS = set( + """ +a +afert +ai +ajo +andej +anes +aq +as +asaj +ashtu +ata +ate +atij +atje +ato +aty +atyre +b +be +behem +behet +bej +beje +bejne +ben +bene +bere +beri +bie +c +ca +cdo +cfare +cila +cilat +cilave +cilen +ciles +cilet +cili +cilin +cilit +deri +dhe +dic +dicka +dickaje +dike +dikujt +dikush +disa +do +dot +drejt +duke +dy +e +edhe +ende +eshte +etj +fare +gjate +gje +gjitha +gjithcka +gjithe +gjithnje +here +i +ia +ishin +ishte +iu +ja +jam +jane +jap +je +jemi +jo +ju +k +ka +kam +kane +kem +kemi +keq +kesaj +keshtu +kete +ketej +ketij +keto +ketu +ketyre +kishin +kishte +kjo +krejt +kryer +kryesisht +kryhet +ku +kudo +kundrejt +kur +kurre +kush +ky +la +le +lloj +m +ma +madhe +marr +marre +mban +mbi +me +menjehere +merr +merret +mes +mi +midis +mire +mjaft +mori +mos +mua +mund +na +ndaj +nder +ndermjet +ndersa +ndonje +ndryshe +ne +nen +neper +nepermjet +nese +nga +nje +njera +nuk +ose +pa +pak +papritur +para +pas +pasi +pasur +per +perbashket +perpara +po +por +prane +prapa +prej +pse +qe +qene +qenet +rralle +rreth +rri +s +sa +saj +sapo +se +secila +sepse +sh +shih +shume +si +sic +sikur +sipas +siper +sone +t +ta +tani +te +tej +tek +teper +tere +ti +tij +tilla +tille +tjera +tjeret +tjeter +tjetren +to +tone +ty +tyre +u +ua +une +vazhdimisht +vend +vet +veta +vete +vetem +veten +vetes +vjen +yne +zakonisht +""".split() +) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 3caed87e2..2d50e3048 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -9,11 +9,59 @@ def pytest_addoption(parser): parser.addoption("--slow", action="store_true", help="include slow tests") +def pytest_runtest_setup(item): + def getopt(opt): + # When using 'pytest --pyargs spacy' to test an installed copy of + # spacy, pytest skips running our pytest_addoption() hook. Later, when + # we call getoption(), pytest raises an error, because it doesn't + # recognize the option we're asking about. To avoid this, we need to + # pass a default value. We default to False, i.e., we act like all the + # options weren't given. 
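+        # For example, getopt("slow") returns the value of --slow when the
+        # option is registered and False when it isn't.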
+ return item.config.getoption("--%s" % opt, False) + + for opt in ["slow"]: + if opt in item.keywords and not getopt(opt): + pytest.skip("need --%s option to run" % opt) + + +# Fixtures for language tokenizers (languages sorted alphabetically) + + @pytest.fixture(scope="module") def tokenizer(): return get_lang_class("xx").Defaults.create_tokenizer() +@pytest.fixture(scope="session") +def ar_tokenizer(): + return get_lang_class("ar").Defaults.create_tokenizer() + + +@pytest.fixture(scope="session") +def bn_tokenizer(): + return get_lang_class("bn").Defaults.create_tokenizer() + + +@pytest.fixture(scope="session") +def ca_tokenizer(): + return get_lang_class("ca").Defaults.create_tokenizer() + + +@pytest.fixture(scope="session") +def da_tokenizer(): + return get_lang_class("da").Defaults.create_tokenizer() + + +@pytest.fixture(scope="session") +def de_tokenizer(): + return get_lang_class("de").Defaults.create_tokenizer() + + +@pytest.fixture(scope="session") +def el_tokenizer(): + return get_lang_class("el").Defaults.create_tokenizer() + + @pytest.fixture(scope="session") def en_tokenizer(): return get_lang_class("en").Defaults.create_tokenizer() @@ -36,8 +84,8 @@ def es_tokenizer(): @pytest.fixture(scope="session") -def de_tokenizer(): - return get_lang_class("de").Defaults.create_tokenizer() +def fi_tokenizer(): + return get_lang_class("fi").Defaults.create_tokenizer() @pytest.fixture(scope="session") @@ -45,21 +93,21 @@ def fr_tokenizer(): return get_lang_class("fr").Defaults.create_tokenizer() +@pytest.fixture(scope="session") +def ga_tokenizer(): + return get_lang_class("ga").Defaults.create_tokenizer() + + +@pytest.fixture(scope="session") +def he_tokenizer(): + return get_lang_class("he").Defaults.create_tokenizer() + + @pytest.fixture def hu_tokenizer(): return get_lang_class("hu").Defaults.create_tokenizer() -@pytest.fixture(scope="session") -def fi_tokenizer(): - return get_lang_class("fi").Defaults.create_tokenizer() - - -@pytest.fixture(scope="session") -def ro_tokenizer(): - return get_lang_class("ro").Defaults.create_tokenizer() - - @pytest.fixture(scope="session") def id_tokenizer(): return get_lang_class("id").Defaults.create_tokenizer() @@ -71,23 +119,9 @@ def it_tokenizer(): @pytest.fixture(scope="session") -def sv_tokenizer(): - return get_lang_class("sv").Defaults.create_tokenizer() - - -@pytest.fixture(scope="session") -def bn_tokenizer(): - return get_lang_class("bn").Defaults.create_tokenizer() - - -@pytest.fixture(scope="session") -def ga_tokenizer(): - return get_lang_class("ga").Defaults.create_tokenizer() - - -@pytest.fixture(scope="session") -def he_tokenizer(): - return get_lang_class("he").Defaults.create_tokenizer() +def ja_tokenizer(): + pytest.importorskip("MeCab") + return get_lang_class("ja").Defaults.create_tokenizer() @pytest.fixture(scope="session") @@ -96,14 +130,34 @@ def nb_tokenizer(): @pytest.fixture(scope="session") -def da_tokenizer(): - return get_lang_class("da").Defaults.create_tokenizer() +def nl_tokenizer(): + return get_lang_class("nl").Defaults.create_tokenizer() @pytest.fixture(scope="session") -def ja_tokenizer(): - pytest.importorskip("MeCab") - return get_lang_class("ja").Defaults.create_tokenizer() +def pl_tokenizer(): + return get_lang_class("pl").Defaults.create_tokenizer() + + +@pytest.fixture(scope="session") +def pt_tokenizer(): + return get_lang_class("pt").Defaults.create_tokenizer() + + +@pytest.fixture(scope="session") +def ro_tokenizer(): + return get_lang_class("ro").Defaults.create_tokenizer() + + 
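+# Tokenizers with external dependencies (ja above, ru and uk below) call
+# pytest.importorskip first, so the suite still runs without those packages.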
+@pytest.fixture(scope="session") +def ru_tokenizer(): + pytest.importorskip("pymorphy2") + return get_lang_class("ru").Defaults.create_tokenizer() + + +@pytest.fixture(scope="session") +def sv_tokenizer(): + return get_lang_class("sv").Defaults.create_tokenizer() @pytest.fixture(scope="session") @@ -117,58 +171,17 @@ def tr_tokenizer(): return get_lang_class("tr").Defaults.create_tokenizer() +@pytest.fixture(scope="session") +def tt_tokenizer(): + return get_lang_class("tt").Defaults.create_tokenizer() + + @pytest.fixture(scope="session") def uk_tokenizer(): pytest.importorskip("pymorphy2") return get_lang_class("uk").Defaults.create_tokenizer() -@pytest.fixture(scope="session") -def ca_tokenizer(): - return get_lang_class("ca").Defaults.create_tokenizer() - - -@pytest.fixture(scope="session") -def pl_tokenizer(): - return get_lang_class("pl").Defaults.create_tokenizer() - - -@pytest.fixture(scope="session") -def tt_tokenizer(): - return get_lang_class("tt").Defaults.create_tokenizer() - - -@pytest.fixture(scope="session") -def el_tokenizer(): - return get_lang_class("el").Defaults.create_tokenizer() - - -@pytest.fixture(scope="session") -def ar_tokenizer(): - return get_lang_class("ar").Defaults.create_tokenizer() - - @pytest.fixture(scope="session") def ur_tokenizer(): return get_lang_class("ur").Defaults.create_tokenizer() - - -@pytest.fixture(scope="session") -def ru_tokenizer(): - pytest.importorskip("pymorphy2") - return get_lang_class("ru").Defaults.create_tokenizer() - - -def pytest_runtest_setup(item): - def getopt(opt): - # When using 'pytest --pyargs spacy' to test an installed copy of - # spacy, pytest skips running our pytest_addoption() hook. Later, when - # we call getoption(), pytest raises an error, because it doesn't - # recognize the option we're asking about. To avoid this, we need to - # pass a default value. We default to False, i.e., we act like all the - # options weren't given. - return item.config.getoption("--%s" % opt, False) - - for opt in ["slow"]: - if opt in item.keywords and not getopt(opt): - pytest.skip("need --%s option to run" % opt) diff --git a/spacy/tests/doc/test_doc_spilt.py b/spacy/tests/doc/test_doc_spilt.py new file mode 100644 index 000000000..827fd565e --- /dev/null +++ b/spacy/tests/doc/test_doc_spilt.py @@ -0,0 +1,114 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from ..util import get_doc +from ...vocab import Vocab +from ...tokens import Doc +from ...tokens import Span + +import pytest + + +def test_doc_split(en_tokenizer): + text = "LosAngeles start." + heads = [1, 1, 0] + tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads) + + assert len(doc) == 3 + assert len(str(doc)) == 19 + assert doc[0].head.text == 'start' + assert doc[1].head.text == '.' + + with doc.retokenize() as retokenizer: + retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], attrs={'tag':'NNP', 'lemma':'Los Angeles', 'ent_type':'GPE'}) + + assert len(doc) == 4 + assert doc[0].text == 'Los' + assert doc[0].head.text == 'Angeles' + assert doc[0].idx == 0 + assert doc[1].idx == 3 + + assert doc[1].text == 'Angeles' + assert doc[1].head.text == 'start' + + assert doc[2].text == 'start' + assert doc[2].head.text == '.' + + assert doc[3].text == '.' + assert doc[3].head.text == '.' + + assert len(str(doc)) == 19 + +def test_split_dependencies(en_tokenizer): + text = "LosAngeles start." 
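+    # Dependency labels are passed to split() as StringStore hashes, so the
+    # label strings are registered with the vocab before being used below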
+ tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens]) + dep1 = doc.vocab.strings.add('amod') + dep2 = doc.vocab.strings.add('subject') + with doc.retokenize() as retokenizer: + retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], [dep1, dep2]) + + assert doc[0].dep == dep1 + assert doc[1].dep == dep2 + + + +def test_split_heads_error(en_tokenizer): + text = "LosAngeles start." + tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens]) + #Not enough heads + with pytest.raises(ValueError): + with doc.retokenize() as retokenizer: + retokenizer.split(doc[0], ["Los", "Angeles"], [0]) + + #Too many heads + with pytest.raises(ValueError): + with doc.retokenize() as retokenizer: + retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1, 0]) + + #No token head + with pytest.raises(ValueError): + with doc.retokenize() as retokenizer: + retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1]) + + #Several token heads + with pytest.raises(ValueError): + with doc.retokenize() as retokenizer: + retokenizer.split(doc[0], ["Los", "Angeles"], [0, 0]) + + +def test_spans_entity_merge_iob(): + # Test entity IOB stays consistent after merging + words = ["abc", "d", "e"] + doc = Doc(Vocab(), words=words) + doc.ents = [(doc.vocab.strings.add('ent-abcd'), 0, 2)] + assert doc[0].ent_iob_ == "B" + assert doc[1].ent_iob_ == "I" + + with doc.retokenize() as retokenizer: + retokenizer.split(doc[0], ["a", "b", "c"], [1, 1, 0]) + assert doc[0].ent_iob_ == "B" + assert doc[1].ent_iob_ == "I" + assert doc[2].ent_iob_ == "I" + assert doc[3].ent_iob_ == "I" + +def test_spans_sentence_update_after_merge(en_tokenizer): + text = "StewartLee is a stand up comedian. He lives in England and loves JoePasquale." + heads = [1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2] + deps = ['nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr', + 'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj', + 'compound', 'punct'] + + tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps) + sent1, sent2 = list(doc.sents) + init_len = len(sent1) + init_len2 = len(sent2) + with doc.retokenize() as retokenizer: + retokenizer.split(doc[0], ["Stewart", "Lee"], [1, 0]) + retokenizer.split(doc[14], ["Joe", "Pasquale"], [1, 0]) + sent1, sent2 = list(doc.sents) + assert len(sent1) == init_len + 1 + assert len(sent2) == init_len2 + 1 diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py new file mode 100644 index 000000000..587d15dd7 --- /dev/null +++ b/spacy/tests/lang/test_initialize.py @@ -0,0 +1,21 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest +from spacy.util import get_lang_class + + +# fmt: off +# Only include languages with no external dependencies +# excluded: ja, ru, th, uk, vi, zh +LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", + "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is", + "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk", + "sl", "sq", "sv", "ta", "te", "tl", "tr", "tt", "ur"] +# fmt: on + + +@pytest.mark.parametrize("lang", LANGUAGES) +def test_lang_initialize(lang): + """Test that languages can be initialized.""" + lang_cls = get_lang_class(lang)() diff --git a/spacy/tests/regression/test_issue2833.py b/spacy/tests/regression/test_issue2833.py index 81aa40eb3..de71a6524 100644 --- a/spacy/tests/regression/test_issue2833.py +++ b/spacy/tests/regression/test_issue2833.py @@ -7,7 +7,9 @@ from spacy.compat import 
pickle def test_issue2833(en_vocab): - """Test that a custom error is raised if a token is pickled.""" + """Test that a custom error is raised if a token or span is pickled.""" doc = Doc(en_vocab, words=["Hello", "world"]) with pytest.raises(NotImplementedError): pickle.dumps(doc[0]) + with pytest.raises(NotImplementedError): + pickle.dumps(doc[0:2]) diff --git a/spacy/tests/regression/test_issue3209.py b/spacy/tests/regression/test_issue3209.py index 2b406b5e0..efa006791 100644 --- a/spacy/tests/regression/test_issue3209.py +++ b/spacy/tests/regression/test_issue3209.py @@ -1,24 +1,25 @@ -'''Test that labels are mapped to classes consistently when loading NER model.''' +# coding: utf8 from __future__ import unicode_literals + from spacy.lang.en import English import pytest + @pytest.mark.xfail def test_issue3209(): - '''Test issue that occurred in spaCy nightly where NER labels were being + """Test issue that occurred in spaCy nightly where NER labels were being mapped to classes incorrectly after loading the model, when the labels were added using ner.add_label(). - ''' + """ nlp = English() - ner = nlp.create_pipe('ner') + ner = nlp.create_pipe("ner") nlp.add_pipe(ner) - - ner.add_label('ANIMAL') + + ner.add_label("ANIMAL") nlp.begin_training() - move_names = ['O', 'B-ANIMAL', 'I-ANIMAL', 'L-ANIMAL', 'U-ANIMAL'] + move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"] assert ner.move_names == move_names nlp2 = English() - nlp2.add_pipe(nlp2.create_pipe('ner')) + nlp2.add_pipe(nlp2.create_pipe("ner")) nlp2.from_bytes(nlp.to_bytes()) - assert nlp2.get_pipe('ner').move_names == move_names - + assert nlp2.get_pipe("ner").move_names == move_names diff --git a/spacy/tests/regression/test_issue3248.py b/spacy/tests/regression/test_issue3248.py index 8df45bdc0..c4b592f3c 100644 --- a/spacy/tests/regression/test_issue3248.py +++ b/spacy/tests/regression/test_issue3248.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import pytest from spacy.matcher import PhraseMatcher from spacy.lang.en import English from spacy.compat import pickle diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index 1d1f0e1dc..8b737c8a3 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -45,12 +45,12 @@ cdef class Retokenizer: attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) self.merges.append((span, attrs)) - def split(self, Token token, orths, attrs=SimpleFrozenDict()): + def split(self, Token token, orths, heads, deps=[], attrs=SimpleFrozenDict()): """Mark a Token for splitting, into the specified orths. The attrs will be applied to each subtoken. """ attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) - self.splits.append((token.start_char, orths, attrs)) + self.splits.append((token.i, orths, heads, deps, attrs)) def __enter__(self): self.merges = [] @@ -67,8 +67,12 @@ cdef class Retokenizer: end = span.end _merge(self.doc, start, end, attrs) - for start_char, orths, attrs in self.splits: - raise NotImplementedError + offset = 0 + # Iterate in order, to keep the offset simple. 
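+        # Splitting one token into N pieces shifts every later token index
+        # by N - 1, so applying the splits in ascending order lets a single
+        # running offset keep the queued token indices valid.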
+        for token_index, orths, heads, deps, attrs in sorted(self.splits):
+            _split(self.doc, token_index + offset, orths, heads, deps, attrs)
+            # Adjust the queued indices for the subtokens created so far
+            offset += len(orths) - 1

 def _merge(Doc doc, int start, int end, attributes):
     """Retokenize the document, such that the span at
@@ -299,3 +303,110 @@ def _resize_tensor(tensor, ranges):
             delete.append(i)
     xp = get_array_module(tensor)
     return xp.delete(tensor, delete, axis=0)
+
+
+def _split(Doc doc, int token_index, orths, heads, deps, attrs):
+    """Retokenize the document, such that the token at `doc[token_index]`
+    is split into subtokens with the texts in `orths`.
+
+    token_index (int): Index of the token to split.
+    orths: Verbatim texts of the subtokens to create.
+    heads: Head offsets within the new subtokens. Exactly one entry must be
+        0, marking the subtoken that inherits the original token's head.
+    deps: Optional dependency labels (as string IDs) for the subtokens.
+    attrs: Attributes to assign to each of the newly created tokens. By
+        default, attributes are inherited from the original token.
+    """
+    cdef int nb_subtokens = len(orths)
+    cdef const LexemeC* lex
+    cdef TokenC* token
+    cdef TokenC orig_token = doc.c[token_index]
+
+    if len(heads) != nb_subtokens:
+        raise ValueError(Errors.E115)
+    token_head_index = -1
+    for index, head in enumerate(heads):
+        if head == 0:
+            if token_head_index != -1:
+                raise ValueError(Errors.E113)
+            token_head_index = index
+    if token_head_index == -1:
+        raise ValueError(Errors.E114)
+
+    # First, make the head references absolute and adjust all existing
+    # dependencies before creating the new tokens
+    for i in range(doc.length):
+        doc.c[i].head += i
+
+    # Heads that pointed at the split token now point at its root subtoken,
+    # and heads past the split shift right by the number of added tokens
+    offset = nb_subtokens - 1
+    for i in range(doc.length):
+        head_idx = doc.c[i].head
+        if head_idx == token_index:
+            doc.c[i].head = token_index + token_head_index
+        elif head_idx > token_index:
+            doc.c[i].head += offset
+
+    new_token_head = doc.c[token_index].head
+
+    # Double doc.c max_length if necessary (until big enough for all new tokens)
+    while doc.length + nb_subtokens - 1 >= doc.max_length:
+        doc._realloc(doc.length * 2)
+
+    # Move tokens after the split to create space for the new tokens, then
+    # grow the document
+    for token_to_move in range(doc.length - 1, token_index, -1):
+        doc.c[token_to_move + nb_subtokens - 1] = doc.c[token_to_move]
+    doc.length += nb_subtokens - 1
+
+    # Host the tokens in the newly created space
+    cdef int idx_offset = 0
+    for i, orth in enumerate(orths):
+        token = &doc.c[token_index + i]
+        lex = doc.vocab.get(doc.mem, orth)
+        token.lex = lex
+        # Update the character offset of the subtokens
+        if i != 0:
+            token.idx = orig_token.idx + idx_offset
+        idx_offset += len(orth)
+
+        # Set token.spacy to False for all non-last split tokens, and
+        # to orig_token.spacy for the last token
+        if i < nb_subtokens - 1:
+            token.spacy = False
+        else:
+            token.spacy = orig_token.spacy
+
+        # Apply attrs to each subtoken
+        for attr_name, attr_value in attrs.items():
+            if attr_name == TAG:
+                doc.vocab.morphology.assign_tag(token, attr_value)
+            else:
+                Token.set_struct_attr(token, attr_name, attr_value)
+
+        # Make the IOB annotation consistent: the first subtoken keeps B,
+        # the rest continue the entity with I
+        if orig_token.ent_iob == 3:
+            if i == 0:
+                token.ent_iob = 3
+            else:
+                token.ent_iob = 1
+        else:
+            # In all other cases subtokens inherit iob from orig_token
+            token.ent_iob = orig_token.ent_iob
+
+        # Use the head of the new token everywhere. This will be partially
+        # overwritten later on.
+        token.head = new_token_head
+
+    # Transform the head references back into relative offsets
+    for i in range(doc.length):
+        doc.c[i].head -= i
+
+    # Set the correct relative heads on the non-root subtokens (the subtoken
+    # with head == 0 already points at the original token's head)
+    for i, head in enumerate(heads):
+        if head != 0:
+            doc.c[token_index + i].head = head
+
+    # Apply the dependency labels, if provided
+    for i, dep in enumerate(deps):
+        doc[token_index + i].dep = dep
+
+    # Recalculate the children from the new heads
+    set_children_from_heads(doc.c, doc.length)
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 9c7d8d153..593e6ddec 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -141,6 +141,9 @@ cdef class Span:
         for i in range(self.start, self.end):
             yield self.doc[i]

+    def __reduce__(self):
+        raise NotImplementedError(Errors.E112)
+
     @property
     def _(self):
         """User space for adding custom attribute extensions."""
diff --git a/spacy/util.py b/spacy/util.py
index 26f3eac2b..621ea5935 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -53,8 +53,8 @@ def get_lang_class(lang):
     if lang not in LANGUAGES:
         try:
             module = importlib.import_module(".lang.%s" % lang, "spacy")
-        except ImportError:
-            raise ImportError(Errors.E048.format(lang=lang))
+        except ImportError as err:
+            raise ImportError(Errors.E048.format(lang=lang, err=err))
         LANGUAGES[lang] = getattr(module, module.__all__[0])
     return LANGUAGES[lang]
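A minimal usage sketch of the Retokenizer.split API added in this diff, mirroring the tests above (the tag/ent_type values are illustrative):

    from spacy.lang.en import English

    nlp = English()
    doc = nlp("LosAngeles start.")
    with doc.retokenize() as retokenizer:
        # heads are offsets within the new subtokens: "Los" attaches to the
        # subtoken one position to its right, and the entry with head 0
        # ("Angeles") inherits the original token's head
        retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0],
                          attrs={"tag": "NNP", "ent_type": "GPE"})
    assert [t.text for t in doc] == ["Los", "Angeles", "start", "."]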