Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2019-02-14 15:51:12 +01:00
commit 6ccd67c682
32 changed files with 2956 additions and 103 deletions

View File

@ -177,7 +177,7 @@ class Errors(object):
"you forget to call the `set_extension` method?")
E047 = ("Can't assign a value to unregistered extension attribute "
"'{name}'. Did you forget to call the `set_extension` method?")
E048 = ("Can't import language {lang} from spacy.lang.")
E048 = ("Can't import language {lang} from spacy.lang: {err}")
E049 = ("Can't find spaCy data directory: '{path}'. Check your "
"installation and permissions, or use spacy.util.set_data_path "
"to customise the location if necessary.")
@ -308,6 +308,16 @@ class Errors(object):
"would always have to include its Doc and Vocab, which has "
"practically no advantage over pickling the parent Doc directly. "
"So instead of pickling the token, pickle the Doc it belongs to.")
E112 = ("Pickling a span is not supported, because spans are only views "
"of the parent Doc and can't exist on their own. A pickled span "
"would always have to include its Doc and Vocab, which has "
"practically no advantage over pickling the parent Doc directly. "
"So instead of pickling the span, pickle the Doc it belongs to or "
"use Span.as_doc to convert the span to a standalone Doc object.")
E113 = ("The newly split token can only have one root (head = 0).")
E114 = ("The newly split token needs to have a root (head = 0)")
E115 = ("All subtokens must have associated heads")
@add_codes
class TempErrors(object):
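The new E112 error mirrors the existing token-pickling message just above it: spans are views onto a Doc, so the parent Doc should be pickled instead, or the span converted with Span.as_doc() first. A minimal sketch of those two workarounds (not part of the diff; assumes a bare English pipeline):

```python
from spacy.lang.en import English
from spacy.compat import pickle

nlp = English()
doc = nlp("Hello world")
span = doc[0:2]

# pickle.dumps(span) would now raise NotImplementedError carrying E112
# (see the updated regression test for issue #2833 further down).

# Workaround 1: pickle the parent Doc and re-slice after loading.
doc2 = pickle.loads(pickle.dumps(doc))
span2 = doc2[0:2]

# Workaround 2: convert the span to a standalone Doc and pickle that.
data = pickle.dumps(span.as_doc())
```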

20
spacy/lang/af/__init__.py Normal file
View File

@ -0,0 +1,20 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
class AfrikaansDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "af"
stop_words = STOP_WORDS
class Afrikaans(Language):
lang = "af"
Defaults = AfrikaansDefaults
__all__ = ["Afrikaans"]

View File

@ -0,0 +1,61 @@
# coding: utf8
from __future__ import unicode_literals
# Source: https://github.com/stopwords-iso/stopwords-af
STOP_WORDS = set(
"""
'n
aan
af
al
as
baie
by
daar
dag
dat
die
dit
een
ek
en
gaan
gesê
haar
het
hom
hulle
hy
in
is
jou
jy
kan
kom
ma
maar
met
my
na
nie
om
ons
op
saam
sal
se
sien
so
sy
te
toe
uit
van
vir
was
wat
ʼn
""".split()
)
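Every new language stub in this commit follows the same minimal pattern shown above for Afrikaans: a Defaults subclass that sets the LANG lex attribute and the stop words, and a Language subclass that registers them under the two-letter code. A quick sanity-check sketch of how such a stub is used (mirroring the new test_lang_initialize test further down):

```python
from spacy.lang.af import Afrikaans
from spacy.lang.af.stop_words import STOP_WORDS

nlp = Afrikaans()           # bare pipeline: tokenizer only, no models
doc = nlp("baie dankie")    # tokenization works out of the box
assert "baie" in STOP_WORDS
```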

20
spacy/lang/bg/__init__.py Normal file
View File

@ -0,0 +1,20 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
class BulgarianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "bg"
stop_words = STOP_WORDS
class Bulgarian(Language):
lang = "bg"
Defaults = BulgarianDefaults
__all__ = ["Bulgarian"]

269
spacy/lang/bg/stop_words.py Normal file
View File

@ -0,0 +1,269 @@
# coding: utf8
from __future__ import unicode_literals
# Source: https://github.com/Alir3z4/stop-words
STOP_WORDS = set(
"""
а
автентичен
аз
ако
ала
бе
без
беше
би
бивш
бивша
бившо
бил
била
били
било
благодаря
близо
бъдат
бъде
бяха
в
вас
ваш
ваша
вероятно
вече
взема
ви
вие
винаги
внимава
време
все
всеки
всички
всичко
всяка
във
въпреки
върху
г
ги
главен
главна
главно
глас
го
година
години
годишен
д
да
дали
два
двама
двамата
две
двете
ден
днес
дни
до
добра
добре
добро
добър
докато
докога
дори
досега
доста
друг
друга
други
е
евтин
едва
един
една
еднаква
еднакви
еднакъв
едно
екип
ето
живот
за
забавям
зад
заедно
заради
засега
заспал
затова
защо
защото
и
из
или
им
има
имат
иска
й
каза
как
каква
какво
както
какъв
като
кога
когато
което
които
кой
който
колко
която
къде
където
към
лесен
лесно
ли
лош
м
май
малко
ме
между
мек
мен
месец
ми
много
мнозина
мога
могат
може
мокър
моля
момента
му
н
на
над
назад
най
направи
напред
например
нас
не
него
нещо
нея
ни
ние
никой
нито
нищо
но
нов
нова
нови
новина
някои
някой
няколко
няма
обаче
около
освен
особено
от
отгоре
отново
още
пак
по
повече
повечето
под
поне
поради
после
почти
прави
пред
преди
през
при
пък
първата
първи
първо
пъти
равен
равна
с
са
сам
само
се
сега
си
син
скоро
след
следващ
сме
смях
според
сред
срещу
сте
съм
със
също
т
тази
така
такива
такъв
там
твой
те
тези
ти
т.н.
то
това
тогава
този
той
толкова
точно
три
трябва
тук
тъй
тя
тях
у
утре
харесва
хиляди
ч
часа
че
често
чрез
ще
щом
юмрук
я
як
""".split()
)

20
spacy/lang/cs/__init__.py Normal file
View File

@ -0,0 +1,20 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
class CzechDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "cs"
stop_words = STOP_WORDS
class Czech(Language):
lang = "cs"
Defaults = CzechDefaults
__all__ = ["Czech"]

266
spacy/lang/cs/stop_words.py Normal file
View File

@ -0,0 +1,266 @@
# coding: utf8
from __future__ import unicode_literals
# Source: https://github.com/Alir3z4/stop-words
STOP_WORDS = set(
"""
ačkoli
ahoj
ale
anebo
ano
asi
aspoň
během
bez
beze
blízko
bohužel
brzo
bude
budeme
budeš
budete
budou
budu
byl
byla
byli
bylo
byly
bys
čau
chce
chceme
chceš
chcete
chci
chtějí
chtít
chut'
chuti
co
čtrnáct
čtyři
dál
dále
daleko
děkovat
děkujeme
děkuji
den
deset
devatenáct
devět
do
dobrý
docela
dva
dvacet
dvanáct
dvě
hodně
jak
jde
je
jeden
jedenáct
jedna
jedno
jednou
jedou
jeho
její
jejich
jemu
jen
jenom
ještě
jestli
jestliže
jich
jím
jimi
jinak
jsem
jsi
jsme
jsou
jste
kam
kde
kdo
kdy
když
ke
kolik
kromě
která
které
kteří
který
kvůli
mají
málo
mám
máme
máš
máte
mezi
mít
mně
mnou
moc
mohl
mohou
moje
moji
možná
můj
musí
může
my
na
nad
nade
nám
námi
naproti
nás
náš
naše
naši
ne
nebo
nebyl
nebyla
nebyli
nebyly
něco
nedělá
nedělají
nedělám
neděláme
neděláš
neděláte
nějak
nejsi
někde
někdo
nemají
nemáme
nemáte
neměl
němu
není
nestačí
nevadí
než
nic
nich
ním
nimi
nula
od
ode
on
ona
oni
ono
ony
osm
osmnáct
pak
patnáct
pět
po
pořád
potom
pozdě
před
přes
přese
pro
proč
prosím
prostě
proti
protože
rovně
se
sedm
sedmnáct
šest
šestnáct
skoro
smějí
smí
snad
spolu
sta
sté
sto
ta
tady
tak
takhle
taky
tam
tamhle
tamhleto
tamto
tebe
tebou
ted'
tedy
ten
ti
tisíc
tisíce
to
tobě
tohle
toto
třeba
tři
třináct
trošku
tvá
tvé
tvoje
tvůj
ty
určitě
vám
vámi
vás
váš
vaše
vaši
ve
večer
vedle
vlastně
všechno
všichni
vůbec
vy
vždy
za
zač
zatímco
ze
že
""".split()
)

20
spacy/lang/et/__init__.py Normal file
View File

@ -0,0 +1,20 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
class EstonianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "et"
stop_words = STOP_WORDS
class Estonian(Language):
lang = "et"
Defaults = EstonianDefaults
__all__ = ["Estonian"]

View File

@ -0,0 +1,45 @@
# coding: utf8
from __future__ import unicode_literals
# Source: https://github.com/stopwords-iso/stopwords-et
STOP_WORDS = set(
"""
aga
ei
et
ja
jah
kas
kui
kõik
ma
me
mida
midagi
mind
minu
mis
mu
mul
mulle
nad
nii
oled
olen
oli
oma
on
pole
sa
seda
see
selle
siin
siis
ta
te
ära
""".split()
)

20
spacy/lang/is/__init__.py Normal file
View File

@ -0,0 +1,20 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
class IcelandicDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "is"
stop_words = STOP_WORDS
class Icelandic(Language):
lang = "is"
Defaults = IcelandicDefaults
__all__ = ["Icelandic"]

162
spacy/lang/is/stop_words.py Normal file
View File

@ -0,0 +1,162 @@
# coding: utf8
from __future__ import unicode_literals
# Source: https://github.com/Xangis/extra-stopwords
STOP_WORDS = set(
"""
afhverju
aftan
aftur
afþví
aldrei
allir
allt
alveg
annað
annars
bara
dag
eða
eftir
eiga
einhver
einhverjir
einhvers
eins
einu
eitthvað
ekkert
ekki
ennþá
eru
fara
fer
finna
fjöldi
fólk
framan
frá
frekar
fyrir
gegnum
geta
getur
gmg
gott
hann
hafa
hef
hefur
heyra
hér
hérna
hjá
hún
hvað
hvar
hver
hverjir
hverjum
hvernig
hvor
hvort
hægt
img
inn
kannski
koma
líka
lol
maður
mátt
mér
með
mega
meira
mig
mikið
minna
minni
missa
mjög
nei
niður
núna
oft
okkar
okkur
póst
póstur
rofl
saman
sem
sér
sig
sinni
síðan
sjá
smá
smátt
spurja
spyrja
staðar
stórt
svo
svona
sælir
sæll
taka
takk
til
tilvitnun
titlar
upp
var
vel
velkomin
velkominn
vera
verður
verið
vel
við
vil
vilja
vill
vita
væri
yfir
ykkar
það
þakka
þakkir
þannig
það
þar
þarf
þau
þeim
þeir
þeirra
þeirra
þegar
þess
þessa
þessi
þessu
þessum
þetta
þér
þið
þinn
þitt
þín
þráð
þráður
því
þær
ætti
""".split()
)

View File

@ -2,14 +2,13 @@
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG
class KannadaDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "kn"
stop_words = STOP_WORDS

View File

@ -2,7 +2,7 @@
from __future__ import unicode_literals
STOP_WORD = set(
STOP_WORDS = set(
"""
ಮತ

20
spacy/lang/lt/__init__.py Normal file
View File

@ -0,0 +1,20 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
class LithuanianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "lt"
stop_words = STOP_WORDS
class Lithuanian(Language):
lang = "lt"
Defaults = LithuanianDefaults
__all__ = ["Lithuanian"]

484
spacy/lang/lt/stop_words.py Normal file
View File

@ -0,0 +1,484 @@
# coding: utf8
from __future__ import unicode_literals
# Source: https://github.com/stopwords-iso/stopwords-lt
STOP_WORDS = set(
"""
abi
abidvi
abiejose
abiejuose
abiejø
abiem
abigaliai
abipus
abu
abudu
ai
ana
anaiptol
anaisiais
anajai
anajam
anajame
anapus
anas
anasai
anasis
anei
aniedvi
anieji
aniesiems
anoji
anojo
anojoje
anokia
anoks
anosiomis
anosioms
anosios
anosiose
anot
ant
antai
anuodu
anuoju
anuosiuose
anuosius
anàja
anàjà
anàjá
anàsias
anøjø
apie
aplink
ar
arba
argi
arti
aukðèiau
be
bei
beje
bemaþ
bent
bet
betgi
beveik
dar
dargi
daugmaþ
deja
dëka
dël
dëlei
dëlto
ech
et
gal
galbût
galgi
gan
gana
gi
greta
idant
iki
ir
irgi
it
itin
iðilgai
iðvis
jaisiais
jajai
jajam
jajame
jei
jeigu
ji
jiedu
jiedvi
jieji
jiesiems
jinai
jis
jisai
jog
joji
jojo
jojoje
jokia
joks
josiomis
josioms
josios
josiose
judu
judvi
juk
jumis
jums
jumyse
juodu
juoju
juosiuose
juosius
jus
jàja
jàjà
jàsias
jájá
jøjø
jûs
jûsiðkis
jûsiðkë
jûsø
kad
kada
kadangi
kai
kaip
kaipgi
kas
katra
katras
katriedvi
katruodu
kaþin
kaþkas
kaþkatra
kaþkatras
kaþkokia
kaþkoks
kaþkuri
kaþkuris
kiaurai
kiek
kiekvienas
kieno
kita
kitas
kitokia
kitoks
kodël
kokia
koks
kol
kolei
kone
kuomet
kur
kurgi
kuri
kuriedvi
kuris
kuriuodu
lai
lig
ligi
link
lyg
man
manaisiais
manajai
manajam
manajame
manas
manasai
manasis
mane
manieji
maniesiems
manim
manimi
maniðkis
maniðkë
mano
manoji
manojo
manojoje
manosiomis
manosioms
manosios
manosiose
manuoju
manuosiuose
manuosius
manyje
manàja
manàjà
manàjá
manàsias
manæs
manøjø
mat
maþdaug
maþne
mes
mudu
mudvi
mumis
mums
mumyse
mus
mûsiðkis
mûsiðkë
mûsø
na
nagi
ne
nebe
nebent
negi
negu
nei
nejau
nejaugi
nekaip
nelyginant
nes
net
netgi
netoli
neva
nors
nuo
o
ogi
oi
paeiliui
pagal
pakeliui
palaipsniui
palei
pas
pasak
paskos
paskui
paskum
pat
pati
patiems
paties
pats
patys
patá
paèiais
paèiam
paèiame
paèiu
paèiuose
paèius
paèiø
per
pernelyg
pirm
pirma
pirmiau
po
prie
prieð
prieðais
pro
pusiau
rasi
rodos
sau
savaisiais
savajai
savajam
savajame
savas
savasai
savasis
save
savieji
saviesiems
savimi
saviðkis
saviðkë
savo
savoji
savojo
savojoje
savosiomis
savosioms
savosios
savosiose
savuoju
savuosiuose
savuosius
savyje
savàja
savàjà
savàjá
savàsias
savæs
savøjø
skersai
skradþiai
staèiai
su
sulig
ta
tad
tai
taigi
taip
taipogi
taisiais
tajai
tajam
tajame
tamsta
tarp
tarsi
tartum
tarytum
tas
tasai
tau
tavaisiais
tavajai
tavajam
tavajame
tavas
tavasai
tavasis
tave
tavieji
taviesiems
tavimi
taviðkis
taviðkë
tavo
tavoji
tavojo
tavojoje
tavosiomis
tavosioms
tavosios
tavosiose
tavuoju
tavuosiuose
tavuosius
tavyje
tavàja
tavàjà
tavàjá
tavàsias
tavæs
tavøjø
taèiau
te
tegu
tegul
tiedvi
tieji
ties
tiesiems
tiesiog
tik
tikriausiai
tiktai
toji
tojo
tojoje
tokia
toks
tol
tolei
toliau
tosiomis
tosioms
tosios
tosiose
tu
tuodu
tuoju
tuosiuose
tuosius
turbût
tàja
tàjà
tàjá
tàsias
tøjø
tûlas
uþtat
uþvis
va
vai
viduj
vidury
vien
vienas
vienokia
vienoks
vietoj
virð
virðuj
virðum
vis
vis dëlto
visa
visas
visgi
visokia
visoks
vos
vël
vëlgi
ypaè
á
ákypai
ástriþai
ðalia
ðe
ði
ðiaisiais
ðiajai
ðiajam
ðiajame
ðiapus
ðiedvi
ðieji
ðiesiems
ðioji
ðiojo
ðiojoje
ðiokia
ðioks
ðiosiomis
ðiosioms
ðiosios
ðiosiose
ðis
ðisai
ðit
ðita
ðitas
ðitiedvi
ðitokia
ðitoks
ðituodu
ðiuodu
ðiuoju
ðiuosiuose
ðiuosius
ðiàja
ðiàjà
ðiàsias
ðiøjø
ðtai
ðájá
þemiau
""".split()
)

20
spacy/lang/lv/__init__.py Normal file
View File

@ -0,0 +1,20 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
class LatvianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "lv"
stop_words = STOP_WORDS
class Latvian(Language):
lang = "lv"
Defaults = LatvianDefaults
__all__ = ["Latvian"]

171
spacy/lang/lv/stop_words.py Normal file
View File

@ -0,0 +1,171 @@
# coding: utf8
from __future__ import unicode_literals
# Source: https://github.com/stopwords-iso/stopwords-lv
STOP_WORDS = set(
"""
aiz
ap
apakš
apakšpus
ar
arī
augšpus
bet
bez
bija
biji
biju
bijām
bijāt
būs
būsi
būsiet
būsim
būt
būšu
caur
diemžēl
diezin
droši
dēļ
esam
esat
esi
esmu
gan
gar
iekam
iekams
iekām
iekāms
iekš
iekšpus
ik
ir
it
itin
iz
ja
jau
jeb
jebšu
jel
jo
ka
kamēr
kaut
kolīdz
kopš
kļuva
kļuvi
kļuvu
kļuvām
kļuvāt
kļūs
kļūsi
kļūsiet
kļūsim
kļūst
kļūstam
kļūstat
kļūsti
kļūstu
kļūt
kļūšu
labad
lai
lejpus
līdz
līdzko
ne
nebūt
nedz
nekā
nevis
nezin
no
nu
otrpus
pa
par
pat
pie
pirms
pret
priekš
pār
pēc
starp
tad
tak
tapi
taps
tapsi
tapsiet
tapsim
tapt
tapāt
tapšu
taču
te
tiec
tiek
tiekam
tiekat
tieku
tik
tika
tikai
tiki
tikko
tiklab
tiklīdz
tiks
tiksiet
tiksim
tikt
tiku
tikvien
tikām
tikāt
tikšu
tomēr
topat
turpretim
turpretī
tādēļ
tālab
tāpēc
un
uz
vai
var
varat
varēja
varēji
varēju
varējām
varējāt
varēs
varēsi
varēsiet
varēsim
varēt
varēšu
vien
virs
virspus
vis
viņpus
zem
ārpus
šaipus
""".split()
)

20
spacy/lang/sk/__init__.py Normal file
View File

@ -0,0 +1,20 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
class SlovakDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "sk"
stop_words = STOP_WORDS
class Slovak(Language):
lang = "sk"
Defaults = SlovakDefaults
__all__ = ["Slovak"]

231
spacy/lang/sk/stop_words.py Normal file
View File

@ -0,0 +1,231 @@
# coding: utf8
from __future__ import unicode_literals
# Source: https://github.com/stopwords-iso/stopwords-sk
STOP_WORDS = set(
"""
a
aby
aj
ak
ako
aký
ale
alebo
and
ani
asi
avšak
ba
bez
bol
bola
boli
bolo
bude
budem
budeme
budete
budeš
budú
buï
buď
by
byť
cez
dnes
do
ešte
for
ho
hoci
i
iba
ich
im
iné
iný
ja
je
jeho
jej
jemu
ju
k
kam
každá
každé
každí
každý
kde
kedže
keï
keď
kto
ktorou
ktorá
ktoré
ktorí
ktorý
ku
lebo
len
ma
mať
medzi
menej
mi
mna
mne
mnou
moja
moje
mu
musieť
my
máte
mòa
môcť
môj
môže
na
nad
nami
naši
nech
neho
nej
nemu
než
nich
nie
niektorý
nielen
nim
nič
no
nová
nové
noví
nový
nám
nás
náš
ním
o
od
odo
of
on
ona
oni
ono
ony
po
pod
podľa
pokiaľ
potom
pre
pred
predo
preto
pretože
prečo
pri
prvá
prvé
prví
prvý
práve
pýta
s
sa
seba
sem
si
sme
so
som
späť
ste
svoj
svoje
svojich
svojím
svojími
ta
tak
taký
takže
tam
te
teba
tebe
tebou
teda
tej
ten
tento
the
ti
tie
tieto
tiež
to
toho
tohoto
tom
tomto
tomu
tomuto
toto
tou
tu
tvoj
tvojími
ty
táto
túto
tým
týmto
v
vami
vaše
veï
viac
vo
vy
vám
vás
váš
však
všetok
z
za
zo
a
áno
èi
èo
èí
òom
òou
òu
či
čo
ďalšia
ďalšie
ďalší
že
""".split()
)

20
spacy/lang/sl/__init__.py Normal file
View File

@ -0,0 +1,20 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
class SlovenianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "sl"
stop_words = STOP_WORDS
class Slovenian(Language):
lang = "sl"
Defaults = SlovenianDefaults
__all__ = ["Slovenian"]

458
spacy/lang/sl/stop_words.py Normal file
View File

@ -0,0 +1,458 @@
# coding: utf8
from __future__ import unicode_literals
# Source: https://github.com/stopwords-iso/stopwords-sl
# TODO: probably needs to be tidied up; the list seems to have month names in
# it, which shouldn't be considered stop words.
STOP_WORDS = set(
"""
a
ali
april
avgust
b
bi
bil
bila
bile
bili
bilo
biti
blizu
bo
bodo
bojo
bolj
bom
bomo
boste
bova
boš
brez
c
cel
cela
celi
celo
d
da
daleč
dan
danes
datum
december
deset
deseta
deseti
deseto
devet
deveta
deveti
deveto
do
dober
dobra
dobri
dobro
dokler
dol
dolg
dolga
dolgi
dovolj
drug
druga
drugi
drugo
dva
dve
e
eden
en
ena
ene
eni
enkrat
eno
etc.
f
februar
g
g.
ga
ga.
gor
gospa
gospod
h
halo
i
idr.
ii
iii
in
iv
ix
iz
j
januar
jaz
je
ji
jih
jim
jo
julij
junij
jutri
k
kadarkoli
kaj
kajti
kako
kakor
kamor
kamorkoli
kar
karkoli
katerikoli
kdaj
kdo
kdorkoli
ker
ki
kje
kjer
kjerkoli
ko
koder
koderkoli
koga
komu
kot
kratek
kratka
kratke
kratki
l
lahka
lahke
lahki
lahko
le
lep
lepa
lepe
lepi
lepo
leto
m
maj
majhen
majhna
majhni
malce
malo
manj
marec
me
med
medtem
mene
mesec
mi
midva
midve
mnogo
moj
moja
moje
mora
morajo
moram
moramo
morate
moraš
morem
mu
n
na
nad
naj
najina
najino
najmanj
naju
največ
nam
narobe
nas
nato
nazaj
naš
naša
naše
ne
nedavno
nedelja
nek
neka
nekaj
nekatere
nekateri
nekatero
nekdo
neke
nekega
neki
nekje
neko
nekoga
nekoč
ni
nikamor
nikdar
nikjer
nikoli
nič
nje
njega
njegov
njegova
njegovo
njej
njemu
njen
njena
njeno
nji
njih
njihov
njihova
njihovo
njiju
njim
njo
njun
njuna
njuno
no
nocoj
november
npr.
o
ob
oba
obe
oboje
od
odprt
odprta
odprti
okoli
oktober
on
onadva
one
oni
onidve
osem
osma
osmi
osmo
oz.
p
pa
pet
peta
petek
peti
peto
po
pod
pogosto
poleg
poln
polna
polni
polno
ponavadi
ponedeljek
ponovno
potem
povsod
pozdravljen
pozdravljeni
prav
prava
prave
pravi
pravo
prazen
prazna
prazno
prbl.
precej
pred
prej
preko
pri
pribl.
približno
primer
pripravljen
pripravljena
pripravljeni
proti
prva
prvi
prvo
r
ravno
redko
res
reč
s
saj
sam
sama
same
sami
samo
se
sebe
sebi
sedaj
sedem
sedma
sedmi
sedmo
sem
september
seveda
si
sicer
skoraj
skozi
slab
smo
so
sobota
spet
sreda
srednja
srednji
sta
ste
stran
stvar
sva
t
ta
tak
taka
take
taki
tako
takoj
tam
te
tebe
tebi
tega
težak
težka
težki
težko
ti
tista
tiste
tisti
tisto
tj.
tja
to
toda
torek
tretja
tretje
tretji
tri
tu
tudi
tukaj
tvoj
tvoja
tvoje
u
v
vaju
vam
vas
vaš
vaša
vaše
ve
vedno
velik
velika
veliki
veliko
vendar
ves
več
vi
vidva
vii
viii
visok
visoka
visoke
visoki
vsa
vsaj
vsak
vsaka
vsakdo
vsake
vsaki
vsakomur
vse
vsega
vsi
vso
včasih
včeraj
x
z
za
zadaj
zadnji
zakaj
zaprta
zaprti
zaprto
zdaj
zelo
zunaj
č
če
često
četrta
četrtek
četrti
četrto
čez
čigav
š
šest
šesta
šesti
šesto
štiri
ž
že
""".split()
)

20
spacy/lang/sq/__init__.py Normal file
View File

@ -0,0 +1,20 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
class AlbanianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "sq"
stop_words = STOP_WORDS
class Albanian(Language):
lang = "sq"
Defaults = AlbanianDefaults
__all__ = ["Albanian"]

233
spacy/lang/sq/stop_words.py Normal file
View File

@ -0,0 +1,233 @@
# coding: utf8
from __future__ import unicode_literals
# Source: https://github.com/andrixh/index-albanian
STOP_WORDS = set(
"""
a
afert
ai
ajo
andej
anes
aq
as
asaj
ashtu
ata
ate
atij
atje
ato
aty
atyre
b
be
behem
behet
bej
beje
bejne
ben
bene
bere
beri
bie
c
ca
cdo
cfare
cila
cilat
cilave
cilen
ciles
cilet
cili
cilin
cilit
deri
dhe
dic
dicka
dickaje
dike
dikujt
dikush
disa
do
dot
drejt
duke
dy
e
edhe
ende
eshte
etj
fare
gjate
gje
gjitha
gjithcka
gjithe
gjithnje
here
i
ia
ishin
ishte
iu
ja
jam
jane
jap
je
jemi
jo
ju
k
ka
kam
kane
kem
kemi
keq
kesaj
keshtu
kete
ketej
ketij
keto
ketu
ketyre
kishin
kishte
kjo
krejt
kryer
kryesisht
kryhet
ku
kudo
kundrejt
kur
kurre
kush
ky
la
le
lloj
m
ma
madhe
marr
marre
mban
mbi
me
menjehere
merr
merret
mes
mi
midis
mire
mjaft
mori
mos
mua
mund
na
ndaj
nder
ndermjet
ndersa
ndonje
ndryshe
ne
nen
neper
nepermjet
nese
nga
nje
njera
nuk
ose
pa
pak
papritur
para
pas
pasi
pasur
per
perbashket
perpara
po
por
prane
prapa
prej
pse
qe
qene
qenet
rralle
rreth
rri
s
sa
saj
sapo
se
secila
sepse
sh
shih
shume
si
sic
sikur
sipas
siper
sone
t
ta
tani
te
tej
tek
teper
tere
ti
tij
tilla
tille
tjera
tjeret
tjeter
tjetren
to
tone
ty
tyre
u
ua
une
vazhdimisht
vend
vet
veta
vete
vetem
veten
vetes
vjen
yne
zakonisht
""".split()
)

View File

@ -9,11 +9,59 @@ def pytest_addoption(parser):
parser.addoption("--slow", action="store_true", help="include slow tests")
def pytest_runtest_setup(item):
def getopt(opt):
# When using 'pytest --pyargs spacy' to test an installed copy of
# spacy, pytest skips running our pytest_addoption() hook. Later, when
# we call getoption(), pytest raises an error, because it doesn't
# recognize the option we're asking about. To avoid this, we need to
# pass a default value. We default to False, i.e., we act like all the
# options weren't given.
return item.config.getoption("--%s" % opt, False)
for opt in ["slow"]:
if opt in item.keywords and not getopt(opt):
pytest.skip("need --%s option to run" % opt)
# Fixtures for language tokenizers (languages sorted alphabetically)
@pytest.fixture(scope="module")
def tokenizer():
return get_lang_class("xx").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def ar_tokenizer():
return get_lang_class("ar").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def bn_tokenizer():
return get_lang_class("bn").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def ca_tokenizer():
return get_lang_class("ca").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def da_tokenizer():
return get_lang_class("da").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def de_tokenizer():
return get_lang_class("de").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def el_tokenizer():
return get_lang_class("el").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def en_tokenizer():
return get_lang_class("en").Defaults.create_tokenizer()
@ -36,8 +84,8 @@ def es_tokenizer():
@pytest.fixture(scope="session")
def de_tokenizer():
return get_lang_class("de").Defaults.create_tokenizer()
def fi_tokenizer():
return get_lang_class("fi").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
@ -45,21 +93,21 @@ def fr_tokenizer():
return get_lang_class("fr").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def ga_tokenizer():
return get_lang_class("ga").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def he_tokenizer():
return get_lang_class("he").Defaults.create_tokenizer()
@pytest.fixture
def hu_tokenizer():
return get_lang_class("hu").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def fi_tokenizer():
return get_lang_class("fi").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def ro_tokenizer():
return get_lang_class("ro").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def id_tokenizer():
return get_lang_class("id").Defaults.create_tokenizer()
@ -71,23 +119,9 @@ def it_tokenizer():
@pytest.fixture(scope="session")
def sv_tokenizer():
return get_lang_class("sv").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def bn_tokenizer():
return get_lang_class("bn").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def ga_tokenizer():
return get_lang_class("ga").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def he_tokenizer():
return get_lang_class("he").Defaults.create_tokenizer()
def ja_tokenizer():
pytest.importorskip("MeCab")
return get_lang_class("ja").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
@ -96,14 +130,34 @@ def nb_tokenizer():
@pytest.fixture(scope="session")
def da_tokenizer():
return get_lang_class("da").Defaults.create_tokenizer()
def nl_tokenizer():
return get_lang_class("nl").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def ja_tokenizer():
pytest.importorskip("MeCab")
return get_lang_class("ja").Defaults.create_tokenizer()
def pl_tokenizer():
return get_lang_class("pl").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def pt_tokenizer():
return get_lang_class("pt").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def ro_tokenizer():
return get_lang_class("ro").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def ru_tokenizer():
pytest.importorskip("pymorphy2")
return get_lang_class("ru").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def sv_tokenizer():
return get_lang_class("sv").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
@ -117,58 +171,17 @@ def tr_tokenizer():
return get_lang_class("tr").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def tt_tokenizer():
return get_lang_class("tt").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def uk_tokenizer():
pytest.importorskip("pymorphy2")
return get_lang_class("uk").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def ca_tokenizer():
return get_lang_class("ca").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def pl_tokenizer():
return get_lang_class("pl").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def tt_tokenizer():
return get_lang_class("tt").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def el_tokenizer():
return get_lang_class("el").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def ar_tokenizer():
return get_lang_class("ar").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def ur_tokenizer():
return get_lang_class("ur").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def ru_tokenizer():
pytest.importorskip("pymorphy2")
return get_lang_class("ru").Defaults.create_tokenizer()
def pytest_runtest_setup(item):
def getopt(opt):
# When using 'pytest --pyargs spacy' to test an installed copy of
# spacy, pytest skips running our pytest_addoption() hook. Later, when
# we call getoption(), pytest raises an error, because it doesn't
# recognize the option we're asking about. To avoid this, we need to
# pass a default value. We default to False, i.e., we act like all the
# options weren't given.
return item.config.getoption("--%s" % opt, False)
for opt in ["slow"]:
if opt in item.keywords and not getopt(opt):
pytest.skip("need --%s option to run" % opt)
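The tokenizer fixtures collected above are session-scoped and requested by name from the language-specific test modules. A hypothetical test following that pattern (the test name and example text are illustrative, not part of the diff):

```python
def test_en_tokenizer_handles_final_period(en_tokenizer):
    doc = en_tokenizer("Hello world.")
    assert [t.text for t in doc] == ["Hello", "world", "."]
```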

View File

@ -0,0 +1,114 @@
# coding: utf-8
from __future__ import unicode_literals
from ..util import get_doc
from ...vocab import Vocab
from ...tokens import Doc
from ...tokens import Span
import pytest
def test_doc_split(en_tokenizer):
text = "LosAngeles start."
heads = [1, 1, 0]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
assert len(doc) == 3
assert len(str(doc)) == 19
assert doc[0].head.text == 'start'
assert doc[1].head.text == '.'
with doc.retokenize() as retokenizer:
retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], attrs={'tag':'NNP', 'lemma':'Los Angeles', 'ent_type':'GPE'})
assert len(doc) == 4
assert doc[0].text == 'Los'
assert doc[0].head.text == 'Angeles'
assert doc[0].idx == 0
assert doc[1].idx == 3
assert doc[1].text == 'Angeles'
assert doc[1].head.text == 'start'
assert doc[2].text == 'start'
assert doc[2].head.text == '.'
assert doc[3].text == '.'
assert doc[3].head.text == '.'
assert len(str(doc)) == 19
def test_split_dependencies(en_tokenizer):
text = "LosAngeles start."
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, [t.text for t in tokens])
dep1 = doc.vocab.strings.add('amod')
dep2 = doc.vocab.strings.add('subject')
with doc.retokenize() as retokenizer:
retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], [dep1, dep2])
assert doc[0].dep == dep1
assert doc[1].dep == dep2
def test_split_heads_error(en_tokenizer):
text = "LosAngeles start."
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, [t.text for t in tokens])
#Not enough heads
with pytest.raises(ValueError):
with doc.retokenize() as retokenizer:
retokenizer.split(doc[0], ["Los", "Angeles"], [0])
#Too many heads
with pytest.raises(ValueError):
with doc.retokenize() as retokenizer:
retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1, 0])
#No token head
with pytest.raises(ValueError):
with doc.retokenize() as retokenizer:
retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1])
#Several token heads
with pytest.raises(ValueError):
with doc.retokenize() as retokenizer:
retokenizer.split(doc[0], ["Los", "Angeles"], [0, 0])
def test_spans_entity_merge_iob():
# Test entity IOB stays consistent after merging
words = ["abc", "d", "e"]
doc = Doc(Vocab(), words=words)
doc.ents = [(doc.vocab.strings.add('ent-abcd'), 0, 2)]
assert doc[0].ent_iob_ == "B"
assert doc[1].ent_iob_ == "I"
with doc.retokenize() as retokenizer:
retokenizer.split(doc[0], ["a", "b", "c"], [1, 1, 0])
assert doc[0].ent_iob_ == "B"
assert doc[1].ent_iob_ == "I"
assert doc[2].ent_iob_ == "I"
assert doc[3].ent_iob_ == "I"
def test_spans_sentence_update_after_merge(en_tokenizer):
text = "StewartLee is a stand up comedian. He lives in England and loves JoePasquale."
heads = [1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2]
deps = ['nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr',
'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj',
'compound', 'punct']
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
sent1, sent2 = list(doc.sents)
init_len = len(sent1)
init_len2 = len(sent2)
with doc.retokenize() as retokenizer:
retokenizer.split(doc[0], ["Stewart", "Lee"], [1, 0])
retokenizer.split(doc[14], ["Joe", "Pasquale"], [1, 0])
sent1, sent2 = list(doc.sents)
assert len(sent1) == init_len + 1
assert len(sent2) == init_len2 + 1

View File

@ -0,0 +1,21 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
from spacy.util import get_lang_class
# fmt: off
# Only include languages with no external dependencies
# excluded: ja, ru, th, uk, vi, zh
LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
"et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is",
"it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk",
"sl", "sq", "sv", "ta", "te", "tl", "tr", "tt", "ur"]
# fmt: on
@pytest.mark.parametrize("lang", LANGUAGES)
def test_lang_initialize(lang):
"""Test that languages can be initialized."""
lang_cls = get_lang_class(lang)()

View File

@ -7,7 +7,9 @@ from spacy.compat import pickle
def test_issue2833(en_vocab):
"""Test that a custom error is raised if a token is pickled."""
"""Test that a custom error is raised if a token or span is pickled."""
doc = Doc(en_vocab, words=["Hello", "world"])
with pytest.raises(NotImplementedError):
pickle.dumps(doc[0])
with pytest.raises(NotImplementedError):
pickle.dumps(doc[0:2])

View File

@ -1,24 +1,25 @@
'''Test that labels are mapped to classes consistently when loading NER model.'''
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.en import English
import pytest
@pytest.mark.xfail
def test_issue3209():
'''Test issue that occurred in spaCy nightly where NER labels were being
"""Test issue that occurred in spaCy nightly where NER labels were being
mapped to classes incorrectly after loading the model, when the labels
were added using ner.add_label().
'''
"""
nlp = English()
ner = nlp.create_pipe('ner')
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
ner.add_label('ANIMAL')
ner.add_label("ANIMAL")
nlp.begin_training()
move_names = ['O', 'B-ANIMAL', 'I-ANIMAL', 'L-ANIMAL', 'U-ANIMAL']
move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
assert ner.move_names == move_names
nlp2 = English()
nlp2.add_pipe(nlp2.create_pipe('ner'))
nlp2.add_pipe(nlp2.create_pipe("ner"))
nlp2.from_bytes(nlp.to_bytes())
assert nlp2.get_pipe('ner').move_names == move_names
assert nlp2.get_pipe("ner").move_names == move_names

View File

@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
from spacy.matcher import PhraseMatcher
from spacy.lang.en import English
from spacy.compat import pickle

View File

@ -45,12 +45,12 @@ cdef class Retokenizer:
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
self.merges.append((span, attrs))
def split(self, Token token, orths, attrs=SimpleFrozenDict()):
def split(self, Token token, orths, heads, deps=[], attrs=SimpleFrozenDict()):
"""Mark a Token for splitting, into the specified orths. The attrs
will be applied to each subtoken.
"""
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
self.splits.append((token.start_char, orths, attrs))
self.splits.append((token.i, orths, heads, deps, attrs))
def __enter__(self):
self.merges = []
@ -67,8 +67,12 @@ cdef class Retokenizer:
end = span.end
_merge(self.doc, start, end, attrs)
for start_char, orths, attrs in self.splits:
raise NotImplementedError
offset = 0
# Iterate in order, to keep the offset simple.
for token_index, orths, heads, deps, attrs in sorted(self.splits):
_split(self.doc, token_index + offset, orths, heads, deps, attrs)
# Adjust for the previous tokens
offset += len(orths)-1
def _merge(Doc doc, int start, int end, attributes):
"""Retokenize the document, such that the span at
@ -299,3 +303,110 @@ def _resize_tensor(tensor, ranges):
delete.append(i)
xp = get_array_module(tensor)
return xp.delete(tensor, delete, axis=0)
def _split(Doc doc, int token_index, orths, heads, deps, attrs):
"""Retokenize the document, such that the token at
`doc[token_index]` is split into tokens with the orth 'orths'
token_index(int): token index of the token to split.
orths: IDs of the verbatim text content of the tokens to create
**attributes: Attributes to assign to each of the newly created tokens. By default,
attributes are inherited from the original token.
RETURNS (Token): The first newly created token.
"""
cdef int nb_subtokens = len(orths)
cdef const LexemeC* lex
cdef TokenC* token
cdef TokenC orig_token = doc.c[token_index]
if(len(heads) != nb_subtokens):
raise ValueError(Errors.E115)
token_head_index = -1
for index, head in enumerate(heads):
if head == 0:
if token_head_index != -1:
raise ValueError(Errors.E114)
token_head_index = index
if token_head_index == -1:
raise ValueError(Errors.E113)
# First, make the dependencies absolutes, and adjust all possible dependencies before
# creating the tokens
for i in range(doc.length):
doc.c[i].head += i
# Adjust dependencies
offset = nb_subtokens - 1
for i in range(doc.length):
head_idx = doc.c[i].head
if head_idx == token_index:
doc.c[i].head = token_head_index
elif head_idx > token_index:
doc.c[i].head += offset
new_token_head = doc.c[token_index].head
# Double doc.c max_length if necessary (until big enough for all new tokens)
while doc.length + nb_subtokens - 1 >= doc.max_length:
doc._realloc(doc.length * 2)
# Move tokens after the split to create space for the new tokens
doc.length = len(doc) + nb_subtokens -1
for token_to_move in range(doc.length - 1, token_index, -1):
doc.c[token_to_move + nb_subtokens - 1] = doc.c[token_to_move]
# Host the tokens in the newly created space
cdef int idx_offset = 0
for i, orth in enumerate(orths):
token = &doc.c[token_index + i]
lex = doc.vocab.get(doc.mem, orth)
token.lex = lex
# Update the character offset of the subtokens
if i != 0:
token.idx = orig_token.idx + idx_offset
idx_offset += len(orth)
# Set token.spacy to False for all non-last split tokens, and
# to origToken.spacy for the last token
if (i < nb_subtokens - 1):
token.spacy = False
else:
token.spacy = orig_token.spacy
# Apply attrs to each subtoken
for attr_name, attr_value in attrs.items():
if attr_name == TAG:
doc.vocab.morphology.assign_tag(token, attr_value)
else:
Token.set_struct_attr(token, attr_name, attr_value)
# Make IOB consistent
if (orig_token.ent_iob == 3):
if i == 0:
token.ent_iob = 3
else:
token.ent_iob = 1
else:
# In all other cases subtokens inherit iob from origToken
token.ent_iob = orig_token.ent_iob
# Use the head of the new token everywhere. This will be partially overwritten later on.
token.head = new_token_head
# Transform the dependencies into relative ones again
for i in range(doc.length):
doc.c[i].head -= i
# Assign correct dependencies to the inner token
for i, head in enumerate(heads):
if head != 0:
# the token's head's head is already correct
doc.c[token_index + i].head = head
for i, dep in enumerate(deps):
doc[token_index + i].dep = dep
# set children from head
set_children_from_heads(doc.c, doc.length)
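Putting the pieces together, the new split() takes the subtoken orths, one head entry per subtoken (0 marks the subtoken that takes over the original token's head, other values are offsets to sibling subtokens), plus optional deps and attrs. A rough usage sketch against the code added in this diff (the released API may differ; no statistical models are assumed):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("LosAngeles start.")
assert [t.text for t in doc] == ["LosAngeles", "start", "."]

with doc.retokenize() as retokenizer:
    # [1, 0]: "Los" attaches to the subtoken one position to its right
    # ("Angeles"), and "Angeles" (head = 0) keeps the original token's head.
    retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0])

assert [t.text for t in doc] == ["Los", "Angeles", "start", "."]
assert doc[1].idx == 3  # character offsets are recomputed for the subtokens
```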

View File

@ -141,6 +141,9 @@ cdef class Span:
for i in range(self.start, self.end):
yield self.doc[i]
def __reduce__(self):
raise NotImplementedError(Errors.E112)
@property
def _(self):
"""User space for adding custom attribute extensions."""

View File

@ -53,8 +53,8 @@ def get_lang_class(lang):
if lang not in LANGUAGES:
try:
module = importlib.import_module(".lang.%s" % lang, "spacy")
except ImportError:
raise ImportError(Errors.E048.format(lang=lang))
except ImportError as err:
raise ImportError(Errors.E048.format(lang=lang, err=err))
LANGUAGES[lang] = getattr(module, module.__all__[0])
return LANGUAGES[lang]
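Chaining the original exception into E048 means a missing or broken language module now reports its real cause instead of a bare "can't import" message. A small sketch (the "xyz" code is a placeholder for a language that doesn't exist):

```python
from spacy.util import get_lang_class

try:
    get_lang_class("xyz")  # there is no spacy.lang.xyz module
except ImportError as err:
    print(err)  # the E048 text now ends with the underlying import error
```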