Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2017-11-03 13:29:56 +01:00
commit 33bd2428db
36 changed files with 1409 additions and 97 deletions

View File

@ -87,7 +87,7 @@ U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statement below. Please do NOT
mark both statements:
* [ ] I am signing on behalf of myself as an individual and no other person
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
@ -98,9 +98,9 @@ mark both statements:
| Field | Entry |
|------------------------------- | -------------------- |
| Name | |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | |
| GitHub username | |
| Website (optional) | |
| Name | Abhinav Sharma |
| Company name (if applicable) | Fourtek I.T. Solutions Pvt. Ltd. |
| Title or role (if applicable) | Machine Learning Engineer |
| Date | 3 November 2017 |
| GitHub username | abhi18av |
| Website (optional) | https://abhi18av.github.io/ |

View File

@ -150,10 +150,10 @@ class PrecomputableAffine(Model):
def _backprop_padding(self, dY, ids):
# (1, nF, nO, nP) += (nN, nF, nO, nP) where IDs (nN, nF) < 0
for i in range(ids.shape[0]):
for j in range(ids.shape[1]):
if ids[i,j] < 0:
self.d_pad[0,j] += dY[i, j]
mask = ids < 0.
mask = mask.sum(axis=1)
d_pad = dY * mask.reshape((ids.shape[0], 1, 1))
self.d_pad += d_pad.sum(axis=0)
return dY, ids
@staticmethod
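
For context, the rewritten _backprop_padding replaces the per-element Python loop with array operations. Below is a minimal NumPy sketch, with hypothetical sizes and plain numpy rather than the thinc ops API, of the accumulation the loop performs: the padding gradient collects only those rows of dY whose feature ID is negative.

import numpy as np

# Hypothetical sizes: nN examples, nF features, nO outputs, nP pieces.
nN, nF, nO, nP = 6, 3, 4, 2
dY = np.random.rand(nN, nF, nO, nP)
ids = np.random.randint(-1, 5, size=(nN, nF))

# Loop form, as in the removed code.
d_pad_loop = np.zeros((1, nF, nO, nP))
for i in range(nN):
    for j in range(nF):
        if ids[i, j] < 0:
            d_pad_loop[0, j] += dY[i, j]

# One vectorised equivalent: broadcast a boolean mask over the trailing dims.
mask = (ids < 0)[:, :, None, None]                  # (nN, nF, 1, 1)
d_pad_vec = (dY * mask).sum(axis=0, keepdims=True)  # (1, nF, nO, nP)

assert np.allclose(d_pad_loop, d_pad_vec)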

View File

@ -85,6 +85,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
batch_sizes = util.compounding(util.env_opt('batch_from', 1),
util.env_opt('batch_to', 16),
util.env_opt('batch_compound', 1.001))
max_doc_len = util.env_opt('max_doc_len', 5000)
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
n_train_words = corpus.count_train()
@ -108,6 +109,9 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
losses = {}
for batch in minibatch(train_docs, size=batch_sizes):
batch = [(d, g) for (d, g) in batch if len(d) < max_doc_len]
if not batch:
continue
docs, golds = zip(*batch)
nlp.update(docs, golds, sgd=optimizer,
drop=next(dropout_rates), losses=losses)
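
For context, the new max_doc_len option drops overlong documents from each minibatch before the update, and the continue guard is needed because a batch can end up empty after filtering. A condensed sketch of the loop, assuming nlp, optimizer and train_docs (yielding (doc, gold) pairs) are set up as elsewhere in this CLI module:

batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                               util.env_opt('batch_to', 16),
                               util.env_opt('batch_compound', 1.001))
max_doc_len = util.env_opt('max_doc_len', 5000)

for batch in minibatch(train_docs, size=batch_sizes):
    # Skip (doc, gold) pairs whose Doc is too long to train on comfortably.
    batch = [(d, g) for (d, g) in batch if len(d) < max_doc_len]
    if not batch:          # every document in this batch was filtered out
        continue
    docs, golds = zip(*batch)
    nlp.update(docs, golds, sgd=optimizer, losses=losses)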

View File

@ -20,7 +20,7 @@ for exc_data in [
{ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"},
{ORTH: "সে.মি", LEMMA: "সেন্টিমিটার"},
{ORTH: "মি.লি.", LEMMA: "মিলিলিটার"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]
_exc[exc_data[ORTH]] = [exc_data]
TOKENIZER_EXCEPTIONS = _exc
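
The recurring change in this and the following exception files drops the redundant dict(exc_data) copy: each surface form maps to a list of per-token attribute dicts, so the dict can be stored as-is. A minimal sketch of the pattern (spaCy 2.x symbols; the multi-token entry is a hypothetical illustration of why the value is a list):

from spacy.symbols import ORTH, LEMMA

_exc = {}

# Single-token exception: one attribute dict per surface form.
for exc_data in [{ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"}]:
    _exc[exc_data[ORTH]] = [exc_data]        # no dict(exc_data) copy needed

# The value is a list because an exception may split into several tokens,
# e.g. English "gonna" -> "gon" (lemma "go") + "na" (lemma "to").
_exc["gonna"] = [{ORTH: "gon", LEMMA: "go"},
                 {ORTH: "na", LEMMA: "to"}]

TOKENIZER_EXCEPTIONS = _exc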

View File

@ -8,7 +8,6 @@ _exc = {}
for exc_data in [
{ORTH: "Kbh.", LEMMA: "København", NORM: "København"},
{ORTH: "Jan.", LEMMA: "januar", NORM: "januar"},
{ORTH: "Feb.", LEMMA: "februar", NORM: "februar"},
{ORTH: "Mar.", LEMMA: "marts", NORM: "marts"},
@ -21,7 +20,7 @@ for exc_data in [
{ORTH: "Okt.", LEMMA: "oktober", NORM: "oktober"},
{ORTH: "Nov.", LEMMA: "november", NORM: "november"},
{ORTH: "Dec.", LEMMA: "december", NORM: "december"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]
_exc[exc_data[ORTH]] = [exc_data]
for orth in [
"A/S", "beg.", "bl.a.", "ca.", "d.s.s.", "dvs.", "f.eks.", "fr.", "hhv.",

View File

@ -164,7 +164,7 @@ for exc_data in [
{ORTH: "z.b.", LEMMA: "zum Beispiel"},
{ORTH: "zzgl.", LEMMA: "zuzüglich"},
{ORTH: "österr.", LEMMA: "österreichisch", NORM: "österreichisch"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]
_exc[exc_data[ORTH]] = [exc_data]
for orth in [

View File

@ -276,7 +276,7 @@ for exc_data in [
exc_data_apos = dict(exc_data)
exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
for data in [exc_data, exc_data_apos]:
_exc[data[ORTH]] = [dict(data)]
_exc[data[ORTH]] = [data]
# Times
@ -440,7 +440,7 @@ for exc_data in [
{ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"},
{ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"},
{ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]
_exc[exc_data[ORTH]] = [exc_data]
for orth in [

View File

@ -26,7 +26,7 @@ for exc_data in [
{ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"},
{ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"},
{ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]
_exc[exc_data[ORTH]] = [exc_data]
# Times

View File

@ -73,7 +73,7 @@ for exc_data in [
{ORTH: "ts.", LEMMA: "toisin sanoen"},
{ORTH: "vm.", LEMMA: "viimeksi mainittu"},
{ORTH: "srk.", LEMMA: "seurakunta"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]
_exc[exc_data[ORTH]] = [exc_data]
TOKENIZER_EXCEPTIONS = _exc

View File

@ -54,7 +54,7 @@ for exc_data in [
{LEMMA: "degrés", ORTH: ""},
{LEMMA: "saint", ORTH: "St."},
{LEMMA: "sainte", ORTH: "Ste."}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]
_exc[exc_data[ORTH]] = [exc_data]
for orth in FR_BASE_EXCEPTIONS + ["etc."]:

spacy/lang/hr/__init__.py (new file)
View File

@ -0,0 +1,27 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class CroatianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'hr'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = STOP_WORDS
class Croatian(Language):
lang = 'hr'
Defaults = CroatianDefaults
__all__ = ['Croatian']
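
The new Croatian class follows the standard spaCy 2.x language plumbing: copy the shared lex_attr_getters, set LANG, merge the base norm lookups and register the stop words. A hedged usage sketch, assuming spaCy is installed from this commit:

from spacy.lang.hr import Croatian

nlp = Croatian()                        # blank pipeline: tokenizer + lexical attributes
doc = nlp(u"Ovo je rečenica.")
print([t.text for t in doc])            # ['Ovo', 'je', 'rečenica', '.']
print(doc[1].is_stop)                   # 'je' is listed in STOP_WORDS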

spacy/lang/hr/stop_words.py (new file)
View File

@ -0,0 +1,187 @@
# encoding: utf8
from __future__ import unicode_literals
# Source: https://github.com/stopwords-iso/stopwords-hr
STOP_WORDS = set("""
a
ako
ali
bi
bih
bila
bili
bilo
bio
bismo
biste
biti
bumo
da
do
duž
ga
hoće
hoćemo
hoćete
hoćeš
hoću
i
iako
ih
ili
iz
ja
je
jedna
jedne
jedno
jer
jesam
jesi
jesmo
jest
jeste
jesu
jim
joj
još
ju
kada
kako
kao
koja
koje
koji
kojima
koju
kroz
li
me
mene
meni
mi
mimo
moj
moja
moje
mu
na
nad
nakon
nam
nama
nas
naš
naša
naše
našeg
ne
nego
neka
neki
nekog
neku
nema
netko
neće
nećemo
nećete
nećeš
neću
nešto
ni
nije
nikoga
nikoje
nikoju
nisam
nisi
nismo
niste
nisu
njega
njegov
njegova
njegovo
njemu
njezin
njezina
njezino
njih
njihov
njihova
njihovo
njim
njima
njoj
nju
no
o
od
odmah
on
ona
oni
ono
ova
pa
pak
po
pod
pored
prije
s
sa
sam
samo
se
sebe
sebi
si
smo
ste
su
sve
svi
svog
svoj
svoja
svoje
svom
ta
tada
taj
tako
te
tebe
tebi
ti
to
toj
tome
tu
tvoj
tvoja
tvoje
u
uz
vam
vama
vas
vaš
vaša
vaše
već
vi
vrlo
za
zar
će
ćemo
ćete
ćeš
ću
što
""".split())

View File

@ -11,7 +11,7 @@ for exc_data in [
{ORTH: "jan.", LEMMA: "januar"},
{ORTH: "feb.", LEMMA: "februar"},
{ORTH: "jul.", LEMMA: "juli"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]
_exc[exc_data[ORTH]] = [exc_data]
for orth in [

View File

@ -1,7 +1,7 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import ORTH, LEMMA, POS
from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN
_exc = {}
@ -13,7 +13,7 @@ for exc_data in [
{ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
{ORTH: "tj.", LEMMA: "to jest", POS: ADV},
{ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
_exc[exc_data[ORTH]] = [dict(exc_data)],
_exc[exc_data[ORTH]] = [exc_data]
for orth in [
"w.", "r."]:

spacy/lang/ro/__init__.py (new file)
View File

@ -0,0 +1,28 @@
# coding: utf8
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class RomanianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'ro'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
class Romanian(Language):
lang = 'ro'
Defaults = RomanianDefaults
__all__ = ['Romanian']

spacy/lang/ro/stop_words.py (new file)
View File

@ -0,0 +1,442 @@
# encoding: utf8
from __future__ import unicode_literals
# Source: https://github.com/stopwords-iso/stopwords-ro
STOP_WORDS = set("""
a
abia
acea
aceasta
această
aceea
aceeasi
acei
aceia
acel
acela
acelasi
acele
acelea
acest
acesta
aceste
acestea
acestei
acestia
acestui
aceşti
aceştia
acolo
acord
acum
adica
ai
aia
aibă
aici
aiurea
al
ala
alaturi
ale
alea
alt
alta
altceva
altcineva
alte
altfel
alti
altii
altul
am
anume
apoi
ar
are
as
asa
asemenea
asta
astazi
astea
astfel
astăzi
asupra
atare
atat
atata
atatea
atatia
ati
atit
atita
atitea
atitia
atunci
au
avea
avem
aveţi
avut
azi
aşadar
aţi
b
ba
bine
bucur
bună
c
ca
cam
cand
capat
care
careia
carora
caruia
cat
catre
caut
ce
cea
ceea
cei
ceilalti
cel
cele
celor
ceva
chiar
ci
cinci
cind
cine
cineva
cit
cita
cite
citeva
citi
citiva
conform
contra
cu
cui
cum
cumva
curând
curînd
când
cât
câte
câtva
câţi
cînd
cît
cîte
cîtva
cîţi
căci
cărei
căror
cărui
către
d
da
daca
dacă
dar
dat
datorită
dată
dau
de
deasupra
deci
decit
degraba
deja
deoarece
departe
desi
despre
deşi
din
dinaintea
dintr
dintr-
dintre
doar
doi
doilea
două
drept
dupa
după
e
ea
ei
el
ele
era
eram
este
eu
exact
eşti
f
face
fara
fata
fel
fi
fie
fiecare
fii
fim
fiu
fiţi
foarte
fost
frumos
fără
g
geaba
graţie
h
halbă
i
ia
iar
ieri
ii
il
imi
in
inainte
inapoi
inca
incit
insa
intr
intre
isi
iti
j
k
l
la
le
li
lor
lui
lângă
lîngă
m
ma
mai
mare
mea
mei
mele
mereu
meu
mi
mie
mine
mod
mult
multa
multe
multi
multă
mulţi
mulţumesc
mâine
mîine
n
ne
nevoie
ni
nici
niciodata
nicăieri
nimeni
nimeri
nimic
niste
nişte
noastre
noastră
noi
noroc
nostri
nostru
nou
noua
nouă
noştri
nu
numai
o
opt
or
ori
oricare
orice
oricine
oricum
oricând
oricât
oricînd
oricît
oriunde
p
pai
parca
patra
patru
patrulea
pe
pentru
peste
pic
pina
plus
poate
pot
prea
prima
primul
prin
printr-
putini
puţin
puţina
puţină
până
pînă
r
rog
s
sa
sa-mi
sa-ti
sai
sale
sau
se
si
sint
sintem
spate
spre
sub
sunt
suntem
sunteţi
sus
sută
sînt
sîntem
sînteţi
săi
său
t
ta
tale
te
ti
timp
tine
toata
toate
toată
tocmai
tot
toti
totul
totusi
totuşi
toţi
trei
treia
treilea
tu
tuturor
tăi
tău
u
ul
ului
un
una
unde
undeva
unei
uneia
unele
uneori
unii
unor
unora
unu
unui
unuia
unul
v
va
vi
voastre
voastră
voi
vom
vor
vostru
vouă
voştri
vreme
vreo
vreun
x
z
zece
zero
zi
zice
îi
îl
îmi
împotriva
în
înainte
înaintea
încotro
încât
încît
între
întrucât
întrucît
îţi
ăla
ălea
ăsta
ăstea
ăştia
şapte
şase
şi
ştiu
ţi
ţie
""".split())

View File

@ -0,0 +1,17 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import ORTH
_exc = {}
# Source: https://en.wiktionary.org/wiki/Category:Romanian_abbreviations
for orth in [
"1-a", "1-ul", "10-a", "10-lea", "2-a", "3-a", "3-lea", "6-lea",
"d-voastră", "dvs.", "Rom.", "str."]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc

View File

@ -68,7 +68,7 @@ for exc_data in [
{ORTH: "Sön.", LEMMA: "Söndag"},
{ORTH: "sthlm", LEMMA: "Stockholm"},
{ORTH: "gbg", LEMMA: "Göteborg"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]
_exc[exc_data[ORTH]] = [exc_data]
for orth in [

View File

@ -68,7 +68,7 @@ for exc_data in [
{ORTH: "\\n", POS: SPACE},
{ORTH: "\u2014", POS: PUNCT, LEMMA: "--"},
{ORTH: "\u00a0", POS: SPACE, LEMMA: " "}]:
BASE_EXCEPTIONS[exc_data[ORTH]] = [dict(exc_data)]
BASE_EXCEPTIONS[exc_data[ORTH]] = [exc_data]
for orth in [

spacy/lang/tr/__init__.py (new file)
View File

@ -0,0 +1,28 @@
# coding: utf8
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class TurkishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'tr'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
class Turkish(Language):
lang = 'tr'
Defaults = TurkishDefaults
__all__ = ['Turkish']

spacy/lang/tr/stop_words.py (new file)
View File

@ -0,0 +1,512 @@
# encoding: utf8
from __future__ import unicode_literals
# Source: https://github.com/stopwords-iso/stopwords-tr
STOP_WORDS = set("""
acaba
acep
adamakıllı
adeta
ait
altmýþ
altmış
altý
altı
ama
amma
anca
ancak
arada
artýk
aslında
aynen
ayrıca
az
ıkça
ıkçası
bana
bari
bazen
bazý
bazı
başkası
baţka
belki
ben
benden
beni
benim
beri
beriki
beþ
beş
beţ
bilcümle
bile
bin
binaen
binaenaleyh
bir
biraz
birazdan
birbiri
birden
birdenbire
biri
birice
birileri
birisi
birkaç
birkaçı
birkez
birlikte
birçok
birçoğu
birþey
birþeyi
birşey
birşeyi
birţey
bitevi
biteviye
bittabi
biz
bizatihi
bizce
bizcileyin
bizden
bize
bizi
bizim
bizimki
bizzat
boşuna
bu
buna
bunda
bundan
bunlar
bunları
bunların
bunu
bunun
buracıkta
burada
buradan
burası
böyle
böylece
böylecene
böylelikle
böylemesine
böylesine
büsbütün
bütün
cuk
cümlesi
da
daha
dahi
dahil
dahilen
daima
dair
dayanarak
de
defa
dek
demin
demincek
deminden
denli
derakap
derhal
derken
deđil
değil
değin
diye
diđer
diğer
diğeri
doksan
dokuz
dolayı
dolayısıyla
doğru
dört
edecek
eden
ederek
edilecek
ediliyor
edilmesi
ediyor
elbet
elbette
elli
emme
en
enikonu
epey
epeyce
epeyi
esasen
esnasında
etmesi
etraflı
etraflıca
etti
ettiği
ettiğini
evleviyetle
evvel
evvela
evvelce
evvelden
evvelemirde
evveli
eđer
eğer
fakat
filanca
gah
gayet
gayetle
gayri
gayrı
gelgelelim
gene
gerek
gerçi
geçende
geçenlerde
gibi
gibilerden
gibisinden
gine
göre
gırla
hakeza
halbuki
halen
halihazırda
haliyle
handiyse
hangi
hangisi
hani
hariç
hasebiyle
hasılı
hatta
hele
hem
henüz
hep
hepsi
her
herhangi
herkes
herkesin
hiç
hiçbir
hiçbiri
hoş
hulasaten
iken
iki
ila
ile
ilen
ilgili
ilk
illa
illaki
imdi
indinde
inen
insermi
ise
ister
itibaren
itibariyle
itibarıyla
iyi
iyice
iyicene
için
işte
iţte
kadar
kaffesi
kah
kala
kanýmca
karşın
katrilyon
kaynak
kaçı
kelli
kendi
kendilerine
kendini
kendisi
kendisine
kendisini
kere
kez
keza
kezalik
keşke
keţke
ki
kim
kimden
kime
kimi
kimisi
kimse
kimsecik
kimsecikler
külliyen
kýrk
kýsaca
kırk
kısaca
lakin
leh
lütfen
maada
madem
mademki
mamafih
mebni
međer
meğer
meğerki
meğerse
milyar
milyon
mu
mı
nasýl
nasıl
nasılsa
nazaran
naşi
ne
neden
nedeniyle
nedenle
nedense
nerde
nerden
nerdeyse
nere
nerede
nereden
neredeyse
neresi
nereye
netekim
neye
neyi
neyse
nice
nihayet
nihayetinde
nitekim
niye
niçin
o
olan
olarak
oldu
olduklarını
oldukça
olduğu
olduğunu
olmadı
olmadığı
olmak
olması
olmayan
olmaz
olsa
olsun
olup
olur
olursa
oluyor
on
ona
onca
onculayın
onda
ondan
onlar
onlardan
onlari
onlarýn
onları
onların
onu
onun
oracık
oracıkta
orada
oradan
oranca
oranla
oraya
otuz
oysa
oysaki
pek
pekala
peki
pekçe
peyderpey
rağmen
sadece
sahi
sahiden
sana
sanki
sekiz
seksen
sen
senden
seni
senin
siz
sizden
sizi
sizin
sonra
sonradan
sonraları
sonunda
tabii
tam
tamam
tamamen
tamamıyla
tarafından
tek
trilyon
tüm
var
vardı
vasıtasıyla
ve
velev
velhasıl
velhasılıkelam
veya
veyahut
ya
yahut
yakinen
yakında
yakından
yakınlarda
yalnız
yalnızca
yani
yapacak
yapmak
yaptı
yaptıkları
yaptığı
yaptığını
yapılan
yapılması
yapıyor
yedi
yeniden
yenilerde
yerine
yetmiþ
yetmiş
yetmiţ
yine
yirmi
yok
yoksa
yoluyla
yüz
yüzünden
zarfında
zaten
zati
zira
çabuk
çabukça
çeşitli
çok
çokları
çoklarınca
çokluk
çoklukla
çokça
çoğu
çoğun
çoğunca
çoğunlukla
çünkü
öbür
öbürkü
öbürü
önce
önceden
önceleri
öncelikle
öteki
ötekisi
öyle
öylece
öylelikle
öylemesine
öz
üzere
üç
þey
þeyden
þeyi
þeyler
þu
þuna
þunda
þundan
þunu
şayet
şey
şeyden
şeyi
şeyler
şu
şuna
şuncacık
şunda
şundan
şunlar
şunları
şunu
şunun
şura
şuracık
şuracıkta
şurası
şöyle
ţayet
ţimdi
ţu
ţöyle
""".split())

View File

@ -0,0 +1,27 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import ORTH, NORM
# These exceptions are mostly for example purposes hoping that Turkish
# speakers can contribute in the future! Source of copy-pasted examples:
# https://en.wiktionary.org/wiki/Category:Turkish_language
_exc = {
"sağol": [
{ORTH: "sağ"},
{ORTH: "ol", NORM: "olun"}]
}
for exc_data in [
{ORTH: "A.B.D.", NORM: "Amerika Birleşik Devletleri"}]:
_exc[exc_data[ORTH]] = [exc_data]
for orth in ["Dr."]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc
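
The "sağol" entry above is a multi-token exception: the ORTH values must concatenate back to the original string, and the second token carries a NORM override. A hedged sketch of the intended behaviour with a blank Turkish pipeline built from this commit:

from spacy.lang.tr import Turkish

nlp = Turkish()
doc = nlp(u"sağol")
print([t.text for t in doc])            # expected: ['sağ', 'ol']
print([t.norm_ for t in doc])           # the second token's norm should read 'olun'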

View File

@ -18,7 +18,7 @@ _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'ga', 'he', 'hu', 'id',
'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx']
_models = {'en': ['en_core_web_sm'],
'de': ['de_core_news_md'],
'fr': ['fr_depvec_web_lg'],
'fr': ['fr_core_news_sm'],
'xx': ['xx_ent_web_md']}

View File

@ -6,6 +6,7 @@ from .. import util
from ..displacy import parse_deps, parse_ents
from ..tokens import Span
from .util import get_doc
from .._ml import PrecomputableAffine
from pathlib import Path
import pytest
@ -59,3 +60,19 @@ def test_displacy_parse_deps(en_vocab):
assert deps['arcs'] == [{'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'},
{'start': 2, 'end': 3, 'label': 'det', 'dir': 'left'},
{'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}]
def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP)
assert model.W.shape == (nF, nO, nP, nI)
tensor = model.ops.allocate((10, nI))
Y, get_dX = model.begin_update(tensor)
assert Y.shape == (tensor.shape[0]+1, nF, nO, nP)
assert model.d_pad.shape == (1, nF, nO, nP)
dY = model.ops.allocate((15, nF, nO, nP))
ids = model.ops.allocate((15, nF))
ids[1,2] = -1
dY[1,2] = 1
assert model.d_pad[0, 2, 0, 0] == 0.
model._backprop_padding(dY, ids)
assert model.d_pad[0, 2, 0, 0] == 1.

View File

@ -40,6 +40,8 @@ for id in CURRENT_MODELS
each label in ["Pipeline", "Vectors", "Sources", "Author", "License"]
- var field = label.toLowerCase()
if field == "vectors"
- field = "vecs"
+row
+cell.u-nowrap
+label=label

View File

@ -13,7 +13,6 @@ script(src="/assets/js/vendor/prism.min.js")
if SECTION == "models"
script(src="/assets/js/vendor/chart.min.js")
script(src="/assets/js/models.js?v#{V_JS}" type="module")
script
if quickstart
@ -24,15 +23,15 @@ script
| (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date;
| ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview');
if IS_PAGE
script
if IS_PAGE
| ((window.gitter = {}).chat = {}).options = {
| useStyles: false,
| activationElement: '.js-gitter-button',
| targetElement: '.js-gitter',
| room: '!{SOCIAL.gitter}'
| };
if IS_PAGE
script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer)
@ -48,10 +47,23 @@ if IS_PAGE
- ModelLoader = "new ModelLoader('" + MODELS_REPO + "'," + JSON.stringify(CURRENT_MODELS) + "," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + ");"
- ModelComparer = "new ModelComparer('" + MODELS_REPO + "'," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + "," + JSON.stringify(LANGUAGES) + "," + JSON.stringify(MODEL_META) + "," + JSON.stringify(default_models || false) + ");"
//- Browsers with JS module support.
Will be ignored otherwise.
script(type="module")
if environment == "deploy"
//- DEPLOY: use compiled rollup.js and instantiate classes directly
script(src="/assets/js/rollup.js")
script
!=ProgressBar
if changelog
!=Changelog
if IS_PAGE
!=NavHighlighter
!=GitHubEmbed
if HAS_MODELS
!=ModeLoader
if compare_models
!=ModelComparer
else
//- DEVELOPMENT: Use ES6 modules
script(type="module")
| import ProgressBar from '/assets/js/progress.js';
!=ProgressBar
if changelog
@ -68,19 +80,3 @@ script(type="module")
if compare_models
| import { ModelComparer } from '/assets/js/models.js';
!=ModelComparer
//- Browsers with no JS module support.
Won't be fetched or interpreted otherwise.
script(nomodule src="/assets/js/rollup.js")
script(nomodule)
!=ProgressBar
if changelog
!=Changelog
if IS_PAGE
!=NavHighlighter
!=GitHubEmbed
if HAS_MODELS
!=ModeLoader
if compare_models
!=ModelComparer

View File

@ -12,7 +12,6 @@ body
animation: fadeIn 0.25s ease
background: $color-back
color: $color-front
//scroll-behavior: smooth
//- Paragraphs

View File

@ -20,21 +20,33 @@ const CHART_FONTS = {
* @property {function} vectors - Format vector data (entries and dimensions).
* @property {function} version - Format model version number.
*/
export const formats = {
const formats = {
author: (author, url) => url ? `<a href="${url}" target="_blank">${author}</a>` : author,
license: (license, url) => url ? `<a href="${url}" target="_blank">${license}</a>` : license,
sources: sources => (sources instanceof Array) ? sources.join(', ') : sources,
pipeline: pipes => (pipes && pipes.length) ? pipes.map(p => `<code>${p}</code>`).join(', ') : '-',
vectors: vec => vec ? `${abbrNumber(vec.keys)} keys, ${abbrNumber(vec.vectors)} unique vectors (${vec.width} dimensions)` : 'n/a',
vectors: vec => formatVectors(vec),
version: version => `<code>v${version}</code>`
};
/**
* Format word vectors data depending on contents.
* @property {Object} data - The vectors object from the model's meta.json.
*/
const formatVectors = data => {
if (!data) return 'n/a';
if (Object.values(data).every(n => n == 0)) return 'context vectors only';
const { keys, vectors: vecs, width } = data;
return `${abbrNumber(keys)} keys, ${abbrNumber(vecs)} unique vectors (${width} dimensions)`;
}
/**
* Find the latest version of a model in a compatibility table.
* @param {string} model - The model name.
* @param {Object} compat - Compatibility table, keyed by spaCy version.
*/
export const getLatestVersion = (model, compat = {}) => {
const getLatestVersion = (model, compat = {}) => {
for (let [spacy_v, models] of Object.entries(compat)) {
if (models[model]) return models[model][0];
}
@ -90,7 +102,7 @@ export class ModelLoader {
const tpl = new Templater(modelId);
tpl.get('table').removeAttribute('data-loading');
tpl.get('error').style.display = 'block';
for (let key of ['sources', 'pipeline', 'vectors', 'author', 'license']) {
for (let key of ['sources', 'pipeline', 'vecs', 'author', 'license']) {
tpl.get(key).parentElement.parentElement.style.display = 'none';
}
}
@ -120,8 +132,8 @@ export class ModelLoader {
if (author) tpl.fill('author', formats.author(author, url), true);
if (license) tpl.fill('license', formats.license(license, this.licenses[license]), true);
if (sources) tpl.fill('sources', formats.sources(sources));
if (vectors) tpl.fill('vectors', formats.vectors(vectors));
else tpl.get('vectors').parentElement.parentElement.style.display = 'none';
if (vectors) tpl.fill('vecs', formats.vectors(vectors));
else tpl.get('vecs').parentElement.parentElement.style.display = 'none';
if (pipeline && pipeline.length) tpl.fill('pipeline', formats.pipeline(pipeline), true);
else tpl.get('pipeline').parentElement.parentElement.style.display = 'none';
}
@ -186,6 +198,7 @@ export class ModelComparer {
this.fonts = CHART_FONTS;
this.defaultModels = defaultModels;
this.tpl.get('result').style.display = 'block';
this.tpl.get('error').style.display = 'none';
this.fetchCompat()
.then(compat => this.init(compat))
.catch(this.showError.bind(this))
@ -223,8 +236,9 @@ export class ModelComparer {
const version = getLatestVersion(name, this.compat);
const modelName = `${name}-${version}`;
return new Promise((resolve, reject) => {
if (!version) reject();
// resolve immediately if model already loaded, e.g. in this.models
if (this.models[name]) resolve(this.models[name]);
else if (this.models[name]) resolve(this.models[name]);
else fetch(`${this.url}/meta/${modelName}.json`)
.then(res => handleResponse(res))
.then(json => json.ok ? resolve(this.saveModel(name, json)) : reject())
@ -306,12 +320,13 @@ export class ModelComparer {
this.tpl.fill(`size${i}`, size);
this.tpl.fill(`desc${i}`, description || 'n/a');
this.tpl.fill(`pipeline${i}`, formats.pipeline(pipeline), true);
this.tpl.fill(`vectors${i}`, formats.vectors(vectors));
this.tpl.fill(`vecs${i}`, formats.vectors(vectors));
this.tpl.fill(`sources${i}`, formats.sources(sources));
this.tpl.fill(`author${i}`, formats.author(author, url), true);
this.tpl.fill(`license${i}`, formats.license(license, this.licenses[license]), true);
// check if model accuracy or speed includes one of the pre-set keys
for (let key of [...metaKeys, ...Object.keys(this.benchKeys.speed)]) {
const allKeys = [].concat(...Object.entries(this.benchKeys).map(([_, v]) => Object.keys(v)));
for (let key of allKeys) {
if (accuracy[key]) this.tpl.fill(`${key}${i}`, accuracy[key].toFixed(2))
else if (speed[key]) this.tpl.fill(`${key}${i}`, convertNumber(Math.round(speed[key])))
else this.tpl.fill(`${key}${i}`, 'n/a')

View File

@ -59,11 +59,12 @@ export const convertNumber = (num = 0, separator = ',') =>
* @param {number|string} num - The number to convert.
* @param {number} fixed - Number of decimals.
*/
export const abbrNumber = (num = 0, fixed = 2) => {
export const abbrNumber = (num = 0, fixed = 1) => {
const suffixes = ['', 'k', 'm', 'b', 't'];
if (num === null || num === 0) return 0;
const b = num.toPrecision(2).split('e');
const k = (b.length === 1) ? 0 : Math.floor(Math.min(b[1].slice(1), 14) / 3);
const c = (k < 1) ? num.toFixed(fixed) : (num / Math.pow(10, k * 3)).toFixed(fixed + 1);
const n = (k < 1) ? num : num / Math.pow(10, k * 3);
const c = (k >= 1 && n >= 100 ) ? Math.round(n) : n.toFixed(fixed);
return (c < 0 ? c : Math.abs(c)) + suffixes[k];
}

View File

@ -12,6 +12,7 @@
"Portuguese": "pt",
"French": "fr",
"Italian": "it",
"Dutch": "nl",
"Multi-Language": "xx"
}
},
@ -40,11 +41,9 @@
"MODELS": {
"en": ["en_core_web_sm", "en_core_web_lg", "en_vectors_web_lg"],
"de": ["de_dep_news_sm"],
"es": ["es_core_web_sm"],
"pt": [],
"fr": [],
"it": [],
"de": ["de_core_news_sm"],
"es": ["es_core_news_sm", "es_core_news_md"],
"it": ["it_core_news_sm"],
"xx": ["xx_ent_wiki_sm"]
},
@ -66,6 +65,7 @@
"gpu": "words per second on GPU",
"pipeline": "Processing pipeline components in order",
"sources": "Sources of training data",
"vecs": "Word vectors included in the model. Models that only support context vectors compute similarity via the tensors shared with the pipeline.",
"benchmark_parser": "Parser accuracy",
"benchmark_ner": "NER accuracy",
"benchmark_speed": "Speed"
@ -74,9 +74,11 @@
"MODEL_LICENSES": {
"CC BY-SA": "https://creativecommons.org/licenses/by-sa/3.0/",
"CC BY-SA 3.0": "https://creativecommons.org/licenses/by-sa/3.0/",
"CC BY-SA 4.0": "https://creativecommons.org/licenses/by-sa/4.0/",
"CC BY-NC": "https://creativecommons.org/licenses/by-nc/3.0/",
"CC BY-NC 3.0": "https://creativecommons.org/licenses/by-nc/3.0/",
"GPL": "http://www.gnu.de/documents/gpl.en.html"
"GPL": "https://www.gnu.org/licenses/gpl.html",
"LGPL": "https://www.gnu.org/licenses/lgpl.html"
},
"MODEL_BENCHMARKS": {
@ -99,6 +101,9 @@
"da": "Danish",
"hu": "Hungarian",
"pl": "Polish",
"ro": "Romanian",
"hr": "Croatian",
"tr": "Turkish",
"he": "Hebrew",
"ga": "Irish",
"bn": "Bengali",

View File

@ -53,6 +53,8 @@ div(data-tpl=TPL data-tpl-key="result" style="display: none")
for label in ["Version", "Size", "Pipeline", "Vectors", "Sources", "Author", "License"]
- var field = label.toLowerCase()
if field == "vectors"
- field = "vecs"
+row
+cell.u-nowrap
+label=label

website/models/nl.jade (new file)
View File

@ -0,0 +1,6 @@
//- 💫 DOCS > MODELS > NL
include ../_includes/_mixins
//- This is a placeholder. The page is rendered via the template at
//- /_includes/_page-model.jade.

View File

@ -9,7 +9,8 @@
"babel-cli": "^6.14.0",
"harp": "^0.24.0",
"rollup": "^0.50.0",
"uglify-js": "^2.7.3"
"uglify-js": "^2.7.3",
"broken-link-checker": "^0.7.6"
},
"dependencies": {},
"scripts": {

View File

@ -218,7 +218,7 @@ p
| If an exception consists of more than one token, the #[code ORTH] values
| combined always need to #[strong match the original string]. The way the
| original string is split up can be pretty arbitrary sometimes for
| example "gonna" is split into "gon" (lemma "go") nad "na" (lemma "to").
| example "gonna" is split into "gon" (lemma "go") and "na" (lemma "to").
| Because of how the tokenizer works, it's currently not possible to split
| single-letter strings into multiple tokens.

View File

@ -4,9 +4,9 @@ p
| Similarity is determined by comparing #[strong word vectors] or "word
| embeddings", multi-dimensional meaning representations of a word. Word
| vectors can be generated using an algorithm like
| #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. Most of spaCy's
| #[+a("/models") default models] come with
| #[strong 300-dimensional vectors] that look like this:
| #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. spaCy's medium
| #[code md] and large #[code lg] #[+a("/models") models] come with
| #[strong multi-dimensional vectors] that look like this:
+code("banana.vector", false, false, 250).
array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01,

View File

@ -4,12 +4,9 @@
| Dense, real valued vectors representing distributional similarity
| information are now a cornerstone of practical NLP. The most common way
| to train these vectors is the #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]
| family of algorithms. The default
| #[+a("/models/en") English model] installs
| 300-dimensional vectors trained on the
| #[+a("http://commoncrawl.org") Common Crawl] corpus.
| If you need to train a word2vec model, we recommend the implementation in
| the Python library #[+a("https://radimrehurek.com/gensim/") Gensim].
| family of algorithms. If you need to train a word2vec model, we recommend
| the implementation in the Python library
| #[+a("https://radimrehurek.com/gensim/") Gensim].
include ../_spacy-101/_similarity
include ../_spacy-101/_word-vectors
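
The revised prose points users to the md and lg models for word vectors. A hedged sketch of the behaviour it describes, assuming a vectors-bearing package such as en_core_web_lg is installed:

import spacy

nlp = spacy.load('en_core_web_lg')
doc = nlp(u"banana apple")
print(doc[0].vector.shape)              # e.g. (300,) for the large English model
print(doc[0].similarity(doc[1]))        # similarity computed from the word vectors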