Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2017-11-03 13:29:56 +01:00
commit 33bd2428db
36 changed files with 1409 additions and 97 deletions

View File

@ -87,7 +87,7 @@ U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statement below. Please do NOT
mark both statements:
* [ ] I am signing on behalf of myself as an individual and no other person
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
@ -98,9 +98,9 @@ mark both statements:
| Field | Entry |
|------------------------------- | -------------------- |
| Name | |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | |
| GitHub username | |
| Website (optional) | |
| Name | Abhinav Sharma |
| Company name (if applicable) | Fourtek I.T. Solutions Pvt. Ltd. |
| Title or role (if applicable) | Machine Learning Engineer |
| Date | 3 November 2017 |
| GitHub username | abhi18av |
| Website (optional) | https://abhi18av.github.io/ |

View File

@ -150,10 +150,10 @@ class PrecomputableAffine(Model):
def _backprop_padding(self, dY, ids):
# (1, nF, nO, nP) += (nN, nF, nO, nP) where IDs (nN, nF) < 0
for i in range(ids.shape[0]):
for j in range(ids.shape[1]):
if ids[i,j] < 0:
self.d_pad[0,j] += dY[i, j]
mask = ids < 0.
mask = mask.sum(axis=1)
d_pad = dY * mask.reshape((ids.shape[0], 1, 1))
self.d_pad += d_pad.sum(axis=0)
return dY, ids
@staticmethod
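
For context, the rewritten _backprop_padding replaces the per-element Python loop with array operations. Below is a minimal NumPy sketch, with hypothetical sizes and plain numpy rather than the thinc ops API, of the accumulation the loop performs: the padding gradient collects only those rows of dY whose feature ID is negative.

import numpy as np

# Hypothetical sizes: nN examples, nF features, nO outputs, nP pieces.
nN, nF, nO, nP = 6, 3, 4, 2
dY = np.random.rand(nN, nF, nO, nP)
ids = np.random.randint(-1, 5, size=(nN, nF))

# Loop form, as in the removed code.
d_pad_loop = np.zeros((1, nF, nO, nP))
for i in range(nN):
    for j in range(nF):
        if ids[i, j] < 0:
            d_pad_loop[0, j] += dY[i, j]

# One vectorised equivalent: broadcast a boolean mask over the trailing dims.
mask = (ids < 0)[:, :, None, None]                  # (nN, nF, 1, 1)
d_pad_vec = (dY * mask).sum(axis=0, keepdims=True)  # (1, nF, nO, nP)

assert np.allclose(d_pad_loop, d_pad_vec)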

View File

@ -85,6 +85,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
batch_sizes = util.compounding(util.env_opt('batch_from', 1),
util.env_opt('batch_to', 16),
util.env_opt('batch_compound', 1.001))
max_doc_len = util.env_opt('max_doc_len', 5000)
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
n_train_words = corpus.count_train()
@ -108,6 +109,9 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
losses = {}
for batch in minibatch(train_docs, size=batch_sizes):
batch = [(d, g) for (d, g) in batch if len(d) < max_doc_len]
if not batch:
continue
docs, golds = zip(*batch)
nlp.update(docs, golds, sgd=optimizer,
drop=next(dropout_rates), losses=losses)
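
For context, the new max_doc_len option drops overlong documents from each minibatch before the update, and the continue guard is needed because a batch can end up empty after filtering. A condensed sketch of the loop, assuming nlp, optimizer and train_docs (yielding (doc, gold) pairs) are set up as elsewhere in this CLI module:

batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                               util.env_opt('batch_to', 16),
                               util.env_opt('batch_compound', 1.001))
max_doc_len = util.env_opt('max_doc_len', 5000)

for batch in minibatch(train_docs, size=batch_sizes):
    # Skip (doc, gold) pairs whose Doc is too long to train on comfortably.
    batch = [(d, g) for (d, g) in batch if len(d) < max_doc_len]
    if not batch:          # every document in this batch was filtered out
        continue
    docs, golds = zip(*batch)
    nlp.update(docs, golds, sgd=optimizer, losses=losses)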

View File

@ -20,7 +20,7 @@ for exc_data in [
{ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"},
{ORTH: "সে.মি", LEMMA: "সেন্টিমিটার"},
{ORTH: "মি.লি.", LEMMA: "মিলিলিটার"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]
_exc[exc_data[ORTH]] = [exc_data]
TOKENIZER_EXCEPTIONS = _exc
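
The recurring change in this and the following exception files drops the redundant dict(exc_data) copy: each surface form maps to a list of per-token attribute dicts, so the dict can be stored as-is. A minimal sketch of the pattern (spaCy 2.x symbols; the multi-token entry is a hypothetical illustration of why the value is a list):

from spacy.symbols import ORTH, LEMMA

_exc = {}

# Single-token exception: one attribute dict per surface form.
for exc_data in [{ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"}]:
    _exc[exc_data[ORTH]] = [exc_data]        # no dict(exc_data) copy needed

# The value is a list because an exception may split into several tokens,
# e.g. English "gonna" -> "gon" (lemma "go") + "na" (lemma "to").
_exc["gonna"] = [{ORTH: "gon", LEMMA: "go"},
                 {ORTH: "na", LEMMA: "to"}]

TOKENIZER_EXCEPTIONS = _exc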

View File

@ -8,7 +8,6 @@ _exc = {}
for exc_data in [
{ORTH: "Kbh.", LEMMA: "København", NORM: "København"},
{ORTH: "Jan.", LEMMA: "januar", NORM: "januar"},
{ORTH: "Feb.", LEMMA: "februar", NORM: "februar"},
{ORTH: "Mar.", LEMMA: "marts", NORM: "marts"},
@ -21,7 +20,7 @@ for exc_data in [
{ORTH: "Okt.", LEMMA: "oktober", NORM: "oktober"},
{ORTH: "Nov.", LEMMA: "november", NORM: "november"},
{ORTH: "Dec.", LEMMA: "december", NORM: "december"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]
_exc[exc_data[ORTH]] = [exc_data]
for orth in [
"A/S", "beg.", "bl.a.", "ca.", "d.s.s.", "dvs.", "f.eks.", "fr.", "hhv.",

View File

@ -164,7 +164,7 @@ for exc_data in [
{ORTH: "z.b.", LEMMA: "zum Beispiel"},
{ORTH: "zzgl.", LEMMA: "zuzüglich"},
{ORTH: "österr.", LEMMA: "österreichisch", NORM: "österreichisch"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]
_exc[exc_data[ORTH]] = [exc_data]
for orth in [

View File

@ -276,7 +276,7 @@ for exc_data in [
exc_data_apos = dict(exc_data)
exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
for data in [exc_data, exc_data_apos]:
_exc[data[ORTH]] = [dict(data)]
_exc[data[ORTH]] = [data]
# Times
@ -440,7 +440,7 @@ for exc_data in [
{ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"},
{ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"},
{ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]
_exc[exc_data[ORTH]] = [exc_data]
for orth in [

View File

@ -26,7 +26,7 @@ for exc_data in [
{ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"},
{ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"},
{ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]
_exc[exc_data[ORTH]] = [exc_data]
# Times

View File

@ -73,7 +73,7 @@ for exc_data in [
{ORTH: "ts.", LEMMA: "toisin sanoen"},
{ORTH: "vm.", LEMMA: "viimeksi mainittu"},
{ORTH: "srk.", LEMMA: "seurakunta"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]
_exc[exc_data[ORTH]] = [exc_data]
TOKENIZER_EXCEPTIONS = _exc

View File

@ -54,7 +54,7 @@ for exc_data in [
{LEMMA: "degrés", ORTH: ""},
{LEMMA: "saint", ORTH: "St."},
{LEMMA: "sainte", ORTH: "Ste."}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]
_exc[exc_data[ORTH]] = [exc_data]
for orth in FR_BASE_EXCEPTIONS + ["etc."]:

spacy/lang/hr/__init__.py (new file)
View File

@ -0,0 +1,27 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class CroatianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'hr'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = STOP_WORDS
class Croatian(Language):
lang = 'hr'
Defaults = CroatianDefaults
__all__ = ['Croatian']
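
The new Croatian class follows the standard spaCy 2.x language plumbing: copy the shared lex_attr_getters, set LANG, merge the base norm lookups and register the stop words. A hedged usage sketch, assuming spaCy is installed from this commit:

from spacy.lang.hr import Croatian

nlp = Croatian()                        # blank pipeline: tokenizer + lexical attributes
doc = nlp(u"Ovo je rečenica.")
print([t.text for t in doc])            # ['Ovo', 'je', 'rečenica', '.']
print(doc[1].is_stop)                   # 'je' is listed in STOP_WORDS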

spacy/lang/hr/stop_words.py (new file)
View File

@ -0,0 +1,187 @@
# encoding: utf8
from __future__ import unicode_literals
# Source: https://github.com/stopwords-iso/stopwords-hr
STOP_WORDS = set("""
a
ako
ali
bi
bih
bila
bili
bilo
bio
bismo
biste
biti
bumo
da
do
duž
ga
hoće
hoćemo
hoćete
hoćeš
hoću
i
iako
ih
ili
iz
ja
je
jedna
jedne
jedno
jer
jesam
jesi
jesmo
jest
jeste
jesu
jim
joj
još
ju
kada
kako
kao
koja
koje
koji
kojima
koju
kroz
li
me
mene
meni
mi
mimo
moj
moja
moje
mu
na
nad
nakon
nam
nama
nas
naš
naša
naše
našeg
ne
nego
neka
neki
nekog
neku
nema
netko
neće
nećemo
nećete
nećeš
neću
nešto
ni
nije
nikoga
nikoje
nikoju
nisam
nisi
nismo
niste
nisu
njega
njegov
njegova
njegovo
njemu
njezin
njezina
njezino
njih
njihov
njihova
njihovo
njim
njima
njoj
nju
no
o
od
odmah
on
ona
oni
ono
ova
pa
pak
po
pod
pored
prije
s
sa
sam
samo
se
sebe
sebi
si
smo
ste
su
sve
svi
svog
svoj
svoja
svoje
svom
ta
tada
taj
tako
te
tebe
tebi
ti
to
toj
tome
tu
tvoj
tvoja
tvoje
u
uz
vam
vama
vas
vaš
vaša
vaše
već
vi
vrlo
za
zar
će
ćemo
ćete
ćeš
ću
što
""".split())

View File

@ -11,7 +11,7 @@ for exc_data in [
{ORTH: "jan.", LEMMA: "januar"},
{ORTH: "feb.", LEMMA: "februar"},
{ORTH: "jul.", LEMMA: "juli"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]
_exc[exc_data[ORTH]] = [exc_data]
for orth in [

View File

@ -1,7 +1,7 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import ORTH, LEMMA, POS
from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN
_exc = {}
@ -13,7 +13,7 @@ for exc_data in [
{ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
{ORTH: "tj.", LEMMA: "to jest", POS: ADV},
{ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
_exc[exc_data[ORTH]] = [dict(exc_data)],
_exc[exc_data[ORTH]] = [exc_data]
for orth in [
"w.", "r."]:

spacy/lang/ro/__init__.py (new file)
View File

@ -0,0 +1,28 @@
# coding: utf8
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class RomanianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'ro'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
class Romanian(Language):
lang = 'ro'
Defaults = RomanianDefaults
__all__ = ['Romanian']

spacy/lang/ro/stop_words.py (new file)
View File

@ -0,0 +1,442 @@
# encoding: utf8
from __future__ import unicode_literals
# Source: https://github.com/stopwords-iso/stopwords-ro
STOP_WORDS = set("""
a
abia
acea
aceasta
această
aceea
aceeasi
acei
aceia
acel
acela
acelasi
acele
acelea
acest
acesta
aceste
acestea
acestei
acestia
acestui
aceşti
aceştia
acolo
acord
acum
adica
ai
aia
aibă
aici
aiurea
al
ala
alaturi
ale
alea
alt
alta
altceva
altcineva
alte
altfel
alti
altii
altul
am
anume
apoi
ar
are
as
asa
asemenea
asta
astazi
astea
astfel
astăzi
asupra
atare
atat
atata
atatea
atatia
ati
atit
atita
atitea
atitia
atunci
au
avea
avem
aveţi
avut
azi
aşadar
aţi
b
ba
bine
bucur
bună
c
ca
cam
cand
capat
care
careia
carora
caruia
cat
catre
caut
ce
cea
ceea
cei
ceilalti
cel
cele
celor
ceva
chiar
ci
cinci
cind
cine
cineva
cit
cita
cite
citeva
citi
citiva
conform
contra
cu
cui
cum
cumva
curând
curînd
când
cât
câte
câtva
câţi
cînd
cît
cîte
cîtva
cîţi
căci
cărei
căror
cărui
către
d
da
daca
dacă
dar
dat
datorită
dată
dau
de
deasupra
deci
decit
degraba
deja
deoarece
departe
desi
despre
deşi
din
dinaintea
dintr
dintr-
dintre
doar
doi
doilea
două
drept
dupa
după
e
ea
ei
el
ele
era
eram
este
eu
exact
eşti
f
face
fara
fata
fel
fi
fie
fiecare
fii
fim
fiu
fiţi
foarte
fost
frumos
fără
g
geaba
graţie
h
halbă
i
ia
iar
ieri
ii
il
imi
in
inainte
inapoi
inca
incit
insa
intr
intre
isi
iti
j
k
l
la
le
li
lor
lui
lângă
lîngă
m
ma
mai
mare
mea
mei
mele
mereu
meu
mi
mie
mine
mod
mult
multa
multe
multi
multă
mulţi
mulţumesc
mâine
mîine
n
ne
nevoie
ni
nici
niciodata
nicăieri
nimeni
nimeri
nimic
niste
nişte
noastre
noastră
noi
noroc
nostri
nostru
nou
noua
nouă
noştri
nu
numai
o
opt
or
ori
oricare
orice
oricine
oricum
oricând
oricât
oricînd
oricît
oriunde
p
pai
parca
patra
patru
patrulea
pe
pentru
peste
pic
pina
plus
poate
pot
prea
prima
primul
prin
printr-
putini
puţin
puţina
puţină
până
pînă
r
rog
s
sa
sa-mi
sa-ti
sai
sale
sau
se
si
sint
sintem
spate
spre
sub
sunt
suntem
sunteţi
sus
sută
sînt
sîntem
sînteţi
săi
său
t
ta
tale
te
ti
timp
tine
toata
toate
toată
tocmai
tot
toti
totul
totusi
totuşi
toţi
trei
treia
treilea
tu
tuturor
tăi
tău
u
ul
ului
un
una
unde
undeva
unei
uneia
unele
uneori
unii
unor
unora
unu
unui
unuia
unul
v
va
vi
voastre
voastră
voi
vom
vor
vostru
vouă
voştri
vreme
vreo
vreun
x
z
zece
zero
zi
zice
îi
îl
îmi
împotriva
în
înainte
înaintea
încotro
încât
încît
între
întrucât
întrucît
îţi
ăla
ălea
ăsta
ăstea
ăştia
şapte
şase
şi
ştiu
ţi
ţie
""".split())

View File

@ -0,0 +1,17 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import ORTH
_exc = {}
# Source: https://en.wiktionary.org/wiki/Category:Romanian_abbreviations
for orth in [
"1-a", "1-ul", "10-a", "10-lea", "2-a", "3-a", "3-lea", "6-lea",
"d-voastră", "dvs.", "Rom.", "str."]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc

View File

@ -68,7 +68,7 @@ for exc_data in [
{ORTH: "Sön.", LEMMA: "Söndag"},
{ORTH: "sthlm", LEMMA: "Stockholm"},
{ORTH: "gbg", LEMMA: "Göteborg"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]
_exc[exc_data[ORTH]] = [exc_data]
for orth in [

View File

@ -68,7 +68,7 @@ for exc_data in [
{ORTH: "\\n", POS: SPACE},
{ORTH: "\u2014", POS: PUNCT, LEMMA: "--"},
{ORTH: "\u00a0", POS: SPACE, LEMMA: " "}]:
BASE_EXCEPTIONS[exc_data[ORTH]] = [dict(exc_data)]
BASE_EXCEPTIONS[exc_data[ORTH]] = [exc_data]
for orth in [

spacy/lang/tr/__init__.py (new file)
View File

@ -0,0 +1,28 @@
# coding: utf8
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class TurkishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'tr'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
class Turkish(Language):
lang = 'tr'
Defaults = TurkishDefaults
__all__ = ['Turkish']

spacy/lang/tr/stop_words.py (new file)
View File

@ -0,0 +1,512 @@
# encoding: utf8
from __future__ import unicode_literals
# Source: https://github.com/stopwords-iso/stopwords-tr
STOP_WORDS = set("""
acaba
acep
adamakıllı
adeta
ait
altmýþ
altmış
altý
altı
ama
amma
anca
ancak
arada
artýk
aslında
aynen
ayrıca
az
ıkça
ıkçası
bana
bari
bazen
bazý
bazı
başkası
baţka
belki
ben
benden
beni
benim
beri
beriki
beþ
beş
beţ
bilcümle
bile
bin
binaen
binaenaleyh
bir
biraz
birazdan
birbiri
birden
birdenbire
biri
birice
birileri
birisi
birkaç
birkaçı
birkez
birlikte
birçok
birçoğu
birþey
birþeyi
birşey
birşeyi
birţey
bitevi
biteviye
bittabi
biz
bizatihi
bizce
bizcileyin
bizden
bize
bizi
bizim
bizimki
bizzat
boşuna
bu
buna
bunda
bundan
bunlar
bunları
bunların
bunu
bunun
buracıkta
burada
buradan
burası
böyle
böylece
böylecene
böylelikle
böylemesine
böylesine
büsbütün
bütün
cuk
cümlesi
da
daha
dahi
dahil
dahilen
daima
dair
dayanarak
de
defa
dek
demin
demincek
deminden
denli
derakap
derhal
derken
deđil
değil
değin
diye
diđer
diğer
diğeri
doksan
dokuz
dolayı
dolayısıyla
doğru
dört
edecek
eden
ederek
edilecek
ediliyor
edilmesi
ediyor
elbet
elbette
elli
emme
en
enikonu
epey
epeyce
epeyi
esasen
esnasında
etmesi
etraflı
etraflıca
etti
ettiği
ettiğini
evleviyetle
evvel
evvela
evvelce
evvelden
evvelemirde
evveli
eđer
eğer
fakat
filanca
gah
gayet
gayetle
gayri
gayrı
gelgelelim
gene
gerek
gerçi
geçende
geçenlerde
gibi
gibilerden
gibisinden
gine
göre
gırla
hakeza
halbuki
halen
halihazırda
haliyle
handiyse
hangi
hangisi
hani
hariç
hasebiyle
hasılı
hatta
hele
hem
henüz
hep
hepsi
her
herhangi
herkes
herkesin
hiç
hiçbir
hiçbiri
hoş
hulasaten
iken
iki
ila
ile
ilen
ilgili
ilk
illa
illaki
imdi
indinde
inen
insermi
ise
ister
itibaren
itibariyle
itibarıyla
iyi
iyice
iyicene
için
işte
iţte
kadar
kaffesi
kah
kala
kanýmca
karşın
katrilyon
kaynak
kaçı
kelli
kendi
kendilerine
kendini
kendisi
kendisine
kendisini
kere
kez
keza
kezalik
keşke
keţke
ki
kim
kimden
kime
kimi
kimisi
kimse
kimsecik
kimsecikler
külliyen
kýrk
kýsaca
kırk
kısaca
lakin
leh
lütfen
maada
madem
mademki
mamafih
mebni
međer
meğer
meğerki
meğerse
milyar
milyon
mu
mı
nasýl
nasıl
nasılsa
nazaran
naşi
ne
neden
nedeniyle
nedenle
nedense
nerde
nerden
nerdeyse
nere
nerede
nereden
neredeyse
neresi
nereye
netekim
neye
neyi
neyse
nice
nihayet
nihayetinde
nitekim
niye
niçin
o
olan
olarak
oldu
olduklarını
oldukça
olduğu
olduğunu
olmadı
olmadığı
olmak
olması
olmayan
olmaz
olsa
olsun
olup
olur
olursa
oluyor
on
ona
onca
onculayın
onda
ondan
onlar
onlardan
onlari
onlarýn
onları
onların
onu
onun
oracık
oracıkta
orada
oradan
oranca
oranla
oraya
otuz
oysa
oysaki
pek
pekala
peki
pekçe
peyderpey
rağmen
sadece
sahi
sahiden
sana
sanki
sekiz
seksen
sen
senden
seni
senin
siz
sizden
sizi
sizin
sonra
sonradan
sonraları
sonunda
tabii
tam
tamam
tamamen
tamamıyla
tarafından
tek
trilyon
tüm
var
vardı
vasıtasıyla
ve
velev
velhasıl
velhasılıkelam
veya
veyahut
ya
yahut
yakinen
yakında
yakından
yakınlarda
yalnız
yalnızca
yani
yapacak
yapmak
yaptı
yaptıkları
yaptığı
yaptığını
yapılan
yapılması
yapıyor
yedi
yeniden
yenilerde
yerine
yetmiþ
yetmiş
yetmiţ
yine
yirmi
yok
yoksa
yoluyla
yüz
yüzünden
zarfında
zaten
zati
zira
çabuk
çabukça
çeşitli
çok
çokları
çoklarınca
çokluk
çoklukla
çokça
çoğu
çoğun
çoğunca
çoğunlukla
çünkü
öbür
öbürkü
öbürü
önce
önceden
önceleri
öncelikle
öteki
ötekisi
öyle
öylece
öylelikle
öylemesine
öz
üzere
üç
þey
þeyden
þeyi
þeyler
þu
þuna
þunda
þundan
þunu
şayet
şey
şeyden
şeyi
şeyler
şu
şuna
şuncacık
şunda
şundan
şunlar
şunları
şunu
şunun
şura
şuracık
şuracıkta
şurası
şöyle
ţayet
ţimdi
ţu
ţöyle
""".split())

View File

@ -0,0 +1,27 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import ORTH, NORM
# These exceptions are mostly for example purposes hoping that Turkish
# speakers can contribute in the future! Source of copy-pasted examples:
# https://en.wiktionary.org/wiki/Category:Turkish_language
_exc = {
"sağol": [
{ORTH: "sağ"},
{ORTH: "ol", NORM: "olun"}]
}
for exc_data in [
{ORTH: "A.B.D.", NORM: "Amerika Birleşik Devletleri"}]:
_exc[exc_data[ORTH]] = [exc_data]
for orth in ["Dr."]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc
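
The "sağol" entry above is a multi-token exception: the ORTH values must concatenate back to the original string, and the second token carries a NORM override. A hedged sketch of the intended behaviour with a blank Turkish pipeline built from this commit:

from spacy.lang.tr import Turkish

nlp = Turkish()
doc = nlp(u"sağol")
print([t.text for t in doc])            # expected: ['sağ', 'ol']
print([t.norm_ for t in doc])           # the second token's norm should read 'olun'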

View File

@ -18,7 +18,7 @@ _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'ga', 'he', 'hu', 'id',
'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx']
_models = {'en': ['en_core_web_sm'],
'de': ['de_core_news_md'],
'fr': ['fr_depvec_web_lg'],
'fr': ['fr_core_news_sm'],
'xx': ['xx_ent_web_md']}

View File

@ -6,6 +6,7 @@ from .. import util
from ..displacy import parse_deps, parse_ents
from ..tokens import Span
from .util import get_doc
from .._ml import PrecomputableAffine
from pathlib import Path
import pytest
@ -59,3 +60,19 @@ def test_displacy_parse_deps(en_vocab):
assert deps['arcs'] == [{'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'},
{'start': 2, 'end': 3, 'label': 'det', 'dir': 'left'},
{'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}]
def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP)
assert model.W.shape == (nF, nO, nP, nI)
tensor = model.ops.allocate((10, nI))
Y, get_dX = model.begin_update(tensor)
assert Y.shape == (tensor.shape[0]+1, nF, nO, nP)
assert model.d_pad.shape == (1, nF, nO, nP)
dY = model.ops.allocate((15, nF, nO, nP))
ids = model.ops.allocate((15, nF))
ids[1,2] = -1
dY[1,2] = 1
assert model.d_pad[0, 2, 0, 0] == 0.
model._backprop_padding(dY, ids)
assert model.d_pad[0, 2, 0, 0] == 1.

View File

@ -40,6 +40,8 @@ for id in CURRENT_MODELS
each label in ["Pipeline", "Vectors", "Sources", "Author", "License"]
- var field = label.toLowerCase()
if field == "vectors"
- field = "vecs"
+row
+cell.u-nowrap
+label=label

View File

@ -13,7 +13,6 @@ script(src="/assets/js/vendor/prism.min.js")
if SECTION == "models"
script(src="/assets/js/vendor/chart.min.js")
script(src="/assets/js/models.js?v#{V_JS}" type="module")
script
if quickstart
@ -24,15 +23,15 @@ script
| (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date;
| ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview');
if IS_PAGE
script
if IS_PAGE
| ((window.gitter = {}).chat = {}).options = {
| useStyles: false,
| activationElement: '.js-gitter-button',
| targetElement: '.js-gitter',
| room: '!{SOCIAL.gitter}'
| };
if IS_PAGE
script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer)
@ -48,10 +47,23 @@ if IS_PAGE
- ModelLoader = "new ModelLoader('" + MODELS_REPO + "'," + JSON.stringify(CURRENT_MODELS) + "," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + ");"
- ModelComparer = "new ModelComparer('" + MODELS_REPO + "'," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + "," + JSON.stringify(LANGUAGES) + "," + JSON.stringify(MODEL_META) + "," + JSON.stringify(default_models || false) + ");"
//- Browsers with JS module support.
Will be ignored otherwise.
script(type="module")
if environment == "deploy"
//- DEPLOY: use compiled rollup.js and instantiate classes directly
script(src="/assets/js/rollup.js")
script
!=ProgressBar
if changelog
!=Changelog
if IS_PAGE
!=NavHighlighter
!=GitHubEmbed
if HAS_MODELS
!=ModeLoader
if compare_models
!=ModelComparer
else
//- DEVELOPMENT: Use ES6 modules
script(type="module")
| import ProgressBar from '/assets/js/progress.js';
!=ProgressBar
if changelog
@ -68,19 +80,3 @@ script(type="module")
if compare_models
| import { ModelComparer } from '/assets/js/models.js';
!=ModelComparer
//- Browsers with no JS module support.
Won't be fetched or interpreted otherwise.
script(nomodule src="/assets/js/rollup.js")
script(nomodule)
!=ProgressBar
if changelog
!=Changelog
if IS_PAGE
!=NavHighlighter
!=GitHubEmbed
if HAS_MODELS
!=ModeLoader
if compare_models
!=ModelComparer

View File

@ -12,7 +12,6 @@ body
animation: fadeIn 0.25s ease
background: $color-back
color: $color-front
//scroll-behavior: smooth
//- Paragraphs

View File

@ -20,21 +20,33 @@ const CHART_FONTS = {
* @property {function} vectors - Format vector data (entries and dimensions).
* @property {function} version - Format model version number.
*/
export const formats = {
const formats = {
author: (author, url) => url ? `<a href="${url}" target="_blank">${author}</a>` : author,
license: (license, url) => url ? `<a href="${url}" target="_blank">${license}</a>` : license,
sources: sources => (sources instanceof Array) ? sources.join(', ') : sources,
pipeline: pipes => (pipes && pipes.length) ? pipes.map(p => `<code>${p}</code>`).join(', ') : '-',
vectors: vec => vec ? `${abbrNumber(vec.keys)} keys, ${abbrNumber(vec.vectors)} unique vectors (${vec.width} dimensions)` : 'n/a',
vectors: vec => formatVectors(vec),
version: version => `<code>v${version}</code>`
};
/**
* Format word vectors data depending on contents.
* @property {Object} data - The vectors object from the model's meta.json.
*/
const formatVectors = data => {
if (!data) return 'n/a';
if (Object.values(data).every(n => n == 0)) return 'context vectors only';
const { keys, vectors: vecs, width } = data;
return `${abbrNumber(keys)} keys, ${abbrNumber(vecs)} unique vectors (${width} dimensions)`;
}
/**
* Find the latest version of a model in a compatibility table.
* @param {string} model - The model name.
* @param {Object} compat - Compatibility table, keyed by spaCy version.
*/
export const getLatestVersion = (model, compat = {}) => {
const getLatestVersion = (model, compat = {}) => {
for (let [spacy_v, models] of Object.entries(compat)) {
if (models[model]) return models[model][0];
}
@ -90,7 +102,7 @@ export class ModelLoader {
const tpl = new Templater(modelId);
tpl.get('table').removeAttribute('data-loading');
tpl.get('error').style.display = 'block';
for (let key of ['sources', 'pipeline', 'vectors', 'author', 'license']) {
for (let key of ['sources', 'pipeline', 'vecs', 'author', 'license']) {
tpl.get(key).parentElement.parentElement.style.display = 'none';
}
}
@ -120,8 +132,8 @@ export class ModelLoader {
if (author) tpl.fill('author', formats.author(author, url), true);
if (license) tpl.fill('license', formats.license(license, this.licenses[license]), true);
if (sources) tpl.fill('sources', formats.sources(sources));
if (vectors) tpl.fill('vectors', formats.vectors(vectors));
else tpl.get('vectors').parentElement.parentElement.style.display = 'none';
if (vectors) tpl.fill('vecs', formats.vectors(vectors));
else tpl.get('vecs').parentElement.parentElement.style.display = 'none';
if (pipeline && pipeline.length) tpl.fill('pipeline', formats.pipeline(pipeline), true);
else tpl.get('pipeline').parentElement.parentElement.style.display = 'none';
}
@ -186,6 +198,7 @@ export class ModelComparer {
this.fonts = CHART_FONTS;
this.defaultModels = defaultModels;
this.tpl.get('result').style.display = 'block';
this.tpl.get('error').style.display = 'none';
this.fetchCompat()
.then(compat => this.init(compat))
.catch(this.showError.bind(this))
@ -223,8 +236,9 @@ export class ModelComparer {
const version = getLatestVersion(name, this.compat);
const modelName = `${name}-${version}`;
return new Promise((resolve, reject) => {
if (!version) reject();
// resolve immediately if model already loaded, e.g. in this.models
if (this.models[name]) resolve(this.models[name]);
else if (this.models[name]) resolve(this.models[name]);
else fetch(`${this.url}/meta/${modelName}.json`)
.then(res => handleResponse(res))
.then(json => json.ok ? resolve(this.saveModel(name, json)) : reject())
@ -306,12 +320,13 @@ export class ModelComparer {
this.tpl.fill(`size${i}`, size);
this.tpl.fill(`desc${i}`, description || 'n/a');
this.tpl.fill(`pipeline${i}`, formats.pipeline(pipeline), true);
this.tpl.fill(`vectors${i}`, formats.vectors(vectors));
this.tpl.fill(`vecs${i}`, formats.vectors(vectors));
this.tpl.fill(`sources${i}`, formats.sources(sources));
this.tpl.fill(`author${i}`, formats.author(author, url), true);
this.tpl.fill(`license${i}`, formats.license(license, this.licenses[license]), true);
// check if model accuracy or speed includes one of the pre-set keys
for (let key of [...metaKeys, ...Object.keys(this.benchKeys.speed)]) {
const allKeys = [].concat(...Object.entries(this.benchKeys).map(([_, v]) => Object.keys(v)));
for (let key of allKeys) {
if (accuracy[key]) this.tpl.fill(`${key}${i}`, accuracy[key].toFixed(2))
else if (speed[key]) this.tpl.fill(`${key}${i}`, convertNumber(Math.round(speed[key])))
else this.tpl.fill(`${key}${i}`, 'n/a')

View File

@ -59,11 +59,12 @@ export const convertNumber = (num = 0, separator = ',') =>
* @param {number|string} num - The number to convert.
* @param {number} fixed - Number of decimals.
*/
export const abbrNumber = (num = 0, fixed = 2) => {
export const abbrNumber = (num = 0, fixed = 1) => {
const suffixes = ['', 'k', 'm', 'b', 't'];
if (num === null || num === 0) return 0;
const b = num.toPrecision(2).split('e');
const k = (b.length === 1) ? 0 : Math.floor(Math.min(b[1].slice(1), 14) / 3);
const c = (k < 1) ? num.toFixed(fixed) : (num / Math.pow(10, k * 3)).toFixed(fixed + 1);
const n = (k < 1) ? num : num / Math.pow(10, k * 3);
const c = (k >= 1 && n >= 100 ) ? Math.round(n) : n.toFixed(fixed);
return (c < 0 ? c : Math.abs(c)) + suffixes[k];
}

View File

@ -12,6 +12,7 @@
"Portuguese": "pt",
"French": "fr",
"Italian": "it",
"Dutch": "nl",
"Multi-Language": "xx"
}
},
@ -40,11 +41,9 @@
"MODELS": {
"en": ["en_core_web_sm", "en_core_web_lg", "en_vectors_web_lg"],
"de": ["de_dep_news_sm"],
"es": ["es_core_web_sm"],
"pt": [],
"fr": [],
"it": [],
"de": ["de_core_news_sm"],
"es": ["es_core_news_sm", "es_core_news_md"],
"it": ["it_core_news_sm"],
"xx": ["xx_ent_wiki_sm"]
},
@ -66,6 +65,7 @@
"gpu": "words per second on GPU",
"pipeline": "Processing pipeline components in order",
"sources": "Sources of training data",
"vecs": "Word vectors included in the model. Models that only support context vectors compute similarity via the tensors shared with the pipeline.",
"benchmark_parser": "Parser accuracy",
"benchmark_ner": "NER accuracy",
"benchmark_speed": "Speed"
@ -74,9 +74,11 @@
"MODEL_LICENSES": {
"CC BY-SA": "https://creativecommons.org/licenses/by-sa/3.0/",
"CC BY-SA 3.0": "https://creativecommons.org/licenses/by-sa/3.0/",
"CC BY-SA 4.0": "https://creativecommons.org/licenses/by-sa/4.0/",
"CC BY-NC": "https://creativecommons.org/licenses/by-nc/3.0/",
"CC BY-NC 3.0": "https://creativecommons.org/licenses/by-nc/3.0/",
"GPL": "http://www.gnu.de/documents/gpl.en.html"
"GPL": "https://www.gnu.org/licenses/gpl.html",
"LGPL": "https://www.gnu.org/licenses/lgpl.html"
},
"MODEL_BENCHMARKS": {
@ -99,6 +101,9 @@
"da": "Danish",
"hu": "Hungarian",
"pl": "Polish",
"ro": "Romanian",
"hr": "Croatian",
"tr": "Turkish",
"he": "Hebrew",
"ga": "Irish",
"bn": "Bengali",

View File

@ -53,6 +53,8 @@ div(data-tpl=TPL data-tpl-key="result" style="display: none")
for label in ["Version", "Size", "Pipeline", "Vectors", "Sources", "Author", "License"]
- var field = label.toLowerCase()
if field == "vectors"
- field = "vecs"
+row
+cell.u-nowrap
+label=label

website/models/nl.jade (new file)
View File

@ -0,0 +1,6 @@
//- 💫 DOCS > MODELS > NL
include ../_includes/_mixins
//- This is a placeholder. The page is rendered via the template at
//- /_includes/_page-model.jade.

View File

@ -9,7 +9,8 @@
"babel-cli": "^6.14.0",
"harp": "^0.24.0",
"rollup": "^0.50.0",
"uglify-js": "^2.7.3"
"uglify-js": "^2.7.3",
"broken-link-checker": "^0.7.6"
},
"dependencies": {},
"scripts": {

View File

@ -218,7 +218,7 @@ p
| If an exception consists of more than one token, the #[code ORTH] values
| combined always need to #[strong match the original string]. The way the
| original string is split up can be pretty arbitrary sometimes for
| example "gonna" is split into "gon" (lemma "go") nad "na" (lemma "to").
| example "gonna" is split into "gon" (lemma "go") and "na" (lemma "to").
| Because of how the tokenizer works, it's currently not possible to split
| single-letter strings into multiple tokens.

View File

@ -4,9 +4,9 @@ p
| Similarity is determined by comparing #[strong word vectors] or "word
| embeddings", multi-dimensional meaning representations of a word. Word
| vectors can be generated using an algorithm like
| #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. Most of spaCy's
| #[+a("/models") default models] come with
| #[strong 300-dimensional vectors] that look like this:
| #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. spaCy's medium
| #[code md] and large #[code lg] #[+a("/models") models] come with
| #[strong multi-dimensional vectors] that look like this:
+code("banana.vector", false, false, 250).
array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01,

View File

@ -4,12 +4,9 @@
| Dense, real valued vectors representing distributional similarity
| information are now a cornerstone of practical NLP. The most common way
| to train these vectors is the #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]
| family of algorithms. The default
| #[+a("/models/en") English model] installs
| 300-dimensional vectors trained on the
| #[+a("http://commoncrawl.org") Common Crawl] corpus.
| If you need to train a word2vec model, we recommend the implementation in
| the Python library #[+a("https://radimrehurek.com/gensim/") Gensim].
| family of algorithms. If you need to train a word2vec model, we recommend
| the implementation in the Python library
| #[+a("https://radimrehurek.com/gensim/") Gensim].
include ../_spacy-101/_similarity
include ../_spacy-101/_word-vectors
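
The revised prose points users to the md and lg models for word vectors. A hedged sketch of the behaviour it describes, assuming a vectors-bearing package such as en_core_web_lg is installed:

import spacy

nlp = spacy.load('en_core_web_lg')
doc = nlp(u"banana apple")
print(doc[0].vector.shape)              # e.g. (300,) for the large English model
print(doc[0].similarity(doc[1]))        # similarity computed from the word vectors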