Merge branch 'develop' of https://github.com/explosion/spaCy into develop

2025-07-23 06:29:48 +03:00 · 2017-11-03 13:29:56 +01:00 · 2017-11-03 13:29:56 +01:00 · 33bd2428db
commit 33bd2428db
parent 6681058abd 58bb4bd484
36 changed files with 1409 additions and 97 deletions
--- a/.github/CONTRIBUTOR_AGREEMENT.md
+++ b/.github/CONTRIBUTOR_AGREEMENT.md
@ -87,7 +87,7 @@ U.S. Federal law. Any choice of law rules will not apply.
 7. Please place an “x” on one of the applicable statement below. Please do NOT
 mark both statements:
-    * [ ] I am signing on behalf of myself as an individual and no other person
+    * [x] I am signing on behalf of myself as an individual and no other person
    or entity, including my employer, has or will have rights with respect to my
    contributions.
@ -98,9 +98,9 @@ mark both statements:
 | Field                          | Entry                              |
 |------------------------------- | --------------------               |
-| Name                           |                      |
+| Name                           | Abhinav Sharma                     |
-| Company name (if applicable)   |                      |
+| Company name (if applicable)   | Fourtek I.T. Solutions Pvt. Ltd.   |
-| Title or role (if applicable)  |                      |
+| Title or role (if applicable)  | Machine Learning Engineer          |
-| Date                           |                      |
+| Date                           | 3 November 2017                    |
-| GitHub username                |                      |
+| GitHub username                | abhi18av                           |
-| Website (optional)             |                      |
+| Website (optional)             | https://abhi18av.github.io/        |
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@ -150,10 +150,10 @@ class PrecomputableAffine(Model):
    def _backprop_padding(self, dY, ids):
        # (1, nF, nO, nP) += (nN, nF, nO, nP) where IDs (nN, nF) < 0
-        for i in range(ids.shape[0]):
+        mask = ids < 0.
-            for j in range(ids.shape[1]):
+        mask = mask.sum(axis=1)
-                if ids[i,j] < 0:
+        d_pad = dY * mask.reshape((ids.shape[0], 1, 1))
-                    self.d_pad[0,j] += dY[i, j]
+        self.d_pad += d_pad.sum(axis=0)
        return dY, ids
    @staticmethod
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -85,6 +85,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
    batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                   util.env_opt('batch_to', 16),
                                   util.env_opt('batch_compound', 1.001))
    max_doc_len = util.env_opt('max_doc_len', 5000)
    corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
    n_train_words = corpus.count_train()
@ -108,6 +109,9 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in minibatch(train_docs, size=batch_sizes):
                    batch = [(d, g) for (d, g) in batch if len(d) < max_doc_len]
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(docs, golds, sgd=optimizer,
                               drop=next(dropout_rates), losses=losses)
--- a/spacy/lang/bn/tokenizer_exceptions.py
+++ b/spacy/lang/bn/tokenizer_exceptions.py
@ -20,7 +20,7 @@ for exc_data in [
    {ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"},
    {ORTH: "সে.মি", LEMMA: "সেন্টিমিটার"},
    {ORTH: "মি.লি.", LEMMA: "মিলিলিটার"}]:
-    _exc[exc_data[ORTH]] = [dict(exc_data)]
+    _exc[exc_data[ORTH]] = [exc_data]
 TOKENIZER_EXCEPTIONS = _exc
--- a/spacy/lang/da/tokenizer_exceptions.py
+++ b/spacy/lang/da/tokenizer_exceptions.py
@ -8,7 +8,6 @@ _exc = {}
 for exc_data in [
    {ORTH: "Kbh.", LEMMA: "København", NORM: "København"},
    {ORTH: "Jan.", LEMMA: "januar", NORM: "januar"},
    {ORTH: "Feb.", LEMMA: "februar", NORM: "februar"},
    {ORTH: "Mar.", LEMMA: "marts", NORM: "marts"},
@ -21,7 +20,7 @@ for exc_data in [
    {ORTH: "Okt.", LEMMA: "oktober", NORM: "oktober"},
    {ORTH: "Nov.", LEMMA: "november", NORM: "november"},
    {ORTH: "Dec.", LEMMA: "december", NORM: "december"}]:
-    _exc[exc_data[ORTH]] = [dict(exc_data)]
+    _exc[exc_data[ORTH]] = [exc_data]
 for orth in [
    "A/S", "beg.", "bl.a.", "ca.", "d.s.s.", "dvs.", "f.eks.", "fr.", "hhv.",
--- a/spacy/lang/de/tokenizer_exceptions.py
+++ b/spacy/lang/de/tokenizer_exceptions.py
@ -164,7 +164,7 @@ for exc_data in [
    {ORTH: "z.b.", LEMMA: "zum Beispiel"},
    {ORTH: "zzgl.", LEMMA: "zuzüglich"},
    {ORTH: "österr.", LEMMA: "österreichisch", NORM: "österreichisch"}]:
-    _exc[exc_data[ORTH]] = [dict(exc_data)]
+    _exc[exc_data[ORTH]] = [exc_data]
 for orth in [
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@ -276,7 +276,7 @@ for exc_data in [
    exc_data_apos = dict(exc_data)
    exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
    for data in [exc_data, exc_data_apos]:
-        _exc[data[ORTH]] = [dict(data)]
+        _exc[data[ORTH]] = [data]
 # Times
@ -440,7 +440,7 @@ for exc_data in [
    {ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"},
    {ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"},
    {ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}]:
-    _exc[exc_data[ORTH]] = [dict(exc_data)]
+    _exc[exc_data[ORTH]] = [exc_data]
 for orth in [
--- a/spacy/lang/es/tokenizer_exceptions.py
+++ b/spacy/lang/es/tokenizer_exceptions.py
@ -26,7 +26,7 @@ for exc_data in [
    {ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"},
    {ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"},
    {ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}]:
-    _exc[exc_data[ORTH]] = [dict(exc_data)]
+    _exc[exc_data[ORTH]] = [exc_data]
 # Times
--- a/spacy/lang/fi/tokenizer_exceptions.py
+++ b/spacy/lang/fi/tokenizer_exceptions.py
@ -73,7 +73,7 @@ for exc_data in [
    {ORTH: "ts.", LEMMA: "toisin sanoen"},
    {ORTH: "vm.", LEMMA: "viimeksi mainittu"},
    {ORTH: "srk.", LEMMA: "seurakunta"}]:
-    _exc[exc_data[ORTH]] = [dict(exc_data)]
+    _exc[exc_data[ORTH]] = [exc_data]
 TOKENIZER_EXCEPTIONS = _exc
--- a/spacy/lang/fr/tokenizer_exceptions.py
+++ b/spacy/lang/fr/tokenizer_exceptions.py
@ -54,7 +54,7 @@ for exc_data in [
    {LEMMA: "degrés", ORTH: "d°"},
    {LEMMA: "saint", ORTH: "St."},
    {LEMMA: "sainte", ORTH: "Ste."}]:
-    _exc[exc_data[ORTH]] = [dict(exc_data)]
+    _exc[exc_data[ORTH]] = [exc_data]
 for orth in FR_BASE_EXCEPTIONS + ["etc."]:
--- a/spacy/lang/hr/init.py
+++ b/spacy/lang/hr/init.py
@ -0,0 +1,27 @@
 # coding: utf8
 from __future__ import unicode_literals
 from .stop_words import STOP_WORDS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups
 class CroatianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'hr'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
    stop_words = STOP_WORDS
 class Croatian(Language):
    lang = 'hr'
    Defaults = CroatianDefaults
 __all__ = ['Croatian']
--- a/spacy/lang/hr/stop_words.py
+++ b/spacy/lang/hr/stop_words.py
@ -0,0 +1,187 @@
 # encoding: utf8
 from __future__ import unicode_literals
 # Source: https://github.com/stopwords-iso/stopwords-hr
 STOP_WORDS = set("""
 a
 ako
 ali
 bi
 bih
 bila
 bili
 bilo
 bio
 bismo
 biste
 biti
 bumo
 da
 do
 duž
 ga
 hoće
 hoćemo
 hoćete
 hoćeš
 hoću
 i
 iako
 ih
 ili
 iz
 ja
 je
 jedna
 jedne
 jedno
 jer
 jesam
 jesi
 jesmo
 jest
 jeste
 jesu
 jim
 joj
 još
 ju
 kada
 kako
 kao
 koja
 koje
 koji
 kojima
 koju
 kroz
 li
 me
 mene
 meni
 mi
 mimo
 moj
 moja
 moje
 mu
 na
 nad
 nakon
 nam
 nama
 nas
 naš
 naša
 naše
 našeg
 ne
 nego
 neka
 neki
 nekog
 neku
 nema
 netko
 neće
 nećemo
 nećete
 nećeš
 neću
 nešto
 ni
 nije
 nikoga
 nikoje
 nikoju
 nisam
 nisi
 nismo
 niste
 nisu
 njega
 njegov
 njegova
 njegovo
 njemu
 njezin
 njezina
 njezino
 njih
 njihov
 njihova
 njihovo
 njim
 njima
 njoj
 nju
 no
 o
 od
 odmah
 on
 ona
 oni
 ono
 ova
 pa
 pak
 po
 pod
 pored
 prije
 s
 sa
 sam
 samo
 se
 sebe
 sebi
 si
 smo
 ste
 su
 sve
 svi
 svog
 svoj
 svoja
 svoje
 svom
 ta
 tada
 taj
 tako
 te
 tebe
 tebi
 ti
 to
 toj
 tome
 tu
 tvoj
 tvoja
 tvoje
 u
 uz
 vam
 vama
 vas
 vaš
 vaša
 vaše
 već
 vi
 vrlo
 za
 zar
 će
 ćemo
 ćete
 ćeš
 ću
 što
 """.split())
--- a/spacy/lang/nb/tokenizer_exceptions.py
+++ b/spacy/lang/nb/tokenizer_exceptions.py
@ -11,7 +11,7 @@ for exc_data in [
    {ORTH: "jan.", LEMMA: "januar"},
    {ORTH: "feb.", LEMMA: "februar"},
    {ORTH: "jul.", LEMMA: "juli"}]:
-    _exc[exc_data[ORTH]] = [dict(exc_data)]
+    _exc[exc_data[ORTH]] = [exc_data]
 for orth in [
--- a/spacy/lang/pl/tokenizer_exceptions.py
+++ b/spacy/lang/pl/tokenizer_exceptions.py
@ -1,7 +1,7 @@
 # encoding: utf8
 from __future__ import unicode_literals
-from ..symbols import ORTH, LEMMA, POS
+from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN
 _exc = {}
@ -13,7 +13,7 @@ for exc_data in [
    {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
    {ORTH: "tj.", LEMMA: "to jest", POS: ADV},
    {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
-    _exc[exc_data[ORTH]] = [dict(exc_data)],
+    _exc[exc_data[ORTH]] = [exc_data]
 for orth in [
    "w.", "r."]:
--- a/spacy/lang/ro/init.py
+++ b/spacy/lang/ro/init.py
@ -0,0 +1,28 @@
 # coding: utf8
 from __future__ import unicode_literals
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups
 class RomanianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'ro'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
 class Romanian(Language):
    lang = 'ro'
    Defaults = RomanianDefaults
 __all__ = ['Romanian']
--- a/spacy/lang/ro/stop_words.py
+++ b/spacy/lang/ro/stop_words.py
@ -0,0 +1,442 @@
 # encoding: utf8
 from __future__ import unicode_literals
 # Source: https://github.com/stopwords-iso/stopwords-ro
 STOP_WORDS = set("""
 a
 abia
 acea
 aceasta
 această
 aceea
 aceeasi
 acei
 aceia
 acel
 acela
 acelasi
 acele
 acelea
 acest
 acesta
 aceste
 acestea
 acestei
 acestia
 acestui
 aceşti
 aceştia
 acolo
 acord
 acum
 adica
 ai
 aia
 aibă
 aici
 aiurea
 al
 ala
 alaturi
 ale
 alea
 alt
 alta
 altceva
 altcineva
 alte
 altfel
 alti
 altii
 altul
 am
 anume
 apoi
 ar
 are
 as
 asa
 asemenea
 asta
 astazi
 astea
 astfel
 astăzi
 asupra
 atare
 atat
 atata
 atatea
 atatia
 ati
 atit
 atita
 atitea
 atitia
 atunci
 au
 avea
 avem
 aveţi
 avut
 azi
 aş
 aşadar
 aţi
 b
 ba
 bine
 bucur
 bună
 c
 ca
 cam
 cand
 capat
 care
 careia
 carora
 caruia
 cat
 catre
 caut
 ce
 cea
 ceea
 cei
 ceilalti
 cel
 cele
 celor
 ceva
 chiar
 ci
 cinci
 cind
 cine
 cineva
 cit
 cita
 cite
 citeva
 citi
 citiva
 conform
 contra
 cu
 cui
 cum
 cumva
 curând
 curînd
 când
 cât
 câte
 câtva
 câţi
 cînd
 cît
 cîte
 cîtva
 cîţi
 că
 căci
 cărei
 căror
 cărui
 către
 d
 da
 daca
 dacă
 dar
 dat
 datorită
 dată
 dau
 de
 deasupra
 deci
 decit
 degraba
 deja
 deoarece
 departe
 desi
 despre
 deşi
 din
 dinaintea
 dintr
 dintr-
 dintre
 doar
 doi
 doilea
 două
 drept
 dupa
 după
 dă
 e
 ea
 ei
 el
 ele
 era
 eram
 este
 eu
 exact
 eşti
 f
 face
 fara
 fata
 fel
 fi
 fie
 fiecare
 fii
 fim
 fiu
 fiţi
 foarte
 fost
 frumos
 fără
 g
 geaba
 graţie
 h
 halbă
 i
 ia
 iar
 ieri
 ii
 il
 imi
 in
 inainte
 inapoi
 inca
 incit
 insa
 intr
 intre
 isi
 iti
 j
 k
 l
 la
 le
 li
 lor
 lui
 lângă
 lîngă
 m
 ma
 mai
 mare
 mea
 mei
 mele
 mereu
 meu
 mi
 mie
 mine
 mod
 mult
 multa
 multe
 multi
 multă
 mulţi
 mulţumesc
 mâine
 mîine
 mă
 n
 ne
 nevoie
 ni
 nici
 niciodata
 nicăieri
 nimeni
 nimeri
 nimic
 niste
 nişte
 noastre
 noastră
 noi
 noroc
 nostri
 nostru
 nou
 noua
 nouă
 noştri
 nu
 numai
 o
 opt
 or
 ori
 oricare
 orice
 oricine
 oricum
 oricând
 oricât
 oricînd
 oricît
 oriunde
 p
 pai
 parca
 patra
 patru
 patrulea
 pe
 pentru
 peste
 pic
 pina
 plus
 poate
 pot
 prea
 prima
 primul
 prin
 printr-
 putini
 puţin
 puţina
 puţină
 până
 pînă
 r
 rog
 s
 sa
 sa-mi
 sa-ti
 sai
 sale
 sau
 se
 si
 sint
 sintem
 spate
 spre
 sub
 sunt
 suntem
 sunteţi
 sus
 sută
 sînt
 sîntem
 sînteţi
 să
 săi
 său
 t
 ta
 tale
 te
 ti
 timp
 tine
 toata
 toate
 toată
 tocmai
 tot
 toti
 totul
 totusi
 totuşi
 toţi
 trei
 treia
 treilea
 tu
 tuturor
 tăi
 tău
 u
 ul
 ului
 un
 una
 unde
 undeva
 unei
 uneia
 unele
 uneori
 unii
 unor
 unora
 unu
 unui
 unuia
 unul
 v
 va
 vi
 voastre
 voastră
 voi
 vom
 vor
 vostru
 vouă
 voştri
 vreme
 vreo
 vreun
 vă
 x
 z
 zece
 zero
 zi
 zice
 îi
 îl
 îmi
 împotriva
 în
 înainte
 înaintea
 încotro
 încât
 încît
 între
 întrucât
 întrucît
 îţi
 ăla
 ălea
 ăsta
 ăstea
 ăştia
 şapte
 şase
 şi
 ştiu
 ţi
 ţie
 """.split())
--- a/spacy/lang/ro/tokenizer_exceptions.py
+++ b/spacy/lang/ro/tokenizer_exceptions.py
@ -0,0 +1,17 @@
 # coding: utf8
 from __future__ import unicode_literals
 from ...symbols import ORTH
 _exc = {}
 # Source: https://en.wiktionary.org/wiki/Category:Romanian_abbreviations
 for orth in [
    "1-a", "1-ul", "10-a", "10-lea", "2-a", "3-a", "3-lea", "6-lea",
    "d-voastră", "dvs.", "Rom.", "str."]:
    _exc[orth] = [{ORTH: orth}]
 TOKENIZER_EXCEPTIONS = _exc
--- a/spacy/lang/sv/tokenizer_exceptions.py
+++ b/spacy/lang/sv/tokenizer_exceptions.py
@ -68,7 +68,7 @@ for exc_data in [
    {ORTH: "Sön.", LEMMA: "Söndag"},
    {ORTH: "sthlm", LEMMA: "Stockholm"},
    {ORTH: "gbg", LEMMA: "Göteborg"}]:
-    _exc[exc_data[ORTH]] = [dict(exc_data)]
+    _exc[exc_data[ORTH]] = [exc_data]
 for orth in [
--- a/spacy/lang/tokenizer_exceptions.py
+++ b/spacy/lang/tokenizer_exceptions.py
@ -68,7 +68,7 @@ for exc_data in [
    {ORTH: "\\n", POS: SPACE},
    {ORTH: "\u2014", POS: PUNCT, LEMMA: "--"},
    {ORTH: "\u00a0", POS: SPACE, LEMMA: "  "}]:
-    BASE_EXCEPTIONS[exc_data[ORTH]] = [dict(exc_data)]
+    BASE_EXCEPTIONS[exc_data[ORTH]] = [exc_data]
 for orth in [
--- a/spacy/lang/tr/init.py
+++ b/spacy/lang/tr/init.py
@ -0,0 +1,28 @@
 # coding: utf8
 from __future__ import unicode_literals
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups
 class TurkishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'tr'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
 class Turkish(Language):
    lang = 'tr'
    Defaults = TurkishDefaults
 __all__ = ['Turkish']
--- a/spacy/lang/tr/stop_words.py
+++ b/spacy/lang/tr/stop_words.py
@ -0,0 +1,512 @@
 # encoding: utf8
 from __future__ import unicode_literals
 # Source: https://github.com/stopwords-iso/stopwords-tr
 STOP_WORDS = set("""
 acaba
 acep
 adamakıllı
 adeta
 ait
 altmýþ
 altmış
 altý
 altı
 ama
 amma
 anca
 ancak
 arada
 artýk
 aslında
 aynen
 ayrıca
 az
 açıkça
 açıkçası
 bana
 bari
 bazen
 bazý
 bazı
 başkası
 baţka
 belki
 ben
 benden
 beni
 benim
 beri
 beriki
 beþ
 beş
 beţ
 bilcümle
 bile
 bin
 binaen
 binaenaleyh
 bir
 biraz
 birazdan
 birbiri
 birden
 birdenbire
 biri
 birice
 birileri
 birisi
 birkaç
 birkaçı
 birkez
 birlikte
 birçok
 birçoğu
 birþey
 birþeyi
 birşey
 birşeyi
 birţey
 bitevi
 biteviye
 bittabi
 biz
 bizatihi
 bizce
 bizcileyin
 bizden
 bize
 bizi
 bizim
 bizimki
 bizzat
 boşuna
 bu
 buna
 bunda
 bundan
 bunlar
 bunları
 bunların
 bunu
 bunun
 buracıkta
 burada
 buradan
 burası
 böyle
 böylece
 böylecene
 böylelikle
 böylemesine
 böylesine
 büsbütün
 bütün
 cuk
 cümlesi
 da
 daha
 dahi
 dahil
 dahilen
 daima
 dair
 dayanarak
 de
 defa
 dek
 demin
 demincek
 deminden
 denli
 derakap
 derhal
 derken
 deđil
 değil
 değin
 diye
 diđer
 diğer
 diğeri
 doksan
 dokuz
 dolayı
 dolayısıyla
 doğru
 dört
 edecek
 eden
 ederek
 edilecek
 ediliyor
 edilmesi
 ediyor
 elbet
 elbette
 elli
 emme
 en
 enikonu
 epey
 epeyce
 epeyi
 esasen
 esnasında
 etmesi
 etraflı
 etraflıca
 etti
 ettiği
 ettiğini
 evleviyetle
 evvel
 evvela
 evvelce
 evvelden
 evvelemirde
 evveli
 eđer
 eğer
 fakat
 filanca
 gah
 gayet
 gayetle
 gayri
 gayrı
 gelgelelim
 gene
 gerek
 gerçi
 geçende
 geçenlerde
 gibi
 gibilerden
 gibisinden
 gine
 göre
 gırla
 hakeza
 halbuki
 halen
 halihazırda
 haliyle
 handiyse
 hangi
 hangisi
 hani
 hariç
 hasebiyle
 hasılı
 hatta
 hele
 hem
 henüz
 hep
 hepsi
 her
 herhangi
 herkes
 herkesin
 hiç
 hiçbir
 hiçbiri
 hoş
 hulasaten
 iken
 iki
 ila
 ile
 ilen
 ilgili
 ilk
 illa
 illaki
 imdi
 indinde
 inen
 insermi
 ise
 ister
 itibaren
 itibariyle
 itibarıyla
 iyi
 iyice
 iyicene
 için
 iş
 işte
 iţte
 kadar
 kaffesi
 kah
 kala
 kanýmca
 karşın
 katrilyon
 kaynak
 kaçı
 kelli
 kendi
 kendilerine
 kendini
 kendisi
 kendisine
 kendisini
 kere
 kez
 keza
 kezalik
 keşke
 keţke
 ki
 kim
 kimden
 kime
 kimi
 kimisi
 kimse
 kimsecik
 kimsecikler
 külliyen
 kýrk
 kýsaca
 kırk
 kısaca
 lakin
 leh
 lütfen
 maada
 madem
 mademki
 mamafih
 mebni
 međer
 meğer
 meğerki
 meğerse
 milyar
 milyon
 mu
 mü
 mý
 mı
 nasýl
 nasıl
 nasılsa
 nazaran
 naşi
 ne
 neden
 nedeniyle
 nedenle
 nedense
 nerde
 nerden
 nerdeyse
 nere
 nerede
 nereden
 neredeyse
 neresi
 nereye
 netekim
 neye
 neyi
 neyse
 nice
 nihayet
 nihayetinde
 nitekim
 niye
 niçin
 o
 olan
 olarak
 oldu
 olduklarını
 oldukça
 olduğu
 olduğunu
 olmadı
 olmadığı
 olmak
 olması
 olmayan
 olmaz
 olsa
 olsun
 olup
 olur
 olursa
 oluyor
 on
 ona
 onca
 onculayın
 onda
 ondan
 onlar
 onlardan
 onlari
 onlarýn
 onları
 onların
 onu
 onun
 oracık
 oracıkta
 orada
 oradan
 oranca
 oranla
 oraya
 otuz
 oysa
 oysaki
 pek
 pekala
 peki
 pekçe
 peyderpey
 rağmen
 sadece
 sahi
 sahiden
 sana
 sanki
 sekiz
 seksen
 sen
 senden
 seni
 senin
 siz
 sizden
 sizi
 sizin
 sonra
 sonradan
 sonraları
 sonunda
 tabii
 tam
 tamam
 tamamen
 tamamıyla
 tarafından
 tek
 trilyon
 tüm
 var
 vardı
 vasıtasıyla
 ve
 velev
 velhasıl
 velhasılıkelam
 veya
 veyahut
 ya
 yahut
 yakinen
 yakında
 yakından
 yakınlarda
 yalnız
 yalnızca
 yani
 yapacak
 yapmak
 yaptı
 yaptıkları
 yaptığı
 yaptığını
 yapılan
 yapılması
 yapıyor
 yedi
 yeniden
 yenilerde
 yerine
 yetmiþ
 yetmiş
 yetmiţ
 yine
 yirmi
 yok
 yoksa
 yoluyla
 yüz
 yüzünden
 zarfında
 zaten
 zati
 zira
 çabuk
 çabukça
 çeşitli
 çok
 çokları
 çoklarınca
 çokluk
 çoklukla
 çokça
 çoğu
 çoğun
 çoğunca
 çoğunlukla
 çünkü
 öbür
 öbürkü
 öbürü
 önce
 önceden
 önceleri
 öncelikle
 öteki
 ötekisi
 öyle
 öylece
 öylelikle
 öylemesine
 öz
 üzere
 üç
 þey
 þeyden
 þeyi
 þeyler
 þu
 þuna
 þunda
 þundan
 þunu
 şayet
 şey
 şeyden
 şeyi
 şeyler
 şu
 şuna
 şuncacık
 şunda
 şundan
 şunlar
 şunları
 şunu
 şunun
 şura
 şuracık
 şuracıkta
 şurası
 şöyle
 ţayet
 ţimdi
 ţu
 ţöyle
 """.split())
--- a/spacy/lang/tr/tokenizer_exceptions.py
+++ b/spacy/lang/tr/tokenizer_exceptions.py
@ -0,0 +1,27 @@
 # coding: utf8
 from __future__ import unicode_literals
 from ...symbols import ORTH, NORM
 # These exceptions are mostly for example purposes – hoping that Turkish
 # speakers can contribute in the future! Source of copy-pasted examples:
 # https://en.wiktionary.org/wiki/Category:Turkish_language
 _exc = {
    "sağol": [
        {ORTH: "sağ"},
        {ORTH: "ol", NORM: "olun"}]
 }
 for exc_data in [
    {ORTH: "A.B.D.", NORM: "Amerika Birleşik Devletleri"}]:
    _exc[exc_data[ORTH]] = [exc_data]
 for orth in ["Dr."]:
    _exc[orth] = [{ORTH: orth}]
 TOKENIZER_EXCEPTIONS = _exc
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@ -18,7 +18,7 @@ _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'ga', 'he', 'hu', 'id',
              'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx']
 _models = {'en': ['en_core_web_sm'],
           'de': ['de_core_news_md'],
-           'fr': ['fr_depvec_web_lg'],
+           'fr': ['fr_core_news_sm'],
           'xx': ['xx_ent_web_md']}
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@ -6,6 +6,7 @@ from .. import util
 from ..displacy import parse_deps, parse_ents
 from ..tokens import Span
 from .util import get_doc
 from .._ml import PrecomputableAffine
 from pathlib import Path
 import pytest
@ -59,3 +60,19 @@ def test_displacy_parse_deps(en_vocab):
    assert deps['arcs'] == [{'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'},
                            {'start': 2, 'end': 3, 'label': 'det', 'dir': 'left'},
                            {'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}]
 def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
    """Check PrecomputableAffine's array shapes and its padding gradient.

    Builds a model with the given sizes, verifies the shapes of ``W``, of the
    output of ``begin_update`` and of ``d_pad``, then verifies that
    ``_backprop_padding`` adds the gradient of an entry whose ID is negative
    into ``d_pad``.
    """
    model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP)
    assert model.W.shape == (nF, nO, nP, nI)
    tensor = model.ops.allocate((10, nI))
    Y, get_dX = model.begin_update(tensor)
    # The output has one more row than the input.
    assert Y.shape == (tensor.shape[0]+1, nF, nO, nP)
    assert model.d_pad.shape == (1, nF, nO, nP)
    dY = model.ops.allocate((15, nF, nO, nP))
    ids = model.ops.allocate((15, nF))
    # Mark a single (row 1, feature 2) entry as padding via a negative ID and
    # give it a gradient of 1.
    ids[1,2] = -1
    dY[1,2] = 1
    # d_pad must be untouched before the call and hold the gradient after it.
    assert model.d_pad[0, 2, 0, 0] == 0.
    model._backprop_padding(dY, ids)
    assert model.d_pad[0, 2, 0, 0] == 1.
--- a/website/_includes/_page_models.jade
+++ b/website/_includes/_page_models.jade
@ -40,6 +40,8 @@ for id in CURRENT_MODELS
            each label in ["Pipeline", "Vectors", "Sources", "Author", "License"]
                - var field = label.toLowerCase()
                if field == "vectors"
                    - field = "vecs"
                +row
                    +cell.u-nowrap
                        +label=label
--- a/website/_includes/_scripts.jade
+++ b/website/_includes/_scripts.jade
@ -13,7 +13,6 @@ script(src="/assets/js/vendor/prism.min.js")
 if SECTION == "models"
    script(src="/assets/js/vendor/chart.min.js")
    script(src="/assets/js/models.js?v#{V_JS}" type="module")
 script
    if quickstart
@ -24,15 +23,15 @@ script
        | (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date;
        | ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview');
-
+    if IS_PAGE
 if IS_PAGE
    script
        | ((window.gitter = {}).chat = {}).options = {
        |     useStyles: false,
        |     activationElement: '.js-gitter-button',
        |     targetElement: '.js-gitter',
        |     room: '!{SOCIAL.gitter}'
        | };
 if IS_PAGE
    script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer)
@ -48,10 +47,23 @@ if IS_PAGE
 - ModelLoader = "new ModelLoader('" + MODELS_REPO + "'," + JSON.stringify(CURRENT_MODELS) + "," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + ");"
 - ModelComparer = "new ModelComparer('" + MODELS_REPO + "'," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + "," + JSON.stringify(LANGUAGES) + "," + JSON.stringify(MODEL_META) + "," + JSON.stringify(default_models || false) + ");"
-//- Browsers with JS module support.
+if environment == "deploy"
-    Will be ignored otherwise.
+    //- DEPLOY: use compiled rollup.js and instantiate classes directly
-
+    script(src="/assets/js/rollup.js")
-script(type="module")
+    script
        !=ProgressBar
        if changelog
            !=Changelog
        if IS_PAGE
            !=NavHighlighter
            !=GitHubEmbed
        if HAS_MODELS
            //- Must match the ModelLoader string assembled above.
            !=ModelLoader
        if compare_models
            !=ModelComparer
 else
    //- DEVELOPMENT: Use ES6 modules
    script(type="module")
        | import ProgressBar from '/assets/js/progress.js';
        !=ProgressBar
        if changelog
@ -68,19 +80,3 @@ script(type="module")
        if compare_models
            | import { ModelComparer } from '/assets/js/models.js';
            !=ModelComparer
 //- Browsers with no JS module support.
    Won't be fetched or interpreted otherwise.
 script(nomodule src="/assets/js/rollup.js")
 script(nomodule)
    !=ProgressBar
    if changelog
        !=Changelog
    if IS_PAGE
        !=NavHighlighter
        !=GitHubEmbed
    if HAS_MODELS
        //- Must match the ModelLoader string assembled above.
        !=ModelLoader
    if compare_models
        !=ModelComparer
--- a/website/assets/css/_base/_layout.sass
+++ b/website/assets/css/_base/_layout.sass
@ -12,7 +12,6 @@ body
    animation: fadeIn 0.25s ease
    background: $color-back
    color: $color-front
    //scroll-behavior: smooth
 //- Paragraphs
--- a/website/assets/js/models.js
+++ b/website/assets/js/models.js
@ -20,21 +20,33 @@ const CHART_FONTS = {
 * @property {function} vectors - Format vector data (entries and dimensions).
 * @property {function} version - Format model version number.
 */
-export const formats = {
+const formats = {
    author: (author, url) => url ? `<a href="${url}" target="_blank">${author}</a>` : author,
    license: (license, url) => url ? `<a href="${url}" target="_blank">${license}</a>` : license,
    sources: sources => (sources instanceof Array) ? sources.join(', ') : sources,
    pipeline: pipes => (pipes && pipes.length) ? pipes.map(p => `<code>${p}</code>`).join(', ') : '-',
-    vectors: vec => vec ? `${abbrNumber(vec.keys)} keys, ${abbrNumber(vec.vectors)} unique vectors (${vec.width} dimensions)` : 'n/a',
+    vectors: vec => formatVectors(vec),
    version: version => `<code>v${version}</code>`
 };
 /**
 * Format word vectors data depending on contents.
 * @param {Object} data - The vectors object from the model's meta.json.
 * @returns {string} 'n/a' when no data is given, 'context vectors only' when
 *     every value in the object is loosely equal to 0 (an empty object also
 *     takes this branch, since every() is vacuously true), otherwise a
 *     summary of keys, unique vectors and dimensions.
 */
 const formatVectors = data => {
    if (!data) return 'n/a';
    if (Object.values(data).every(n => n == 0)) return 'context vectors only';
    // `vectors` is renamed to `vecs` while destructuring
    const { keys, vectors: vecs, width } = data;
    return `${abbrNumber(keys)} keys, ${abbrNumber(vecs)} unique vectors (${width} dimensions)`;
 }
 /**
 * Find the latest version of a model in a compatibility table.
 * @param {string} model - The model name.
 * @param {Object} compat - Compatibility table, keyed by spaCy version.
 */
-export const getLatestVersion = (model, compat = {}) => {
+const getLatestVersion = (model, compat = {}) => {
    for (let [spacy_v, models] of Object.entries(compat)) {
        if (models[model]) return models[model][0];
    }
@ -90,7 +102,7 @@ export class ModelLoader {
        const tpl = new Templater(modelId);
        tpl.get('table').removeAttribute('data-loading');
        tpl.get('error').style.display = 'block';
-        for (let key of ['sources', 'pipeline', 'vectors', 'author', 'license']) {
+        for (let key of ['sources', 'pipeline', 'vecs', 'author', 'license']) {
            tpl.get(key).parentElement.parentElement.style.display = 'none';
        }
    }
@ -120,8 +132,8 @@ export class ModelLoader {
        if (author) tpl.fill('author', formats.author(author, url), true);
        if (license) tpl.fill('license', formats.license(license, this.licenses[license]), true);
        if (sources) tpl.fill('sources', formats.sources(sources));
-        if (vectors) tpl.fill('vectors', formats.vectors(vectors));
+        if (vectors) tpl.fill('vecs', formats.vectors(vectors));
-        else tpl.get('vectors').parentElement.parentElement.style.display = 'none';
+        else tpl.get('vecs').parentElement.parentElement.style.display = 'none';
        if (pipeline && pipeline.length) tpl.fill('pipeline', formats.pipeline(pipeline), true);
        else tpl.get('pipeline').parentElement.parentElement.style.display = 'none';
    }
@ -186,6 +198,7 @@ export class ModelComparer {
        this.fonts = CHART_FONTS;
        this.defaultModels = defaultModels;
        this.tpl.get('result').style.display = 'block';
        this.tpl.get('error').style.display = 'none';
        this.fetchCompat()
            .then(compat => this.init(compat))
            .catch(this.showError.bind(this))
@ -223,8 +236,9 @@ export class ModelComparer {
        const version = getLatestVersion(name, this.compat);
        const modelName = `${name}-${version}`;
        return new Promise((resolve, reject) => {
            if (!version) reject();
            // resolve immediately if model already loaded, e.g. in this.models
-            if (this.models[name]) resolve(this.models[name]);
+            else if (this.models[name]) resolve(this.models[name]);
            else fetch(`${this.url}/meta/${modelName}.json`)
                .then(res => handleResponse(res))
                .then(json => json.ok ? resolve(this.saveModel(name, json)) : reject())
@ -306,12 +320,13 @@ export class ModelComparer {
        this.tpl.fill(`size${i}`, size);
        this.tpl.fill(`desc${i}`, description || 'n/a');
        this.tpl.fill(`pipeline${i}`, formats.pipeline(pipeline), true);
-        this.tpl.fill(`vectors${i}`, formats.vectors(vectors));
+        this.tpl.fill(`vecs${i}`, formats.vectors(vectors));
        this.tpl.fill(`sources${i}`, formats.sources(sources));
        this.tpl.fill(`author${i}`, formats.author(author, url), true);
        this.tpl.fill(`license${i}`, formats.license(license, this.licenses[license]), true);
        // check if model accuracy or speed includes one of the pre-set keys
-        for (let key of [...metaKeys, ...Object.keys(this.benchKeys.speed)]) {
+        const allKeys = [].concat(...Object.entries(this.benchKeys).map(([_, v]) => Object.keys(v)));
        for (let key of allKeys) {
            if (accuracy[key]) this.tpl.fill(`${key}${i}`, accuracy[key].toFixed(2))
            else if (speed[key]) this.tpl.fill(`${key}${i}`, convertNumber(Math.round(speed[key])))
            else this.tpl.fill(`${key}${i}`, 'n/a')
--- a/website/assets/js/util.js
+++ b/website/assets/js/util.js
@ -59,11 +59,12 @@ export const convertNumber = (num = 0, separator = ',') =>
 * @param {number|string} num - The number to convert.
 * @param {number} fixed - Number of decimals.
 */
-export const abbrNumber = (num = 0, fixed = 2) => {
+export const abbrNumber = (num = 0, fixed = 1) => {
    const suffixes = ['', 'k', 'm', 'b', 't'];
    if (num === null || num === 0) return 0;
    const b = num.toPrecision(2).split('e');
    const k = (b.length === 1) ? 0 : Math.floor(Math.min(b[1].slice(1), 14) / 3);
-    const c = (k < 1) ? num.toFixed(fixed) : (num / Math.pow(10, k * 3)).toFixed(fixed + 1);
+    const n = (k < 1) ? num : num / Math.pow(10, k * 3);
    const c = (k >= 1 && n >= 100 ) ? Math.round(n) : n.toFixed(fixed);
    return (c < 0 ? c : Math.abs(c)) + suffixes[k];
 }
--- a/website/models/_data.json
+++ b/website/models/_data.json
@ -12,6 +12,7 @@
            "Portuguese": "pt",
            "French": "fr",
            "Italian": "it",
            "Dutch": "nl",
            "Multi-Language": "xx"
        }
    },
@ -40,11 +41,9 @@
    "MODELS": {
        "en": ["en_core_web_sm", "en_core_web_lg", "en_vectors_web_lg"],
-        "de": ["de_dep_news_sm"],
+        "de": ["de_core_news_sm"],
-        "es": ["es_core_web_sm"],
+        "es": ["es_core_news_sm", "es_core_news_md"],
-        "pt": [],
+        "it": ["it_core_news_sm"],
        "fr": [],
        "it": [],
        "xx": ["xx_ent_wiki_sm"]
    },
@ -66,6 +65,7 @@
        "gpu": "words per second on GPU",
        "pipeline": "Processing pipeline components in order",
        "sources": "Sources of training data",
        "vecs": "Word vectors included in the model. Models that only support context vectors compute similarity via the tensors shared with the pipeline.",
        "benchmark_parser": "Parser accuracy",
        "benchmark_ner": "NER accuracy",
        "benchmark_speed": "Speed"
@ -74,9 +74,11 @@
    "MODEL_LICENSES": {
        "CC BY-SA":     "https://creativecommons.org/licenses/by-sa/3.0/",
        "CC BY-SA 3.0": "https://creativecommons.org/licenses/by-sa/3.0/",
        "CC BY-SA 4.0": "https://creativecommons.org/licenses/by-sa/4.0/",
        "CC BY-NC":     "https://creativecommons.org/licenses/by-nc/3.0/",
        "CC BY-NC 3.0": "https://creativecommons.org/licenses/by-nc/3.0/",
-        "GPL":          "http://www.gnu.de/documents/gpl.en.html"
+        "GPL":          "https://www.gnu.org/licenses/gpl.html",
        "LGPL":         "https://www.gnu.org/licenses/lgpl.html"
    },
    "MODEL_BENCHMARKS": {
@ -99,6 +101,9 @@
        "da": "Danish",
        "hu": "Hungarian",
        "pl": "Polish",
        "ro": "Romanian",
        "hr": "Croatian",
        "tr": "Turkish",
        "he": "Hebrew",
        "ga": "Irish",
        "bn": "Bengali",
--- a/website/models/comparison.jade
+++ b/website/models/comparison.jade
@ -53,6 +53,8 @@ div(data-tpl=TPL data-tpl-key="result" style="display: none")
        for label in ["Version", "Size", "Pipeline", "Vectors", "Sources", "Author", "License"]
            - var field = label.toLowerCase()
            if field == "vectors"
                - field = "vecs"
            +row
                +cell.u-nowrap
                    +label=label
--- a/website/models/nl.jade
+++ b/website/models/nl.jade
@ -0,0 +1,6 @@
 //- 💫 DOCS > MODELS > NL
 include ../_includes/_mixins
 //- This is a placeholder. The page is rendered via the template at
 //- /_includes/_page-model.jade.
--- a/website/package.json
+++ b/website/package.json
@ -9,7 +9,8 @@
    "babel-cli": "^6.14.0",
    "harp": "^0.24.0",
    "rollup": "^0.50.0",
-    "uglify-js": "^2.7.3"
+    "uglify-js": "^2.7.3",
    "broken-link-checker": "^0.7.6"
  },
  "dependencies": {},
  "scripts": {
--- a/website/usage/_adding-languages/_language-data.jade
+++ b/website/usage/_adding-languages/_language-data.jade
@ -218,7 +218,7 @@ p
    |  If an exception consists of more than one token, the #[code ORTH] values
    |  combined always need to #[strong match the original string]. The way the
    |  original string is split up can be pretty arbitrary sometimes – for
-    |  example "gonna" is split into "gon" (lemma "go") nad "na" (lemma "to").
+    |  example "gonna" is split into "gon" (lemma "go") and "na" (lemma "to").
    |  Because of how the tokenizer works, it's currently not possible to split
    |  single-letter strings into multiple tokens.
--- a/website/usage/_spacy-101/_word-vectors.jade
+++ b/website/usage/_spacy-101/_word-vectors.jade
@ -4,9 +4,9 @@ p
    |  Similarity is determined by comparing #[strong word vectors] or "word
    |  embeddings", multi-dimensional meaning representations of a word. Word
    |  vectors can be generated using an algorithm like
-    |  #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. Most of spaCy's
+    |  #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. spaCy's medium
-    |  #[+a("/models") default models] come with
+    |  #[code md] and large #[code lg] #[+a("/models") models] come with
-    |  #[strong 300-dimensional vectors] that look like this:
+    |  #[strong multi-dimensional vectors] that look like this:
 +code("banana.vector", false, false, 250).
    array([2.02280000e-01,  -7.66180009e-02,   3.70319992e-01,
--- a/website/usage/_vectors-similarity/_basics.jade
+++ b/website/usage/_vectors-similarity/_basics.jade
@ -4,12 +4,9 @@
    |  Dense, real valued vectors representing distributional similarity
    |  information are now a cornerstone of practical NLP. The most common way
    |  to train these vectors is the #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]
-    |  family of algorithms. The default
+    |  family of algorithms. If you need to train a word2vec model, we recommend
-    |  #[+a("/models/en") English model] installs
+    |  the implementation in the Python library
-    |  300-dimensional vectors trained on the
+    |  #[+a("https://radimrehurek.com/gensim/") Gensim].
    |  #[+a("http://commoncrawl.org") Common Crawl] corpus.
    |  If you need to train a word2vec model, we recommend the implementation in
    |  the Python library #[+a("https://radimrehurek.com/gensim/") Gensim].
 include ../_spacy-101/_similarity
 include ../_spacy-101/_word-vectors