diff --git a/spacy/lang/tr/__init__.py b/spacy/lang/tr/__init__.py
new file mode 100644
index 000000000..d1cd04f42
--- /dev/null
+++ b/spacy/lang/tr/__init__.py
@@ -0,0 +1,28 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .stop_words import STOP_WORDS
+
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
+from ...language import Language
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups
+
+
+class TurkishDefaults(Language.Defaults):
+ lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+ lex_attr_getters[LANG] = lambda text: 'tr'
+ lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
+ tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+ stop_words = STOP_WORDS
+
+
+class Turkish(Language):
+ lang = 'tr'
+ Defaults = TurkishDefaults
+
+
+__all__ = ['Turkish']
+
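Usage note: the new subpackage can be exercised on its own. A minimal sketch, assuming the spaCy v2 API (the example sentence is illustrative; `spacy.blank('tr')` should return the same class once the language is registered):

```python
# Minimal sketch: build a blank Turkish pipeline from the new subpackage (spaCy v2 assumed).
from spacy.lang.tr import Turkish

nlp = Turkish()                       # tokenizer only, no statistical models
doc = nlp(u"Bu bir örnek cümledir.")
print([token.text for token in doc])
```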
diff --git a/spacy/lang/tr/stop_words.py b/spacy/lang/tr/stop_words.py
new file mode 100644
index 000000000..aaed02a3e
--- /dev/null
+++ b/spacy/lang/tr/stop_words.py
@@ -0,0 +1,478 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+# Source: https://github.com/stopwords-iso/stopwords-tr
+
+STOP_WORDS = set("""
+acaba
+acep
+adamakıllı
+adeta
+ait
+altmış
+altı
+ama
+amma
+anca
+ancak
+arada
+artık
+aslında
+aynen
+ayrıca
+az
+açıkça
+açıkçası
+bana
+bari
+bazen
+bazı
+başkası
+başka
+belki
+ben
+benden
+beni
+benim
+beri
+beriki
+beş
+bilcümle
+bile
+bin
+binaen
+binaenaleyh
+bir
+biraz
+birazdan
+birbiri
+birden
+birdenbire
+biri
+birice
+birileri
+birisi
+birkaç
+birkaçı
+birkez
+birlikte
+birçok
+birçoğu
+birşey
+birşeyi
+bitevi
+biteviye
+bittabi
+biz
+bizatihi
+bizce
+bizcileyin
+bizden
+bize
+bizi
+bizim
+bizimki
+bizzat
+boşuna
+bu
+buna
+bunda
+bundan
+bunlar
+bunları
+bunların
+bunu
+bunun
+buracıkta
+burada
+buradan
+burası
+böyle
+böylece
+böylecene
+böylelikle
+böylemesine
+böylesine
+büsbütün
+bütün
+cuk
+cümlesi
+da
+daha
+dahi
+dahil
+dahilen
+daima
+dair
+dayanarak
+de
+defa
+dek
+demin
+demincek
+deminden
+denli
+derakap
+derhal
+derken
+değil
+değin
+diye
+diğer
+diğeri
+doksan
+dokuz
+dolayı
+dolayısıyla
+doğru
+dört
+edecek
+eden
+ederek
+edilecek
+ediliyor
+edilmesi
+ediyor
+elbet
+elbette
+elli
+emme
+en
+enikonu
+epey
+epeyce
+epeyi
+esasen
+esnasında
+etmesi
+etraflı
+etraflıca
+etti
+ettiği
+ettiğini
+evleviyetle
+evvel
+evvela
+evvelce
+evvelden
+evvelemirde
+evveli
+eğer
+fakat
+filanca
+gah
+gayet
+gayetle
+gayri
+gayrı
+gelgelelim
+gene
+gerek
+gerçi
+geçende
+geçenlerde
+gibi
+gibilerden
+gibisinden
+gine
+göre
+gırla
+hakeza
+halbuki
+halen
+halihazırda
+haliyle
+handiyse
+hangi
+hangisi
+hani
+hariç
+hasebiyle
+hasılı
+hatta
+hele
+hem
+henüz
+hep
+hepsi
+her
+herhangi
+herkes
+herkesin
+hiç
+hiçbir
+hiçbiri
+hoş
+hulasaten
+iken
+iki
+ila
+ile
+ilen
+ilgili
+ilk
+illa
+illaki
+imdi
+indinde
+inen
+insermi
+ise
+ister
+itibaren
+itibariyle
+itibarıyla
+iyi
+iyice
+iyicene
+için
+iş
+işte
+kadar
+kaffesi
+kah
+kala
+kanımca
+karşın
+katrilyon
+kaynak
+kaçı
+kelli
+kendi
+kendilerine
+kendini
+kendisi
+kendisine
+kendisini
+kere
+kez
+keza
+kezalik
+keşke
+ki
+kim
+kimden
+kime
+kimi
+kimisi
+kimse
+kimsecik
+kimsecikler
+külliyen
+kırk
+kısaca
+lakin
+leh
+lütfen
+maada
+madem
+mademki
+mamafih
+mebni
+meğer
+meğerki
+meğerse
+milyar
+milyon
+mu
+mü
+mı
+nasıl
+nasılsa
+nazaran
+naşi
+ne
+neden
+nedeniyle
+nedenle
+nedense
+nerde
+nerden
+nerdeyse
+nere
+nerede
+nereden
+neredeyse
+neresi
+nereye
+netekim
+neye
+neyi
+neyse
+nice
+nihayet
+nihayetinde
+nitekim
+niye
+niçin
+o
+olan
+olarak
+oldu
+olduklarını
+oldukça
+olduğu
+olduğunu
+olmadı
+olmadığı
+olmak
+olması
+olmayan
+olmaz
+olsa
+olsun
+olup
+olur
+olursa
+oluyor
+on
+ona
+onca
+onculayın
+onda
+ondan
+onlar
+onlardan
+onları
+onların
+onu
+onun
+oracık
+oracıkta
+orada
+oradan
+oranca
+oranla
+oraya
+otuz
+oysa
+oysaki
+pek
+pekala
+peki
+pekçe
+peyderpey
+rağmen
+sadece
+sahi
+sahiden
+sana
+sanki
+sekiz
+seksen
+sen
+senden
+seni
+senin
+siz
+sizden
+sizi
+sizin
+sonra
+sonradan
+sonraları
+sonunda
+tabii
+tam
+tamam
+tamamen
+tamamıyla
+tarafından
+tek
+trilyon
+tüm
+var
+vardı
+vasıtasıyla
+ve
+velev
+velhasıl
+velhasılıkelam
+veya
+veyahut
+ya
+yahut
+yakinen
+yakında
+yakından
+yakınlarda
+yalnız
+yalnızca
+yani
+yapacak
+yapmak
+yaptı
+yaptıkları
+yaptığı
+yaptığını
+yapılan
+yapılması
+yapıyor
+yedi
+yeniden
+yenilerde
+yerine
+yetmiş
+yine
+yirmi
+yok
+yoksa
+yoluyla
+yüz
+yüzünden
+zarfında
+zaten
+zati
+zira
+çabuk
+çabukça
+çeşitli
+çok
+çokları
+çoklarınca
+çokluk
+çoklukla
+çokça
+çoğu
+çoğun
+çoğunca
+çoğunlukla
+çünkü
+öbür
+öbürkü
+öbürü
+önce
+önceden
+önceleri
+öncelikle
+öteki
+ötekisi
+öyle
+öylece
+öylelikle
+öylemesine
+öz
+üzere
+üç
+şayet
+şey
+şeyden
+şeyi
+şeyler
+şu
+şuna
+şuncacık
+şunda
+şundan
+şunlar
+şunları
+şunu
+şunun
+şura
+şuracık
+şuracıkta
+şurası
+şöyle
+şimdi
+""".split())
diff --git a/spacy/lang/tr/tokenizer_exceptions.py b/spacy/lang/tr/tokenizer_exceptions.py
new file mode 100644
index 000000000..c945c0058
--- /dev/null
+++ b/spacy/lang/tr/tokenizer_exceptions.py
@@ -0,0 +1,27 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import ORTH, NORM
+
+
+# These exceptions are mostly for example purposes – hoping that Turkish
+# speakers can contribute in the future! Source of copy-pasted examples:
+# https://en.wiktionary.org/wiki/Category:Turkish_language
+
+_exc = {
+ "sağol": [
+ {ORTH: "sağ"},
+ {ORTH: "ol", NORM: "olun"}]
+}
+
+
+for exc_data in [
+ {ORTH: "A.B.D.", NORM: "Amerika Birleşik Devletleri"}]:
+ _exc[exc_data[ORTH]] = [exc_data]
+
+
+for orth in ["Dr."]:
+ _exc[orth] = [{ORTH: orth}]
+
+
+TOKENIZER_EXCEPTIONS = _exc
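The special cases above can be checked against the tokenizer directly. A hedged sketch (spaCy v2 assumed; expected output shown as comments):

```python
# Sketch: exercising the new Turkish tokenizer exceptions (spaCy v2 assumed).
from spacy.lang.tr import Turkish

nlp = Turkish()
print([t.text for t in nlp(u"sağol")])          # expected: ['sağ', 'ol']
print([t.text for t in nlp(u"A.B.D. tarihi")])  # 'A.B.D.' should stay a single token
```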
diff --git a/website/_includes/_page_models.jade b/website/_includes/_page_models.jade
index 1cab930fb..c7742fa38 100644
--- a/website/_includes/_page_models.jade
+++ b/website/_includes/_page_models.jade
@@ -40,6 +40,8 @@ for id in CURRENT_MODELS
each label in ["Pipeline", "Vectors", "Sources", "Author", "License"]
- var field = label.toLowerCase()
+ if field == "vectors"
+ - field = "vecs"
+row
+cell.u-nowrap
+label=label
diff --git a/website/assets/js/models.js b/website/assets/js/models.js
index 2d371ee1f..f5757c8cb 100644
--- a/website/assets/js/models.js
+++ b/website/assets/js/models.js
@@ -20,21 +20,33 @@ const CHART_FONTS = {
* @property {function} vectors - Format vector data (entries and dimensions).
* @property {function} version - Format model version number.
*/
-export const formats = {
+const formats = {
author: (author, url) => url ? `<a href="${url}">${author}</a>` : author,
license: (license, url) => url ? `<a href="${url}">${license}</a>` : license,
sources: sources => (sources instanceof Array) ? sources.join(', ') : sources,
pipeline: pipes => (pipes && pipes.length) ? pipes.map(p => `<code>${p}</code>`).join(', ') : '-',
- vectors: vec => vec ? `${abbrNumber(vec.keys)} keys, ${abbrNumber(vec.vectors)} unique vectors (${vec.width} dimensions)` : 'n/a',
+ vectors: vec => formatVectors(vec),
version: version => `<code>v${version}</code>`
};
+/**
+ * Format word vector data depending on its contents.
+ * @param {Object} data - The vectors object from the model's meta.json.
+ */
+const formatVectors = data => {
+ if (!data) return 'n/a';
+ if (Object.values(data).every(n => n == 0)) return 'context vectors only';
+ const { keys, vectors: vecs, width } = data;
+ return `${abbrNumber(keys)} keys, ${abbrNumber(vecs)} unique vectors (${width} dimensions)`;
+}
+
+
/**
* Find the latest version of a model in a compatibility table.
* @param {string} model - The model name.
* @param {Object} compat - Compatibility table, keyed by spaCy version.
*/
-export const getLatestVersion = (model, compat = {}) => {
+const getLatestVersion = (model, compat = {}) => {
for (let [spacy_v, models] of Object.entries(compat)) {
if (models[model]) return models[model][0];
}
@@ -90,7 +102,7 @@ export class ModelLoader {
const tpl = new Templater(modelId);
tpl.get('table').removeAttribute('data-loading');
tpl.get('error').style.display = 'block';
- for (let key of ['sources', 'pipeline', 'vectors', 'author', 'license']) {
+ for (let key of ['sources', 'pipeline', 'vecs', 'author', 'license']) {
tpl.get(key).parentElement.parentElement.style.display = 'none';
}
}
@@ -120,8 +132,8 @@ export class ModelLoader {
if (author) tpl.fill('author', formats.author(author, url), true);
if (license) tpl.fill('license', formats.license(license, this.licenses[license]), true);
if (sources) tpl.fill('sources', formats.sources(sources));
- if (vectors) tpl.fill('vectors', formats.vectors(vectors));
- else tpl.get('vectors').parentElement.parentElement.style.display = 'none';
+ if (vectors) tpl.fill('vecs', formats.vectors(vectors));
+ else tpl.get('vecs').parentElement.parentElement.style.display = 'none';
if (pipeline && pipeline.length) tpl.fill('pipeline', formats.pipeline(pipeline), true);
else tpl.get('pipeline').parentElement.parentElement.style.display = 'none';
}
@@ -223,8 +235,9 @@ export class ModelComparer {
const version = getLatestVersion(name, this.compat);
const modelName = `${name}-${version}`;
return new Promise((resolve, reject) => {
+ if (!version) reject();
// resolve immediately if model already loaded, e.g. in this.models
- if (this.models[name]) resolve(this.models[name]);
+ else if (this.models[name]) resolve(this.models[name]);
else fetch(`${this.url}/meta/${modelName}.json`)
.then(res => handleResponse(res))
.then(json => json.ok ? resolve(this.saveModel(name, json)) : reject())
@@ -306,12 +319,13 @@ export class ModelComparer {
this.tpl.fill(`size${i}`, size);
this.tpl.fill(`desc${i}`, description || 'n/a');
this.tpl.fill(`pipeline${i}`, formats.pipeline(pipeline), true);
- this.tpl.fill(`vectors${i}`, formats.vectors(vectors));
+ this.tpl.fill(`vecs${i}`, formats.vectors(vectors));
this.tpl.fill(`sources${i}`, formats.sources(sources));
this.tpl.fill(`author${i}`, formats.author(author, url), true);
this.tpl.fill(`license${i}`, formats.license(license, this.licenses[license]), true);
// check if model accuracy or speed includes one of the pre-set keys
- for (let key of [...metaKeys, ...Object.keys(this.benchKeys.speed)]) {
+ const allKeys = [].concat(...Object.entries(this.benchKeys).map(([_, v]) => Object.keys(v)));
+ for (let key of allKeys) {
if (accuracy[key]) this.tpl.fill(`${key}${i}`, accuracy[key].toFixed(2))
else if (speed[key]) this.tpl.fill(`${key}${i}`, convertNumber(Math.round(speed[key])))
else this.tpl.fill(`${key}${i}`, 'n/a')
diff --git a/website/assets/js/util.js b/website/assets/js/util.js
index 65d05774c..90e0b5994 100644
--- a/website/assets/js/util.js
+++ b/website/assets/js/util.js
@@ -59,11 +59,12 @@ export const convertNumber = (num = 0, separator = ',') =>
* @param {number|string} num - The number to convert.
* @param {number} fixed - Number of decimals.
*/
-export const abbrNumber = (num = 0, fixed = 2) => {
+export const abbrNumber = (num = 0, fixed = 1) => {
const suffixes = ['', 'k', 'm', 'b', 't'];
if (num === null || num === 0) return 0;
const b = num.toPrecision(2).split('e');
const k = (b.length === 1) ? 0 : Math.floor(Math.min(b[1].slice(1), 14) / 3);
- const c = (k < 1) ? num.toFixed(fixed) : (num / Math.pow(10, k * 3)).toFixed(fixed + 1);
+ const n = (k < 1) ? num : num / Math.pow(10, k * 3);
+    const c = (k >= 1 && n >= 100) ? Math.round(n) : n.toFixed(fixed);
return (c < 0 ? c : Math.abs(c)) + suffixes[k];
}
diff --git a/website/models/_data.json b/website/models/_data.json
index 62f21dd6f..8507a3fa1 100644
--- a/website/models/_data.json
+++ b/website/models/_data.json
@@ -12,6 +12,7 @@
"Portuguese": "pt",
"French": "fr",
"Italian": "it",
+ "Dutch": "nl",
"Multi-Language": "xx"
}
},
@@ -39,12 +40,13 @@
},
"MODELS": {
- "en": ["en_core_web_sm", "en_core_web_lg", "en_vectors_web_lg"],
- "de": ["de_dep_news_sm"],
- "es": ["es_core_web_sm"],
- "pt": [],
- "fr": [],
- "it": [],
+ "en": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_vectors_web_lg"],
+ "de": ["de_core_news_sm", "de_core_news_md"],
+ "es": ["es_core_news_sm", "es_core_news_md", "es_vectors_web_lg"],
+ "pt": ["pt_core_news_sm"],
+ "fr": ["fr_core_news_sm", "fr_core_news_md", "fr_vectors_web_lg"],
+ "it": ["it_core_news_sm"],
+ "nl": ["nl_core_news_sm"],
"xx": ["xx_ent_wiki_sm"]
},
@@ -66,6 +68,7 @@
"gpu": "words per second on GPU",
"pipeline": "Processing pipeline components in order",
"sources": "Sources of training data",
+ "vecs": "Word vectors included in the model. Models that only support context vectors compute similarity via the tensors shared with the pipeline.",
"benchmark_parser": "Parser accuracy",
"benchmark_ner": "NER accuracy",
"benchmark_speed": "Speed"
@@ -103,6 +106,7 @@
"pl": "Polish",
"ro": "Romanian",
"hr": "Croatian",
+ "tr": "Turkish",
"he": "Hebrew",
"ga": "Irish",
"bn": "Bengali",
diff --git a/website/models/comparison.jade b/website/models/comparison.jade
index 881a9aff4..b0ab61efe 100644
--- a/website/models/comparison.jade
+++ b/website/models/comparison.jade
@@ -53,6 +53,8 @@ div(data-tpl=TPL data-tpl-key="result" style="display: none")
for label in ["Version", "Size", "Pipeline", "Vectors", "Sources", "Author", "License"]
- var field = label.toLowerCase()
+ if field == "vectors"
+ - field = "vecs"
+row
+cell.u-nowrap
+label=label
diff --git a/website/models/nl.jade b/website/models/nl.jade
new file mode 100644
index 000000000..081b4a712
--- /dev/null
+++ b/website/models/nl.jade
@@ -0,0 +1,6 @@
+//- 💫 DOCS > MODELS > NL
+
+include ../_includes/_mixins
+
+//- This is a placeholder. The page is rendered via the template at
+//- /_includes/_page_models.jade.
diff --git a/website/usage/_spacy-101/_word-vectors.jade b/website/usage/_spacy-101/_word-vectors.jade
index bb9add8a6..c38360014 100644
--- a/website/usage/_spacy-101/_word-vectors.jade
+++ b/website/usage/_spacy-101/_word-vectors.jade
@@ -4,9 +4,9 @@ p
| Similarity is determined by comparing #[strong word vectors] or "word
| embeddings", multi-dimensional meaning representations of a word. Word
| vectors can be generated using an algorithm like
- | #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. Most of spaCy's
- | #[+a("/models") default models] come with
- | #[strong 300-dimensional vectors] that look like this:
+ | #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. spaCy's medium
+ | #[code md] and large #[code lg] #[+a("/models") models] come with
+ | #[strong multi-dimensional vectors] that look like this:
+code("banana.vector", false, false, 250).
array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01,
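Since the docs now point users at the `md` and `lg` packages for real word vectors, here is a hedged sketch of the corresponding code (assumes `en_core_web_md` from the updated model listing has been downloaded):

```python
# Sketch: inspecting word vectors from a medium model. Assumes the package has
# been installed, e.g. via `python -m spacy download en_core_web_md`.
import spacy

nlp = spacy.load('en_core_web_md')
token = nlp(u"banana")[0]
print(token.has_vector)       # True for in-vocabulary words
print(token.vector.shape)     # e.g. (300,), depending on the model
```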
diff --git a/website/usage/_vectors-similarity/_basics.jade b/website/usage/_vectors-similarity/_basics.jade
index 07ad6bcd4..734495c6e 100644
--- a/website/usage/_vectors-similarity/_basics.jade
+++ b/website/usage/_vectors-similarity/_basics.jade
@@ -4,12 +4,9 @@
| Dense, real valued vectors representing distributional similarity
| information are now a cornerstone of practical NLP. The most common way
| to train these vectors is the #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]
- | family of algorithms. The default
- | #[+a("/models/en") English model] installs
- | 300-dimensional vectors trained on the
- | #[+a("http://commoncrawl.org") Common Crawl] corpus.
- | If you need to train a word2vec model, we recommend the implementation in
- | the Python library #[+a("https://radimrehurek.com/gensim/") Gensim].
+ | family of algorithms. If you need to train a word2vec model, we recommend
+ | the implementation in the Python library
+ | #[+a("https://radimrehurek.com/gensim/") Gensim].
include ../_spacy-101/_similarity
include ../_spacy-101/_word-vectors
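Since the rewritten paragraph recommends Gensim for training custom vectors, here is a hedged sketch of that workflow (Gensim 3.x API assumed; `size` became `vector_size` in 4.x; the corpus and parameters are placeholders):

```python
# Sketch: training word vectors with Gensim, as recommended in the docs text.
from gensim.models import Word2Vec

sentences = [
    ["spacy", "is", "an", "nlp", "library"],
    ["word", "vectors", "capture", "distributional", "similarity"],
]
model = Word2Vec(sentences, size=300, window=5, min_count=1, workers=2)
print(model.wv["vectors"][:5])   # first five dimensions of the learned vector
```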