Merge branch 'develop' of https://github.com/explosion/spaCy into develop

2025-11-04 18:07:26 +03:00 · 2017-11-03 00:55:20 +01:00 · 2017-11-03 00:55:20 +01:00 · 54a716f2ec
commit 54a716f2ec
parent 260e6ee3fb 43512c68b2
11 changed files with 619 additions and 26 deletions
--- a/spacy/lang/tr/__init__.py
+++ b/spacy/lang/tr/__init__.py
@ -0,0 +1,28 @@
 # coding: utf8
 from __future__ import unicode_literals
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups
 class TurkishDefaults(Language.Defaults):
    """Defaults for Turkish: lexical attribute getters, tokenizer
    exceptions and stop words."""
    # Work on a copy so the getters dict of the base defaults is not mutated.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update({
        # Every lexeme reports the language code 'tr'.
        LANG: lambda text: 'tr',
        # Wrap the base NORM getter with the shared base norm lookups.
        NORM: add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS),
    })
    stop_words = STOP_WORDS
    # Merge the shared base exceptions with the Turkish-specific ones.
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 class Turkish(Language):
    """Language subclass for Turkish ('tr'), configured by TurkishDefaults."""
    Defaults = TurkishDefaults
    lang = 'tr'
 # Only the language class is part of this module's public API.
 __all__ = ['Turkish']
--- a/spacy/lang/tr/stop_words.py
+++ b/spacy/lang/tr/stop_words.py
@ -0,0 +1,512 @@
 # encoding: utf8
 from __future__ import unicode_literals
 # Source: https://github.com/stopwords-iso/stopwords-tr
 # Turkish stop words, one per line, split on whitespace into a set.
 # NOTE: the upstream list also carried mis-encoded duplicates (Windows-1254
 # text decoded as Latin-1 / Windows-1250, e.g. "ý" for "ı", "þ"/"ţ" for "ş",
 # "đ" for "ğ"). Those can never match correctly decoded text, so each one is
 # replaced by its proper spelling here; duplicates collapse in the set.
 STOP_WORDS = set("""
 acaba
 acep
 adamakıllı
 adeta
 ait
 altmış
 altı
 ama
 amma
 anca
 ancak
 arada
 artık
 aslında
 aynen
 ayrıca
 az
 açıkça
 açıkçası
 bana
 bari
 bazen
 bazı
 başka
 başkası
 belki
 ben
 benden
 beni
 benim
 beri
 beriki
 beş
 bilcümle
 bile
 bin
 binaen
 binaenaleyh
 bir
 biraz
 birazdan
 birbiri
 birden
 birdenbire
 biri
 birice
 birileri
 birisi
 birkaç
 birkaçı
 birkez
 birlikte
 birçok
 birçoğu
 birşey
 birşeyi
 bitevi
 biteviye
 bittabi
 biz
 bizatihi
 bizce
 bizcileyin
 bizden
 bize
 bizi
 bizim
 bizimki
 bizzat
 boşuna
 bu
 buna
 bunda
 bundan
 bunlar
 bunları
 bunların
 bunu
 bunun
 buracıkta
 burada
 buradan
 burası
 böyle
 böylece
 böylecene
 böylelikle
 böylemesine
 böylesine
 büsbütün
 bütün
 cuk
 cümlesi
 da
 daha
 dahi
 dahil
 dahilen
 daima
 dair
 dayanarak
 de
 defa
 dek
 demin
 demincek
 deminden
 denli
 derakap
 derhal
 derken
 değil
 değin
 diye
 diğer
 diğeri
 doksan
 dokuz
 dolayı
 dolayısıyla
 doğru
 dört
 edecek
 eden
 ederek
 edilecek
 ediliyor
 edilmesi
 ediyor
 elbet
 elbette
 elli
 emme
 en
 enikonu
 epey
 epeyce
 epeyi
 esasen
 esnasında
 etmesi
 etraflı
 etraflıca
 etti
 ettiği
 ettiğini
 evleviyetle
 evvel
 evvela
 evvelce
 evvelden
 evvelemirde
 evveli
 eğer
 fakat
 filanca
 gah
 gayet
 gayetle
 gayri
 gayrı
 gelgelelim
 gene
 gerek
 gerçi
 geçende
 geçenlerde
 gibi
 gibilerden
 gibisinden
 gine
 göre
 gırla
 hakeza
 halbuki
 halen
 halihazırda
 haliyle
 handiyse
 hangi
 hangisi
 hani
 hariç
 hasebiyle
 hasılı
 hatta
 hele
 hem
 henüz
 hep
 hepsi
 her
 herhangi
 herkes
 herkesin
 hiç
 hiçbir
 hiçbiri
 hoş
 hulasaten
 iken
 iki
 ila
 ile
 ilen
 ilgili
 ilk
 illa
 illaki
 imdi
 indinde
 inen
 insermi
 ise
 ister
 itibaren
 itibariyle
 itibarıyla
 iyi
 iyice
 iyicene
 için
 iş
 işte
 kadar
 kaffesi
 kah
 kala
 kanımca
 karşın
 katrilyon
 kaynak
 kaçı
 kelli
 kendi
 kendilerine
 kendini
 kendisi
 kendisine
 kendisini
 kere
 kez
 keza
 kezalik
 keşke
 ki
 kim
 kimden
 kime
 kimi
 kimisi
 kimse
 kimsecik
 kimsecikler
 külliyen
 kırk
 kısaca
 lakin
 leh
 lütfen
 maada
 madem
 mademki
 mamafih
 mebni
 meğer
 meğerki
 meğerse
 milyar
 milyon
 mu
 mü
 mı
 nasıl
 nasılsa
 nazaran
 naşi
 ne
 neden
 nedeniyle
 nedenle
 nedense
 nerde
 nerden
 nerdeyse
 nere
 nerede
 nereden
 neredeyse
 neresi
 nereye
 netekim
 neye
 neyi
 neyse
 nice
 nihayet
 nihayetinde
 nitekim
 niye
 niçin
 o
 olan
 olarak
 oldu
 olduklarını
 oldukça
 olduğu
 olduğunu
 olmadı
 olmadığı
 olmak
 olması
 olmayan
 olmaz
 olsa
 olsun
 olup
 olur
 olursa
 oluyor
 on
 ona
 onca
 onculayın
 onda
 ondan
 onlar
 onlardan
 onlari
 onları
 onların
 onu
 onun
 oracık
 oracıkta
 orada
 oradan
 oranca
 oranla
 oraya
 otuz
 oysa
 oysaki
 pek
 pekala
 peki
 pekçe
 peyderpey
 rağmen
 sadece
 sahi
 sahiden
 sana
 sanki
 sekiz
 seksen
 sen
 senden
 seni
 senin
 siz
 sizden
 sizi
 sizin
 sonra
 sonradan
 sonraları
 sonunda
 tabii
 tam
 tamam
 tamamen
 tamamıyla
 tarafından
 tek
 trilyon
 tüm
 var
 vardı
 vasıtasıyla
 ve
 velev
 velhasıl
 velhasılıkelam
 veya
 veyahut
 ya
 yahut
 yakinen
 yakında
 yakından
 yakınlarda
 yalnız
 yalnızca
 yani
 yapacak
 yapmak
 yaptı
 yaptıkları
 yaptığı
 yaptığını
 yapılan
 yapılması
 yapıyor
 yedi
 yeniden
 yenilerde
 yerine
 yetmiş
 yine
 yirmi
 yok
 yoksa
 yoluyla
 yüz
 yüzünden
 zarfında
 zaten
 zati
 zira
 çabuk
 çabukça
 çeşitli
 çok
 çokları
 çoklarınca
 çokluk
 çoklukla
 çokça
 çoğu
 çoğun
 çoğunca
 çoğunlukla
 çünkü
 öbür
 öbürkü
 öbürü
 önce
 önceden
 önceleri
 öncelikle
 öteki
 ötekisi
 öyle
 öylece
 öylelikle
 öylemesine
 öz
 üzere
 üç
 şayet
 şey
 şeyden
 şeyi
 şeyler
 şimdi
 şu
 şuna
 şuncacık
 şunda
 şundan
 şunlar
 şunları
 şunu
 şunun
 şura
 şuracık
 şuracıkta
 şurası
 şöyle
 """.split())
--- a/spacy/lang/tr/tokenizer_exceptions.py
+++ b/spacy/lang/tr/tokenizer_exceptions.py
@ -0,0 +1,27 @@
 # coding: utf8
 from __future__ import unicode_literals
 from ...symbols import ORTH, NORM
 # These exceptions are mostly for example purposes – hoping that Turkish
 # speakers can contribute in the future! Source of copy-pasted examples:
 # https://en.wiktionary.org/wiki/Category:Turkish_language
 # Maps a surface form to the list of token attribute dicts it is split into.
 _exc = dict()
 # "sağol" becomes two tokens; the second one carries a NORM value.
 _exc["sağol"] = [{ORTH: "sağ"}, {ORTH: "ol", NORM: "olun"}]
 # Single-token exceptions that carry a NORM, keyed by their own ORTH value.
 for exc_data in ({ORTH: "A.B.D.", NORM: "Amerika Birleşik Devletleri"},):
    _exc.update({exc_data[ORTH]: [exc_data]})
 # Single-token exceptions with no attributes beyond ORTH.
 for orth in ("Dr.",):
    _exc.update({orth: [{ORTH: orth}]})
 TOKENIZER_EXCEPTIONS = _exc
--- a/website/_includes/_page_models.jade
+++ b/website/_includes/_page_models.jade
@ -40,6 +40,8 @@ for id in CURRENT_MODELS
            each label in ["Pipeline", "Vectors", "Sources", "Author", "License"]
                - var field = label.toLowerCase()
                if field == "vectors"
                    - field = "vecs"
                +row
                    +cell.u-nowrap
                        +label=label
--- a/website/assets/js/models.js
+++ b/website/assets/js/models.js
@ -20,21 +20,33 @@ const CHART_FONTS = {
 * @property {function} vectors - Format vector data (entries and dimensions).
 * @property {function} version - Format model version number.
 */
-export const formats = {
+const formats = {
    author: (author, url) => url ? `<a href="${url}" target="_blank">${author}</a>` : author,
    license: (license, url) => url ? `<a href="${url}" target="_blank">${license}</a>` : license,
    sources: sources => (sources instanceof Array) ? sources.join(', ') : sources,
    pipeline: pipes => (pipes && pipes.length) ? pipes.map(p => `<code>${p}</code>`).join(', ') : '-',
-    vectors: vec => vec ? `${abbrNumber(vec.keys)} keys, ${abbrNumber(vec.vectors)} unique vectors (${vec.width} dimensions)` : 'n/a',
+    vectors: vec => formatVectors(vec),
    version: version => `<code>v${version}</code>`
 };
 /**
 * Build the display string for a model's word vectors.
 * @param {Object} data - The vectors object from the model's meta.json.
 * @returns {string} 'n/a' if no data is given, 'context vectors only' if
 * every value in the object is 0, otherwise a summary of keys, unique
 * vectors and dimensions.
 */
 const formatVectors = data => {
    if (!data) {
        return 'n/a';
    }
    // all values zero: nothing to summarise beyond context vectors
    const allZero = Object.values(data).every(value => value == 0);
    if (allZero) {
        return 'context vectors only';
    }
    const keyCount = abbrNumber(data.keys);
    const vectorCount = abbrNumber(data.vectors);
    return `${keyCount} keys, ${vectorCount} unique vectors (${data.width} dimensions)`;
 }
 /**
 * Find the latest version of a model in a compatibility table.
 * @param {string} model - The model name.
 * @param {Object} compat - Compatibility table, keyed by spaCy version.
 */
-export const getLatestVersion = (model, compat = {}) => {
+const getLatestVersion = (model, compat = {}) => {
    for (let [spacy_v, models] of Object.entries(compat)) {
        if (models[model]) return models[model][0];
    }
@ -90,7 +102,7 @@ export class ModelLoader {
        const tpl = new Templater(modelId);
        tpl.get('table').removeAttribute('data-loading');
        tpl.get('error').style.display = 'block';
-        for (let key of ['sources', 'pipeline', 'vectors', 'author', 'license']) {
+        for (let key of ['sources', 'pipeline', 'vecs', 'author', 'license']) {
            tpl.get(key).parentElement.parentElement.style.display = 'none';
        }
    }
@ -120,8 +132,8 @@ export class ModelLoader {
        if (author) tpl.fill('author', formats.author(author, url), true);
        if (license) tpl.fill('license', formats.license(license, this.licenses[license]), true);
        if (sources) tpl.fill('sources', formats.sources(sources));
-        if (vectors) tpl.fill('vectors', formats.vectors(vectors));
+        if (vectors) tpl.fill('vecs', formats.vectors(vectors));
-        else tpl.get('vectors').parentElement.parentElement.style.display = 'none';
+        else tpl.get('vecs').parentElement.parentElement.style.display = 'none';
        if (pipeline && pipeline.length) tpl.fill('pipeline', formats.pipeline(pipeline), true);
        else tpl.get('pipeline').parentElement.parentElement.style.display = 'none';
    }
@ -223,8 +235,9 @@ export class ModelComparer {
        const version = getLatestVersion(name, this.compat);
        const modelName = `${name}-${version}`;
        return new Promise((resolve, reject) => {
            if (!version) reject();
            // resolve immediately if model already loaded, e.g. in this.models
-            if (this.models[name]) resolve(this.models[name]);
+            else if (this.models[name]) resolve(this.models[name]);
            else fetch(`${this.url}/meta/${modelName}.json`)
                .then(res => handleResponse(res))
                .then(json => json.ok ? resolve(this.saveModel(name, json)) : reject())
@ -306,12 +319,13 @@ export class ModelComparer {
        this.tpl.fill(`size${i}`, size);
        this.tpl.fill(`desc${i}`, description || 'n/a');
        this.tpl.fill(`pipeline${i}`, formats.pipeline(pipeline), true);
-        this.tpl.fill(`vectors${i}`, formats.vectors(vectors));
+        this.tpl.fill(`vecs${i}`, formats.vectors(vectors));
        this.tpl.fill(`sources${i}`, formats.sources(sources));
        this.tpl.fill(`author${i}`, formats.author(author, url), true);
        this.tpl.fill(`license${i}`, formats.license(license, this.licenses[license]), true);
        // check if model accuracy or speed includes one of the pre-set keys
-        for (let key of [...metaKeys, ...Object.keys(this.benchKeys.speed)]) {
+        const allKeys = [].concat(...Object.entries(this.benchKeys).map(([_, v]) => Object.keys(v)));
        for (let key of allKeys) {
            if (accuracy[key]) this.tpl.fill(`${key}${i}`, accuracy[key].toFixed(2))
            else if (speed[key]) this.tpl.fill(`${key}${i}`, convertNumber(Math.round(speed[key])))
            else this.tpl.fill(`${key}${i}`, 'n/a')
--- a/website/assets/js/util.js
+++ b/website/assets/js/util.js
@ -59,11 +59,12 @@ export const convertNumber = (num = 0, separator = ',') =>
 * @param {number|string} num - The number to convert.
 * @param {number} fixed - Number of decimals.
 */
-export const abbrNumber = (num = 0, fixed = 2) => {
+export const abbrNumber = (num = 0, fixed = 1) => {
    const suffixes = ['', 'k', 'm', 'b', 't'];
    if (num === null || num === 0) return 0;
    const b = num.toPrecision(2).split('e');
    const k = (b.length === 1) ? 0 : Math.floor(Math.min(b[1].slice(1), 14) / 3);
-    const c = (k < 1) ? num.toFixed(fixed) : (num / Math.pow(10, k * 3)).toFixed(fixed + 1);
+    const n = (k < 1) ? num : num / Math.pow(10, k * 3);
    const c = (k >= 1 && n >= 100 ) ? Math.round(n) : n.toFixed(fixed);
    return (c < 0 ? c : Math.abs(c)) + suffixes[k];
 }
--- a/website/models/_data.json
+++ b/website/models/_data.json
@ -12,6 +12,7 @@
            "Portuguese": "pt",
            "French": "fr",
            "Italian": "it",
            "Dutch": "nl",
            "Multi-Language": "xx"
        }
    },
@ -39,12 +40,13 @@
    },
    "MODELS": {
-        "en": ["en_core_web_sm", "en_core_web_lg", "en_vectors_web_lg"],
+        "en": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_vectors_web_lg"],
-        "de": ["de_dep_news_sm"],
+        "de": ["de_core_news_sm", "de_core_news_md"],
-        "es": ["es_core_web_sm"],
+        "es": ["es_core_news_sm", "es_core_news_md", "es_vectors_web_lg"],
-        "pt": [],
+        "pt": ["pt_core_news_sm"],
-        "fr": [],
+        "fr": ["fr_core_news_sm", "fr_core_news_md", "fr_vectors_web_lg"],
-        "it": [],
+        "it": ["it_core_news_sm"],
        "nl": ["nl_core_news_sm"],
        "xx": ["xx_ent_wiki_sm"]
    },
@ -66,6 +68,7 @@
        "gpu": "words per second on GPU",
        "pipeline": "Processing pipeline components in order",
        "sources": "Sources of training data",
        "vecs": "Word vectors included in the model. Models that only support context vectors compute similarity via the tensors shared with the pipeline.",
        "benchmark_parser": "Parser accuracy",
        "benchmark_ner": "NER accuracy",
        "benchmark_speed": "Speed"
@ -103,6 +106,7 @@
        "pl": "Polish",
        "ro": "Romanian",
        "hr": "Croatian",
        "tr": "Turkish",
        "he": "Hebrew",
        "ga": "Irish",
        "bn": "Bengali",
--- a/website/models/comparison.jade
+++ b/website/models/comparison.jade
@ -53,6 +53,8 @@ div(data-tpl=TPL data-tpl-key="result" style="display: none")
        for label in ["Version", "Size", "Pipeline", "Vectors", "Sources", "Author", "License"]
            - var field = label.toLowerCase()
            if field == "vectors"
                - field = "vecs"
            +row
                +cell.u-nowrap
                    +label=label
--- a/website/models/nl.jade
+++ b/website/models/nl.jade
@ -0,0 +1,6 @@
 //- 💫 DOCS > MODELS > NL
 include ../_includes/_mixins
 //- This is a placeholder. The page is rendered via the template at
 //- /_includes/_page_models.jade.
--- a/website/usage/_spacy-101/_word-vectors.jade
+++ b/website/usage/_spacy-101/_word-vectors.jade
@ -4,9 +4,9 @@ p
    |  Similarity is determined by comparing #[strong word vectors] or "word
    |  embeddings", multi-dimensional meaning representations of a word. Word
    |  vectors can be generated using an algorithm like
-    |  #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. Most of spaCy's
+    |  #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. spaCy's medium
-    |  #[+a("/models") default models] come with
+    |  #[code md] and large #[code lg] #[+a("/models") models] come with
-    |  #[strong 300-dimensional vectors] that look like this:
+    |  #[strong multi-dimensional vectors] that look like this:
 +code("banana.vector", false, false, 250).
    array([2.02280000e-01,  -7.66180009e-02,   3.70319992e-01,
--- a/website/usage/_vectors-similarity/_basics.jade
+++ b/website/usage/_vectors-similarity/_basics.jade
@ -4,12 +4,9 @@
    |  Dense, real valued vectors representing distributional similarity
    |  information are now a cornerstone of practical NLP. The most common way
    |  to train these vectors is the #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]
-    |  family of algorithms. The default
+    |  family of algorithms. If you need to train a word2vec model, we recommend
-    |  #[+a("/models/en") English model] installs
+    |  the implementation in the Python library
-    |  300-dimensional vectors trained on the
+    |  #[+a("https://radimrehurek.com/gensim/") Gensim].
-    |  #[+a("http://commoncrawl.org") Common Crawl] corpus.
-    |  If you need to train a word2vec model, we recommend the implementation in
-    |  the Python library #[+a("https://radimrehurek.com/gensim/") Gensim].
 include ../_spacy-101/_similarity
 include ../_spacy-101/_word-vectors