Merge branch 'develop' of https://github.com/explosion/spaCy into develop
commit 906aece532
@@ -66,8 +66,7 @@
 { "id": 3, "title": "3.x", "checked": true }]
 },
 { "id": "config", "title": "Configuration", "multiple": true, "options": [
-{"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" },
-{"id": "gpu", "title": "GPU", "help": "Run spaCy on GPU to make it faster. Requires an NVDIA graphics card with CUDA 2+. See section below for more info."}]
+{"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" }]
 },
 { "id": "model", "title": "Models", "multiple": true }
 ],
@@ -243,13 +243,12 @@ mixin code(label, language, prompt, height, icon, wrap)
 pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes)
 if label
 h4.u-text-label.u-text-label--dark=label
-- var icon = icon || (prompt == 'accept' || prompt == 'reject')
 if icon
 - var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
 .c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null)
 +icon(icon, 18)

-code.c-code-block__content(class=wrap ? "u-wrap" : null data-prompt=icon ? null : prompt)
+code.c-code-block__content(class=wrap ? "u-wrap" : null data-prompt=prompt)
 block
@@ -262,14 +261,14 @@ mixin code-wrapper()
 //- Code blocks to display old/new versions
 label - [string] ARIA label for block. Defaults to "correct"/"incorrect".

-mixin code-old(label)
+mixin code-old(label, lang, prompt)
 - var label = label || 'incorrect'
-+code(false, false, false, false, "reject").o-block-small(aria-label=label)
++code(false, lang, prompt, false, "reject").o-block-small(aria-label=label)
 block

-mixin code-new(label)
+mixin code-new(label, lang, prompt)
 - var label = label || 'correct'
-+code(false, false, false, false, "accept").o-block-small(aria-label=label)
++code(false, lang, prompt, false, "accept").o-block-small(aria-label=label)
 block
@@ -452,8 +451,8 @@ mixin head-cell()

 //- Table cell (only used within +row in +table)

-mixin cell(align)
-td.c-table__cell.u-text(class=align ? "u-text-" + align : null)&attributes(attributes)
+mixin cell(...style)
+td.c-table__cell.u-text(class=prefixArgs(style, "c-table__cell"))&attributes(attributes)
 block
@@ -71,7 +71,7 @@ for id in CURRENT_MODELS
 +label=label
 if MODEL_META[field]
 | #[+help(MODEL_META[field]).u-color-subtle]
-+cell("right")(data-tpl=id data-tpl-key=field)
++cell("num")(data-tpl=id data-tpl-key=field)
 | n/a

 p.u-text-small.u-color-dark(data-tpl=id data-tpl-key="notes")
@@ -32,6 +32,15 @@
 &:not(:last-child)
     border-right: 1px solid $color-subtle

+&.c-table__cell--num
+    text-align: right
+    font-feature-settings: "tnum"
+    font-variant-numeric: tabular-nums
+
+    & > strong
+        font-feature-settings: initial
+        font-variant-numeric: initial
+

 //- Table head cell
@@ -111,8 +111,8 @@ include _includes/_mixins
 | deliver accuracy in-line with the latest research systems,
 | even when evaluated from raw text. With these innovations, spaCy
 | v2.0's models are #[strong 10× smaller],
-| #[strong 20% more accurate], and #[strong just as fast] as the
-| previous generation.
+| #[strong 20% more accurate], and #[strong even cheaper to run] than
+| the previous generation.

 .o-block-small.u-text-right
 +button("/models", true, "secondary-light") Download models
@@ -20,8 +20,8 @@ include ../_includes/_mixins
 | deliver #[strong accuracy in-line with the latest research systems],
 | even when evaluated from raw text. With these innovations, spaCy
 | v2.0's models are #[strong 10× smaller],
-| #[strong 20% more accurate], and #[strong just as fast] as the
-| previous generation.
+| #[strong 20% more accurate], and #[strong even cheaper to run] than
+| the previous generation.

 include ../usage/_models/_quickstart
@@ -5,41 +5,41 @@
 +cell #[strong spaCy v2.x]
 +cell 2017
 +cell Python / Cython
-+cell("right") #[strong 92.6]
-+cell("right") #[em n/a]
++cell("num") #[strong 92.6]
++cell("num") #[em n/a]
 | #[+help("This table shows speed as benchmarked by Choi et al. We therefore can't provide comparable figures, as we'd be running the benchmark on different hardware.").u-color-dark]

 +row
 +cell #[strong spaCy v1.x]
 +cell 2015
 +cell Python / Cython
-+cell("right") 91.8
-+cell("right") 13,963
++cell("num") 91.8
++cell("num") 13,963

 +row
 +cell ClearNLP
 +cell 2015
 +cell Java
-+cell("right") 91.7
-+cell("right") 10,271
++cell("num") 91.7
++cell("num") 10,271

 +row
 +cell CoreNLP
 +cell 2015
 +cell Java
-+cell("right") 89.6
-+cell("right") 8,602
++cell("num") 89.6
++cell("num") 8,602

 +row
 +cell MATE
 +cell 2015
 +cell Java
-+cell("right") 92.5
-+cell("right") 550
++cell("num") 92.5
++cell("num") 550

 +row
 +cell Turbo
 +cell 2015
 +cell C++
-+cell("right") 92.4
-+cell("right") 349
++cell("num") 92.4
++cell("num") 349
@@ -20,34 +20,34 @@ p
 +row
 +cell #[+a("/models/en#en_core_web_sm") #[code en_core_web_sm]] 2.0.0a8
 each data in ["2.x", "neural"]
-+cell("right")=data
-+cell("right") 91.7
-+cell("right") 85.3
-+cell("right") 97.0
-+cell("right") 10.1k
-+cell("right") #[strong 35 MB]
++cell("num")=data
++cell("num") 91.7
++cell("num") 85.3
++cell("num") 97.0
++cell("num") 10.1k
++cell("num") #[strong 35MB]

 +row
 +cell #[+a("/models/en#en_core_web_lg") #[code en_core_web_lg]] 2.0.0a3
 each data in ["2.x", "neural"]
-+cell("right")=data
-+cell("right") #[strong 91.9]
-+cell("right") #[strong 85.9]
-+cell("right") #[strong 97.2]
-+cell("right") 5.0k
-+cell("right") 812 MB
++cell("num")=data
++cell("num") #[strong 91.9]
++cell("num") #[strong 85.9]
++cell("num") #[strong 97.2]
++cell("num") 10.0k
++cell("num") 812MB

 +row("divider")
 +cell #[code en_core_web_sm] 1.2.0
 each data in ["1.x", "linear", 86.6, 78.5, 96.6]
-+cell("right")=data
-+cell("right") #[strong 25.7k]
-+cell("right") 50 MB
++cell("num")=data
++cell("num") #[strong 25.7k]
++cell("num") 50MB

 +row
 +cell #[code en_core_web_md] 1.2.1
 each data in ["1.x", "linear", 90.6, 81.4, 96.7, "18.8k", "1GB"]
-+cell("right")=data
++cell("num")=data

 +h(4, "benchmarks-models-spanish") Spanish
@@ -59,29 +59,29 @@ p
 +table(["Model", "spaCy", "Type", "UAS", "NER F", "POS", "WPS", "Size"])
 +row
 +cell #[+a("/models/es#es_core_news_sm") #[code es_core_news_sm]] 2.0.0a0
-+cell("right") 2.x
-+cell("right") neural
-+cell("right") 89.8
-+cell("right") 88.7
-+cell("right") #[strong 96.9]
-+cell("right") #[em n/a]
-+cell("right") #[strong 35 MB]
++cell("num") 2.x
++cell("num") neural
++cell("num") 89.8
++cell("num") 88.7
++cell("num") #[strong 96.9]
++cell("num") #[em n/a]
++cell("num") #[strong 35MB]

 +row
 +cell #[+a("/models/es#es_core_news_md") #[code es_core_news_md]] 2.0.0a0
-+cell("right") 2.x
-+cell("right") neural
-+cell("right") #[strong 90.2]
-+cell("right") 89.0
-+cell("right") 97.8
-+cell("right") #[em n/a]
-+cell("right") 93 MB
++cell("num") 2.x
++cell("num") neural
++cell("num") #[strong 90.2]
++cell("num") 89.0
++cell("num") 97.8
++cell("num") #[em n/a]
++cell("num") 93MB

 +row("divider")
 +cell #[code es_core_web_md] 1.1.0
 each data in ["1.x", "linear", 87.5]
-+cell("right")=data
-+cell("right") #[strong 94.2]
-+cell("right") 96.7
-+cell("right") #[em n/a]
-+cell("right") 377 MB
++cell("num")=data
++cell("num") #[strong 94.2]
++cell("num") 96.7
++cell("num") #[em n/a]
++cell("num") 377MB
@@ -50,55 +50,55 @@ p
 +cell spaCy v2.0.0
 +cell 2017
 +cell neural
-+cell("right") 94.48
++cell("num") 94.48

 +row
 +cell spaCy v1.1.0
 +cell 2016
 +cell linear
-+cell("right") 92.80
++cell("num") 92.80

 +row("divider")
 +cell
 +a("https://arxiv.org/pdf/1611.01734.pdf") Dozat and Manning
 +cell 2017
 +cell neural
-+cell("right") #[strong 95.75]
++cell("num") #[strong 95.75]

 +row
 +cell
 +a("http://arxiv.org/abs/1603.06042") Andor et al.
 +cell 2016
 +cell neural
-+cell("right") 94.44
++cell("num") 94.44

 +row
 +cell
 +a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") SyntaxNet Parsey McParseface
 +cell 2016
 +cell neural
-+cell("right") 94.15
++cell("num") 94.15

 +row
 +cell
 +a("http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43800.pdf") Weiss et al.
 +cell 2015
 +cell neural
-+cell("right") 93.91
++cell("num") 93.91

 +row
 +cell
 +a("http://research.google.com/pubs/archive/38148.pdf") Zhang and McDonald
 +cell 2014
 +cell linear
-+cell("right") 93.32
++cell("num") 93.32

 +row
 +cell
 +a("http://www.cs.cmu.edu/~ark/TurboParser/") Martins et al.
 +cell 2013
 +cell linear
-+cell("right") 93.10
++cell("num") 93.10

 +h(4, "ner-accuracy-ontonotes5") NER accuracy (OntoNotes 5, no pre-process)
@@ -113,35 +113,35 @@ p
 +cell spaCy #[+a("/models/en#en_core_web_lg") #[code en_core_web_lg]] v2.0.0a3
 +cell 2017
 +cell neural
-+cell("right") 85.85
++cell("num") 85.85

 +row("divider")
 +cell
 +a("https://arxiv.org/pdf/1702.02098.pdf") Strubell et al.
 +cell 2017
 +cell neural
-+cell("right") #[strong 86.81]
++cell("num") #[strong 86.81]

 +row
 +cell
 +a("https://www.semanticscholar.org/paper/Named-Entity-Recognition-with-Bidirectional-LSTM-C-Chiu-Nichols/10a4db59e81d26b2e0e896d3186ef81b4458b93f") Chiu and Nichols
 +cell 2016
 +cell neural
-+cell("right") 86.19
++cell("num") 86.19

 +row
 +cell
 +a("https://www.semanticscholar.org/paper/A-Joint-Model-for-Entity-Analysis-Coreference-Typi-Durrett-Klein/28eb033eee5f51c5e5389cbb6b777779203a6778") Durrett and Klein
 +cell 2014
 +cell neural
-+cell("right") 84.04
++cell("num") 84.04

 +row
 +cell
 +a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth
 +cell 2009
 +cell linear
-+cell("right") 83.45
++cell("num") 83.45

 +h(3, "spacy-models") Model comparison
@@ -183,24 +183,24 @@ p
 +row
 +cell #[strong spaCy]
 each data in [ "0.2ms", "1ms", "19ms"]
-+cell("right") #[strong=data]
++cell("num") #[strong=data]

 each data in ["1x", "1x", "1x"]
-+cell("right")=data
++cell("num")=data

 +row
 +cell CoreNLP
 each data in ["2ms", "10ms", "49ms", "10x", "10x", "2.6x"]
-+cell("right")=data
++cell("num")=data
 +row
 +cell ZPar
 each data in ["1ms", "8ms", "850ms", "5x", "8x", "44.7x"]
-+cell("right")=data
++cell("num")=data
 +row
 +cell NLTK
 each data in ["4ms", "443ms"]
-+cell("right")=data
-+cell("right") #[em n/a]
++cell("num")=data
++cell("num") #[em n/a]
 each data in ["20x", "443x"]
-+cell("right")=data
-+cell("right") #[em n/a]
++cell("num")=data
++cell("num") #[em n/a]
@@ -79,12 +79,19 @@ p
     python -m spacy validate

 +h(3, "gpu") Run spaCy with GPU
+    +tag experimental

++infobox("Important note", "⚠️")
+    | The instructions below refer to installation with CUDA 8.0. In order to
+    | install with CUDA 9.0, set the environment variable #[code CUDA9=1]
+    | before installing Thinc. You'll also need to adjust the path to the
+    | CUDA runtime.
+
 p
 | As of v2.0, spaCy's comes with neural network models that are implemented
 | in our machine learning library, #[+a(gh("thinc")) Thinc]. For GPU
 | support, we've been grateful to use the work of
-| #[+a("http://chainer.org") Chainer]'s CuPy module, which provides
+| Chainer's #[+a("https://cupy.chainer.org") CuPy] module, which provides
 | a NumPy-compatible interface for GPU arrays.

 p
@@ -93,11 +100,11 @@ p
 | CUDA. Finally, install spaCy.

 +code(false, "bash").
-    export CUDA_HOME=/usr/local/cuda-8.0 # Or wherever your CUDA is
+    export CUDA_HOME=/usr/local/cuda-8.0 # or wherever your CUDA is
     export PATH=$PATH:$CUDA_HOME/bin

     pip install spacy
-    python -c "import thinc.neural.gpu_ops" # Check the GPU ops were built
+    python -c "import thinc.neural.gpu_ops" # check the GPU ops were built

 +h(3, "source") Compile from source
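The import in the hunk above is the simplest way to confirm that Thinc's GPU ops were actually compiled. A minimal standalone sketch of the same check, assuming the CUDA paths were exported as shown (the script name and messages are illustrative, not part of this commit):

    # check_gpu_ops.py (illustrative helper, not part of this commit)
    try:
        import thinc.neural.gpu_ops  # only importable if Thinc compiled its CUDA kernels
        print("GPU ops available")
    except ImportError:
        print("GPU ops missing: check CUDA_HOME and PATH, then reinstall Thinc and spaCy")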
@@ -11,9 +11,6 @@
 +qs({config: 'venv', os: 'linux'}) source .env/bin/activate
 +qs({config: 'venv', os: 'windows'}) .env\Scripts\activate

-+qs({config: 'gpu', os: 'mac'}) export PATH=$PATH:/usr/local/cuda-8.0/bin
-+qs({config: 'gpu', os: 'linux'}) export PATH=$PATH:/usr/local/cuda-8.0/bin
-
 +qs({package: 'pip'}) pip install -U spacy
 +qs({package: 'conda'}) conda install -c conda-forge spacy
@@ -4,9 +4,8 @@ p
 | Similarity is determined by comparing #[strong word vectors] or "word
 | embeddings", multi-dimensional meaning representations of a word. Word
 | vectors can be generated using an algorithm like
-| #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. spaCy's medium
-| #[code md] and large #[code lg] #[+a("/models") models] come with
-| #[strong multi-dimensional vectors] that look like this:
+| #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec] and usually
+| look like this:

 +code("banana.vector", false, false, 250).
     array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01,
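For reference, the similarity() methods discussed in these docs compare exactly these vectors. A minimal sketch, not part of the diff, assuming a model that ships with vectors such as en_core_web_lg is installed:

    import spacy

    nlp = spacy.load('en_core_web_lg')    # assumes the large English model is installed
    doc = nlp(u'dog cat banana')
    print(doc[0].similarity(doc[1]))      # vector similarity between "dog" and "cat"
    print(doc.similarity(nlp(u'fruit')))  # also works on whole docs and spans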
@@ -110,8 +109,21 @@ p
     -2.97650009e-01, 7.89430022e-01, 3.31680000e-01,
     -1.19659996e+00, -4.71559986e-02, 5.31750023e-01], dtype=float32)

++infobox("Important note", "⚠️")
+    | To make them compact and fast, spaCy's small #[+a("/models") models]
+    | (all packages that end in #[code sm]) #[strong don't ship with word vectors], and
+    | only include context-sensitive #[strong tensors]. This means you can
+    | still use the #[code similarity()] methods to compare documents, spans
+    | and tokens – but the result won't be as good, and individual tokens won't
+    | have any vectors assigned. So in order to use #[em real] word vectors,
+    | you need to download a larger model:
+
++code-wrapper
+    +code-new(false, "bash", "$") spacy download en_core_web_lg
+
 p
-| The #[code .vector] attribute will return an object's vector.
+| Models that come with built-in word vectors make them available as the
+| #[+api("token#vector") #[code Token.vector]] attribute.
 | #[+api("doc#vector") #[code Doc.vector]] and
 | #[+api("span#vector") #[code Span.vector]] will default to an average
 | of their token vectors. You can also check if a token has a vector
@@ -119,6 +131,7 @@ p
 | vectors.

 +code.
+    nlp = spacy.load('en_core_web_lg')
     tokens = nlp(u'dog cat banana sasquatch')

     for token in tokens:
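The +code. block above is cut off at the hunk boundary. A minimal sketch of how such a loop can inspect the vectors, using standard Token attributes rather than text taken verbatim from this commit:

    import spacy

    nlp = spacy.load('en_core_web_lg')        # assumes the large English model is installed
    tokens = nlp(u'dog cat banana sasquatch')

    for token in tokens:
        # out-of-vocabulary tokens like "sasquatch" get an all-zero vector (norm 0.0)
        print(token.text, token.has_vector, token.vector_norm, token.is_oov)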
@@ -143,10 +156,9 @@ p
 | they're part of the model's vocabulary, and come with a vector. The word
 | "sasquatch" on the other hand is a lot less common and out-of-vocabulary
 | – so its vector representation consists of 300 dimensions of #[code 0],
-| which means it's practically nonexistent.
-
-p
-| If your application will benefit from a large vocabulary with more
-| vectors, you should consider using one of the
-| #[+a("/models") larger models] instead of the default,
-| smaller ones, which usually come with a clipped vocabulary.
+| which means it's practically nonexistent. If your application will
+| benefit from a #[strong large vocabulary] with more vectors, you should
+| consider using one of the larger models or loading in a full vector
+| package, for example,
+| #[+a("/models/en#en_vectors_web_lg") #[code en_vectors_web_lg]], which
+| includes over #[strong 1 million unique vectors].
@@ -10,8 +10,9 @@ p
 +h(3, "features-models") Convolutional neural network models

 +aside-code("Example", "bash")
-    for model in ["en", "de", "fr", "es", "pt", "it"]
-        | spacy download #{model} # default #{LANGUAGES[model]} model!{'\n'}
+    for _, lang in MODELS
+        if lang != "xx"
+            | spacy download #{lang} # default #{LANGUAGES[lang]} model!{'\n'}
     | spacy download xx_ent_wiki_sm # multi-language NER

 p
@@ -20,14 +21,22 @@ p
 | been designed and implemented from scratch specifically for spaCy, to
 | give you an unmatched balance of speed, size and accuracy. The new
 | models are #[strong 10× smaller], #[strong 20% more accurate],
-| and #[strong just as fast] as the previous generation.
-| #[strong GPU usage] is now supported via
-| #[+a("http://chainer.org") Chainer]'s CuPy module.
+| and #[strong even cheaper to run] than the previous generation.
+
+p
+| spaCy v2.0's new neural network models bring significant improvements in
+| accuracy, especially for English Named Entity Recognition. The new
+| #[+a("/models/en#en_core_web_lg") #[code en_core_web_lg]] model makes
+| about #[strong 25% fewer mistakes] than the corresponding v1.x model and
+| is within #[strong 1% of the current state-of-the-art]
+| (#[+a("https://arxiv.org/pdf/1702.02098.pdf") Strubell et al., 2017]).
+| The v2.0 models are also cheaper to run at scale, as they require
+| #[strong under 1 GB of memory] per process.

 +infobox
 | #[+label-inline Usage:] #[+a("/models") Models directory],
 | #[+a("/models/comparison") Models comparison],
-| #[+a("/usage/#gpu") Using spaCy with GPU]
+| #[+a("#benchmarks") Benchmarks]

 +h(3, "features-pipelines") Improved processing pipelines
@@ -22,7 +22,7 @@ p
 | #[strong deep learning-powered models] for spaCy's tagger,
 | parser and entity recognizer. The new models are
 | #[strong 10× smaller], #[strong 20% more accurate] and
-| just as fast as the previous generation.
+| #[strong even cheaper to run] than the previous generation.

 p
 | We've also made several usability improvements that are
@@ -3,7 +3,7 @@
 include ../_includes/_mixins

 p
-| As of v1.7.0, models for spaCy can be installed as #[strong Python packages].
+| spaCy's models can be installed as #[strong Python packages].
 | This means that they're a component of your application, just like any
 | other module. They're versioned and can be defined as a dependency in your
 | #[code requirements.txt]. Models can be installed from a download URL or
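Since the paragraph in this last hunk describes models as installable Python packages, here is a minimal sketch, not part of the diff, of how an installed package can be used, either via spacy.load() or as a plain import:

    import spacy

    # load the installed model package by name
    nlp = spacy.load('en_core_web_sm')

    # or import the model package directly, like any other Python module
    import en_core_web_sm
    nlp = en_core_web_sm.load()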