Merge branch 'develop' of https://github.com/explosion/spaCy into develop

2025-08-02 03:10:22 +03:00 · 2017-11-06 22:09:20 +01:00 · 2017-11-06 22:09:20 +01:00 · 906aece532
commit 906aece532
parent dd90fe09f5 6447b8e396
15 changed files with 141 additions and 109 deletions
--- a/website/_harp.json
+++ b/website/_harp.json
@ -66,8 +66,7 @@
                { "id": 3, "title": "3.x", "checked": true }]
            },
            { "id": "config", "title": "Configuration", "multiple": true, "options": [
-                {"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" },
-                {"id": "gpu", "title": "GPU", "help": "Run spaCy on GPU to make it faster. Requires an NVDIA graphics card with CUDA 2+. See section below for more info."}]
+                {"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" }]
            },
            { "id": "model", "title": "Models", "multiple": true }
        ],
--- a/website/_includes/_mixins.jade
+++ b/website/_includes/_mixins.jade
@ -243,13 +243,12 @@ mixin code(label, language, prompt, height, icon, wrap)
    pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes)
        if label
            h4.u-text-label.u-text-label--dark=label
-        - var icon = icon || (prompt == 'accept' || prompt == 'reject')
        if icon
            - var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
            .c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null)
                +icon(icon, 18)

-        code.c-code-block__content(class=wrap ? "u-wrap" : null data-prompt=icon ? null : prompt)
+        code.c-code-block__content(class=wrap ? "u-wrap" : null data-prompt=prompt)
            block


@ -262,14 +261,14 @@ mixin code-wrapper()
 //- Code blocks to display old/new versions
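    label  - [string] ARIA label for block. Defaults to "correct"/"incorrect".
    lang   - [string] Language of the code block, passed through to +code.
    prompt - [string] Prompt of the code block, passed through to +code.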

-mixin code-old(label)
+mixin code-old(label, lang, prompt)
    - var label = label || 'incorrect'
-    +code(false, false, false, false, "reject").o-block-small(aria-label=label)
+    +code(false, lang, prompt, false, "reject").o-block-small(aria-label=label)
        block

-mixin code-new(label)
+mixin code-new(label, lang, prompt)
    - var label = label || 'correct'
-    +code(false, false, false, false, "accept").o-block-small(aria-label=label)
+    +code(false, lang, prompt, false, "accept").o-block-small(aria-label=label)
        block


@ -452,8 +451,8 @@ mixin head-cell()

 //- Table cell (only used within +row in +table)

-mixin cell(align)
-    td.c-table__cell.u-text(class=align ? "u-text-" + align : null)&attributes(attributes)
+mixin cell(...style)
+    td.c-table__cell.u-text(class=prefixArgs(style, "c-table__cell"))&attributes(attributes)
        block


--- a/website/_includes/_page_models.jade
+++ b/website/_includes/_page_models.jade
@ -71,7 +71,7 @@ for id in CURRENT_MODELS
                                        +label=label
                                            if MODEL_META[field]
                                                |  #[+help(MODEL_META[field]).u-color-subtle]
-                                    +cell("right")(data-tpl=id data-tpl-key=field)
+                                    +cell("num")(data-tpl=id data-tpl-key=field)
                                        |  n/a

        p.u-text-small.u-color-dark(data-tpl=id data-tpl-key="notes")
--- a/website/assets/css/_components/_tables.sass
+++ b/website/assets/css/_components/_tables.sass
@ -32,6 +32,15 @@
    &:not(:last-child)
        border-right: 1px solid $color-subtle

+    &.c-table__cell--num
+        text-align: right
+        font-feature-settings: "tnum"
+        font-variant-numeric: tabular-nums
+
+        & > strong
+            font-feature-settings: initial
+            font-variant-numeric: initial
+

 //- Table head cell

--- a/website/index.jade
+++ b/website/index.jade
@ -111,8 +111,8 @@ include _includes/_mixins
        |  deliver accuracy in line with the latest research systems,
        |  even when evaluated from raw text. With these innovations, spaCy
        |  v2.0's models are #[strong 10&times; smaller],
-        |  #[strong 20% more accurate], and #[strong just as fast] as the
-        |  previous generation.
+        |  #[strong 20% more accurate], and #[strong even cheaper to run] than
+        |  the previous generation.

    .o-block-small.u-text-right
        +button("/models", true, "secondary-light") Download models
--- a/website/models/index.jade
+++ b/website/models/index.jade
@ -20,8 +20,8 @@ include ../_includes/_mixins
        |  deliver #[strong accuracy in line with the latest research systems],
        |  even when evaluated from raw text. With these innovations, spaCy
        |  v2.0's models are #[strong 10&times; smaller],
-        |  #[strong 20% more accurate], and #[strong just as fast] as the
-        |  previous generation.
+        |  #[strong 20% more accurate], and #[strong even cheaper to run] than
+        |  the previous generation.

    include ../usage/_models/_quickstart

--- a/website/usage/_facts-figures/_benchmarks-choi-2015.jade
+++ b/website/usage/_facts-figures/_benchmarks-choi-2015.jade
@ -5,41 +5,41 @@
        +cell #[strong spaCy v2.x]
        +cell 2017
        +cell Python / Cython
-        +cell("right") #[strong 92.6]
-        +cell("right") #[em n/a]
+        +cell("num") #[strong 92.6]
+        +cell("num") #[em n/a]
            |  #[+help("This table shows speed as benchmarked by Choi et al. We therefore can't provide comparable figures, as we'd be running the benchmark on different hardware.").u-color-dark]

    +row
        +cell #[strong spaCy v1.x]
        +cell 2015
        +cell Python / Cython
-        +cell("right") 91.8
-        +cell("right") 13,963
+        +cell("num") 91.8
+        +cell("num") 13,963

    +row
        +cell ClearNLP
        +cell 2015
        +cell Java
-        +cell("right") 91.7
-        +cell("right") 10,271
+        +cell("num") 91.7
+        +cell("num") 10,271

    +row
        +cell CoreNLP
        +cell 2015
        +cell Java
-        +cell("right") 89.6
-        +cell("right") 8,602
+        +cell("num") 89.6
+        +cell("num") 8,602

    +row
        +cell MATE
        +cell 2015
        +cell Java
-        +cell("right") 92.5
-        +cell("right") 550
+        +cell("num") 92.5
+        +cell("num") 550

    +row
        +cell Turbo
        +cell 2015
        +cell C++
-        +cell("right") 92.4
-        +cell("right") 349
+        +cell("num") 92.4
+        +cell("num") 349
--- a/website/usage/_facts-figures/_benchmarks-models.jade
+++ b/website/usage/_facts-figures/_benchmarks-models.jade
@ -20,34 +20,34 @@ p
    +row
        +cell #[+a("/models/en#en_core_web_sm") #[code en_core_web_sm]] 2.0.0a8
        each data in ["2.x", "neural"]
-            +cell("right")=data
-        +cell("right") 91.7
-        +cell("right") 85.3
-        +cell("right") 97.0
-        +cell("right") 10.1k
-        +cell("right") #[strong 35 MB]
+            +cell("num")=data
+        +cell("num") 91.7
+        +cell("num") 85.3
+        +cell("num") 97.0
+        +cell("num") 10.1k
+        +cell("num") #[strong 35MB]

    +row
        +cell #[+a("/models/en#en_core_web_lg") #[code en_core_web_lg]] 2.0.0a3
        each data in ["2.x", "neural"]
-            +cell("right")=data
-        +cell("right") #[strong 91.9]
-        +cell("right") #[strong 85.9]
-        +cell("right") #[strong 97.2]
-        +cell("right") 5.0k
-        +cell("right") 812 MB
+            +cell("num")=data
+        +cell("num") #[strong 91.9]
+        +cell("num") #[strong 85.9]
+        +cell("num") #[strong 97.2]
+        +cell("num") 10.0k
+        +cell("num") 812MB

    +row("divider")
        +cell #[code en_core_web_sm] 1.2.0
        each data in ["1.x", "linear", 86.6, 78.5, 96.6]
-            +cell("right")=data
-        +cell("right") #[strong 25.7k]
-        +cell("right") 50 MB
+            +cell("num")=data
+        +cell("num") #[strong 25.7k]
+        +cell("num") 50MB

    +row
        +cell #[code en_core_web_md] 1.2.1
-        each data in ["1.x", "linear", 90.6, 81.4, 96.7, "18.8k", "1 GB"]
-            +cell("right")=data
+        each data in ["1.x", "linear", 90.6, 81.4, 96.7, "18.8k", "1GB"]
+            +cell("num")=data

 +h(4, "benchmarks-models-spanish") Spanish

@ -59,29 +59,29 @@ p
 +table(["Model", "spaCy", "Type", "UAS", "NER F", "POS", "WPS", "Size"])
    +row
        +cell #[+a("/models/es#es_core_news_sm") #[code es_core_news_sm]] 2.0.0a0
-        +cell("right") 2.x
-        +cell("right") neural
-        +cell("right") 89.8
-        +cell("right") 88.7
-        +cell("right") #[strong 96.9]
-        +cell("right") #[em n/a]
-        +cell("right") #[strong 35 MB]
+        +cell("num") 2.x
+        +cell("num") neural
+        +cell("num") 89.8
+        +cell("num") 88.7
+        +cell("num") #[strong 96.9]
+        +cell("num") #[em n/a]
+        +cell("num") #[strong 35MB]

    +row
        +cell #[+a("/models/es#es_core_news_md") #[code es_core_news_md]] 2.0.0a0
-        +cell("right") 2.x
-        +cell("right") neural
-        +cell("right") #[strong 90.2]
-        +cell("right") 89.0
-        +cell("right") 97.8
-        +cell("right") #[em n/a]
-        +cell("right") 93 MB
+        +cell("num") 2.x
+        +cell("num") neural
+        +cell("num") #[strong 90.2]
+        +cell("num") 89.0
+        +cell("num") 97.8
+        +cell("num") #[em n/a]
+        +cell("num") 93MB

    +row("divider")
        +cell #[code es_core_web_md] 1.1.0
        each data in ["1.x", "linear", 87.5]
-            +cell("right")=data
-        +cell("right") #[strong 94.2]
-        +cell("right") 96.7
-        +cell("right") #[em n/a]
-        +cell("right") 377 MB
+            +cell("num")=data
+        +cell("num") #[strong 94.2]
+        +cell("num") 96.7
+        +cell("num") #[em n/a]
+        +cell("num") 377MB
--- a/website/usage/_facts-figures/_benchmarks.jade
+++ b/website/usage/_facts-figures/_benchmarks.jade
@ -50,55 +50,55 @@ p
        +cell spaCy v2.0.0
        +cell 2017
        +cell neural
-        +cell("right") 94.48
+        +cell("num") 94.48

    +row
        +cell spaCy v1.1.0
        +cell 2016
        +cell linear
-        +cell("right") 92.80
+        +cell("num") 92.80

    +row("divider")
        +cell
            +a("https://arxiv.org/pdf/1611.01734.pdf") Dozat and Manning
        +cell 2017
        +cell neural
-        +cell("right") #[strong 95.75]
+        +cell("num") #[strong 95.75]

    +row
        +cell
            +a("http://arxiv.org/abs/1603.06042") Andor et al.
        +cell 2016
        +cell neural
-        +cell("right") 94.44
+        +cell("num") 94.44

    +row
        +cell
            +a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") SyntaxNet Parsey McParseface
        +cell 2016
        +cell neural
-        +cell("right") 94.15
+        +cell("num") 94.15

    +row
        +cell
            +a("http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43800.pdf") Weiss et al.
        +cell 2015
        +cell neural
-        +cell("right") 93.91
+        +cell("num") 93.91

    +row
        +cell
            +a("http://research.google.com/pubs/archive/38148.pdf") Zhang and McDonald
        +cell 2014
        +cell linear
-        +cell("right") 93.32
+        +cell("num") 93.32

    +row
        +cell
            +a("http://www.cs.cmu.edu/~ark/TurboParser/") Martins et al.
        +cell 2013
        +cell linear
-        +cell("right") 93.10
+        +cell("num") 93.10

 +h(4, "ner-accuracy-ontonotes5") NER accuracy (OntoNotes 5, no pre-process)

@ -113,35 +113,35 @@ p
        +cell spaCy #[+a("/models/en#en_core_web_lg") #[code en_core_web_lg]] v2.0.0a3
        +cell 2017
        +cell neural
-        +cell("right") 85.85
+        +cell("num") 85.85

    +row("divider")
        +cell
            +a("https://arxiv.org/pdf/1702.02098.pdf") Strubell et al.
        +cell 2017
        +cell neural
-        +cell("right") #[strong 86.81]
+        +cell("num") #[strong 86.81]

    +row
        +cell
            +a("https://www.semanticscholar.org/paper/Named-Entity-Recognition-with-Bidirectional-LSTM-C-Chiu-Nichols/10a4db59e81d26b2e0e896d3186ef81b4458b93f") Chiu and Nichols
        +cell 2016
        +cell neural
-        +cell("right") 86.19
+        +cell("num") 86.19

    +row
        +cell
            +a("https://www.semanticscholar.org/paper/A-Joint-Model-for-Entity-Analysis-Coreference-Typi-Durrett-Klein/28eb033eee5f51c5e5389cbb6b777779203a6778") Durrett and Klein
        +cell 2014
        +cell neural
-        +cell("right") 84.04
+        +cell("num") 84.04

    +row
        +cell
            +a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth
        +cell 2009
        +cell linear
-        +cell("right") 83.45
+        +cell("num") 83.45

 +h(3, "spacy-models") Model comparison

@ -183,24 +183,24 @@ p
    +row
        +cell #[strong spaCy]
        each data in [ "0.2ms", "1ms", "19ms"]
-            +cell("right") #[strong=data]
+            +cell("num") #[strong=data]

        each data in ["1x", "1x", "1x"]
-            +cell("right")=data
+            +cell("num")=data

    +row
        +cell CoreNLP
        each data in ["2ms", "10ms", "49ms", "10x", "10x", "2.6x"]
-            +cell("right")=data
+            +cell("num")=data
    +row
        +cell ZPar
        each data in ["1ms", "8ms", "850ms", "5x", "8x", "44.7x"]
-            +cell("right")=data
+            +cell("num")=data
    +row
        +cell NLTK
        each data in ["4ms", "443ms"]
-            +cell("right")=data
-        +cell("right") #[em n/a]
+            +cell("num")=data
+        +cell("num") #[em n/a]
        each data in ["20x", "443x"]
-            +cell("right")=data
-        +cell("right") #[em n/a]
+            +cell("num")=data
+        +cell("num") #[em n/a]
--- a/website/usage/_install/_instructions.jade
+++ b/website/usage/_install/_instructions.jade
@ -79,12 +79,19 @@ p
    python -m spacy validate

 +h(3, "gpu") Run spaCy with GPU
+    +tag experimental
+
+infobox("Important note", "⚠️")
+    |  The instructions below refer to installation with CUDA 8.0. In order to
+    |  install with CUDA 9.0, set the environment variable #[code CUDA9=1]
+    |  before installing Thinc. You'll also need to adjust the path to the
+    |  CUDA runtime.

 p
    |  As of v2.0, spaCy comes with neural network models that are implemented
    |  in our machine learning library, #[+a(gh("thinc")) Thinc]. For GPU
    |  support, we've been grateful to use the work of
-    |  #[+a("http://chainer.org") Chainer]'s CuPy module, which provides
+    |  Chainer's #[+a("https://cupy.chainer.org") CuPy] module, which provides
    |  a NumPy-compatible interface for GPU arrays.

 p
@ -93,11 +100,11 @@ p
    |  CUDA. Finally, install spaCy.

 +code(false, "bash").
-    export CUDA_HOME=/usr/local/cuda-8.0 # Or wherever your CUDA is
+    export CUDA_HOME=/usr/local/cuda-8.0  # or wherever your CUDA is
    export PATH=$PATH:$CUDA_HOME/bin

    pip install spacy
-    python -c "import thinc.neural.gpu_ops" # Check the GPU ops were built
+    python -c "import thinc.neural.gpu_ops"  # check the GPU ops were built

 +h(3, "source") Compile from source

--- a/website/usage/_install/_quickstart.jade
+++ b/website/usage/_install/_quickstart.jade
@ -11,9 +11,6 @@
    +qs({config: 'venv', os: 'linux'}) source .env/bin/activate
    +qs({config: 'venv', os: 'windows'}) .env\Scripts\activate

-    +qs({config: 'gpu', os: 'mac'}) export PATH=$PATH:/usr/local/cuda-8.0/bin
-    +qs({config: 'gpu', os: 'linux'}) export PATH=$PATH:/usr/local/cuda-8.0/bin
-
    +qs({package: 'pip'}) pip install -U spacy
    +qs({package: 'conda'}) conda install -c conda-forge spacy

--- a/website/usage/_spacy-101/_word-vectors.jade
+++ b/website/usage/_spacy-101/_word-vectors.jade
@ -4,9 +4,8 @@ p
    |  Similarity is determined by comparing #[strong word vectors] or "word
    |  embeddings", multi-dimensional meaning representations of a word. Word
    |  vectors can be generated using an algorithm like
-    |  #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. spaCy's medium
-    |  #[code md] and large #[code lg] #[+a("/models") models] come with
-    |  #[strong multi-dimensional vectors] that look like this:
+    |  #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec] and usually
+    |  look like this:

 +code("banana.vector", false, false, 250).
    array([2.02280000e-01,  -7.66180009e-02,   3.70319992e-01,
@ -110,8 +109,21 @@ p
          -2.97650009e-01,   7.89430022e-01,   3.31680000e-01,
          -1.19659996e+00,  -4.71559986e-02,   5.31750023e-01], dtype=float32)

+infobox("Important note", "⚠️")
+    |  To make them compact and fast, spaCy's small #[+a("/models") models]
+    |  (all packages that end in #[code sm]) #[strong don&apos;t ship with word vectors], and
+    |  only include context-sensitive #[strong tensors]. This means you can
+    |  still use the #[code similarity()] methods to compare documents, spans
+    |  and tokens – but the result won't be as good, and individual tokens won't
+    |  have any vectors assigned. So in order to use #[em real] word vectors,
+    |  you need to download a larger model:
+
+    +code-wrapper
+        +code-new(false, "bash", "$") spacy download en_core_web_lg
+
 p
-    |  The #[code .vector] attribute will return an object's vector.
+    |  Models that come with built-in word vectors make them available as the
+    |  #[+api("token#vector") #[code Token.vector]] attribute.
    |  #[+api("doc#vector") #[code Doc.vector]] and
    |  #[+api("span#vector") #[code Span.vector]] will default to an average
    |  of their token vectors. You can also check if a token has a vector
@ -119,6 +131,7 @@ p
    |  vectors.

 +code.
+    nlp = spacy.load('en_core_web_lg')
    tokens = nlp(u'dog cat banana sasquatch')

    for token in tokens:
@ -143,10 +156,9 @@ p
    |  they're part of the model's vocabulary, and come with a vector. The word
    |  "sasquatch" on the other hand is a lot less common and out-of-vocabulary
    |  – so its vector representation consists of 300 dimensions of #[code 0],
-    |  which means it's practically nonexistent.
-
-p
-    |  If your application will benefit from a large vocabulary with more
-    |  vectors, you should consider using one of the
-    |  #[+a("/models") larger models] instead of the default,
-    |  smaller ones, which usually come with a clipped vocabulary.
+    |  which means it's practically nonexistent. If your application will
+    |  benefit from a #[strong large vocabulary] with more vectors, you should
+    |  consider using one of the larger models or loading in a full vector
+    |  package, for example,
+    |  #[+a("/models/en#en_vectors_web_lg") #[code en_vectors_web_lg]], which
+    |  includes over #[strong 1 million unique vectors].
--- a/website/usage/_v2/_features.jade
+++ b/website/usage/_v2/_features.jade
@ -10,8 +10,9 @@ p
 +h(3, "features-models") Convolutional neural network models

 +aside-code("Example", "bash")
-    for model in ["en", "de", "fr", "es", "pt", "it"]
-        | spacy download #{model}  # default #{LANGUAGES[model]} model!{'\n'}
+    for _, lang in MODELS
+        if lang != "xx"
+            | spacy download #{lang}  # default #{LANGUAGES[lang]} model!{'\n'}
    | spacy download xx_ent_wiki_sm  # multi-language NER

 p
@ -20,14 +21,22 @@ p
    |  been designed and implemented from scratch specifically for spaCy, to
    |  give you an unmatched balance of speed, size and accuracy. The new
    |  models are #[strong 10&times; smaller], #[strong 20% more accurate],
-    |  and #[strong just as fast] as the previous generation.
-    |  #[strong GPU usage] is now supported via
-    |  #[+a("http://chainer.org") Chainer]'s CuPy module.
+    |  and #[strong even cheaper to run] than the previous generation.
+
+p
+    |  spaCy v2.0's new neural network models bring significant improvements in
+    |  accuracy, especially for English Named Entity Recognition. The new
+    |  #[+a("/models/en#en_core_web_lg") #[code en_core_web_lg]] model makes
+    |  about #[strong 25% fewer mistakes] than the corresponding v1.x model and
+    |  is within #[strong 1% of the current state-of-the-art]
+    |  (#[+a("https://arxiv.org/pdf/1702.02098.pdf") Strubell et al., 2017]).
+    |  The v2.0 models are also cheaper to run at scale, as they require
+    |  #[strong under 1 GB of memory] per process.

 +infobox
    |  #[+label-inline Usage:] #[+a("/models") Models directory],
    |  #[+a("/models/comparison") Models comparison],
-    |  #[+a("/usage/#gpu") Using spaCy with GPU]
+    |  #[+a("#benchmarks") Benchmarks]

 +h(3, "features-pipelines") Improved processing pipelines

--- a/website/usage/_v2/_summary.jade
+++ b/website/usage/_v2/_summary.jade
@ -22,7 +22,7 @@ p
            |  #[strong deep learning-powered models] for spaCy's tagger,
            |  parser and entity recognizer. The new models are
            |  #[strong 10&times; smaller], #[strong 20% more accurate] and
-            |  just as fast as the previous generation.
+            |  #[strong even cheaper to run] than the previous generation.

        p
            |  We've also made several usability improvements that are
--- a/website/usage/models.jade
+++ b/website/usage/models.jade
@ -3,7 +3,7 @@
 include ../_includes/_mixins

 p
-    |  As of v1.7.0, models for spaCy can be installed as #[strong Python packages].
+    |  spaCy's models can be installed as #[strong Python packages].
    |  This means that they're a component of your application, just like any
    |  other module. They're versioned and can be defined as a dependency in your
    |  #[code requirements.txt]. Models can be installed from a download URL or