Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2017-11-06 22:09:20 +01:00
commit 906aece532
15 changed files with 141 additions and 109 deletions

View File

@ -66,8 +66,7 @@
{ "id": 3, "title": "3.x", "checked": true }]
},
{ "id": "config", "title": "Configuration", "multiple": true, "options": [
{"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" },
{"id": "gpu", "title": "GPU", "help": "Run spaCy on GPU to make it faster. Requires an NVDIA graphics card with CUDA 2+. See section below for more info."}]
{"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" }]
},
{ "id": "model", "title": "Models", "multiple": true }
],

View File

@ -243,13 +243,12 @@ mixin code(label, language, prompt, height, icon, wrap)
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes)
if label
h4.u-text-label.u-text-label--dark=label
- var icon = icon || (prompt == 'accept' || prompt == 'reject')
if icon
- var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
.c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null)
+icon(icon, 18)
code.c-code-block__content(class=wrap ? "u-wrap" : null data-prompt=icon ? null : prompt)
code.c-code-block__content(class=wrap ? "u-wrap" : null data-prompt=prompt)
block
@ -262,14 +261,14 @@ mixin code-wrapper()
//- Code blocks to display old/new versions
label - [string] ARIA label for block. Defaults to "correct"/"incorrect".
mixin code-old(label)
mixin code-old(label, lang, prompt)
- var label = label || 'incorrect'
+code(false, false, false, false, "reject").o-block-small(aria-label=label)
+code(false, lang, prompt, false, "reject").o-block-small(aria-label=label)
block
mixin code-new(label)
mixin code-new(label, lang, prompt)
- var label = label || 'correct'
+code(false, false, false, false, "accept").o-block-small(aria-label=label)
+code(false, lang, prompt, false, "accept").o-block-small(aria-label=label)
block
@ -452,8 +451,8 @@ mixin head-cell()
//- Table cell (only used within +row in +table)
mixin cell(align)
td.c-table__cell.u-text(class=align ? "u-text-" + align : null)&attributes(attributes)
mixin cell(...style)
td.c-table__cell.u-text(class=prefixArgs(style, "c-table__cell"))&attributes(attributes)
block

View File

@ -71,7 +71,7 @@ for id in CURRENT_MODELS
+label=label
if MODEL_META[field]
| #[+help(MODEL_META[field]).u-color-subtle]
+cell("right")(data-tpl=id data-tpl-key=field)
+cell("num")(data-tpl=id data-tpl-key=field)
| n/a
p.u-text-small.u-color-dark(data-tpl=id data-tpl-key="notes")

View File

@ -32,6 +32,15 @@
&:not(:last-child)
border-right: 1px solid $color-subtle
&.c-table__cell--num
text-align: right
font-feature-settings: "tnum"
font-variant-numeric: tabular-nums
& > strong
font-feature-settings: initial
font-variant-numeric: initial
//- Table head cell

View File

@ -111,8 +111,8 @@ include _includes/_mixins
| deliver accuracy in line with the latest research systems,
| even when evaluated from raw text. With these innovations, spaCy
| v2.0's models are #[strong 10× smaller],
| #[strong 20% more accurate], and #[strong just as fast] as the
| previous generation.
| #[strong 20% more accurate], and #[strong even cheaper to run] than
| the previous generation.
.o-block-small.u-text-right
+button("/models", true, "secondary-light") Download models

View File

@ -20,8 +20,8 @@ include ../_includes/_mixins
| deliver #[strong accuracy in line with the latest research systems],
| even when evaluated from raw text. With these innovations, spaCy
| v2.0's models are #[strong 10× smaller],
| #[strong 20% more accurate], and #[strong just as fast] as the
| previous generation.
| #[strong 20% more accurate], and #[strong even cheaper to run] than
| the previous generation.
include ../usage/_models/_quickstart

View File

@ -5,41 +5,41 @@
+cell #[strong spaCy v2.x]
+cell 2017
+cell Python / Cython
+cell("right") #[strong 92.6]
+cell("right") #[em n/a]
+cell("num") #[strong 92.6]
+cell("num") #[em n/a]
| #[+help("This table shows speed as benchmarked by Choi et al. We therefore can't provide comparable figures, as we'd be running the benchmark on different hardware.").u-color-dark]
+row
+cell #[strong spaCy v1.x]
+cell 2015
+cell Python / Cython
+cell("right") 91.8
+cell("right") 13,963
+cell("num") 91.8
+cell("num") 13,963
+row
+cell ClearNLP
+cell 2015
+cell Java
+cell("right") 91.7
+cell("right") 10,271
+cell("num") 91.7
+cell("num") 10,271
+row
+cell CoreNLP
+cell 2015
+cell Java
+cell("right") 89.6
+cell("right") 8,602
+cell("num") 89.6
+cell("num") 8,602
+row
+cell MATE
+cell 2015
+cell Java
+cell("right") 92.5
+cell("right") 550
+cell("num") 92.5
+cell("num") 550
+row
+cell Turbo
+cell 2015
+cell C++
+cell("right") 92.4
+cell("right") 349
+cell("num") 92.4
+cell("num") 349

View File

@ -20,34 +20,34 @@ p
+row
+cell #[+a("/models/en#en_core_web_sm") #[code en_core_web_sm]] 2.0.0a8
each data in ["2.x", "neural"]
+cell("right")=data
+cell("right") 91.7
+cell("right") 85.3
+cell("right") 97.0
+cell("right") 10.1k
+cell("right") #[strong 35 MB]
+cell("num")=data
+cell("num") 91.7
+cell("num") 85.3
+cell("num") 97.0
+cell("num") 10.1k
+cell("num") #[strong 35MB]
+row
+cell #[+a("/models/en#en_core_web_lg") #[code en_core_web_lg]] 2.0.0a3
each data in ["2.x", "neural"]
+cell("right")=data
+cell("right") #[strong 91.9]
+cell("right") #[strong 85.9]
+cell("right") #[strong 97.2]
+cell("right") 5.0k
+cell("right") 812 MB
+cell("num")=data
+cell("num") #[strong 91.9]
+cell("num") #[strong 85.9]
+cell("num") #[strong 97.2]
+cell("num") 10.0k
+cell("num") 812MB
+row("divider")
+cell #[code en_core_web_sm] 1.2.0
each data in ["1.x", "linear", 86.6, 78.5, 96.6]
+cell("right")=data
+cell("right") #[strong 25.7k]
+cell("right") 50 MB
+cell("num")=data
+cell("num") #[strong 25.7k]
+cell("num") 50MB
+row
+cell #[code en_core_web_md] 1.2.1
each data in ["1.x", "linear", 90.6, 81.4, 96.7, "18.8k", "1GB"]
+cell("right")=data
+cell("num")=data
+h(4, "benchmarks-models-spanish") Spanish
@ -59,29 +59,29 @@ p
+table(["Model", "spaCy", "Type", "UAS", "NER F", "POS", "WPS", "Size"])
+row
+cell #[+a("/models/es#es_core_news_sm") #[code es_core_news_sm]] 2.0.0a0
+cell("right") 2.x
+cell("right") neural
+cell("right") 89.8
+cell("right") 88.7
+cell("right") #[strong 96.9]
+cell("right") #[em n/a]
+cell("right") #[strong 35 MB]
+cell("num") 2.x
+cell("num") neural
+cell("num") 89.8
+cell("num") 88.7
+cell("num") #[strong 96.9]
+cell("num") #[em n/a]
+cell("num") #[strong 35MB]
+row
+cell #[+a("/models/es#es_core_news_md") #[code es_core_news_md]] 2.0.0a0
+cell("right") 2.x
+cell("right") neural
+cell("right") #[strong 90.2]
+cell("right") 89.0
+cell("right") 97.8
+cell("right") #[em n/a]
+cell("right") 93 MB
+cell("num") 2.x
+cell("num") neural
+cell("num") #[strong 90.2]
+cell("num") 89.0
+cell("num") 97.8
+cell("num") #[em n/a]
+cell("num") 93MB
+row("divider")
+cell #[code es_core_web_md] 1.1.0
each data in ["1.x", "linear", 87.5]
+cell("right")=data
+cell("right") #[strong 94.2]
+cell("right") 96.7
+cell("right") #[em n/a]
+cell("right") 377 MB
+cell("num")=data
+cell("num") #[strong 94.2]
+cell("num") 96.7
+cell("num") #[em n/a]
+cell("num") 377MB

View File

@ -50,55 +50,55 @@ p
+cell spaCy v2.0.0
+cell 2017
+cell neural
+cell("right") 94.48
+cell("num") 94.48
+row
+cell spaCy v1.1.0
+cell 2016
+cell linear
+cell("right") 92.80
+cell("num") 92.80
+row("divider")
+cell
+a("https://arxiv.org/pdf/1611.01734.pdf") Dozat and Manning
+cell 2017
+cell neural
+cell("right") #[strong 95.75]
+cell("num") #[strong 95.75]
+row
+cell
+a("http://arxiv.org/abs/1603.06042") Andor et al.
+cell 2016
+cell neural
+cell("right") 94.44
+cell("num") 94.44
+row
+cell
+a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") SyntaxNet Parsey McParseface
+cell 2016
+cell neural
+cell("right") 94.15
+cell("num") 94.15
+row
+cell
+a("http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43800.pdf") Weiss et al.
+cell 2015
+cell neural
+cell("right") 93.91
+cell("num") 93.91
+row
+cell
+a("http://research.google.com/pubs/archive/38148.pdf") Zhang and McDonald
+cell 2014
+cell linear
+cell("right") 93.32
+cell("num") 93.32
+row
+cell
+a("http://www.cs.cmu.edu/~ark/TurboParser/") Martins et al.
+cell 2013
+cell linear
+cell("right") 93.10
+cell("num") 93.10
+h(4, "ner-accuracy-ontonotes5") NER accuracy (OntoNotes 5, no pre-process)
@ -113,35 +113,35 @@ p
+cell spaCy #[+a("/models/en#en_core_web_lg") #[code en_core_web_lg]] v2.0.0a3
+cell 2017
+cell neural
+cell("right") 85.85
+cell("num") 85.85
+row("divider")
+cell
+a("https://arxiv.org/pdf/1702.02098.pdf") Strubell et al.
+cell 2017
+cell neural
+cell("right") #[strong 86.81]
+cell("num") #[strong 86.81]
+row
+cell
+a("https://www.semanticscholar.org/paper/Named-Entity-Recognition-with-Bidirectional-LSTM-C-Chiu-Nichols/10a4db59e81d26b2e0e896d3186ef81b4458b93f") Chiu and Nichols
+cell 2016
+cell neural
+cell("right") 86.19
+cell("num") 86.19
+row
+cell
+a("https://www.semanticscholar.org/paper/A-Joint-Model-for-Entity-Analysis-Coreference-Typi-Durrett-Klein/28eb033eee5f51c5e5389cbb6b777779203a6778") Durrett and Klein
+cell 2014
+cell neural
+cell("right") 84.04
+cell("num") 84.04
+row
+cell
+a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth
+cell 2009
+cell linear
+cell("right") 83.45
+cell("num") 83.45
+h(3, "spacy-models") Model comparison
@ -183,24 +183,24 @@ p
+row
+cell #[strong spaCy]
each data in [ "0.2ms", "1ms", "19ms"]
+cell("right") #[strong=data]
+cell("num") #[strong=data]
each data in ["1x", "1x", "1x"]
+cell("right")=data
+cell("num")=data
+row
+cell CoreNLP
each data in ["2ms", "10ms", "49ms", "10x", "10x", "2.6x"]
+cell("right")=data
+cell("num")=data
+row
+cell ZPar
each data in ["1ms", "8ms", "850ms", "5x", "8x", "44.7x"]
+cell("right")=data
+cell("num")=data
+row
+cell NLTK
each data in ["4ms", "443ms"]
+cell("right")=data
+cell("right") #[em n/a]
+cell("num")=data
+cell("num") #[em n/a]
each data in ["20x", "443x"]
+cell("right")=data
+cell("right") #[em n/a]
+cell("num")=data
+cell("num") #[em n/a]

View File

@ -79,12 +79,19 @@ p
python -m spacy validate
+h(3, "gpu") Run spaCy with GPU
+tag experimental
+infobox("Important note", "⚠️")
| The instructions below refer to installation with CUDA 8.0. In order to
| install with CUDA 9.0, set the environment variable #[code CUDA9=1]
| before installing Thinc. You'll also need to adjust the path to the
| CUDA runtime.
p
| As of v2.0, spaCy comes with neural network models that are implemented
| in our machine learning library, #[+a(gh("thinc")) Thinc]. For GPU
| support, we've been grateful to use the work of
| #[+a("http://chainer.org") Chainer]'s CuPy module, which provides
| Chainer's #[+a("https://cupy.chainer.org") CuPy] module, which provides
| a NumPy-compatible interface for GPU arrays.
p
@ -93,11 +100,11 @@ p
| CUDA. Finally, install spaCy.
+code(false, "bash").
export CUDA_HOME=/usr/local/cuda-8.0 # Or wherever your CUDA is
export CUDA_HOME=/usr/local/cuda-8.0 # or wherever your CUDA is
export PATH=$PATH:$CUDA_HOME/bin
pip install spacy
python -c "import thinc.neural.gpu_ops" # Check the GPU ops were built
python -c "import thinc.neural.gpu_ops" # check the GPU ops were built
+h(3, "source") Compile from source

View File

@ -11,9 +11,6 @@
+qs({config: 'venv', os: 'linux'}) source .env/bin/activate
+qs({config: 'venv', os: 'windows'}) .env\Scripts\activate
+qs({config: 'gpu', os: 'mac'}) export PATH=$PATH:/usr/local/cuda-8.0/bin
+qs({config: 'gpu', os: 'linux'}) export PATH=$PATH:/usr/local/cuda-8.0/bin
+qs({package: 'pip'}) pip install -U spacy
+qs({package: 'conda'}) conda install -c conda-forge spacy

View File

@ -4,9 +4,8 @@ p
| Similarity is determined by comparing #[strong word vectors] or "word
| embeddings", multi-dimensional meaning representations of a word. Word
| vectors can be generated using an algorithm like
| #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. spaCy's medium
| #[code md] and large #[code lg] #[+a("/models") models] come with
| #[strong multi-dimensional vectors] that look like this:
| #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec] and usually
| look like this:
+code("banana.vector", false, false, 250).
array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01,
@ -110,8 +109,21 @@ p
-2.97650009e-01, 7.89430022e-01, 3.31680000e-01,
-1.19659996e+00, -4.71559986e-02, 5.31750023e-01], dtype=float32)
+infobox("Important note", "⚠️")
| To make them compact and fast, spaCy's small #[+a("/models") models]
| (all packages that end in #[code sm]) #[strong don't ship with word vectors], and
| only include context-sensitive #[strong tensors]. This means you can
| still use the #[code similarity()] methods to compare documents, spans
| and tokens, but the results won't be as good, and individual tokens won't
| have any vectors assigned. So in order to use #[em real] word vectors,
| you need to download a larger model:
+code-wrapper
+code-new(false, "bash", "$") spacy download en_core_web_lg
p
| The #[code .vector] attribute will return an object's vector.
| Models that come with built-in word vectors make them available as the
| #[+api("token#vector") #[code Token.vector]] attribute.
| #[+api("doc#vector") #[code Doc.vector]] and
| #[+api("span#vector") #[code Span.vector]] will default to an average
| of their token vectors. You can also check if a token has a vector
@ -119,6 +131,7 @@ p
| vectors.
+code.
nlp = spacy.load('en_core_web_lg')
tokens = nlp(u'dog cat banana sasquatch')
for token in tokens:
@ -143,10 +156,9 @@ p
| they're part of the model's vocabulary, and come with a vector. The word
| "sasquatch" on the other hand is a lot less common and out-of-vocabulary
| so its vector representation consists of 300 dimensions of #[code 0],
| which means it's practically nonexistent.
p
| If your application will benefit from a large vocabulary with more
| vectors, you should consider using one of the
| #[+a("/models") larger models] instead of the default,
| smaller ones, which usually come with a clipped vocabulary.
| which means it's practically nonexistent. If your application will
| benefit from a #[strong large vocabulary] with more vectors, you should
| consider using one of the larger models or loading in a full vector
| package, for example,
| #[+a("/models/en#en_vectors_web_lg") #[code en_vectors_web_lg]], which
| includes over #[strong 1 million unique vectors].

View File

@ -10,8 +10,9 @@ p
+h(3, "features-models") Convolutional neural network models
+aside-code("Example", "bash")
for model in ["en", "de", "fr", "es", "pt", "it"]
| spacy download #{model} # default #{LANGUAGES[model]} model!{'\n'}
for _, lang in MODELS
if lang != "xx"
| spacy download #{lang} # default #{LANGUAGES[lang]} model!{'\n'}
| spacy download xx_ent_wiki_sm # multi-language NER
p
@ -20,14 +21,22 @@ p
| been designed and implemented from scratch specifically for spaCy, to
| give you an unmatched balance of speed, size and accuracy. The new
| models are #[strong 10× smaller], #[strong 20% more accurate],
| and #[strong just as fast] as the previous generation.
| #[strong GPU usage] is now supported via
| #[+a("http://chainer.org") Chainer]'s CuPy module.
| and #[strong even cheaper to run] than the previous generation.
p
| spaCy v2.0's new neural network models bring significant improvements in
| accuracy, especially for English Named Entity Recognition. The new
| #[+a("/models/en#en_core_web_lg") #[code en_core_web_lg]] model makes
| about #[strong 25% fewer mistakes] than the corresponding v1.x model and
| is within #[strong 1% of the current state-of-the-art]
| (#[+a("https://arxiv.org/pdf/1702.02098.pdf") Strubell et al., 2017]).
| The v2.0 models are also cheaper to run at scale, as they require
| #[strong under 1 GB of memory] per process.
+infobox
| #[+label-inline Usage:] #[+a("/models") Models directory],
| #[+a("/models/comparison") Models comparison],
| #[+a("/usage/#gpu") Using spaCy with GPU]
| #[+a("#benchmarks") Benchmarks]
+h(3, "features-pipelines") Improved processing pipelines

View File

@ -22,7 +22,7 @@ p
| #[strong deep learning-powered models] for spaCy's tagger,
| parser and entity recognizer. The new models are
| #[strong 10× smaller], #[strong 20% more accurate] and
| just as fast as the previous generation.
| #[strong even cheaper to run] than the previous generation.
p
| We've also made several usability improvements that are

View File

@ -3,7 +3,7 @@
include ../_includes/_mixins
p
| As of v1.7.0, models for spaCy can be installed as #[strong Python packages].
| spaCy's models can be installed as #[strong Python packages].
| This means that they're a component of your application, just like any
| other module. They're versioned and can be defined as a dependency in your
| #[code requirements.txt]. Models can be installed from a download URL or