diff --git a/website/_harp.json b/website/_harp.json index fc4651c75..8cd9bbbf4 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -66,8 +66,7 @@ { "id": 3, "title": "3.x", "checked": true }] }, { "id": "config", "title": "Configuration", "multiple": true, "options": [ - {"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" }, - {"id": "gpu", "title": "GPU", "help": "Run spaCy on GPU to make it faster. Requires an NVDIA graphics card with CUDA 2+. See section below for more info."}] + {"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" }] }, { "id": "model", "title": "Models", "multiple": true } ], diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index 31ab6ff2b..fba383ee0 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -243,13 +243,12 @@ mixin code(label, language, prompt, height, icon, wrap) pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes) if label h4.u-text-label.u-text-label--dark=label - - var icon = icon || (prompt == 'accept' || prompt == 'reject') if icon - var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'} .c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null) +icon(icon, 18) - code.c-code-block__content(class=wrap ? "u-wrap" : null data-prompt=icon ? null : prompt) + code.c-code-block__content(class=wrap ? "u-wrap" : null data-prompt=prompt) block @@ -262,14 +261,14 @@ mixin code-wrapper() //- Code blocks to display old/new versions label - [string] ARIA label for block. Defaults to "correct"/"incorrect". -mixin code-old(label) +mixin code-old(label, lang, prompt) - var label = label || 'incorrect' - +code(false, false, false, false, "reject").o-block-small(aria-label=label) + +code(false, lang, prompt, false, "reject").o-block-small(aria-label=label) block -mixin code-new(label) +mixin code-new(label, lang, prompt) - var label = label || 'correct' - +code(false, false, false, false, "accept").o-block-small(aria-label=label) + +code(false, lang, prompt, false, "accept").o-block-small(aria-label=label) block @@ -452,8 +451,8 @@ mixin head-cell() //- Table cell (only used within +row in +table) -mixin cell(align) - td.c-table__cell.u-text(class=align ? "u-text-" + align : null)&attributes(attributes) +mixin cell(...style) + td.c-table__cell.u-text(class=prefixArgs(style, "c-table__cell"))&attributes(attributes) block diff --git a/website/_includes/_page_models.jade b/website/_includes/_page_models.jade index 9edf67837..e0dcf2b1a 100644 --- a/website/_includes/_page_models.jade +++ b/website/_includes/_page_models.jade @@ -71,7 +71,7 @@ for id in CURRENT_MODELS +label=label if MODEL_META[field] | #[+help(MODEL_META[field]).u-color-subtle] - +cell("right")(data-tpl=id data-tpl-key=field) + +cell("num")(data-tpl=id data-tpl-key=field) | n/a p.u-text-small.u-color-dark(data-tpl=id data-tpl-key="notes") diff --git a/website/assets/css/_components/_tables.sass b/website/assets/css/_components/_tables.sass index 99ae998ff..3d0060b42 100644 --- a/website/assets/css/_components/_tables.sass +++ b/website/assets/css/_components/_tables.sass @@ -32,6 +32,15 @@ &:not(:last-child) border-right: 1px solid $color-subtle + &.c-table__cell--num + text-align: right + font-feature-settings: "tnum" + font-variant-numeric: tabular-nums + + & > strong + font-feature-settings: initial + font-variant-numeric: initial + //- Table head cell diff --git a/website/index.jade b/website/index.jade index 79a6dd76d..3e15559ac 100644 --- a/website/index.jade +++ b/website/index.jade @@ -111,8 +111,8 @@ include _includes/_mixins | deliver accuracy in-line with the latest research systems, | even when evaluated from raw text. With these innovations, spaCy | v2.0's models are #[strong 10× smaller], - | #[strong 20% more accurate], and #[strong just as fast] as the - | previous generation. + | #[strong 20% more accurate], and #[strong even cheaper to run] than + | the previous generation. .o-block-small.u-text-right +button("/models", true, "secondary-light") Download models diff --git a/website/models/index.jade b/website/models/index.jade index 861514d1c..5298da3df 100644 --- a/website/models/index.jade +++ b/website/models/index.jade @@ -20,8 +20,8 @@ include ../_includes/_mixins | deliver #[strong accuracy in-line with the latest research systems], | even when evaluated from raw text. With these innovations, spaCy | v2.0's models are #[strong 10× smaller], - | #[strong 20% more accurate], and #[strong just as fast] as the - | previous generation. + | #[strong 20% more accurate], and #[strong even cheaper to run] than + | the previous generation. include ../usage/_models/_quickstart diff --git a/website/usage/_facts-figures/_benchmarks-choi-2015.jade b/website/usage/_facts-figures/_benchmarks-choi-2015.jade index b91bd6624..7df9cf14e 100644 --- a/website/usage/_facts-figures/_benchmarks-choi-2015.jade +++ b/website/usage/_facts-figures/_benchmarks-choi-2015.jade @@ -5,41 +5,41 @@ +cell #[strong spaCy v2.x] +cell 2017 +cell Python / Cython - +cell("right") #[strong 92.6] - +cell("right") #[em n/a] + +cell("num") #[strong 92.6] + +cell("num") #[em n/a] | #[+help("This table shows speed as benchmarked by Choi et al. We therefore can't provide comparable figures, as we'd be running the benchmark on different hardware.").u-color-dark] +row +cell #[strong spaCy v1.x] +cell 2015 +cell Python / Cython - +cell("right") 91.8 - +cell("right") 13,963 + +cell("num") 91.8 + +cell("num") 13,963 +row +cell ClearNLP +cell 2015 +cell Java - +cell("right") 91.7 - +cell("right") 10,271 + +cell("num") 91.7 + +cell("num") 10,271 +row +cell CoreNLP +cell 2015 +cell Java - +cell("right") 89.6 - +cell("right") 8,602 + +cell("num") 89.6 + +cell("num") 8,602 +row +cell MATE +cell 2015 +cell Java - +cell("right") 92.5 - +cell("right") 550 + +cell("num") 92.5 + +cell("num") 550 +row +cell Turbo +cell 2015 +cell C++ - +cell("right") 92.4 - +cell("right") 349 + +cell("num") 92.4 + +cell("num") 349 diff --git a/website/usage/_facts-figures/_benchmarks-models.jade b/website/usage/_facts-figures/_benchmarks-models.jade index 97e4b2c8b..fc10c1b4f 100644 --- a/website/usage/_facts-figures/_benchmarks-models.jade +++ b/website/usage/_facts-figures/_benchmarks-models.jade @@ -20,34 +20,34 @@ p +row +cell #[+a("/models/en#en_core_web_sm") #[code en_core_web_sm]] 2.0.0a8 each data in ["2.x", "neural"] - +cell("right")=data - +cell("right") 91.7 - +cell("right") 85.3 - +cell("right") 97.0 - +cell("right") 10.1k - +cell("right") #[strong 35 MB] + +cell("num")=data + +cell("num") 91.7 + +cell("num") 85.3 + +cell("num") 97.0 + +cell("num") 10.1k + +cell("num") #[strong 35MB] +row +cell #[+a("/models/en#en_core_web_lg") #[code en_core_web_lg]] 2.0.0a3 each data in ["2.x", "neural"] - +cell("right")=data - +cell("right") #[strong 91.9] - +cell("right") #[strong 85.9] - +cell("right") #[strong 97.2] - +cell("right") 5.0k - +cell("right") 812 MB + +cell("num")=data + +cell("num") #[strong 91.9] + +cell("num") #[strong 85.9] + +cell("num") #[strong 97.2] + +cell("num") 10.0k + +cell("num") 812MB +row("divider") +cell #[code en_core_web_sm] 1.2.0 each data in ["1.x", "linear", 86.6, 78.5, 96.6] - +cell("right")=data - +cell("right") #[strong 25.7k] - +cell("right") 50 MB + +cell("num")=data + +cell("num") #[strong 25.7k] + +cell("num") 50MB +row +cell #[code en_core_web_md] 1.2.1 - each data in ["1.x", "linear", 90.6, 81.4, 96.7, "18.8k", "1 GB"] - +cell("right")=data + each data in ["1.x", "linear", 90.6, 81.4, 96.7, "18.8k", "1GB"] + +cell("num")=data +h(4, "benchmarks-models-spanish") Spanish @@ -59,29 +59,29 @@ p +table(["Model", "spaCy", "Type", "UAS", "NER F", "POS", "WPS", "Size"]) +row +cell #[+a("/models/es#es_core_news_sm") #[code es_core_news_sm]] 2.0.0a0 - +cell("right") 2.x - +cell("right") neural - +cell("right") 89.8 - +cell("right") 88.7 - +cell("right") #[strong 96.9] - +cell("right") #[em n/a] - +cell("right") #[strong 35 MB] + +cell("num") 2.x + +cell("num") neural + +cell("num") 89.8 + +cell("num") 88.7 + +cell("num") #[strong 96.9] + +cell("num") #[em n/a] + +cell("num") #[strong 35MB] +row +cell #[+a("/models/es#es_core_news_md") #[code es_core_news_md]] 2.0.0a0 - +cell("right") 2.x - +cell("right") neural - +cell("right") #[strong 90.2] - +cell("right") 89.0 - +cell("right") 97.8 - +cell("right") #[em n/a] - +cell("right") 93 MB + +cell("num") 2.x + +cell("num") neural + +cell("num") #[strong 90.2] + +cell("num") 89.0 + +cell("num") 97.8 + +cell("num") #[em n/a] + +cell("num") 93MB +row("divider") +cell #[code es_core_web_md] 1.1.0 each data in ["1.x", "linear", 87.5] - +cell("right")=data - +cell("right") #[strong 94.2] - +cell("right") 96.7 - +cell("right") #[em n/a] - +cell("right") 377 MB + +cell("num")=data + +cell("num") #[strong 94.2] + +cell("num") 96.7 + +cell("num") #[em n/a] + +cell("num") 377MB diff --git a/website/usage/_facts-figures/_benchmarks.jade b/website/usage/_facts-figures/_benchmarks.jade index ffeab9b8c..b530b84de 100644 --- a/website/usage/_facts-figures/_benchmarks.jade +++ b/website/usage/_facts-figures/_benchmarks.jade @@ -50,55 +50,55 @@ p +cell spaCy v2.0.0 +cell 2017 +cell neural - +cell("right") 94.48 + +cell("num") 94.48 +row +cell spaCy v1.1.0 +cell 2016 +cell linear - +cell("right") 92.80 + +cell("num") 92.80 +row("divider") +cell +a("https://arxiv.org/pdf/1611.01734.pdf") Dozat and Manning +cell 2017 +cell neural - +cell("right") #[strong 95.75] + +cell("num") #[strong 95.75] +row +cell +a("http://arxiv.org/abs/1603.06042") Andor et al. +cell 2016 +cell neural - +cell("right") 94.44 + +cell("num") 94.44 +row +cell +a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") SyntaxNet Parsey McParseface +cell 2016 +cell neural - +cell("right") 94.15 + +cell("num") 94.15 +row +cell +a("http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43800.pdf") Weiss et al. +cell 2015 +cell neural - +cell("right") 93.91 + +cell("num") 93.91 +row +cell +a("http://research.google.com/pubs/archive/38148.pdf") Zhang and McDonald +cell 2014 +cell linear - +cell("right") 93.32 + +cell("num") 93.32 +row +cell +a("http://www.cs.cmu.edu/~ark/TurboParser/") Martins et al. +cell 2013 +cell linear - +cell("right") 93.10 + +cell("num") 93.10 +h(4, "ner-accuracy-ontonotes5") NER accuracy (OntoNotes 5, no pre-process) @@ -113,35 +113,35 @@ p +cell spaCy #[+a("/models/en#en_core_web_lg") #[code en_core_web_lg]] v2.0.0a3 +cell 2017 +cell neural - +cell("right") 85.85 + +cell("num") 85.85 +row("divider") +cell +a("https://arxiv.org/pdf/1702.02098.pdf") Strubell et al. +cell 2017 +cell neural - +cell("right") #[strong 86.81] + +cell("num") #[strong 86.81] +row +cell +a("https://www.semanticscholar.org/paper/Named-Entity-Recognition-with-Bidirectional-LSTM-C-Chiu-Nichols/10a4db59e81d26b2e0e896d3186ef81b4458b93f") Chiu and Nichols +cell 2016 +cell neural - +cell("right") 86.19 + +cell("num") 86.19 +row +cell +a("https://www.semanticscholar.org/paper/A-Joint-Model-for-Entity-Analysis-Coreference-Typi-Durrett-Klein/28eb033eee5f51c5e5389cbb6b777779203a6778") Durrett and Klein +cell 2014 +cell neural - +cell("right") 84.04 + +cell("num") 84.04 +row +cell +a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth +cell 2009 +cell linear - +cell("right") 83.45 + +cell("num") 83.45 +h(3, "spacy-models") Model comparison @@ -183,24 +183,24 @@ p +row +cell #[strong spaCy] each data in [ "0.2ms", "1ms", "19ms"] - +cell("right") #[strong=data] + +cell("num") #[strong=data] each data in ["1x", "1x", "1x"] - +cell("right")=data + +cell("num")=data +row +cell CoreNLP each data in ["2ms", "10ms", "49ms", "10x", "10x", "2.6x"] - +cell("right")=data + +cell("num")=data +row +cell ZPar each data in ["1ms", "8ms", "850ms", "5x", "8x", "44.7x"] - +cell("right")=data + +cell("num")=data +row +cell NLTK each data in ["4ms", "443ms"] - +cell("right")=data - +cell("right") #[em n/a] + +cell("num")=data + +cell("num") #[em n/a] each data in ["20x", "443x"] - +cell("right")=data - +cell("right") #[em n/a] + +cell("num")=data + +cell("num") #[em n/a] diff --git a/website/usage/_install/_instructions.jade b/website/usage/_install/_instructions.jade index eb89fd7a5..5885ad85a 100644 --- a/website/usage/_install/_instructions.jade +++ b/website/usage/_install/_instructions.jade @@ -79,12 +79,19 @@ p python -m spacy validate +h(3, "gpu") Run spaCy with GPU + +tag experimental + ++infobox("Important note", "⚠️") + | The instructions below refer to installation with CUDA 8.0. In order to + | install with CUDA 9.0, set the environment variable #[code CUDA9=1] + | before installing Thinc. You'll also need to adjust the path to the + | CUDA runtime. p | As of v2.0, spaCy's comes with neural network models that are implemented | in our machine learning library, #[+a(gh("thinc")) Thinc]. For GPU | support, we've been grateful to use the work of - | #[+a("http://chainer.org") Chainer]'s CuPy module, which provides + | Chainer's #[+a("https://cupy.chainer.org") CuPy] module, which provides | a NumPy-compatible interface for GPU arrays. p @@ -93,11 +100,11 @@ p | CUDA. Finally, install spaCy. +code(false, "bash"). - export CUDA_HOME=/usr/local/cuda-8.0 # Or wherever your CUDA is + export CUDA_HOME=/usr/local/cuda-8.0 # or wherever your CUDA is export PATH=$PATH:$CUDA_HOME/bin pip install spacy - python -c "import thinc.neural.gpu_ops" # Check the GPU ops were built + python -c "import thinc.neural.gpu_ops" # check the GPU ops were built +h(3, "source") Compile from source diff --git a/website/usage/_install/_quickstart.jade b/website/usage/_install/_quickstart.jade index af4f008d8..e6aa3d2c6 100644 --- a/website/usage/_install/_quickstart.jade +++ b/website/usage/_install/_quickstart.jade @@ -11,9 +11,6 @@ +qs({config: 'venv', os: 'linux'}) source .env/bin/activate +qs({config: 'venv', os: 'windows'}) .env\Scripts\activate - +qs({config: 'gpu', os: 'mac'}) export PATH=$PATH:/usr/local/cuda-8.0/bin - +qs({config: 'gpu', os: 'linux'}) export PATH=$PATH:/usr/local/cuda-8.0/bin - +qs({package: 'pip'}) pip install -U spacy +qs({package: 'conda'}) conda install -c conda-forge spacy diff --git a/website/usage/_spacy-101/_word-vectors.jade b/website/usage/_spacy-101/_word-vectors.jade index c38360014..3fcd93caa 100644 --- a/website/usage/_spacy-101/_word-vectors.jade +++ b/website/usage/_spacy-101/_word-vectors.jade @@ -4,9 +4,8 @@ p | Similarity is determined by comparing #[strong word vectors] or "word | embeddings", multi-dimensional meaning representations of a word. Word | vectors can be generated using an algorithm like - | #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. spaCy's medium - | #[code md] and large #[code lg] #[+a("/models") models] come with - | #[strong multi-dimensional vectors] that look like this: + | #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec] and usually + | look like this: +code("banana.vector", false, false, 250). array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01, @@ -110,8 +109,21 @@ p -2.97650009e-01, 7.89430022e-01, 3.31680000e-01, -1.19659996e+00, -4.71559986e-02, 5.31750023e-01], dtype=float32) ++infobox("Important note", "⚠️") + | To make them compact and fast, spaCy's small #[+a("/models") models] + | (all packages that end in #[code sm]) #[strong don't ship with word vectors], and + | only include context-sensitive #[strong tensors]. This means you can + | still use the #[code similarity()] methods to compare documents, spans + | and tokens – but the result won't be as good, and individual tokens won't + | have any vectors assigned. So in order to use #[em real] word vectors, + | you need to download a larger model: + + +code-wrapper + +code-new(false, "bash", "$") spacy download en_core_web_lg + p - | The #[code .vector] attribute will return an object's vector. + | Models that come with built-in word vectors make them available as the + | #[+api("token#vector") #[code Token.vector]] attribute. | #[+api("doc#vector") #[code Doc.vector]] and | #[+api("span#vector") #[code Span.vector]] will default to an average | of their token vectors. You can also check if a token has a vector @@ -119,6 +131,7 @@ p | vectors. +code. + nlp = spacy.load('en_core_web_lg') tokens = nlp(u'dog cat banana sasquatch') for token in tokens: @@ -143,10 +156,9 @@ p | they're part of the model's vocabulary, and come with a vector. The word | "sasquatch" on the other hand is a lot less common and out-of-vocabulary | – so its vector representation consists of 300 dimensions of #[code 0], - | which means it's practically nonexistent. - -p - | If your application will benefit from a large vocabulary with more - | vectors, you should consider using one of the - | #[+a("/models") larger models] instead of the default, - | smaller ones, which usually come with a clipped vocabulary. + | which means it's practically nonexistent. If your application will + | benefit from a #[strong large vocabulary] with more vectors, you should + | consider using one of the larger models or loading in a full vector + | package, for example, + | #[+a("/models/en#en_vectors_web_lg") #[code en_vectors_web_lg]], which + | includes over #[strong 1 million unique vectors]. diff --git a/website/usage/_v2/_features.jade b/website/usage/_v2/_features.jade index ada54b94d..d31217747 100644 --- a/website/usage/_v2/_features.jade +++ b/website/usage/_v2/_features.jade @@ -10,8 +10,9 @@ p +h(3, "features-models") Convolutional neural network models +aside-code("Example", "bash") - for model in ["en", "de", "fr", "es", "pt", "it"] - | spacy download #{model} # default #{LANGUAGES[model]} model!{'\n'} + for _, lang in MODELS + if lang != "xx" + | spacy download #{lang} # default #{LANGUAGES[lang]} model!{'\n'} | spacy download xx_ent_wiki_sm # multi-language NER p @@ -20,14 +21,22 @@ p | been designed and implemented from scratch specifically for spaCy, to | give you an unmatched balance of speed, size and accuracy. The new | models are #[strong 10× smaller], #[strong 20% more accurate], - | and #[strong just as fast] as the previous generation. - | #[strong GPU usage] is now supported via - | #[+a("http://chainer.org") Chainer]'s CuPy module. + | and #[strong even cheaper to run] than the previous generation. + +p + | spaCy v2.0's new neural network models bring significant improvements in + | accuracy, especially for English Named Entity Recognition. The new + | #[+a("/models/en#en_core_web_lg") #[code en_core_web_lg]] model makes + | about #[strong 25% fewer mistakes] than the corresponding v1.x model and + | is within #[strong 1% of the current state-of-the-art] + | (#[+a("https://arxiv.org/pdf/1702.02098.pdf") Strubell et al., 2017]). + | The v2.0 models are also cheaper to run at scale, as they require + | #[strong under 1 GB of memory] per process. +infobox | #[+label-inline Usage:] #[+a("/models") Models directory], | #[+a("/models/comparison") Models comparison], - | #[+a("/usage/#gpu") Using spaCy with GPU] + | #[+a("#benchmarks") Benchmarks] +h(3, "features-pipelines") Improved processing pipelines diff --git a/website/usage/_v2/_summary.jade b/website/usage/_v2/_summary.jade index 84f238476..6d9681d10 100644 --- a/website/usage/_v2/_summary.jade +++ b/website/usage/_v2/_summary.jade @@ -22,7 +22,7 @@ p | #[strong deep learning-powered models] for spaCy's tagger, | parser and entity recognizer. The new models are | #[strong 10× smaller], #[strong 20% more accurate] and - | just as fast as the previous generation. + | #[strong even cheaper to run] than the previous generation. p | We've also made several usability improvements that are diff --git a/website/usage/models.jade b/website/usage/models.jade index 6b9c4f2bf..0880e1840 100644 --- a/website/usage/models.jade +++ b/website/usage/models.jade @@ -3,7 +3,7 @@ include ../_includes/_mixins p - | As of v1.7.0, models for spaCy can be installed as #[strong Python packages]. + | spaCy's models can be installed as #[strong Python packages]. | This means that they're a component of your application, just like any | other module. They're versioned and can be defined as a dependency in your | #[code requirements.txt]. Models can be installed from a download URL or