Merge branch 'develop' of https://github.com/explosion/spaCy into develop
commit 906aece532
@@ -66,8 +66,7 @@
 { "id": 3, "title": "3.x", "checked": true }]
 },
 { "id": "config", "title": "Configuration", "multiple": true, "options": [
-{"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" },
-{"id": "gpu", "title": "GPU", "help": "Run spaCy on GPU to make it faster. Requires an NVDIA graphics card with CUDA 2+. See section below for more info."}]
+{"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" }]
 },
 { "id": "model", "title": "Models", "multiple": true }
 ],
@@ -243,13 +243,12 @@ mixin code(label, language, prompt, height, icon, wrap)
 pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes)
 if label
 h4.u-text-label.u-text-label--dark=label
-- var icon = icon || (prompt == 'accept' || prompt == 'reject')
 if icon
 - var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
 .c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null)
 +icon(icon, 18)

-code.c-code-block__content(class=wrap ? "u-wrap" : null data-prompt=icon ? null : prompt)
+code.c-code-block__content(class=wrap ? "u-wrap" : null data-prompt=prompt)
 block
@@ -262,14 +261,14 @@ mixin code-wrapper()
 //- Code blocks to display old/new versions
 label - [string] ARIA label for block. Defaults to "correct"/"incorrect".

-mixin code-old(label)
+mixin code-old(label, lang, prompt)
 - var label = label || 'incorrect'
-+code(false, false, false, false, "reject").o-block-small(aria-label=label)
++code(false, lang, prompt, false, "reject").o-block-small(aria-label=label)
 block

-mixin code-new(label)
+mixin code-new(label, lang, prompt)
 - var label = label || 'correct'
-+code(false, false, false, false, "accept").o-block-small(aria-label=label)
++code(false, lang, prompt, false, "accept").o-block-small(aria-label=label)
 block
@@ -452,8 +451,8 @@ mixin head-cell()

 //- Table cell (only used within +row in +table)

-mixin cell(align)
-td.c-table__cell.u-text(class=align ? "u-text-" + align : null)&attributes(attributes)
+mixin cell(...style)
+td.c-table__cell.u-text(class=prefixArgs(style, "c-table__cell"))&attributes(attributes)
 block
@@ -71,7 +71,7 @@ for id in CURRENT_MODELS
 +label=label
 if MODEL_META[field]
 | #[+help(MODEL_META[field]).u-color-subtle]
-+cell("right")(data-tpl=id data-tpl-key=field)
++cell("num")(data-tpl=id data-tpl-key=field)
 | n/a

 p.u-text-small.u-color-dark(data-tpl=id data-tpl-key="notes")
@@ -32,6 +32,15 @@
 &:not(:last-child)
     border-right: 1px solid $color-subtle

+&.c-table__cell--num
+    text-align: right
+    font-feature-settings: "tnum"
+    font-variant-numeric: tabular-nums
+
+    & > strong
+        font-feature-settings: initial
+        font-variant-numeric: initial
+

 //- Table head cell
@@ -111,8 +111,8 @@ include _includes/_mixins
 | deliver accuracy in-line with the latest research systems,
 | even when evaluated from raw text. With these innovations, spaCy
 | v2.0's models are #[strong 10× smaller],
-| #[strong 20% more accurate], and #[strong just as fast] as the
-| previous generation.
+| #[strong 20% more accurate], and #[strong even cheaper to run] than
+| the previous generation.

 .o-block-small.u-text-right
 +button("/models", true, "secondary-light") Download models
@@ -20,8 +20,8 @@ include ../_includes/_mixins
 | deliver #[strong accuracy in-line with the latest research systems],
 | even when evaluated from raw text. With these innovations, spaCy
 | v2.0's models are #[strong 10× smaller],
-| #[strong 20% more accurate], and #[strong just as fast] as the
-| previous generation.
+| #[strong 20% more accurate], and #[strong even cheaper to run] than
+| the previous generation.

 include ../usage/_models/_quickstart
@@ -5,41 +5,41 @@
 +cell #[strong spaCy v2.x]
 +cell 2017
 +cell Python / Cython
-+cell("right") #[strong 92.6]
-+cell("right") #[em n/a]
++cell("num") #[strong 92.6]
++cell("num") #[em n/a]
 | #[+help("This table shows speed as benchmarked by Choi et al. We therefore can't provide comparable figures, as we'd be running the benchmark on different hardware.").u-color-dark]

 +row
 +cell #[strong spaCy v1.x]
 +cell 2015
 +cell Python / Cython
-+cell("right") 91.8
-+cell("right") 13,963
++cell("num") 91.8
++cell("num") 13,963

 +row
 +cell ClearNLP
 +cell 2015
 +cell Java
-+cell("right") 91.7
-+cell("right") 10,271
++cell("num") 91.7
++cell("num") 10,271

 +row
 +cell CoreNLP
 +cell 2015
 +cell Java
-+cell("right") 89.6
-+cell("right") 8,602
++cell("num") 89.6
++cell("num") 8,602

 +row
 +cell MATE
 +cell 2015
 +cell Java
-+cell("right") 92.5
-+cell("right") 550
++cell("num") 92.5
++cell("num") 550

 +row
 +cell Turbo
 +cell 2015
 +cell C++
-+cell("right") 92.4
-+cell("right") 349
++cell("num") 92.4
++cell("num") 349
@@ -20,34 +20,34 @@ p
 +row
 +cell #[+a("/models/en#en_core_web_sm") #[code en_core_web_sm]] 2.0.0a8
 each data in ["2.x", "neural"]
-+cell("right")=data
-+cell("right") 91.7
-+cell("right") 85.3
-+cell("right") 97.0
-+cell("right") 10.1k
-+cell("right") #[strong 35 MB]
++cell("num")=data
++cell("num") 91.7
++cell("num") 85.3
++cell("num") 97.0
++cell("num") 10.1k
++cell("num") #[strong 35MB]

 +row
 +cell #[+a("/models/en#en_core_web_lg") #[code en_core_web_lg]] 2.0.0a3
 each data in ["2.x", "neural"]
-+cell("right")=data
-+cell("right") #[strong 91.9]
-+cell("right") #[strong 85.9]
-+cell("right") #[strong 97.2]
-+cell("right") 5.0k
-+cell("right") 812 MB
++cell("num")=data
++cell("num") #[strong 91.9]
++cell("num") #[strong 85.9]
++cell("num") #[strong 97.2]
++cell("num") 10.0k
++cell("num") 812MB

 +row("divider")
 +cell #[code en_core_web_sm] 1.2.0
 each data in ["1.x", "linear", 86.6, 78.5, 96.6]
-+cell("right")=data
-+cell("right") #[strong 25.7k]
-+cell("right") 50 MB
++cell("num")=data
++cell("num") #[strong 25.7k]
++cell("num") 50MB

 +row
 +cell #[code en_core_web_md] 1.2.1
 each data in ["1.x", "linear", 90.6, 81.4, 96.7, "18.8k", "1GB"]
-+cell("right")=data
++cell("num")=data

 +h(4, "benchmarks-models-spanish") Spanish
@@ -59,29 +59,29 @@ p
 +table(["Model", "spaCy", "Type", "UAS", "NER F", "POS", "WPS", "Size"])
 +row
 +cell #[+a("/models/es#es_core_news_sm") #[code es_core_news_sm]] 2.0.0a0
-+cell("right") 2.x
-+cell("right") neural
-+cell("right") 89.8
-+cell("right") 88.7
-+cell("right") #[strong 96.9]
-+cell("right") #[em n/a]
-+cell("right") #[strong 35 MB]
++cell("num") 2.x
++cell("num") neural
++cell("num") 89.8
++cell("num") 88.7
++cell("num") #[strong 96.9]
++cell("num") #[em n/a]
++cell("num") #[strong 35MB]

 +row
 +cell #[+a("/models/es#es_core_news_md") #[code es_core_news_md]] 2.0.0a0
-+cell("right") 2.x
-+cell("right") neural
-+cell("right") #[strong 90.2]
-+cell("right") 89.0
-+cell("right") 97.8
-+cell("right") #[em n/a]
-+cell("right") 93 MB
++cell("num") 2.x
++cell("num") neural
++cell("num") #[strong 90.2]
++cell("num") 89.0
++cell("num") 97.8
++cell("num") #[em n/a]
++cell("num") 93MB

 +row("divider")
 +cell #[code es_core_web_md] 1.1.0
 each data in ["1.x", "linear", 87.5]
-+cell("right")=data
-+cell("right") #[strong 94.2]
-+cell("right") 96.7
-+cell("right") #[em n/a]
-+cell("right") 377 MB
++cell("num")=data
++cell("num") #[strong 94.2]
++cell("num") 96.7
++cell("num") #[em n/a]
++cell("num") 377MB
@@ -50,55 +50,55 @@ p
 +cell spaCy v2.0.0
 +cell 2017
 +cell neural
-+cell("right") 94.48
++cell("num") 94.48

 +row
 +cell spaCy v1.1.0
 +cell 2016
 +cell linear
-+cell("right") 92.80
++cell("num") 92.80

 +row("divider")
 +cell
 +a("https://arxiv.org/pdf/1611.01734.pdf") Dozat and Manning
 +cell 2017
 +cell neural
-+cell("right") #[strong 95.75]
++cell("num") #[strong 95.75]

 +row
 +cell
 +a("http://arxiv.org/abs/1603.06042") Andor et al.
 +cell 2016
 +cell neural
-+cell("right") 94.44
++cell("num") 94.44

 +row
 +cell
 +a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") SyntaxNet Parsey McParseface
 +cell 2016
 +cell neural
-+cell("right") 94.15
++cell("num") 94.15

 +row
 +cell
 +a("http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43800.pdf") Weiss et al.
 +cell 2015
 +cell neural
-+cell("right") 93.91
++cell("num") 93.91

 +row
 +cell
 +a("http://research.google.com/pubs/archive/38148.pdf") Zhang and McDonald
 +cell 2014
 +cell linear
-+cell("right") 93.32
++cell("num") 93.32

 +row
 +cell
 +a("http://www.cs.cmu.edu/~ark/TurboParser/") Martins et al.
 +cell 2013
 +cell linear
-+cell("right") 93.10
++cell("num") 93.10

 +h(4, "ner-accuracy-ontonotes5") NER accuracy (OntoNotes 5, no pre-process)
@@ -113,35 +113,35 @@ p
 +cell spaCy #[+a("/models/en#en_core_web_lg") #[code en_core_web_lg]] v2.0.0a3
 +cell 2017
 +cell neural
-+cell("right") 85.85
++cell("num") 85.85

 +row("divider")
 +cell
 +a("https://arxiv.org/pdf/1702.02098.pdf") Strubell et al.
 +cell 2017
 +cell neural
-+cell("right") #[strong 86.81]
++cell("num") #[strong 86.81]

 +row
 +cell
 +a("https://www.semanticscholar.org/paper/Named-Entity-Recognition-with-Bidirectional-LSTM-C-Chiu-Nichols/10a4db59e81d26b2e0e896d3186ef81b4458b93f") Chiu and Nichols
 +cell 2016
 +cell neural
-+cell("right") 86.19
++cell("num") 86.19

 +row
 +cell
 +a("https://www.semanticscholar.org/paper/A-Joint-Model-for-Entity-Analysis-Coreference-Typi-Durrett-Klein/28eb033eee5f51c5e5389cbb6b777779203a6778") Durrett and Klein
 +cell 2014
 +cell neural
-+cell("right") 84.04
++cell("num") 84.04

 +row
 +cell
 +a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth
 +cell 2009
 +cell linear
-+cell("right") 83.45
++cell("num") 83.45

 +h(3, "spacy-models") Model comparison
@@ -183,24 +183,24 @@ p
 +row
 +cell #[strong spaCy]
 each data in [ "0.2ms", "1ms", "19ms"]
-+cell("right") #[strong=data]
++cell("num") #[strong=data]

 each data in ["1x", "1x", "1x"]
-+cell("right")=data
++cell("num")=data

 +row
 +cell CoreNLP
 each data in ["2ms", "10ms", "49ms", "10x", "10x", "2.6x"]
-+cell("right")=data
++cell("num")=data
 +row
 +cell ZPar
 each data in ["1ms", "8ms", "850ms", "5x", "8x", "44.7x"]
-+cell("right")=data
++cell("num")=data
 +row
 +cell NLTK
 each data in ["4ms", "443ms"]
-+cell("right")=data
-+cell("right") #[em n/a]
++cell("num")=data
++cell("num") #[em n/a]
 each data in ["20x", "443x"]
-+cell("right")=data
-+cell("right") #[em n/a]
++cell("num")=data
++cell("num") #[em n/a]
@@ -79,12 +79,19 @@ p
     python -m spacy validate

 +h(3, "gpu") Run spaCy with GPU
+    +tag experimental

++infobox("Important note", "⚠️")
+    | The instructions below refer to installation with CUDA 8.0. In order to
+    | install with CUDA 9.0, set the environment variable #[code CUDA9=1]
+    | before installing Thinc. You'll also need to adjust the path to the
+    | CUDA runtime.
+
 p
 | As of v2.0, spaCy's comes with neural network models that are implemented
 | in our machine learning library, #[+a(gh("thinc")) Thinc]. For GPU
 | support, we've been grateful to use the work of
-| #[+a("http://chainer.org") Chainer]'s CuPy module, which provides
+| Chainer's #[+a("https://cupy.chainer.org") CuPy] module, which provides
 | a NumPy-compatible interface for GPU arrays.

 p
@@ -93,11 +100,11 @@ p
 | CUDA. Finally, install spaCy.

 +code(false, "bash").
-    export CUDA_HOME=/usr/local/cuda-8.0 # Or wherever your CUDA is
+    export CUDA_HOME=/usr/local/cuda-8.0 # or wherever your CUDA is
     export PATH=$PATH:$CUDA_HOME/bin

     pip install spacy
-    python -c "import thinc.neural.gpu_ops" # Check the GPU ops were built
+    python -c "import thinc.neural.gpu_ops" # check the GPU ops were built

 +h(3, "source") Compile from source
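The import in the hunk above is the simplest way to confirm that Thinc's GPU ops were actually compiled. A minimal standalone sketch of the same check, assuming the CUDA paths were exported as shown (the script name and messages are illustrative, not part of this commit):

    # check_gpu_ops.py (illustrative helper, not part of this commit)
    try:
        import thinc.neural.gpu_ops  # only importable if Thinc compiled its CUDA kernels
        print("GPU ops available")
    except ImportError:
        print("GPU ops missing: check CUDA_HOME and PATH, then reinstall Thinc and spaCy")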
@@ -11,9 +11,6 @@
 +qs({config: 'venv', os: 'linux'}) source .env/bin/activate
 +qs({config: 'venv', os: 'windows'}) .env\Scripts\activate

-+qs({config: 'gpu', os: 'mac'}) export PATH=$PATH:/usr/local/cuda-8.0/bin
-+qs({config: 'gpu', os: 'linux'}) export PATH=$PATH:/usr/local/cuda-8.0/bin
-
 +qs({package: 'pip'}) pip install -U spacy
 +qs({package: 'conda'}) conda install -c conda-forge spacy
@@ -4,9 +4,8 @@ p
 | Similarity is determined by comparing #[strong word vectors] or "word
 | embeddings", multi-dimensional meaning representations of a word. Word
 | vectors can be generated using an algorithm like
-| #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. spaCy's medium
-| #[code md] and large #[code lg] #[+a("/models") models] come with
-| #[strong multi-dimensional vectors] that look like this:
+| #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec] and usually
+| look like this:

 +code("banana.vector", false, false, 250).
     array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01,
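For reference, the similarity() methods discussed in these docs compare exactly these vectors. A minimal sketch, not part of the diff, assuming a model that ships with vectors such as en_core_web_lg is installed:

    import spacy

    nlp = spacy.load('en_core_web_lg')    # assumes the large English model is installed
    doc = nlp(u'dog cat banana')
    print(doc[0].similarity(doc[1]))      # vector similarity between "dog" and "cat"
    print(doc.similarity(nlp(u'fruit')))  # also works on whole docs and spans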
@@ -110,8 +109,21 @@ p
     -2.97650009e-01, 7.89430022e-01, 3.31680000e-01,
     -1.19659996e+00, -4.71559986e-02, 5.31750023e-01], dtype=float32)

++infobox("Important note", "⚠️")
+    | To make them compact and fast, spaCy's small #[+a("/models") models]
+    | (all packages that end in #[code sm]) #[strong don't ship with word vectors], and
+    | only include context-sensitive #[strong tensors]. This means you can
+    | still use the #[code similarity()] methods to compare documents, spans
+    | and tokens – but the result won't be as good, and individual tokens won't
+    | have any vectors assigned. So in order to use #[em real] word vectors,
+    | you need to download a larger model:
+
++code-wrapper
+    +code-new(false, "bash", "$") spacy download en_core_web_lg
+
 p
-| The #[code .vector] attribute will return an object's vector.
+| Models that come with built-in word vectors make them available as the
+| #[+api("token#vector") #[code Token.vector]] attribute.
 | #[+api("doc#vector") #[code Doc.vector]] and
 | #[+api("span#vector") #[code Span.vector]] will default to an average
 | of their token vectors. You can also check if a token has a vector
@@ -119,6 +131,7 @@ p
 | vectors.

 +code.
+    nlp = spacy.load('en_core_web_lg')
     tokens = nlp(u'dog cat banana sasquatch')

     for token in tokens:
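The +code. block above is cut off at the hunk boundary. A minimal sketch of how such a loop can inspect the vectors, using standard Token attributes rather than text taken verbatim from this commit:

    import spacy

    nlp = spacy.load('en_core_web_lg')        # assumes the large English model is installed
    tokens = nlp(u'dog cat banana sasquatch')

    for token in tokens:
        # out-of-vocabulary tokens like "sasquatch" get an all-zero vector (norm 0.0)
        print(token.text, token.has_vector, token.vector_norm, token.is_oov)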
@@ -143,10 +156,9 @@ p
 | they're part of the model's vocabulary, and come with a vector. The word
 | "sasquatch" on the other hand is a lot less common and out-of-vocabulary
 | – so its vector representation consists of 300 dimensions of #[code 0],
-| which means it's practically nonexistent.
-
-p
-| If your application will benefit from a large vocabulary with more
-| vectors, you should consider using one of the
-| #[+a("/models") larger models] instead of the default,
-| smaller ones, which usually come with a clipped vocabulary.
+| which means it's practically nonexistent. If your application will
+| benefit from a #[strong large vocabulary] with more vectors, you should
+| consider using one of the larger models or loading in a full vector
+| package, for example,
+| #[+a("/models/en#en_vectors_web_lg") #[code en_vectors_web_lg]], which
+| includes over #[strong 1 million unique vectors].
@@ -10,8 +10,9 @@ p
 +h(3, "features-models") Convolutional neural network models

 +aside-code("Example", "bash")
-    for model in ["en", "de", "fr", "es", "pt", "it"]
-        | spacy download #{model} # default #{LANGUAGES[model]} model!{'\n'}
+    for _, lang in MODELS
+        if lang != "xx"
+            | spacy download #{lang} # default #{LANGUAGES[lang]} model!{'\n'}
     | spacy download xx_ent_wiki_sm # multi-language NER

 p
@@ -20,14 +21,22 @@ p
 | been designed and implemented from scratch specifically for spaCy, to
 | give you an unmatched balance of speed, size and accuracy. The new
 | models are #[strong 10× smaller], #[strong 20% more accurate],
-| and #[strong just as fast] as the previous generation.
-| #[strong GPU usage] is now supported via
-| #[+a("http://chainer.org") Chainer]'s CuPy module.
+| and #[strong even cheaper to run] than the previous generation.
+
+p
+| spaCy v2.0's new neural network models bring significant improvements in
+| accuracy, especially for English Named Entity Recognition. The new
+| #[+a("/models/en#en_core_web_lg") #[code en_core_web_lg]] model makes
+| about #[strong 25% fewer mistakes] than the corresponding v1.x model and
+| is within #[strong 1% of the current state-of-the-art]
+| (#[+a("https://arxiv.org/pdf/1702.02098.pdf") Strubell et al., 2017]).
+| The v2.0 models are also cheaper to run at scale, as they require
+| #[strong under 1 GB of memory] per process.

 +infobox
 | #[+label-inline Usage:] #[+a("/models") Models directory],
 | #[+a("/models/comparison") Models comparison],
-| #[+a("/usage/#gpu") Using spaCy with GPU]
+| #[+a("#benchmarks") Benchmarks]

 +h(3, "features-pipelines") Improved processing pipelines
@@ -22,7 +22,7 @@ p
 | #[strong deep learning-powered models] for spaCy's tagger,
 | parser and entity recognizer. The new models are
 | #[strong 10× smaller], #[strong 20% more accurate] and
-| just as fast as the previous generation.
+| #[strong even cheaper to run] than the previous generation.

 p
 | We've also made several usability improvements that are
@@ -3,7 +3,7 @@
 include ../_includes/_mixins

 p
-| As of v1.7.0, models for spaCy can be installed as #[strong Python packages].
+| spaCy's models can be installed as #[strong Python packages].
 | This means that they're a component of your application, just like any
 | other module. They're versioned and can be defined as a dependency in your
 | #[code requirements.txt]. Models can be installed from a download URL or
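Since the paragraph in this last hunk describes models as installable Python packages, here is a minimal sketch, not part of the diff, of how an installed package can be used, either via spacy.load() or as a plain import:

    import spacy

    # load the installed model package by name
    nlp = spacy.load('en_core_web_sm')

    # or import the model package directly, like any other Python module
    import en_core_web_sm
    nlp = en_core_web_sm.load()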