mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Update docs and quickstart widget [ci skip]
This commit is contained in:
parent
ba84371ab0
commit
7d526d0d40
5
website/package-lock.json
generated
5
website/package-lock.json
generated
|
@ -17455,6 +17455,11 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"jinja-to-js": {
|
||||
"version": "3.2.3",
|
||||
"resolved": "https://registry.npmjs.org/jinja-to-js/-/jinja-to-js-3.2.3.tgz",
|
||||
"integrity": "sha512-ktEBxQG17fYaFcHThB719+EbePBx+AkkORQMyuP0UuLPS2zx8uJXP5CsItXjUUwMHFPj3hCRkyqEYzLbeklYgQ=="
|
||||
},
|
||||
"jpeg-js": {
|
||||
"version": "0.2.0",
|
||||
"resolved": "https://registry.npmjs.org/jpeg-js/-/jpeg-js-0.2.0.tgz",
|
||||
|
|
|
@ -41,6 +41,7 @@
|
|||
"gatsby-transformer-sharp": "^2.1.13",
|
||||
"html-to-react": "^1.3.4",
|
||||
"intersection-observer": "^0.5.1",
|
||||
"jinja-to-js": "^3.2.3",
|
||||
"node-sass": "^4.11.0",
|
||||
"parse-numeric-range": "0.0.2",
|
||||
"prismjs": "^1.15.0",
|
||||
|
@ -52,20 +53,22 @@
|
|||
"remark-react": "^5.0.1"
|
||||
},
|
||||
"scripts": {
|
||||
"build": "gatsby build",
|
||||
"dev": "gatsby develop",
|
||||
"build": "npm run python:setup && gatsby build",
|
||||
"dev": "npm run python:setup && gatsby develop",
|
||||
"dev:nightly": "BRANCH=nightly.spacy.io npm run dev",
|
||||
"lint": "eslint **",
|
||||
"clear": "rm -rf .cache",
|
||||
"test": "echo \"Write tests! -> https://gatsby.app/unit-testing\""
|
||||
"test": "echo \"Write tests! -> https://gatsby.app/unit-testing\"",
|
||||
"python:install": "pip install -r setup/requirements.txt",
|
||||
"python:setup": "cd setup && ./setup.sh"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@sindresorhus/slugify": "^0.8.0",
|
||||
"browser-monads": "^1.0.0",
|
||||
"md-attr-parser": "^1.2.1",
|
||||
"prettier": "^1.16.4",
|
||||
"raw-loader": "^1.0.0",
|
||||
"unist-util-visit": "^1.4.0",
|
||||
"@sindresorhus/slugify": "^0.8.0"
|
||||
"unist-util-visit": "^1.4.0"
|
||||
},
|
||||
"repository": {
|
||||
"type": "git",
|
||||
|
|
1209
website/setup/jinja_to_js.py
Normal file
1209
website/setup/jinja_to_js.py
Normal file
File diff suppressed because it is too large
Load Diff
107
website/setup/quickstart_training_cpu.jinja
Normal file
107
website/setup/quickstart_training_cpu.jinja
Normal file
|
@ -0,0 +1,107 @@
|
|||
{# Template for "CPU" configs. The transformer will use a different template. #}
|
||||
# This is an auto-generated partial config for training a model.
|
||||
# To use it for training, auto-fill it with all default values.
|
||||
# python -m spacy init config config.cfg --base base_config.cfg
|
||||
[paths]
|
||||
train = ""
|
||||
dev = ""
|
||||
|
||||
[nlp]
|
||||
lang = "{{ lang }}"
|
||||
pipeline = {{ pipeline|safe }}
|
||||
vectors = {{ ('"en_vectors_web_lg"' if optimize == "accuracy" else false)|safe }}
|
||||
tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
|
||||
|
||||
[components]
|
||||
|
||||
[components.tok2vec]
|
||||
factory = "tok2vec"
|
||||
|
||||
[components.tok2vec.model]
|
||||
@architectures = "spacy.Tok2Vec.v1"
|
||||
|
||||
[components.tok2vec.model.embed]
|
||||
@architectures = "spacy.MultiHashEmbed.v1"
|
||||
width = ${components.tok2vec.model.encode:width}
|
||||
rows = {{ 2000 if optimize == "efficiency" else 7000 }}
|
||||
also_embed_subwords = {{ true if has_letters else false }}
|
||||
also_use_static_vectors = {{ true if optimize == "accuracy" else false }}
|
||||
|
||||
[components.tok2vec.model.encode]
|
||||
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||
width = {{ 96 if optimize == "efficiency" else 256 }}
|
||||
depth = {{ 4 if optimize == "efficiency" else 8 }}
|
||||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
|
||||
{% if "tagger" in components %}
|
||||
[components.tagger]
|
||||
factory = "tagger"
|
||||
|
||||
[components.tagger.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
nO = null
|
||||
|
||||
[components.tagger.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode:width}
|
||||
{%- endif %}
|
||||
|
||||
{% if "parser" in components -%}
|
||||
[components.parser]
|
||||
factory = "parser"
|
||||
|
||||
[components.parser.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 8
|
||||
hidden_width = 128
|
||||
maxout_pieces = 3
|
||||
use_upper = true
|
||||
nO = null
|
||||
|
||||
[components.parser.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode:width}
|
||||
{%- endif %}
|
||||
|
||||
{% if "ner" in components -%}
{#- NER component. Its tok2vec listener must live under components.ner:
    naming the section after the parser would leave NER without a listener
    and emit a duplicate parser section when both components are enabled. -#}
[components.ner]
factory = "ner"

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 6
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
{% endif -%}
|
||||
|
||||
[training]
|
||||
|
||||
[training.train_corpus]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths:train}
|
||||
|
||||
[training.dev_corpus]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths:dev}
|
||||
|
||||
[training.score_weights]
|
||||
{%- if "tagger" in components %}
|
||||
tag_acc = {{ (1.0 / components|length)|round() }}
|
||||
{%- endif -%}
|
||||
{%- if "parser" in components %}
|
||||
dep_uas = 0.0
|
||||
dep_las = {{ (1.0 / components|length)|round() }}
|
||||
sents_f = 0.0
|
||||
{%- endif %}
|
||||
{%- if "ner" in components %}
|
||||
ents_f = {{ (1.0 / components|length)|round() }}
|
||||
ents_p = 0.0
|
||||
ents_r = 0.0
|
||||
{%- endif -%}
|
3
website/setup/requirements.txt
Normal file
3
website/setup/requirements.txt
Normal file
|
@ -0,0 +1,3 @@
|
|||
# These are used to compile the training quickstart config
|
||||
jinja2
|
||||
typer
|
1
website/setup/setup.sh
Executable file
1
website/setup/setup.sh
Executable file
|
@ -0,0 +1 @@
|
|||
python jinja_to_js.py quickstart_training_cpu.jinja ../src/widgets/quickstart-training-generator.js
|
|
@ -15,24 +15,18 @@ function getNewChecked(optionId, checkedForId, multiple) {
|
|||
return [...checkedForId, optionId]
|
||||
}
|
||||
|
||||
function getRawContent(ref) {
|
||||
if (ref.current && ref.current.childNodes) {
|
||||
// Select all currently visible nodes (spans and text nodes)
|
||||
const result = [...ref.current.childNodes].filter(el => el.offsetParent !== null)
|
||||
return result.map(el => el.textContent).join('\n')
|
||||
}
|
||||
return ''
|
||||
}
|
||||
|
||||
const Quickstart = ({
|
||||
data = [],
|
||||
title,
|
||||
description,
|
||||
copy = true,
|
||||
download,
|
||||
rawContent = null,
|
||||
id = 'quickstart',
|
||||
setters = {},
|
||||
hidePrompts,
|
||||
small,
|
||||
codeLang,
|
||||
children,
|
||||
}) => {
|
||||
const contentRef = useRef()
|
||||
|
@ -46,6 +40,16 @@ const Quickstart = ({
|
|||
const [copySuccess, setCopySuccess] = useState(false)
|
||||
const [otherState, setOtherState] = useState({})
|
||||
const setOther = (id, value) => setOtherState({ ...otherState, [id]: value })
|
||||
/**
 * Resolve the plain-text content of the widget (used by onClickCopy).
 *
 * The `rawContent` value from the enclosing scope takes precedence whenever
 * it is not null. Otherwise the text is collected from the child nodes of
 * the referenced element, one line per node.
 *
 * @param {{ current: ?Object }} ref - React ref whose `current` holds the element to read.
 * @returns {string} The raw content, or '' if the ref has no child nodes.
 */
const getRawContent = ref => {
    if (rawContent !== null) return rawContent
    const container = ref.current
    if (!container || !container.childNodes) return ''
    // Select all currently visible nodes (spans and text nodes): anything
    // whose offsetParent is not exactly null is kept.
    const visibleText = []
    for (const node of container.childNodes) {
        if (node.offsetParent !== null) visibleText.push(node.textContent)
    }
    return visibleText.join('\n')
}
|
||||
|
||||
const onClickCopy = () => {
|
||||
copyAreaRef.current.value = getRawContent(contentRef)
|
||||
copyToClipboard(copyAreaRef, setCopySuccess)
|
||||
|
@ -210,7 +214,14 @@ const Quickstart = ({
|
|||
}
|
||||
)}
|
||||
<pre className={classes.code}>
|
||||
<code className={classes.results} data-quickstart-results="" ref={contentRef}>
|
||||
<code
|
||||
className={classNames(classes.results, {
|
||||
[classes.small]: !!small,
|
||||
[`language-${codeLang}`]: !!codeLang,
|
||||
})}
|
||||
data-quickstart-results=""
|
||||
ref={contentRef}
|
||||
>
|
||||
{children}
|
||||
</code>
|
||||
|
||||
|
|
|
@ -41,6 +41,6 @@ Search.propTypes = {
|
|||
apiKey: PropTypes.string.isRequired,
|
||||
indexName: PropTypes.string.isRequired,
|
||||
}).isRequired,
|
||||
id: PropTypes.string.isRequired,
|
||||
placeholder: PropTypes.string.isRequired,
|
||||
id: PropTypes.string,
|
||||
placeholder: PropTypes.string,
|
||||
}
|
||||
|
|
|
@ -124,6 +124,16 @@
|
|||
& > span
|
||||
display: block
|
||||
|
||||
.small
|
||||
font-size: var(--font-size-sm)
|
||||
line-height: 1.65
|
||||
white-space: pre
|
||||
max-height: 400px
|
||||
overflow-y: auto
|
||||
|
||||
& > span
|
||||
display: inline
|
||||
|
||||
.hide-prompts .prompt:before
|
||||
content: initial !important
|
||||
|
||||
|
|
10
website/src/widgets/quickstart-training-generator.js
Normal file
10
website/src/widgets/quickstart-training-generator.js
Normal file
|
@ -0,0 +1,10 @@
|
|||
import jinjaToJS from "jinja-to-js";export default function templateQuickstartTrainingCpu(ctx) {
|
||||
var __result = "";
|
||||
var __tmp;
|
||||
var __runtime = jinjaToJS.runtime;
|
||||
var __filters = jinjaToJS.filters;
|
||||
var __globals = jinjaToJS.globals;
|
||||
var context = jinjaToJS.createContext(ctx);
|
||||
__result += "\n# This is an auto-generated partial config for training a model.\n# To use it for training, auto-fill it with all default values.\n# python -m spacy init config config.cfg --base base_config.cfg\n[paths]\ntrain = \"\"\ndev = \"\"\n\n[nlp]\nlang = \"";__result += "" + __runtime.escape((__tmp = (context.lang)) == null ? "" : __tmp);__result += "\"\npipeline = ";__result += "" + ((__tmp = (context.pipeline)) == null ? "" : __tmp);__result += "\nvectors = ";__result += "" + ((__tmp = ((context.optimize==="accuracy" ? "\"en_vectors_web_lg\"" : false))) == null ? "" : __tmp);__result += "\ntokenizer = {\"@tokenizers\": \"spacy.Tokenizer.v1\"}\n\n[components]\n\n[components.tok2vec]\nfactory = \"tok2vec\"\n\n[components.tok2vec.model]\n@architectures = \"spacy.Tok2Vec.v1\"\n\n[components.tok2vec.model.embed]\n@architectures = \"spacy.MultiHashEmbed.v1\"\nwidth = ${components.tok2vec.model.encode:width}\nrows = ";__result += "" + __runtime.escape((__tmp = ((context.optimize==="efficiency" ? 2000 : 7000))) == null ? "" : __tmp);__result += "\nalso_embed_subwords = ";__result += "" + __runtime.escape((__tmp = ((context.has_letters ? true : false))) == null ? "" : __tmp);__result += "\nalso_use_static_vectors = ";__result += "" + __runtime.escape((__tmp = ((context.optimize==="accuracy" ? true : false))) == null ? "" : __tmp);__result += "\n\n[components.tok2vec.model.encode]\n@architectures = \"spacy.MaxoutWindowEncoder.v1\"\nwidth = ";__result += "" + __runtime.escape((__tmp = ((context.optimize==="efficiency" ? 96 : 256))) == null ? "" : __tmp);__result += "\ndepth = ";__result += "" + __runtime.escape((__tmp = ((context.optimize==="efficiency" ? 4 : 8))) == null ? 
"" : __tmp);__result += "\nwindow_size = 1\nmaxout_pieces = 3\n\n";if(context.components.includes("tagger")){__result += "\n[components.tagger]\nfactory = \"tagger\"\n\n[components.tagger.model]\n@architectures = \"spacy.Tagger.v1\"\nnO = null\n\n[components.tagger.model.tok2vec]\n@architectures = \"spacy.Tok2VecListener.v1\"\nwidth = ${components.tok2vec.model.encode:width}";}__result += "\n\n";if(context.components.includes("parser")){__result += "[components.parser]\nfactory = \"parser\"\n\n[components.parser.model]\n@architectures = \"spacy.TransitionBasedParser.v1\"\nnr_feature_tokens = 8\nhidden_width = 128\nmaxout_pieces = 3\nuse_upper = true\nnO = null\n\n[components.parser.model.tok2vec]\n@architectures = \"spacy.Tok2VecListener.v1\"\nwidth = ${components.tok2vec.model.encode:width}";}__result += "\n\n";if(context.components.includes("ner")){__result += "[components.ner]\nfactory = \"ner\"\n\n[components.ner.model]\n@architectures = \"spacy.TransitionBasedParser.v1\"\nnr_feature_tokens = 6\nhidden_width = 64\nmaxout_pieces = 2\nuse_upper = true\nnO = null\n\n[components.parser.model.tok2vec]\n@architectures = \"spacy.Tok2VecListener.v1\"\nwidth = ${components.tok2vec.model.encode:width}\n";}__result += "[training]\n\n[training.train_corpus]\n@readers = \"spacy.Corpus.v1\"\npath = ${paths:train}\n\n[training.dev_corpus]\n@readers = \"spacy.Corpus.v1\"\npath = ${paths:dev}\n\n[training.score_weights]";if(context.components.includes("tagger")){__result += "\ntag_acc = ";__result += "" + __runtime.escape((__tmp = (Math.round((1.0 / __filters.size(context.components)+ Number.EPSILON) * 100) / 100)) == null ? "" : __tmp);}if(context.components.includes("parser")){__result += "\ndep_uas = 0.0\ndep_las = ";__result += "" + __runtime.escape((__tmp = (Math.round((1.0 / __filters.size(context.components)+ Number.EPSILON) * 100) / 100)) == null ? 
"" : __tmp);__result += "\nsents_f = 0.0";}if(context.components.includes("ner")){__result += "\nents_f = ";__result += "" + __runtime.escape((__tmp = (Math.round((1.0 / __filters.size(context.components)+ Number.EPSILON) * 100) / 100)) == null ? "" : __tmp);__result += "\nents_p = 0.0\nents_r = 0.0";}
|
||||
return __result;
|
||||
}
|
|
@ -1,13 +1,16 @@
|
|||
import React, { useState } from 'react'
|
||||
import { StaticQuery, graphql } from 'gatsby'
|
||||
import highlightCode from 'gatsby-remark-prismjs/highlight-code.js'
|
||||
|
||||
import { Quickstart, QS } from '../components/quickstart'
|
||||
import generator from './quickstart-training-generator'
|
||||
import { isString, htmlToReact } from '../components/util'
|
||||
|
||||
const DEFAULT_LANG = 'en'
|
||||
const DEFAULT_HARDWARE = 'gpu'
|
||||
const DEFAULT_OPT = 'efficiency'
|
||||
const COMPONENTS = ['tagger', 'parser', 'ner', 'textcat']
|
||||
const COMMENT = `# This is an auto-generated partial config for training a model.
|
||||
# To use it for training, auto-fill it with all default values.
|
||||
# python -m spacy init config config.cfg --base base_config.cfg`
|
||||
|
||||
const DATA = [
|
||||
{
|
||||
id: 'lang',
|
||||
|
@ -25,9 +28,8 @@ const DATA = [
|
|||
id: 'hardware',
|
||||
title: 'Hardware',
|
||||
options: [
|
||||
{ id: 'cpu-only', title: 'CPU only' },
|
||||
{ id: 'cpu', title: 'CPU preferred' },
|
||||
{ id: 'gpu', title: 'GPU', checked: true },
|
||||
{ id: 'cpu', title: 'CPU preferred', checked: DEFAULT_HARDWARE === 'cpu' },
|
||||
{ id: 'gpu', title: 'GPU', checked: DEFAULT_HARDWARE === 'gpu' },
|
||||
],
|
||||
},
|
||||
{
|
||||
|
@ -35,28 +37,42 @@ const DATA = [
|
|||
title: 'Optimize for',
|
||||
help: '...',
|
||||
options: [
|
||||
{ id: 'efficiency', title: 'efficiency', checked: true },
|
||||
{ id: 'accuracy', title: 'accuracy' },
|
||||
{ id: 'efficiency', title: 'efficiency', checked: DEFAULT_OPT === 'efficiency' },
|
||||
{ id: 'accuracy', title: 'accuracy', checked: DEFAULT_OPT === 'accuracy' },
|
||||
],
|
||||
},
|
||||
{
|
||||
id: 'config',
|
||||
title: 'Configuration',
|
||||
options: [
|
||||
{
|
||||
id: 'independent',
|
||||
title: 'independent components',
|
||||
help: "Make components independent and don't share weights",
|
||||
},
|
||||
],
|
||||
multiple: true,
|
||||
},
|
||||
]
|
||||
|
||||
/**
 * Serialize a value for insertion into the generated config text.
 *
 * - Strings starting with `${` are returned unchanged (unquoted).
 * - Arrays are rendered as a JSON-style list with ", " between items.
 * - Everything else is passed through JSON.stringify.
 *
 * @param {*} value - Value to serialize.
 * @returns {string} The serialized representation.
 */
function stringify(value) {
    if (isString(value) && value.startsWith('${')) return value
    if (Array.isArray(value)) {
        // Join the serialized items instead of regex-replacing commas in the
        // JSON output, which would also rewrite commas inside string items.
        const items = value.map(item => {
            if (Array.isArray(item)) return stringify(item)
            const json = JSON.stringify(item)
            // JSON.stringify returns undefined for undefined/functions; JSON
            // arrays represent those entries as null.
            return json === undefined ? 'null' : json
        })
        return `[${items.join(', ')}]`
    }
    return JSON.stringify(value)
}
|
||||
|
||||
export default function QuickstartTraining({ id, title, download = 'config.cfg' }) {
|
||||
const [lang, setLang] = useState(DEFAULT_LANG)
|
||||
const [pipeline, setPipeline] = useState([])
|
||||
const setters = { lang: setLang, components: setPipeline }
|
||||
const [components, setComponents] = useState([])
|
||||
const [[hardware], setHardware] = useState([DEFAULT_HARDWARE])
|
||||
const [[optimize], setOptimize] = useState([DEFAULT_OPT])
|
||||
const setters = {
|
||||
lang: setLang,
|
||||
components: setComponents,
|
||||
hardware: setHardware,
|
||||
optimize: setOptimize,
|
||||
}
|
||||
const content = generator({
|
||||
lang,
|
||||
pipeline: stringify(components),
|
||||
components,
|
||||
optimize,
|
||||
hardware,
|
||||
})
|
||||
const rawContent = content.trim().replace(/\n\n\n+/g, '\n\n')
|
||||
const displayContent = highlightCode('ini', rawContent)
|
||||
.split('\n')
|
||||
.map(line => (line.startsWith('#') ? `<span class="token comment">${line}</span>` : line))
|
||||
.join('\n')
|
||||
return (
|
||||
<StaticQuery
|
||||
query={query}
|
||||
|
@ -66,47 +82,19 @@ export default function QuickstartTraining({ id, title, download = 'config.cfg'
|
|||
id: code,
|
||||
title: name,
|
||||
}))
|
||||
const recommendedTrf = Object.assign(
|
||||
{},
|
||||
...langs.map(({ code }) => ({ [code]: { sm: 'TODO', lg: 'TODO' } }))
|
||||
)
|
||||
return (
|
||||
<Quickstart
|
||||
download={download}
|
||||
rawContent={content}
|
||||
data={DATA}
|
||||
title={title}
|
||||
id={id}
|
||||
setters={setters}
|
||||
hidePrompts
|
||||
small
|
||||
codeLang="ini"
|
||||
>
|
||||
<QS comment>{COMMENT}</QS>
|
||||
<span>[paths]</span>
|
||||
<span>train = ""</span>
|
||||
<span>dev = ""</span>
|
||||
<br />
|
||||
<span>[nlp]</span>
|
||||
<span>lang = "{lang}"</span>
|
||||
<span>pipeline = {JSON.stringify(pipeline).replace(/,/g, ', ')}</span>
|
||||
<br />
|
||||
<span>[components]</span>
|
||||
<br />
|
||||
<span>[components.transformer]</span>
|
||||
<QS optimize="efficiency">name = "{recommendedTrf[lang].sm}"</QS>
|
||||
<QS optimize="accuracy">name = "{recommendedTrf[lang].lg}"</QS>
|
||||
{!!pipeline.length && <br />}
|
||||
{pipeline.map((pipe, i) => (
|
||||
<>
|
||||
{i !== 0 && <br />}
|
||||
<span>[components.{pipe}]</span>
|
||||
<span>factory = "{pipe}"</span>
|
||||
<QS config="independent">
|
||||
<br />
|
||||
[components.parser.model.tok2vec]
|
||||
<br />
|
||||
@architectures = "spacy.Tok2Vec.v1"
|
||||
</QS>
|
||||
</>
|
||||
))}
|
||||
{htmlToReact(displayContent)}
|
||||
</Quickstart>
|
||||
)
|
||||
}}
|
||||
|
|
Loading…
Reference in New Issue
Block a user