Update docs and quickstart widget [ci skip]

This commit is contained in:
Ines Montani 2020-08-13 01:17:40 +02:00
parent ba84371ab0
commit 7d526d0d40
11 changed files with 1418 additions and 71 deletions

View File

@ -17455,6 +17455,11 @@
}
}
},
"jinja-to-js": {
"version": "3.2.3",
"resolved": "https://registry.npmjs.org/jinja-to-js/-/jinja-to-js-3.2.3.tgz",
"integrity": "sha512-ktEBxQG17fYaFcHThB719+EbePBx+AkkORQMyuP0UuLPS2zx8uJXP5CsItXjUUwMHFPj3hCRkyqEYzLbeklYgQ=="
},
"jpeg-js": {
"version": "0.2.0",
"resolved": "https://registry.npmjs.org/jpeg-js/-/jpeg-js-0.2.0.tgz",

View File

@ -41,6 +41,7 @@
"gatsby-transformer-sharp": "^2.1.13",
"html-to-react": "^1.3.4",
"intersection-observer": "^0.5.1",
"jinja-to-js": "^3.2.3",
"node-sass": "^4.11.0",
"parse-numeric-range": "0.0.2",
"prismjs": "^1.15.0",
@ -52,20 +53,22 @@
"remark-react": "^5.0.1"
},
"scripts": {
"build": "gatsby build",
"dev": "gatsby develop",
"build": "npm run python:setup && gatsby build",
"dev": "npm run python:setup && gatsby develop",
"dev:nightly": "BRANCH=nightly.spacy.io npm run dev",
"lint": "eslint **",
"clear": "rm -rf .cache",
"test": "echo \"Write tests! -> https://gatsby.app/unit-testing\""
"test": "echo \"Write tests! -> https://gatsby.app/unit-testing\"",
"python:install": "pip install -r setup/requirements.txt",
"python:setup": "cd setup && ./setup.sh"
},
"devDependencies": {
"@sindresorhus/slugify": "^0.8.0",
"browser-monads": "^1.0.0",
"md-attr-parser": "^1.2.1",
"prettier": "^1.16.4",
"raw-loader": "^1.0.0",
"unist-util-visit": "^1.4.0",
"@sindresorhus/slugify": "^0.8.0"
"unist-util-visit": "^1.4.0"
},
"repository": {
"type": "git",

1209
website/setup/jinja_to_js.py Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,107 @@
{# Template for "CPU" configs. The transformer will use a different template.
   Context variables read by this template:
     lang        - value written to [nlp] lang
     pipeline    - already-serialized value for [nlp] pipeline (inserted unescaped via |safe)
     components  - collection of selected component names; decides which sections are emitted
     optimize    - compared against "efficiency" / "accuracy" to pick vectors, rows, width and depth
     has_letters - truthy value turns on also_embed_subwords
#}
# This is an auto-generated partial config for training a model.
# To use it for training, auto-fill it with all default values.
# python -m spacy init config config.cfg --base base_config.cfg
[paths]
train = ""
dev = ""
[nlp]
lang = "{{ lang }}"
pipeline = {{ pipeline|safe }}
vectors = {{ ('"en_vectors_web_lg"' if optimize == "accuracy" else false)|safe }}
tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
[components]
[components.tok2vec]
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode:width}
rows = {{ 2000 if optimize == "efficiency" else 7000 }}
also_embed_subwords = {{ true if has_letters else false }}
also_use_static_vectors = {{ true if optimize == "accuracy" else false }}
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = {{ 96 if optimize == "efficiency" else 256 }}
depth = {{ 4 if optimize == "efficiency" else 8 }}
window_size = 1
maxout_pieces = 3
{# Tagger section: emitted only when "tagger" is in components. Its tok2vec sub-block is a listener whose width is taken from the shared tok2vec encoder. The "-" below keeps this comment from adding whitespace to the output. -#}
{% if "tagger" in components %}
[components.tagger]
factory = "tagger"
[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
{%- endif %}
{# Parser section: emitted only when "parser" is in components. Its tok2vec sub-block is a listener whose width is taken from the shared tok2vec encoder. The "-" below keeps this comment from adding whitespace to the output. -#}
{% if "parser" in components -%}
[components.parser]
factory = "parser"
[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 128
maxout_pieces = 3
use_upper = true
nO = null
[components.parser.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
{%- endif %}
{# NER section: emitted only when "ner" is in components. The listener block must live under components.ner (not components.parser), otherwise the ner model gets no tok2vec settings and the parser section is defined twice when both are selected. The "-" below keeps this comment from adding whitespace to the output. -#}
{% if "ner" in components -%}
[components.ner]
factory = "ner"
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 6
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null
[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
{% endif -%}
{# Training section: corpus readers plus score weights. Each selected component's main metric gets an equal share (1 / number of components). round(2) keeps two decimals; a bare round() uses precision 0 and would collapse e.g. 1/3 to 0.0. The "-" below keeps this comment from adding whitespace to the output. -#}
[training]
[training.train_corpus]
@readers = "spacy.Corpus.v1"
path = ${paths:train}
[training.dev_corpus]
@readers = "spacy.Corpus.v1"
path = ${paths:dev}
[training.score_weights]
{%- if "tagger" in components %}
tag_acc = {{ (1.0 / components|length)|round(2) }}
{%- endif -%}
{%- if "parser" in components %}
dep_uas = 0.0
dep_las = {{ (1.0 / components|length)|round(2) }}
sents_f = 0.0
{%- endif %}
{%- if "ner" in components %}
ents_f = {{ (1.0 / components|length)|round(2) }}
ents_p = 0.0
ents_r = 0.0
{%- endif -%}

View File

@ -0,0 +1,3 @@
# These are used to compile the training quickstart config
jinja2
typer

1
website/setup/setup.sh Executable file
View File

@ -0,0 +1 @@
#!/bin/sh
# Compile the quickstart Jinja template into the JS generator module that the
# training quickstart widget imports.
# The paths below are relative to this script's directory, so switch there
# first; this keeps the script working no matter where it is invoked from.
cd "$(dirname "$0")" || exit 1
python jinja_to_js.py quickstart_training_cpu.jinja ../src/widgets/quickstart-training-generator.js

View File

@ -15,24 +15,18 @@ function getNewChecked(optionId, checkedForId, multiple) {
return [...checkedForId, optionId]
}
// Collect the text of every child node of the referenced element that is not
// hidden (offsetParent !== null) and return it as newline-separated text.
// Returns an empty string when the ref has no element or no child nodes.
function getRawContent(ref) {
    const container = ref.current
    if (!container || !container.childNodes) return ''
    const visibleText = []
    for (const node of container.childNodes) {
        // Hidden elements report offsetParent === null; everything else is kept
        if (node.offsetParent !== null) visibleText.push(node.textContent)
    }
    return visibleText.join('\n')
}
const Quickstart = ({
data = [],
title,
description,
copy = true,
download,
rawContent = null,
id = 'quickstart',
setters = {},
hidePrompts,
small,
codeLang,
children,
}) => {
const contentRef = useRef()
@ -46,6 +40,16 @@ const Quickstart = ({
const [copySuccess, setCopySuccess] = useState(false)
const [otherState, setOtherState] = useState({})
const setOther = (id, value) => setOtherState({ ...otherState, [id]: value })
// Text used for copy/download: an explicitly provided rawContent wins,
// otherwise the text of all non-hidden child nodes of the referenced element
// is joined with newlines. Falls back to an empty string.
const getRawContent = ref => {
    if (rawContent !== null) {
        return rawContent
    }
    const container = ref.current
    if (!(container && container.childNodes)) {
        return ''
    }
    // Hidden elements report offsetParent === null; everything else is kept
    const visibleNodes = [...container.childNodes].filter(node => node.offsetParent !== null)
    const lines = visibleNodes.map(node => node.textContent)
    return lines.join('\n')
}
const onClickCopy = () => {
copyAreaRef.current.value = getRawContent(contentRef)
copyToClipboard(copyAreaRef, setCopySuccess)
@ -210,7 +214,14 @@ const Quickstart = ({
}
)}
<pre className={classes.code}>
<code className={classes.results} data-quickstart-results="" ref={contentRef}>
<code
className={classNames(classes.results, {
[classes.small]: !!small,
[`language-${codeLang}`]: !!codeLang,
})}
data-quickstart-results=""
ref={contentRef}
>
{children}
</code>

View File

@ -41,6 +41,6 @@ Search.propTypes = {
apiKey: PropTypes.string.isRequired,
indexName: PropTypes.string.isRequired,
}).isRequired,
id: PropTypes.string.isRequired,
placeholder: PropTypes.string.isRequired,
id: PropTypes.string,
placeholder: PropTypes.string,
}

View File

@ -124,6 +124,16 @@
& > span
display: block
.small
font-size: var(--font-size-sm)
line-height: 1.65
white-space: pre
max-height: 400px
overflow-y: auto
& > span
display: inline
.hide-prompts .prompt:before
content: initial !important

View File

@ -0,0 +1,10 @@
// Build output: setup/setup.sh compiles quickstart_training_cpu.jinja into this
// module with jinja_to_js.py, and `npm run python:setup` runs before build/dev.
// Hand edits here are overwritten on the next run unless the template changes too.
import jinjaToJS from "jinja-to-js";

/**
 * Render the partial CPU training config as a string.
 *
 * @param ctx Template context. Reads `lang`, `pipeline`, `optimize`,
 *     `has_letters` and `components` (must support `.includes()`).
 * @returns The rendered config text.
 */
export default function templateQuickstartTrainingCpu(ctx) {
    var __result = "";
    var __tmp;
    var __runtime = jinjaToJS.runtime;
    var __filters = jinjaToJS.filters;
    var __globals = jinjaToJS.globals;
    var context = jinjaToJS.createContext(ctx);
    __result += "\n# This is an auto-generated partial config for training a model.\n# To use it for training, auto-fill it with all default values.\n# python -m spacy init config config.cfg --base base_config.cfg\n[paths]\ntrain = \"\"\ndev = \"\"\n\n[nlp]\nlang = \"";
    __result += "" + __runtime.escape((__tmp = (context.lang)) == null ? "" : __tmp);
    __result += "\"\npipeline = ";
    __result += "" + ((__tmp = (context.pipeline)) == null ? "" : __tmp);
    __result += "\nvectors = ";
    __result += "" + ((__tmp = ((context.optimize==="accuracy" ? "\"en_vectors_web_lg\"" : false))) == null ? "" : __tmp);
    __result += "\ntokenizer = {\"@tokenizers\": \"spacy.Tokenizer.v1\"}\n\n[components]\n\n[components.tok2vec]\nfactory = \"tok2vec\"\n\n[components.tok2vec.model]\n@architectures = \"spacy.Tok2Vec.v1\"\n\n[components.tok2vec.model.embed]\n@architectures = \"spacy.MultiHashEmbed.v1\"\nwidth = ${components.tok2vec.model.encode:width}\nrows = ";
    __result += "" + __runtime.escape((__tmp = ((context.optimize==="efficiency" ? 2000 : 7000))) == null ? "" : __tmp);
    __result += "\nalso_embed_subwords = ";
    __result += "" + __runtime.escape((__tmp = ((context.has_letters ? true : false))) == null ? "" : __tmp);
    __result += "\nalso_use_static_vectors = ";
    __result += "" + __runtime.escape((__tmp = ((context.optimize==="accuracy" ? true : false))) == null ? "" : __tmp);
    __result += "\n\n[components.tok2vec.model.encode]\n@architectures = \"spacy.MaxoutWindowEncoder.v1\"\nwidth = ";
    __result += "" + __runtime.escape((__tmp = ((context.optimize==="efficiency" ? 96 : 256))) == null ? "" : __tmp);
    __result += "\ndepth = ";
    __result += "" + __runtime.escape((__tmp = ((context.optimize==="efficiency" ? 4 : 8))) == null ? "" : __tmp);
    __result += "\nwindow_size = 1\nmaxout_pieces = 3\n\n";
    if(context.components.includes("tagger")){
        __result += "\n[components.tagger]\nfactory = \"tagger\"\n\n[components.tagger.model]\n@architectures = \"spacy.Tagger.v1\"\nnO = null\n\n[components.tagger.model.tok2vec]\n@architectures = \"spacy.Tok2VecListener.v1\"\nwidth = ${components.tok2vec.model.encode:width}";
    }
    __result += "\n\n";
    if(context.components.includes("parser")){
        __result += "[components.parser]\nfactory = \"parser\"\n\n[components.parser.model]\n@architectures = \"spacy.TransitionBasedParser.v1\"\nnr_feature_tokens = 8\nhidden_width = 128\nmaxout_pieces = 3\nuse_upper = true\nnO = null\n\n[components.parser.model.tok2vec]\n@architectures = \"spacy.Tok2VecListener.v1\"\nwidth = ${components.tok2vec.model.encode:width}";
    }
    __result += "\n\n";
    if(context.components.includes("ner")){
        // The listener block belongs to the ner component; it was previously
        // emitted under components.parser, duplicating the parser's section.
        __result += "[components.ner]\nfactory = \"ner\"\n\n[components.ner.model]\n@architectures = \"spacy.TransitionBasedParser.v1\"\nnr_feature_tokens = 6\nhidden_width = 64\nmaxout_pieces = 2\nuse_upper = true\nnO = null\n\n[components.ner.model.tok2vec]\n@architectures = \"spacy.Tok2VecListener.v1\"\nwidth = ${components.tok2vec.model.encode:width}\n";
    }
    __result += "[training]\n\n[training.train_corpus]\n@readers = \"spacy.Corpus.v1\"\npath = ${paths:train}\n\n[training.dev_corpus]\n@readers = \"spacy.Corpus.v1\"\npath = ${paths:dev}\n\n[training.score_weights]";
    // Each selected component's main metric gets 1 / (number of components),
    // rounded to two decimals.
    if(context.components.includes("tagger")){
        __result += "\ntag_acc = ";
        __result += "" + __runtime.escape((__tmp = (Math.round((1.0 / __filters.size(context.components)+ Number.EPSILON) * 100) / 100)) == null ? "" : __tmp);
    }
    if(context.components.includes("parser")){
        __result += "\ndep_uas = 0.0\ndep_las = ";
        __result += "" + __runtime.escape((__tmp = (Math.round((1.0 / __filters.size(context.components)+ Number.EPSILON) * 100) / 100)) == null ? "" : __tmp);
        __result += "\nsents_f = 0.0";
    }
    if(context.components.includes("ner")){
        __result += "\nents_f = ";
        __result += "" + __runtime.escape((__tmp = (Math.round((1.0 / __filters.size(context.components)+ Number.EPSILON) * 100) / 100)) == null ? "" : __tmp);
        __result += "\nents_p = 0.0\nents_r = 0.0";
    }
    return __result;
}

View File

@ -1,13 +1,16 @@
import React, { useState } from 'react'
import { StaticQuery, graphql } from 'gatsby'
import highlightCode from 'gatsby-remark-prismjs/highlight-code.js'
import { Quickstart, QS } from '../components/quickstart'
import generator from './quickstart-training-generator'
import { isString, htmlToReact } from '../components/util'
// Initial widget selections. They seed the component state, and the hardware /
// optimize defaults also decide which option is pre-checked in DATA.
const DEFAULT_LANG = 'en'
const DEFAULT_HARDWARE = 'gpu'
const DEFAULT_OPT = 'efficiency'
// Pipeline component names; presumably the choices offered by the "components"
// option (its definition is not visible in this view, so confirm).
const COMPONENTS = ['tagger', 'parser', 'ner', 'textcat']
// Header comment for the partial config, telling users how to auto-fill it.
const COMMENT = `# This is an auto-generated partial config for training a model.
# To use it for training, auto-fill it with all default values.
# python -m spacy init config config.cfg --base base_config.cfg`
const DATA = [
{
id: 'lang',
@ -25,9 +28,8 @@ const DATA = [
id: 'hardware',
title: 'Hardware',
options: [
{ id: 'cpu-only', title: 'CPU only' },
{ id: 'cpu', title: 'CPU preferred' },
{ id: 'gpu', title: 'GPU', checked: true },
{ id: 'cpu', title: 'CPU preferred', checked: DEFAULT_HARDWARE === 'cpu' },
{ id: 'gpu', title: 'GPU', checked: DEFAULT_HARDWARE === 'gpu' },
],
},
{
@ -35,28 +37,42 @@ const DATA = [
title: 'Optimize for',
help: '...',
options: [
{ id: 'efficiency', title: 'efficiency', checked: true },
{ id: 'accuracy', title: 'accuracy' },
{ id: 'efficiency', title: 'efficiency', checked: DEFAULT_OPT === 'efficiency' },
{ id: 'accuracy', title: 'accuracy', checked: DEFAULT_OPT === 'accuracy' },
],
},
{
id: 'config',
title: 'Configuration',
options: [
{
id: 'independent',
title: 'independent components',
help: "Make components independent and don't share weights",
},
],
multiple: true,
},
]
// Serialize a value for use in the generated config.
// - Strings starting with "${" are variable references and are returned as-is.
// - Arrays are JSON-encoded with a space after every comma.
// - Everything else is plain JSON.
function stringify(value) {
    const isVariableReference = isString(value) && value.startsWith('${')
    if (isVariableReference) {
        return value
    }
    const serialized = JSON.stringify(value)
    return Array.isArray(value) ? serialized.replace(/,/g, ', ') : serialized
}
export default function QuickstartTraining({ id, title, download = 'config.cfg' }) {
const [lang, setLang] = useState(DEFAULT_LANG)
const [pipeline, setPipeline] = useState([])
const setters = { lang: setLang, components: setPipeline }
const [components, setComponents] = useState([])
const [[hardware], setHardware] = useState([DEFAULT_HARDWARE])
const [[optimize], setOptimize] = useState([DEFAULT_OPT])
const setters = {
lang: setLang,
components: setComponents,
hardware: setHardware,
optimize: setOptimize,
}
const content = generator({
lang,
pipeline: stringify(components),
components,
optimize,
hardware,
})
const rawContent = content.trim().replace(/\n\n\n+/g, '\n\n')
const displayContent = highlightCode('ini', rawContent)
.split('\n')
.map(line => (line.startsWith('#') ? `<span class="token comment">${line}</span>` : line))
.join('\n')
return (
<StaticQuery
query={query}
@ -66,47 +82,19 @@ export default function QuickstartTraining({ id, title, download = 'config.cfg'
id: code,
title: name,
}))
const recommendedTrf = Object.assign(
{},
...langs.map(({ code }) => ({ [code]: { sm: 'TODO', lg: 'TODO' } }))
)
return (
<Quickstart
download={download}
rawContent={content}
data={DATA}
title={title}
id={id}
setters={setters}
hidePrompts
small
codeLang="ini"
>
<QS comment>{COMMENT}</QS>
<span>[paths]</span>
<span>train = ""</span>
<span>dev = ""</span>
<br />
<span>[nlp]</span>
<span>lang = "{lang}"</span>
<span>pipeline = {JSON.stringify(pipeline).replace(/,/g, ', ')}</span>
<br />
<span>[components]</span>
<br />
<span>[components.transformer]</span>
<QS optimize="efficiency">name = "{recommendedTrf[lang].sm}"</QS>
<QS optimize="accuracy">name = "{recommendedTrf[lang].lg}"</QS>
{!!pipeline.length && <br />}
{pipeline.map((pipe, i) => (
<>
{i !== 0 && <br />}
<span>[components.{pipe}]</span>
<span>factory = "{pipe}"</span>
<QS config="independent">
<br />
[components.parser.model.tok2vec]
<br />
@architectures = "spacy.Tok2Vec.v1"
</QS>
</>
))}
{htmlToReact(displayContent)}
</Quickstart>
)
}}