Update docs and quickstart widget [ci skip]

2025-08-09 06:34:54 +03:00 · 2020-08-13 01:17:40 +02:00 · 2020-08-13 01:17:40 +02:00 · 7d526d0d40
commit 7d526d0d40
parent ba84371ab0
11 changed files with 1418 additions and 71 deletions
--- a/website/package-lock.json
+++ b/website/package-lock.json
@ -17455,6 +17455,11 @@
                }
            }
        },
+        "jinja-to-js": {
+            "version": "3.2.3",
+            "resolved": "https://registry.npmjs.org/jinja-to-js/-/jinja-to-js-3.2.3.tgz",
+            "integrity": "sha512-ktEBxQG17fYaFcHThB719+EbePBx+AkkORQMyuP0UuLPS2zx8uJXP5CsItXjUUwMHFPj3hCRkyqEYzLbeklYgQ=="
+        },
        "jpeg-js": {
            "version": "0.2.0",
            "resolved": "https://registry.npmjs.org/jpeg-js/-/jpeg-js-0.2.0.tgz",
--- a/website/package.json
+++ b/website/package.json
@ -41,6 +41,7 @@
        "gatsby-transformer-sharp": "^2.1.13",
        "html-to-react": "^1.3.4",
        "intersection-observer": "^0.5.1",
+        "jinja-to-js": "^3.2.3",
        "node-sass": "^4.11.0",
        "parse-numeric-range": "0.0.2",
        "prismjs": "^1.15.0",
@ -52,20 +53,22 @@
        "remark-react": "^5.0.1"
    },
    "scripts": {
-        "build": "gatsby build",
-        "dev": "gatsby develop",
+        "build": "npm run python:setup && gatsby build",
+        "dev": "npm run python:setup && gatsby develop",
        "dev:nightly": "BRANCH=nightly.spacy.io npm run dev",
        "lint": "eslint **",
        "clear": "rm -rf .cache",
-        "test": "echo \"Write tests! -> https://gatsby.app/unit-testing\""
+        "test": "echo \"Write tests! -> https://gatsby.app/unit-testing\"",
+        "python:install": "pip install -r setup/requirements.txt",
+        "python:setup": "cd setup && ./setup.sh"
    },
    "devDependencies": {
+        "@sindresorhus/slugify": "^0.8.0",
        "browser-monads": "^1.0.0",
        "md-attr-parser": "^1.2.1",
        "prettier": "^1.16.4",
        "raw-loader": "^1.0.0",
-        "unist-util-visit": "^1.4.0",
-        "@sindresorhus/slugify": "^0.8.0"
+        "unist-util-visit": "^1.4.0"
    },
    "repository": {
        "type": "git",
--- a/website/setup/jinja_to_js.py
+++ b/website/setup/jinja_to_js.py
--- a/website/setup/quickstart_training_cpu.jinja
+++ b/website/setup/quickstart_training_cpu.jinja
@ -0,0 +1,107 @@
+{# Template for "CPU" configs. The transformer will use a different template. #}
+# This is an auto-generated partial config for training a model.
+# To use it for training, auto-fill it with all default values.
+# python -m spacy init config config.cfg --base base_config.cfg
+[paths]
+train = ""
+dev = ""
+
+[nlp]
+lang = "{{ lang }}"
+pipeline = {{ pipeline|safe }}
+vectors = {{ ('"en_vectors_web_lg"' if optimize == "accuracy" else false)|safe }}
+tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
+
+[components]
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.Tok2Vec.v1"
+
+[components.tok2vec.model.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
+width = ${components.tok2vec.model.encode:width}
+rows = {{ 2000 if optimize == "efficiency" else 7000 }}
+also_embed_subwords = {{ true if has_letters else false }}
+also_use_static_vectors = {{ true if optimize == "accuracy" else false }}
+
+[components.tok2vec.model.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v1"
+width = {{ 96 if optimize == "efficiency" else 256 }}
+depth = {{ 4 if optimize == "efficiency" else 8 }}
+window_size = 1
+maxout_pieces = 3
+
+{% if "tagger" in components %}
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode:width}
+{%- endif %}
+
+{% if "parser" in components -%}
+[components.parser]
+factory = "parser"
+
+[components.parser.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 8
+hidden_width = 128
+maxout_pieces = 3
+use_upper = true
+nO = null
+
+[components.parser.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode:width}
+{%- endif %}
+
+{% if "ner" in components -%}
+[components.ner]
+factory = "ner"
+
+[components.ner.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 6
+hidden_width = 64
+maxout_pieces = 2
+use_upper = true
+nO = null
+
+[components.ner.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode:width}
+{% endif -%}
+
+[training]
+
+[training.train_corpus]
+@readers = "spacy.Corpus.v1"
+path = ${paths:train}
+
+[training.dev_corpus]
+@readers = "spacy.Corpus.v1"
+path = ${paths:dev}
+
+[training.score_weights]
+{%- if "tagger" in components %}
+tag_acc = {{ (1.0 / components|length)|round() }}
+{%- endif -%}
+{%- if "parser" in components %}
+dep_uas = 0.0
+dep_las = {{ (1.0 / components|length)|round() }}
+sents_f = 0.0
+{%- endif %}
+{%- if "ner" in components %}
+ents_f = {{ (1.0 / components|length)|round() }}
+ents_p = 0.0
+ents_r = 0.0
+{%- endif -%}
--- a/website/setup/requirements.txt
+++ b/website/setup/requirements.txt
@ -0,0 +1,3 @@
+# These are used to compile the training quickstart config
+jinja2
+typer
--- a/website/setup/setup.sh
+++ b/website/setup/setup.sh
@ -0,0 +1 @@
+python jinja_to_js.py quickstart_training_cpu.jinja ../src/widgets/quickstart-training-generator.js
--- a/website/src/components/quickstart.js
+++ b/website/src/components/quickstart.js
@ -15,24 +15,18 @@ function getNewChecked(optionId, checkedForId, multiple) {
    return [...checkedForId, optionId]
 }

-function getRawContent(ref) {
-    if (ref.current && ref.current.childNodes) {
-        // Select all currently visible nodes (spans and text nodes)
-        const result = [...ref.current.childNodes].filter(el => el.offsetParent !== null)
-        return result.map(el => el.textContent).join('\n')
-    }
-    return ''
-}
-
 const Quickstart = ({
    data = [],
    title,
    description,
    copy = true,
    download,
+    rawContent = null,
    id = 'quickstart',
    setters = {},
    hidePrompts,
+    small,
+    codeLang,
    children,
 }) => {
    const contentRef = useRef()
@ -46,6 +40,16 @@ const Quickstart = ({
    const [copySuccess, setCopySuccess] = useState(false)
    const [otherState, setOtherState] = useState({})
    const setOther = (id, value) => setOtherState({ ...otherState, [id]: value })
+    const getRawContent = ref => {
+        if (rawContent !== null) return rawContent
+        if (ref.current && ref.current.childNodes) {
+            // Select all currently visible nodes (spans and text nodes)
+            const result = [...ref.current.childNodes].filter(el => el.offsetParent !== null)
+            return result.map(el => el.textContent).join('\n')
+        }
+        return ''
+    }
+
    const onClickCopy = () => {
        copyAreaRef.current.value = getRawContent(contentRef)
        copyToClipboard(copyAreaRef, setCopySuccess)
@ -210,7 +214,14 @@ const Quickstart = ({
                    }
                )}
                <pre className={classes.code}>
-                    <code className={classes.results} data-quickstart-results="" ref={contentRef}>
+                    <code
+                        className={classNames(classes.results, {
+                            [classes.small]: !!small,
+                            [`language-${codeLang}`]: !!codeLang,
+                        })}
+                        data-quickstart-results=""
+                        ref={contentRef}
+                    >
                        {children}
                    </code>

--- a/website/src/components/search.js
+++ b/website/src/components/search.js
@ -41,6 +41,6 @@ Search.propTypes = {
        apiKey: PropTypes.string.isRequired,
        indexName: PropTypes.string.isRequired,
    }).isRequired,
-    id: PropTypes.string.isRequired,
-    placeholder: PropTypes.string.isRequired,
+    id: PropTypes.string,
+    placeholder: PropTypes.string,
 }
--- a/website/src/styles/quickstart.module.sass
+++ b/website/src/styles/quickstart.module.sass
@ -124,6 +124,16 @@
    & > span
        display: block

+.small
+    font-size: var(--font-size-sm)
+    line-height: 1.65
+    white-space: pre
+    max-height: 400px
+    overflow-y: auto
+
+    & > span
+        display: inline
+
 .hide-prompts .prompt:before
    content: initial !important

--- a/website/src/widgets/quickstart-training-generator.js
+++ b/website/src/widgets/quickstart-training-generator.js
@ -0,0 +1,10 @@
+import jinjaToJS from "jinja-to-js";export default function templateQuickstartTrainingCpu(ctx) {
+    var __result = "";
+    var __tmp;
+    var __runtime = jinjaToJS.runtime;
+    var __filters = jinjaToJS.filters;
+    var __globals = jinjaToJS.globals;
+    var context = jinjaToJS.createContext(ctx);
+    __result += "\n# This is an auto-generated partial config for training a model.\n# To use it for training, auto-fill it with all default values.\n# python -m spacy init config config.cfg --base base_config.cfg\n[paths]\ntrain = \"\"\ndev = \"\"\n\n[nlp]\nlang = \"";__result += "" + __runtime.escape((__tmp = (context.lang)) == null ? "" : __tmp);__result += "\"\npipeline = ";__result += "" + ((__tmp = (context.pipeline)) == null ? "" : __tmp);__result += "\nvectors = ";__result += "" + ((__tmp = ((context.optimize==="accuracy" ? "\"en_vectors_web_lg\"" : false))) == null ? "" : __tmp);__result += "\ntokenizer = {\"@tokenizers\": \"spacy.Tokenizer.v1\"}\n\n[components]\n\n[components.tok2vec]\nfactory = \"tok2vec\"\n\n[components.tok2vec.model]\n@architectures = \"spacy.Tok2Vec.v1\"\n\n[components.tok2vec.model.embed]\n@architectures = \"spacy.MultiHashEmbed.v1\"\nwidth = ${components.tok2vec.model.encode:width}\nrows = ";__result += "" + __runtime.escape((__tmp = ((context.optimize==="efficiency" ? 2000 : 7000))) == null ? "" : __tmp);__result += "\nalso_embed_subwords = ";__result += "" + __runtime.escape((__tmp = ((context.has_letters ? true : false))) == null ? "" : __tmp);__result += "\nalso_use_static_vectors = ";__result += "" + __runtime.escape((__tmp = ((context.optimize==="accuracy" ? true : false))) == null ? "" : __tmp);__result += "\n\n[components.tok2vec.model.encode]\n@architectures = \"spacy.MaxoutWindowEncoder.v1\"\nwidth = ";__result += "" + __runtime.escape((__tmp = ((context.optimize==="efficiency" ? 96 : 256))) == null ? "" : __tmp);__result += "\ndepth = ";__result += "" + __runtime.escape((__tmp = ((context.optimize==="efficiency" ? 4 : 8))) == null ? 
"" : __tmp);__result += "\nwindow_size = 1\nmaxout_pieces = 3\n\n";if(context.components.includes("tagger")){__result += "\n[components.tagger]\nfactory = \"tagger\"\n\n[components.tagger.model]\n@architectures = \"spacy.Tagger.v1\"\nnO = null\n\n[components.tagger.model.tok2vec]\n@architectures = \"spacy.Tok2VecListener.v1\"\nwidth = ${components.tok2vec.model.encode:width}";}__result += "\n\n";if(context.components.includes("parser")){__result += "[components.parser]\nfactory = \"parser\"\n\n[components.parser.model]\n@architectures = \"spacy.TransitionBasedParser.v1\"\nnr_feature_tokens = 8\nhidden_width = 128\nmaxout_pieces = 3\nuse_upper = true\nnO = null\n\n[components.parser.model.tok2vec]\n@architectures = \"spacy.Tok2VecListener.v1\"\nwidth = ${components.tok2vec.model.encode:width}";}__result += "\n\n";if(context.components.includes("ner")){__result += "[components.ner]\nfactory = \"ner\"\n\n[components.ner.model]\n@architectures = \"spacy.TransitionBasedParser.v1\"\nnr_feature_tokens = 6\nhidden_width = 64\nmaxout_pieces = 2\nuse_upper = true\nnO = null\n\n[components.parser.model.tok2vec]\n@architectures = \"spacy.Tok2VecListener.v1\"\nwidth = ${components.tok2vec.model.encode:width}\n";}__result += "[training]\n\n[training.train_corpus]\n@readers = \"spacy.Corpus.v1\"\npath = ${paths:train}\n\n[training.dev_corpus]\n@readers = \"spacy.Corpus.v1\"\npath = ${paths:dev}\n\n[training.score_weights]";if(context.components.includes("tagger")){__result += "\ntag_acc = ";__result += "" + __runtime.escape((__tmp = (Math.round((1.0 / __filters.size(context.components)+ Number.EPSILON) * 100) / 100)) == null ? "" : __tmp);}if(context.components.includes("parser")){__result += "\ndep_uas = 0.0\ndep_las = ";__result += "" + __runtime.escape((__tmp = (Math.round((1.0 / __filters.size(context.components)+ Number.EPSILON) * 100) / 100)) == null ? 
"" : __tmp);__result += "\nsents_f = 0.0";}if(context.components.includes("ner")){__result += "\nents_f = ";__result += "" + __runtime.escape((__tmp = (Math.round((1.0 / __filters.size(context.components)+ Number.EPSILON) * 100) / 100)) == null ? "" : __tmp);__result += "\nents_p = 0.0\nents_r = 0.0";}
+    return __result;
+}
--- a/website/src/widgets/quickstart-training.js
+++ b/website/src/widgets/quickstart-training.js
@ -1,13 +1,16 @@
 import React, { useState } from 'react'
 import { StaticQuery, graphql } from 'gatsby'
+import highlightCode from 'gatsby-remark-prismjs/highlight-code.js'

 import { Quickstart, QS } from '../components/quickstart'
+import generator from './quickstart-training-generator'
+import { isString, htmlToReact } from '../components/util'

 const DEFAULT_LANG = 'en'
+const DEFAULT_HARDWARE = 'gpu'
+const DEFAULT_OPT = 'efficiency'
 const COMPONENTS = ['tagger', 'parser', 'ner', 'textcat']
-const COMMENT = `# This is an auto-generated partial config for training a model.
-# To use it for training, auto-fill it with all default values.
-# python -m spacy init config config.cfg --base base_config.cfg`
+
 const DATA = [
    {
        id: 'lang',
@ -25,9 +28,8 @@ const DATA = [
        id: 'hardware',
        title: 'Hardware',
        options: [
-            { id: 'cpu-only', title: 'CPU only' },
-            { id: 'cpu', title: 'CPU preferred' },
-            { id: 'gpu', title: 'GPU', checked: true },
+            { id: 'cpu', title: 'CPU preferred', checked: DEFAULT_HARDWARE === 'cpu' },
+            { id: 'gpu', title: 'GPU', checked: DEFAULT_HARDWARE === 'gpu' },
        ],
    },
    {
@ -35,28 +37,42 @@ const DATA = [
        title: 'Optimize for',
        help: '...',
        options: [
-            { id: 'efficiency', title: 'efficiency', checked: true },
-            { id: 'accuracy', title: 'accuracy' },
+            { id: 'efficiency', title: 'efficiency', checked: DEFAULT_OPT === 'efficiency' },
+            { id: 'accuracy', title: 'accuracy', checked: DEFAULT_OPT === 'accuracy' },
        ],
    },
-    {
-        id: 'config',
-        title: 'Configuration',
-        options: [
-            {
-                id: 'independent',
-                title: 'independent components',
-                help: "Make components independent and don't share weights",
-            },
-        ],
-        multiple: true,
-    },
 ]

+function stringify(value) {
+    if (isString(value) && value.startsWith('${')) return value
+    const string = JSON.stringify(value)
+    if (Array.isArray(value)) return string.replace(/,/g, ', ')
+    return string
+}
+
 export default function QuickstartTraining({ id, title, download = 'config.cfg' }) {
    const [lang, setLang] = useState(DEFAULT_LANG)
-    const [pipeline, setPipeline] = useState([])
-    const setters = { lang: setLang, components: setPipeline }
+    const [components, setComponents] = useState([])
+    const [[hardware], setHardware] = useState([DEFAULT_HARDWARE])
+    const [[optimize], setOptimize] = useState([DEFAULT_OPT])
+    const setters = {
+        lang: setLang,
+        components: setComponents,
+        hardware: setHardware,
+        optimize: setOptimize,
+    }
+    const content = generator({
+        lang,
+        pipeline: stringify(components),
+        components,
+        optimize,
+        hardware,
+    })
+    const rawContent = content.trim().replace(/\n\n\n+/g, '\n\n')
+    const displayContent = highlightCode('ini', rawContent)
+        .split('\n')
+        .map(line => (line.startsWith('#') ? `<span class="token comment">${line}</span>` : line))
+        .join('\n')
    return (
        <StaticQuery
            query={query}
@ -66,47 +82,19 @@ export default function QuickstartTraining({ id, title, download = 'config.cfg'
                    id: code,
                    title: name,
                }))
-                const recommendedTrf = Object.assign(
-                    {},
-                    ...langs.map(({ code }) => ({ [code]: { sm: 'TODO', lg: 'TODO' } }))
-                )
                return (
                    <Quickstart
                        download={download}
+                        rawContent={content}
                        data={DATA}
                        title={title}
                        id={id}
                        setters={setters}
                        hidePrompts
+                        small
+                        codeLang="ini"
                    >
-                        <QS comment>{COMMENT}</QS>
-                        <span>[paths]</span>
-                        <span>train = ""</span>
-                        <span>dev = ""</span>
-                        <br />
-                        <span>[nlp]</span>
-                        <span>lang = "{lang}"</span>
-                        <span>pipeline = {JSON.stringify(pipeline).replace(/,/g, ', ')}</span>
-                        <br />
-                        <span>[components]</span>
-                        <br />
-                        <span>[components.transformer]</span>
-                        <QS optimize="efficiency">name = "{recommendedTrf[lang].sm}"</QS>
-                        <QS optimize="accuracy">name = "{recommendedTrf[lang].lg}"</QS>
-                        {!!pipeline.length && <br />}
-                        {pipeline.map((pipe, i) => (
-                            <>
-                                {i !== 0 && <br />}
-                                <span>[components.{pipe}]</span>
-                                <span>factory = "{pipe}"</span>
-                                <QS config="independent">
-                                    <br />
-                                    [components.parser.model.tok2vec]
-                                    <br />
-                                    @architectures = "spacy.Tok2Vec.v1"
-                                </QS>
-                            </>
-                        ))}
+                        {htmlToReact(displayContent)}
                    </Quickstart>
                )
            }}
				`@ -0,0 +1 @@`
				`python jinja_to_js.py quickstart_training_cpu.jinja ../src/widgets/quickstart-training-generator.js`