Update docs and quickstart widget [ci skip]

2025-10-22 03:34:15 +03:00 · 2020-08-13 01:17:40 +02:00 · 2020-08-13 01:17:40 +02:00 · 7d526d0d40
commit 7d526d0d40
parent ba84371ab0
11 changed files with 1418 additions and 71 deletions
--- a/website/package-lock.json
+++ b/website/package-lock.json
@ -17455,6 +17455,11 @@
                }
            }
        },
        "jinja-to-js": {
            "version": "3.2.3",
            "resolved": "https://registry.npmjs.org/jinja-to-js/-/jinja-to-js-3.2.3.tgz",
            "integrity": "sha512-ktEBxQG17fYaFcHThB719+EbePBx+AkkORQMyuP0UuLPS2zx8uJXP5CsItXjUUwMHFPj3hCRkyqEYzLbeklYgQ=="
        },
        "jpeg-js": {
            "version": "0.2.0",
            "resolved": "https://registry.npmjs.org/jpeg-js/-/jpeg-js-0.2.0.tgz",
--- a/website/package.json
+++ b/website/package.json
@ -41,6 +41,7 @@
        "gatsby-transformer-sharp": "^2.1.13",
        "html-to-react": "^1.3.4",
        "intersection-observer": "^0.5.1",
        "jinja-to-js": "^3.2.3",
        "node-sass": "^4.11.0",
        "parse-numeric-range": "0.0.2",
        "prismjs": "^1.15.0",
@ -52,20 +53,22 @@
        "remark-react": "^5.0.1"
    },
    "scripts": {
-        "build": "gatsby build",
+        "build": "npm run python:setup && gatsby build",
-        "dev": "gatsby develop",
+        "dev": "npm run python:setup && gatsby develop",
        "dev:nightly": "BRANCH=nightly.spacy.io npm run dev",
        "lint": "eslint **",
        "clear": "rm -rf .cache",
-        "test": "echo \"Write tests! -> https://gatsby.app/unit-testing\""
+        "test": "echo \"Write tests! -> https://gatsby.app/unit-testing\"",
        "python:install": "pip install -r setup/requirements.txt",
        "python:setup": "cd setup && ./setup.sh"
    },
    "devDependencies": {
        "@sindresorhus/slugify": "^0.8.0",
        "browser-monads": "^1.0.0",
        "md-attr-parser": "^1.2.1",
        "prettier": "^1.16.4",
        "raw-loader": "^1.0.0",
-        "unist-util-visit": "^1.4.0",
+        "unist-util-visit": "^1.4.0"
        "@sindresorhus/slugify": "^0.8.0"
    },
    "repository": {
        "type": "git",
--- a/website/setup/jinja_to_js.py
+++ b/website/setup/jinja_to_js.py
--- a/website/setup/quickstart_training_cpu.jinja
+++ b/website/setup/quickstart_training_cpu.jinja
@ -0,0 +1,107 @@
 {# Template for "CPU" configs. The transformer will use a different template. #}
 # This is an auto-generated partial config for training a model.
 # To use it for training, auto-fill it with all default values.
 # python -m spacy init config config.cfg --base base_config.cfg
 [paths]
 train = ""
 dev = ""
 [nlp]
 lang = "{{ lang }}"
 pipeline = {{ pipeline|safe }}
 vectors = {{ ('"en_vectors_web_lg"' if optimize == "accuracy" else false)|safe }}
 tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
 [components]
 [components.tok2vec]
 factory = "tok2vec"
 [components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"
 [components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
 width = ${components.tok2vec.model.encode:width}
 rows = {{ 2000 if optimize == "efficiency" else 7000 }}
 also_embed_subwords = {{ true if has_letters else false }}
 also_use_static_vectors = {{ true if optimize == "accuracy" else false }}
 [components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
 width = {{ 96 if optimize == "efficiency" else 256 }}
 depth = {{ 4 if optimize == "efficiency" else 8 }}
 window_size = 1
 maxout_pieces = 3
 {% if "tagger" in components %}
 [components.tagger]
 factory = "tagger"
 [components.tagger.model]
@architectures = "spacy.Tagger.v1"
 nO = null
 [components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
 width = ${components.tok2vec.model.encode:width}
 {%- endif %}
 {% if "parser" in components -%}
 [components.parser]
 factory = "parser"
 [components.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
 nr_feature_tokens = 8
 hidden_width = 128
 maxout_pieces = 3
 use_upper = true
 nO = null
 [components.parser.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
 width = ${components.tok2vec.model.encode:width}
 {%- endif %}
 {% if "ner" in components -%}
 {#- The tok2vec listener below must be registered under components.ner (not components.parser). -#}
 [components.ner]
 factory = "ner"
 [components.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
 nr_feature_tokens = 6
 hidden_width = 64
 maxout_pieces = 2
 use_upper = true
 nO = null
 [components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
 width = ${components.tok2vec.model.encode:width}
 {% endif -%}
 [training]
 [training.train_corpus]
@readers = "spacy.Corpus.v1"
 path = ${paths:train}
 [training.dev_corpus]
@readers = "spacy.Corpus.v1"
 path = ${paths:dev}
 [training.score_weights]
 {%- if "tagger" in components %}
 tag_acc = {{ (1.0 / components|length)|round() }}
 {%- endif -%}
 {%- if "parser" in components %}
 dep_uas = 0.0
 dep_las = {{ (1.0 / components|length)|round() }}
 sents_f = 0.0
 {%- endif %}
 {%- if "ner" in components %}
 ents_f = {{ (1.0 / components|length)|round() }}
 ents_p = 0.0
 ents_r = 0.0
 {%- endif -%}
--- a/website/setup/requirements.txt
+++ b/website/setup/requirements.txt
@ -0,0 +1,3 @@
 # These are used to compile the training quickstart config
 jinja2
 typer
--- a/website/setup/setup.sh
+++ b/website/setup/setup.sh
@ -0,0 +1 @@
 python jinja_to_js.py quickstart_training_cpu.jinja ../src/widgets/quickstart-training-generator.js
--- a/website/src/components/quickstart.js
+++ b/website/src/components/quickstart.js
@ -15,24 +15,18 @@ function getNewChecked(optionId, checkedForId, multiple) {
    return [...checkedForId, optionId]
 }
 /**
  * Build the plain-text version of the quickstart result.
  *
  * @param {{ current: ?{ childNodes: Iterable } }} ref - React ref to the results element.
  * @returns {string} Text of the visible child nodes joined by newlines, or
  *     an empty string if the ref is not attached or has no child nodes.
  */
 function getRawContent(ref) {
    const container = ref.current
    if (!container || !container.childNodes) return ''
    const lines = []
    for (const node of [...container.childNodes]) {
        // Keep only currently visible nodes (spans and text nodes)
        if (node.offsetParent !== null) lines.push(node.textContent)
    }
    return lines.join('\n')
 }
 const Quickstart = ({
    data = [],
    title,
    description,
    copy = true,
    download,
    rawContent = null,
    id = 'quickstart',
    setters = {},
    hidePrompts,
    small,
    codeLang,
    children,
 }) => {
    const contentRef = useRef()
@ -46,6 +40,16 @@ const Quickstart = ({
    const [copySuccess, setCopySuccess] = useState(false)
    const [otherState, setOtherState] = useState({})
    const setOther = (id, value) => setOtherState({ ...otherState, [id]: value })
    // Resolve the plain text for the result area (used by onClickCopy):
    // an explicitly provided rawContent prop takes precedence; otherwise the
    // text is collected from the child nodes of the given ref.
    const getRawContent = ref => {
        // A non-null rawContent prop is returned as-is, without reading the DOM
        if (rawContent !== null) return rawContent
        if (ref.current && ref.current.childNodes) {
            // Select all currently visible nodes (spans and text nodes)
            const result = [...ref.current.childNodes].filter(el => el.offsetParent !== null)
            // One line per visible node
            return result.map(el => el.textContent).join('\n')
        }
        // Ref not attached (or no child nodes): nothing to return
        return ''
    }
    const onClickCopy = () => {
        copyAreaRef.current.value = getRawContent(contentRef)
        copyToClipboard(copyAreaRef, setCopySuccess)
@ -210,7 +214,14 @@ const Quickstart = ({
                    }
                )}
                <pre className={classes.code}>
-                    <code className={classes.results} data-quickstart-results="" ref={contentRef}>
+                    <code
                        className={classNames(classes.results, {
                            [classes.small]: !!small,
                            [`language-${codeLang}`]: !!codeLang,
                        })}
                        data-quickstart-results=""
                        ref={contentRef}
                    >
                        {children}
                    </code>
--- a/website/src/components/search.js
+++ b/website/src/components/search.js
@ -41,6 +41,6 @@ Search.propTypes = {
        apiKey: PropTypes.string.isRequired,
        indexName: PropTypes.string.isRequired,
    }).isRequired,
-    id: PropTypes.string.isRequired,
+    id: PropTypes.string,
-    placeholder: PropTypes.string.isRequired,
+    placeholder: PropTypes.string,
 }
--- a/website/src/styles/quickstart.module.sass
+++ b/website/src/styles/quickstart.module.sass
@ -124,6 +124,16 @@
    & > span
        display: block
 .small
    font-size: var(--font-size-sm)
    line-height: 1.65
    white-space: pre
    max-height: 400px
    overflow-y: auto
    & > span
        display: inline
 .hide-prompts .prompt:before
    content: initial !important
--- a/website/src/widgets/quickstart-training-generator.js
+++ b/website/src/widgets/quickstart-training-generator.js
@ -0,0 +1,10 @@
 // AUTO-GENERATED: setup/setup.sh compiles setup/quickstart_training_cpu.jinja into this
 // file via jinja_to_js.py, so any manual change must also be made in the template.
 //
 // templateQuickstartTrainingCpu(ctx) renders the partial training config as a string.
 // Context keys read below: lang, pipeline, optimize, has_letters, components (an array;
 // checked with .includes for "tagger", "parser" and "ner").
 //
 // Fix: the listener section emitted in the "ner" branch is now
 // [components.ner.model.tok2vec]; it was [components.parser.model.tok2vec], which left
 // the NER component without a listener and duplicated the parser section.
 import jinjaToJS from "jinja-to-js";export default function templateQuickstartTrainingCpu(ctx) {
    var __result = "";
    var __tmp;
    var __runtime = jinjaToJS.runtime;
    var __filters = jinjaToJS.filters;
    var __globals = jinjaToJS.globals;
    var context = jinjaToJS.createContext(ctx);
    __result += "\n# This is an auto-generated partial config for training a model.\n# To use it for training, auto-fill it with all default values.\n# python -m spacy init config config.cfg --base base_config.cfg\n[paths]\ntrain = \"\"\ndev = \"\"\n\n[nlp]\nlang = \"";__result += "" + __runtime.escape((__tmp = (context.lang)) == null ? "" : __tmp);__result += "\"\npipeline = ";__result += "" + ((__tmp = (context.pipeline)) == null ? "" : __tmp);__result += "\nvectors = ";__result += "" + ((__tmp = ((context.optimize==="accuracy" ? "\"en_vectors_web_lg\"" : false))) == null ? "" : __tmp);__result += "\ntokenizer = {\"@tokenizers\": \"spacy.Tokenizer.v1\"}\n\n[components]\n\n[components.tok2vec]\nfactory = \"tok2vec\"\n\n[components.tok2vec.model]\n@architectures = \"spacy.Tok2Vec.v1\"\n\n[components.tok2vec.model.embed]\n@architectures = \"spacy.MultiHashEmbed.v1\"\nwidth = ${components.tok2vec.model.encode:width}\nrows = ";__result += "" + __runtime.escape((__tmp = ((context.optimize==="efficiency" ? 2000 : 7000))) == null ? "" : __tmp);__result += "\nalso_embed_subwords = ";__result += "" + __runtime.escape((__tmp = ((context.has_letters ? true : false))) == null ? "" : __tmp);__result += "\nalso_use_static_vectors = ";__result += "" + __runtime.escape((__tmp = ((context.optimize==="accuracy" ? true : false))) == null ? "" : __tmp);__result += "\n\n[components.tok2vec.model.encode]\n@architectures = \"spacy.MaxoutWindowEncoder.v1\"\nwidth = ";__result += "" + __runtime.escape((__tmp = ((context.optimize==="efficiency" ? 96 : 256))) == null ? "" : __tmp);__result += "\ndepth = ";__result += "" + __runtime.escape((__tmp = ((context.optimize==="efficiency" ? 4 : 8))) == null ? 
"" : __tmp);__result += "\nwindow_size = 1\nmaxout_pieces = 3\n\n";if(context.components.includes("tagger")){__result += "\n[components.tagger]\nfactory = \"tagger\"\n\n[components.tagger.model]\n@architectures = \"spacy.Tagger.v1\"\nnO = null\n\n[components.tagger.model.tok2vec]\n@architectures = \"spacy.Tok2VecListener.v1\"\nwidth = ${components.tok2vec.model.encode:width}";}__result += "\n\n";if(context.components.includes("parser")){__result += "[components.parser]\nfactory = \"parser\"\n\n[components.parser.model]\n@architectures = \"spacy.TransitionBasedParser.v1\"\nnr_feature_tokens = 8\nhidden_width = 128\nmaxout_pieces = 3\nuse_upper = true\nnO = null\n\n[components.parser.model.tok2vec]\n@architectures = \"spacy.Tok2VecListener.v1\"\nwidth = ${components.tok2vec.model.encode:width}";}__result += "\n\n";if(context.components.includes("ner")){__result += "[components.ner]\nfactory = \"ner\"\n\n[components.ner.model]\n@architectures = \"spacy.TransitionBasedParser.v1\"\nnr_feature_tokens = 6\nhidden_width = 64\nmaxout_pieces = 2\nuse_upper = true\nnO = null\n\n[components.ner.model.tok2vec]\n@architectures = \"spacy.Tok2VecListener.v1\"\nwidth = ${components.tok2vec.model.encode:width}\n";}__result += "[training]\n\n[training.train_corpus]\n@readers = \"spacy.Corpus.v1\"\npath = ${paths:train}\n\n[training.dev_corpus]\n@readers = \"spacy.Corpus.v1\"\npath = ${paths:dev}\n\n[training.score_weights]";if(context.components.includes("tagger")){__result += "\ntag_acc = ";__result += "" + __runtime.escape((__tmp = (Math.round((1.0 / __filters.size(context.components)+ Number.EPSILON) * 100) / 100)) == null ? "" : __tmp);}if(context.components.includes("parser")){__result += "\ndep_uas = 0.0\ndep_las = ";__result += "" + __runtime.escape((__tmp = (Math.round((1.0 / __filters.size(context.components)+ Number.EPSILON) * 100) / 100)) == null ? 
"" : __tmp);__result += "\nsents_f = 0.0";}if(context.components.includes("ner")){__result += "\nents_f = ";__result += "" + __runtime.escape((__tmp = (Math.round((1.0 / __filters.size(context.components)+ Number.EPSILON) * 100) / 100)) == null ? "" : __tmp);__result += "\nents_p = 0.0\nents_r = 0.0";}
    return __result;
 }
--- a/website/src/widgets/quickstart-training.js
+++ b/website/src/widgets/quickstart-training.js
@ -1,13 +1,16 @@
 import React, { useState } from 'react'
 import { StaticQuery, graphql } from 'gatsby'
 import highlightCode from 'gatsby-remark-prismjs/highlight-code.js'
 import { Quickstart, QS } from '../components/quickstart'
 import generator from './quickstart-training-generator'
 import { isString, htmlToReact } from '../components/util'
 const DEFAULT_LANG = 'en'
 const DEFAULT_HARDWARE = 'gpu'
 const DEFAULT_OPT = 'efficiency'
 const COMPONENTS = ['tagger', 'parser', 'ner', 'textcat']
-const COMMENT = `# This is an auto-generated partial config for training a model.
+
 # To use it for training, auto-fill it with all default values.
 # python -m spacy init config config.cfg --base base_config.cfg`
 const DATA = [
    {
        id: 'lang',
@ -25,9 +28,8 @@ const DATA = [
        id: 'hardware',
        title: 'Hardware',
        options: [
-            { id: 'cpu-only', title: 'CPU only' },
+            { id: 'cpu', title: 'CPU preferred', checked: DEFAULT_HARDWARE === 'cpu' },
-            { id: 'cpu', title: 'CPU preferred' },
+            { id: 'gpu', title: 'GPU', checked: DEFAULT_HARDWARE === 'gpu' },
            { id: 'gpu', title: 'GPU', checked: true },
        ],
    },
    {
@ -35,28 +37,42 @@ const DATA = [
        title: 'Optimize for',
        help: '...',
        options: [
-            { id: 'efficiency', title: 'efficiency', checked: true },
+            { id: 'efficiency', title: 'efficiency', checked: DEFAULT_OPT === 'efficiency' },
-            { id: 'accuracy', title: 'accuracy' },
+            { id: 'accuracy', title: 'accuracy', checked: DEFAULT_OPT === 'accuracy' },
        ],
    },
    {
        id: 'config',
        title: 'Configuration',
        options: [
            {
                id: 'independent',
                title: 'independent components',
                help: "Make components independent and don't share weights",
            },
        ],
        multiple: true,
    },
 ]
 /**
  * Serialize a value for insertion into the generated config.
  *
  * Strings starting with "${" are passed through untouched; everything else is
  * JSON-encoded, and arrays additionally get a space after every comma.
  *
  * @param {*} value - The value to serialize.
  * @returns {string} The serialized representation.
  */
 function stringify(value) {
    const isVariable = isString(value) && value.startsWith('${')
    if (isVariable) return value
    const serialized = JSON.stringify(value)
    return Array.isArray(value) ? serialized.replace(/,/g, ', ') : serialized
 }
 export default function QuickstartTraining({ id, title, download = 'config.cfg' }) {
    const [lang, setLang] = useState(DEFAULT_LANG)
-    const [pipeline, setPipeline] = useState([])
+    const [components, setComponents] = useState([])
-    const setters = { lang: setLang, components: setPipeline }
+    const [[hardware], setHardware] = useState([DEFAULT_HARDWARE])
    const [[optimize], setOptimize] = useState([DEFAULT_OPT])
    const setters = {
        lang: setLang,
        components: setComponents,
        hardware: setHardware,
        optimize: setOptimize,
    }
    const content = generator({
        lang,
        pipeline: stringify(components),
        components,
        optimize,
        hardware,
    })
    const rawContent = content.trim().replace(/\n\n\n+/g, '\n\n')
    const displayContent = highlightCode('ini', rawContent)
        .split('\n')
        .map(line => (line.startsWith('#') ? `<span class="token comment">${line}</span>` : line))
        .join('\n')
    return (
        <StaticQuery
            query={query}
@ -66,47 +82,19 @@ export default function QuickstartTraining({ id, title, download = 'config.cfg'
                    id: code,
                    title: name,
                }))
                const recommendedTrf = Object.assign(
                    {},
                    ...langs.map(({ code }) => ({ [code]: { sm: 'TODO', lg: 'TODO' } }))
                )
                return (
                    <Quickstart
                        download={download}
                        rawContent={content}
                        data={DATA}
                        title={title}
                        id={id}
                        setters={setters}
                        hidePrompts
                        small
                        codeLang="ini"
                    >
-                        <QS comment>{COMMENT}</QS>
+                        {htmlToReact(displayContent)}
                        <span>[paths]</span>
                        <span>train = ""</span>
                        <span>dev = ""</span>
                        <br />
                        <span>[nlp]</span>
                        <span>lang = "{lang}"</span>
                        <span>pipeline = {JSON.stringify(pipeline).replace(/,/g, ', ')}</span>
                        <br />
                        <span>[components]</span>
                        <br />
                        <span>[components.transformer]</span>
                        <QS optimize="efficiency">name = "{recommendedTrf[lang].sm}"</QS>
                        <QS optimize="accuracy">name = "{recommendedTrf[lang].lg}"</QS>
                        {!!pipeline.length && <br />}
                        {pipeline.map((pipe, i) => (
                            <>
                                {i !== 0 && <br />}
                                <span>[components.{pipe}]</span>
                                <span>factory = "{pipe}"</span>
                                <QS config="independent">
                                    <br />
                                    [components.parser.model.tok2vec]
                                    <br />
                                    @architectures = "spacy.Tok2Vec.v1"
                                </QS>
                            </>
                        ))}
                    </Quickstart>
                )
            }}
		`@ -0,0 +1 @@`
							`python jinja_to_js.py quickstart_training_cpu.jinja ../src/widgets/quickstart-training-generator.js`