Update config generation defaults and transformers (#6832)

2025-10-26 21:51:24 +03:00 · 2021-01-27 23:56:33 +11:00 · 2021-01-27 23:56:33 +11:00 · ec5f55aa5b
commit ec5f55aa5b
parent 6b68ad027b
4 changed files with 19 additions and 12 deletions
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@@ -140,7 +140,8 @@ def init_config(
        template = Template(f.read())
    # Filter out duplicates since tok2vec and transformer are added by template
    pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
-    reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, {})).dict()
+    defaults = RECOMMENDATIONS["__default__"]
+    reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, defaults)).dict()
    variables = {
        "lang": lang,
        "components": pipeline,
@@ -167,7 +168,9 @@ def init_config(
        "Pipeline": ", ".join(pipeline),
        "Optimize for": optimize,
        "Hardware": variables["hardware"].upper(),
-        "Transformer": template_vars.transformer.get("name", False),
+        "Transformer": template_vars.transformer.get("name")
+        if template_vars.use_transformer
+        else None,
    }
    msg.info("Generated config template specific for your use case")
    for label, value in use_case.items():
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -1,7 +1,7 @@
 {# This is a template for training configs used for the quickstart widget in
 the docs and the init config command. It encodes various best practices and
 can help generate the best possible configuration, given a user's requirements. #}
-{%- set use_transformer = (transformer_data and hardware != "cpu") -%}
+{%- set use_transformer = hardware != "cpu" -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
 [paths]
 train = null
@@ -196,11 +196,6 @@ no_output_layer = false
 {# NON-TRANSFORMER PIPELINE #}
 {% else -%}

-{%- if hardware == "gpu" -%}
-# There are no recommended transformer weights available for language '{{ lang }}'
-# yet, so the pipeline described here is not transformer-based.
-{%- endif %}
-
 [components.tok2vec]
 factory = "tok2vec"

--- a/spacy/cli/templates/quickstart_training_recommendations.yml
+++ b/spacy/cli/templates/quickstart_training_recommendations.yml
@@ -1,6 +1,15 @@
 # Recommended settings and available resources for each language, if available.
 # Not all languages have recommended word vectors or transformers and for some,
 # the recommended transformer for efficiency and accuracy may be the same.
+__default__:
+  word_vectors: null
+  transformer:
+    efficiency:
+      name: bert-base-multilingual-uncased
+      size_factor: 3
+    accuracy:
+      name: bert-base-multilingual-uncased
+      size_factor: 3
 ar:
  word_vectors: null
  transformer:
--- a/website/src/widgets/quickstart-training.js
+++ b/website/src/widgets/quickstart-training.js
@@ -7,7 +7,7 @@ import generator, { DATA as GENERATOR_DATA } from './quickstart-training-generat
 import { htmlToReact } from '../components/util'

 const DEFAULT_LANG = 'en'
-const DEFAULT_HARDWARE = 'gpu'
+const DEFAULT_HARDWARE = 'cpu'
 const DEFAULT_OPT = 'efficiency'
 const COMPONENTS = ['tagger', 'parser', 'ner', 'textcat']
 const COMMENT = `# This is an auto-generated partial config. To use it with 'spacy train'
@@ -31,8 +31,8 @@ const DATA = [
        id: 'hardware',
        title: 'Hardware',
        options: [
-            { id: 'cpu', title: 'CPU preferred', checked: DEFAULT_HARDWARE === 'cpu' },
-            { id: 'gpu', title: 'GPU', checked: DEFAULT_HARDWARE === 'gpu' },
+            { id: 'cpu', title: 'CPU', checked: DEFAULT_HARDWARE === 'cpu' },
+            { id: 'gpu', title: 'GPU (transformer)', checked: DEFAULT_HARDWARE === 'gpu' },
        ],
    },
    {
@@ -58,7 +58,7 @@ export default function QuickstartTraining({ id, title, download = 'base_config.
        hardware: setHardware,
        optimize: setOptimize,
    }
-    const reco = GENERATOR_DATA[lang] || {}
+    const reco = GENERATOR_DATA[lang] || GENERATOR_DATA.__default__
    const content = generator({
        lang,
        components,