Update config generation defaults and transformers (#6832)

This commit is contained in:
Ines Montani 2021-01-27 23:56:33 +11:00 committed by GitHub
parent 6b68ad027b
commit ec5f55aa5b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 19 additions and 12 deletions

View File

@ -140,7 +140,8 @@ def init_config(
template = Template(f.read())
# Filter out duplicates, since tok2vec and transformer are added by the template
pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, {})).dict()
defaults = RECOMMENDATIONS["__default__"]
reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, defaults)).dict()
variables = {
"lang": lang,
"components": pipeline,
@ -167,7 +168,9 @@ def init_config(
"Pipeline": ", ".join(pipeline),
"Optimize for": optimize,
"Hardware": variables["hardware"].upper(),
"Transformer": template_vars.transformer.get("name", False),
"Transformer": template_vars.transformer.get("name")
if template_vars.use_transformer
else None,
}
msg.info("Generated config template specific for your use case")
for label, value in use_case.items():

View File

@ -1,7 +1,7 @@
{# This is a template for training configs used for the quickstart widget in
the docs and the init config command. It encodes various best practices and
can help generate the best possible configuration, given a user's requirements. #}
{%- set use_transformer = (transformer_data and hardware != "cpu") -%}
{%- set use_transformer = hardware != "cpu" -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
[paths]
train = null
@ -196,11 +196,6 @@ no_output_layer = false
{# NON-TRANSFORMER PIPELINE #}
{% else -%}
{%- if hardware == "gpu" -%}
# There are no recommended transformer weights available for language '{{ lang }}'
# yet, so the pipeline described here is not transformer-based.
{%- endif %}
[components.tok2vec]
factory = "tok2vec"

View File

@ -1,6 +1,15 @@
# Recommended settings and available resources for each language.
# Not all languages have recommended word vectors or transformers and for some,
# the recommended transformer for efficiency and accuracy may be the same.
# Languages without their own entry fall back to the __default__ settings.
__default__:
word_vectors: null
transformer:
efficiency:
name: bert-base-multilingual-uncased
size_factor: 3
accuracy:
name: bert-base-multilingual-uncased
size_factor: 3
ar:
word_vectors: null
transformer:

View File

@ -7,7 +7,7 @@ import generator, { DATA as GENERATOR_DATA } from './quickstart-training-generat
import { htmlToReact } from '../components/util'
const DEFAULT_LANG = 'en'
const DEFAULT_HARDWARE = 'gpu'
const DEFAULT_HARDWARE = 'cpu'
const DEFAULT_OPT = 'efficiency'
const COMPONENTS = ['tagger', 'parser', 'ner', 'textcat']
const COMMENT = `# This is an auto-generated partial config. To use it with 'spacy train'
@ -31,8 +31,8 @@ const DATA = [
id: 'hardware',
title: 'Hardware',
options: [
{ id: 'cpu', title: 'CPU preferred', checked: DEFAULT_HARDWARE === 'cpu' },
{ id: 'gpu', title: 'GPU', checked: DEFAULT_HARDWARE === 'gpu' },
{ id: 'cpu', title: 'CPU', checked: DEFAULT_HARDWARE === 'cpu' },
{ id: 'gpu', title: 'GPU (transformer)', checked: DEFAULT_HARDWARE === 'gpu' },
],
},
{
@ -58,7 +58,7 @@ export default function QuickstartTraining({ id, title, download = 'base_config.
hardware: setHardware,
optimize: setOptimize,
}
const reco = GENERATOR_DATA[lang] || {}
const reco = GENERATOR_DATA[lang] || GENERATOR_DATA.__default__
const content = generator({
lang,
components,