spaCy/spacy/cli/templates/quickstart_training.jinja

367 lines
8.5 KiB
Plaintext
Raw Normal View History

2020-08-13 18:38:30 +03:00
{# This is a template for training configs used for the quickstart widget in
the docs and the init config command. It encodes various best practices and
can help generate the best possible configuration, given a user's requirements. #}
2020-08-15 15:50:29 +03:00
{%- set use_transformer = (transformer_data and hardware != "cpu") -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
2020-08-13 18:38:30 +03:00
[paths]
train = null
dev = null
2020-08-13 18:38:30 +03:00
2020-08-15 15:50:29 +03:00
[system]
2020-09-20 13:30:53 +03:00
{% if use_transformer -%}
gpu_allocator = "pytorch"
{% else -%}
gpu_allocator = null
{% endif %}
2020-08-15 15:50:29 +03:00
2020-08-13 18:38:30 +03:00
[nlp]
lang = "{{ lang }}"
2020-08-15 15:50:29 +03:00
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %}
pipeline = {{ full_pipeline|pprint()|replace("'", '"')|safe }}
2020-08-13 18:38:30 +03:00
tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
[components]
{# TRANSFORMER PIPELINE #}
2020-08-15 15:50:29 +03:00
{%- if use_transformer -%}
2020-08-13 18:38:30 +03:00
[components.transformer]
factory = "transformer"
[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v1"
2020-08-15 15:50:29 +03:00
name = "{{ transformer["name"] }}"
2020-08-13 18:38:30 +03:00
tokenizer_config = {"use_fast": true}
[components.transformer.model.get_spans]
2020-09-03 18:37:06 +03:00
@span_getters = "spacy-transformers.strided_spans.v1"
2020-08-13 18:38:30 +03:00
window = 128
stride = 96
{% if "morphologizer" in components %}
[components.morphologizer]
factory = "morphologizer"
[components.morphologizer.model]
@architectures = "spacy.Tagger.v1"
nO = null
[components.morphologizer.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.morphologizer.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{%- endif %}
2020-08-13 18:38:30 +03:00
{% if "tagger" in components %}
[components.tagger]
factory = "tagger"
[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null
[components.tagger.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
2020-08-13 18:38:30 +03:00
grad_factor = 1.0
[components.tagger.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{%- endif %}
{% if "parser" in components -%}
[components.parser]
factory = "parser"
[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
state_type = "parser"
extra_state_tokens = false
2020-08-13 18:38:30 +03:00
hidden_width = 128
maxout_pieces = 3
use_upper = false
nO = null
[components.parser.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
2020-08-13 18:38:30 +03:00
grad_factor = 1.0
[components.parser.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{%- endif %}
{% if "ner" in components -%}
[components.ner]
factory = "ner"
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
state_type = "ner"
extra_state_tokens = false
2020-08-13 18:38:30 +03:00
hidden_width = 64
maxout_pieces = 2
use_upper = false
nO = null
[components.ner.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
2020-08-13 18:38:30 +03:00
grad_factor = 1.0
[components.ner.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{% endif -%}
2020-09-22 11:40:05 +03:00
{% if "entity_linker" in components -%}
[components.entity_linker]
factory = "entity_linker"
get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
incl_context = true
incl_prior = true
[components.entity_linker.model]
@architectures = "spacy.EntityLinker.v1"
nO = null
[components.entity_linker.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.entity_linker.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
2020-09-22 11:40:05 +03:00
{% endif -%}
2020-09-22 11:22:06 +03:00
{% if "textcat" in components %}
[components.textcat]
factory = "textcat"
{% if optimize == "accuracy" %}
[components.textcat.model]
@architectures = "spacy.TextCatEnsemble.v2"
nO = null
[components.textcat.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.textcat.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
[components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
2020-09-22 11:22:06 +03:00
exclusive_classes = false
ngram_size = 1
no_output_layer = false
2020-09-22 11:22:06 +03:00
{% else -%}
[components.textcat.model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = false
ngram_size = 1
no_output_layer = false
2020-09-22 11:22:06 +03:00
{%- endif %}
{%- endif %}
2020-08-13 18:38:30 +03:00
{# NON-TRANSFORMER PIPELINE #}
{% else -%}
{%- if hardware == "gpu" -%}
# There are no recommended transformer weights available for language '{{ lang }}'
# yet, so the pipeline described here is not transformer-based.
{%- endif %}
[components.tok2vec]
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode.width}
2020-10-05 22:19:41 +03:00
{% if has_letters -%}
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
rows = [5000, 2500, 2500, 2500]
{% else -%}
attrs = ["ORTH", "SHAPE"]
rows = [5000, 2500]
{% endif -%}
2020-10-05 22:21:30 +03:00
include_static_vectors = {{ "true" if optimize == "accuracy" else "false" }}
2020-08-13 18:38:30 +03:00
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = {{ 96 if optimize == "efficiency" else 256 }}
depth = {{ 4 if optimize == "efficiency" else 8 }}
window_size = 1
maxout_pieces = 3
{% if "morphologizer" in components %}
[components.morphologizer]
factory = "morphologizer"
[components.morphologizer.model]
@architectures = "spacy.Tagger.v1"
nO = null
[components.morphologizer.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
{%- endif %}
2020-08-13 18:38:30 +03:00
{% if "tagger" in components %}
[components.tagger]
factory = "tagger"
[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
2020-08-13 18:38:30 +03:00
{%- endif %}
{% if "parser" in components -%}
[components.parser]
factory = "parser"
[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
state_type = "parser"
extra_state_tokens = false
2020-08-13 18:38:30 +03:00
hidden_width = 128
maxout_pieces = 3
use_upper = true
nO = null
[components.parser.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
2020-08-13 18:38:30 +03:00
{%- endif %}
{% if "ner" in components %}
[components.ner]
factory = "ner"
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
state_type = "ner"
extra_state_tokens = false
2020-08-13 18:38:30 +03:00
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null
[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
2020-08-13 18:38:30 +03:00
{% endif %}
2020-09-22 11:22:06 +03:00
2020-09-22 11:40:05 +03:00
{% if "entity_linker" in components -%}
[components.entity_linker]
factory = "entity_linker"
get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
incl_context = true
incl_prior = true
[components.entity_linker.model]
@architectures = "spacy.EntityLinker.v1"
nO = null
[components.entity_linker.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
{% endif %}
2020-09-22 11:22:06 +03:00
{% if "textcat" in components %}
[components.textcat]
factory = "textcat"
{% if optimize == "accuracy" %}
[components.textcat.model]
@architectures = "spacy.TextCatEnsemble.v2"
nO = null
[components.textcat.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
[components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
2020-09-22 11:22:06 +03:00
exclusive_classes = false
ngram_size = 1
no_output_layer = false
2020-09-22 11:22:06 +03:00
{% else -%}
[components.textcat.model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = false
ngram_size = 1
no_output_layer = false
2020-09-22 11:22:06 +03:00
{%- endif %}
{%- endif %}
2020-08-13 18:38:30 +03:00
{% endif %}
{% for pipe in components %}
{% if pipe not in ["tagger", "morphologizer", "parser", "ner", "textcat", "entity_linker"] %}
2020-08-13 18:38:30 +03:00
{# Other components defined by the user: we just assume they're factories #}
[components.{{ pipe }}]
factory = "{{ pipe }}"
{% endif %}
{% endfor %}
[corpora]
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = {{ 500 if hardware == "gpu" else 2000 }}
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
2020-08-13 18:38:30 +03:00
[training]
2020-08-15 15:50:29 +03:00
{% if use_transformer -%}
accumulate_gradient = {{ transformer["size_factor"] }}
2020-09-23 14:21:42 +03:00
{% endif -%}
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
2020-08-13 18:38:30 +03:00
[training.optimizer]
@optimizers = "Adam.v1"
2020-09-04 22:22:50 +03:00
{% if use_transformer -%}
2020-08-13 18:38:30 +03:00
[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 5e-5
2020-09-04 22:22:50 +03:00
{% endif %}
2020-08-13 18:38:30 +03:00
2020-08-15 15:50:29 +03:00
{% if use_transformer %}
2020-08-13 18:38:30 +03:00
[training.batcher]
2020-09-03 18:30:41 +03:00
@batchers = "spacy.batch_by_padded.v1"
2020-08-13 18:38:30 +03:00
discard_oversize = true
size = 2000
buffer = 256
{%- else %}
[training.batcher]
2020-09-03 18:30:41 +03:00
@batchers = "spacy.batch_by_words.v1"
2020-08-13 18:38:30 +03:00
discard_oversize = false
tolerance = 0.2
[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
{% endif %}
2020-09-28 13:05:23 +03:00
[initialize]
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
vectors = null
{% else -%}
vectors = "{{ word_vectors }}"
{% endif -%}