spaCy/spacy/cli/templates/quickstart_training.jinja
Daniël de Kok da7ad97519
Update TextCatBOW to use the fixed SparseLinear layer (#13149)
* Update `TextCatBOW` to use the fixed `SparseLinear` layer

A while ago, we fixed the `SparseLinear` layer to use all available
parameters: https://github.com/explosion/thinc/pull/754

This change updates `TextCatBOW` to `v3` which uses the new
`SparseLinear_v2` layer. This results in a sizeable improvement on a
text categorization task that was tested.

While at it, this `spacy.TextCatBOW.v3` also adds the `length_exponent`
option to make it possible to change the hidden size. Ideally, we'd just
have an option called `length`. But the way that `TextCatBOW` uses
hashes results in a non-uniform distribution of parameters when the
length is not a power of two.

* Replace TexCatBOW `length_exponent` parameter by `length`

We now round up the length to the next power of two if it isn't
a power of two.

* Remove some tests for TextCatBOW.v2

* Fix missing import
2023-11-29 09:11:54 +01:00

652 lines
16 KiB
Django/Jinja

{# This is a template for training configs used for the quickstart widget in
the docs and the init config command. It encodes various best practices and
can help generate the best possible configuration, given a user's requirements. #}
{%- set use_transformer = hardware != "cpu" and transformer_data -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "span_finder", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
[paths]
train = null
dev = null
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
vectors = null
{% else -%}
vectors = "{{ word_vectors }}"
{% endif -%}
[system]
{% if use_transformer -%}
gpu_allocator = "pytorch"
{% else -%}
gpu_allocator = null
{% endif %}
[nlp]
lang = "{{ lang }}"
{%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
{%- set with_accuracy = optimize == "accuracy" -%}
{# The BOW textcat doesn't need a source of features, so it can omit the
tok2vec/transformer. #}
{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "span_finder" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
{%- else -%}
{%- set full_pipeline = components -%}
{%- endif %}
pipeline = {{ full_pipeline|pprint()|replace("'", '"')|safe }}
batch_size = {{ 128 if hardware == "gpu" else 1000 }}
[components]
{# TRANSFORMER PIPELINE #}
{%- if use_transformer -%}
[components.transformer]
factory = "transformer"
[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v3"
name = "{{ transformer["name"] }}"
tokenizer_config = {"use_fast": true}
[components.transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 128
stride = 96
{% if "morphologizer" in components %}
[components.morphologizer]
factory = "morphologizer"
[components.morphologizer.model]
@architectures = "spacy.Tagger.v2"
nO = null
[components.morphologizer.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.morphologizer.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{%- endif %}
{% if "tagger" in components %}
[components.tagger]
factory = "tagger"
[components.tagger.model]
@architectures = "spacy.Tagger.v2"
nO = null
[components.tagger.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.tagger.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{%- endif %}
{% if "parser" in components -%}
[components.parser]
factory = "parser"
[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "parser"
extra_state_tokens = false
hidden_width = 128
maxout_pieces = 3
use_upper = false
nO = null
[components.parser.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.parser.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{%- endif %}
{% if "ner" in components -%}
[components.ner]
factory = "ner"
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = false
nO = null
[components.ner.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.ner.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{% endif -%}
{% if "span_finder" in components -%}
[components.span_finder]
factory = "span_finder"
max_length = 25
min_length = null
scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
spans_key = "sc"
threshold = 0.5
[components.span_finder.model]
@architectures = "spacy.SpanFinder.v1"
[components.span_finder.model.scorer]
@layers = "spacy.LinearLogistic.v1"
nO = 2
[components.span_finder.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.span_finder.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{% endif -%}
{% if "spancat" in components -%}
[components.spancat]
factory = "spancat"
max_positive = null
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
spans_key = "sc"
threshold = 0.5
[components.spancat.model]
@architectures = "spacy.SpanCategorizer.v1"
[components.spancat.model.reducer]
@layers = "spacy.mean_max_reducer.v1"
hidden_size = 128
[components.spancat.model.scorer]
@layers = "spacy.LinearLogistic.v1"
nO = null
nI = null
[components.spancat.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.spancat.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
[components.spancat.suggester]
@misc = "spacy.ngram_suggester.v1"
sizes = [1,2,3]
{% endif -%}
{% if "spancat_singlelabel" in components %}
[components.spancat_singlelabel]
factory = "spancat_singlelabel"
negative_weight = 1.0
allow_overlap = true
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
spans_key = "sc"
[components.spancat_singlelabel.model]
@architectures = "spacy.SpanCategorizer.v1"
[components.spancat_singlelabel.model.reducer]
@layers = "spacy.mean_max_reducer.v1"
hidden_size = 128
[components.spancat_singlelabel.model.scorer]
@layers = "Softmax.v2"
[components.spancat_singlelabel.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.spancat_singlelabel.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
[components.spancat_singlelabel.suggester]
@misc = "spacy.ngram_suggester.v1"
sizes = [1,2,3]
{% endif %}
{% if "trainable_lemmatizer" in components -%}
[components.trainable_lemmatizer]
factory = "trainable_lemmatizer"
backoff = "orth"
min_tree_freq = 3
overwrite = false
scorer = {"@scorers":"spacy.lemmatizer_scorer.v1"}
top_k = 1
[components.trainable_lemmatizer.model]
@architectures = "spacy.Tagger.v2"
nO = null
normalize = false
[components.trainable_lemmatizer.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.trainable_lemmatizer.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{% endif -%}
{% if "entity_linker" in components -%}
[components.entity_linker]
factory = "entity_linker"
get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
incl_context = true
incl_prior = true
[components.entity_linker.model]
@architectures = "spacy.EntityLinker.v2"
nO = null
[components.entity_linker.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.entity_linker.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{% endif -%}
{% if "textcat" in components %}
[components.textcat]
factory = "textcat"
{% if optimize == "accuracy" %}
[components.textcat.model]
@architectures = "spacy.TextCatEnsemble.v2"
nO = null
[components.textcat.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.textcat.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
[components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true
length = 262144
ngram_size = 1
no_output_layer = false
{% else -%}
[components.textcat.model]
@architectures = "spacy.TextCatCNN.v2"
exclusive_classes = true
nO = null
[components.textcat.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.textcat.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{%- endif %}
{%- endif %}
{% if "textcat_multilabel" in components %}
[components.textcat_multilabel]
factory = "textcat_multilabel"
{% if optimize == "accuracy" %}
[components.textcat_multilabel.model]
@architectures = "spacy.TextCatEnsemble.v2"
nO = null
[components.textcat_multilabel.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.textcat_multilabel.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
[components.textcat_multilabel.model.linear_model]
@architectures = "spacy.TextCatBOW.v3"
exclusive_classes = false
length = 262144
ngram_size = 1
no_output_layer = false
{% else -%}
[components.textcat_multilabel.model]
@architectures = "spacy.TextCatCNN.v2"
exclusive_classes = false
nO = null
[components.textcat_multilabel.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.textcat_multilabel.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{%- endif %}
{%- endif %}
{# NON-TRANSFORMER PIPELINE #}
{% else -%}
{% if "tok2vec" in full_pipeline -%}
[components.tok2vec]
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
rows = [5000, 1000, 2500, 2500]
include_static_vectors = {{ "true" if optimize == "accuracy" else "false" }}
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = {{ 96 if optimize == "efficiency" else 256 }}
depth = {{ 4 if optimize == "efficiency" else 8 }}
window_size = 1
maxout_pieces = 3
{% endif -%}
{% if "morphologizer" in components %}
[components.morphologizer]
factory = "morphologizer"
label_smoothing = 0.05
[components.morphologizer.model]
@architectures = "spacy.Tagger.v2"
nO = null
[components.morphologizer.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
{%- endif %}
{% if "tagger" in components %}
[components.tagger]
factory = "tagger"
label_smoothing = 0.05
[components.tagger.model]
@architectures = "spacy.Tagger.v2"
nO = null
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
{%- endif %}
{% if "parser" in components -%}
[components.parser]
factory = "parser"
[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "parser"
extra_state_tokens = false
hidden_width = 128
maxout_pieces = 3
use_upper = true
nO = null
[components.parser.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
{%- endif %}
{% if "ner" in components %}
[components.ner]
factory = "ner"
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null
[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
{% endif %}
{% if "span_finder" in components %}
[components.span_finder]
factory = "span_finder"
max_length = 25
min_length = null
scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
spans_key = "sc"
threshold = 0.5
[components.span_finder.model]
@architectures = "spacy.SpanFinder.v1"
[components.span_finder.model.scorer]
@layers = "spacy.LinearLogistic.v1"
nO = 2
[components.span_finder.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
{% endif %}
{% if "spancat" in components %}
[components.spancat]
factory = "spancat"
max_positive = null
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
spans_key = "sc"
threshold = 0.5
[components.spancat.model]
@architectures = "spacy.SpanCategorizer.v1"
[components.spancat.model.reducer]
@layers = "spacy.mean_max_reducer.v1"
hidden_size = 128
[components.spancat.model.scorer]
@layers = "spacy.LinearLogistic.v1"
nO = null
nI = null
[components.spancat.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
[components.spancat.suggester]
@misc = "spacy.ngram_suggester.v1"
sizes = [1,2,3]
{% endif %}
{% if "spancat_singlelabel" in components %}
[components.spancat_singlelabel]
factory = "spancat_singlelabel"
negative_weight = 1.0
allow_overlap = true
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
spans_key = "sc"
[components.spancat_singlelabel.model]
@architectures = "spacy.SpanCategorizer.v1"
[components.spancat_singlelabel.model.reducer]
@layers = "spacy.mean_max_reducer.v1"
hidden_size = 128
[components.spancat_singlelabel.model.scorer]
@layers = "Softmax.v2"
[components.spancat_singlelabel.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
[components.spancat_singlelabel.suggester]
@misc = "spacy.ngram_suggester.v1"
sizes = [1,2,3]
{% endif %}
{% if "trainable_lemmatizer" in components -%}
[components.trainable_lemmatizer]
factory = "trainable_lemmatizer"
backoff = "orth"
min_tree_freq = 3
overwrite = false
scorer = {"@scorers":"spacy.lemmatizer_scorer.v1"}
top_k = 1
[components.trainable_lemmatizer.model]
@architectures = "spacy.Tagger.v2"
nO = null
normalize = false
[components.trainable_lemmatizer.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
{% endif -%}
{% if "entity_linker" in components -%}
[components.entity_linker]
factory = "entity_linker"
get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
incl_context = true
incl_prior = true
[components.entity_linker.model]
@architectures = "spacy.EntityLinker.v2"
nO = null
[components.entity_linker.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
{% endif %}
{% if "textcat" in components %}
[components.textcat]
factory = "textcat"
{% if optimize == "accuracy" %}
[components.textcat.model]
@architectures = "spacy.TextCatEnsemble.v2"
nO = null
[components.textcat.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
[components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true
length = 262144
ngram_size = 1
no_output_layer = false
{% else -%}
[components.textcat.model]
@architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true
ngram_size = 1
no_output_layer = false
{%- endif %}
{%- endif %}
{% if "textcat_multilabel" in components %}
[components.textcat_multilabel]
factory = "textcat_multilabel"
{% if optimize == "accuracy" %}
[components.textcat_multilabel.model]
@architectures = "spacy.TextCatEnsemble.v2"
nO = null
[components.textcat_multilabel.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
[components.textcat_multilabel.model.linear_model]
@architectures = "spacy.TextCatBOW.v3"
exclusive_classes = false
length = 262144
ngram_size = 1
no_output_layer = false
{% else -%}
[components.textcat_multilabel.model]
@architectures = "spacy.TextCatBOW.v3"
exclusive_classes = false
length = 262144
ngram_size = 1
no_output_layer = false
{%- endif %}
{%- endif %}
{% endif %}
{% for pipe in components %}
{% if pipe not in listener_components %}
{# Other components defined by the user: we just assume they're factories #}
[components.{{ pipe }}]
factory = "{{ pipe }}"
{% endif %}
{% endfor %}
[corpora]
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
[training]
{% if use_transformer -%}
accumulate_gradient = {{ transformer["size_factor"] }}
{% endif -%}
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
[training.optimizer]
@optimizers = "Adam.v1"
{% if use_transformer -%}
[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 5e-5
{% endif %}
{% if use_transformer %}
[training.batcher]
@batchers = "spacy.batch_by_padded.v1"
discard_oversize = true
size = 2000
buffer = 256
{%- else %}
[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
{% endif %}
[initialize]
vectors = ${paths.vectors}