From 03762b4b926c574a14727c8a6777356c35eb5d7c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 1 Apr 2022 09:01:04 +0200 Subject: [PATCH] Add spancat, trainable_lemmatizer to quickstart (#10524) * Add `SPACY` and `IS_SPACE` as default `tok2vec` features --- spacy/cli/templates/quickstart_training.jinja | 111 +++++++++++++++++- website/src/widgets/quickstart-training.js | 2 +- 2 files changed, 108 insertions(+), 5 deletions(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index b84fb3a8f..ae11dcafc 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -3,6 +3,7 @@ the docs and the init config command. It encodes various best practices and can help generate the best possible configuration, given a user's requirements. #} {%- set use_transformer = hardware != "cpu" -%} {%- set transformer = transformer_data[optimize] if use_transformer else {} -%} +{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%} [paths] train = null dev = null @@ -24,10 +25,10 @@ lang = "{{ lang }}" {%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%} {%- set with_accuracy = optimize == "accuracy" -%} {%- set has_accurate_textcat = has_textcat and with_accuracy -%} -{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "entity_linker" in components or has_accurate_textcat) -%} -{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %} +{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or has_accurate_textcat) -%} +{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%} {%- else -%} -{%- set full_pipeline = components %} +{%- set full_pipeline = components -%} {%- endif %} pipeline = {{ full_pipeline|pprint()|replace("'", '"')|safe }} batch_size = {{ 128 if hardware == "gpu" else 1000 }} @@ -123,6 +124,60 @@ grad_factor = 1.0 @layers = "reduce_mean.v1" {% endif -%} +{% if "spancat" in components -%} +[components.spancat] +factory = "spancat" +max_positive = null +scorer = {"@scorers":"spacy.spancat_scorer.v1"} +spans_key = "sc" +threshold = 0.5 + +[components.spancat.model] +@architectures = "spacy.SpanCategorizer.v1" + +[components.spancat.model.reducer] +@layers = "spacy.mean_max_reducer.v1" +hidden_size = 128 + +[components.spancat.model.scorer] +@layers = "spacy.LinearLogistic.v1" +nO = null +nI = null + +[components.spancat.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 + +[components.spancat.model.tok2vec.pooling] +@layers = "reduce_mean.v1" + +[components.spancat.suggester] +@misc = "spacy.ngram_suggester.v1" +sizes = [1,2,3] +{% endif -%} + +{% if "trainable_lemmatizer" in components -%} +[components.trainable_lemmatizer] +factory = "trainable_lemmatizer" +backoff = "orth" +min_tree_freq = 3 +overwrite = false +scorer = {"@scorers":"spacy.lemmatizer_scorer.v1"} +top_k = 1 + +[components.trainable_lemmatizer.model] +@architectures = "spacy.Tagger.v2" +nO = null +normalize = false + +[components.trainable_lemmatizer.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 + +[components.trainable_lemmatizer.model.tok2vec.pooling] +@layers = "reduce_mean.v1" +{% endif -%} + {% if "entity_linker" in components -%} [components.entity_linker] factory = "entity_linker" @@ -295,6 +350,54 @@ nO = null width = ${components.tok2vec.model.encode.width} {% endif %} +{% if "spancat" in components %} +[components.spancat] +factory = "spancat" +max_positive = null +scorer = {"@scorers":"spacy.spancat_scorer.v1"} +spans_key = "sc" +threshold = 0.5 + +[components.spancat.model] +@architectures = "spacy.SpanCategorizer.v1" + +[components.spancat.model.reducer] +@layers = "spacy.mean_max_reducer.v1" +hidden_size = 128 + +[components.spancat.model.scorer] +@layers = "spacy.LinearLogistic.v1" +nO = null +nI = null + +[components.spancat.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} + +[components.spancat.suggester] +@misc = "spacy.ngram_suggester.v1" +sizes = [1,2,3] +{% endif %} + +{% if "trainable_lemmatizer" in components -%} +[components.trainable_lemmatizer] +factory = "trainable_lemmatizer" +backoff = "orth" +min_tree_freq = 3 +overwrite = false +scorer = {"@scorers":"spacy.lemmatizer_scorer.v1"} +top_k = 1 + +[components.trainable_lemmatizer.model] +@architectures = "spacy.Tagger.v2" +nO = null +normalize = false + +[components.trainable_lemmatizer.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} +{% endif -%} + {% if "entity_linker" in components -%} [components.entity_linker] factory = "entity_linker" @@ -369,7 +472,7 @@ no_output_layer = false {% endif %} {% for pipe in components %} -{% if pipe not in ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker"] %} +{% if pipe not in listener_components %} {# Other components defined by the user: we just assume they're factories #} [components.{{ pipe }}] factory = "{{ pipe }}" diff --git a/website/src/widgets/quickstart-training.js b/website/src/widgets/quickstart-training.js index 2d3a0e679..fbeeaf79d 100644 --- a/website/src/widgets/quickstart-training.js +++ b/website/src/widgets/quickstart-training.js @@ -10,7 +10,7 @@ const DEFAULT_LANG = 'en' const DEFAULT_HARDWARE = 'cpu' const DEFAULT_OPT = 'efficiency' const DEFAULT_TEXTCAT_EXCLUSIVE = true -const COMPONENTS = ['tagger', 'morphologizer', 'parser', 'ner', 'textcat'] +const COMPONENTS = ['tagger', 'morphologizer', 'trainable_lemmatizer', 'parser', 'ner', 'spancat', 'textcat'] const COMMENT = `# This is an auto-generated partial config. To use it with 'spacy train' # you can run spacy init fill-config to auto-fill all default settings: # python -m spacy init fill-config ./base_config.cfg ./config.cfg`