From 135de82a2d7073d535d1ffd1e4254e5dca37c046 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 22 Sep 2020 10:22:06 +0200 Subject: [PATCH 1/5] add textcat to quickstart --- spacy/cli/templates/quickstart_training.jinja | 48 ++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 0db4c8a59..2c7ce024b 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -93,6 +93,29 @@ grad_factor = 1.0 @layers = "reduce_mean.v1" {% endif -%} +{% if "textcat" in components %} +[components.textcat] +factory = "textcat" + +{% if optimize == "accuracy" %} +[components.textcat.model] +@architectures = "spacy.TextCatEnsemble.v1" +exclusive_classes = false +width = 64 +conv_depth = 2 +embed_size = 2000 +window_size = 1 +ngram_size = 1 +nO = null + +{% else -%} +[components.textcat.model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = false +ngram_size = 1 +{%- endif %} +{%- endif %} + {# NON-TRANSFORMER PIPELINE #} {% else -%} @@ -167,10 +190,33 @@ nO = null @architectures = "spacy.Tok2VecListener.v1" width = ${components.tok2vec.model.encode.width} {% endif %} + +{% if "textcat" in components %} +[components.textcat] +factory = "textcat" + +{% if optimize == "accuracy" %} +[components.textcat.model] +@architectures = "spacy.TextCatEnsemble.v1" +exclusive_classes = false +width = 64 +conv_depth = 2 +embed_size = 2000 +window_size = 1 +ngram_size = 1 +nO = null + +{% else -%} +[components.textcat.model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = false +ngram_size = 1 +{%- endif %} +{%- endif %} {% endif %} {% for pipe in components %} -{% if pipe not in ["tagger", "parser", "ner"] %} +{% if pipe not in ["tagger", "parser", "ner", "textcat"] %} {# Other components defined by the user: we just assume they're factories #} [components.{{ pipe }}] factory = "{{ pipe }}" From 396b33257f7dff646040067c2ed7872d8c194f8b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 22 Sep 2020 10:40:05 +0200 Subject: [PATCH 2/5] add entity_linker to jinja template --- spacy/cli/init_config.py | 2 +- spacy/cli/templates/quickstart_training.jinja | 34 ++++++++++++++++++- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index e70195e15..5203c5dbb 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -36,7 +36,7 @@ def init_config_cli( """ Generate a starter config.cfg for training. Based on your requirements specified via the CLI arguments, this command generates a config with the - optimal settings for you use case. This includes the choice of architecture, + optimal settings for your use case. This includes the choice of architecture, pretrained weights and related hyperparameters. DOCS: https://nightly.spacy.io/api/cli#init-config diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 2c7ce024b..0674f0964 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -93,6 +93,22 @@ grad_factor = 1.0 @layers = "reduce_mean.v1" {% endif -%} +{% if "entity_linker" in components -%} +[components.entity_linker] +factory = "entity_linker" +get_candidates = {"@misc":"spacy.CandidateGenerator.v1"} +incl_context = true +incl_prior = true + +[components.entity_linker.model] +@architectures = "spacy.EntityLinker.v1" +nO = null + +[components.entity_linker.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 +{% endif -%} + {% if "textcat" in components %} [components.textcat] factory = "textcat" @@ -191,6 +207,22 @@ nO = null width = ${components.tok2vec.model.encode.width} {% endif %} +{% if "entity_linker" in components -%} +[components.entity_linker] +factory = "entity_linker" +get_candidates = {"@misc":"spacy.CandidateGenerator.v1"} +incl_context = true +incl_prior = true + +[components.entity_linker.model] +@architectures = "spacy.EntityLinker.v1" +nO = null + +[components.entity_linker.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} +{% endif %} + {% if "textcat" in components %} [components.textcat] factory = "textcat" @@ -216,7 +248,7 @@ ngram_size = 1 {% endif %} {% for pipe in components %} -{% if pipe not in ["tagger", "parser", "ner", "textcat"] %} +{% if pipe not in ["tagger", "parser", "ner", "textcat", "entity_linker"] %} {# Other components defined by the user: we just assume they're factories #} [components.{{ pipe }}] factory = "{{ pipe }}" From e931f4d75771dc63b2573e2cbd7c834de96def7d Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 22 Sep 2020 10:56:43 +0200 Subject: [PATCH 3/5] add textcat score --- spacy/cli/templates/quickstart_training.jinja | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 0674f0964..0e83b9bdb 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -323,3 +323,6 @@ ents_f = {{ (1.0 / components|length)|round(2) }} ents_p = 0.0 ents_r = 0.0 {%- endif -%} +{%- if "textcat" in components %} +cats_score = {{ (1.0 / components|length)|round(2) }} +{%- endif -%} From 085a1c8e2b4b3a136025ef693bb6e7537d88729f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 22 Sep 2020 12:06:40 +0200 Subject: [PATCH 4/5] add no_output_layer to TextCatBOW config --- spacy/cli/templates/quickstart_training.jinja | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 0e83b9bdb..a0d9f78ac 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -129,6 +129,7 @@ nO = null @architectures = "spacy.TextCatBOW.v1" exclusive_classes = false ngram_size = 1 +no_output_layer = false {%- endif %} {%- endif %} @@ -243,6 +244,7 @@ nO = null @architectures = "spacy.TextCatBOW.v1" exclusive_classes = false ngram_size = 1 +no_output_layer = false {%- endif %} {%- endif %} {% endif %} From 556f3e4652a33eb1465e1f886310653d8e3d2fd2 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 23 Sep 2020 09:24:28 +0200 Subject: [PATCH 5/5] add pooling to NEL's TransformerListener --- spacy/cli/templates/quickstart_training.jinja | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index a0d9f78ac..c55374899 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -107,6 +107,9 @@ nO = null [components.entity_linker.model.tok2vec] @architectures = "spacy-transformers.TransformerListener.v1" grad_factor = 1.0 + +[components.entity_linker.model.tok2vec.pooling] +@layers = "reduce_mean.v1" {% endif -%} {% if "textcat" in components %}