From 75a202ce6506177d5de97b47bfd96fd3c7909503 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Sun, 18 Oct 2020 14:50:41 +0200
Subject: [PATCH] TextCat updates and fixes (#6263)

* small fix in example imports
* throw error when train_corpus or dev_corpus is not a string
* small fix in custom logger example
* limit macro_auc to labels with 2 annotations
* fix typo
* also create parents of output_dir if need be
* update documentation of textcat scores
* refactor TextCatEnsemble
* fix tests for new AUC definition
* bump to 3.0.0a42
* update docs
* rename to spacy.TextCatEnsemble.v2
* spacy.TextCatEnsemble.v1 in legacy
* cleanup
* small fix
* update to 3.0.0rc2
* fix import that got lost in merge
* cursed IDE
* fix two typos
---
 spacy/about.py                                |  2 +-
 spacy/cli/init_pipeline.py                    |  2 +-
 spacy/cli/templates/quickstart_training.jinja | 36 +++++++-----
 spacy/cli/train.py                            |  2 +-
 spacy/errors.py                               |  6 +-
 spacy/ml/models/textcat.py                    | 57 ++++++++++++++-----
 spacy/ml/models/tok2vec.py                    |  2 +-
 spacy/pipeline/textcat.py                     | 35 +++++++++---
 spacy/scorer.py                               | 30 ++++++----
 spacy/tests/pipeline/test_pipe_factories.py   | 14 ++---
 spacy/tests/pipeline/test_textcat.py          |  7 +--
 spacy/tests/test_models.py                    | 27 +++------
 spacy/tests/test_scorer.py                    |  6 +-
 spacy/tests/training/test_readers.py          |  2 +-
 spacy/training/initialize.py                  |  4 ++
 website/docs/api/architectures.md             | 56 +++++++++++++-----
 website/docs/api/scorer.md                    | 26 ++++++---
 website/docs/usage/layers-architectures.md    | 35 ++++++++----
 website/docs/usage/processing-pipelines.md    |  9 +--
 website/docs/usage/training.md                |  4 +-
 20 files changed, 235 insertions(+), 127 deletions(-)

diff --git a/spacy/about.py b/spacy/about.py
index bf1d53a7b..24a3ead22 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0rc1"
+__version__ = "3.0.0rc2"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 1c0233539..f45097205 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -100,7 +100,7 @@ def init_labels_cli(
     extract the labels."""
     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
     if not output_path.exists():
-        output_path.mkdir()
+        output_path.mkdir(parents=True)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     setup_gpu(use_gpu)
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index d92de9c15..1194438de 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -136,15 +136,19 @@ factory = "textcat"
 
 {% if optimize == "accuracy" %}
 [components.textcat.model]
-@architectures = "spacy.TextCatEnsemble.v1"
-exclusive_classes = false
-width = 64
-conv_depth = 2
-embed_size = 2000
-window_size = 1
-ngram_size = 1
+@architectures = "spacy.TextCatEnsemble.v2"
 nO = null
 
+[components.textcat.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.textcat.model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size = 1
+no_output_layer = false
+
 {% else -%}
 [components.textcat.model]
 @architectures = "spacy.TextCatBOW.v1"
@@ -271,15 +275,19 @@ factory = "textcat"
 
 {% if optimize == "accuracy" %}
 [components.textcat.model]
-@architectures = "spacy.TextCatEnsemble.v1"
-exclusive_classes = false
-width = 64
-conv_depth = 2
-embed_size = 2000
-window_size = 1
-ngram_size = 1
+@architectures = "spacy.TextCatEnsemble.v2"
 nO = null
 
+[components.textcat.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+
+[components.textcat.model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size = 1
+no_output_layer = false
+
 {% else -%}
 [components.textcat.model]
 @architectures = "spacy.TextCatBOW.v1"
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 0b27f63dc..fe1e82eb2 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -44,7 +44,7 @@ def train_cli(
     if not config_path or not config_path.exists():
         msg.fail("Config file not found", config_path, exits=1)
     if output_path is not None and not output_path.exists():
-        output_path.mkdir()
+        output_path.mkdir(parents=True)
         msg.good(f"Created output directory: {output_path}")
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
diff --git a/spacy/errors.py b/spacy/errors.py
index 5fab0bab1..2898fbcaa 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -398,8 +398,8 @@ class Errors:
     E163 = ("cumsum was found to be unstable: its last element does not "
             "correspond to sum")
     E164 = ("x is neither increasing nor decreasing: {x}.")
-    E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
-            "that case.")
+    E165 = ("Only one class present in the gold labels: {label}. "
+            "ROC AUC score is not defined in that case.")
     E166 = ("Can only merge DocBins with the same value for '{param}'.\n"
             "Current DocBin: {current}\nOther DocBin: {other}")
     E169 = ("Can't find module: {module}")
@@ -456,6 +456,8 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")
 
     # TODO: fix numbering after merging develop into master
+    E897 = ("Field '{field}' should be a dot-notation string referring to the "
+            "relevant section in the config, but found type {type} instead.")
     E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute "
             "is not set or None. If you've implemented a custom component, make "
             "sure to store the component model as `self.model` in your "
diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
index ec8998e2d..d4aed2839 100644
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@@ -1,4 +1,6 @@
-from typing import Optional
+from typing import Optional, List
+
+from thinc.types import Floats2d
 from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
 from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
 from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
@@ -10,12 +12,13 @@ from ...util import registry
 from ..extract_ngrams import extract_ngrams
 from ..staticvectors import StaticVectors
 from ..featureextractor import FeatureExtractor
+from ...tokens import Doc
 
 
 @registry.architectures.register("spacy.TextCatCNN.v1")
 def build_simple_cnn_text_classifier(
     tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
-) -> Model:
+) -> Model[List[Doc], Floats2d]:
     """
     Build a simple CNN text classifier, given a token-to-vector model as inputs.
     If exclusive_classes=True, a softmax non-linearity is applied, so that the
@@ -23,15 +26,14 @@ def build_simple_cnn_text_classifier(
     is applied instead, so that outputs are in the range [0, 1].
     """
     with Model.define_operators({">>": chain}):
+        cnn = tok2vec >> list2ragged() >> reduce_mean()
         if exclusive_classes:
             output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
-            model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
+            model = cnn >> output_layer
             model.set_ref("output_layer", output_layer)
         else:
             linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
-            model = (
-                tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
-            )
+            model = cnn >> linear_layer >> Logistic()
             model.set_ref("output_layer", linear_layer)
     model.set_ref("tok2vec", tok2vec)
     model.set_dim("nO", nO)
@@ -45,8 +47,7 @@ def build_bow_text_classifier(
     ngram_size: int,
     no_output_layer: bool,
     nO: Optional[int] = None,
-) -> Model:
-    # Don't document this yet, I'm not sure it's right.
+) -> Model[List[Doc], Floats2d]:
     with Model.define_operators({">>": chain}):
         sparse_linear = SparseLinear(nO)
         model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear
@@ -59,6 +60,39 @@ def build_bow_text_classifier(
     return model
 
 
+@registry.architectures.register("spacy.TextCatEnsemble.v2")
+def build_text_classifier(
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    linear_model: Model[List[Doc], Floats2d],
+    nO: Optional[int] = None,
+) -> Model[List[Doc], Floats2d]:
+    exclusive_classes = not linear_model.attrs["multi_label"]
+    with Model.define_operators({">>": chain, "|": concatenate}):
+        width = tok2vec.get_dim("nO")
+        cnn_model = (
+            tok2vec
+            >> list2ragged()
+            >> ParametricAttention(width)  # TODO: benchmark performance difference of this layer
+            >> reduce_sum()
+            >> residual(Maxout(nO=width, nI=width))
+            >> Linear(nO=nO, nI=width)
+            >> Dropout(0.0)
+        )
+
+        nO_double = nO * 2 if nO else None
+        if exclusive_classes:
+            output_layer = Softmax(nO=nO, nI=nO_double)
+        else:
+            output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
+        model = (linear_model | cnn_model) >> output_layer
+        model.set_ref("tok2vec", tok2vec)
+    if model.has_dim("nO") is not False:
+        model.set_dim("nO", nO)
+    model.set_ref("output_layer", linear_model.get_ref("output_layer"))
+    model.attrs["multi_label"] = not exclusive_classes
+    return model
+
+
+# TODO: move to legacy
 @registry.architectures.register("spacy.TextCatEnsemble.v1")
 def build_text_classifier(
     width: int,
@@ -158,11 +192,8 @@ def build_text_classifier(
 
 @registry.architectures.register("spacy.TextCatLowData.v1")
 def build_text_classifier_lowdata(
-    width: int,
-    pretrained_vectors: Optional[bool],
-    dropout: Optional[float],
-    nO: Optional[int] = None,
-) -> Model:
+    width: int, dropout: Optional[float], nO: Optional[int] = None
+) -> Model[List[Doc], Floats2d]:
     # Don't document this yet, I'm not sure it's right.
     # Note, before v.3, this was the default if setting "low_data" and "pretrained_dims"
     with Model.define_operators({">>": chain, "**": clone}):
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 95e200927..8755d0d0d 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -106,7 +106,7 @@ def MultiHashEmbed(
 ) -> Model[List[Doc], List[Floats2d]]:
     """Construct an embedding layer that separately embeds a number of lexical
     attributes using hash embedding, concatenates the results, and passes it
-    through a feed-forward subnetwork to build a mixed representations. The
+    through a feed-forward subnetwork to build a mixed representation. The
     features used can be configured with the 'attrs' argument. The suggested
     attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 5ebe0e104..0781a000c 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -16,15 +16,30 @@ from ..vocab import Vocab
 
 default_model_config = """
 [model]
-@architectures = "spacy.TextCatEnsemble.v1"
-exclusive_classes = false
-pretrained_vectors = null
+@architectures = "spacy.TextCatEnsemble.v2"
+
+[model.tok2vec]
+@architectures = "spacy.Tok2Vec.v1"
+
+[model.tok2vec.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
 width = 64
-conv_depth = 2
-embed_size = 2000
+rows = [2000, 2000, 1000, 1000, 1000, 1000]
+attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+include_static_vectors = false
+
+[model.tok2vec.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v1"
+width = ${model.tok2vec.embed.width}
 window_size = 1
+maxout_pieces = 3
+depth = 2
+
+[model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
 ngram_size = 1
-dropout = null
+no_output_layer = false
 """
 DEFAULT_TEXTCAT_MODEL = Config().from_str(default_model_config)["model"]
@@ -60,9 +75,11 @@ subword_features = true
     default_score_weights={
         "cats_score": 1.0,
         "cats_score_desc": None,
-        "cats_p": None,
-        "cats_r": None,
-        "cats_f": None,
+        "cats_micro_p": None,
+        "cats_micro_r": None,
+        "cats_micro_f": None,
+        "cats_macro_p": None,
+        "cats_macro_r": None,
         "cats_macro_f": None,
         "cats_macro_auc": None,
         "cats_f_per_type": None,
diff --git a/spacy/scorer.py b/spacy/scorer.py
index d1065f3a9..273bda898 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -59,7 +59,9 @@ class PRFScore:
 
 
 class ROCAUCScore:
-    """An AUC ROC score."""
+    """An AUC ROC score. This is only defined for binary classification.
+    Use the method is_binary before calculating the score, otherwise it
+    may throw an error."""
 
     def __init__(self) -> None:
         self.golds = []
@@ -71,16 +73,16 @@ class ROCAUCScore:
         self.cands.append(cand)
         self.golds.append(gold)
 
+    def is_binary(self):
+        return len(np.unique(self.golds)) == 2
+
     @property
     def score(self):
+        if not self.is_binary():
+            raise ValueError(Errors.E165.format(label=set(self.golds)))
         if len(self.golds) == self.saved_score_at_len:
             return self.saved_score
-        try:
-            self.saved_score = _roc_auc_score(self.golds, self.cands)
-        # catch ValueError: Only one class present in y_true.
-        # ROC AUC score is not defined in that case.
-        except ValueError:
-            self.saved_score = -float("inf")
+        self.saved_score = _roc_auc_score(self.golds, self.cands)
         self.saved_score_at_len = len(self.golds)
         return self.saved_score
@@ -362,9 +364,13 @@ class Scorer:
         for all:
             attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc),
             attr_score_desc (text description of the overall score),
+            attr_micro_p,
+            attr_micro_r,
             attr_micro_f,
+            attr_macro_p,
+            attr_macro_r,
             attr_macro_f,
-            attr_auc,
+            attr_macro_auc,
             attr_f_per_type,
             attr_auc_per_type
@@ -431,7 +437,9 @@ class Scorer:
             macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
             macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats
             macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats
-            macro_auc = sum(auc.score for auc in auc_per_type.values()) / n_cats
+            # Limit macro_auc to those labels with gold annotations,
+            # but still divide by all cats to avoid artificial boosting of datasets with missing labels
+            macro_auc = sum(auc.score if auc.is_binary() else 0.0 for auc in auc_per_type.values()) / n_cats
         results = {
             f"{attr}_score": None,
             f"{attr}_score_desc": None,
@@ -443,7 +451,7 @@ class Scorer:
             f"{attr}_macro_f": macro_f,
             f"{attr}_macro_auc": macro_auc,
             f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
-            f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
+            f"{attr}_auc_per_type": {k: v.score if v.is_binary() else None for k, v in auc_per_type.items()},
         }
         if len(labels) == 2 and not multi_label and positive_label:
             positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
@@ -726,7 +734,7 @@ def _roc_auc_score(y_true, y_score):
     `_
     """
     if len(np.unique(y_true)) != 2:
-        raise ValueError(Errors.E165)
+        raise ValueError(Errors.E165.format(label=np.unique(y_true)))
     fpr, tpr, _ = _roc_curve(y_true, y_score)
     return _auc(fpr, tpr)
diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py
index cac394913..6f07c0220 100644
--- a/spacy/tests/pipeline/test_pipe_factories.py
+++ b/spacy/tests/pipeline/test_pipe_factories.py
@@ -2,6 +2,7 @@ import pytest
 from spacy.language import Language
 from spacy.lang.en import English
 from spacy.lang.de import German
+from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.tokens import Doc
 from spacy.util import registry, SimpleFrozenDict, combine_score_weights
 from thinc.api import Model, Linear, ConfigValidationError
@@ -156,15 +157,10 @@ def test_pipe_class_component_model():
     name = "test_class_component_model"
     default_config = {
         "model": {
-            "@architectures": "spacy.TextCatEnsemble.v1",
-            "exclusive_classes": False,
-            "pretrained_vectors": None,
-            "width": 64,
-            "embed_size": 2000,
-            "window_size": 1,
-            "conv_depth": 2,
-            "ngram_size": 1,
-            "dropout": None,
+            "@architectures": "spacy.TextCatEnsemble.v2",
+            "tok2vec": DEFAULT_TOK2VEC_MODEL,
+            "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1,
+                             "no_output_layer": False},
         },
         "value1": 10,
     }
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 91348b1b3..06d512a32 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -140,7 +140,7 @@ def test_overfitting_IO():
     nlp = English()
     nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"}
     # Set exclusive labels
-    config = {"model": {"exclusive_classes": True}}
+    config = {"model": {"linear_model": {"exclusive_classes": True}}}
     textcat = nlp.add_pipe("textcat", config=config)
     train_examples = []
     for text, annotations in TRAIN_DATA:
@@ -192,9 +192,8 @@
         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False},
         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True},
         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True},
-        {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2, "dropout": None},
-        {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1, "dropout": None},
-        {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3, "dropout": None},
+        {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}},
+        {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}},
         {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True},
         {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False},
     ],
diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py
index e8884e6b2..200d7dcfd 100644
--- a/spacy/tests/test_models.py
+++ b/spacy/tests/test_models.py
@@ -4,32 +4,23 @@ from thinc.api import fix_random_seed, Adam, set_dropout_rate
 from numpy.testing import assert_array_equal
 import numpy
 from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder
-from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier
+from spacy.ml.models import build_bow_text_classifier, build_simple_cnn_text_classifier
 from spacy.ml.staticvectors import StaticVectors
 from spacy.lang.en import English
 from spacy.lang.en.examples import sentences as EN_SENTENCES
 
 
-def get_textcat_kwargs():
+def get_textcat_bow_kwargs():
     return {
-        "width": 64,
-        "embed_size": 2000,
-        "pretrained_vectors": None,
-        "exclusive_classes": False,
+        "exclusive_classes": True,
         "ngram_size": 1,
-        "window_size": 1,
-        "conv_depth": 2,
-        "dropout": None,
-        "nO": 7,
+        "no_output_layer": False,
+        "nO": 34,
     }
 
 
 def get_textcat_cnn_kwargs():
-    return {
-        "tok2vec": test_tok2vec(),
-        "exclusive_classes": False,
-        "nO": 13,
-    }
+    return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
 
 
 def get_all_params(model):
@@ -105,7 +96,7 @@ def test_multi_hash_embed():
     "seed,model_func,kwargs",
     [
         (0, build_Tok2Vec_model, get_tok2vec_kwargs()),
-        (0, build_text_classifier, get_textcat_kwargs()),
+        (0, build_bow_text_classifier, get_textcat_bow_kwargs()),
         (0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs()),
     ],
 )
@@ -125,7 +116,7 @@ def test_models_initialize_consistently(seed, model_func, kwargs):
     "seed,model_func,kwargs,get_X",
     [
         (0, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs),
-        (0, build_text_classifier, get_textcat_kwargs(), get_docs),
+        (0, build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs),
         (0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs),
     ],
 )
@@ -160,7 +151,7 @@ def test_models_predict_consistently(seed, model_func, kwargs, get_X):
     "seed,dropout,model_func,kwargs,get_X",
     [
         (0, 0.2, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs),
-        (0, 0.2, build_text_classifier, get_textcat_kwargs(), get_docs),
+        (0, 0.2, build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs),
         (0, 0.2, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs),
     ],
 )
diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
index 4c1b09849..2682cd0ea 100644
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@@ -334,7 +334,8 @@ def test_roc_auc_score():
     score = ROCAUCScore()
     score.score_set(0.25, 0)
     score.score_set(0.75, 0)
-    assert score.score == -float("inf")
+    with pytest.raises(ValueError):
+        s = score.score
 
     y_true = [1, 1]
     y_score = [0.25, 0.75]
@@ -344,4 +345,5 @@ def test_roc_auc_score():
     score = ROCAUCScore()
     score.score_set(0.25, 1)
     score.score_set(0.75, 1)
-    assert score.score == -float("inf")
+    with pytest.raises(ValueError):
+        s = score.score
diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py
index 9d82ca50a..ff2559d2a 100644
--- a/spacy/tests/training/test_readers.py
+++ b/spacy/tests/training/test_readers.py
@@ -51,7 +51,7 @@ def test_readers():
     for example in train_corpus(nlp):
         nlp.update([example], sgd=optimizer)
     scores = nlp.evaluate(list(dev_corpus(nlp)))
-    assert scores["cats_score"]
+    assert scores["cats_score"] == 0.0
     # ensure the pipeline runs
     doc = nlp("Quick test")
     assert doc.cats
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 7c84caf95..3d79eb78f 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -36,6 +36,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     # Resolve all training-relevant sections using the filled nlp config
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     dot_names = [T["train_corpus"], T["dev_corpus"]]
+    if not isinstance(T["train_corpus"], str):
+        raise ConfigValidationError(desc=Errors.E897.format(field="training.train_corpus", type=type(T["train_corpus"])))
+    if not isinstance(T["dev_corpus"], str):
+        raise ConfigValidationError(desc=Errors.E897.format(field="training.dev_corpus", type=type(T["dev_corpus"])))
     train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
     optimizer = T["optimizer"]
     # Components that shouldn't be updated during training
diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index 3157c261a..fe2223017 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -143,7 +143,7 @@ argument that connects to the shared `tok2vec` component in the pipeline.
 
 Construct an embedding layer that separately embeds a number of lexical
 attributes using hash embedding, concatenates the results, and passes it through
-a feed-forward subnetwork to build a mixed representations. The features used
+a feed-forward subnetwork to build a mixed representation. The features used
 can be configured with the `attrs` argument. The suggested attributes are
 `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account
 some subword information, without construction a fully character-based
@@ -516,26 +516,54 @@ several different built-in architectures.
 It is recommended to experiment with different architectures and settings to
 determine what works best on your specific data and challenge.
 
-### spacy.TextCatEnsemble.v1 {#TextCatEnsemble}
+### spacy.TextCatEnsemble.v2 {#TextCatEnsemble}
 
 > #### Example Config
 >
 > ```ini
 > [model]
-> @architectures = "spacy.TextCatEnsemble.v1"
-> exclusive_classes = false
-> pretrained_vectors = null
-> width = 64
-> embed_size = 2000
-> conv_depth = 2
-> window_size = 1
-> ngram_size = 1
-> dropout = null
+> @architectures = "spacy.TextCatEnsemble.v2"
 > nO = null
+>
+> [model.linear_model]
+> @architectures = "spacy.TextCatBOW.v1"
+> exclusive_classes = true
+> ngram_size = 1
+> no_output_layer = false
+>
+> [model.tok2vec]
+> @architectures = "spacy.Tok2Vec.v1"
+>
+> [model.tok2vec.embed]
+> @architectures = "spacy.MultiHashEmbed.v1"
+> width = 64
+> rows = [2000, 2000, 1000, 1000, 1000, 1000]
+> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+> include_static_vectors = false
+>
+> [model.tok2vec.encode]
+> @architectures = "spacy.MaxoutWindowEncoder.v1"
+> width = ${model.tok2vec.embed.width}
+> window_size = 1
+> maxout_pieces = 3
+> depth = 2
> ```
 
-Stacked ensemble of a bag-of-words model and a neural network model. The neural
-network has an internal CNN Tok2Vec layer and uses attention.
+Stacked ensemble of a linear bag-of-words model and a neural network model. The
+neural network is built upon a Tok2Vec layer and uses attention. Whether or not
+this model should cater for multi-label classification is taken from the linear
+model, where it is stored in `model.attrs["multi_label"]`.
+
+| Name           | Description                                                                                                                                                                                      |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `linear_model` | The linear bag-of-words model. ~~Model[List[Doc], Floats2d]~~                                                                                                                                    |
+| `tok2vec`      | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                       |
+| `nO`           | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~   |
+| **CREATES**    | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                                 |
+
+
+
+The v1 was functionally similar, but used an internal `tok2vec` instead of taking it as an argument.
 
 | Name                 | Description |
 | -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `nO`                 | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~   |
 | **CREATES**          | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                                 |
 
+
+
 ### spacy.TextCatCNN.v1 {#TextCatCNN}
 
 > #### Example Config
diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md
index 0dbc0de33..388d92801 100644
--- a/website/docs/api/scorer.md
+++ b/website/docs/api/scorer.md
@@ -174,15 +174,25 @@ Calculate the UAS, LAS, and LAS per type scores for dependency parses.
 ## Scorer.score_cats {#score_cats tag="staticmethod" new="3"}
 
 Calculate PRF and ROC AUC scores for a doc-level attribute that is a dict
-containing scores for each label like `Doc.cats`. The reported overall score
-depends on the scorer settings:
+containing scores for each label like `Doc.cats`. The returned dictionary
+contains the following scores:
 
-1. **all:** `{attr}_score` (one of `{attr}_f` / `{attr}_macro_f` /
-   `{attr}_macro_auc`), `{attr}_score_desc` (text description of the overall
-   score), `{attr}_f_per_type`, `{attr}_auc_per_type`
-2. **binary exclusive with positive label:** `{attr}_p`, `{attr}_r`, `{attr}_f`
-3. **3+ exclusive classes**, macro-averaged F-score: `{attr}_macro_f`;
-4. **multilabel**, macro-averaged AUC: `{attr}_macro_auc`
+- `{attr}_micro_p`, `{attr}_micro_r` and `{attr}_micro_f`: each instance across
+  each label is weighted equally
+- `{attr}_macro_p`, `{attr}_macro_r` and `{attr}_macro_f`: the average values
+  across evaluations per label
+- `{attr}_f_per_type` and `{attr}_auc_per_type`: each contains a dictionary of
+  scores, keyed by label
+- A final `{attr}_score` and corresponding `{attr}_score_desc` (text
+  description)
+
+The reported `{attr}_score` depends on the classification properties:
+
+- **binary exclusive with positive label:** `{attr}_score` is set to the F-score
+  of the positive label
+- **3+ exclusive classes**, macro-averaged F-score:
+  `{attr}_score = {attr}_macro_f`
+- **multilabel**, macro-averaged AUC: `{attr}_score = {attr}_macro_auc`
 
 > #### Example
 >
diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index d7b2593e7..aa62a77d4 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -130,16 +130,31 @@ factory = "textcat"
 labels = []
 
 [components.textcat.model]
-@architectures = "spacy.TextCatEnsemble.v1"
-exclusive_classes = false
-pretrained_vectors = null
-width = 64
-conv_depth = 2
-embed_size = 2000
-window_size = 1
-ngram_size = 1
-dropout = 0
+@architectures = "spacy.TextCatEnsemble.v2"
 nO = null
+
+[components.textcat.model.tok2vec]
+@architectures = "spacy.Tok2Vec.v1"
+
+[components.textcat.model.tok2vec.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
+width = 64
+rows = [2000, 2000, 1000, 1000, 1000, 1000]
+attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+include_static_vectors = false
+
+[components.textcat.model.tok2vec.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v1"
+width = ${components.textcat.model.tok2vec.embed.width}
+window_size = 1
+maxout_pieces = 3
+depth = 2
+
+[components.textcat.model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size = 1
+no_output_layer = false
 ```
 
 spaCy has two additional built-in `textcat` architectures, and you can easily
@@ -687,7 +702,7 @@
 Before the model can be used, it needs to be
 [initialized](/usage/training#initialization). This function receives a callback
 to access the full **training data set**, or a representative sample. This data
 set can be used to deduce all **relevant labels**. Alternatively, a list of
-labels can be provided to `initialize`, or you can call 
+labels can be provided to `initialize`, or you can call
The number of labels defines the output dimensionality of the network, and will be used to do [shape inference](https://thinc.ai/docs/usage-models#validation) throughout the diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index a0cf36909..ef44009ae 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1244,15 +1244,10 @@ labels = [] # This function is created and then passed to the "textcat" component as # the argument "model" [components.textcat.model] -@architectures = "spacy.TextCatEnsemble.v1" +@architectures = "spacy.TextCatBOW.v1" exclusive_classes = false -pretrained_vectors = null -width = 64 -conv_depth = 2 -embed_size = 2000 -window_size = 1 ngram_size = 1 -dropout = null +no_output_layer = false [components.other_textcat] factory = "textcat" diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 5a42d2172..274ea5989 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -717,7 +717,7 @@ tabular results to a file: ```python ### functions.py import sys -from typing import IO, Tuple, Callable, Dict, Any +from typing import IO, Tuple, Callable, Dict, Any, Optional import spacy from spacy import Language from pathlib import Path @@ -729,7 +729,7 @@ def custom_logger(log_path): stdout: IO=sys.stdout, stderr: IO=sys.stderr ) -> Tuple[Callable, Callable]: - stdout.write(f"Logging to {log_path}\n") + stdout.write(f"Logging to {log_path}\\n") log_file = Path(log_path).open("w", encoding="utf8") log_file.write("step\\t") log_file.write("score\\t")