From 8d89d581850ef18b4befbfe0775ca4275d199cca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?=
Date: Mon, 29 Jan 2024 11:35:11 +0100
Subject: [PATCH] Add lazily-initialized tok2vec to simulate transformers

Add a lazily-initialized tok2vec to the tests and test the current
textcat models with it.

Fix some additional issues found using this test.
---
 spacy/ml/models/textcat.py           | 10 ++++++++
 spacy/tests/pipeline/test_textcat.py | 41 ++++++++++++++++++++++++++++
 spacy/tests/tok2vec.py               | 37 +++++++++++++++++++++++++
 3 files changed, 88 insertions(+)
 create mode 100644 spacy/tests/tok2vec.py

diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
index a9aba27fb..601c94a7f 100644
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@@ -185,6 +185,11 @@ def build_text_classifier_v2(
 
 
 def init_ensemble_textcat(model, X, Y) -> Model:
+    # When tok2vec is lazily initialized, we need to initialize it before
+    # the rest of the chain to ensure that we can get its width.
+    tok2vec = model.get_ref("tok2vec")
+    tok2vec.initialize(X)
+
     tok2vec_width = get_tok2vec_width(model)
     model.get_ref("attention_layer").set_dim("nO", tok2vec_width)
     model.get_ref("maxout_layer").set_dim("nO", tok2vec_width)
@@ -272,6 +277,11 @@ def _build_parametric_attention_with_residual_nonlinear(
 
 
 def _init_parametric_attention_with_residual_nonlinear(model, X, Y) -> Model:
+    # When tok2vec is lazily initialized, we need to initialize it before
+    # the rest of the chain to ensure that we can get its width.
+    tok2vec = model.get_ref("tok2vec")
+    tok2vec.initialize(X)
+
     tok2vec_width = get_tok2vec_width(model)
     model.get_ref("attention_layer").set_dim("nO", tok2vec_width)
     model.get_ref("key_transform").set_dim("nI", tok2vec_width)
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 7a78c3dac..045b93ec8 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -30,6 +30,9 @@ from spacy.training.initialize import init_nlp
 
 from ..util import make_tempdir
 
+# Ensure that the architecture gets added to the registry.
+from ..tok2vec import build_lazy_init_tok2vec as _
+
 TRAIN_DATA_SINGLE_LABEL = [
     ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
     ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
@@ -40,6 +43,13 @@ TRAIN_DATA_MULTI_LABEL = [
     ("I'm confused but happy", {"cats": {"ANGRY": 0.0, "CONFUSED": 1.0, "HAPPY": 1.0}}),
 ]
 
+lazy_init_model_config = """
+[model]
+@architectures = "LazyInitTok2Vec.v1"
+width = 96
+"""
+LAZY_INIT_TOK2VEC_MODEL = Config().from_str(lazy_init_model_config)["model"]
+
 
 def make_get_examples_single_label(nlp):
     train_examples = []
@@ -546,6 +556,37 @@ def test_error_with_multi_labels():
         nlp.initialize(get_examples=lambda: train_examples)
 
 
+# fmt: off
+@pytest.mark.parametrize(
+    "name,textcat_config",
+    [
+        # ENSEMBLE V2
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
+        ("textcat", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
+        # PARAMETRIC ATTENTION V1
+        ("textcat", {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "exclusive_classes": True}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "exclusive_classes": False}),
+        # REDUCE
+        ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+    ],
+)
+# fmt: on
+def test_tok2vec_lazy_init(name, textcat_config):
+    # Check that we can properly initialize and use a textcat model using
+    # a lazily-initialized tok2vec.
+    nlp = English()
+    pipe_config = {"model": textcat_config}
+    textcat = nlp.add_pipe(name, config=pipe_config)
+    textcat.add_label("POSITIVE")
+    textcat.add_label("NEGATIVE")
+    nlp.initialize()
+    # nlp.pipe returns a lazy generator; consume it so that the pipeline
+    # is actually run on the text rather than silently skipped.
+    docs = list(nlp.pipe(["This is a test."]))
+    assert len(docs) == 1
+
+
 @pytest.mark.parametrize(
     "name,get_examples, train_data",
     [
diff --git a/spacy/tests/tok2vec.py b/spacy/tests/tok2vec.py
new file mode 100644
index 000000000..59294b495
--- /dev/null
+++ b/spacy/tests/tok2vec.py
@@ -0,0 +1,37 @@
+from typing import List
+
+from spacy.util import registry
+from thinc.types import Floats2d
+from spacy.tokens import Doc
+from thinc.api import Model
+
+
+@registry.architectures("LazyInitTok2Vec.v1")
+def build_lazy_init_tok2vec(*, width: int) -> Model[List[Doc], List[Floats2d]]:
+    """tok2vec model of which the output size is only known after
+    initialization. This implementation does not output meaningful
+    embeddings, it is strictly for testing."""
+    return Model(
+        "lazy_init_tok2vec",
+        lazy_init_tok2vec_forward,
+        init=lazy_init_tok2vec_init,
+        dims={"nO": None},
+        attrs={"width": width},
+    )
+
+
+def lazy_init_tok2vec_init(model: Model, X=None, Y=None):
+    """Set the output dimension nO from the model's "width" attribute."""
+    width = model.attrs["width"]
+    model.set_dim("nO", width)
+
+
+def lazy_init_tok2vec_forward(model: Model, X: List[Doc], is_train: bool):
+    """Allocate one (len(doc), nO) array per doc; backprop returns []."""
+    width = model.get_dim("nO")
+    Y = [model.ops.alloc2f(len(doc), width) for doc in X]
+
+    def backprop(dY):
+        return []
+
+    return Y, backprop