From 3983bc6b1e8355eb0fa17e7787e94074c1eb4a63 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 6 Jan 2021 12:44:04 +0100 Subject: [PATCH] Fix Transformer width in TextCatEnsemble (#6431) * add convenience method to determine tok2vec width in a model * fix transformer tok2vec dimensions in TextCatEnsemble architecture * init function should not be nested to avoid pickle issues --- spacy/ml/models/textcat.py | 36 +++++++++++++++++++++++++++--------- spacy/ml/models/tok2vec.py | 11 +++++++++++ 2 files changed, 38 insertions(+), 9 deletions(-) diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index 63dcb165a..8c7316f62 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -6,6 +6,7 @@ from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum from thinc.api import HashEmbed, with_array, with_cpu, uniqued from thinc.api import Relu, residual, expand_window +from thinc.layers.chain import init as init_chain from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER from ...util import registry @@ -13,6 +14,7 @@ from ..extract_ngrams import extract_ngrams from ..staticvectors import StaticVectors from ..featureextractor import FeatureExtractor from ...tokens import Doc +from .tok2vec import get_tok2vec_width @registry.architectures.register("spacy.TextCatCNN.v1") @@ -69,16 +71,17 @@ def build_text_classifier_v2( exclusive_classes = not linear_model.attrs["multi_label"] with Model.define_operators({">>": chain, "|": concatenate}): width = tok2vec.maybe_get_dim("nO") + attention_layer = ParametricAttention(width) # TODO: benchmark performance difference of this layer + maxout_layer = Maxout(nO=width, nI=width) + linear_layer = Linear(nO=nO, nI=width) cnn_model = ( - tok2vec - >> list2ragged() - >> ParametricAttention( - width - ) # TODO: benchmark performance difference of this layer - >> reduce_sum() - >> residual(Maxout(nO=width, nI=width)) - >> Linear(nO=nO, nI=width) - >> Dropout(0.0) + tok2vec + >> list2ragged() + >> attention_layer + >> reduce_sum() + >> residual(maxout_layer) + >> linear_layer + >> Dropout(0.0) ) nO_double = nO * 2 if nO else None @@ -91,7 +94,22 @@ def build_text_classifier_v2( if model.has_dim("nO") is not False: model.set_dim("nO", nO) model.set_ref("output_layer", linear_model.get_ref("output_layer")) + model.set_ref("attention_layer", attention_layer) + model.set_ref("maxout_layer", maxout_layer) + model.set_ref("linear_layer", linear_layer) model.attrs["multi_label"] = not exclusive_classes + + model.init = init_ensemble_textcat + return model + + +def init_ensemble_textcat(model, X, Y) -> Model: + tok2vec_width = get_tok2vec_width(model) + model.get_ref("attention_layer").set_dim("nO", tok2vec_width) + model.get_ref("maxout_layer").set_dim("nO", tok2vec_width) + model.get_ref("maxout_layer").set_dim("nI", tok2vec_width) + model.get_ref("linear_layer").set_dim("nI", tok2vec_width) + init_chain(model, X, Y) return model diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 8755d0d0d..0f727d85f 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -20,6 +20,17 @@ def tok2vec_listener_v1(width: int, upstream: str = "*"): return tok2vec +def get_tok2vec_width(model: Model): + nO = None + if model.has_ref("tok2vec"): + tok2vec = model.get_ref("tok2vec") + if tok2vec.has_dim("nO"): + nO = tok2vec.get_dim("nO") + elif tok2vec.has_ref("listener"): + nO = tok2vec.get_ref("listener").get_dim("nO") + return nO + + @registry.architectures.register("spacy.HashEmbedCNN.v1") def build_hash_embed_cnn_tok2vec( *,