# Semantic entailment/similarity with decomposable attention (using spaCy and Keras) # Practical state-of-the-art textual entailment with spaCy and Keras import numpy as np from keras import layers, Model, models, optimizers from keras import backend as K def build_model(vectors, shape, settings): max_length, nr_hidden, nr_class = shape input1 = layers.Input(shape=(max_length,), dtype="int32", name="words1") input2 = layers.Input(shape=(max_length,), dtype="int32", name="words2") # embeddings (projected) embed = create_embedding(vectors, max_length, nr_hidden) a = embed(input1) b = embed(input2) # step 1: attend F = create_feedforward(nr_hidden) att_weights = layers.dot([F(a), F(b)], axes=-1) G = create_feedforward(nr_hidden) if settings["entail_dir"] == "both": norm_weights_a = layers.Lambda(normalizer(1))(att_weights) norm_weights_b = layers.Lambda(normalizer(2))(att_weights) alpha = layers.dot([norm_weights_a, a], axes=1) beta = layers.dot([norm_weights_b, b], axes=1) # step 2: compare comp1 = layers.concatenate([a, beta]) comp2 = layers.concatenate([b, alpha]) v1 = layers.TimeDistributed(G)(comp1) v2 = layers.TimeDistributed(G)(comp2) # step 3: aggregate v1_sum = layers.Lambda(sum_word)(v1) v2_sum = layers.Lambda(sum_word)(v2) concat = layers.concatenate([v1_sum, v2_sum]) elif settings["entail_dir"] == "left": norm_weights_a = layers.Lambda(normalizer(1))(att_weights) alpha = layers.dot([norm_weights_a, a], axes=1) comp2 = layers.concatenate([b, alpha]) v2 = layers.TimeDistributed(G)(comp2) v2_sum = layers.Lambda(sum_word)(v2) concat = v2_sum else: norm_weights_b = layers.Lambda(normalizer(2))(att_weights) beta = layers.dot([norm_weights_b, b], axes=1) comp1 = layers.concatenate([a, beta]) v1 = layers.TimeDistributed(G)(comp1) v1_sum = layers.Lambda(sum_word)(v1) concat = v1_sum H = create_feedforward(nr_hidden) out = H(concat) out = layers.Dense(nr_class, activation="softmax")(out) model = Model([input1, input2], out) model.compile( optimizer=optimizers.Adam(lr=settings["lr"]), loss="categorical_crossentropy", metrics=["accuracy"], ) return model def create_embedding(vectors, max_length, projected_dim): return models.Sequential( [ layers.Embedding( vectors.shape[0], vectors.shape[1], input_length=max_length, weights=[vectors], trainable=False, ), layers.TimeDistributed( layers.Dense(projected_dim, activation=None, use_bias=False) ), ] ) def create_feedforward(num_units=200, activation="relu", dropout_rate=0.2): return models.Sequential( [ layers.Dense(num_units, activation=activation), layers.Dropout(dropout_rate), layers.Dense(num_units, activation=activation), layers.Dropout(dropout_rate), ] ) def normalizer(axis): def _normalize(att_weights): exp_weights = K.exp(att_weights) sum_weights = K.sum(exp_weights, axis=axis, keepdims=True) return exp_weights / sum_weights return _normalize def sum_word(x): return K.sum(x, axis=1) def test_build_model(): vectors = np.ndarray((100, 8), dtype="float32") shape = (10, 16, 3) settings = {"lr": 0.001, "dropout": 0.2, "gru_encode": True, "entail_dir": "both"} model = build_model(vectors, shape, settings) def test_fit_model(): def _generate_X(nr_example, length, nr_vector): X1 = np.ndarray((nr_example, length), dtype="int32") X1 *= X1 < nr_vector X1 *= 0 <= X1 X2 = np.ndarray((nr_example, length), dtype="int32") X2 *= X2 < nr_vector X2 *= 0 <= X2 return [X1, X2] def _generate_Y(nr_example, nr_class): ys = np.zeros((nr_example, nr_class), dtype="int32") for i in range(nr_example): ys[i, i % nr_class] = 1 return ys vectors = np.ndarray((100, 8), dtype="float32") shape = (10, 16, 3) settings = {"lr": 0.001, "dropout": 0.2, "gru_encode": True, "entail_dir": "both"} model = build_model(vectors, shape, settings) train_X = _generate_X(20, shape[0], vectors.shape[0]) train_Y = _generate_Y(20, shape[2]) dev_X = _generate_X(15, shape[0], vectors.shape[0]) dev_Y = _generate_Y(15, shape[2]) model.fit(train_X, train_Y, validation_data=(dev_X, dev_Y), epochs=5, batch_size=4) __all__ = [build_model]