Add partial embedding updates to Parikh model, fix dropout, other corrections.

Matthew Honnibal 2016-11-18 06:32:12 -06:00
parent 80f473dfb8
commit ff5ab75f5e
3 changed files with 65 additions and 30 deletions
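For orientation: the "partial embedding updates" below keep the large pretrained embedding table frozen and add a small trainable "tune" table indexed by hashing word ids into a fixed number of rows; the two lookups are summed, so gradients only flow into the small table. A minimal sketch of the idea, assuming the Keras 1.x API used by this example (names and sizes are illustrative, not the exact code in the diff):

    from keras.layers import Dense, Embedding, Input, Lambda, TimeDistributed, merge
    from keras.models import Model

    def partially_tunable_embedding(vectors, max_length=100, nr_out=200, nr_tune=5000):
        ids = Input(shape=(max_length,), dtype='int32', name='words')
        # Frozen lookup over the full pretrained vocabulary, projected to nr_out dims.
        pretrained = Embedding(vectors.shape[0], vectors.shape[1],
                               input_length=max_length, weights=[vectors],
                               trainable=False)(ids)
        pretrained = TimeDistributed(Dense(nr_out, name='project'))(pretrained)
        # Small trainable table: ids are hashed into nr_tune buckets, so only this
        # (nr_tune x nr_out) matrix receives gradient updates during training.
        hashed = Lambda(lambda sent: sent % (nr_tune - 1) + 1,
                        output_shape=(max_length,))(ids)
        tuned = Embedding(nr_tune, nr_out, input_length=max_length)(hashed)
        # Downstream layers see the sum of the frozen and tunable representations.
        return Model(input=ids, output=merge([pretrained, tuned], mode='sum'))

In the diff, the corresponding changes are the _StaticEmbedding class (the tune embedding and mod_ids hashing) and the unknown-word handling in get_word_ids.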

View File

@@ -93,7 +93,7 @@ def read_snli(path):
     nr_hidden=("Number of hidden units", "option", "H", int),
     dropout=("Dropout level", "option", "d", float),
     learn_rate=("Learning rate", "option", "e", float),
-    batch_size=("Batch size for neural network training", "option", "b", float),
+    batch_size=("Batch size for neural network training", "option", "b", int),
     nr_epoch=("Number of training epochs", "option", "i", int),
     tree_truncate=("Truncate sentences by tree distance", "flag", "T", bool),
     gru_encode=("Encode sentences with bidirectional GRU", "flag", "E", bool),

View File

@@ -3,8 +3,10 @@
 import numpy
 from keras.layers import InputSpec, Layer, Input, Dense, merge
-from keras.layers import Activation, Dropout, Embedding, TimeDistributed
-from keras.layers import Bidirectional, GRU
+from keras.layers import Lambda, Activation, Dropout, Embedding, TimeDistributed
+from keras.layers import Bidirectional, GRU, LSTM
+from keras.layers.noise import GaussianNoise
+from keras.layers.advanced_activations import ELU
 import keras.backend as K
 from keras.models import Sequential, Model, model_from_json
 from keras.regularizers import l2
@@ -20,13 +22,13 @@ def build_model(vectors, shape, settings):
     ids2 = Input(shape=(max_length,), dtype='int32', name='words2')

     # Construct operations, which we'll chain together.
-    embed = _StaticEmbedding(vectors, max_length, nr_hidden)
+    embed = _StaticEmbedding(vectors, max_length, nr_hidden, dropout=0.2, nr_tune=5000)
     if settings['gru_encode']:
-        encode = _BiRNNEncoding(max_length, nr_hidden)
-    attend = _Attention(max_length, nr_hidden)
+        encode = _BiRNNEncoding(max_length, nr_hidden, dropout=settings['dropout'])
+    attend = _Attention(max_length, nr_hidden, dropout=settings['dropout'])
     align = _SoftAlignment(max_length, nr_hidden)
-    compare = _Comparison(max_length, nr_hidden)
-    entail = _Entailment(nr_hidden, nr_class)
+    compare = _Comparison(max_length, nr_hidden, dropout=settings['dropout'])
+    entail = _Entailment(nr_hidden, nr_class, dropout=settings['dropout'])

     # Declare the model as a computational graph.
     sent1 = embed(ids1)  # Shape: (i, n)
@@ -59,15 +61,26 @@ def build_model(vectors, shape, settings):

 class _StaticEmbedding(object):
-    def __init__(self, vectors, max_length, nr_out):
+    def __init__(self, vectors, max_length, nr_out, nr_tune=1000, dropout=0.0):
+        self.nr_out = nr_out
+        self.max_length = max_length
         self.embed = Embedding(
             vectors.shape[0],
             vectors.shape[1],
             input_length=max_length,
             weights=[vectors],
             name='embed',
-            trainable=False,
-            dropout=0.0)
+            trainable=False)
+        self.tune = Embedding(
+            nr_tune,
+            nr_out,
+            input_length=max_length,
+            weights=None,
+            name='tune',
+            trainable=True,
+            dropout=dropout)
+        self.mod_ids = Lambda(lambda sent: sent % (nr_tune-1)+1,
+                              output_shape=(self.max_length,))
         self.project = TimeDistributed(
             Dense(
@@ -77,23 +90,37 @@ class _StaticEmbedding(object):
                 name='project'))

     def __call__(self, sentence):
-        return self.project(self.embed(sentence))
+        def get_output_shape(shapes):
+            print(shapes)
+            return shapes[0]
+        mod_sent = self.mod_ids(sentence)
+        tuning = self.tune(mod_sent)
+        #tuning = merge([tuning, mod_sent],
+        #    mode=lambda AB: AB[0] * (K.clip(K.cast(AB[1], 'float32'), 0, 1)),
+        #    output_shape=(self.max_length, self.nr_out))
+        pretrained = self.project(self.embed(sentence))
+        vectors = merge([pretrained, tuning], mode='sum')
+        return vectors


 class _BiRNNEncoding(object):
-    def __init__(self, max_length, nr_out):
+    def __init__(self, max_length, nr_out, dropout=0.0):
         self.model = Sequential()
-        self.model.add(Bidirectional(GRU(int(nr_out/2), return_sequences=True),
-                                     input_shape=(max_length, nr_out)))
+        self.model.add(Bidirectional(LSTM(nr_out, return_sequences=True,
+                                          dropout_W=dropout, dropout_U=dropout),
+                                     input_shape=(max_length, nr_out)))
+        self.model.add(TimeDistributed(Dense(nr_out, activation='relu', init='he_normal')))
+        self.model.add(TimeDistributed(Dropout(0.2)))

     def __call__(self, sentence):
         return self.model(sentence)


 class _Attention(object):
-    def __init__(self, max_length, nr_hidden, dropout=0.0, L2=1e-4, activation='relu'):
+    def __init__(self, max_length, nr_hidden, dropout=0.0, L2=0.0, activation='relu'):
         self.max_length = max_length
         self.model = Sequential()
+        self.model.add(Dropout(dropout, input_shape=(nr_hidden,)))
         self.model.add(
             Dense(nr_hidden, name='attend1',
                   init='he_normal', W_regularizer=l2(L2),
@@ -134,18 +161,17 @@ class _SoftAlignment(object):

 class _Comparison(object):
-    def __init__(self, words, nr_hidden, L2=1e-6, dropout=0.2):
+    def __init__(self, words, nr_hidden, L2=0.0, dropout=0.0):
         self.words = words
         self.model = Sequential()
+        self.model.add(Dropout(dropout, input_shape=(nr_hidden*2,)))
         self.model.add(Dense(nr_hidden, name='compare1',
-                             init='he_normal', W_regularizer=l2(L2),
-                             input_shape=(nr_hidden*2,)))
+                             init='he_normal', W_regularizer=l2(L2)))
         self.model.add(Activation('relu'))
         self.model.add(Dropout(dropout))
         self.model.add(Dense(nr_hidden, name='compare2',
                              W_regularizer=l2(L2), init='he_normal'))
         self.model.add(Activation('relu'))
+        self.model.add(Dropout(dropout))
         self.model = TimeDistributed(self.model)

     def __call__(self, sent, align, **kwargs):
@@ -156,13 +182,16 @@ class _Comparison(object):

 class _Entailment(object):
-    def __init__(self, nr_hidden, nr_out, dropout=0.2, L2=1e-4):
+    def __init__(self, nr_hidden, nr_out, dropout=0.0, L2=0.0):
         self.model = Sequential()
+        self.model.add(Dropout(dropout, input_shape=(nr_hidden*2,)))
         self.model.add(Dense(nr_hidden, name='entail1',
-                             init='he_normal', W_regularizer=l2(L2),
-                             input_shape=(nr_hidden*2,)))
+                             init='he_normal', W_regularizer=l2(L2)))
         self.model.add(Activation('relu'))
         self.model.add(Dropout(dropout))
+        self.model.add(Dense(nr_hidden, name='entail2',
+                             init='he_normal', W_regularizer=l2(L2)))
+        self.model.add(Activation('relu'))
         self.model.add(Dense(nr_out, name='entail_out', activation='softmax',
                              W_regularizer=l2(L2), init='zero'))

View File

@@ -1,5 +1,6 @@
 from keras.models import model_from_json
 import numpy
+import numpy.random


 class KerasSimilarityShim(object):
@@ -31,16 +32,16 @@ class KerasSimilarityShim(object):
         return scores[0]


-def get_embeddings(vocab):
-    max_rank = max(lex.rank+1 for lex in vocab if lex.has_vector)
-    vectors = numpy.ndarray((max_rank+1, vocab.vectors_length), dtype='float32')
+def get_embeddings(vocab, nr_unk=100):
+    nr_vector = max(lex.rank for lex in vocab) + 1
+    vectors = numpy.zeros((nr_vector+nr_unk+2, vocab.vectors_length), dtype='float32')
     for lex in vocab:
         if lex.has_vector:
-            vectors[lex.rank + 1] = lex.vector
+            vectors[lex.rank+1] = lex.vector / lex.vector_norm
     return vectors


-def get_word_ids(docs, rnn_encode=False, tree_truncate=False, max_length=100):
+def get_word_ids(docs, rnn_encode=False, tree_truncate=False, max_length=100, nr_unk=100):
     Xs = numpy.zeros((len(docs), max_length), dtype='int32')
     for i, doc in enumerate(docs):
         if tree_truncate:
@@ -50,17 +51,22 @@ def get_word_ids(docs, rnn_encode=False, tree_truncate=False, max_length=100):
         words = []
         while len(words) <= max_length and queue:
             word = queue.pop(0)
-            if rnn_encode or (word.has_vector and not word.is_punct and not word.is_space):
+            if rnn_encode or (not word.is_punct and not word.is_space):
                 words.append(word)
             if tree_truncate:
                 queue.extend(list(word.lefts))
                 queue.extend(list(word.rights))
         words.sort()
         for j, token in enumerate(words):
-            Xs[i, j] = token.rank + 1
+            if token.has_vector:
+                Xs[i, j] = token.rank+1
+            else:
+                Xs[i, j] = (token.shape % (nr_unk-1))+2
             j += 1
             if j >= max_length:
                 break
+        else:
+            Xs[i, len(words)] = 1
     return Xs