Add partial embedding updates to Parikh model, fix dropout, other corrections.

This commit is contained in:
Matthew Honnibal 2016-11-18 06:32:12 -06:00
parent 80f473dfb8
commit ff5ab75f5e
3 changed files with 65 additions and 30 deletions

View File

@ -93,7 +93,7 @@ def read_snli(path):
nr_hidden=("Number of hidden units", "option", "H", int), nr_hidden=("Number of hidden units", "option", "H", int),
dropout=("Dropout level", "option", "d", float), dropout=("Dropout level", "option", "d", float),
learn_rate=("Learning rate", "option", "e", float), learn_rate=("Learning rate", "option", "e", float),
batch_size=("Batch size for neural network training", "option", "b", float), batch_size=("Batch size for neural network training", "option", "b", int),
nr_epoch=("Number of training epochs", "option", "i", int), nr_epoch=("Number of training epochs", "option", "i", int),
tree_truncate=("Truncate sentences by tree distance", "flag", "T", bool), tree_truncate=("Truncate sentences by tree distance", "flag", "T", bool),
gru_encode=("Encode sentences with bidirectional GRU", "flag", "E", bool), gru_encode=("Encode sentences with bidirectional GRU", "flag", "E", bool),

View File

@ -3,8 +3,10 @@
import numpy import numpy
from keras.layers import InputSpec, Layer, Input, Dense, merge from keras.layers import InputSpec, Layer, Input, Dense, merge
from keras.layers import Activation, Dropout, Embedding, TimeDistributed from keras.layers import Lambda, Activation, Dropout, Embedding, TimeDistributed
from keras.layers import Bidirectional, GRU from keras.layers import Bidirectional, GRU, LSTM
from keras.layers.noise import GaussianNoise
from keras.layers.advanced_activations import ELU
import keras.backend as K import keras.backend as K
from keras.models import Sequential, Model, model_from_json from keras.models import Sequential, Model, model_from_json
from keras.regularizers import l2 from keras.regularizers import l2
@ -20,13 +22,13 @@ def build_model(vectors, shape, settings):
ids2 = Input(shape=(max_length,), dtype='int32', name='words2') ids2 = Input(shape=(max_length,), dtype='int32', name='words2')
# Construct operations, which we'll chain together. # Construct operations, which we'll chain together.
embed = _StaticEmbedding(vectors, max_length, nr_hidden) embed = _StaticEmbedding(vectors, max_length, nr_hidden, dropout=0.2, nr_tune=5000)
if settings['gru_encode']: if settings['gru_encode']:
encode = _BiRNNEncoding(max_length, nr_hidden) encode = _BiRNNEncoding(max_length, nr_hidden, dropout=settings['dropout'])
attend = _Attention(max_length, nr_hidden) attend = _Attention(max_length, nr_hidden, dropout=settings['dropout'])
align = _SoftAlignment(max_length, nr_hidden) align = _SoftAlignment(max_length, nr_hidden)
compare = _Comparison(max_length, nr_hidden) compare = _Comparison(max_length, nr_hidden, dropout=settings['dropout'])
entail = _Entailment(nr_hidden, nr_class) entail = _Entailment(nr_hidden, nr_class, dropout=settings['dropout'])
# Declare the model as a computational graph. # Declare the model as a computational graph.
sent1 = embed(ids1) # Shape: (i, n) sent1 = embed(ids1) # Shape: (i, n)
@ -59,15 +61,26 @@ def build_model(vectors, shape, settings):
class _StaticEmbedding(object): class _StaticEmbedding(object):
def __init__(self, vectors, max_length, nr_out): def __init__(self, vectors, max_length, nr_out, nr_tune=1000, dropout=0.0):
self.nr_out = nr_out
self.max_length = max_length
self.embed = Embedding( self.embed = Embedding(
vectors.shape[0], vectors.shape[0],
vectors.shape[1], vectors.shape[1],
input_length=max_length, input_length=max_length,
weights=[vectors], weights=[vectors],
name='embed', name='embed',
trainable=False, trainable=False)
dropout=0.0) self.tune = Embedding(
nr_tune,
nr_out,
input_length=max_length,
weights=None,
name='tune',
trainable=True,
dropout=dropout)
self.mod_ids = Lambda(lambda sent: sent % (nr_tune-1)+1,
output_shape=(self.max_length,))
self.project = TimeDistributed( self.project = TimeDistributed(
Dense( Dense(
@ -77,23 +90,37 @@ class _StaticEmbedding(object):
name='project')) name='project'))
def __call__(self, sentence):
    """Embed a tensor of word ids as the sum of two lookups.

    The ids are looked up twice: once in the frozen pretrained table
    ``self.embed`` (then passed through ``self.project``), and once in the
    trainable table ``self.tune`` after ``self.mod_ids`` has folded them
    into that table's smaller id range.  The two results are added
    element-wise.

    NOTE(review): ``self.mod_ids`` maps id 0 to 1, so positions holding
    id 0 (presumably padding -- confirm against the id encoder) also pick
    up a tuned vector.  Masking those positions was tried and left
    disabled in an earlier revision.
    """
    # Trainable correction, indexed by the folded ids.
    tuned = self.tune(self.mod_ids(sentence))
    # Fixed pretrained vectors, projected by self.project.
    pretrained = self.project(self.embed(sentence))
    return merge([pretrained, tuned], mode='sum')
class _BiRNNEncoding(object):
    """Sentence encoder: bidirectional LSTM, then a per-timestep ReLU
    projection to ``nr_out`` units, then dropout.

    ``dropout`` is applied to the LSTM's input and recurrent connections
    (``dropout_W`` / ``dropout_U``).  The dropout layer that follows the
    projection uses a fixed rate of 0.2 and does not read ``dropout``.
    """

    def __init__(self, max_length, nr_out, dropout=0.0):
        recurrent = Bidirectional(
            LSTM(nr_out, return_sequences=True,
                 dropout_W=dropout, dropout_U=dropout),
            input_shape=(max_length, nr_out))
        projection = TimeDistributed(
            Dense(nr_out, activation='relu', init='he_normal'))
        output_dropout = TimeDistributed(Dropout(0.2))

        self.model = Sequential()
        for layer in (recurrent, projection, output_dropout):
            self.model.add(layer)

    def __call__(self, sentence):
        """Apply the encoder stack to ``sentence`` and return the result."""
        return self.model(sentence)
class _Attention(object): class _Attention(object):
def __init__(self, max_length, nr_hidden, dropout=0.0, L2=1e-4, activation='relu'): def __init__(self, max_length, nr_hidden, dropout=0.0, L2=0.0, activation='relu'):
self.max_length = max_length self.max_length = max_length
self.model = Sequential() self.model = Sequential()
self.model.add(Dropout(dropout, input_shape=(nr_hidden,)))
self.model.add( self.model.add(
Dense(nr_hidden, name='attend1', Dense(nr_hidden, name='attend1',
init='he_normal', W_regularizer=l2(L2), init='he_normal', W_regularizer=l2(L2),
@ -134,18 +161,17 @@ class _SoftAlignment(object):
class _Comparison(object): class _Comparison(object):
def __init__(self, words, nr_hidden, L2=1e-6, dropout=0.2): def __init__(self, words, nr_hidden, L2=0.0, dropout=0.0):
self.words = words self.words = words
self.model = Sequential() self.model = Sequential()
self.model.add(Dropout(dropout, input_shape=(nr_hidden*2,)))
self.model.add(Dense(nr_hidden, name='compare1', self.model.add(Dense(nr_hidden, name='compare1',
init='he_normal', W_regularizer=l2(L2), init='he_normal', W_regularizer=l2(L2)))
input_shape=(nr_hidden*2,)))
self.model.add(Activation('relu')) self.model.add(Activation('relu'))
self.model.add(Dropout(dropout)) self.model.add(Dropout(dropout))
self.model.add(Dense(nr_hidden, name='compare2', self.model.add(Dense(nr_hidden, name='compare2',
W_regularizer=l2(L2), init='he_normal')) W_regularizer=l2(L2), init='he_normal'))
self.model.add(Activation('relu')) self.model.add(Activation('relu'))
self.model.add(Dropout(dropout))
self.model = TimeDistributed(self.model) self.model = TimeDistributed(self.model)
def __call__(self, sent, align, **kwargs): def __call__(self, sent, align, **kwargs):
@ -156,13 +182,16 @@ class _Comparison(object):
class _Entailment(object): class _Entailment(object):
def __init__(self, nr_hidden, nr_out, dropout=0.2, L2=1e-4): def __init__(self, nr_hidden, nr_out, dropout=0.0, L2=0.0):
self.model = Sequential() self.model = Sequential()
self.model.add(Dropout(dropout, input_shape=(nr_hidden*2,)))
self.model.add(Dense(nr_hidden, name='entail1', self.model.add(Dense(nr_hidden, name='entail1',
init='he_normal', W_regularizer=l2(L2), init='he_normal', W_regularizer=l2(L2)))
input_shape=(nr_hidden*2,)))
self.model.add(Activation('relu')) self.model.add(Activation('relu'))
self.model.add(Dropout(dropout)) self.model.add(Dropout(dropout))
self.model.add(Dense(nr_hidden, name='entail2',
init='he_normal', W_regularizer=l2(L2)))
self.model.add(Activation('relu'))
self.model.add(Dense(nr_out, name='entail_out', activation='softmax', self.model.add(Dense(nr_out, name='entail_out', activation='softmax',
W_regularizer=l2(L2), init='zero')) W_regularizer=l2(L2), init='zero'))

View File

@ -1,5 +1,6 @@
from keras.models import model_from_json from keras.models import model_from_json
import numpy import numpy
import numpy.random
class KerasSimilarityShim(object): class KerasSimilarityShim(object):
@ -31,16 +32,16 @@ class KerasSimilarityShim(object):
return scores[0] return scores[0]
def get_embeddings(vocab, nr_unk=100):
    """Build the float32 embedding table for the model's lookup layer.

    The table is zero-initialised with ``max_rank + 1 + nr_unk + 2`` rows
    and ``vocab.vectors_length`` columns.  Each lexeme with ``has_vector``
    set has its vector, divided by ``vector_norm``, stored at row
    ``rank + 1``; every other row stays zero.

    NOTE(review): the extra ``nr_unk + 2`` rows look reserved for padding,
    an end marker and unknown-word buckets -- confirm against the id
    scheme used when documents are encoded.

    An empty ``vocab`` yields a table with only the ``nr_unk + 2`` extra
    rows.  A lexeme whose norm is zero keeps its all-zero row instead of
    being divided by zero, which would fill the row with NaN.
    """
    ranks = [lex.rank for lex in vocab]
    # max() raises on an empty sequence; an empty vocab has no ranked rows.
    nr_vector = max(ranks) + 1 if ranks else 0
    vectors = numpy.zeros((nr_vector + nr_unk + 2, vocab.vectors_length),
                          dtype='float32')
    for lex in vocab:
        if not lex.has_vector:
            continue
        norm = lex.vector_norm
        if norm:
            vectors[lex.rank + 1] = lex.vector / norm
    return vectors
def get_word_ids(docs, rnn_encode=False, tree_truncate=False, max_length=100): def get_word_ids(docs, rnn_encode=False, tree_truncate=False, max_length=100, nr_unk=100):
Xs = numpy.zeros((len(docs), max_length), dtype='int32') Xs = numpy.zeros((len(docs), max_length), dtype='int32')
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
if tree_truncate: if tree_truncate:
@ -50,17 +51,22 @@ def get_word_ids(docs, rnn_encode=False, tree_truncate=False, max_length=100):
words = [] words = []
while len(words) <= max_length and queue: while len(words) <= max_length and queue:
word = queue.pop(0) word = queue.pop(0)
if rnn_encode or (word.has_vector and not word.is_punct and not word.is_space): if rnn_encode or (not word.is_punct and not word.is_space):
words.append(word) words.append(word)
if tree_truncate: if tree_truncate:
queue.extend(list(word.lefts)) queue.extend(list(word.lefts))
queue.extend(list(word.rights)) queue.extend(list(word.rights))
words.sort() words.sort()
for j, token in enumerate(words): for j, token in enumerate(words):
Xs[i, j] = token.rank + 1 if token.has_vector:
Xs[i, j] = token.rank+1
else:
Xs[i, j] = (token.shape % (nr_unk-1))+2
j += 1 j += 1
if j >= max_length: if j >= max_length:
break break
else:
Xs[i, len(words)] = 1
return Xs return Xs