From 89df91846ce9203dd03e4efa949fa813093d1d7b Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 12 Nov 2016 11:43:37 -0600
Subject: [PATCH] Fix entailment example, and add a flag for BiRNN encoding.

---
 examples/keras_parikh_entailment/__main__.py | 56 ++++++++++++++-----
 .../keras_decomposable_attention.py          | 16 ++++--
 .../keras_parikh_entailment/spacy_hook.py    | 28 ++++++----
 3 files changed, 70 insertions(+), 30 deletions(-)

diff --git a/examples/keras_parikh_entailment/__main__.py b/examples/keras_parikh_entailment/__main__.py
index ede6c9103..b0d5340f5 100644
--- a/examples/keras_parikh_entailment/__main__.py
+++ b/examples/keras_parikh_entailment/__main__.py
@@ -3,25 +3,46 @@
 import spacy
 import plac
 from pathlib import Path
+import ujson as json
+import numpy
+from keras.utils.np_utils import to_categorical

 from spacy_hook import get_embeddings, get_word_ids
 from spacy_hook import create_similarity_pipeline

 from keras_decomposable_attention import build_model

+
 def train(model_dir, train_loc, dev_loc, shape, settings):
+    train_texts1, train_texts2, train_labels = read_snli(train_loc)
+    dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)
+
     print("Loading spaCy")
-    nlp = spacy.load('en', tagger=False, parser=False, entity=False, matcher=False)
+    nlp = spacy.load('en')
     print("Compiling network")
     model = build_model(get_embeddings(nlp.vocab), shape, settings)
     print("Processing texts...")
-    train_X = get_features(list(nlp.pipe(train_texts)))
-    dev_X = get_features(list(nlp.pipe(dev_texts)))
+    train_X1 = get_word_ids(list(nlp.pipe(train_texts1, n_threads=10, batch_size=10000)),
+                            max_length=shape[0],
+                            tree_truncate=settings['tree_truncate'])
+    train_X2 = get_word_ids(list(nlp.pipe(train_texts2, n_threads=10, batch_size=10000)),
+                            max_length=shape[0],
+                            tree_truncate=settings['tree_truncate'])
+    dev_X1 = get_word_ids(list(nlp.pipe(dev_texts1, n_threads=10, batch_size=10000)),
+                          max_length=shape[0],
+                          tree_truncate=settings['tree_truncate'])
+    dev_X2 = get_word_ids(list(nlp.pipe(dev_texts2, n_threads=10, batch_size=10000)),
+                          max_length=shape[0],
+                          tree_truncate=settings['tree_truncate'])
+    print(train_X1.shape, train_X2.shape)
+    print(dev_X1.shape, dev_X2.shape)
+    print(train_labels.shape, dev_labels.shape)
+    print(settings)
     model.fit(
-        train_X,
+        [train_X1, train_X2],
         train_labels,
-        validation_data=(dev_X, dev_labels),
+        validation_data=([dev_X1, dev_X2], dev_labels),
         nb_epoch=settings['nr_epoch'],
         batch_size=settings['batch_size'])
@@ -56,16 +77,20 @@ def demo(model_dir):

 LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}

-def read_snli(loc):
-    with open(loc) as file_:
+def read_snli(path):
+    texts1 = []
+    texts2 = []
+    labels = []
+    with path.open() as file_:
         for line in file_:
             eg = json.loads(line)
             label = eg['gold_label']
             if label == '-':
                 continue
-            text1 = eg['sentence1']
-            text2 = eg['sentence2']
-            yield text1, text2, LABELS[label]
+            texts1.append(eg['sentence1'])
+            texts2.append(eg['sentence2'])
+            labels.append(LABELS[label])
+    return texts1, texts2, to_categorical(numpy.asarray(labels, dtype='int32'))

 @plac.annotations(
@@ -78,9 +103,13 @@ def read_snli(loc):
     dropout=("Dropout level", "option", "d", float),
     learn_rate=("Learning rate", "option", "e", float),
     batch_size=("Batch size for neural network training", "option", "b", float),
-    nr_epoch=("Number of training epochs", "option", "i", float)
+    nr_epoch=("Number of training epochs", "option", "i", float),
+    tree_truncate=("Truncate sentences by tree distance", "flag", "T", bool),
+    gru_encode=("Encode sentences with bidirectional GRU", "flag", "E", bool),
 )
 def main(mode, model_dir, train_loc, dev_loc,
+         tree_truncate=False,
+         gru_encode=False,
          max_length=100,
          nr_hidden=100,
          dropout=0.2,
@@ -92,7 +121,9 @@ def main(mode, model_dir, train_loc, dev_loc,
         'lr': learn_rate,
         'dropout': dropout,
         'batch_size': batch_size,
-        'nr_epoch': nr_epoch
+        'nr_epoch': nr_epoch,
+        'tree_truncate': tree_truncate,
+        'gru_encode': gru_encode
     }
     if mode == 'train':
         train(model_dir, train_loc, dev_loc, shape, settings)
@@ -101,6 +132,5 @@ def main(mode, model_dir, train_loc, dev_loc,
     else:
         demo(model_dir)
-

 if __name__ == '__main__':
     plac.call(main)
diff --git a/examples/keras_parikh_entailment/keras_decomposable_attention.py b/examples/keras_parikh_entailment/keras_decomposable_attention.py
index ede435f42..80eac2da7 100644
--- a/examples/keras_parikh_entailment/keras_decomposable_attention.py
+++ b/examples/keras_parikh_entailment/keras_decomposable_attention.py
@@ -4,6 +4,7 @@
 import numpy
 from keras.layers import InputSpec, Layer, Input, Dense, merge
 from keras.layers import Activation, Dropout, Embedding, TimeDistributed
+from keras.layers import Bidirectional, GRU
 import keras.backend as K
 import theano.tensor as T
 from keras.models import Sequential, Model, model_from_json
@@ -12,7 +13,7 @@
 from keras.optimizers import Adam
 from keras.layers.normalization import BatchNormalization

-def build_model(vectors, shape, settings, use_rnn_encoding=False):
+def build_model(vectors, shape, settings):
     '''Compile the model.'''
     max_length, nr_hidden, nr_class = shape
     # Declare inputs.
@@ -21,8 +22,8 @@
     # Construct operations, which we'll chain together.
     embed = _StaticEmbedding(vectors, max_length, nr_hidden)
-    if use_rnn_encoding:
-        encode = _BiLSTMEncode(max_length, nr_hidden)
+    if settings['gru_encode']:
+        encode = _BiRNNEncoding(max_length, nr_hidden)
     attend = _Attention(max_length, nr_hidden)
     align = _SoftAlignment(max_length, nr_hidden)
     compare = _Comparison(max_length, nr_hidden)
@@ -32,7 +33,7 @@
     sent1 = embed(ids1) # Shape: (i, n)
     sent2 = embed(ids2) # Shape: (j, n)

-    if use_rnn_encoding:
+    if settings['gru_encode']:
         sent1 = encode(sent1)
         sent2 = encode(sent2)
@@ -78,15 +79,18 @@ class _StaticEmbedding(object):
     def __call__(self, sentence):
         return self.project(self.embed(sentence))
-
+
+
 class _BiRNNEncoding(object):
     def __init__(self, max_length, nr_out):
         self.model = Sequential()
-        self.model.add(Bidirectional(LSTM(nr_out, input_length=max_length)))
+        self.model.add(Bidirectional(GRU(int(nr_out/2), return_sequences=True),
+                                     input_shape=(max_length, nr_out)))

     def __call__(self, sentence):
         return self.model(sentence)

+
 class _Attention(object):
     def __init__(self, max_length, nr_hidden, dropout=0.0, L2=1e-4, activation='relu'):
         self.max_length = max_length
diff --git a/examples/keras_parikh_entailment/spacy_hook.py b/examples/keras_parikh_entailment/spacy_hook.py
index 78e5ab71a..71d6c3add 100644
--- a/examples/keras_parikh_entailment/spacy_hook.py
+++ b/examples/keras_parikh_entailment/spacy_hook.py
@@ -1,4 +1,5 @@
 from keras.models import model_from_json
+import numpy

 class KerasSimilarityShim(object):
@@ -30,7 +31,7 @@ class KerasSimilarityShim(object):
         return scores[0]

-def get_embeddings(cls, vocab):
+def get_embeddings(vocab):
     max_rank = max(lex.rank+1 for lex in vocab if lex.has_vector)
     vectors = numpy.ndarray((max_rank+1, vocab.vectors_length), dtype='float32')
     for lex in vocab:
@@ -39,16 +40,24 @@ def get_embeddings(cls, vocab):
     return vectors

-def get_word_ids(docs, max_length=100):
+def get_word_ids(docs, tree_truncate=False, max_length=100):
     Xs = numpy.zeros((len(docs), max_length), dtype='int32')
     for i, doc in enumerate(docs):
         j = 0
-        for token in doc:
-            if token.has_vector and not token.is_punct and not token.is_space:
-                Xs[i, j] = token.rank + 1
-                j += 1
-            if j >= max_length:
-                break
+        queue = [sent.root for sent in doc.sents]
+        words = []
+        while len(words) <= max_length and queue:
+            word = queue.pop(0)
+            if word.has_vector and not word.is_punct and not word.is_space:
+                words.append(word)
+            queue.extend(list(word.lefts))
+            queue.extend(list(word.rights))
+        words.sort()
+        for j, token in enumerate(words):
+            Xs[i, j] = token.rank + 1
+            j += 1
+            if j >= max_length:
+                break
     return Xs

@@ -57,6 +66,3 @@ def create_similarity_pipeline(nlp):
         nlp.path / 'similarity',
         nlp,
         feature_extracter=get_features)]
-
-
-
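
Not part of the patch itself: a brief sketch of how the example might be invoked once this change is applied. The model directory and SNLI file paths below are placeholders, not values from the commit; only the flag letters and the flow through settings come from the diff above.

# Usage sketch (assumed, not from the commit). The two new plac flags map to:
#   -T -> tree_truncate : passed via settings['tree_truncate'] into get_word_ids()
#   -E -> gru_encode    : passed via settings['gru_encode'] into build_model(),
#                         which then applies _BiRNNEncoding on top of the
#                         static embeddings before attention
#
#   python examples/keras_parikh_entailment/ train \
#       /path/to/model_dir \
#       /path/to/snli_train.jsonl \
#       /path/to/snli_dev.jsonl \
#       -T -E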